2023-10-24 16:43:14 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2023-11-22 12:47:34 +01:00
|
|
|
"""
|
|
|
|
|
|
|
|
FIXME
|
|
|
|
|
|
|
|
1) feed_mode_scan doesn't find feed for https://www.blender.org/
|
|
|
|
even though it should be according to the pathnames dictionary.
|
|
|
|
|
2023-11-23 17:55:36 +01:00
|
|
|
TODO
|
|
|
|
|
|
|
|
1) Support Gemini and Gopher.
|
|
|
|
|
2023-12-26 12:22:45 +01:00
|
|
|
2) Check also for HTML, not only feed.bozo.
|
|
|
|
|
2024-01-02 12:42:41 +01:00
|
|
|
3) Add "if utility.is_feed(url, feed)" to view_entry and view_feed
|
2023-12-27 23:48:31 +01:00
|
|
|
|
2024-01-14 19:05:12 +01:00
|
|
|
4) Replace sqlite.remove_nonexistent_entries by sqlite.check_entry_exist
|
2024-01-02 19:11:36 +01:00
|
|
|
Same check, just reverse.
|
|
|
|
|
2024-02-10 18:53:53 +01:00
|
|
|
5) Support protocol Gopher
|
|
|
|
See project /michael-lazar/pygopherd
|
|
|
|
See project /gopherball/gb
|
|
|
|
|
|
|
|
6) Support ActivityPub @person@domain (see Tip Of The Day).
|
|
|
|
|
|
|
|
7) See project /offpunk/offblocklist.py
|
|
|
|
|
2024-05-12 11:55:23 +02:00
|
|
|
NOTE
|
|
|
|
|
|
|
|
1) You might not want to utilize aiohttp, because you can no longer
scan as many feeds as possible all at once, due to the resulting CPU spike.
|
|
|
|
Consider https://pythonhosted.org/feedparser/http-useragent.html
|
|
|
|
|
2023-11-22 12:47:34 +01:00
|
|
|
"""
|
|
|
|
|
2024-07-07 10:16:00 +02:00
|
|
|
import aiofiles
|
2023-12-05 09:18:29 +01:00
|
|
|
from aiohttp import ClientError, ClientSession, ClientTimeout
|
2023-12-04 15:41:02 +01:00
|
|
|
from asyncio import TimeoutError
|
2024-01-06 23:03:08 +01:00
|
|
|
# from asyncio.exceptions import IncompleteReadError
|
|
|
|
# from http.client import IncompleteRead
|
|
|
|
# from lxml import html
|
2023-10-24 16:43:14 +02:00
|
|
|
# from xml.etree.ElementTree import ElementTree, ParseError
|
2024-02-04 18:08:12 +01:00
|
|
|
import requests
|
2024-01-13 18:17:43 +01:00
|
|
|
import slixfeed.config as config
|
2024-06-13 17:53:53 +02:00
|
|
|
from slixfeed.log import Logger
|
2024-07-05 18:04:24 +02:00
|
|
|
# import urllib.request
|
|
|
|
# from urllib.error import HTTPError
|
2024-06-13 17:53:53 +02:00
|
|
|
|
|
|
|
# Module-wide logger instance (project wrapper around the stdlib logging).
logger = Logger(__name__)
|
|
|
|
|
2024-01-13 18:17:43 +01:00
|
|
|
# Optional dependency: BitTorrent support is available only when the
# magnet2torrent package is installed.
try:
    from magnet2torrent import Magnet2Torrent, FailedToFetchException
except ImportError:
    # Narrowed from a bare "except": only a missing package should
    # disable BitTorrent; any other error must propagate.
    logger.info(
        "Package magnet2torrent was not found.\n"
        "BitTorrent is disabled.")
|
2023-11-13 14:45:10 +01:00
|
|
|
|
2024-06-13 17:53:53 +02:00
|
|
|
# class Dat:
|
2024-01-04 02:16:24 +01:00
|
|
|
# async def dat():
|
2023-11-26 06:48:09 +01:00
|
|
|
|
2024-06-13 17:53:53 +02:00
|
|
|
# class Ftp:
|
2024-01-04 02:16:24 +01:00
|
|
|
# async def ftp():
|
2024-02-10 18:53:53 +01:00
|
|
|
|
2024-06-13 17:53:53 +02:00
|
|
|
# class Gemini:
|
2024-01-04 02:16:24 +01:00
|
|
|
# async def gemini():
|
2023-10-24 16:43:14 +02:00
|
|
|
|
2024-06-13 17:53:53 +02:00
|
|
|
# class Gopher:
|
2024-01-04 02:16:24 +01:00
|
|
|
# async def gopher():
|
2023-10-24 16:43:14 +02:00
|
|
|
|
2024-06-13 17:53:53 +02:00
|
|
|
# class Ipfs:
|
2024-01-04 02:16:24 +01:00
|
|
|
# async def ipfs():
|
2023-11-26 06:48:09 +01:00
|
|
|
|
2024-02-10 18:53:53 +01:00
|
|
|
|
2024-07-05 18:04:24 +02:00
|
|
|
class Http:
|
|
|
|
|
|
|
|
|
|
|
|
# def fetch_media(url, pathname):
|
|
|
|
# try:
|
|
|
|
# urllib.request.urlretrieve(url, pathname)
|
|
|
|
# status = 1
|
|
|
|
# except HTTPError as e:
|
|
|
|
# logger.error(e)
|
|
|
|
# status = 0
|
|
|
|
# return status
|
|
|
|
|
|
|
|
|
|
|
|
async def fetch_headers(url):
    """
    Retrieve the HTTP response headers of a given URL.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    headers : CIMultiDictProxy
        Response headers of the HTTP request.
    """
    settings = config.get_values('settings.toml', 'network')
    agent = settings['user_agent'] or 'Slixfeed/0.1'
    request_headers = {'User-Agent': agent}
    http_proxy = settings['http_proxy'] or None
    request_timeout = ClientTimeout(total=10)
    async with ClientSession(headers=request_headers) as session:
        async with session.get(url, proxy=http_proxy,
                               # proxy_auth=(proxy_username, proxy_password),
                               timeout=request_timeout
                               ) as response:
            return response.headers
|
|
|
|
|
|
|
|
|
|
|
|
# TODO Write file to disk. Consider aiofiles
|
|
|
|
async def fetch_media(url, pathname):
    """
    Download media content of given URL and save it to disk.

    Parameters
    ----------
    url : str
        URL to download from.
    pathname : str
        Pathname (including filename) to save content to.

    Returns
    -------
    result : dict
        Response metadata on success, or error information.
    """
    network_settings = config.get_values('settings.toml', 'network')
    user_agent = (network_settings['user_agent'] or 'Slixfeed/0.1')
    headers = {'User-Agent': user_agent}
    proxy = (network_settings['http_proxy'] or None)
    timeout = ClientTimeout(total=10)
    async with ClientSession(headers=headers) as session:
        # async with ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, proxy=proxy,
                                   # proxy_auth=(proxy_username, proxy_password),
                                   timeout=timeout
                                   ) as response:
                status = response.status
                if status in (200, 201):
                    # Context manager guarantees the file is closed even
                    # if the write fails midway (the original left the
                    # handle open on error).
                    async with aiofiles.open(pathname, mode='wb') as f:
                        await f.write(await response.read())
                    try:
                        result = {'charset': response.charset,
                                  'content_length': response.content_length,
                                  'content_type': response.content_type,
                                  'error': False,
                                  'message': None,
                                  'original_url': url,
                                  'status_code': status,
                                  'response_url': response.url}
                    except Exception:
                        # Narrowed from a bare "except": only metadata
                        # extraction failures are handled here.
                        result = {'error': True,
                                  'message': 'Could not get document.',
                                  'original_url': url,
                                  'status_code': status,
                                  'response_url': response.url}
                else:
                    result = {'error': True,
                              'message': 'HTTP Error:' + str(status),
                              'original_url': url,
                              'status_code': status,
                              'response_url': response.url}
        except ClientError as e:
            result = {'error': True,
                      'message': 'Error:' + str(e) if e else 'ClientError',
                      'original_url': url,
                      'status_code': None}
        except TimeoutError as e:
            result = {'error': True,
                      'message': 'Timeout:' + str(e) if e else 'TimeoutError',
                      'original_url': url,
                      'status_code': None}
        except Exception as e:
            logger.error(e)
            result = {'error': True,
                      'message': 'Error:' + str(e) if e else 'Error',
                      'original_url': url,
                      'status_code': None}
    return result
|
|
|
|
|
|
|
|
|
|
|
|
def http_response(url):
    """
    Download response headers.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    response : requests.models.Response or None
        HTTP Header Response, or None when the request failed.

    Result would contain these:
        response.encoding
        response.headers
        response.history
        response.reason
        response.status_code
        response.url
    """
    agent = config.get_value(
        "settings", "Network", "user_agent") or 'Slixfeed/0.1'
    request_headers = {"User-Agent": agent}
    response = None
    try:
        # A HEAD request is not used here because too many sites deny it;
        # a full GET with redirects is issued instead.
        # response = requests.head(url, headers=request_headers, allow_redirects=True)
        response = requests.get(
            url, headers=request_headers, allow_redirects=True)
    except Exception as e:
        # Deliberate broad best-effort catch: any failure is logged and
        # reported to the caller as None.
        logger.warning('Error in HTTP response')
        logger.error(e)
        response = None
    return response
|
2024-02-04 18:08:12 +01:00
|
|
|
|
2024-02-18 00:21:44 +01:00
|
|
|
|
2024-01-11 11:55:42 +01:00
|
|
|
async def http(url):
    """
    Download content of given URL.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    result : dict
        Document and response metadata, or error information.
    """
    network_settings = config.get_values('settings.toml', 'network')
    user_agent = (network_settings['user_agent'] or 'Slixfeed/0.1')
    headers = {'User-Agent': user_agent}
    proxy = (network_settings['http_proxy'] or None)
    timeout = ClientTimeout(total=10)
    async with ClientSession(headers=headers) as session:
        # async with ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, proxy=proxy,
                                   # proxy_auth=(proxy_username, proxy_password),
                                   timeout=timeout
                                   ) as response:
                status = response.status
                if status == 200:
                    try:
                        document = await response.text()
                        result = {'charset': response.charset,
                                  'content': document,
                                  'content_length': response.content_length,
                                  'content_type': response.content_type,
                                  'error': False,
                                  'message': None,
                                  'original_url': url,
                                  'status_code': status,
                                  'response_url': response.url}
                    except Exception:
                        # Narrowed from a bare "except": decoding the body
                        # is the only operation expected to fail here.
                        result = {'error': True,
                                  'message': 'Could not get document.',
                                  'original_url': url,
                                  'status_code': status,
                                  'response_url': response.url}
                else:
                    result = {'error': True,
                              'message': 'HTTP Error:' + str(status),
                              'original_url': url,
                              'status_code': status,
                              'response_url': response.url}
        except ClientError as e:
            result = {'error': True,
                      'message': 'Error:' + str(e) if e else 'ClientError',
                      'original_url': url,
                      'status_code': None}
        except TimeoutError as e:
            result = {'error': True,
                      'message': 'Timeout:' + str(e) if e else 'TimeoutError',
                      'original_url': url,
                      'status_code': None}
        except Exception as e:
            logger.error(e)
            result = {'error': True,
                      'message': 'Error:' + str(e) if e else 'Error',
                      'original_url': url,
                      'status_code': None}
    return result
|
2024-01-13 18:17:43 +01:00
|
|
|
|
|
|
|
|
|
|
|
async def magnet(link):
|
|
|
|
m2t = Magnet2Torrent(link)
|
|
|
|
try:
|
|
|
|
filename, torrent_data = await m2t.retrieve_torrent()
|
|
|
|
except FailedToFetchException:
|
2024-06-13 17:53:53 +02:00
|
|
|
logger.debug("Failed")
|