Slixfeed/slixfeed/datahandler.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1) feed_mode_scan doesn't find feed for https://www.blender.org/
even though it should be according to the pathnames dictionary.
2023-11-23 17:55:36 +01:00
TODO
1) Support Gemini and Gopher.
"""
2023-10-24 16:43:14 +02:00
import aiohttp
import asyncio
import feedparser

import sqlitehandler
import confighandler
import datetimehandler
import listhandler
from asyncio.exceptions import IncompleteReadError
from http.client import IncompleteRead
from urllib import error
from bs4 import BeautifulSoup
# from xml.etree.ElementTree import ElementTree, ParseError
from urllib.parse import urljoin
from urllib.parse import urlsplit
from urllib.parse import urlunsplit
from lxml import html
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def download_updates(db_file, url=None):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : list, optional
List containing a single URL. The default is None.
"""
if url:
urls = [url] # Valid [url] and [url,] and (url,)
else:
urls = await sqlitehandler.get_feeds_url(db_file)
for url in urls:
# print(os.path.basename(db_file), url[0])
source = url[0]
res = await download_feed(source)
# TypeError: 'NoneType' object is not subscriptable
if res is None:
# Skip to next feed
# urls.next()
# next(urls)
continue
await sqlitehandler.update_source_status(
db_file,
res[1],
source
)
if res[0]:
try:
feed = feedparser.parse(res[0])
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(source)
# print(bozo)
valid = 0
else:
valid = 1
await sqlitehandler.update_source_validity(
db_file,
source,
valid)
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
pass
# NOTE I don't think there should be "return"
# because then we might stop scanning next URLs
# return
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
if res[1] == 200:
entries = feed.entries
# length = len(entries)
# await sqlitehandler.remove_entry(db_file, source, length)
await sqlitehandler.remove_nonexistent_entries(
db_file,
feed,
source
)
# new_entry = 0
for entry in entries:
if entry.has_key("title"):
# title = entry.title
title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
else:
title = feed["feed"]["title"]
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = await join_url(source, entry.link)
link = await trim_url(link)
else:
link = source
if entry.has_key("id"):
eid = entry.id
else:
eid = link
# TODO Pass date too for comparison check
if entry.has_key("published"):
date = entry.published
date = await datetimehandler.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = await datetimehandler.rfc2822_to_iso8601(date)
else:
# TODO Just set date = "*** No date ***"
# date = await datetime.now().isoformat()
date = await datetimehandler.now()
# NOTE Would seconds result in better database performance
# date = datetime.datetime(date)
# date = (date-datetime.datetime(1970,1,1)).total_seconds()
exist = await sqlitehandler.check_entry_exist(
db_file,
source,
eid=eid,
title=title,
link=link,
date=date
)
if not exist:
# new_entry = new_entry + 1
# TODO Enhance summary
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
summary = summary[:300] + " ‍⃨"
summary = summary.strip().split('\n')
summary = ["> " + line for line in summary]
summary = "\n".join(summary)
else:
summary = "> *** No summary ***"
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title,
summary,
pathname
)
allow_list = await listhandler.is_listed(
db_file,
"filter-allow",
string
)
if not allow_list:
reject_list = await listhandler.is_listed(
db_file,
"filter-deny",
string
)
if reject_list:
# print(">>> REJECTED", title)
summary = (
"REJECTED {}".format(
reject_list.upper()
)
)
# summary = ""
read_status = 1
entry = (
title,
summary,
link,
eid,
source,
date,
read_status
)
await sqlitehandler.add_entry_and_set_date(
db_file,
source,
entry
)
# print(await datetimehandler.current_time(), entry, title)
# else:
# print(await datetimehandler.current_time(), exist, title)
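
# Illustrative usage of download_updates (not called from here; the database
# path is a placeholder). Note that the optional second argument is a list
# holding a single URL:
#     await download_updates("slixfeed.db")
#     await download_updates("slixfeed.db", ["https://example.org/feed.xml"])
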
# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_feed(url):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
Returns
-------
msg : str
Feed content or error message.
"""
result = await download_feed(url)
if result[0]:
try:
feed = feedparser.parse(result[0])
if feed.bozo:
# msg = (
# ">{}\n"
# "WARNING: Bozo detected!\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
msg = await probe_page(view_feed, url, result[0])
return msg
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
msg = (
"> {}\n"
"Error: {}"
).format(url, e)
return msg
if result[1] == 200:
title = await get_title(url, result[0])
entries = feed.entries
msg = "Preview of {}:\n```\n".format(title)
count = 0
for entry in entries:
count += 1
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = await join_url(url, entry.link)
link = await trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = await datetimehandler.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = await datetimehandler.rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
msg += (
"Title : {}\n"
"Date : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(
title,
date,
link,
count
)
if count > 4:
break
msg += (
"```\nSource: {}"
).format(url)
else:
msg = (
">{}\nFailed to load URL. Reason: {}"
).format(url, result[1])
return msg
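
# Example (illustrative only; the URL is a placeholder):
#     msg = await view_feed("https://example.org/feed.xml")
# returns a preview of up to five entries of the feed, or an error
# message if the URL could not be loaded or parsed.
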
# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_entry(url, num):
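"""
View a single entry of a feed.
Parameters
----------
url : str
URL.
num : str or int
Entry number, starting from 1.
Returns
-------
msg : str
Entry title, summary and link, or error message.
"""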
result = await download_feed(url)
if result[0]:
try:
feed = feedparser.parse(result[0])
if feed.bozo:
# msg = (
# ">{}\n"
# "WARNING: Bozo detected!\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
msg = await probe_page(view_entry, url, result[0], num)
return msg
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
msg = (
"> {}\n"
"Error: {}"
).format(url, e)
return msg
if result[1] == 200:
feed = feedparser.parse(result[0])
title = await get_title(url, result[0])
entries = feed.entries
num = int(num) - 1
entry = entries[num]
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("published"):
date = entry.published
date = await datetimehandler.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = await datetimehandler.rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
else:
summary = "*** No summary ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = await join_url(url, entry.link)
link = await trim_url(link)
else:
link = "*** No link ***"
msg = (
"{}\n"
"\n"
"{}\n"
"\n"
"{}\n"
"\n"
).format(
title,
summary,
link
)
else:
msg = (
">{}\n"
"Failed to load URL. Reason: {}\n"
"Try again momentarily."
).format(url, result[1])
return msg
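
# Example (illustrative only): view the second entry of a feed.
#     msg = await view_entry("https://example.org/feed.xml", 2)
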
async def add_feed_no_check(db_file, data):
"""
Add given feed without validity check.
Parameters
----------
db_file : str
Path to database file.
data : str
URL or URL and Title.
Returns
-------
msg : str
Status message.
"""
url = data[0]
title = data[1]
url = await trim_url(url)
exist = await sqlitehandler.check_feed_exist(db_file, url)
if not exist:
msg = await sqlitehandler.add_feed(db_file, url, title)
await download_updates(db_file, [url])
else:
ix = exist[0]
name = exist[1]
msg = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
return msg
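
# Example (illustrative only; database path, URL and title are placeholders):
#     await add_feed_no_check(
#         "slixfeed.db", ["https://example.org/feed.xml", "Example feed"])
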
async def add_feed(db_file, url):
"""
Check whether the feed exists; if not, fetch and add it.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
Returns
-------
msg : str
Status message.
"""
msg = None
url = await trim_url(url)
exist = await sqlitehandler.check_feed_exist(db_file, url)
if not exist:
res = await download_feed(url)
if res[0]:
feed = feedparser.parse(res[0])
title = await get_title(url, feed)
if feed.bozo:
bozo = (
"Bozo detected. Failed to load: {}."
).format(url)
print(bozo)
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
else:
status = res[1]
msg = await sqlitehandler.add_feed(
db_file,
url,
title,
status
)
await download_updates(db_file, [url])
else:
status = res[1]
msg = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
else:
ix = exist[0]
name = exist[1]
msg = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
return msg
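
# Example (illustrative only; database path and URL are placeholders):
#     msg = await add_feed("slixfeed.db", "https://example.org")
# If the address is not itself a feed, probe_page attempts auto-discovery,
# a page scan and pathname requests before giving up.
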
# TODO callback for use with add_feed and view_feed
async def probe_page(callback, url, doc, num=None, db_file=None):
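"""
Attempt to discover feeds for a URL whose document is not a feed.
Tries RSS auto-discovery first, then a scan of the page for
feed-like links, then requests against common feed pathnames.
When a single feed is found, the given callback is called again
with the discovered URL.
Parameters
----------
callback : coroutine
Coroutine to call with the discovered URL
(add_feed, view_feed or view_entry).
url : str
URL.
doc : str
HTML document of the URL.
num : str, optional
Entry number, passed to callback. The default is None.
db_file : str, optional
Path to database file, passed to callback. The default is None.
Returns
-------
msg : str
Message with URLs, or the result of the callback.
"""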
msg = None
try:
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(doc)
except:
msg = (
"> {}\nFailed to parse URL as feed."
).format(url)
if not msg:
print("RSS Auto-Discovery Engaged")
msg = await feed_mode_auto_discovery(url, tree)
if not msg:
print("RSS Scan Mode Engaged")
msg = await feed_mode_scan(url, tree)
if not msg:
print("RSS Arbitrary Mode Engaged")
msg = await feed_mode_request(url, tree)
if not msg:
msg = (
"> {}\nNo news feeds were found for URL."
).format(url)
# elif msg:
else:
if isinstance(msg, str):
return msg
elif isinstance(msg, list):
url = msg[0]
if db_file:
return await callback(db_file, url)
elif num:
return await callback(url, num)
else:
return await callback(url)
async def download_feed(url):
"""
Download content of given URL.
2023-11-02 06:17:04 +01:00
Parameters
----------
url : str
URL.
Returns
-------
msg: list or str
Document or error message.
2023-10-24 16:43:14 +02:00
"""
timeout = aiohttp.ClientTimeout(total=10)
async with aiohttp.ClientSession() as session:
# async with aiohttp.ClientSession(trust_env=True) as session:
try:
async with session.get(url, timeout=timeout) as response:
status = response.status
if response.status == 200:
try:
doc = await response.text()
# print (response.content_type)
msg = [
doc,
status
]
except:
# msg = [
# False,
# ("The content of this document "
# "doesn't appear to be textual."
# )
# ]
msg = [
False,
"Document is too large or is not textual."
]
else:
msg = [
False,
"HTTP Error: " + str(status)
]
except aiohttp.ClientError as e:
# print('Error', str(e))
msg = [
False,
"Error: " + str(e)
]
except asyncio.TimeoutError as e:
# print('Timeout:', str(e))
msg = [
False,
"Timeout: " + str(e)
]
return msg
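
# The returned value is always a two-item list; for example
# (the URL below is a placeholder):
#     res = await download_feed("https://example.org/feed.xml")
#     res[0]  # document text, or False on failure
#     res[1]  # HTTP status code (200), or an error description
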
async def get_title(url, feed):
"""
Get title of feed.
Parameters
----------
url : str
URL.
feed : dict
Parsed feed document.
Returns
-------
title : str
Title or URL hostname.
"""
try:
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
return title
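
# Example (illustrative only): the title falls back to the URL hostname
# when the parsed feed carries no title.
#     feed = feedparser.parse(res[0])
#     title = await get_title("https://example.org/feed.xml", feed)
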
# NOTE Read the documentation
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
def complete_url(source, link):
"""
Check if URL is pathname and complete it into URL.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith("www."):
return "http://" + link
parted_link = urlsplit(link)
parted_feed = urlsplit(source)
if parted_link.scheme == "magnet" and parted_link.query:
return link
if parted_link.scheme and parted_link.netloc:
return link
if link.startswith("//"):
if parted_link.netloc and parted_link.path:
new_link = urlunsplit([
parted_feed.scheme,
parted_link.netloc,
parted_link.path,
parted_link.query,
parted_link.fragment
])
elif link.startswith("/"):
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
parted_link.path,
parted_link.query,
parted_link.fragment
])
elif link.startswith("../"):
pathlink = parted_link.path.split("/")
pathfeed = parted_feed.path.split("/")
for i in pathlink:
if i == "..":
if pathlink.index("..") == 0:
pathfeed.pop()
else:
break
while pathlink.count(".."):
if pathlink.index("..") == 0:
pathlink.remove("..")
else:
break
pathlink = "/".join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
"/".join(pathfeed),
parted_link.query,
parted_link.fragment
])
else:
pathlink = parted_link.path.split("/")
pathfeed = parted_feed.path.split("/")
if link.startswith("./"):
pathlink.remove(".")
if not source.endswith("/"):
pathfeed.pop()
pathlink = "/".join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
"/".join(pathfeed),
parted_link.query,
parted_link.fragment
])
return new_link
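
# Example (illustrative only): an absolute pathname is completed with the
# scheme and host of the feed URL.
#     complete_url("https://example.org/blog/feed.xml", "/posts/1.html")
#     # -> "https://example.org/posts/1.html"
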
"""
TODO
Feed https://www.ocaml.org/feed.xml
Link %20https://frama-c.com/fc-versions/cobalt.html%20
FIXME
Feed https://cyber.dabamos.de/blog/feed.rss
Link https://cyber.dabamos.de/blog/#article-2022-07-15
"""
async def join_url(source, link):
"""
Join base URL with given pathname.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith("www."):
new_link = "http://" + link
elif link.startswith("%20") and link.endswith("%20"):
old_link = link.split("%20")
del old_link[0]
old_link.pop()
new_link = "".join(old_link)
else:
new_link = urljoin(source, link)
return new_link
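
# Examples (illustrative only): a relative pathname is resolved against
# the feed URL, and a link starting with "www." gets an http:// prefix.
#     await join_url("https://example.org/blog/", "feed.xml")
#     # -> "https://example.org/blog/feed.xml"
#     await join_url("https://example.org/", "www.example.net/feed.xml")
#     # -> "http://www.example.net/feed.xml"
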
async def trim_url(url):
"""
Replace double slashes in the URL path with single slashes.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
while "//" in pathname:
pathname = pathname.replace("//", "/")
url = urlunsplit([
protocol,
hostname,
pathname,
queries,
fragment
])
return url
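
# Example (illustrative only): duplicate slashes in the path are collapsed.
#     await trim_url("https://example.org//news//feed.xml")
#     # -> "https://example.org/news/feed.xml"
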
# TODO Improve scan by gradually decreasing the path
async def feed_mode_request(url, tree):
"""
Lookup for feeds by pathname using HTTP Requests.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
parted_url = urlsplit(url)
paths = confighandler.get_list()
for path in paths:
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
# print(feedparser.parse(res[0])["feed"]["title"])
# feeds[address] = feedparser.parse(res[0])["feed"]["title"]
try:
title = feedparser.parse(res[0])["feed"]["title"]
except:
title = '*** No Title ***'
feeds[address] = title
# Check whether URL has path (i.e. not root)
path_parts = parted_url.path.split('/')
if len(path_parts) > 1 and path_parts[1]:
    if '.rss' not in paths:
        paths.extend([".atom", ".feed", ".rdf", ".rss"])
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
parted_url.path.split('/')[1] + path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = feedparser.parse(res[0])
# print(feeds)
except:
continue
if len(feeds) > 1:
positive = 0
msg = (
"RSS URL discovery has found {} feeds:\n```\n"
).format(len(feeds))
for feed in feeds:
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
# AttributeError: 'str' object has no attribute 'entries'
try:
feed_amnt = len(feeds[feed].entries)
except:
continue
if feed_amnt:
positive = 1
msg += (
"Title: {}\n"
"Link : {}\n"
"Items: {}\n"
"\n"
).format(
feed_name,
feed_addr,
feed_amnt
)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
if not positive:
msg = (
"No feeds were found for {}."
).format(url)
return msg
elif feeds:
# NOTE probe_page expects a list and takes the first address
return list(feeds)
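
# Rough picture of what this mode probes (the actual pathnames come from
# confighandler.get_list(); the addresses below are only hypothetical):
#     https://example.org/feed
#     https://example.org/rss.xml
#     https://example.org/atom.xml
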
async def feed_mode_scan(url, tree):
"""
Scan page for potential feeds by pathname.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
# paths = []
# TODO Test
paths = confighandler.get_list()
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
xpath_query = "//a[contains(@href,'{}')]".format(path)
addresses = tree.xpath(xpath_query)
parted_url = urlsplit(url)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
# print(address.xpath('@href')[0])
# print(addresses)
address = address.xpath('@href')[0]
if "/" not in address:
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
if address.startswith('/'):
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = feedparser.parse(res[0])
# print(feeds)
except:
continue
if len(feeds) > 1:
positive = 0
msg = (
"RSS URL scan has found {} feeds:\n```\n"
).format(len(feeds))
for feed in feeds:
# try:
# res = await download_feed(feed)
# except:
# continue
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
feed_amnt = len(feeds[feed].entries)
if feed_amnt:
positive = 1
msg += (
"Title: {}\n"
" Link: {}\n"
"Count: {}\n"
"\n"
).format(
feed_name,
feed_addr,
feed_amnt
)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
if not positive:
msg = (
"No feeds were found for {}."
).format(url)
return msg
elif feeds:
# NOTE probe_page expects a list and takes the first address
return list(feeds)
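
# feed_mode_scan matches anchor elements whose href contains one of the
# configured pathnames; for instance, assuming a pathname such as "feed"
# is configured, illustrative markup like
#     <a href="/blog/feed.xml">Subscribe</a>
# would match, and the resulting address is then verified with an HTTP
# request before it is reported.
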
async def feed_mode_auto_discovery(url, tree):
"""
Lookup for feeds using RSS autodiscovery technique.
See: https://www.rssboard.org/rss-autodiscovery
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
xpath_query = (
'//link[(@rel="alternate") and '
'(@type="application/atom+xml" or '
'@type="application/rdf+xml" or '
'@type="application/rss+xml")]'
)
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
if len(feeds) > 1:
msg = (
"RSS Auto-Discovery has found {} feeds:\n```\n"
).format(len(feeds))
for feed in feeds:
# # The following code works;
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await download_feed(feed)
# if res[0]:
# disco = feedparser.parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
feed_name = feed.xpath('@title')[0]
feed_addr = await join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
return msg
elif feeds:
feed_addr = await join_url(url, feeds[0].xpath('@href')[0])
return [feed_addr]
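
# Auto-discovery matches <link> elements in the page, such as this
# illustrative markup:
#     <link rel="alternate" type="application/rss+xml"
#           title="Example" href="/feed.xml"/>
# The href is joined with the page URL, so relative addresses work too.
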
async def feed_to_http(url):
"""
Replace the feed URI scheme with http.
Parameters
----------
url : str
URL.
Returns
-------
new_url : str
URL.
"""
par_url = urlsplit(url)
new_url = urlunsplit([
"http",
par_url.netloc,
par_url.path,
par_url.query,
par_url.fragment
])
return new_url
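
# Example (illustrative only):
#     await feed_to_http("feed://example.org/feed.xml")
#     # -> "http://example.org/feed.xml"
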
async def activitypub_to_http(namespace):
"""
Replace the scheme of an ActivityPub namespace with http.
Parameters
----------
namespace : str
Namespace.
Returns
-------
new_url : str
URL.
"""
par_url = urlsplit(namespace)
new_url = urlunsplit([
"http",
par_url.netloc,
par_url.path,
par_url.query,
par_url.fragment
])
return new_url