Improve connectivity recovery

Schimon Jehudah 2023-12-24 18:37:05 +00:00
parent 0566589a9d
commit 56d0da9a76
5 changed files with 97 additions and 62 deletions

View file

@@ -13,42 +13,44 @@ FIXME
 TODO
-0) from slixfeed.FILENAME import XYZ
-   See project feed2toot
-1) SQL prepared statements.
-2) Machine Learning for scraping Title, Link, Summary and Timestamp.
-3) Set MUC subject
+1) from slixfeed.FILENAME import XYZ
+   See project /chaica/feed2toot
+2) SQL prepared statements;
+3) Machine Learning for scraping Title, Link, Summary and Timestamp;
+   Scrape element </article> (example: Liferea)
+   http://intertwingly.net/blog/
+   https://www.brandenburg.de/
+4) Set MUC subject
    Feeds whose entries are to be set as groupchat subject.
    Perhaps not, as it would require checking every feed for this setting.
-   Maybe a separate bot.
-4) Support categories.
-5) Default prepackaged list of feeds.
-6) XMPP commands.
-7) Bot as transport.
-8) OMEMO.
-9) Logging.
-10) Default feeds (e.g. Blacklisted News, TBOT etc.)
-11) Download and upload/send article (xHTML, xHTMLZ, Markdown, MHTML, TXT).
-    Use Readability.
+   Maybe a separate bot;
+5) Support categories;
+6) XMPP commands;
+7) Bot as transport;
+8) OMEMO;
+9) Logging;
+   https://docs.python.org/3/howto/logging.html
+10) Readability
+    See project /buriy/python-readability
+11) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, TXT).
 12) Fetch summary from URL, instead of storing summary, or
     store 5 upcoming summaries.
     This would help make the database files smaller.
 13) Support protocol Gopher
-    https://github.com/michael-lazar/pygopherd
-    https://github.com/gopherball/gb
+    See project /michael-lazar/pygopherd
+    See project /gopherball/gb
 14) Support ActivityPub @person@domain (see Tip Of The Day).
@@ -60,7 +62,11 @@ TODO
 16) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
-17) See project offpunk/offblocklist.py
+17) See project /offpunk/offblocklist.py
+18) Search messages of government-regulated publishers, and promote other sources.
+    Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
+    However, you might want to get news from (1) (2) and (3) instead!
 """

View file

@@ -19,14 +19,14 @@ from asyncio import TimeoutError
 from asyncio.exceptions import IncompleteReadError
 from bs4 import BeautifulSoup
 from confighandler import get_list, get_value_default
+from datetimehandler import now, rfc2822_to_iso8601
 from email.utils import parseaddr
 from feedparser import parse
 from http.client import IncompleteRead
-from lxml import html
-from datetimehandler import now, rfc2822_to_iso8601
-from urlhandler import complete_url, join_url, trim_url
 from listhandler import is_listed
+from lxml import html
 import sqlitehandler as sqlite
+from urlhandler import complete_url, join_url, trim_url
 from urllib import error
 # from xml.etree.ElementTree import ElementTree, ParseError
 from urllib.parse import urljoin, urlsplit, urlunsplit
@@ -202,8 +202,8 @@ async def download_updates(db_file, url=None):
                     print("PROBLEM: date is int")
                     print(date)
                     # breakpoint()
-            print(source)
-            print(date)
+            # print(source)
+            # print(date)
             await sqlite.add_entry_and_set_date(
                 db_file,
                 source,
@@ -261,9 +261,9 @@ async def view_feed(url):
         title = get_title(url, feed)
         entries = feed.entries
         msg = "Preview of {}:\n```\n".format(title)
-        count = 0
+        counter = 0
         for entry in entries:
-            count += 1
+            counter += 1
             if entry.has_key("title"):
                 title = entry.title
             else:
@@ -292,9 +292,9 @@ async def view_feed(url):
                 title,
                 date,
                 link,
-                count
+                counter
             )
-            if count > 4:
+            if counter > 4:
                 break
         msg += (
             "```\nSource: {}"
@@ -446,7 +446,7 @@ async def add_feed(db_file, url):
             title = get_title(url, feed)
             if feed.bozo:
                 bozo = (
-                    "Bozo detected. Failed to load: {}."
+                    "Bozo detected. Failed to load: {}"
                 ).format(url)
                 print(bozo)
                 msg = await probe_page(add_feed, url, res[0], db_file=db_file)
@@ -505,7 +505,7 @@ async def probe_page(callback, url, doc, num=None, db_file=None):
     elif isinstance(msg, list):
         url = msg[0]
         if db_file:
-            print("if db_file", db_file)
+            # print("if db_file", db_file)
             return await callback(db_file, url)
         elif num:
             return await callback(url, num)
@@ -531,6 +531,8 @@ async def download_feed(url):
         user_agent = await get_value_default("user-agent", "Network")
     except:
         user_agent = "Slixfeed/0.1"
+    if not len(user_agent):
+        user_agent = "Slixfeed/0.1"
     timeout = ClientTimeout(total=10)
     headers = {'User-Agent': user_agent}
     async with ClientSession(headers=headers) as session:
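The added guard covers a user-agent key that is present but empty, not only a missing one. A condensed sketch of the resulting fetch path, assuming aiohttp (where the ClientSession and ClientTimeout names above come from); note that a plain `if not user_agent:` would cover both None and the empty string in one test:

```python
# Condensed sketch of download_feed's header setup, assuming aiohttp;
# the fallback agent string matches the hunk above.
from aiohttp import ClientSession, ClientTimeout

async def fetch(url, user_agent=None):
    if not user_agent:  # covers both None and "" in one test
        user_agent = "Slixfeed/0.1"
    timeout = ClientTimeout(total=10)
    headers = {'User-Agent': user_agent}
    async with ClientSession(headers=headers) as session:
        async with session.get(url, timeout=timeout) as response:
            return await response.text()
```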
@@ -597,6 +599,8 @@ def get_title(url, feed):
         title = feed["feed"]["title"]
     except:
         title = urlsplit(url).netloc
+    if not title:
+        title = urlsplit(url).netloc
     return title
@@ -621,7 +625,7 @@ async def feed_mode_request(url, tree):
     """
     feeds = {}
     parted_url = urlsplit(url)
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         address = urlunsplit([
             parted_url.scheme,
@@ -693,7 +697,7 @@ async def feed_mode_request(url, tree):
             ).format(url)
     if not positive:
         msg = (
-            "No feeds were found for {}."
+            "No feeds were found for {}"
         ).format(url)
         return msg
     elif feeds:
@@ -721,17 +725,21 @@ async def feed_mode_scan(url, tree):
     feeds = {}
     # paths = []
     # TODO Test
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
-        xpath_query = "//a[contains(@href,'{}')]".format(path)
+        # xpath_query = "//a[contains(@href,'{}')]".format(path)
+        num = 5
+        xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
         addresses = tree.xpath(xpath_query)
+        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
+        addresses += tree.xpath(xpath_query)
         parted_url = urlsplit(url)
         # NOTE Should number of addresses be limited or
         #      perhaps be N from the start and N from the end
         for address in addresses:
-            print(address.xpath('@href')[0])
-            print(addresses)
+            # print(address.xpath('@href')[0])
+            # print(addresses)
             address = address.xpath('@href')[0]
             if "/" not in address:
                 protocol = parted_url.scheme
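The two position() filters implement the idea recorded in the NOTE above: take N anchors from the start and N from the end of the match list instead of every link on the page. A standalone illustration with lxml (the sample HTML is made up):

```python
# Standalone illustration of the XPath slicing above; the HTML is made up.
from lxml import html

doc = html.fromstring(
    "<body>"
    + "".join('<a href="/feed/{0}">feed {0}</a>'.format(i) for i in range(20))
    + "</body>"
)
num = 5
path = "/feed/"
# Parenthesizing the selection makes position() refer to the whole
# matched node set, so these take its first and last num members.
first = doc.xpath("(//a[contains(@href,'{}')])[position()<={}]".format(path, num))
last = doc.xpath("(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num))
print(len(first), len(last))  # 5 5
```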
@@ -759,11 +767,15 @@ async def feed_mode_scan(url, tree):
             if res[1] == 200:
                 try:
                     feeds[address] = parse(res[0])
+                    # print(feeds[address])
+                    # breakpoint()
                     # print(feeds)
                 except:
                     continue
     if len(feeds) > 1:
-        positive = 0
+        # print(feeds)
+        # breakpoint()
+        counter = 0
         msg = (
             "RSS URL scan has found {} feeds:\n```\n"
         ).format(len(feeds))
@@ -779,23 +791,32 @@ async def feed_mode_scan(url, tree):
                 feed_addr = feed
             feed_amnt = len(feeds[feed].entries)
             if feed_amnt:
-                positive = 1
+                # NOTE Because there could be many false positives
+                #      which are revealed in second phase of scan, we
+                #      could end with a single feed, which would be
+                #      listed instead of fetched, so feed_mark is
+                #      utilized in order to make fetch possible.
+                feed_mark = [feed_addr]
+                counter += 1
             msg += (
-                "Title: {}\n"
-                " Link: {}\n"
-                "Count: {}\n"
+                "Title : {}\n"
+                "Link : {}\n"
+                "Count : {}\n"
                 "\n"
             ).format(
                 feed_name,
                 feed_addr,
                 feed_amnt
             )
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-        ).format(url)
-        if not positive:
+        if counter > 1:
+            msg += (
+                "```\nThe above feeds were extracted from\n{}"
+            ).format(url)
+        elif feed_mark:
+            return feed_mark
+        else:
             msg = (
-                "No feeds were found for {}."
+                "No feeds were found for {}"
             ).format(url)
         return msg
     elif feeds:
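The feed_mark branch changes feed_mode_scan's return type on purpose: a string is a listing for the user, while a one-element list tells the caller to fetch that address directly, which is the contract probe_page already checks with isinstance in the earlier hunk. A reduced sketch of that contract (names follow the hunks above, bodies trimmed to the branching):

```python
# Reduced sketch of the contract between feed_mode_scan and its caller;
# this is an illustration, not the project's probe_page itself.
async def handle_scan_result(callback, result, db_file):
    if isinstance(result, list):
        # Exactly one live feed survived the scan: fetch it directly
        # rather than presenting a one-item menu.
        return await callback(db_file, result[0])
    # Otherwise the result is a human-readable message to send as-is.
    return result
```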

View file

@@ -471,7 +471,10 @@ async def get_entry_unread(db_file, num=None):
         title = result[1]
         summary = result[2]
         # Remove HTML tags
-        summary = BeautifulSoup(summary, "lxml").text
+        try:
+            summary = BeautifulSoup(summary, "lxml").text
+        except:
+            print(result[2])
         # TODO Limit text length
         summary = summary.replace("\n\n\n", "\n\n")
         length = await get_settings_value(db_file, "length")
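One caveat to this guard: when the except branch fires, summary keeps its raw value, so the summary.replace(...) call two lines later can still raise if result[2] is None. A slightly tighter variant (an assumption, not this commit's code) falls back to an empty string:

```python
# Hypothetical tighter variant of the guard above.
from bs4 import BeautifulSoup

def strip_html(summary):
    try:
        # BeautifulSoup raises on None input, so substitute "" first.
        return BeautifulSoup(summary or "", "lxml").text
    except Exception:
        return ""
```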

View file

@@ -78,7 +78,7 @@ await taskhandler.start_tasks(
 """
 async def start_tasks_xmpp(self, jid, tasks):
-    print("start_tasks_xmpp", jid, tasks)
+    # print("start_tasks_xmpp", jid, tasks)
     task_manager[jid] = {}
     for task in tasks:
         # print("task:", task)
@@ -109,7 +109,7 @@ async def start_tasks_xmpp(self, jid, tasks):
 #         await task
 async def clean_tasks_xmpp(jid, tasks):
-    print("clean_tasks_xmpp", jid, tasks)
+    # print("clean_tasks_xmpp", jid, tasks)
     for task in tasks:
         # if task_manager[jid][task]:
         try:
@@ -132,7 +132,7 @@ Consider callback e.g. Slixfeed.send_status.
 Or taskhandler for each protocol or specific taskhandler function.
 """
 async def task_jid(self, jid):
-    print("task_jid", jid)
+    # print("task_jid", jid)
     """
     JID (Jabber ID) task manager.
@@ -258,7 +258,7 @@ async def send_update(self, jid, num=None):
 async def send_status(self, jid):
-    print("send_status", jid)
+    # print("send_status", jid)
     # print(await current_time(), jid, "def send_status")
     """
     Send status message.
@@ -336,7 +336,7 @@ async def send_status(self, jid):
 async def refresh_task(self, jid, callback, key, val=None):
-    print("refresh_task", jid, key)
+    # print("refresh_task", jid, key)
     """
     Apply new setting at runtime.
@@ -382,7 +382,7 @@ async def refresh_task(self, jid, callback, key, val=None):
 # TODO Take this function out of
 #      <class 'slixmpp.clientxmpp.ClientXMPP'>
 async def check_updates(jid):
-    print("check_updates", jid)
+    # print("check_updates", jid)
     # print(await current_time(), jid, "def check_updates")
     """
     Start calling for update check up.
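All six hunks in this file silence tracing print calls by commenting them out. TODO item 9 above already plans a move to the logging module, which would make such traces switchable at runtime instead of requiring edits; a minimal sketch of that direction (an assumption, not part of this commit):

```python
# Hypothetical logging-based replacement for the commented-out prints.
import logging

logger = logging.getLogger("slixfeed.taskhandler")

async def start_tasks_xmpp(self, jid, tasks):
    # Emitted only when the application opts in, e.g. via
    # logging.basicConfig(level=logging.DEBUG); silent otherwise.
    logger.debug("start_tasks_xmpp %s %s", jid, tasks)
```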

View file

@@ -64,6 +64,7 @@ import listhandler as lister
 import sqlitehandler as sqlite
 import taskhandler as tasker
 import urlhandler as urlfixer
+from time import sleep
 from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
 # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
@@ -102,7 +103,6 @@ class Slixfeed(slixmpp.ClientXMPP):
         # The bot works fine when the nickname is hardcoded; or
         # The bot won't join some MUCs when its nickname has brackets
         self.nick = nick
-
         # The session_start event will be triggered when
         # the bot establishes its connection with the server
         # and the XML streams are ready for use. We want to
@@ -387,10 +387,12 @@ class Slixfeed(slixmpp.ClientXMPP):
         # print(current_time(), "Maximum connection attempts exceeded.")
         # logging.error("Maximum connection attempts exceeded.")
         print(current_time(), "Attempt number", self.connection_attempts)
-        self.reconnect(wait=5.0)
-        seconds = 5
+        seconds = 30
         print(current_time(), "Next attempt within", seconds, "seconds")
-        await asyncio.sleep(seconds)
+        # NOTE asyncio.sleep does not delay the retry as expected
+        # await asyncio.sleep(seconds)
+        sleep(seconds)
+        self.reconnect(wait=5.0)

     async def inspect_connection(self, event):
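Two things change here: the delay now runs before reconnect() instead of after it, and the wait became a blocking time.sleep, presumably because the awaited asyncio.sleep let other event-loop activity race the retry. The trade-off is that a blocking sleep stalls every task in the process for the full 30 seconds. A condensed sketch of the resulting flow, assuming a slixmpp ClientXMPP subclass that maintains self.connection_attempts:

```python
# Condensed sketch of the recovery flow after this change; assumes a
# slixmpp ClientXMPP subclass with a connection_attempts counter.
from time import sleep

def on_connection_failed(self, event):
    self.connection_attempts += 1
    seconds = 30
    # time.sleep blocks the whole event loop, so nothing can race the
    # retry during the pause; asyncio.sleep would yield to other tasks.
    sleep(seconds)
    # slixmpp's reconnect() schedules the new connection attempt.
    self.reconnect(wait=5.0)
```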
@@ -912,7 +914,7 @@ class Slixfeed(slixmpp.ClientXMPP):
                 ["status"]
             )
             task = (
-                "📫️ Processing request to fetch data from {} ..."
+                "📫️ Processing request to fetch data from {}"
             ).format(url)
             process_task_message(self, jid, task)
             action = await initdb(
@@ -1080,8 +1082,9 @@ class Slixfeed(slixmpp.ClientXMPP):
                     action = (
                         "Only new items of newly added feeds will be sent."
                     )
-                case _ if message_lowercase.startswith("next"):
-                    num = message[5:]
+                # TODO Will you add support for a number of messages?
+                case "next":
+                    # num = message[5:]
                     await tasker.clean_tasks_xmpp(
                         jid,
                         ["interval", "status"]
@@ -1137,13 +1140,15 @@ class Slixfeed(slixmpp.ClientXMPP):
                     else:
                         action = "Missing value."
                 case "random":
-                    action = "Updates will be sent randomly."
+                    # TODO /questions/2279706/select-random-row-from-a-sqlite-table
+                    # NOTE sqlitehandler.get_entry_unread
+                    action = "Updates will be sent in random order."
                 case _ if message_lowercase.startswith("read"):
                     data = message[5:]
                     data = data.split()
                     url = data[0]
                     task = (
-                        "📫️ Processing request to fetch data from {} ..."
+                        "📫️ Processing request to fetch data from {}"
                     ).format(url)
                     process_task_message(self, jid, task)
                     await tasker.clean_tasks_xmpp(
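Both command hunks rely on Python 3.10 structural pattern matching: a literal case like `case "next":` matches only the bare keyword, while `case _ if message_lowercase.startswith("read"):` uses a wildcard plus guard to match a command that carries an argument. A standalone illustration (the commands are made up):

```python
# Standalone illustration of the match/case dispatch style above.
def route(message):
    message_lowercase = message.lower()
    match message_lowercase:
        case "next":
            # Literal pattern: only the bare keyword matches.
            return "sending next item"
        case _ if message_lowercase.startswith("read"):
            # Wildcard with a guard: matches "read <url>" and leaves
            # the argument to be sliced off the original message.
            return "reading {}".format(message[5:])
        case _:
            return "unknown command"

print(route("read https://example.org"))  # reading https://example.org
```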