1
0
Fork 0
forked from sch/Slixfeed

Improve connectivity recovery

This commit is contained in:
Schimon Jehudah 2023-12-24 18:37:05 +00:00
parent 0566589a9d
commit 56d0da9a76
5 changed files with 97 additions and 62 deletions

View file

@ -13,42 +13,44 @@ FIXME
TODO
0) from slixfeed.FILENAME import XYZ
See project feed2toot
1) from slixfeed.FILENAME import XYZ
See project /chaica/feed2toot
1) SQL prepared statements.
2) SQL prepared statements;
2) Machine Learning for scraping Title, Link, Summary and Timestamp.
3) Machine Learning for scraping Title, Link, Summary and Timestamp;
Scrape element </article> (example: Liferea)
http://intertwingly.net/blog/
https://www.brandenburg.de/
3) Set MUC subject
4) Set MUC subject
Feeds whose entries are to be set as groupchat subject.
Perhaps not, as it would require checking every feed for this setting.
Maybe a separate bot.
Maybe a separate bot;
4) Support categories.
5) Support categories;
5) Default prepackaged list of feeds.
6) XMPP commands;
6) XMPP commands.
7) Bot as transport;
7) Bot as transport.
8) OMEMO;
8) OMEMO.
9) Logging;
https://docs.python.org/3/howto/logging.html
9) Logging.
10) Readability
See project /buriy/python-readability
10) Default feeds (e.g. Blacklisted News, TBOT etc.)
11) Download and upload/send article (xHTML, xHTMLZ, Markdown, MHTML, TXT).
Use Readability.
11) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, TXT).
12) Fetch summary from URL, instead of storing summary, or
Store 5 upcoming summaries.
This would help make the database files smaller.
13) Support protocol Gopher
https://github.com/michael-lazar/pygopherd
https://github.com/gopherball/gb
See project /michael-lazar/pygopherd
See project /gopherball/gb
14) Support ActivityPub @person@domain (see Tip Of The Day).
@ -60,7 +62,11 @@ TODO
16) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
17) See project offpunk/offblocklist.py
17) See project /offpunk/offblocklist.py
18) Search messages of government regulated publishers, and promote other sources.
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
However, you might want to get news from (1) (2) and (3) instead!
"""

View file

@ -19,14 +19,14 @@ from asyncio import TimeoutError
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from confighandler import get_list, get_value_default
from datetimehandler import now, rfc2822_to_iso8601
from email.utils import parseaddr
from feedparser import parse
from http.client import IncompleteRead
from lxml import html
from datetimehandler import now, rfc2822_to_iso8601
from urlhandler import complete_url, join_url, trim_url
from listhandler import is_listed
from lxml import html
import sqlitehandler as sqlite
from urlhandler import complete_url, join_url, trim_url
from urllib import error
# from xml.etree.ElementTree import ElementTree, ParseError
from urllib.parse import urljoin, urlsplit, urlunsplit
@ -202,8 +202,8 @@ async def download_updates(db_file, url=None):
print("PROBLEM: date is int")
print(date)
# breakpoint()
print(source)
print(date)
# print(source)
# print(date)
await sqlite.add_entry_and_set_date(
db_file,
source,
@ -261,9 +261,9 @@ async def view_feed(url):
title = get_title(url, feed)
entries = feed.entries
msg = "Preview of {}:\n```\n".format(title)
count = 0
counter = 0
for entry in entries:
count += 1
counter += 1
if entry.has_key("title"):
title = entry.title
else:
@ -292,9 +292,9 @@ async def view_feed(url):
title,
date,
link,
count
counter
)
if count > 4:
if counter > 4:
break
msg += (
"```\nSource: {}"
@ -446,7 +446,7 @@ async def add_feed(db_file, url):
title = get_title(url, feed)
if feed.bozo:
bozo = (
"Bozo detected. Failed to load: {}."
"Bozo detected. Failed to load: {}"
).format(url)
print(bozo)
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
@ -505,7 +505,7 @@ async def probe_page(callback, url, doc, num=None, db_file=None):
elif isinstance(msg, list):
url = msg[0]
if db_file:
print("if db_file", db_file)
# print("if db_file", db_file)
return await callback(db_file, url)
elif num:
return await callback(url, num)
@ -531,6 +531,8 @@ async def download_feed(url):
user_agent = await get_value_default("user-agent", "Network")
except:
user_agent = "Slixfeed/0.1"
if not len(user_agent):
user_agent = "Slixfeed/0.1"
timeout = ClientTimeout(total=10)
headers = {'User-Agent': user_agent}
async with ClientSession(headers=headers) as session:
@ -597,6 +599,8 @@ def get_title(url, feed):
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
if not title:
title = urlsplit(url).netloc
return title
@ -621,7 +625,7 @@ async def feed_mode_request(url, tree):
"""
feeds = {}
parted_url = urlsplit(url)
paths = await get_list("pathnames")
paths = await get_list("pathnames", "lists.yaml")
for path in paths:
address = urlunsplit([
parted_url.scheme,
@ -693,7 +697,7 @@ async def feed_mode_request(url, tree):
).format(url)
if not positive:
msg = (
"No feeds were found for {}."
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
@ -721,17 +725,21 @@ async def feed_mode_scan(url, tree):
feeds = {}
# paths = []
# TODO Test
paths = await get_list("pathnames")
paths = await get_list("pathnames", "lists.yaml")
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
xpath_query = "//a[contains(@href,'{}')]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
addresses += tree.xpath(xpath_query)
parted_url = urlsplit(url)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
print(address.xpath('@href')[0])
print(addresses)
# print(address.xpath('@href')[0])
# print(addresses)
address = address.xpath('@href')[0]
if "/" not in address:
protocol = parted_url.scheme
@ -759,11 +767,15 @@ async def feed_mode_scan(url, tree):
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds[address])
# breakpoint()
# print(feeds)
except:
continue
if len(feeds) > 1:
positive = 0
# print(feeds)
# breakpoint()
counter = 0
msg = (
"RSS URL scan has found {} feeds:\n```\n"
).format(len(feeds))
@ -779,7 +791,13 @@ async def feed_mode_scan(url, tree):
feed_addr = feed
feed_amnt = len(feeds[feed].entries)
if feed_amnt:
positive = 1
# NOTE Because many candidates may turn out to be false
# positives in the second phase of the scan, we could
# end up with a single feed, which would be listed
# instead of fetched; feed_mark is therefore kept so
# that the single remaining feed can be fetched.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title : {}\n"
"Link : {}\n"
@ -790,12 +808,15 @@ async def feed_mode_scan(url, tree):
feed_addr,
feed_amnt
)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
if not positive:
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}."
"No feeds were found for {}"
).format(url)
return msg
elif feeds:

View file

@ -471,7 +471,10 @@ async def get_entry_unread(db_file, num=None):
title = result[1]
summary = result[2]
# Remove HTML tags
try:
summary = BeautifulSoup(summary, "lxml").text
except:
print(result[2])
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
length = await get_settings_value(db_file, "length")

View file

@ -78,7 +78,7 @@ await taskhandler.start_tasks(
"""
async def start_tasks_xmpp(self, jid, tasks):
print("start_tasks_xmpp", jid, tasks)
# print("start_tasks_xmpp", jid, tasks)
task_manager[jid] = {}
for task in tasks:
# print("task:", task)
@ -109,7 +109,7 @@ async def start_tasks_xmpp(self, jid, tasks):
# await task
async def clean_tasks_xmpp(jid, tasks):
print("clean_tasks_xmpp", jid, tasks)
# print("clean_tasks_xmpp", jid, tasks)
for task in tasks:
# if task_manager[jid][task]:
try:
@ -132,7 +132,7 @@ Consider callback e.g. Slixfeed.send_status.
Or taskhandler for each protocol or specific taskhandler function.
"""
async def task_jid(self, jid):
print("task_jid", jid)
# print("task_jid", jid)
"""
JID (Jabber ID) task manager.
@ -258,7 +258,7 @@ async def send_update(self, jid, num=None):
async def send_status(self, jid):
print("send_status", jid)
# print("send_status", jid)
# print(await current_time(), jid, "def send_status")
"""
Send status message.
@ -336,7 +336,7 @@ async def send_status(self, jid):
async def refresh_task(self, jid, callback, key, val=None):
print("refresh_task", jid, key)
# print("refresh_task", jid, key)
"""
Apply new setting at runtime.
@ -382,7 +382,7 @@ async def refresh_task(self, jid, callback, key, val=None):
# TODO Take this function out of
# <class 'slixmpp.clientxmpp.ClientXMPP'>
async def check_updates(jid):
print("check_updates", jid)
# print("check_updates", jid)
# print(await current_time(), jid, "def check_updates")
"""
Start calling for update check up.

View file

@ -64,6 +64,7 @@ import listhandler as lister
import sqlitehandler as sqlite
import taskhandler as tasker
import urlhandler as urlfixer
from time import sleep
from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
# from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
@ -102,7 +103,6 @@ class Slixfeed(slixmpp.ClientXMPP):
# The bot works fine when the nickname is hardcoded; or
# The bot won't join some MUCs when its nickname has brackets
self.nick = nick
# The session_start event will be triggered when
# the bot establishes its connection with the server
# and the XML streams are ready for use. We want to
@ -387,10 +387,12 @@ class Slixfeed(slixmpp.ClientXMPP):
# print(current_time(),"Maximum connection attempts exceeded.")
# logging.error("Maximum connection attempts exceeded.")
print(current_time(), "Attempt number", self.connection_attempts)
self.reconnect(wait=5.0)
seconds = 5
seconds = 30
print(current_time(), "Next attempt within", seconds, "seconds")
await asyncio.sleep(seconds)
# NOTE asyncio.sleep does not wait for the expected interval here
# await asyncio.sleep(seconds)
sleep(seconds)
self.reconnect(wait=5.0)
async def inspect_connection(self, event):
@ -912,7 +914,7 @@ class Slixfeed(slixmpp.ClientXMPP):
["status"]
)
task = (
"📫️ Processing request to fetch data from {} ..."
"📫️ Processing request to fetch data from {}"
).format(url)
process_task_message(self, jid, task)
action = await initdb(
@ -1080,8 +1082,9 @@ class Slixfeed(slixmpp.ClientXMPP):
action = (
"Only new items of newly added feeds will be sent."
)
case _ if message_lowercase.startswith("next"):
num = message[5:]
# TODO Will you add support for number of messages?
case "next":
# num = message[5:]
await tasker.clean_tasks_xmpp(
jid,
["interval", "status"]
@ -1137,13 +1140,15 @@ class Slixfeed(slixmpp.ClientXMPP):
else:
action = "Missing value."
case "random":
action = "Updates will be sent randomly."
# TODO /questions/2279706/select-random-row-from-a-sqlite-table
# NOTE sqlitehandler.get_entry_unread
action = "Updates will be sent by random order."
case _ if message_lowercase.startswith("read"):
data = message[5:]
data = data.split()
url = data[0]
task = (
"📫️ Processing request to fetch data from {} ..."
"📫️ Processing request to fetch data from {}"
).format(url)
process_task_message(self, jid, task)
await tasker.clean_tasks_xmpp(