Improve connectivity recovery
parent 0566589a9d
commit 56d0da9a76
5 changed files with 97 additions and 62 deletions
@@ -13,42 +13,44 @@ FIXME

 TODO

-0) from slixfeed.FILENAME import XYZ
+1) from slixfeed.FILENAME import XYZ
-See project feed2toot
+See project /chaica/feed2toot

-1) SQL prepared statements.
+2) SQL prepared statements;

-2) Machine Learning for scrapping Title, Link, Summary and Timstamp.
+3) Machine Learning for scrapping Title, Link, Summary and Timstamp;
 Scrape element </article> (example: Liferea)
 http://intertwingly.net/blog/
 https://www.brandenburg.de/

-3) Set MUC subject
+4) Set MUC subject
 Feeds which entries are to be set as groupchat subject.
 Perhaps not, as it would require to check every feed for this setting.
-Maybe a separate bot.
+Maybe a separate bot;

-4) Support categories.
+5) Support categories;

-5) Default prepackaged list of feeds.
+6) XMPP commands;
-6) XMPP commands.
+7) Bot as transport;
-7) Bot as transport.
+8) OMEMO;
-8) OMEMO.
+9) Logging;
+https://docs.python.org/3/howto/logging.html
-9) Logging.
+10) Readability
+See project /buriy/python-readability

-10) Default feeds (e.g. Blacklisted News, TBOT etc.)
-11) Download and upload/send article (xHTML, xHTMLZ, Markdown, MHTML, TXT).
-Use Readability.
+11) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, TXT).

 12) Fetch summary from URL, instead of storing summary, or
 Store 5 upcoming summaries.
 This would help making the database files smaller.

 13) Support protocol Gopher
-https://github.com/michael-lazar/pygopherd
-https://github.com/gopherball/gb
+See project /michael-lazar/pygopherd
+See project /gopherball/gb

 14) Support ActivityPub @person@domain (see Tip Of The Day).
@@ -60,7 +62,11 @@ TODO

 16) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger

-17) See project offpunk/offblocklist.py
+17) See project /offpunk/offblocklist.py
+
+18) Search messages of government regulated publishers, and promote other sources.
+Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
+However, you might want to get news from (1) (2) and (3) instead!

 """
@@ -19,14 +19,14 @@ from asyncio import TimeoutError
 from asyncio.exceptions import IncompleteReadError
 from bs4 import BeautifulSoup
 from confighandler import get_list, get_value_default
+from datetimehandler import now, rfc2822_to_iso8601
 from email.utils import parseaddr
 from feedparser import parse
 from http.client import IncompleteRead
-from lxml import html
-from datetimehandler import now, rfc2822_to_iso8601
-from urlhandler import complete_url, join_url, trim_url
 from listhandler import is_listed
+from lxml import html
 import sqlitehandler as sqlite
+from urlhandler import complete_url, join_url, trim_url
 from urllib import error
 # from xml.etree.ElementTree import ElementTree, ParseError
 from urllib.parse import urljoin, urlsplit, urlunsplit
@@ -202,8 +202,8 @@ async def download_updates(db_file, url=None):
                 print("PROBLEM: date is int")
                 print(date)
                 # breakpoint()
-            print(source)
-            print(date)
+            # print(source)
+            # print(date)
             await sqlite.add_entry_and_set_date(
                 db_file,
                 source,
@@ -261,9 +261,9 @@ async def view_feed(url):
     title = get_title(url, feed)
     entries = feed.entries
     msg = "Preview of {}:\n```\n".format(title)
-    count = 0
+    counter = 0
     for entry in entries:
-        count += 1
+        counter += 1
         if entry.has_key("title"):
             title = entry.title
         else:
@@ -292,9 +292,9 @@ async def view_feed(url):
             title,
             date,
             link,
-            count
+            counter
         )
-        if count > 4:
+        if counter > 4:
             break
     msg += (
         "```\nSource: {}"
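The count → counter rename above is cosmetic; the behaviour is unchanged: a preview capped at five entries. A minimal standalone sketch of that pattern (the function name and message format here are illustrative, not from the codebase):

    import feedparser

    def preview(url, cap=5):
        feed = feedparser.parse(url)
        msg = "Preview of {}:\n".format(feed.feed.get("title", url))
        counter = 0
        for entry in feed.entries:
            counter += 1
            msg += "{}. {}\n".format(counter, entry.get("title", "*** No title ***"))
            if counter >= cap:  # same cutoff as "if counter > 4: break"
                break
        return msg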
@@ -446,7 +446,7 @@ async def add_feed(db_file, url):
     title = get_title(url, feed)
     if feed.bozo:
         bozo = (
-            "Bozo detected. Failed to load: {}."
+            "Bozo detected. Failed to load: {}"
         ).format(url)
         print(bozo)
         msg = await probe_page(add_feed, url, res[0], db_file=db_file)
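For context: feedparser does not raise on malformed XML; it sets the bozo flag and records the error, which is why add_feed checks feed.bozo before handing the document to probe_page. The check in isolation (load_feed is a made-up name):

    import feedparser

    def load_feed(url):
        feed = feedparser.parse(url)
        if feed.bozo:
            # bozo_exception holds the parser error behind the flag
            print("Bozo detected. Failed to load: {}".format(url))
            print(feed.bozo_exception)
            return None
        return feed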
@@ -505,7 +505,7 @@ async def probe_page(callback, url, doc, num=None, db_file=None):
     elif isinstance(msg, list):
         url = msg[0]
         if db_file:
-            print("if db_file", db_file)
+            # print("if db_file", db_file)
             return await callback(db_file, url)
         elif num:
             return await callback(url, num)
@@ -531,6 +531,8 @@ async def download_feed(url):
         user_agent = await get_value_default("user-agent", "Network")
     except:
         user_agent = "Slixfeed/0.1"
+    if not len(user_agent):
+        user_agent = "Slixfeed/0.1"
     timeout = ClientTimeout(total=10)
     headers = {'User-Agent': user_agent}
     async with ClientSession(headers=headers) as session:
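The two added lines close a gap the try/except missed: a lookup that succeeds but returns an empty string. A self-contained sketch of the resulting flow, assuming a get_value_default coroutine like the bot's confighandler helper (fetch is a made-up wrapper):

    from aiohttp import ClientSession, ClientTimeout

    async def fetch(url, get_value_default):
        try:
            user_agent = await get_value_default("user-agent", "Network")
        except Exception:
            user_agent = "Slixfeed/0.1"
        if not user_agent:  # an empty string falls back as well
            user_agent = "Slixfeed/0.1"
        timeout = ClientTimeout(total=10)
        headers = {'User-Agent': user_agent}
        async with ClientSession(headers=headers) as session:
            async with session.get(url, timeout=timeout) as response:
                return await response.text()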
@@ -597,6 +599,8 @@ def get_title(url, feed):
         title = feed["feed"]["title"]
     except:
         title = urlsplit(url).netloc
+    if not title:
+        title = urlsplit(url).netloc
     return title


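Same shape as the user-agent fix: the except branch only fires when the title key is missing, so a present-but-empty title needs its own guard. Sketched with a narrowed except (the diff itself keeps a bare except):

    from urllib.parse import urlsplit

    def get_title(url, feed):
        try:
            title = feed["feed"]["title"]
        except (KeyError, TypeError):
            title = urlsplit(url).netloc
        if not title:  # key present but empty
            title = urlsplit(url).netloc
        return title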
@@ -621,7 +625,7 @@ async def feed_mode_request(url, tree):
     """
     feeds = {}
     parted_url = urlsplit(url)
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         address = urlunsplit([
             parted_url.scheme,
@@ -693,7 +697,7 @@ async def feed_mode_request(url, tree):
     ).format(url)
     if not positive:
         msg = (
-            "No feeds were found for {}."
+            "No feeds were found for {}"
         ).format(url)
         return msg
     elif feeds:
@@ -721,17 +725,21 @@ async def feed_mode_scan(url, tree):
     feeds = {}
     # paths = []
     # TODO Test
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
-        xpath_query = "//a[contains(@href,'{}')]".format(path)
+        # xpath_query = "//a[contains(@href,'{}')]".format(path)
+        num = 5
+        xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
         addresses = tree.xpath(xpath_query)
+        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
+        addresses += tree.xpath(xpath_query)
         parted_url = urlsplit(url)
         # NOTE Should number of addresses be limited or
         # perhaps be N from the start and N from the end
         for address in addresses:
-            print(address.xpath('@href')[0])
-            print(addresses)
+            # print(address.xpath('@href')[0])
+            # print(addresses)
            address = address.xpath('@href')[0]
            if "/" not in address:
                protocol = parted_url.scheme
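Rather than collecting every matching anchor, the scan now takes only the first num and last num matches per path, which bounds the work on link-heavy pages and matches the NOTE about "N from the start and N from the end". The XPath trick in isolation, with throwaway HTML:

    from lxml import html

    tree = html.fromstring("""
    <body>
      <a href="/atom/news">1</a> <a href="/atom/sport">2</a>
      <a href="/atom/tech">3</a> <a href="/atom/arts">4</a>
      <a href="/atom/life">5</a> <a href="/atom/cars">6</a>
    </body>""")
    num = 2
    path = "atom"
    addresses = tree.xpath(
        "(//a[contains(@href,'{}')])[position()<={}]".format(path, num))
    addresses += tree.xpath(
        "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num))
    print([a.xpath('@href')[0] for a in addresses])
    # ['/atom/news', '/atom/sport', '/atom/life', '/atom/cars']

When fewer than 2 * num anchors match, the two slices overlap and produce duplicates, so a deduplication step may be worth adding.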
@@ -759,11 +767,15 @@ async def feed_mode_scan(url, tree):
             if res[1] == 200:
                 try:
                     feeds[address] = parse(res[0])
+                    # print(feeds[address])
                     # breakpoint()
+                    # print(feeds)
                 except:
                     continue
     if len(feeds) > 1:
-        positive = 0
+        # print(feeds)
+        # breakpoint()
+        counter = 0
         msg = (
             "RSS URL scan has found {} feeds:\n```\n"
         ).format(len(feeds))
@@ -779,23 +791,32 @@ async def feed_mode_scan(url, tree):
             feed_addr = feed
             feed_amnt = len(feeds[feed].entries)
             if feed_amnt:
-                positive = 1
+                # NOTE Because there could be many false positives
+                # which are revealed in second phase of scan, we
+                # could end with a single feed, which would be
+                # listed instead of fetched, so feed_mark is
+                # utilized in order to make fetch possible.
+                feed_mark = [feed_addr]
+                counter += 1
             msg += (
-                "Title: {}\n"
-                " Link: {}\n"
-                "Count: {}\n"
+                "Title : {}\n"
+                "Link : {}\n"
+                "Count : {}\n"
                 "\n"
             ).format(
                 feed_name,
                 feed_addr,
                 feed_amnt
             )
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-        ).format(url)
-        if not positive:
+        if counter > 1:
+            msg += (
+                "```\nThe above feeds were extracted from\n{}"
+            ).format(url)
+        elif feed_mark:
+            return feed_mark
+        else:
             msg = (
-                "No feeds were found for {}."
+                "No feeds were found for {}"
             ).format(url)
             return msg
     elif feeds:
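The new NOTE explains feed_mark: the second scan phase can whittle many candidates down to one valid feed, and a single hit should be fetched directly rather than merely listed. The decision structure reduced to a sketch (select_feeds and validate are stand-ins for the bot's own code):

    def select_feeds(feeds, validate):
        counter = 0
        feed_mark = None
        listing = ""
        for addr in feeds:
            if validate(feeds[addr]):
                feed_mark = [addr]  # remember the latest positive hit
                counter += 1
                listing += addr + "\n"
        if counter > 1:
            return listing      # several feeds: list them for the user
        elif feed_mark:
            return feed_mark    # exactly one: caller fetches it
        else:
            return "No feeds were found"

Note that feed_mark needs an initial value (None here); unless it is initialized outside the hunks shown, the elif feed_mark: test could raise NameError when nothing validates.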
@@ -471,7 +471,10 @@ async def get_entry_unread(db_file, num=None):
         title = result[1]
         summary = result[2]
         # Remove HTML tags
-        summary = BeautifulSoup(summary, "lxml").text
+        try:
+            summary = BeautifulSoup(summary, "lxml").text
+        except:
+            print(result[2])
         # TODO Limit text length
         summary = summary.replace("\n\n\n", "\n\n")
         length = await get_settings_value(db_file, "length")
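Wrapping the tag-stripping call means one malformed summary no longer aborts the whole update. The same pattern standalone:

    from bs4 import BeautifulSoup

    def clean_summary(summary):
        # Remove HTML tags; keep the raw text if parsing fails
        try:
            summary = BeautifulSoup(summary, "lxml").text
        except Exception:
            pass
        return summary.replace("\n\n\n", "\n\n")

    print(clean_summary("<p>Hello <b>world</b></p>"))  # Hello world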
@@ -78,7 +78,7 @@ await taskhandler.start_tasks(

 """
 async def start_tasks_xmpp(self, jid, tasks):
-    print("start_tasks_xmpp", jid, tasks)
+    # print("start_tasks_xmpp", jid, tasks)
     task_manager[jid] = {}
     for task in tasks:
         # print("task:", task)
@@ -109,7 +109,7 @@ async def start_tasks_xmpp(self, jid, tasks):
         # await task

 async def clean_tasks_xmpp(jid, tasks):
-    print("clean_tasks_xmpp", jid, tasks)
+    # print("clean_tasks_xmpp", jid, tasks)
     for task in tasks:
         # if task_manager[jid][task]:
         try:
@@ -132,7 +132,7 @@ Consider callback e.g. Slixfeed.send_status.
 Or taskhandler for each protocol or specific taskhandler function.
 """
 async def task_jid(self, jid):
-    print("task_jid", jid)
+    # print("task_jid", jid)
     """
     JID (Jabber ID) task manager.

@@ -258,7 +258,7 @@ async def send_update(self, jid, num=None):


 async def send_status(self, jid):
-    print("send_status", jid)
+    # print("send_status", jid)
     # print(await current_time(), jid, "def send_status")
     """
     Send status message.
@@ -336,7 +336,7 @@ async def send_status(self, jid):


 async def refresh_task(self, jid, callback, key, val=None):
-    print("refresh_task", jid, key)
+    # print("refresh_task", jid, key)
     """
     Apply new setting at runtime.

@@ -382,7 +382,7 @@ async def refresh_task(self, jid, callback, key, val=None):
 # TODO Take this function out of
 # <class 'slixmpp.clientxmpp.ClientXMPP'>
 async def check_updates(jid):
-    print("check_updates", jid)
+    # print("check_updates", jid)
     # print(await current_time(), jid, "def check_updates")
     """
     Start calling for update check up.
@@ -64,6 +64,7 @@ import listhandler as lister
 import sqlitehandler as sqlite
 import taskhandler as tasker
 import urlhandler as urlfixer
+from time import sleep

 from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
 # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
@@ -102,7 +103,6 @@ class Slixfeed(slixmpp.ClientXMPP):
         # The bot works fine when the nickname is hardcoded; or
         # The bot won't join some MUCs when its nickname has brackets
         self.nick = nick
-
         # The session_start event will be triggered when
         # the bot establishes its connection with the server
         # and the XML streams are ready for use. We want to
@@ -387,10 +387,12 @@ class Slixfeed(slixmpp.ClientXMPP):
         # print(current_time(),"Maximum connection attempts exceeded.")
         # logging.error("Maximum connection attempts exceeded.")
         print(current_time(), "Attempt number", self.connection_attempts)
-        self.reconnect(wait=5.0)
-        seconds = 5
+        seconds = 30
         print(current_time(), "Next attempt within", seconds, "seconds")
-        await asyncio.sleep(seconds)
+        # NOTE asyncio.sleep doesn't interval as expected
+        # await asyncio.sleep(seconds)
+        sleep(seconds)
+        self.reconnect(wait=5.0)


     async def inspect_connection(self, event):
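This is the change the commit is named for: wait 30 seconds instead of 5, pause with blocking time.sleep because (per the new NOTE) asyncio.sleep did not pace the retries as expected, and only then call reconnect. The shape of the handler as a sketch; the enclosing method's name is not visible in this hunk, so the class and method below are assumptions, while reconnect(wait=...) is real slixmpp API:

    from time import sleep

    class ReconnectSketch:
        connection_attempts = 0

        async def on_connection_failure(self, event):
            self.connection_attempts += 1
            print("Attempt number", self.connection_attempts)
            seconds = 30
            print("Next attempt within", seconds, "seconds")
            # Blocking sleep stalls the whole event loop for 30 s; the
            # commit trades that for a reliable pause between attempts.
            sleep(seconds)
            self.reconnect(wait=5.0)  # provided by slixmpp's ClientXMPP

Blocking the loop also freezes every other task for those 30 seconds, so this reads like a stopgap rather than a final design.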
@@ -912,7 +914,7 @@ class Slixfeed(slixmpp.ClientXMPP):
         ["status"]
     )
     task = (
-        "📫️ Processing request to fetch data from {} ..."
+        "📫️ Processing request to fetch data from {}"
     ).format(url)
     process_task_message(self, jid, task)
     action = await initdb(
@@ -1080,8 +1082,9 @@ class Slixfeed(slixmpp.ClientXMPP):
     action = (
         "Only new items of newly added feeds will be sent."
     )
-case _ if message_lowercase.startswith("next"):
-    num = message[5:]
+# TODO Will you add support for number of messages?
+case "next":
+    # num = message[5:]
     await tasker.clean_tasks_xmpp(
         jid,
         ["interval", "status"]
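With the numeric argument shelved (see the new TODO), the guarded pattern collapses to a plain literal. Both match styles in miniature (Python 3.10+; the routing strings are invented for the example):

    def route(message):
        message_lowercase = message.lower()
        match message_lowercase:
            case "next":  # exact literal, as in the new code
                return "send next unread item"
            case _ if message_lowercase.startswith("read"):
                return "fetch " + message[5:]  # guard + slice, as "read" still uses
            case _:
                return "unknown command"

    print(route("next"))                     # send next unread item
    print(route("read http://example.org"))  # fetch http://example.org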
@@ -1137,13 +1140,15 @@ class Slixfeed(slixmpp.ClientXMPP):
     else:
         action = "Missing value."
 case "random":
-    action = "Updates will be sent randomly."
+    # TODO /questions/2279706/select-random-row-from-a-sqlite-table
+    # NOTE sqlitehandler.get_entry_unread
+    action = "Updates will be sent by random order."
 case _ if message_lowercase.startswith("read"):
     data = message[5:]
     data = data.split()
     url = data[0]
     task = (
-        "📫️ Processing request to fetch data from {} ..."
+        "📫️ Processing request to fetch data from {}"
     ).format(url)
     process_task_message(self, jid, task)
     await tasker.clean_tasks_xmpp(
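For the "random order" TODO, the StackOverflow thread referenced in the comment points at SQLite's ORDER BY RANDOM(). A sketch against an assumed schema (table and column names are illustrative; sqlitehandler's real schema is not shown in this diff):

    import sqlite3

    def get_random_unread(db_file):
        with sqlite3.connect(db_file) as conn:
            cur = conn.cursor()
            cur.execute(
                "SELECT title, link FROM entries "
                "WHERE read = 0 ORDER BY RANDOM() LIMIT 1")
            return cur.fetchone()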