From 56d0da9a76540c1eabb477675aa31bb1fa17abfe Mon Sep 17 00:00:00 2001
From: Schimon Jehudah <sjehuda@yandex.com>
Date: Sun, 24 Dec 2023 18:37:05 +0000
Subject: [PATCH] Improve connectivity recovery

---
 slixfeed/__main__.py      | 44 +++++++++++++----------
 slixfeed/datahandler.py   | 75 +++++++++++++++++++++++++--------------
 slixfeed/sqlitehandler.py |  5 ++-
 slixfeed/taskhandler.py   | 12 +++----
 slixfeed/xmpphandler.py   | 23 +++++++-----
 5 files changed, 97 insertions(+), 62 deletions(-)
diff --git a/slixfeed/__main__.py b/slixfeed/__main__.py
index 7f95840..86e3b5d 100644
--- a/slixfeed/__main__.py
+++ b/slixfeed/__main__.py
@@ -13,42 +13,44 @@ FIXME
 
 TODO
 
-0) from slixfeed.FILENAME import XYZ
-      See project feed2toot
+1) from slixfeed.FILENAME import XYZ
+   See project /chaica/feed2toot
 
-1) SQL prepared statements.
+2) SQL prepared statements;
 
-2) Machine Learning for scrapping Title, Link, Summary and Timstamp.
+3) Machine Learning for scraping Title, Link, Summary and Timestamp;
+   Scrape element </article> (example: Liferea)
+   http://intertwingly.net/blog/
+   https://www.brandenburg.de/
 
-3) Set MUC subject
+4) Set MUC subject
    Feeds which entries are to be set as groupchat subject.
    Perhaps not, as it would require to check every feed for this setting.
-   Maybe a separate bot.
+   Maybe a separate bot;
 
-4) Support categories.
+5) Support categories;
 
-5) Default prepackaged list of feeds.
+6) XMPP commands;
 
-6) XMPP commands.
+7) Bot as transport;
 
-7) Bot as transport.
+8) OMEMO;
 
-8) OMEMO.
+9) Logging;
+   https://docs.python.org/3/howto/logging.html
 
-9) Logging.
+10) Readability
+    See project /buriy/python-readability
 
-10) Default feeds (e.g. Blacklisted News, TBOT etc.)
-
-11) Download and upload/send article (xHTML, xHTMLZ, Markdown, MHTML, TXT).
-    Use Readability.
+11) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, TXT).
 
 12) Fetch summary from URL, instead of storing summary, or
     Store 5 upcoming summaries.
     This would help making the database files smaller.
 
 13) Support protocol Gopher
-    https://github.com/michael-lazar/pygopherd
-    https://github.com/gopherball/gb
+    See project /michael-lazar/pygopherd
+    See project /gopherball/gb
 
 14) Support ActivityPub @person@domain (see Tip Of The Day).
 
@@ -60,7 +62,11 @@ TODO
 
 16) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
     
-17) See project offpunk/offblocklist.py
+17) See project /offpunk/offblocklist.py
+
+18) Search messages of government-regulated publishers and promote other sources.
+    Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
+    However, you might want to get news from (1) (2) and (3) instead!
 
 """
 
diff --git a/slixfeed/datahandler.py b/slixfeed/datahandler.py
index 5860ba2..86f5850 100644
--- a/slixfeed/datahandler.py
+++ b/slixfeed/datahandler.py
@@ -19,14 +19,14 @@ from asyncio import TimeoutError
 from asyncio.exceptions import IncompleteReadError
 from bs4 import BeautifulSoup
 from confighandler import get_list, get_value_default
+from datetimehandler import now, rfc2822_to_iso8601
 from email.utils import parseaddr
 from feedparser import parse
 from http.client import IncompleteRead
-from lxml import html
-from datetimehandler import now, rfc2822_to_iso8601
-from urlhandler import complete_url, join_url, trim_url
 from listhandler import is_listed
+from lxml import html
 import sqlitehandler as sqlite
+from urlhandler import complete_url, join_url, trim_url
 from urllib import error
 # from xml.etree.ElementTree import ElementTree, ParseError
 from urllib.parse import urljoin, urlsplit, urlunsplit
@@ -202,8 +202,8 @@ async def download_updates(db_file, url=None):
                         print("PROBLEM: date is int")
                         print(date)
                         # breakpoint()
-                    print(source)
-                    print(date)
+                    # print(source)
+                    # print(date)
                     await sqlite.add_entry_and_set_date(
                         db_file,
                         source,
@@ -261,9 +261,9 @@ async def view_feed(url):
         title = get_title(url, feed)
         entries = feed.entries
         msg = "Preview of {}:\n```\n".format(title)
-        count = 0
+        counter = 0
         for entry in entries:
-            count += 1
+            counter += 1
             if entry.has_key("title"):
                 title = entry.title
             else:
@@ -292,9 +292,9 @@ async def view_feed(url):
                     title,
                     date,
                     link,
-                    count
+                    counter
                     )
-            if count > 4:
+            if counter > 4:
                 break
         msg += (
             "```\nSource: {}"
@@ -446,7 +446,7 @@ async def add_feed(db_file, url):
             title = get_title(url, feed)
             if feed.bozo:
                 bozo = (
-                    "Bozo detected. Failed to load: {}."
+                    "Bozo detected. Failed to load: {}"
                     ).format(url)
                 print(bozo)
                 msg = await probe_page(add_feed, url, res[0], db_file=db_file)
@@ -505,7 +505,7 @@ async def probe_page(callback, url, doc, num=None, db_file=None):
         elif isinstance(msg, list):
             url = msg[0]
             if db_file:
-                print("if db_file", db_file)
+                # print("if db_file", db_file)
                 return await callback(db_file, url)
             elif num:
                 return await callback(url, num)
@@ -531,6 +531,8 @@ async def download_feed(url):
         user_agent = await get_value_default("user-agent", "Network")
     except:
         user_agent = "Slixfeed/0.1"
+    if not len(user_agent):
+        user_agent = "Slixfeed/0.1"
     timeout = ClientTimeout(total=10)
     headers = {'User-Agent': user_agent}
     async with ClientSession(headers=headers) as session:
@@ -597,6 +599,8 @@ def get_title(url, feed):
         title = feed["feed"]["title"]
     except:
         title = urlsplit(url).netloc
+    if not title:
+        title = urlsplit(url).netloc
     return title
 
 
@@ -621,7 +625,7 @@ async def feed_mode_request(url, tree):
     """
     feeds = {}
     parted_url = urlsplit(url)
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         address = urlunsplit([
             parted_url.scheme,
@@ -693,7 +697,7 @@ async def feed_mode_request(url, tree):
             ).format(url)
         if not positive:
             msg = (
-                "No feeds were found for {}."
+                "No feeds were found for {}"
                 ).format(url)
         return msg
     elif feeds:
@@ -721,17 +725,21 @@ async def feed_mode_scan(url, tree):
     feeds = {}
     # paths = []
     # TODO Test
-    paths = await get_list("pathnames")
+    paths = await get_list("pathnames", "lists.yaml")
     for path in paths:
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
-        xpath_query = "//a[contains(@href,'{}')]".format(path)
+        # xpath_query = "//a[contains(@href,'{}')]".format(path)
+        num = 5
+        xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
         addresses = tree.xpath(xpath_query)
+        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
+        addresses += tree.xpath(xpath_query)
         parted_url = urlsplit(url)
         # NOTE Should number of addresses be limited or
         # perhaps be N from the start and N from the end
         for address in addresses:
-            print(address.xpath('@href')[0])
-            print(addresses)
+            # print(address.xpath('@href')[0])
+            # print(addresses)
             address = address.xpath('@href')[0]
             if "/" not in address:
                 protocol = parted_url.scheme
@@ -759,11 +767,15 @@ async def feed_mode_scan(url, tree):
             if res[1] == 200:
                 try:
                     feeds[address] = parse(res[0])
+                    # print(feeds[address])
+                    # breakpoint()
                     # print(feeds)
                 except:
                     continue
     if len(feeds) > 1:
-        positive = 0
+        # print(feeds)
+        # breakpoint()
+        counter = 0
         msg = (
             "RSS URL scan has found {} feeds:\n```\n"
             ).format(len(feeds))
@@ -779,23 +791,32 @@ async def feed_mode_scan(url, tree):
             feed_addr = feed
             feed_amnt = len(feeds[feed].entries)
             if feed_amnt:
-                positive = 1
+                # NOTE The second phase of the scan may reveal many
+                # false positives and leave only a single valid feed.
+                # feed_mark keeps the address of that feed so that it
+                # can be fetched instead of merely being listed.
+                # FIXME feed_mark looks unbound if no feed has entries; confirm.
+                feed_mark = [feed_addr]
+                counter += 1
                 msg += (
-                    "Title: {}\n"
-                    " Link: {}\n"
-                    "Count: {}\n"
+                    "Title : {}\n"
+                    "Link  : {}\n"
+                    "Count : {}\n"
                     "\n"
                     ).format(
                         feed_name,
                         feed_addr,
                         feed_amnt
                         )
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-            ).format(url)
-        if not positive:
+        if counter > 1:
+            msg += (
+                "```\nThe above feeds were extracted from\n{}"
+                ).format(url)
+        elif feed_mark:
+            return feed_mark
+        else:
             msg = (
-                "No feeds were found for {}."
+                "No feeds were found for {}"
                 ).format(url)
         return msg
     elif feeds:
diff --git a/slixfeed/sqlitehandler.py b/slixfeed/sqlitehandler.py
index d3a3abc..452d83e 100644
--- a/slixfeed/sqlitehandler.py
+++ b/slixfeed/sqlitehandler.py
@@ -471,7 +471,10 @@ async def get_entry_unread(db_file, num=None):
             title = result[1]
             summary = result[2]
             # Remove HTML tags
-            summary = BeautifulSoup(summary, "lxml").text
+            try:
+                summary = BeautifulSoup(summary, "lxml").text
+            except:
+                print(result[2])
             # TODO Limit text length
             summary = summary.replace("\n\n\n", "\n\n")
             length = await get_settings_value(db_file, "length")
diff --git a/slixfeed/taskhandler.py b/slixfeed/taskhandler.py
index 9fe519c..d3a51f1 100644
--- a/slixfeed/taskhandler.py
+++ b/slixfeed/taskhandler.py
@@ -78,7 +78,7 @@ await taskhandler.start_tasks(
 
 """
 async def start_tasks_xmpp(self, jid, tasks):
-    print("start_tasks_xmpp", jid, tasks)
+    # print("start_tasks_xmpp", jid, tasks)
     task_manager[jid] = {}
     for task in tasks:
         # print("task:", task)
@@ -109,7 +109,7 @@ async def start_tasks_xmpp(self, jid, tasks):
     #     await task
 
 async def clean_tasks_xmpp(jid, tasks):
-    print("clean_tasks_xmpp", jid, tasks)
+    # print("clean_tasks_xmpp", jid, tasks)
     for task in tasks:
         # if task_manager[jid][task]:
         try:
@@ -132,7 +132,7 @@ Consider callback e.g. Slixfeed.send_status.
 Or taskhandler for each protocol or specific taskhandler function.
 """
 async def task_jid(self, jid):
-    print("task_jid", jid)
+    # print("task_jid", jid)
     """
     JID (Jabber ID) task manager.
 
@@ -258,7 +258,7 @@ async def send_update(self, jid, num=None):
 
 
 async def send_status(self, jid):
-    print("send_status", jid)
+    # print("send_status", jid)
     # print(await current_time(), jid, "def send_status")
     """
     Send status message.
@@ -336,7 +336,7 @@ async def send_status(self, jid):
 
 
 async def refresh_task(self, jid, callback, key, val=None):
-    print("refresh_task", jid, key)
+    # print("refresh_task", jid, key)
     """
     Apply new setting at runtime.
 
@@ -382,7 +382,7 @@ async def refresh_task(self, jid, callback, key, val=None):
 # TODO Take this function out of
 # <class 'slixmpp.clientxmpp.ClientXMPP'>
 async def check_updates(jid):
-    print("check_updates", jid)
+    # print("check_updates", jid)
     # print(await current_time(), jid, "def check_updates")
     """
     Start calling for update check up.
diff --git a/slixfeed/xmpphandler.py b/slixfeed/xmpphandler.py
index 0052a73..1596435 100644
--- a/slixfeed/xmpphandler.py
+++ b/slixfeed/xmpphandler.py
@@ -64,6 +64,7 @@ import listhandler as lister
 import sqlitehandler as sqlite
 import taskhandler as tasker
 import urlhandler as urlfixer
+from time import sleep
 
 from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
 # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
@@ -102,7 +103,6 @@ class Slixfeed(slixmpp.ClientXMPP):
         # The bot works fine when the nickname is hardcoded; or
         # The bot won't join some MUCs when its nickname has brackets
         self.nick = nick
-
         # The session_start event will be triggered when
         # the bot establishes its connection with the server
         # and the XML streams are ready for use. We want to
@@ -387,10 +387,12 @@ class Slixfeed(slixmpp.ClientXMPP):
         #     print(current_time(),"Maximum connection attempts exceeded.")
         #     logging.error("Maximum connection attempts exceeded.")
         print(current_time(), "Attempt number", self.connection_attempts)
-        self.reconnect(wait=5.0)
-        seconds = 5
+        seconds = 30
         print(current_time(), "Next attempt within", seconds, "seconds")
-        await asyncio.sleep(seconds)
+        # NOTE asyncio.sleep did not wait as expected; FIXME time.sleep blocks the event loop
+        # await asyncio.sleep(seconds)
+        sleep(seconds)
+        self.reconnect(wait=5.0)
 
 
     async def inspect_connection(self, event):
@@ -912,7 +914,7 @@ class Slixfeed(slixmpp.ClientXMPP):
                         ["status"]
                         )
                     task = (
-                        "📫️ Processing request to fetch data from {} ..."
+                        "📫️ Processing request to fetch data from {}"
                         ).format(url)
                     process_task_message(self, jid, task)
                     action = await initdb(
@@ -1080,8 +1082,9 @@ class Slixfeed(slixmpp.ClientXMPP):
                     action = (
                         "Only new items of newly added feeds will be sent."
                         )
-                case _ if message_lowercase.startswith("next"):
-                    num = message[5:]
+                # TODO Will you add support for number of messages?
+                case "next":
+                    # num = message[5:]
                     await tasker.clean_tasks_xmpp(
                         jid,
                         ["interval", "status"]
@@ -1137,13 +1140,15 @@ class Slixfeed(slixmpp.ClientXMPP):
                     else:
                         action = "Missing value."
                 case "random":
-                    action = "Updates will be sent randomly."
+                    # TODO /questions/2279706/select-random-row-from-a-sqlite-table
+                    # NOTE sqlitehandler.get_entry_unread
+                    action = "Updates will be sent by random order."
                 case _ if message_lowercase.startswith("read"):
                     data = message[5:]
                     data = data.split()
                     url = data[0]
                     task = (
-                        "📫️ Processing request to fetch data from {} ..."
+                        "📫️ Processing request to fetch data from {}"
                         ).format(url)
                     process_task_message(self, jid, task)
                     await tasker.clean_tasks_xmpp(