Improve code of module crawl.py

2024-01-09 12:34:10 +00:00 · 2024-01-09 12:34:10 +00:00 · 9709c052ee
commit 9709c052ee
parent 956ce69fcb
4 changed files with 267 additions and 267 deletions
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@ -81,7 +81,7 @@ def is_feed(feed):
        True or False.
    """
    value = False
-    message = None
+    # message = None
    if not feed.entries:
        if "version" in feed.keys():
            feed["version"]
@ -110,7 +110,6 @@ def is_feed(feed):
        # message = (
        #     "Good feed for {}"
        #     ).format(url)
-    print(message)
    return value


@ -402,15 +401,11 @@ async def add_feed(db_file, url):
                else:
                    result = await crawl.probe_page(
                        url, document)
-                    # TODO Check length and for a write a
-                    # unified message for a set of feeds.
-                    # Use logging if you so choose to
-                    # distinct the methods
-                    if isinstance(result, list):
-                        url = result[0]
-                    elif isinstance(result, str):
+                    if isinstance(result, str):
                        response = result
                        break
+                    else:
+                        url = result[0]
            else:
                response = (
                    "> {}\nFailed to load URL.  Reason: {}"
@ -480,15 +475,11 @@ async def view_feed(url):
            else:
                result = await crawl.probe_page(
                    url, document)
-                # TODO Check length and for a write a
-                # unified message for a set of feeds.
-                # Use logging if you so choose to
-                # distinct the methods
-                if isinstance(result, list):
-                    url = result[0]
-                elif isinstance(result, str):
+                if isinstance(result, str):
                    response = result
                    break
+                else:
+                    url = result[0]
        else:
            response = (
                "> {}\nFailed to load URL.  Reason: {}"
@ -553,15 +544,11 @@ async def view_entry(url, num):
            else:
                result = await crawl.probe_page(
                    url, document)
-                # TODO Check length and for a write a
-                # unified message for a set of feeds.
-                # Use logging if you so choose to
-                # distinct the methods
-                if isinstance(result, list):
-                    url = result[0]
-                elif isinstance(result, str):
+                if isinstance(result, str):
                    response = result
                    break
+                else:
+                    url = result[0]
        else:
            response = (
                "> {}\nFailed to load URL.  Reason: {}"
@ -660,8 +647,11 @@ async def scan(db_file, url):
                        db_file, "filter-deny", string)
                    if reject_list:
                        read_status = 1
+                        logging.debug(
+                            "Rejected due to keyword {}".format(reject_list))
                if isinstance(date, int):
-                    logging.error("Variable 'date' is int:", date)
+                    logging.error(
+                        "Variable 'date' is int: {}".format(date))
                await sqlite.add_entry(
                    db_file, title, link, entry_id,
                    url, date, read_status)
@ -723,7 +713,7 @@ async def organize_items(db_file, urls):
                    IncompleteRead,
                    error.URLError
                    ) as e:
-                print(e)
+                logging.error(e)
                # TODO Print error to log
                # None
                # NOTE I don't think there should be "return"
--- a/slixfeed/crawl.py
+++ b/slixfeed/crawl.py
@ -19,6 +19,7 @@ TODO

 from aiohttp import ClientError, ClientSession, ClientTimeout
 from feedparser import parse
+import logging
 from lxml import html
 import slixfeed.config as config
 from slixfeed.fetch import download_feed
@ -88,15 +89,20 @@ async def probe_page(url, document):
            "> {}\nFailed to parse URL as feed."
            ).format(url)
    if not result:
-        print("RSS Auto-Discovery Engaged")
+        logging.debug(
+            "Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
-        print("RSS Scan Mode Engaged")
+        logging.debug(
+            "Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
-        print("RSS Arbitrary Mode Engaged")
-        result = await feed_mode_request(url, tree)
+        logging.debug(
+            "Feed arbitrary mode engaged for {}".format(url))
+        result = await feed_mode_guess(url, tree)
    if not result:
+        logging.debug(
+            "No feeds were found for {}".format(url))
        result = (
            "> {}\nNo news feeds were found for URL."
            ).format(url)
@ -104,7 +110,7 @@ async def probe_page(url, document):


 # TODO Improve scan by gradually shortening the path
-async def feed_mode_request(url, tree):
+async def feed_mode_guess(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

@ -122,94 +128,26 @@ async def feed_mode_request(url, tree):
    msg : str
        Message with URLs.
    """
-    feeds = {}
+    urls = []
    parted_url = urlsplit(url)
    paths = config.get_list("lists.yaml", "pathnames")
+    # Check whether URL has path (i.e. not root)
+    # Check parted_url.path to avoid error in case root wasn't given
+    # TODO Make more tests
+    if parted_url.path and parted_url.path.split('/')[1]:
+        paths.extend(
+            [".atom", ".feed", ".rdf", ".rss"]
+            ) if '.rss' not in paths else -1
+        # if paths.index('.rss'):
+        #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
    for path in paths:
-        address = urlunsplit([
-            parted_url.scheme,
-            parted_url.netloc,
-            path,
-            None,
-            None
-            ])
-        res = await download_feed(address)
-        if res[1] == 200:
-            # print(parse(res[0])["feed"]["title"])
-            # feeds[address] = parse(res[0])["feed"]["title"]
-            try:
-                title = parse(res[0])["feed"]["title"]
-            except:
-                title = '*** No Title ***'
-            feeds[address] = title
-        # Check whether URL has path (i.e. not root)
-        # Check parted_url.path to avoid error in case root wasn't given
-        # TODO Make more tests
-        if parted_url.path and parted_url.path.split('/')[1]:
-            paths.extend(
-                [".atom", ".feed", ".rdf", ".rss"]
-                ) if '.rss' not in paths else -1
-            # if paths.index('.rss'):
-            #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
-            address = urlunsplit([
-                parted_url.scheme,
-                parted_url.netloc,
-                parted_url.path.split('/')[1] + path,
-                None,
-                None
-                ])
-            res = await download_feed(address)
-            if res[1] == 200:
-                try:
-                    feeds[address] = parse(res[0])
-                    # print(feeds)
-                except:
-                    continue
-    # TODO return feeds
-    if len(feeds) > 1:
-        counter = 0
-        msg = (
-            "RSS URL discovery has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
-        feed_mark = 0
-        for feed in feeds:
-            try:
-                feed_name = feeds[feed]["feed"]["title"]
-            except:
-                feed_name = urlsplit(feed).netloc
-            feed_addr = feed
-            # AttributeError: 'str' object has no attribute 'entries'
-            try:
-                feed_amnt = len(feeds[feed].entries)
-            except:
-                continue
-            if feed_amnt:
-                # NOTE Because there could be many false positives
-                # which are revealed in second phase of scan, we
-                # could end with a single feed, which would be
-                # listed instead of fetched, so feed_mark is
-                # utilized in order to make fetch possible.
-                feed_mark = [feed_addr]
-                counter += 1
-                msg += (
-                    "Title: {}\n"
-                    "Link : {}\n"
-                    "Items: {}\n"
-                    "\n"
-                    ).format(feed_name, feed_addr, feed_amnt)
-        if counter > 1:
-            msg += (
-                "```\nThe above feeds were extracted from\n{}"
-                ).format(url)
-        elif feed_mark:
-            return feed_mark
-        else:
-            msg = (
-                "No feeds were found for {}"
-                ).format(url)
-        return msg
-    elif feeds:
-        return feeds
+        address = join_url(url, parted_url.path.split('/')[1] + path)
+        if address not in urls:
+            urls.extend([address])
+    # breakpoint()
+    # print("feed_mode_guess")
+    urls = await process_feed_selection(url, urls)
+    return urls


 async def feed_mode_scan(url, tree):
@ -230,9 +168,7 @@ async def feed_mode_scan(url, tree):
    msg : str
        Message with URLs.
    """
-    feeds = {}
-    # paths = []
-    # TODO Test
+    urls = []
    paths = config.get_list("lists.yaml", "pathnames")
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
@ -242,91 +178,16 @@ async def feed_mode_scan(url, tree):
        addresses = tree.xpath(xpath_query)
        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
        addresses += tree.xpath(xpath_query)
-        parted_url = urlsplit(url)
        # NOTE Should number of addresses be limited or
        # perhaps be N from the start and N from the end
        for address in addresses:
-            # print(address.xpath('@href')[0])
-            # print(addresses)
-            address = address.xpath('@href')[0]
-            if "/" not in address:
-                protocol = parted_url.scheme
-                hostname = parted_url.netloc
-                pathname = address
-                address = urlunsplit([
-                    protocol,
-                    hostname,
-                    pathname,
-                    None,
-                    None
-                    ])
-            if address.startswith('/'):
-                protocol = parted_url.scheme
-                hostname = parted_url.netloc
-                pathname = address
-                address = urlunsplit([
-                    protocol,
-                    hostname,
-                    pathname,
-                    None,
-                    None
-                    ])
-            res = await download_feed(address)
-            if res[1] == 200:
-                try:
-                    feeds[address] = parse(res[0])
-                    # print(feeds[address])
-                    # breakpoint()
-                    # print(feeds)
-                except:
-                    continue
-    # TODO return feeds
-    if len(feeds) > 1:
-        # print(feeds)
-        # breakpoint()
-        counter = 0
-        msg = (
-            "RSS URL scan has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
-        feed_mark = 0
-        for feed in feeds:
-            # try:
-            #     res = await download_feed(feed)
-            # except:
-            #     continue
-            try:
-                feed_name = feeds[feed]["feed"]["title"]
-            except:
-                feed_name = urlsplit(feed).netloc
-            feed_addr = feed
-            feed_amnt = len(feeds[feed].entries)
-            if feed_amnt:
-                # NOTE Because there could be many false positives
-                # which are revealed in second phase of scan, we
-                # could end with a single feed, which would be
-                # listed instead of fetched, so feed_mark is
-                # utilized in order to make fetch possible.
-                feed_mark = [feed_addr]
-                counter += 1
-                msg += (
-                    "Title : {}\n"
-                    "Link  : {}\n"
-                    "Count : {}\n"
-                    "\n"
-                    ).format(feed_name, feed_addr, feed_amnt)
-        if counter > 1:
-            msg += (
-                "```\nThe above feeds were extracted from\n{}"
-                ).format(url)
-        elif feed_mark:
-            return feed_mark
-        else:
-            msg = (
-                "No feeds were found for {}"
-                ).format(url)
-        return msg
-    elif feeds:
-        return feeds
+            address = join_url(url, address.xpath('@href')[0])
+            if address not in urls:
+                urls.extend([address])
+    # breakpoint()
+    # print("feed_mode_scan")
+    urls = await process_feed_selection(url, urls)
+    return urls


 async def feed_mode_auto_discovery(url, tree):
@ -358,11 +219,8 @@ async def feed_mode_auto_discovery(url, tree):
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
-    # TODO return feeds
-    if len(feeds) > 1:
-        msg = (
-            "RSS Auto-Discovery has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
+    if feeds:
+        urls = []
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
@ -373,15 +231,129 @@ async def feed_mode_auto_discovery(url, tree):
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
-            feed_name = feed.xpath('@title')[0]
-            feed_addr = join_url(url, feed.xpath('@href')[0])
+
+            # feed_name = feed.xpath('@title')[0]
+            # feed_addr = join_url(url, feed.xpath('@href')[0])
+
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
-            msg += "{}\n{}\n\n".format(feed_name, feed_addr)
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-            ).format(url)
-        return msg
-    elif feeds:
-        feed_addr = join_url(url, feeds[0].xpath('@href')[0])
-        return [feed_addr]
+            address = join_url(url, feed.xpath('@href')[0])
+            if address not in urls:
+                urls.extend([address])
+        # breakpoint()
+        # print("feed_mode_auto_discovery")
+        urls = await process_feed_selection(url, urls)
+        return urls
+
+
+# TODO Split this function into one function that returns
+# URLs (string) and feeds (dict), and another function that
+# composes the text message (string).
+# Maybe that's not necessary.
+async def process_feed_selection(url, urls):
+    feeds = {}
+    for i in urls:
+        res = await download_feed(i)
+        if res[1] == 200:
+            try:
+                feeds[i] = [parse(res[0])]
+            except:
+                continue
+    message = (
+        "Web feeds found for {}\n\n```\n"
+        ).format(url)
+    counter = 0
+    feed_url_mark = 0
+    for feed_url in feeds:
+        # try:
+        #     res = await download_feed(feed)
+        # except:
+        #     continue
+        feed_name = None
+        if "title" in feeds[feed_url][0]["feed"].keys():
+            feed_name = feeds[feed_url][0].feed.title
+        feed_name = feed_name if feed_name else "Untitled"
+        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
+        # AttributeError: 'str' object has no attribute 'entries'
+        if "entries" in feeds[feed_url][0].keys():
+            feed_amnt = feeds[feed_url][0].entries
+        else:
+            continue
+        if feed_amnt:
+            # NOTE Because there could be many false positives
+            # which are revealed in second phase of scan, we
+            # could end with a single feed, which would be
+            # listed instead of fetched, so feed_url_mark is
+            # utilized in order to make fetch possible.
+            feed_url_mark = [feed_url]
+            counter += 1
+            message += (
+                "Title : {}\n"
+                "Link  : {}\n"
+                "\n"
+                ).format(feed_name, feed_url)
+    if counter > 1:
+        message += (
+            "```\nTotal of {} feeds."
+            ).format(counter)
+        result = message
+    elif feed_url_mark:
+        result = feed_url_mark
+    else:
+        result = None
+    return result
+
+
+# def get_discovered_feeds(url, urls):
+#     message = (
+#         "Found {} web feeds:\n\n```\n"
+#         ).format(len(urls))
+#     if len(urls) > 1:
+#         for urls in urls:
+#                 message += (
+#                     "Title : {}\n"
+#                     "Link  : {}\n"
+#                     "\n"
+#                     ).format(url, url.title)
+#         message += (
+#             "```\nThe above feeds were extracted from\n{}"
+#             ).format(url)
+#     elif len(urls) > 0:
+#         result = urls
+#     else:
+#         message = (
+#             "No feeds were found for {}"
+#             ).format(url)
+#     return result
+
+
+# Test module
+# TODO ModuleNotFoundError: No module named 'slixfeed'
+# import slixfeed.fetch as fetch
+# from slixfeed.action import is_feed, process_feed_selection
+
+# async def start(url):
+#     while True:
+#         result = await fetch.download_feed(url)
+#         document = result[0]
+#         status = result[1]
+#         if document:
+#             feed = parse(document)
+#             if is_feed(feed):
+#                 print(url)
+#             else:
+#                 urls = await probe_page(
+#                     url, document)
+#                 if len(urls) > 1:
+#                     await process_feed_selection(urls)
+#                 elif urls:
+#                     url = urls[0]
+#         else:
+#             response = (
+#                 "> {}\nFailed to load URL.  Reason: {}"
+#                 ).format(url, status)
+#             break
+#     return response
+
+# url = "https://www.smh.com.au/rssheadlines"
+# start(url)
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@ -18,6 +18,7 @@ TODO

 """

+import logging
 import os
 import slixfeed.action as action
 from slixfeed.config import (
@ -78,6 +79,38 @@ async def message(self, message):
    """
    if message["type"] in ("chat", "groupchat", "normal"):
        jid = message["from"].bare
+        message_text = " ".join(message["body"].split())
+
+        # NOTE This is an exceptional case in which we treat
+        # type groupchat the same as type chat.
+        if (message_text.lower().startswith("http")) and(
+            message_text.lower().endswith(".opml")):
+            url = message_text
+            await task.clean_tasks_xmpp(
+                jid, ["status"])
+            status_type = "dnd"
+            status_message = (
+                "📥️ Procesing request to import feeds ..."
+                )
+            send_status_message(
+                self, jid, status_type, status_message)
+            db_file = get_pathname_to_database(jid)
+            count = await action.import_opml(db_file, url)
+            if count:
+                response = (
+                    "Successfully imported {} feeds"
+                    ).format(count)
+            else:
+                response = (
+                    "OPML file was not imported."
+                    )
+            await task.clean_tasks_xmpp(
+                jid, ["status"])
+            await task.start_tasks_xmpp(
+                self, jid, ["status"])
+            send_reply_message(self, message, response)
+
+
        if message["type"] == "groupchat":
            # nick = message["from"][message["from"].index("/")+1:]
            nick = str(message["from"])
@ -135,18 +168,26 @@ async def message(self, message):

        # await compose.message(self, jid, message)

-        message_text = " ".join(message["body"].split())
        if message["type"] == "groupchat":
            message_text = message_text[1:]
        message_lowercase = message_text.lower()
    
-        print(current_time(), "ACCOUNT: " + str(message["from"]))
-        print(current_time(), "COMMAND:", message_text)
-        response = 0
+        logging.debug(
+            [str(message["from"]), ":", message_text])
+        response = None
        match message_lowercase:
            # case "breakpoint":
            #     if jid == get_value("accounts", "XMPP", "operator"):
            #         breakpoint()
+            #         print("task_manager[jid]")
+            #         print(task_manager[jid])
+            #         await self.get_roster()
+            #         print("roster 1")
+            #         print(self.client_roster)
+            #         print("roster 2")
+            #         print(self.client_roster.keys())
+            #         print("jid")
+            #         print(jid)
            #     else:
            #         response = (
            #             "This action is restricted. "
@ -171,15 +212,6 @@ async def message(self, message):
                    "Send \"help\" for instructions.\n"
                    )
                send_reply_message(self, message, response)
-                # print("task_manager[jid]")
-                # print(task_manager[jid])
-                await self.get_roster()
-                print("roster 1")
-                print(self.client_roster)
-                print("roster 2")
-                print(self.client_roster.keys())
-                print("jid")
-                print(jid)
    
            # case _ if message_lowercase.startswith("activate"):
            #     if message["type"] == "groupchat":
@ -242,8 +274,8 @@ async def message(self, message):
                        response = (
                            "> {}\nNews source \"{}\" is already "
                            "listed in the subscription list at "
-                            "index {}".format(url, name, ix)
-                            )
+                            "index {}"
+                            ).format(url, name, ix)
                else:
                    response = "Missing URL."
                send_reply_message(self, message, response)
@ -406,32 +438,32 @@ async def message(self, message):
                        message_lowercase.startswith("gopher:")):
                response = "Gemini and Gopher are not supported yet."
                send_reply_message(self, message, response)
-            case _ if (message_lowercase.startswith("http")) and(
-                message_lowercase.endswith(".opml")):
-                url = message_text
-                await task.clean_tasks_xmpp(
-                    jid, ["status"])
-                status_type = "dnd"
-                status_message = (
-                    "📥️ Procesing request to import feeds ..."
-                    )
-                send_status_message(
-                    self, jid, status_type, status_message)
-                db_file = get_pathname_to_database(jid)
-                count = await action.import_opml(db_file, url)
-                if count:
-                    response = (
-                        "Successfully imported {} feeds"
-                        ).format(count)
-                else:
-                    response = (
-                        "OPML file was not imported."
-                        )
-                await task.clean_tasks_xmpp(
-                    jid, ["status"])
-                await task.start_tasks_xmpp(
-                    self, jid, ["status"])
-                send_reply_message(self, message, response)
+            # case _ if (message_lowercase.startswith("http")) and(
+            #     message_lowercase.endswith(".opml")):
+            #     url = message_text
+            #     await task.clean_tasks_xmpp(
+            #         jid, ["status"])
+            #     status_type = "dnd"
+            #     status_message = (
+            #         "📥️ Procesing request to import feeds ..."
+            #         )
+            #     send_status_message(
+            #         self, jid, status_type, status_message)
+            #     db_file = get_pathname_to_database(jid)
+            #     count = await action.import_opml(db_file, url)
+            #     if count:
+            #         response = (
+            #             "Successfully imported {} feeds"
+            #             ).format(count)
+            #     else:
+            #         response = (
+            #             "OPML file was not imported."
+            #             )
+            #     await task.clean_tasks_xmpp(
+            #         jid, ["status"])
+            #     await task.start_tasks_xmpp(
+            #         self, jid, ["status"])
+            #     send_reply_message(self, message, response)
            case _ if (message_lowercase.startswith("http") or
                        message_lowercase.startswith("feed:")):
                url = message_text
@ -447,7 +479,8 @@ async def message(self, message):
                    url = uri.feed_to_http(url)
                url = (uri.replace_hostname(url, "feed")) or url
                db_file = get_pathname_to_database(jid)
-                response = await action.add_feed(db_file, url)
+                response = await action.add_feed(
+                    db_file, url)
                await task.clean_tasks_xmpp(
                    jid, ["status"])
                await task.start_tasks_xmpp(
@ -458,8 +491,10 @@ async def message(self, message):
                if query:
                    if len(query) > 3:
                        db_file = get_pathname_to_database(jid)
-                        result = await sqlite.search_feeds(db_file, query)
-                        response = action.list_feeds_by_query(query, result)
+                        result = await sqlite.search_feeds(
+                            db_file, query)
+                        response = action.list_feeds_by_query(
+                            query, result)
                    else:
                        response = (
                            "Enter at least 4 characters to search"
@ -506,11 +541,11 @@ async def message(self, message):
                    await groupchat.join(self, jid, muc_jid)
                    response = (
                        "Joined groupchat {}"
-                                ).format(message_text)
+                        ).format(message_text)
                else:
                    response = (
                        "> {}\nXMPP URI is not valid."
-                                ).format(message_text)
+                        ).format(message_text)
                send_reply_message(self, message, response)
            case _ if message_lowercase.startswith("length"):
                    key = message_text[:6]
@ -685,16 +720,19 @@ async def message(self, message):
                                db_file, ix)
                            response = (
                                "> {}\nNews source {} has been removed "
-                                "from subscription list.").format(url, ix)
+                                "from subscription list."
+                                ).format(url, ix)
                        except:
                            response = (
-                                "No news source with ID {}.".format(ix))
+                                "No news source with ID {}."
+                                ).format(ix)
                    except:
                        url = ix_url
                        await sqlite.remove_feed_by_url(db_file, url)
                        response = (
                            "> {}\nNews source has been removed "
-                            "from subscription list.").format(url)
+                            "from subscription list."
+                            ).format(url)
                    # await refresh_task(
                    #     self,
                    #     jid,
@ -835,11 +873,11 @@ async def message(self, message):
                    await groupchat.join(self, jid, muc_jid)
                    response = (
                        "Joined groupchat {}"
-                                ).format(message_text)
+                        ).format(message_text)
                else:
                    response = (
                        "> {}\nXMPP URI is not valid."
-                                ).format(message_text)
+                        ).format(message_text)
                send_reply_message(self, message, response)
            case _:
                response = (
--- a/slixfeed/xmpp/text.py
+++ b/slixfeed/xmpp/text.py
@ -216,7 +216,7 @@ def print_help():
        " info\n"
        "   Print information page.\n"
        " support\n"
-        "   Join xmpp:slixmpp@muc.poez.io?join\n"
+        "   Join xmpp:slixfeed@chat.woodpeckersnest.space?join\n"
        # "\n"
        # "PROTOCOLS\n"
        # " Supported prootcols are IRC, Matrix and XMPP.\n"