Improve code of module crawl.py

2024-01-09 12:34:10 +00:00 · 2024-01-09 12:34:10 +00:00 · 9709c052ee
commit 9709c052ee
parent 956ce69fcb
4 changed files with 267 additions and 267 deletions
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@ -81,7 +81,7 @@ def is_feed(feed):
        True or False.
    """
    value = False
-    message = None
+    # message = None
    if not feed.entries:
        if "version" in feed.keys():
            feed["version"]
@ -110,7 +110,6 @@ def is_feed(feed):
        # message = (
        #     "Good feed for {}"
        #     ).format(url)
    print(message)
    return value
@ -402,15 +401,11 @@ async def add_feed(db_file, url):
                else:
                    result = await crawl.probe_page(
                        url, document)
-                    # TODO Check length and for a write a
+                    if isinstance(result, str):
                    # unified message for a set of feeds.
                    # Use logging if you so choose to
                    # distinct the methods
                    if isinstance(result, list):
                        url = result[0]
                    elif isinstance(result, str):
                        response = result
                        break
                    else:
                        url = result[0]
            else:
                response = (
                    "> {}\nFailed to load URL.  Reason: {}"
@ -480,15 +475,11 @@ async def view_feed(url):
            else:
                result = await crawl.probe_page(
                    url, document)
-                # TODO Check length and for a write a
+                if isinstance(result, str):
                # unified message for a set of feeds.
                # Use logging if you so choose to
                # distinct the methods
                if isinstance(result, list):
                    url = result[0]
                elif isinstance(result, str):
                    response = result
                    break
                else:
                    url = result[0]
        else:
            response = (
                "> {}\nFailed to load URL.  Reason: {}"
@ -553,15 +544,11 @@ async def view_entry(url, num):
            else:
                result = await crawl.probe_page(
                    url, document)
-                # TODO Check length and for a write a
+                if isinstance(result, str):
                # unified message for a set of feeds.
                # Use logging if you so choose to
                # distinct the methods
                if isinstance(result, list):
                    url = result[0]
                elif isinstance(result, str):
                    response = result
                    break
                else:
                    url = result[0]
        else:
            response = (
                "> {}\nFailed to load URL.  Reason: {}"
@ -660,8 +647,11 @@ async def scan(db_file, url):
                        db_file, "filter-deny", string)
                    if reject_list:
                        read_status = 1
                        logging.debug(
                            "Rejected due to keyword {}".format(reject_list))
                if isinstance(date, int):
-                    logging.error("Variable 'date' is int:", date)
+                    logging.error(
                        "Variable 'date' is int: {}".format(date))
                await sqlite.add_entry(
                    db_file, title, link, entry_id,
                    url, date, read_status)
@ -723,7 +713,7 @@ async def organize_items(db_file, urls):
                    IncompleteRead,
                    error.URLError
                    ) as e:
-                print(e)
+                logging.error(e)
                # TODO Print error to log
                # None
                # NOTE I don't think there should be "return"
--- a/slixfeed/crawl.py
+++ b/slixfeed/crawl.py
@ -19,6 +19,7 @@ TODO
 from aiohttp import ClientError, ClientSession, ClientTimeout
 from feedparser import parse
 import logging
 from lxml import html
 import slixfeed.config as config
 from slixfeed.fetch import download_feed
@ -88,15 +89,20 @@ async def probe_page(url, document):
            "> {}\nFailed to parse URL as feed."
            ).format(url)
    if not result:
-        print("RSS Auto-Discovery Engaged")
+        logging.debug(
            "Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
-        print("RSS Scan Mode Engaged")
+        logging.debug(
            "Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
-        print("RSS Arbitrary Mode Engaged")
+        logging.debug(
-        result = await feed_mode_request(url, tree)
+            "Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logging.debug(
            "No feeds were found for {}".format(url))
        result = (
            "> {}\nNo news feeds were found for URL."
            ).format(url)
@ -104,7 +110,7 @@ async def probe_page(url, document):
 # TODO Improve scan by gradual decreasing of path
-async def feed_mode_request(url, tree):
+async def feed_mode_guess(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.
@ -122,94 +128,26 @@ async def feed_mode_request(url, tree):
    msg : str
        Message with URLs.
    """
-    feeds = {}
+    urls = []
    parted_url = urlsplit(url)
    paths = config.get_list("lists.yaml", "pathnames")
    # Check whether URL has path (i.e. not root)
    # Check parted_url.path to avoid error in case root wasn't given
    # TODO Make more tests
    if parted_url.path and parted_url.path.split('/')[1]:
        paths.extend(
            [".atom", ".feed", ".rdf", ".rss"]
            ) if '.rss' not in paths else -1
        # if paths.index('.rss'):
        #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
    for path in paths:
-        address = urlunsplit([
+        address = join_url(url, parted_url.path.split('/')[1] + path)
-            parted_url.scheme,
+        if address not in urls:
-            parted_url.netloc,
+            urls.extend([address])
-            path,
+    # breakpoint()
-            None,
+    # print("feed_mode_guess")
-            None
+    urls = await process_feed_selection(url, urls)
-            ])
+    return urls
        res = await download_feed(address)
        if res[1] == 200:
            # print(parse(res[0])["feed"]["title"])
            # feeds[address] = parse(res[0])["feed"]["title"]
            try:
                title = parse(res[0])["feed"]["title"]
            except:
                title = '*** No Title ***'
            feeds[address] = title
        # Check whether URL has path (i.e. not root)
        # Check parted_url.path to avoid error in case root wasn't given
        # TODO Make more tests
        if parted_url.path and parted_url.path.split('/')[1]:
            paths.extend(
                [".atom", ".feed", ".rdf", ".rss"]
                ) if '.rss' not in paths else -1
            # if paths.index('.rss'):
            #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
            address = urlunsplit([
                parted_url.scheme,
                parted_url.netloc,
                parted_url.path.split('/')[1] + path,
                None,
                None
                ])
            res = await download_feed(address)
            if res[1] == 200:
                try:
                    feeds[address] = parse(res[0])
                    # print(feeds)
                except:
                    continue
    # TODO return feeds
    if len(feeds) > 1:
        counter = 0
        msg = (
            "RSS URL discovery has found {} feeds:\n\n```\n"
            ).format(len(feeds))
        feed_mark = 0
        for feed in feeds:
            try:
                feed_name = feeds[feed]["feed"]["title"]
            except:
                feed_name = urlsplit(feed).netloc
            feed_addr = feed
            # AttributeError: 'str' object has no attribute 'entries'
            try:
                feed_amnt = len(feeds[feed].entries)
            except:
                continue
            if feed_amnt:
                # NOTE Because there could be many false positives
                # which are revealed in second phase of scan, we
                # could end with a single feed, which would be
                # listed instead of fetched, so feed_mark is
                # utilized in order to make fetch possible.
                feed_mark = [feed_addr]
                counter += 1
                msg += (
                    "Title: {}\n"
                    "Link : {}\n"
                    "Items: {}\n"
                    "\n"
                    ).format(feed_name, feed_addr, feed_amnt)
        if counter > 1:
            msg += (
                "```\nThe above feeds were extracted from\n{}"
                ).format(url)
        elif feed_mark:
            return feed_mark
        else:
            msg = (
                "No feeds were found for {}"
                ).format(url)
        return msg
    elif feeds:
        return feeds
 async def feed_mode_scan(url, tree):
@ -230,9 +168,7 @@ async def feed_mode_scan(url, tree):
    msg : str
        Message with URLs.
    """
-    feeds = {}
+    urls = []
    # paths = []
    # TODO Test
    paths = config.get_list("lists.yaml", "pathnames")
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
@ -242,91 +178,16 @@ async def feed_mode_scan(url, tree):
        addresses = tree.xpath(xpath_query)
        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
        addresses += tree.xpath(xpath_query)
        parted_url = urlsplit(url)
        # NOTE Should number of addresses be limited or
        # perhaps be N from the start and N from the end
        for address in addresses:
-            # print(address.xpath('@href')[0])
+            address = join_url(url, address.xpath('@href')[0])
-            # print(addresses)
+            if address not in urls:
-            address = address.xpath('@href')[0]
+                urls.extend([address])
-            if "/" not in address:
+    # breakpoint()
-                protocol = parted_url.scheme
+    # print("feed_mode_scan")
-                hostname = parted_url.netloc
+    urls = await process_feed_selection(url, urls)
-                pathname = address
+    return urls
                address = urlunsplit([
                    protocol,
                    hostname,
                    pathname,
                    None,
                    None
                    ])
            if address.startswith('/'):
                protocol = parted_url.scheme
                hostname = parted_url.netloc
                pathname = address
                address = urlunsplit([
                    protocol,
                    hostname,
                    pathname,
                    None,
                    None
                    ])
            res = await download_feed(address)
            if res[1] == 200:
                try:
                    feeds[address] = parse(res[0])
                    # print(feeds[address])
                    # breakpoint()
                    # print(feeds)
                except:
                    continue
    # TODO return feeds
    if len(feeds) > 1:
        # print(feeds)
        # breakpoint()
        counter = 0
        msg = (
            "RSS URL scan has found {} feeds:\n\n```\n"
            ).format(len(feeds))
        feed_mark = 0
        for feed in feeds:
            # try:
            #     res = await download_feed(feed)
            # except:
            #     continue
            try:
                feed_name = feeds[feed]["feed"]["title"]
            except:
                feed_name = urlsplit(feed).netloc
            feed_addr = feed
            feed_amnt = len(feeds[feed].entries)
            if feed_amnt:
                # NOTE Because there could be many false positives
                # which are revealed in second phase of scan, we
                # could end with a single feed, which would be
                # listed instead of fetched, so feed_mark is
                # utilized in order to make fetch possible.
                feed_mark = [feed_addr]
                counter += 1
                msg += (
                    "Title : {}\n"
                    "Link  : {}\n"
                    "Count : {}\n"
                    "\n"
                    ).format(feed_name, feed_addr, feed_amnt)
        if counter > 1:
            msg += (
                "```\nThe above feeds were extracted from\n{}"
                ).format(url)
        elif feed_mark:
            return feed_mark
        else:
            msg = (
                "No feeds were found for {}"
                ).format(url)
        return msg
    elif feeds:
        return feeds
 async def feed_mode_auto_discovery(url, tree):
@ -358,11 +219,8 @@ async def feed_mode_auto_discovery(url, tree):
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
-    # TODO return feeds
+    if feeds:
-    if len(feeds) > 1:
+        urls = []
        msg = (
            "RSS Auto-Discovery has found {} feeds:\n\n```\n"
            ).format(len(feeds))
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
@ -373,15 +231,129 @@ async def feed_mode_auto_discovery(url, tree):
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
-            feed_name = feed.xpath('@title')[0]
+
-            feed_addr = join_url(url, feed.xpath('@href')[0])
+            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
-            msg += "{}\n{}\n\n".format(feed_name, feed_addr)
+            address = join_url(url, feed.xpath('@href')[0])
-        msg += (
+            if address not in urls:
-            "```\nThe above feeds were extracted from\n{}"
+                urls.extend([address])
-            ).format(url)
+        # breakpoint()
-        return msg
+        # print("feed_mode_auto_discovery")
-    elif feeds:
+        urls = await process_feed_selection(url, urls)
-        feed_addr = join_url(url, feeds[0].xpath('@href')[0])
+        return urls
-        return [feed_addr]
+
 # TODO Split this function into one function that returns
 # URLs (string) and feeds (dict), and another function that
 # composes the text message (string).
 # Maybe that's not necessary.
 async def process_feed_selection(url, urls):
    """
    Download candidate feed URLs and select those that hold entries.

    Parameters
    ----------
    url : str
        URL the candidates were collected for. It is only used in
        the text of the returned message.
    urls : list
        Candidate feed URLs to download and parse.

    Returns
    -------
    result : str, list or None
        Message listing the feeds when more than one candidate holds
        entries, a single-item list with the feed URL when only one
        candidate holds entries, or None when no candidate does.
    """
    feeds = {}
    for address in urls:
        res = await download_feed(address)
        # Only responses with status 200 are considered.
        if res[1] != 200:
            continue
        try:
            feeds[address] = parse(res[0])
        except Exception as e:
            # Skip documents that fail to parse, but do not swallow
            # KeyboardInterrupt or SystemExit as a bare except would.
            logging.debug(
                "Failed to parse {}: {}".format(address, e))
    # Pairs of (title, URL) of feeds that hold at least one entry.
    listed = []
    for feed_url, parsed in feeds.items():
        if "entries" not in parsed or not parsed.entries:
            continue
        feed_name = None
        if "title" in parsed["feed"]:
            feed_name = parsed.feed.title
        listed.append((feed_name or "Untitled", feed_url))
    # NOTE Because there could be many false positives which are
    # revealed only after download, we could end with a single
    # feed; it is returned as a list (not as a message) in order
    # to make fetch possible.
    if len(listed) > 1:
        message = (
            "Web feeds found for {}\n\n```\n"
            ).format(url)
        message += "".join(
            (
                "Title : {}\n"
                "Link  : {}\n"
                "\n"
                ).format(feed_name, feed_url)
            for feed_name, feed_url in listed)
        message += (
            "```\nTotal of {} feeds."
            ).format(len(listed))
        return message
    if listed:
        return [listed[0][1]]
    return None
 # def get_discovered_feeds(url, urls):
 #     message = (
 #         "Found {} web feeds:\n\n```\n"
 #         ).format(len(urls))
 #     if len(urls) > 1:
 #         for urls in urls:
 #                 message += (
 #                     "Title : {}\n"
 #                     "Link  : {}\n"
 #                     "\n"
 #                     ).format(url, url.title)
 #         message += (
 #             "```\nThe above feeds were extracted from\n{}"
 #             ).format(url)
 #     elif len(urls) > 0:
 #         result = urls
 #     else:
 #         message = (
 #             "No feeds were found for {}"
 #             ).format(url)
 #     return result
 # Test module
 # TODO ModuleNotFoundError: No module named 'slixfeed'
 # import slixfeed.fetch as fetch
 # from slixfeed.action import is_feed, process_feed_selection
 # async def start(url):
 #     while True:
 #         result = await fetch.download_feed(url)
 #         document = result[0]
 #         status = result[1]
 #         if document:
 #             feed = parse(document)
 #             if is_feed(feed):
 #                 print(url)
 #             else:
 #                 urls = await probe_page(
 #                     url, document)
 #                 if len(urls) > 1:
 #                     await process_feed_selection(urls)
 #                 elif urls:
 #                     url = urls[0]
 #         else:
 #             response = (
 #                 "> {}\nFailed to load URL.  Reason: {}"
 #                 ).format(url, status)
 #             break
 #     return response
 # url = "https://www.smh.com.au/rssheadlines"
 # start(url)
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@ -18,6 +18,7 @@ TODO
 """
 import logging
 import os
 import slixfeed.action as action
 from slixfeed.config import (
@ -78,6 +79,38 @@ async def message(self, message):
    """
    if message["type"] in ("chat", "groupchat", "normal"):
        jid = message["from"].bare
        message_text = " ".join(message["body"].split())
        # NOTE This is an exceptional case in which we treat
        # type groupchat the same as type chat.
        if (message_text.lower().startswith("http")) and(
            message_text.lower().endswith(".opml")):
            url = message_text
            await task.clean_tasks_xmpp(
                jid, ["status"])
            status_type = "dnd"
            status_message = (
                "📥️ Procesing request to import feeds ..."
                )
            send_status_message(
                self, jid, status_type, status_message)
            db_file = get_pathname_to_database(jid)
            count = await action.import_opml(db_file, url)
            if count:
                response = (
                    "Successfully imported {} feeds"
                    ).format(count)
            else:
                response = (
                    "OPML file was not imported."
                    )
            await task.clean_tasks_xmpp(
                jid, ["status"])
            await task.start_tasks_xmpp(
                self, jid, ["status"])
            send_reply_message(self, message, response)
        if message["type"] == "groupchat":
            # nick = message["from"][message["from"].index("/")+1:]
            nick = str(message["from"])
@ -135,18 +168,26 @@ async def message(self, message):
        # await compose.message(self, jid, message)
        message_text = " ".join(message["body"].split())
        if message["type"] == "groupchat":
            message_text = message_text[1:]
        message_lowercase = message_text.lower()
-        print(current_time(), "ACCOUNT: " + str(message["from"]))
+        logging.debug(
-        print(current_time(), "COMMAND:", message_text)
+            [str(message["from"]), ":", message_text])
-        response = 0
+        response = None
        match message_lowercase:
            # case "breakpoint":
            #     if jid == get_value("accounts", "XMPP", "operator"):
            #         breakpoint()
            #         print("task_manager[jid]")
            #         print(task_manager[jid])
            #         await self.get_roster()
            #         print("roster 1")
            #         print(self.client_roster)
            #         print("roster 2")
            #         print(self.client_roster.keys())
            #         print("jid")
            #         print(jid)
            #     else:
            #         response = (
            #             "This action is restricted. "
@ -171,15 +212,6 @@ async def message(self, message):
                    "Send \"help\" for instructions.\n"
                    )
                send_reply_message(self, message, response)
                # print("task_manager[jid]")
                # print(task_manager[jid])
                await self.get_roster()
                print("roster 1")
                print(self.client_roster)
                print("roster 2")
                print(self.client_roster.keys())
                print("jid")
                print(jid)
            # case _ if message_lowercase.startswith("activate"):
            #     if message["type"] == "groupchat":
@ -242,8 +274,8 @@ async def message(self, message):
                        response = (
                            "> {}\nNews source \"{}\" is already "
                            "listed in the subscription list at "
-                            "index {}".format(url, name, ix)
+                            "index {}"
-                            )
+                            ).format(url, name, ix)
                else:
                    response = "Missing URL."
                send_reply_message(self, message, response)
@ -406,32 +438,32 @@ async def message(self, message):
                        message_lowercase.startswith("gopher:")):
                response = "Gemini and Gopher are not supported yet."
                send_reply_message(self, message, response)
-            case _ if (message_lowercase.startswith("http")) and(
+            # case _ if (message_lowercase.startswith("http")) and(
-                message_lowercase.endswith(".opml")):
+            #     message_lowercase.endswith(".opml")):
-                url = message_text
+            #     url = message_text
-                await task.clean_tasks_xmpp(
+            #     await task.clean_tasks_xmpp(
-                    jid, ["status"])
+            #         jid, ["status"])
-                status_type = "dnd"
+            #     status_type = "dnd"
-                status_message = (
+            #     status_message = (
-                    "📥️ Procesing request to import feeds ..."
+            #         "📥️ Procesing request to import feeds ..."
-                    )
+            #         )
-                send_status_message(
+            #     send_status_message(
-                    self, jid, status_type, status_message)
+            #         self, jid, status_type, status_message)
-                db_file = get_pathname_to_database(jid)
+            #     db_file = get_pathname_to_database(jid)
-                count = await action.import_opml(db_file, url)
+            #     count = await action.import_opml(db_file, url)
-                if count:
+            #     if count:
-                    response = (
+            #         response = (
-                        "Successfully imported {} feeds"
+            #             "Successfully imported {} feeds"
-                        ).format(count)
+            #             ).format(count)
-                else:
+            #     else:
-                    response = (
+            #         response = (
-                        "OPML file was not imported."
+            #             "OPML file was not imported."
-                        )
+            #             )
-                await task.clean_tasks_xmpp(
+            #     await task.clean_tasks_xmpp(
-                    jid, ["status"])
+            #         jid, ["status"])
-                await task.start_tasks_xmpp(
+            #     await task.start_tasks_xmpp(
-                    self, jid, ["status"])
+            #         self, jid, ["status"])
-                send_reply_message(self, message, response)
+            #     send_reply_message(self, message, response)
            case _ if (message_lowercase.startswith("http") or
                        message_lowercase.startswith("feed:")):
                url = message_text
@ -447,7 +479,8 @@ async def message(self, message):
                    url = uri.feed_to_http(url)
                url = (uri.replace_hostname(url, "feed")) or url
                db_file = get_pathname_to_database(jid)
-                response = await action.add_feed(db_file, url)
+                response = await action.add_feed(
                    db_file, url)
                await task.clean_tasks_xmpp(
                    jid, ["status"])
                await task.start_tasks_xmpp(
@ -458,8 +491,10 @@ async def message(self, message):
                if query:
                    if len(query) > 3:
                        db_file = get_pathname_to_database(jid)
-                        result = await sqlite.search_feeds(db_file, query)
+                        result = await sqlite.search_feeds(
-                        response = action.list_feeds_by_query(query, result)
+                            db_file, query)
                        response = action.list_feeds_by_query(
                            query, result)
                    else:
                        response = (
                            "Enter at least 4 characters to search"
@ -506,11 +541,11 @@ async def message(self, message):
                    await groupchat.join(self, jid, muc_jid)
                    response = (
                        "Joined groupchat {}"
-                                ).format(message_text)
+                        ).format(message_text)
                else:
                    response = (
                        "> {}\nXMPP URI is not valid."
-                                ).format(message_text)
+                        ).format(message_text)
                send_reply_message(self, message, response)
            case _ if message_lowercase.startswith("length"):
                    key = message_text[:6]
@ -685,16 +720,19 @@ async def message(self, message):
                                db_file, ix)
                            response = (
                                "> {}\nNews source {} has been removed "
-                                "from subscription list.").format(url, ix)
+                                "from subscription list."
                                ).format(url, ix)
                        except:
                            response = (
-                                "No news source with ID {}.".format(ix))
+                                "No news source with ID {}."
                                ).format(ix)
                    except:
                        url = ix_url
                        await sqlite.remove_feed_by_url(db_file, url)
                        response = (
                            "> {}\nNews source has been removed "
-                            "from subscription list.").format(url)
+                            "from subscription list."
                            ).format(url)
                    # await refresh_task(
                    #     self,
                    #     jid,
@ -835,11 +873,11 @@ async def message(self, message):
                    await groupchat.join(self, jid, muc_jid)
                    response = (
                        "Joined groupchat {}"
-                                ).format(message_text)
+                        ).format(message_text)
                else:
                    response = (
                        "> {}\nXMPP URI is not valid."
-                                ).format(message_text)
+                        ).format(message_text)
                send_reply_message(self, message, response)
            case _:
                response = (
--- a/slixfeed/xmpp/text.py
+++ b/slixfeed/xmpp/text.py
@ -216,7 +216,7 @@ def print_help():
        " info\n"
        "   Print information page.\n"
        " support\n"
-        "   Join xmpp:slixmpp@muc.poez.io?join\n"
+        "   Join xmpp:slixfeed@chat.woodpeckersnest.space?join\n"
        # "\n"
        # "PROTOCOLS\n"
        # " Supported prootcols are IRC, Matrix and XMPP.\n"