From 9709c052ee89a7e2eb408ea3303856a6dd135d24 Mon Sep 17 00:00:00 2001 From: Schimon Jehudah Date: Tue, 9 Jan 2024 12:34:10 +0000 Subject: [PATCH] Improve code of module crawl.py --- slixfeed/action.py | 40 ++--- slixfeed/crawl.py | 352 ++++++++++++++++++--------------------- slixfeed/xmpp/process.py | 140 ++++++++++------ slixfeed/xmpp/text.py | 2 +- 4 files changed, 267 insertions(+), 267 deletions(-) diff --git a/slixfeed/action.py b/slixfeed/action.py index fa2d05e..084f0b1 100644 --- a/slixfeed/action.py +++ b/slixfeed/action.py @@ -81,7 +81,7 @@ def is_feed(feed): True or False. """ value = False - message = None + # message = None if not feed.entries: if "version" in feed.keys(): feed["version"] @@ -110,7 +110,6 @@ def is_feed(feed): # message = ( # "Good feed for {}" # ).format(url) - print(message) return value @@ -402,15 +401,11 @@ async def add_feed(db_file, url): else: result = await crawl.probe_page( url, document) - # TODO Check length and for a write a - # unified message for a set of feeds. - # Use logging if you so choose to - # distinct the methods - if isinstance(result, list): - url = result[0] - elif isinstance(result, str): + if isinstance(result, str): response = result break + else: + url = result[0] else: response = ( "> {}\nFailed to load URL. Reason: {}" @@ -480,15 +475,11 @@ async def view_feed(url): else: result = await crawl.probe_page( url, document) - # TODO Check length and for a write a - # unified message for a set of feeds. - # Use logging if you so choose to - # distinct the methods - if isinstance(result, list): - url = result[0] - elif isinstance(result, str): + if isinstance(result, str): response = result break + else: + url = result[0] else: response = ( "> {}\nFailed to load URL. Reason: {}" @@ -553,15 +544,11 @@ async def view_entry(url, num): else: result = await crawl.probe_page( url, document) - # TODO Check length and for a write a - # unified message for a set of feeds. - # Use logging if you so choose to - # distinct the methods - if isinstance(result, list): - url = result[0] - elif isinstance(result, str): + if isinstance(result, str): response = result break + else: + url = result[0] else: response = ( "> {}\nFailed to load URL. Reason: {}" @@ -660,8 +647,11 @@ async def scan(db_file, url): db_file, "filter-deny", string) if reject_list: read_status = 1 + logging.debug( + "Rejected due to keyword {}".format(reject_list)) if isinstance(date, int): - logging.error("Variable 'date' is int:", date) + logging.error( + "Variable 'date' is int: {}".format(date)) await sqlite.add_entry( db_file, title, link, entry_id, url, date, read_status) @@ -723,7 +713,7 @@ async def organize_items(db_file, urls): IncompleteRead, error.URLError ) as e: - print(e) + logging.error(e) # TODO Print error to log # None # NOTE I don't think there should be "return" diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py index df595ed..7fada28 100644 --- a/slixfeed/crawl.py +++ b/slixfeed/crawl.py @@ -19,6 +19,7 @@ TODO from aiohttp import ClientError, ClientSession, ClientTimeout from feedparser import parse +import logging from lxml import html import slixfeed.config as config from slixfeed.fetch import download_feed @@ -88,15 +89,20 @@ async def probe_page(url, document): "> {}\nFailed to parse URL as feed." 
).format(url) if not result: - print("RSS Auto-Discovery Engaged") + logging.debug( + "Feed auto-discovery engaged for {}".format(url)) result = await feed_mode_auto_discovery(url, tree) if not result: - print("RSS Scan Mode Engaged") + logging.debug( + "Feed link scan mode engaged for {}".format(url)) result = await feed_mode_scan(url, tree) if not result: - print("RSS Arbitrary Mode Engaged") - result = await feed_mode_request(url, tree) + logging.debug( + "Feed arbitrary mode engaged for {}".format(url)) + result = await feed_mode_guess(url, tree) if not result: + logging.debug( + "No feeds were found for {}".format(url)) result = ( "> {}\nNo news feeds were found for URL." ).format(url) @@ -104,7 +110,7 @@ async def probe_page(url, document): # TODO Improve scan by gradual decreasing of path -async def feed_mode_request(url, tree): +async def feed_mode_guess(url, tree): """ Lookup for feeds by pathname using HTTP Requests. @@ -122,94 +128,26 @@ async def feed_mode_request(url, tree): msg : str Message with URLs. """ - feeds = {} + urls = [] parted_url = urlsplit(url) paths = config.get_list("lists.yaml", "pathnames") + # Check whether URL has path (i.e. not root) + # Check parted_url.path to avoid error in case root wasn't given + # TODO Make more tests + if parted_url.path and parted_url.path.split('/')[1]: + paths.extend( + [".atom", ".feed", ".rdf", ".rss"] + ) if '.rss' not in paths else -1 + # if paths.index('.rss'): + # paths.extend([".atom", ".feed", ".rdf", ".rss"]) for path in paths: - address = urlunsplit([ - parted_url.scheme, - parted_url.netloc, - path, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - # print(parse(res[0])["feed"]["title"]) - # feeds[address] = parse(res[0])["feed"]["title"] - try: - title = parse(res[0])["feed"]["title"] - except: - title = '*** No Title ***' - feeds[address] = title - # Check whether URL has path (i.e. not root) - # Check parted_url.path to avoid error in case root wasn't given - # TODO Make more tests - if parted_url.path and parted_url.path.split('/')[1]: - paths.extend( - [".atom", ".feed", ".rdf", ".rss"] - ) if '.rss' not in paths else -1 - # if paths.index('.rss'): - # paths.extend([".atom", ".feed", ".rdf", ".rss"]) - address = urlunsplit([ - parted_url.scheme, - parted_url.netloc, - parted_url.path.split('/')[1] + path, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - try: - feeds[address] = parse(res[0]) - # print(feeds) - except: - continue - # TODO return feeds - if len(feeds) > 1: - counter = 0 - msg = ( - "RSS URL discovery has found {} feeds:\n\n```\n" - ).format(len(feeds)) - feed_mark = 0 - for feed in feeds: - try: - feed_name = feeds[feed]["feed"]["title"] - except: - feed_name = urlsplit(feed).netloc - feed_addr = feed - # AttributeError: 'str' object has no attribute 'entries' - try: - feed_amnt = len(feeds[feed].entries) - except: - continue - if feed_amnt: - # NOTE Because there could be many false positives - # which are revealed in second phase of scan, we - # could end with a single feed, which would be - # listed instead of fetched, so feed_mark is - # utilized in order to make fetch possible. 
- feed_mark = [feed_addr] - counter += 1 - msg += ( - "Title: {}\n" - "Link : {}\n" - "Items: {}\n" - "\n" - ).format(feed_name, feed_addr, feed_amnt) - if counter > 1: - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - elif feed_mark: - return feed_mark - else: - msg = ( - "No feeds were found for {}" - ).format(url) - return msg - elif feeds: - return feeds + address = join_url(url, parted_url.path.split('/')[1] + path) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_guess") + urls = await process_feed_selection(url, urls) + return urls async def feed_mode_scan(url, tree): @@ -230,9 +168,7 @@ async def feed_mode_scan(url, tree): msg : str Message with URLs. """ - feeds = {} - # paths = [] - # TODO Test + urls = [] paths = config.get_list("lists.yaml", "pathnames") for path in paths: # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) @@ -242,91 +178,16 @@ async def feed_mode_scan(url, tree): addresses = tree.xpath(xpath_query) xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num) addresses += tree.xpath(xpath_query) - parted_url = urlsplit(url) # NOTE Should number of addresses be limited or # perhaps be N from the start and N from the end for address in addresses: - # print(address.xpath('@href')[0]) - # print(addresses) - address = address.xpath('@href')[0] - if "/" not in address: - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = address - address = urlunsplit([ - protocol, - hostname, - pathname, - None, - None - ]) - if address.startswith('/'): - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = address - address = urlunsplit([ - protocol, - hostname, - pathname, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - try: - feeds[address] = parse(res[0]) - # print(feeds[address]) - # breakpoint() - # print(feeds) - except: - continue - # TODO return feeds - if len(feeds) > 1: - # print(feeds) - # breakpoint() - counter = 0 - msg = ( - "RSS URL scan has found {} feeds:\n\n```\n" - ).format(len(feeds)) - feed_mark = 0 - for feed in feeds: - # try: - # res = await download_feed(feed) - # except: - # continue - try: - feed_name = feeds[feed]["feed"]["title"] - except: - feed_name = urlsplit(feed).netloc - feed_addr = feed - feed_amnt = len(feeds[feed].entries) - if feed_amnt: - # NOTE Because there could be many false positives - # which are revealed in second phase of scan, we - # could end with a single feed, which would be - # listed instead of fetched, so feed_mark is - # utilized in order to make fetch possible. 
- feed_mark = [feed_addr] - counter += 1 - msg += ( - "Title : {}\n" - "Link : {}\n" - "Count : {}\n" - "\n" - ).format(feed_name, feed_addr, feed_amnt) - if counter > 1: - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - elif feed_mark: - return feed_mark - else: - msg = ( - "No feeds were found for {}" - ).format(url) - return msg - elif feeds: - return feeds + address = join_url(url, address.xpath('@href')[0]) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_scan") + urls = await process_feed_selection(url, urls) + return urls async def feed_mode_auto_discovery(url, tree): @@ -358,11 +219,8 @@ async def feed_mode_auto_discovery(url, tree): # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href""" # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href" feeds = tree.xpath(xpath_query) - # TODO return feeds - if len(feeds) > 1: - msg = ( - "RSS Auto-Discovery has found {} feeds:\n\n```\n" - ).format(len(feeds)) + if feeds: + urls = [] for feed in feeds: # # The following code works; # # The following code will catch @@ -373,15 +231,129 @@ async def feed_mode_auto_discovery(url, tree): # disco = parse(res[0]) # title = disco["feed"]["title"] # msg += "{} \n {} \n\n".format(title, feed) - feed_name = feed.xpath('@title')[0] - feed_addr = join_url(url, feed.xpath('@href')[0]) + + # feed_name = feed.xpath('@title')[0] + # feed_addr = join_url(url, feed.xpath('@href')[0]) + # if feed_addr.startswith("/"): # feed_addr = url + feed_addr - msg += "{}\n{}\n\n".format(feed_name, feed_addr) - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - return msg - elif feeds: - feed_addr = join_url(url, feeds[0].xpath('@href')[0]) - return [feed_addr] + address = join_url(url, feed.xpath('@href')[0]) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_auto_discovery") + urls = await process_feed_selection(url, urls) + return urls + + +# TODO Segregate function into function that returns +# URLs (string) and Feeds (dict) and function that +# composes text message (string). +# Maybe that's not necessary. +async def process_feed_selection(url, urls): + feeds = {} + for i in urls: + res = await download_feed(i) + if res[1] == 200: + try: + feeds[i] = [parse(res[0])] + except: + continue + message = ( + "Web feeds found for {}\n\n```\n" + ).format(url) + counter = 0 + feed_url_mark = 0 + for feed_url in feeds: + # try: + # res = await download_feed(feed) + # except: + # continue + feed_name = None + if "title" in feeds[feed_url][0]["feed"].keys(): + feed_name = feeds[feed_url][0].feed.title + feed_name = feed_name if feed_name else "Untitled" + # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc + # AttributeError: 'str' object has no attribute 'entries' + if "entries" in feeds[feed_url][0].keys(): + feed_amnt = feeds[feed_url][0].entries + else: + continue + if feed_amnt: + # NOTE Because there could be many false positives + # which are revealed in second phase of scan, we + # could end with a single feed, which would be + # listed instead of fetched, so feed_url_mark is + # utilized in order to make fetch possible. 
+ feed_url_mark = [feed_url] + counter += 1 + message += ( + "Title : {}\n" + "Link : {}\n" + "\n" + ).format(feed_name, feed_url) + if counter > 1: + message += ( + "```\nTotal of {} feeds." + ).format(counter) + result = message + elif feed_url_mark: + result = feed_url_mark + else: + result = None + return result + + +# def get_discovered_feeds(url, urls): +# message = ( +# "Found {} web feeds:\n\n```\n" +# ).format(len(urls)) +# if len(urls) > 1: +# for urls in urls: +# message += ( +# "Title : {}\n" +# "Link : {}\n" +# "\n" +# ).format(url, url.title) +# message += ( +# "```\nThe above feeds were extracted from\n{}" +# ).format(url) +# elif len(urls) > 0: +# result = urls +# else: +# message = ( +# "No feeds were found for {}" +# ).format(url) +# return result + + +# Test module +# TODO ModuleNotFoundError: No module named 'slixfeed' +# import slixfeed.fetch as fetch +# from slixfeed.action import is_feed, process_feed_selection + +# async def start(url): +# while True: +# result = await fetch.download_feed(url) +# document = result[0] +# status = result[1] +# if document: +# feed = parse(document) +# if is_feed(feed): +# print(url) +# else: +# urls = await probe_page( +# url, document) +# if len(urls) > 1: +# await process_feed_selection(urls) +# elif urls: +# url = urls[0] +# else: +# response = ( +# "> {}\nFailed to load URL. Reason: {}" +# ).format(url, status) +# break +# return response + +# url = "https://www.smh.com.au/rssheadlines" +# start(url) \ No newline at end of file diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py index c8f3197..30459b1 100644 --- a/slixfeed/xmpp/process.py +++ b/slixfeed/xmpp/process.py @@ -18,6 +18,7 @@ TODO """ +import logging import os import slixfeed.action as action from slixfeed.config import ( @@ -78,6 +79,38 @@ async def message(self, message): """ if message["type"] in ("chat", "groupchat", "normal"): jid = message["from"].bare + message_text = " ".join(message["body"].split()) + + # BOTE This is an exceptional case in which we treat + # type groupchat the same as type chat. + if (message_text.lower().startswith("http")) and( + message_text.lower().endswith(".opml")): + url = message_text + await task.clean_tasks_xmpp( + jid, ["status"]) + status_type = "dnd" + status_message = ( + "📥️ Procesing request to import feeds ..." + ) + send_status_message( + self, jid, status_type, status_message) + db_file = get_pathname_to_database(jid) + count = await action.import_opml(db_file, url) + if count: + response = ( + "Successfully imported {} feeds" + ).format(count) + else: + response = ( + "OPML file was not imported." 
+ ) + await task.clean_tasks_xmpp( + jid, ["status"]) + await task.start_tasks_xmpp( + self, jid, ["status"]) + send_reply_message(self, message, response) + + if message["type"] == "groupchat": # nick = message["from"][message["from"].index("/")+1:] nick = str(message["from"]) @@ -135,18 +168,26 @@ async def message(self, message): # await compose.message(self, jid, message) - message_text = " ".join(message["body"].split()) if message["type"] == "groupchat": message_text = message_text[1:] message_lowercase = message_text.lower() - print(current_time(), "ACCOUNT: " + str(message["from"])) - print(current_time(), "COMMAND:", message_text) - response = 0 + logging.debug( + [str(message["from"]), ":", message_text]) + response = None match message_lowercase: # case "breakpoint": # if jid == get_value("accounts", "XMPP", "operator"): # breakpoint() + # print("task_manager[jid]") + # print(task_manager[jid]) + # await self.get_roster() + # print("roster 1") + # print(self.client_roster) + # print("roster 2") + # print(self.client_roster.keys()) + # print("jid") + # print(jid) # else: # response = ( # "This action is restricted. " @@ -171,15 +212,6 @@ async def message(self, message): "Send \"help\" for instructions.\n" ) send_reply_message(self, message, response) - # print("task_manager[jid]") - # print(task_manager[jid]) - await self.get_roster() - print("roster 1") - print(self.client_roster) - print("roster 2") - print(self.client_roster.keys()) - print("jid") - print(jid) # case _ if message_lowercase.startswith("activate"): # if message["type"] == "groupchat": @@ -242,8 +274,8 @@ async def message(self, message): response = ( "> {}\nNews source \"{}\" is already " "listed in the subscription list at " - "index {}".format(url, name, ix) - ) + "index {}" + ).format(url, name, ix) else: response = "Missing URL." send_reply_message(self, message, response) @@ -406,32 +438,32 @@ async def message(self, message): message_lowercase.startswith("gopher:")): response = "Gemini and Gopher are not supported yet." send_reply_message(self, message, response) - case _ if (message_lowercase.startswith("http")) and( - message_lowercase.endswith(".opml")): - url = message_text - await task.clean_tasks_xmpp( - jid, ["status"]) - status_type = "dnd" - status_message = ( - "📥️ Procesing request to import feeds ..." - ) - send_status_message( - self, jid, status_type, status_message) - db_file = get_pathname_to_database(jid) - count = await action.import_opml(db_file, url) - if count: - response = ( - "Successfully imported {} feeds" - ).format(count) - else: - response = ( - "OPML file was not imported." - ) - await task.clean_tasks_xmpp( - jid, ["status"]) - await task.start_tasks_xmpp( - self, jid, ["status"]) - send_reply_message(self, message, response) + # case _ if (message_lowercase.startswith("http")) and( + # message_lowercase.endswith(".opml")): + # url = message_text + # await task.clean_tasks_xmpp( + # jid, ["status"]) + # status_type = "dnd" + # status_message = ( + # "📥️ Procesing request to import feeds ..." + # ) + # send_status_message( + # self, jid, status_type, status_message) + # db_file = get_pathname_to_database(jid) + # count = await action.import_opml(db_file, url) + # if count: + # response = ( + # "Successfully imported {} feeds" + # ).format(count) + # else: + # response = ( + # "OPML file was not imported." 
+ # ) + # await task.clean_tasks_xmpp( + # jid, ["status"]) + # await task.start_tasks_xmpp( + # self, jid, ["status"]) + # send_reply_message(self, message, response) case _ if (message_lowercase.startswith("http") or message_lowercase.startswith("feed:")): url = message_text @@ -447,7 +479,8 @@ async def message(self, message): url = uri.feed_to_http(url) url = (uri.replace_hostname(url, "feed")) or url db_file = get_pathname_to_database(jid) - response = await action.add_feed(db_file, url) + response = await action.add_feed( + db_file, url) await task.clean_tasks_xmpp( jid, ["status"]) await task.start_tasks_xmpp( @@ -458,8 +491,10 @@ async def message(self, message): if query: if len(query) > 3: db_file = get_pathname_to_database(jid) - result = await sqlite.search_feeds(db_file, query) - response = action.list_feeds_by_query(query, result) + result = await sqlite.search_feeds( + db_file, query) + response = action.list_feeds_by_query( + query, result) else: response = ( "Enter at least 4 characters to search" @@ -506,11 +541,11 @@ async def message(self, message): await groupchat.join(self, jid, muc_jid) response = ( "Joined groupchat {}" - ).format(message_text) + ).format(message_text) else: response = ( "> {}\nXMPP URI is not valid." - ).format(message_text) + ).format(message_text) send_reply_message(self, message, response) case _ if message_lowercase.startswith("length"): key = message_text[:6] @@ -685,16 +720,19 @@ async def message(self, message): db_file, ix) response = ( "> {}\nNews source {} has been removed " - "from subscription list.").format(url, ix) + "from subscription list." + ).format(url, ix) except: response = ( - "No news source with ID {}.".format(ix)) + "No news source with ID {}." + ).format(ix) except: url = ix_url await sqlite.remove_feed_by_url(db_file, url) response = ( "> {}\nNews source has been removed " - "from subscription list.").format(url) + "from subscription list." + ).format(url) # await refresh_task( # self, # jid, @@ -835,11 +873,11 @@ async def message(self, message): await groupchat.join(self, jid, muc_jid) response = ( "Joined groupchat {}" - ).format(message_text) + ).format(message_text) else: response = ( "> {}\nXMPP URI is not valid." - ).format(message_text) + ).format(message_text) send_reply_message(self, message, response) case _: response = ( diff --git a/slixfeed/xmpp/text.py b/slixfeed/xmpp/text.py index ac44e1a..8afd583 100644 --- a/slixfeed/xmpp/text.py +++ b/slixfeed/xmpp/text.py @@ -216,7 +216,7 @@ def print_help(): " info\n" " Print information page.\n" " support\n" - " Join xmpp:slixmpp@muc.poez.io?join\n" + " Join xmpp:slixfeed@chat.woodpeckersnest.space?join\n" # "\n" # "PROTOCOLS\n" # " Supported prootcols are IRC, Matrix and XMPP.\n"