Improve code of module crawl.py

Schimon Jehudah 2024-01-09 12:34:10 +00:00
parent 956ce69fcb
commit 9709c052ee
4 changed files with 267 additions and 267 deletions


@@ -81,7 +81,7 @@ def is_feed(feed):
         True or False.
     """
     value = False
-    message = None
+    # message = None
     if not feed.entries:
         if "version" in feed.keys():
             feed["version"]
@@ -110,7 +110,6 @@ def is_feed(feed):
             # message = (
            #     "Good feed for {}"
            #     ).format(url)
-    print(message)
     return value
@@ -402,15 +401,11 @@ async def add_feed(db_file, url):
             else:
                 result = await crawl.probe_page(
                     url, document)
-                # TODO Check length and for a write a
-                # unified message for a set of feeds.
-                # Use logging if you so choose to
-                # distinct the methods
-                if isinstance(result, list):
-                    url = result[0]
-                elif isinstance(result, str):
+                if isinstance(result, str):
                     response = result
                     break
+                else:
+                    url = result[0]
         else:
             response = (
                 "> {}\nFailed to load URL. Reason: {}"
@@ -480,15 +475,11 @@ async def view_feed(url):
             else:
                 result = await crawl.probe_page(
                     url, document)
-                # TODO Check length and for a write a
-                # unified message for a set of feeds.
-                # Use logging if you so choose to
-                # distinct the methods
-                if isinstance(result, list):
-                    url = result[0]
-                elif isinstance(result, str):
+                if isinstance(result, str):
                     response = result
                     break
+                else:
+                    url = result[0]
         else:
             response = (
                 "> {}\nFailed to load URL. Reason: {}"
@@ -553,15 +544,11 @@ async def view_entry(url, num):
             else:
                 result = await crawl.probe_page(
                     url, document)
-                # TODO Check length and for a write a
-                # unified message for a set of feeds.
-                # Use logging if you so choose to
-                # distinct the methods
-                if isinstance(result, list):
-                    url = result[0]
-                elif isinstance(result, str):
+                if isinstance(result, str):
                     response = result
                     break
+                else:
+                    url = result[0]
         else:
             response = (
                 "> {}\nFailed to load URL. Reason: {}"
@@ -660,8 +647,11 @@ async def scan(db_file, url):
                         db_file, "filter-deny", string)
                     if reject_list:
                         read_status = 1
+                        logging.debug(
+                            "Rejected due to keyword {}".format(reject_list))
                 if isinstance(date, int):
-                    logging.error("Variable 'date' is int:", date)
+                    logging.error(
+                        "Variable 'date' is int: {}".format(date))
                 await sqlite.add_entry(
                     db_file, title, link, entry_id,
                     url, date, read_status)
@@ -723,7 +713,7 @@ async def organize_items(db_file, urls):
                 IncompleteRead,
                 error.URLError
                 ) as e:
-            print(e)
+            logging.error(e)
             # TODO Print error to log
             # None
             # NOTE I don't think there should be "return"
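Taken together, the add_feed, view_feed and view_entry hunks above settle on a single caller-side contract: crawl.probe_page now returns either a string (a finished reply such as an error or a listing of several discovered feeds) or a single-item list holding one feed URL to retry. The sketch below only illustrates that dispatch and is not code from the commit; the module path and the wrapper name resolve() are assumptions.

# Hedged sketch: assumes crawl.py is importable as slixfeed.crawl and that
# probe_page keeps the return contract shown in the hunks above.
import slixfeed.crawl as crawl

async def resolve(url, document):
    result = await crawl.probe_page(url, document)
    if isinstance(result, str):
        # A finished reply for the user: error text or a multi-feed listing.
        return ("response", result)
    # Otherwise a single-item list with one feed URL; retry the fetch with it.
    return ("retry", result[0])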


@@ -19,6 +19,7 @@ TODO
 from aiohttp import ClientError, ClientSession, ClientTimeout
 from feedparser import parse
+import logging
 from lxml import html
 import slixfeed.config as config
 from slixfeed.fetch import download_feed
@@ -88,15 +89,20 @@ async def probe_page(url, document):
             "> {}\nFailed to parse URL as feed."
             ).format(url)
     if not result:
-        print("RSS Auto-Discovery Engaged")
+        logging.debug(
+            "Feed auto-discovery engaged for {}".format(url))
         result = await feed_mode_auto_discovery(url, tree)
     if not result:
-        print("RSS Scan Mode Engaged")
+        logging.debug(
+            "Feed link scan mode engaged for {}".format(url))
         result = await feed_mode_scan(url, tree)
     if not result:
-        print("RSS Arbitrary Mode Engaged")
-        result = await feed_mode_request(url, tree)
+        logging.debug(
+            "Feed arbitrary mode engaged for {}".format(url))
+        result = await feed_mode_guess(url, tree)
     if not result:
+        logging.debug(
+            "No feeds were found for {}".format(url))
         result = (
             "> {}\nNo news feeds were found for URL."
             ).format(url)
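Condensed view of the discovery chain probe_page walks after this change: each strategy runs only if the previous one returned nothing. This is an illustrative reduction, not the function body from the commit (the real code also parses the document, logs each step and builds the final reply strings); discover() is a hypothetical name and the three feed_mode_* coroutines are assumed to be in scope.

async def discover(url, tree):
    # Order taken from the hunk above: link auto-discovery, then anchor
    # scanning, then guessed pathnames appended to the first path segment.
    for strategy in (feed_mode_auto_discovery, feed_mode_scan, feed_mode_guess):
        result = await strategy(url, tree)
        if result:
            return result
    return None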
@@ -104,7 +110,7 @@ async def probe_page(url, document):
 # TODO Improve scan by gradual decreasing of path
-async def feed_mode_request(url, tree):
+async def feed_mode_guess(url, tree):
     """
     Lookup for feeds by pathname using HTTP Requests.
@@ -122,26 +128,9 @@ async def feed_mode_request(url, tree):
     msg : str
         Message with URLs.
     """
-    feeds = {}
+    urls = []
     parted_url = urlsplit(url)
     paths = config.get_list("lists.yaml", "pathnames")
-    for path in paths:
-        address = urlunsplit([
-            parted_url.scheme,
-            parted_url.netloc,
-            path,
-            None,
-            None
-            ])
-        res = await download_feed(address)
-        if res[1] == 200:
-            # print(parse(res[0])["feed"]["title"])
-            # feeds[address] = parse(res[0])["feed"]["title"]
-            try:
-                title = parse(res[0])["feed"]["title"]
-            except:
-                title = '*** No Title ***'
-            feeds[address] = title
     # Check whether URL has path (i.e. not root)
     # Check parted_url.path to avoid error in case root wasn't given
     # TODO Make more tests
@@ -151,65 +140,14 @@ async def feed_mode_request(url, tree):
             ) if '.rss' not in paths else -1
     # if paths.index('.rss'):
     #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
-        address = urlunsplit([
-            parted_url.scheme,
-            parted_url.netloc,
-            parted_url.path.split('/')[1] + path,
-            None,
-            None
-            ])
-        res = await download_feed(address)
-        if res[1] == 200:
-            try:
-                feeds[address] = parse(res[0])
-                # print(feeds)
-            except:
-                continue
-    # TODO return feeds
-    if len(feeds) > 1:
-        counter = 0
-        msg = (
-            "RSS URL discovery has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
-        feed_mark = 0
-        for feed in feeds:
-            try:
-                feed_name = feeds[feed]["feed"]["title"]
-            except:
-                feed_name = urlsplit(feed).netloc
-            feed_addr = feed
-            # AttributeError: 'str' object has no attribute 'entries'
-            try:
-                feed_amnt = len(feeds[feed].entries)
-            except:
-                continue
-            if feed_amnt:
-                # NOTE Because there could be many false positives
-                # which are revealed in second phase of scan, we
-                # could end with a single feed, which would be
-                # listed instead of fetched, so feed_mark is
-                # utilized in order to make fetch possible.
-                feed_mark = [feed_addr]
-                counter += 1
-                msg += (
-                    "Title: {}\n"
-                    "Link : {}\n"
-                    "Items: {}\n"
-                    "\n"
-                    ).format(feed_name, feed_addr, feed_amnt)
-        if counter > 1:
-            msg += (
-                "```\nThe above feeds were extracted from\n{}"
-                ).format(url)
-        elif feed_mark:
-            return feed_mark
-        else:
-            msg = (
-                "No feeds were found for {}"
-                ).format(url)
-        return msg
-    elif feeds:
-        return feeds
+    for path in paths:
+        address = join_url(url, parted_url.path.split('/')[1] + path)
+        if address not in urls:
+            urls.extend([address])
+    # breakpoint()
+    # print("feed_mode_guess")
+    urls = await process_feed_selection(url, urls)
+    return urls
 
 
 async def feed_mode_scan(url, tree):
@@ -230,9 +168,7 @@ async def feed_mode_scan(url, tree):
     msg : str
         Message with URLs.
     """
-    feeds = {}
-    # paths = []
-    # TODO Test
+    urls = []
     paths = config.get_list("lists.yaml", "pathnames")
     for path in paths:
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
@@ -242,91 +178,16 @@ async def feed_mode_scan(url, tree):
         addresses = tree.xpath(xpath_query)
         xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
         addresses += tree.xpath(xpath_query)
-        parted_url = urlsplit(url)
         # NOTE Should number of addresses be limited or
         # perhaps be N from the start and N from the end
         for address in addresses:
-            # print(address.xpath('@href')[0])
-            # print(addresses)
-            address = address.xpath('@href')[0]
-            if "/" not in address:
-                protocol = parted_url.scheme
-                hostname = parted_url.netloc
-                pathname = address
-                address = urlunsplit([
-                    protocol,
-                    hostname,
-                    pathname,
-                    None,
-                    None
-                    ])
-            if address.startswith('/'):
-                protocol = parted_url.scheme
-                hostname = parted_url.netloc
-                pathname = address
-                address = urlunsplit([
-                    protocol,
-                    hostname,
-                    pathname,
-                    None,
-                    None
-                    ])
-            res = await download_feed(address)
-            if res[1] == 200:
-                try:
-                    feeds[address] = parse(res[0])
-                    # print(feeds[address])
-                    # breakpoint()
-                    # print(feeds)
-                except:
-                    continue
-    # TODO return feeds
-    if len(feeds) > 1:
-        # print(feeds)
-        # breakpoint()
-        counter = 0
-        msg = (
-            "RSS URL scan has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
-        feed_mark = 0
-        for feed in feeds:
-            # try:
-            #     res = await download_feed(feed)
-            # except:
-            #     continue
-            try:
-                feed_name = feeds[feed]["feed"]["title"]
-            except:
-                feed_name = urlsplit(feed).netloc
-            feed_addr = feed
-            feed_amnt = len(feeds[feed].entries)
-            if feed_amnt:
-                # NOTE Because there could be many false positives
-                # which are revealed in second phase of scan, we
-                # could end with a single feed, which would be
-                # listed instead of fetched, so feed_mark is
-                # utilized in order to make fetch possible.
-                feed_mark = [feed_addr]
-                counter += 1
-                msg += (
-                    "Title : {}\n"
-                    "Link : {}\n"
-                    "Count : {}\n"
-                    "\n"
-                    ).format(feed_name, feed_addr, feed_amnt)
-        if counter > 1:
-            msg += (
-                "```\nThe above feeds were extracted from\n{}"
-                ).format(url)
-        elif feed_mark:
-            return feed_mark
-        else:
-            msg = (
-                "No feeds were found for {}"
-                ).format(url)
-        return msg
-    elif feeds:
-        return feeds
+            address = join_url(url, address.xpath('@href')[0])
+            if address not in urls:
+                urls.extend([address])
+    # breakpoint()
+    # print("feed_mode_scan")
+    urls = await process_feed_selection(url, urls)
+    return urls
 
 
 async def feed_mode_auto_discovery(url, tree):
@@ -358,11 +219,8 @@ async def feed_mode_auto_discovery(url, tree):
     # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
     # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
     feeds = tree.xpath(xpath_query)
-    # TODO return feeds
-    if len(feeds) > 1:
-        msg = (
-            "RSS Auto-Discovery has found {} feeds:\n\n```\n"
-            ).format(len(feeds))
+    if feeds:
+        urls = []
         for feed in feeds:
             # # The following code works;
             # # The following code will catch
@@ -373,15 +231,129 @@ async def feed_mode_auto_discovery(url, tree):
             # disco = parse(res[0])
             # title = disco["feed"]["title"]
             # msg += "{} \n {} \n\n".format(title, feed)
-            feed_name = feed.xpath('@title')[0]
-            feed_addr = join_url(url, feed.xpath('@href')[0])
+            # feed_name = feed.xpath('@title')[0]
+            # feed_addr = join_url(url, feed.xpath('@href')[0])
             # if feed_addr.startswith("/"):
             #     feed_addr = url + feed_addr
-            msg += "{}\n{}\n\n".format(feed_name, feed_addr)
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-            ).format(url)
-        return msg
-    elif feeds:
-        feed_addr = join_url(url, feeds[0].xpath('@href')[0])
-        return [feed_addr]
+            address = join_url(url, feed.xpath('@href')[0])
+            if address not in urls:
+                urls.extend([address])
+        # breakpoint()
+        # print("feed_mode_auto_discovery")
+        urls = await process_feed_selection(url, urls)
+        return urls
+
+
+# TODO Segregate function into function that returns
+# URLs (string) and Feeds (dict) and function that
+# composes text message (string).
+# Maybe that's not necessary.
+async def process_feed_selection(url, urls):
+    feeds = {}
+    for i in urls:
+        res = await download_feed(i)
+        if res[1] == 200:
+            try:
+                feeds[i] = [parse(res[0])]
+            except:
+                continue
+    message = (
+        "Web feeds found for {}\n\n```\n"
+        ).format(url)
+    counter = 0
+    feed_url_mark = 0
+    for feed_url in feeds:
+        # try:
+        #     res = await download_feed(feed)
+        # except:
+        #     continue
+        feed_name = None
+        if "title" in feeds[feed_url][0]["feed"].keys():
+            feed_name = feeds[feed_url][0].feed.title
+        feed_name = feed_name if feed_name else "Untitled"
+        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
+        # AttributeError: 'str' object has no attribute 'entries'
+        if "entries" in feeds[feed_url][0].keys():
+            feed_amnt = feeds[feed_url][0].entries
+        else:
+            continue
+        if feed_amnt:
+            # NOTE Because there could be many false positives
+            # which are revealed in second phase of scan, we
+            # could end with a single feed, which would be
+            # listed instead of fetched, so feed_url_mark is
+            # utilized in order to make fetch possible.
+            feed_url_mark = [feed_url]
+            counter += 1
+            message += (
+                "Title : {}\n"
+                "Link : {}\n"
+                "\n"
+                ).format(feed_name, feed_url)
+    if counter > 1:
+        message += (
+            "```\nTotal of {} feeds."
+            ).format(counter)
+        result = message
+    elif feed_url_mark:
+        result = feed_url_mark
+    else:
+        result = None
+    return result
+
+
+# def get_discovered_feeds(url, urls):
+#     message = (
+#         "Found {} web feeds:\n\n```\n"
+#         ).format(len(urls))
+#     if len(urls) > 1:
+#         for urls in urls:
+#             message += (
+#                 "Title : {}\n"
+#                 "Link : {}\n"
+#                 "\n"
+#                 ).format(url, url.title)
+#         message += (
+#             "```\nThe above feeds were extracted from\n{}"
+#             ).format(url)
+#     elif len(urls) > 0:
+#         result = urls
+#     else:
+#         message = (
+#             "No feeds were found for {}"
+#             ).format(url)
+#     return result
+
+
+# Test module
+# TODO ModuleNotFoundError: No module named 'slixfeed'
+# import slixfeed.fetch as fetch
+# from slixfeed.action import is_feed, process_feed_selection
+# async def start(url):
+#     while True:
+#         result = await fetch.download_feed(url)
+#         document = result[0]
+#         status = result[1]
+#         if document:
+#             feed = parse(document)
+#             if is_feed(feed):
+#                 print(url)
+#             else:
+#                 urls = await probe_page(
+#                     url, document)
+#                 if len(urls) > 1:
+#                     await process_feed_selection(urls)
+#                 elif urls:
+#                     url = urls[0]
+#         else:
+#             response = (
+#                 "> {}\nFailed to load URL. Reason: {}"
+#                 ).format(url, status)
+#             break
+#         return response
+# url = "https://www.smh.com.au/rssheadlines"
# start(url)
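As added above, process_feed_selection returns one of three shapes: a formatted listing when more than one candidate URL turns out to be a live feed, a single-item list holding the one URL that did respond, or None when none did. A hedged usage sketch of that contract; demo() is hypothetical and process_feed_selection is assumed to be importable from the module this diff touches.

# Hypothetical driver, illustrating the three possible return shapes.
async def demo(url, candidates):
    selection = await process_feed_selection(url, candidates)
    if selection is None:
        return "No feeds were found for {}".format(url)
    if isinstance(selection, list):
        return "Fetch this single feed: {}".format(selection[0])
    return selection  # human-readable listing of several feeds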


@@ -18,6 +18,7 @@ TODO
 """
+import logging
 import os
 import slixfeed.action as action
 from slixfeed.config import (
@@ -78,6 +79,38 @@ async def message(self, message):
         """
         if message["type"] in ("chat", "groupchat", "normal"):
             jid = message["from"].bare
+            message_text = " ".join(message["body"].split())
+            # BOTE This is an exceptional case in which we treat
+            # type groupchat the same as type chat.
+            if (message_text.lower().startswith("http")) and(
+                message_text.lower().endswith(".opml")):
+                url = message_text
+                await task.clean_tasks_xmpp(
+                    jid, ["status"])
+                status_type = "dnd"
+                status_message = (
+                    "📥️ Procesing request to import feeds ..."
+                    )
+                send_status_message(
+                    self, jid, status_type, status_message)
+                db_file = get_pathname_to_database(jid)
+                count = await action.import_opml(db_file, url)
+                if count:
+                    response = (
+                        "Successfully imported {} feeds"
+                        ).format(count)
+                else:
+                    response = (
+                        "OPML file was not imported."
+                        )
+                await task.clean_tasks_xmpp(
+                    jid, ["status"])
+                await task.start_tasks_xmpp(
+                    self, jid, ["status"])
+                send_reply_message(self, message, response)
             if message["type"] == "groupchat":
                 # nick = message["from"][message["from"].index("/")+1:]
                 nick = str(message["from"])
@@ -135,18 +168,26 @@ async def message(self, message):
             # await compose.message(self, jid, message)
-            message_text = " ".join(message["body"].split())
             if message["type"] == "groupchat":
                 message_text = message_text[1:]
             message_lowercase = message_text.lower()
-            print(current_time(), "ACCOUNT: " + str(message["from"]))
-            print(current_time(), "COMMAND:", message_text)
-            response = 0
+            logging.debug(
+                [str(message["from"]), ":", message_text])
+            response = None
             match message_lowercase:
                 # case "breakpoint":
                 #     if jid == get_value("accounts", "XMPP", "operator"):
                 #         breakpoint()
+                #         print("task_manager[jid]")
+                #         print(task_manager[jid])
+                #         await self.get_roster()
+                #         print("roster 1")
+                #         print(self.client_roster)
+                #         print("roster 2")
+                #         print(self.client_roster.keys())
+                #         print("jid")
+                #         print(jid)
                 #     else:
                 #         response = (
                 #             "This action is restricted. "
@@ -171,15 +212,6 @@ async def message(self, message):
                         "Send \"help\" for instructions.\n"
                         )
                     send_reply_message(self, message, response)
-                    # print("task_manager[jid]")
-                    # print(task_manager[jid])
-                    await self.get_roster()
-                    print("roster 1")
-                    print(self.client_roster)
-                    print("roster 2")
-                    print(self.client_roster.keys())
-                    print("jid")
-                    print(jid)
 
                 # case _ if message_lowercase.startswith("activate"):
                 #     if message["type"] == "groupchat":
@@ -242,8 +274,8 @@ async def message(self, message):
                         response = (
                             "> {}\nNews source \"{}\" is already "
                             "listed in the subscription list at "
-                            "index {}".format(url, name, ix)
-                            )
+                            "index {}"
+                            ).format(url, name, ix)
                     else:
                         response = "Missing URL."
                     send_reply_message(self, message, response)
@@ -406,32 +438,32 @@ async def message(self, message):
                            message_lowercase.startswith("gopher:")):
                     response = "Gemini and Gopher are not supported yet."
                     send_reply_message(self, message, response)
-                case _ if (message_lowercase.startswith("http")) and(
-                           message_lowercase.endswith(".opml")):
-                    url = message_text
-                    await task.clean_tasks_xmpp(
-                        jid, ["status"])
-                    status_type = "dnd"
-                    status_message = (
-                        "📥️ Procesing request to import feeds ..."
-                        )
-                    send_status_message(
-                        self, jid, status_type, status_message)
-                    db_file = get_pathname_to_database(jid)
-                    count = await action.import_opml(db_file, url)
-                    if count:
-                        response = (
-                            "Successfully imported {} feeds"
-                            ).format(count)
-                    else:
-                        response = (
-                            "OPML file was not imported."
-                            )
-                    await task.clean_tasks_xmpp(
-                        jid, ["status"])
-                    await task.start_tasks_xmpp(
-                        self, jid, ["status"])
-                    send_reply_message(self, message, response)
+                # case _ if (message_lowercase.startswith("http")) and(
+                #            message_lowercase.endswith(".opml")):
+                #     url = message_text
+                #     await task.clean_tasks_xmpp(
+                #         jid, ["status"])
+                #     status_type = "dnd"
+                #     status_message = (
+                #         "📥️ Procesing request to import feeds ..."
+                #         )
+                #     send_status_message(
+                #         self, jid, status_type, status_message)
+                #     db_file = get_pathname_to_database(jid)
+                #     count = await action.import_opml(db_file, url)
+                #     if count:
+                #         response = (
+                #             "Successfully imported {} feeds"
+                #             ).format(count)
+                #     else:
+                #         response = (
+                #             "OPML file was not imported."
+                #             )
+                #     await task.clean_tasks_xmpp(
+                #         jid, ["status"])
+                #     await task.start_tasks_xmpp(
+                #         self, jid, ["status"])
+                #     send_reply_message(self, message, response)
                 case _ if (message_lowercase.startswith("http") or
                            message_lowercase.startswith("feed:")):
                     url = message_text
@@ -447,7 +479,8 @@ async def message(self, message):
                     url = uri.feed_to_http(url)
                     url = (uri.replace_hostname(url, "feed")) or url
                     db_file = get_pathname_to_database(jid)
-                    response = await action.add_feed(db_file, url)
+                    response = await action.add_feed(
+                        db_file, url)
                     await task.clean_tasks_xmpp(
                         jid, ["status"])
                     await task.start_tasks_xmpp(
@@ -458,8 +491,10 @@ async def message(self, message):
                     if query:
                         if len(query) > 3:
                             db_file = get_pathname_to_database(jid)
-                            result = await sqlite.search_feeds(db_file, query)
-                            response = action.list_feeds_by_query(query, result)
+                            result = await sqlite.search_feeds(
+                                db_file, query)
+                            response = action.list_feeds_by_query(
+                                query, result)
                         else:
                             response = (
                                 "Enter at least 4 characters to search"
@@ -685,16 +720,19 @@ async def message(self, message):
                                 db_file, ix)
                             response = (
                                 "> {}\nNews source {} has been removed "
-                                "from subscription list.").format(url, ix)
+                                "from subscription list."
+                                ).format(url, ix)
                         except:
                             response = (
-                                "No news source with ID {}.".format(ix))
+                                "No news source with ID {}."
+                                ).format(ix)
                     except:
                         url = ix_url
                         await sqlite.remove_feed_by_url(db_file, url)
                         response = (
                             "> {}\nNews source has been removed "
-                            "from subscription list.").format(url)
+                            "from subscription list."
+                            ).format(url)
                     # await refresh_task(
                     #     self,
                     #     jid,


@@ -216,7 +216,7 @@ def print_help():
         " info\n"
         " Print information page.\n"
         " support\n"
-        " Join xmpp:slixmpp@muc.poez.io?join\n"
+        " Join xmpp:slixfeed@chat.woodpeckersnest.space?join\n"
         # "\n"
         # "PROTOCOLS\n"
         # " Supported prootcols are IRC, Matrix and XMPP.\n"