Improve code of module crawl.py

This commit is contained in:
Schimon Jehudah 2024-01-09 12:34:10 +00:00
parent 956ce69fcb
commit 9709c052ee
4 changed files with 267 additions and 267 deletions

View file

@ -81,7 +81,7 @@ def is_feed(feed):
True or False.
"""
value = False
message = None
# message = None
if not feed.entries:
if "version" in feed.keys():
feed["version"]
@ -110,7 +110,6 @@ def is_feed(feed):
# message = (
# "Good feed for {}"
# ).format(url)
print(message)
return value
@ -402,15 +401,11 @@ async def add_feed(db_file, url):
else:
result = await crawl.probe_page(
url, document)
# TODO Check length and for a write a
# unified message for a set of feeds.
# Use logging if you so choose to
# distinct the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
if isinstance(result, str):
response = result
break
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
@ -480,15 +475,11 @@ async def view_feed(url):
else:
result = await crawl.probe_page(
url, document)
# TODO Check length and for a write a
# unified message for a set of feeds.
# Use logging if you so choose to
# distinct the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
if isinstance(result, str):
response = result
break
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
@ -553,15 +544,11 @@ async def view_entry(url, num):
else:
result = await crawl.probe_page(
url, document)
# TODO Check length and for a write a
# unified message for a set of feeds.
# Use logging if you so choose to
# distinct the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
if isinstance(result, str):
response = result
break
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
@ -660,8 +647,11 @@ async def scan(db_file, url):
db_file, "filter-deny", string)
if reject_list:
read_status = 1
logging.debug(
"Rejected due to keyword {}".format(reject_list))
if isinstance(date, int):
logging.error("Variable 'date' is int:", date)
logging.error(
"Variable 'date' is int: {}".format(date))
await sqlite.add_entry(
db_file, title, link, entry_id,
url, date, read_status)
@ -723,7 +713,7 @@ async def organize_items(db_file, urls):
IncompleteRead,
error.URLError
) as e:
print(e)
logging.error(e)
# TODO Print error to log
# None
# NOTE I don't think there should be "return"

View file

@ -19,6 +19,7 @@ TODO
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import html
import slixfeed.config as config
from slixfeed.fetch import download_feed
@ -88,15 +89,20 @@ async def probe_page(url, document):
"> {}\nFailed to parse URL as feed."
).format(url)
if not result:
print("RSS Auto-Discovery Engaged")
logging.debug(
"Feed auto-discovery engaged for {}".format(url))
result = await feed_mode_auto_discovery(url, tree)
if not result:
print("RSS Scan Mode Engaged")
logging.debug(
"Feed link scan mode engaged for {}".format(url))
result = await feed_mode_scan(url, tree)
if not result:
print("RSS Arbitrary Mode Engaged")
result = await feed_mode_request(url, tree)
logging.debug(
"Feed arbitrary mode engaged for {}".format(url))
result = await feed_mode_guess(url, tree)
if not result:
logging.debug(
"No feeds were found for {}".format(url))
result = (
"> {}\nNo news feeds were found for URL."
).format(url)
@ -104,7 +110,7 @@ async def probe_page(url, document):
# TODO Improve scan by gradual decreasing of path
async def feed_mode_request(url, tree):
async def feed_mode_guess(url, tree):
"""
Lookup for feeds by pathname using HTTP Requests.
@ -122,94 +128,26 @@ async def feed_mode_request(url, tree):
msg : str
Message with URLs.
"""
feeds = {}
urls = []
parted_url = urlsplit(url)
paths = config.get_list("lists.yaml", "pathnames")
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
paths.extend(
[".atom", ".feed", ".rdf", ".rss"]
) if '.rss' not in paths else -1
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
for path in paths:
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
# print(parse(res[0])["feed"]["title"])
# feeds[address] = parse(res[0])["feed"]["title"]
try:
title = parse(res[0])["feed"]["title"]
except:
title = '*** No Title ***'
feeds[address] = title
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
paths.extend(
[".atom", ".feed", ".rdf", ".rss"]
) if '.rss' not in paths else -1
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
parted_url.path.split('/')[1] + path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds)
except:
continue
# TODO return feeds
if len(feeds) > 1:
counter = 0
msg = (
"RSS URL discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
# AttributeError: 'str' object has no attribute 'entries'
try:
feed_amnt = len(feeds[feed].entries)
except:
continue
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title: {}\n"
"Link : {}\n"
"Items: {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
address = join_url(url, parted_url.path.split('/')[1] + path)
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_guess")
urls = await process_feed_selection(url, urls)
return urls
async def feed_mode_scan(url, tree):
@ -230,9 +168,7 @@ async def feed_mode_scan(url, tree):
msg : str
Message with URLs.
"""
feeds = {}
# paths = []
# TODO Test
urls = []
paths = config.get_list("lists.yaml", "pathnames")
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
@ -242,91 +178,16 @@ async def feed_mode_scan(url, tree):
addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
addresses += tree.xpath(xpath_query)
parted_url = urlsplit(url)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
# print(address.xpath('@href')[0])
# print(addresses)
address = address.xpath('@href')[0]
if "/" not in address:
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
if address.startswith('/'):
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds[address])
# breakpoint()
# print(feeds)
except:
continue
# TODO return feeds
if len(feeds) > 1:
# print(feeds)
# breakpoint()
counter = 0
msg = (
"RSS URL scan has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
# try:
# res = await download_feed(feed)
# except:
# continue
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
feed_amnt = len(feeds[feed].entries)
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
address = join_url(url, address.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_scan")
urls = await process_feed_selection(url, urls)
return urls
async def feed_mode_auto_discovery(url, tree):
@ -358,11 +219,8 @@ async def feed_mode_auto_discovery(url, tree):
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
# TODO return feeds
if len(feeds) > 1:
msg = (
"RSS Auto-Discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
if feeds:
urls = []
for feed in feeds:
# # The following code works;
# # The following code will catch
@ -373,15 +231,129 @@ async def feed_mode_auto_discovery(url, tree):
# disco = parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
feed_name = feed.xpath('@title')[0]
feed_addr = join_url(url, feed.xpath('@href')[0])
# feed_name = feed.xpath('@title')[0]
# feed_addr = join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
return msg
elif feeds:
feed_addr = join_url(url, feeds[0].xpath('@href')[0])
return [feed_addr]
address = join_url(url, feed.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_auto_discovery")
urls = await process_feed_selection(url, urls)
return urls
# TODO Consider splitting this function in two: one function that
# returns URLs (string) and Feeds (dict), and another function
# that composes the text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
    """
    Download candidate feed URLs and report those that hold entries.

    Parameters
    ----------
    url : str
        URL the candidates were collected for. It is only used in the
        text of the composed message.
    urls : list
        Candidate feed URLs to download and parse.

    Returns
    -------
    result : str or list or None
        Message listing the feeds when more than one candidate has
        entries, a single-item list with the feed URL when exactly
        one candidate has entries, otherwise None.
    """
    feeds = {}
    for address in urls:
        res = await download_feed(address)
        # Item 1 of the download result is compared against 200 and
        # item 0 is what gets handed to the parser.
        if res[1] != 200:
            continue
        try:
            feeds[address] = parse(res[0])
        except Exception as e:
            # A candidate that cannot be parsed is skipped, but the
            # reason is recorded instead of being silently dropped.
            logging.warning(
                "Failed to parse %s as feed: %s", address, e)
            continue
    parts = [
        "Web feeds found for {}\n\n```\n".format(url)
        ]
    counter = 0
    feed_url_mark = None
    for feed_url, feed in feeds.items():
        feed_name = feed["feed"].get("title") or "Untitled"
        entries = feed.get("entries")
        if not entries:
            # Missing or empty entries: not listed and not counted.
            continue
        # NOTE Because there could be many false positives
        #      which are revealed in second phase of scan, we
        #      could end with a single feed, which would be
        #      listed instead of fetched, so feed_url_mark is
        #      utilized in order to make fetch possible.
        feed_url_mark = [feed_url]
        counter += 1
        parts.append(
            (
                "Title : {}\n"
                "Link : {}\n"
                "\n"
                ).format(feed_name, feed_url)
            )
    if counter > 1:
        parts.append(
            "```\nTotal of {} feeds.".format(counter)
            )
        return "".join(parts)
    # Either a single-item list with the only feed that has entries,
    # or None when no candidate qualified.
    return feed_url_mark
# def get_discovered_feeds(url, urls):
# message = (
# "Found {} web feeds:\n\n```\n"
# ).format(len(urls))
# if len(urls) > 1:
# for urls in urls:
# message += (
# "Title : {}\n"
# "Link : {}\n"
# "\n"
# ).format(url, url.title)
# message += (
# "```\nThe above feeds were extracted from\n{}"
# ).format(url)
# elif len(urls) > 0:
# result = urls
# else:
# message = (
# "No feeds were found for {}"
# ).format(url)
# return result
# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection
# async def start(url):
# while True:
# result = await fetch.download_feed(url)
# document = result[0]
# status = result[1]
# if document:
# feed = parse(document)
# if is_feed(feed):
# print(url)
# else:
# urls = await probe_page(
# url, document)
# if len(urls) > 1:
# await process_feed_selection(urls)
# elif urls:
# url = urls[0]
# else:
# response = (
# "> {}\nFailed to load URL. Reason: {}"
# ).format(url, status)
# break
# return response
# url = "https://www.smh.com.au/rssheadlines"
# start(url)

View file

@ -18,6 +18,7 @@ TODO
"""
import logging
import os
import slixfeed.action as action
from slixfeed.config import (
@ -78,6 +79,38 @@ async def message(self, message):
"""
if message["type"] in ("chat", "groupchat", "normal"):
jid = message["from"].bare
message_text = " ".join(message["body"].split())
# NOTE This is an exceptional case in which we treat
# type groupchat the same as type chat.
if (message_text.lower().startswith("http")) and(
message_text.lower().endswith(".opml")):
url = message_text
await task.clean_tasks_xmpp(
jid, ["status"])
status_type = "dnd"
status_message = (
"📥️ Procesing request to import feeds ..."
)
send_status_message(
self, jid, status_type, status_message)
db_file = get_pathname_to_database(jid)
count = await action.import_opml(db_file, url)
if count:
response = (
"Successfully imported {} feeds"
).format(count)
else:
response = (
"OPML file was not imported."
)
await task.clean_tasks_xmpp(
jid, ["status"])
await task.start_tasks_xmpp(
self, jid, ["status"])
send_reply_message(self, message, response)
if message["type"] == "groupchat":
# nick = message["from"][message["from"].index("/")+1:]
nick = str(message["from"])
@ -135,18 +168,26 @@ async def message(self, message):
# await compose.message(self, jid, message)
message_text = " ".join(message["body"].split())
if message["type"] == "groupchat":
message_text = message_text[1:]
message_lowercase = message_text.lower()
print(current_time(), "ACCOUNT: " + str(message["from"]))
print(current_time(), "COMMAND:", message_text)
response = 0
logging.debug(
[str(message["from"]), ":", message_text])
response = None
match message_lowercase:
# case "breakpoint":
# if jid == get_value("accounts", "XMPP", "operator"):
# breakpoint()
# print("task_manager[jid]")
# print(task_manager[jid])
# await self.get_roster()
# print("roster 1")
# print(self.client_roster)
# print("roster 2")
# print(self.client_roster.keys())
# print("jid")
# print(jid)
# else:
# response = (
# "This action is restricted. "
@ -171,15 +212,6 @@ async def message(self, message):
"Send \"help\" for instructions.\n"
)
send_reply_message(self, message, response)
# print("task_manager[jid]")
# print(task_manager[jid])
await self.get_roster()
print("roster 1")
print(self.client_roster)
print("roster 2")
print(self.client_roster.keys())
print("jid")
print(jid)
# case _ if message_lowercase.startswith("activate"):
# if message["type"] == "groupchat":
@ -242,8 +274,8 @@ async def message(self, message):
response = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
"index {}"
).format(url, name, ix)
else:
response = "Missing URL."
send_reply_message(self, message, response)
@ -406,32 +438,32 @@ async def message(self, message):
message_lowercase.startswith("gopher:")):
response = "Gemini and Gopher are not supported yet."
send_reply_message(self, message, response)
case _ if (message_lowercase.startswith("http")) and(
message_lowercase.endswith(".opml")):
url = message_text
await task.clean_tasks_xmpp(
jid, ["status"])
status_type = "dnd"
status_message = (
"📥️ Procesing request to import feeds ..."
)
send_status_message(
self, jid, status_type, status_message)
db_file = get_pathname_to_database(jid)
count = await action.import_opml(db_file, url)
if count:
response = (
"Successfully imported {} feeds"
).format(count)
else:
response = (
"OPML file was not imported."
)
await task.clean_tasks_xmpp(
jid, ["status"])
await task.start_tasks_xmpp(
self, jid, ["status"])
send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")):
# url = message_text
# await task.clean_tasks_xmpp(
# jid, ["status"])
# status_type = "dnd"
# status_message = (
# "📥️ Procesing request to import feeds ..."
# )
# send_status_message(
# self, jid, status_type, status_message)
# db_file = get_pathname_to_database(jid)
# count = await action.import_opml(db_file, url)
# if count:
# response = (
# "Successfully imported {} feeds"
# ).format(count)
# else:
# response = (
# "OPML file was not imported."
# )
# await task.clean_tasks_xmpp(
# jid, ["status"])
# await task.start_tasks_xmpp(
# self, jid, ["status"])
# send_reply_message(self, message, response)
case _ if (message_lowercase.startswith("http") or
message_lowercase.startswith("feed:")):
url = message_text
@ -447,7 +479,8 @@ async def message(self, message):
url = uri.feed_to_http(url)
url = (uri.replace_hostname(url, "feed")) or url
db_file = get_pathname_to_database(jid)
response = await action.add_feed(db_file, url)
response = await action.add_feed(
db_file, url)
await task.clean_tasks_xmpp(
jid, ["status"])
await task.start_tasks_xmpp(
@ -458,8 +491,10 @@ async def message(self, message):
if query:
if len(query) > 3:
db_file = get_pathname_to_database(jid)
result = await sqlite.search_feeds(db_file, query)
response = action.list_feeds_by_query(query, result)
result = await sqlite.search_feeds(
db_file, query)
response = action.list_feeds_by_query(
query, result)
else:
response = (
"Enter at least 4 characters to search"
@ -506,11 +541,11 @@ async def message(self, message):
await groupchat.join(self, jid, muc_jid)
response = (
"Joined groupchat {}"
).format(message_text)
).format(message_text)
else:
response = (
"> {}\nXMPP URI is not valid."
).format(message_text)
).format(message_text)
send_reply_message(self, message, response)
case _ if message_lowercase.startswith("length"):
key = message_text[:6]
@ -685,16 +720,19 @@ async def message(self, message):
db_file, ix)
response = (
"> {}\nNews source {} has been removed "
"from subscription list.").format(url, ix)
"from subscription list."
).format(url, ix)
except:
response = (
"No news source with ID {}.".format(ix))
"No news source with ID {}."
).format(ix)
except:
url = ix_url
await sqlite.remove_feed_by_url(db_file, url)
response = (
"> {}\nNews source has been removed "
"from subscription list.").format(url)
"from subscription list."
).format(url)
# await refresh_task(
# self,
# jid,
@ -835,11 +873,11 @@ async def message(self, message):
await groupchat.join(self, jid, muc_jid)
response = (
"Joined groupchat {}"
).format(message_text)
).format(message_text)
else:
response = (
"> {}\nXMPP URI is not valid."
).format(message_text)
).format(message_text)
send_reply_message(self, message, response)
case _:
response = (

View file

@ -216,7 +216,7 @@ def print_help():
" info\n"
" Print information page.\n"
" support\n"
" Join xmpp:slixmpp@muc.poez.io?join\n"
" Join xmpp:slixfeed@chat.woodpeckersnest.space?join\n"
# "\n"
# "PROTOCOLS\n"
# " Supported protocols are IRC, Matrix and XMPP.\n"