diff --git a/slixfeed/__main__.py b/slixfeed/__main__.py
index 7f95840..86e3b5d 100644
--- a/slixfeed/__main__.py
+++ b/slixfeed/__main__.py
@@ -13,42 +13,44 @@ FIXME
 
 TODO
 
-0) from slixfeed.FILENAME import XYZ
-   See project feed2toot
+1) from slixfeed.FILENAME import XYZ
+   See project /chaica/feed2toot
 
-1) SQL prepared statements.
+2) SQL prepared statements;
 
-2) Machine Learning for scrapping Title, Link, Summary and Timstamp.
+3) Machine Learning for scraping Title, Link, Summary and Timestamp;
+   Scrape element (example: Liferea)
+   http://intertwingly.net/blog/
+   https://www.brandenburg.de/
 
-3) Set MUC subject
+4) Set MUC subject
    Feeds which entries are to be set as groupchat subject.
    Perhaps not, as it would require to check every feed for this setting.
-   Maybe a separate bot.
+   Maybe a separate bot;
 
-4) Support categories.
+5) Support categories;
 
-5) Default prepackaged list of feeds.
+6) XMPP commands;
 
-6) XMPP commands.
+7) Bot as transport;
 
-7) Bot as transport.
+8) OMEMO;
 
-8) OMEMO.
+9) Logging;
+   https://docs.python.org/3/howto/logging.html
 
-9) Logging.
+10) Readability
+    See project /buriy/python-readability
 
-10) Default feeds (e.g. Blacklisted News, TBOT etc.)
-
-11) Download and upload/send article (xHTML, xHTMLZ, Markdown, MHTML, TXT).
-    Use Readability.
+11) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, TXT).
 
 12) Fetch summary from URL, instead of storing summary, or
     Store 5 upcoming summaries.
     This would help making the database files smaller.
 
 13) Support protocol Gopher
-    https://github.com/michael-lazar/pygopherd
-    https://github.com/gopherball/gb
+    See project /michael-lazar/pygopherd
+    See project /gopherball/gb
 
 14) Support ActivityPub @person@domain (see Tip Of The Day).
 
@@ -60,7 +62,11 @@ TODO
 
 16) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
 
-17) See project offpunk/offblocklist.py
+17) See project /offpunk/offblocklist.py
+
+18) Search messages of government-regulated publishers, and promote other sources.
+    Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
+    However, you might want to get news from (1) (2) and (3) instead!
 
 """
 
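TODO item 2 above calls for SQL prepared statements. A minimal sketch of the idea using placeholder binding with Python's sqlite3; the table and column names are hypothetical, not Slixfeed's actual schema:

```python
# Sketch of a parameterized (prepared) statement for TODO item 2.
# Table and column names are hypothetical, not Slixfeed's schema.
import sqlite3

def add_entry(db_file, title, link):
    with sqlite3.connect(db_file) as conn:
        cur = conn.cursor()
        # The ? placeholders make sqlite3 bind the values itself,
        # instead of interpolating them into the SQL string.
        cur.execute(
            "INSERT INTO entries (title, link) VALUES (?, ?)",
            (title, link)
        )
```

Binding values this way removes the SQL-injection risk that string formatting carries, and lets SQLite reuse the compiled statement.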
""" diff --git a/slixfeed/datahandler.py b/slixfeed/datahandler.py index 5860ba2..86f5850 100644 --- a/slixfeed/datahandler.py +++ b/slixfeed/datahandler.py @@ -19,14 +19,14 @@ from asyncio import TimeoutError from asyncio.exceptions import IncompleteReadError from bs4 import BeautifulSoup from confighandler import get_list, get_value_default +from datetimehandler import now, rfc2822_to_iso8601 from email.utils import parseaddr from feedparser import parse from http.client import IncompleteRead -from lxml import html -from datetimehandler import now, rfc2822_to_iso8601 -from urlhandler import complete_url, join_url, trim_url from listhandler import is_listed +from lxml import html import sqlitehandler as sqlite +from urlhandler import complete_url, join_url, trim_url from urllib import error # from xml.etree.ElementTree import ElementTree, ParseError from urllib.parse import urljoin, urlsplit, urlunsplit @@ -202,8 +202,8 @@ async def download_updates(db_file, url=None): print("PROBLEM: date is int") print(date) # breakpoint() - print(source) - print(date) + # print(source) + # print(date) await sqlite.add_entry_and_set_date( db_file, source, @@ -261,9 +261,9 @@ async def view_feed(url): title = get_title(url, feed) entries = feed.entries msg = "Preview of {}:\n```\n".format(title) - count = 0 + counter = 0 for entry in entries: - count += 1 + counter += 1 if entry.has_key("title"): title = entry.title else: @@ -292,9 +292,9 @@ async def view_feed(url): title, date, link, - count + counter ) - if count > 4: + if counter > 4: break msg += ( "```\nSource: {}" @@ -446,7 +446,7 @@ async def add_feed(db_file, url): title = get_title(url, feed) if feed.bozo: bozo = ( - "Bozo detected. Failed to load: {}." + "Bozo detected. Failed to load: {}" ).format(url) print(bozo) msg = await probe_page(add_feed, url, res[0], db_file=db_file) @@ -505,7 +505,7 @@ async def probe_page(callback, url, doc, num=None, db_file=None): elif isinstance(msg, list): url = msg[0] if db_file: - print("if db_file", db_file) + # print("if db_file", db_file) return await callback(db_file, url) elif num: return await callback(url, num) @@ -531,6 +531,8 @@ async def download_feed(url): user_agent = await get_value_default("user-agent", "Network") except: user_agent = "Slixfeed/0.1" + if not len(user_agent): + user_agent = "Slixfeed/0.1" timeout = ClientTimeout(total=10) headers = {'User-Agent': user_agent} async with ClientSession(headers=headers) as session: @@ -597,6 +599,8 @@ def get_title(url, feed): title = feed["feed"]["title"] except: title = urlsplit(url).netloc + if not title: + title = urlsplit(url).netloc return title @@ -621,7 +625,7 @@ async def feed_mode_request(url, tree): """ feeds = {} parted_url = urlsplit(url) - paths = await get_list("pathnames") + paths = await get_list("pathnames", "lists.yaml") for path in paths: address = urlunsplit([ parted_url.scheme, @@ -693,7 +697,7 @@ async def feed_mode_request(url, tree): ).format(url) if not positive: msg = ( - "No feeds were found for {}." 
+ "No feeds were found for {}" ).format(url) return msg elif feeds: @@ -721,17 +725,21 @@ async def feed_mode_scan(url, tree): feeds = {} # paths = [] # TODO Test - paths = await get_list("pathnames") + paths = await get_list("pathnames", "lists.yaml") for path in paths: # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) - xpath_query = "//a[contains(@href,'{}')]".format(path) + # xpath_query = "//a[contains(@href,'{}')]".format(path) + num = 5 + xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num) addresses = tree.xpath(xpath_query) + xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num) + addresses += tree.xpath(xpath_query) parted_url = urlsplit(url) # NOTE Should number of addresses be limited or # perhaps be N from the start and N from the end for address in addresses: - print(address.xpath('@href')[0]) - print(addresses) + # print(address.xpath('@href')[0]) + # print(addresses) address = address.xpath('@href')[0] if "/" not in address: protocol = parted_url.scheme @@ -759,11 +767,15 @@ async def feed_mode_scan(url, tree): if res[1] == 200: try: feeds[address] = parse(res[0]) + # print(feeds[address]) + # breakpoint() # print(feeds) except: continue if len(feeds) > 1: - positive = 0 + # print(feeds) + # breakpoint() + counter = 0 msg = ( "RSS URL scan has found {} feeds:\n```\n" ).format(len(feeds)) @@ -779,23 +791,32 @@ async def feed_mode_scan(url, tree): feed_addr = feed feed_amnt = len(feeds[feed].entries) if feed_amnt: - positive = 1 + # NOTE Because there could be many false positives + # which are revealed in second phase of scan, we + # could end with a single feed, which would be + # listed instead of fetched, so feed_mark is + # utilized in order to make fetch possible. + feed_mark = [feed_addr] + counter += 1 msg += ( - "Title: {}\n" - " Link: {}\n" - "Count: {}\n" + "Title : {}\n" + "Link : {}\n" + "Count : {}\n" "\n" ).format( feed_name, feed_addr, feed_amnt ) - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - if not positive: + if counter > 1: + msg += ( + "```\nThe above feeds were extracted from\n{}" + ).format(url) + elif feed_mark: + return feed_mark + else: msg = ( - "No feeds were found for {}." + "No feeds were found for {}" ).format(url) return msg elif feeds: diff --git a/slixfeed/sqlitehandler.py b/slixfeed/sqlitehandler.py index d3a3abc..452d83e 100644 --- a/slixfeed/sqlitehandler.py +++ b/slixfeed/sqlitehandler.py @@ -471,7 +471,10 @@ async def get_entry_unread(db_file, num=None): title = result[1] summary = result[2] # Remove HTML tags - summary = BeautifulSoup(summary, "lxml").text + try: + summary = BeautifulSoup(summary, "lxml").text + except: + print(result[2]) # TODO Limit text length summary = summary.replace("\n\n\n", "\n\n") length = await get_settings_value(db_file, "length") diff --git a/slixfeed/taskhandler.py b/slixfeed/taskhandler.py index 9fe519c..d3a51f1 100644 --- a/slixfeed/taskhandler.py +++ b/slixfeed/taskhandler.py @@ -78,7 +78,7 @@ await taskhandler.start_tasks( """ async def start_tasks_xmpp(self, jid, tasks): - print("start_tasks_xmpp", jid, tasks) + # print("start_tasks_xmpp", jid, tasks) task_manager[jid] = {} for task in tasks: # print("task:", task) @@ -109,7 +109,7 @@ async def start_tasks_xmpp(self, jid, tasks): # await task async def clean_tasks_xmpp(jid, tasks): - print("clean_tasks_xmpp", jid, tasks) + # print("clean_tasks_xmpp", jid, tasks) for task in tasks: # if task_manager[jid][task]: try: @@ -132,7 +132,7 @@ Consider callback e.g. 
diff --git a/slixfeed/xmpphandler.py b/slixfeed/xmpphandler.py
index 0052a73..1596435 100644
--- a/slixfeed/xmpphandler.py
+++ b/slixfeed/xmpphandler.py
@@ -64,6 +64,7 @@ import listhandler as lister
 import sqlitehandler as sqlite
 import taskhandler as tasker
 import urlhandler as urlfixer
+from time import sleep
 from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
 # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
 
@@ -102,7 +103,6 @@ class Slixfeed(slixmpp.ClientXMPP):
         # The bot works fine when the nickname is hardcoded; or
         # The bot won't join some MUCs when its nickname has brackets
         self.nick = nick
-
         # The session_start event will be triggered when
         # the bot establishes its connection with the server
         # and the XML streams are ready for use. We want to
@@ -387,10 +387,12 @@ class Slixfeed(slixmpp.ClientXMPP):
            # print(current_time(),"Maximum connection attempts exceeded.")
            # logging.error("Maximum connection attempts exceeded.")
            print(current_time(), "Attempt number", self.connection_attempts)
-            self.reconnect(wait=5.0)
-            seconds = 5
+            seconds = 30
             print(current_time(), "Next attempt within", seconds, "seconds")
-            await asyncio.sleep(seconds)
+            # NOTE asyncio.sleep does not wait the expected interval here
+            # await asyncio.sleep(seconds)
+            sleep(seconds)
+            self.reconnect(wait=5.0)
 
 
     async def inspect_connection(self, event):
@@ -912,7 +914,7 @@ class Slixfeed(slixmpp.ClientXMPP):
             ["status"]
         )
         task = (
-            "📫️ Processing request to fetch data from {} ..."
+            "📫️ Processing request to fetch data from {}"
         ).format(url)
         process_task_message(self, jid, task)
         action = await initdb(
@@ -1080,8 +1082,9 @@ class Slixfeed(slixmpp.ClientXMPP):
                     action = (
                         "Only new items of newly added feeds will be sent."
                     )
-                case _ if message_lowercase.startswith("next"):
-                    num = message[5:]
+                # TODO Will you add support for number of messages?
+                case "next":
+                    # num = message[5:]
                     await tasker.clean_tasks_xmpp(
                         jid,
                         ["interval", "status"]
@@ -1137,13 +1140,15 @@ class Slixfeed(slixmpp.ClientXMPP):
                     else:
                         action = "Missing value."
                 case "random":
-                    action = "Updates will be sent randomly."
+                    # TODO /questions/2279706/select-random-row-from-a-sqlite-table
+                    # NOTE sqlitehandler.get_entry_unread
+                    action = "Updates will be sent in random order."
                 case _ if message_lowercase.startswith("read"):
                     data = message[5:]
                     data = data.split()
                     url = data[0]
                     task = (
-                        "📫️ Processing request to fetch data from {} ..."
+                        "📫️ Processing request to fetch data from {}"
                     ).format(url)
                     process_task_message(self, jid, task)
                     await tasker.clean_tasks_xmpp(
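The /questions/2279706 reference in the "random" case points at SQLite's ORDER BY RANDOM(). A sketch of how sqlitehandler.get_entry_unread could pick a random unread entry; table and column names are hypothetical, not Slixfeed's actual schema:

```python
# Sketch of random unread-entry selection for the "random" setting,
# following the StackOverflow thread referenced in the TODO above.
# Table and column names are hypothetical, not Slixfeed's schema.
import sqlite3

def get_random_unread(db_file):
    with sqlite3.connect(db_file) as conn:
        cur = conn.cursor()
        # ORDER BY RANDOM() shuffles the candidate rows;
        # LIMIT 1 keeps a single one of them.
        cur.execute(
            "SELECT title, link FROM entries "
            "WHERE read = 0 ORDER BY RANDOM() LIMIT 1"
        )
        return cur.fetchone()
```

ORDER BY RANDOM() scans every candidate row, so for very large tables the linked thread also discusses cheaper offset-based alternatives.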