From 7135994888a2b55244d3863dd41482a0a54b8dc8 Mon Sep 17 00:00:00 2001 From: Schimon Jehudah Date: Thu, 4 Jan 2024 01:16:24 +0000 Subject: [PATCH] Segregate code into more particular functions --- slixfeed/action.py | 369 +++++++++++++++++++ slixfeed/config.py | 19 +- slixfeed/crawl.py | 382 ++++++++++++++++++++ slixfeed/fetch.py | 759 +-------------------------------------- slixfeed/log.py | 33 ++ slixfeed/read.py | 74 ++++ slixfeed/sqlite.py | 8 +- slixfeed/task.py | 46 ++- slixfeed/url.py | 2 +- slixfeed/utility.py | 109 ------ slixfeed/xmpp/client.py | 12 +- slixfeed/xmpp/connect.py | 3 +- slixfeed/xmpp/process.py | 116 +++--- 13 files changed, 995 insertions(+), 937 deletions(-) create mode 100644 slixfeed/action.py create mode 100644 slixfeed/crawl.py create mode 100644 slixfeed/log.py create mode 100644 slixfeed/read.py delete mode 100644 slixfeed/utility.py diff --git a/slixfeed/action.py b/slixfeed/action.py new file mode 100644 index 0000000..790d443 --- /dev/null +++ b/slixfeed/action.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from asyncio.exceptions import IncompleteReadError +from bs4 import BeautifulSoup +from http.client import IncompleteRead +from feedparser import parse +import slixfeed.config as config +import slixfeed.crawl as crawl +from slixfeed.datetime import now, rfc2822_to_iso8601 +import slixfeed.fetch as fetch +import slixfeed.sqlite as sqlite +import slixfeed.read as read +import slixfeed.task as task +from slixfeed.url import complete_url, join_url, trim_url +from urllib import error +from urllib.parse import urlsplit + + +async def add_feed(db_file, url): + while True: + exist = await sqlite.is_feed_exist(db_file, url) + if not exist: + result = await fetch.download_feed([url]) + document = result[0] + status = result[1] + if document: + feed = parse(document) + # if read.is_feed(url, feed): + if read.is_feed(feed): + try: + title = feed["feed"]["title"] + except: + title = urlsplit(url).netloc + await sqlite.insert_feed( + db_file, url, title, status) + await organize_items( + db_file, [url]) + old = await sqlite.get_settings_value( + db_file, "old") + if not old: + await sqlite.mark_source_as_read( + db_file, url) + response = ( + "> {}\nNews source {} has been " + "added to subscription list." + ).format(url, title) + break + else: + result = await crawl.probe_page( + url, document) + # TODO Check length and for a write a + # unified message for a set of feeds. + # Use logging if you so choose to + # distinct the methods + if isinstance(result, list): + url = result[0] + elif isinstance(result, str): + response = result + break + else: + response = ( + "> {}\nFailed to load URL. Reason: {}" + ).format(url, status) + break + else: + ix = exist[0] + name = exist[1] + response = ( + "> {}\nNews source \"{}\" is already " + "listed in the subscription list at " + "index {}".format(url, name, ix) + ) + break + return response + + +async def view_feed(url): + while True: + result = await fetch.download_feed([url]) + document = result[0] + status = result[1] + if document: + feed = parse(document) + # if read.is_feed(url, feed): + if read.is_feed(feed): + try: + title = feed["feed"]["title"] + except: + title = urlsplit(url).netloc + entries = feed.entries + response = "Preview of {}:\n\n```\n".format(title) + counter = 0 + for entry in entries: + counter += 1 + if entry.has_key("title"): + title = entry.title + else: + title = "*** No title ***" + if entry.has_key("link"): + # link = complete_url(source, entry.link) + link = join_url(url, entry.link) + link = trim_url(link) + else: + link = "*** No link ***" + if entry.has_key("published"): + date = entry.published + date = rfc2822_to_iso8601(date) + elif entry.has_key("updated"): + date = entry.updated + date = rfc2822_to_iso8601(date) + else: + date = "*** No date ***" + response += ( + "Title : {}\n" + "Date : {}\n" + "Link : {}\n" + "Count : {}\n" + "\n" + ).format(title, date, link, counter) + if counter > 4: + break + response += ( + "```\nSource: {}" + ).format(url) + break + else: + result = await crawl.probe_page( + url, document) + # TODO Check length and for a write a + # unified message for a set of feeds. + # Use logging if you so choose to + # distinct the methods + if isinstance(result, list): + url = result[0] + elif isinstance(result, str): + response = result + break + else: + response = ( + "> {}\nFailed to load URL. Reason: {}" + ).format(url, status) + break + return response + + +async def view_entry(url, num): + while True: + result = await fetch.download_feed([url]) + document = result[0] + status = result[1] + if document: + feed = parse(document) + # if read.is_feed(url, feed): + if read.is_feed(feed): + try: + title = feed["feed"]["title"] + except: + title = urlsplit(url).netloc + entries = feed.entries + num = int(num) - 1 + entry = entries[num] + response = "Preview of {}:\n\n```\n".format(title) + if entry.has_key("title"): + title = entry.title + else: + title = "*** No title ***" + if entry.has_key("published"): + date = entry.published + date = rfc2822_to_iso8601(date) + elif entry.has_key("updated"): + date = entry.updated + date = rfc2822_to_iso8601(date) + else: + date = "*** No date ***" + if entry.has_key("summary"): + summary = entry.summary + # Remove HTML tags + summary = BeautifulSoup(summary, "lxml").text + # TODO Limit text length + summary = summary.replace("\n\n\n", "\n\n") + else: + summary = "*** No summary ***" + if entry.has_key("link"): + # link = complete_url(source, entry.link) + link = join_url(url, entry.link) + link = trim_url(link) + else: + link = "*** No link ***" + response = ( + "{}\n" + "\n" + # "> {}\n" + "{}\n" + "\n" + "{}\n" + "\n" + ).format(title, summary, link) + break + else: + result = await crawl.probe_page( + url, document) + # TODO Check length and for a write a + # unified message for a set of feeds. + # Use logging if you so choose to + # distinct the methods + if isinstance(result, list): + url = result[0] + elif isinstance(result, str): + response = result + break + else: + response = ( + "> {}\nFailed to load URL. Reason: {}" + ).format(url, status) + break + return response + + +# NOTE Why (if res[0]) and (if res[1] == 200)? +async def organize_items(db_file, urls): + """ + Check feeds for new entries. + + Parameters + ---------- + db_file : str + Path to database file. + url : str, optional + URL. The default is None. + """ + for url in urls: + # print(os.path.basename(db_file), url[0]) + source = url[0] + res = await fetch.download_feed(source) + # TypeError: 'NoneType' object is not subscriptable + if res is None: + # Skip to next feed + # urls.next() + # next(urls) + continue + await sqlite.update_source_status( + db_file, res[1], source) + if res[0]: + try: + feed = parse(res[0]) + if feed.bozo: + # bozo = ( + # "WARNING: Bozo detected for feed: {}\n" + # "For more information, visit " + # "https://pythonhosted.org/feedparser/bozo.html" + # ).format(source) + # print(bozo) + valid = 0 + else: + valid = 1 + await sqlite.update_source_validity( + db_file, source, valid) + except ( + IncompleteReadError, + IncompleteRead, + error.URLError + ) as e: + # print(e) + # TODO Print error to log + None + # NOTE I don't think there should be "return" + # because then we might stop scanning next URLs + # return + # TODO Place these couple of lines back down + # NOTE Need to correct the SQL statement to do so + # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW + if res[1] == 200: + # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW + # TODO Place these couple of lines back down + # NOTE Need to correct the SQL statement to do so + entries = feed.entries + # length = len(entries) + # await remove_entry(db_file, source, length) + await sqlite.remove_nonexistent_entries( + db_file, feed, source) + # new_entry = 0 + for entry in entries: + # TODO Pass date too for comparion check + if entry.has_key("published"): + date = entry.published + date = rfc2822_to_iso8601(date) + elif entry.has_key("updated"): + date = entry.updated + date = rfc2822_to_iso8601(date) + else: + # TODO Just set date = "*** No date ***" + # date = await datetime.now().isoformat() + date = now() + # NOTE Would seconds result in better database performance + # date = datetime.datetime(date) + # date = (date-datetime.datetime(1970,1,1)).total_seconds() + if entry.has_key("title"): + title = entry.title + # title = "{}: *{}*".format(feed["feed"]["title"], entry.title) + else: + title = date + # title = feed["feed"]["title"] + if entry.has_key("link"): + # link = complete_url(source, entry.link) + link = join_url(source, entry.link) + link = trim_url(link) + else: + link = source + if entry.has_key("id"): + eid = entry.id + else: + eid = link + exist = await sqlite.check_entry_exist( + db_file, source, eid=eid, + title=title, link=link, date=date) + if not exist: + # new_entry = new_entry + 1 + # TODO Enhance summary + if entry.has_key("summary"): + summary = entry.summary + # # Remove HTML tags + # summary = BeautifulSoup(summary, "lxml").text + # # TODO Limit text length + # summary = summary.replace("\n\n\n", "\n\n") + # summary = summary[:300] + " […]‍⃨" + # summary = summary.strip().split('\n') + # summary = ["> " + line for line in summary] + # summary = "\n".join(summary) + else: + summary = "> *** No summary ***" + read_status = 0 + pathname = urlsplit(link).path + string = ( + "{} {} {}" + ).format( + title, + summary, + pathname + ) + allow_list = await config.is_listed( + db_file, "filter-allow", string) + if not allow_list: + reject_list = await config.is_listed( + db_file, "filter-deny", string) + if reject_list: + # print(">>> REJECTED", title) + summary = ( + "REJECTED {}".format( + reject_list.upper() + ) + ) + # summary = "" + read_status = 1 + entry = ( + title, link, eid, source, date, read_status) + if isinstance(date, int): + print("PROBLEM: date is int") + print(date) + # breakpoint() + # print(source) + # print(date) + await sqlite.add_entry_and_set_date( + db_file, source, entry) + # print(current_time(), entry, title) + # else: + # print(current_time(), exist, title) + + diff --git a/slixfeed/config.py b/slixfeed/config.py index b400ebc..98d35e7 100644 --- a/slixfeed/config.py +++ b/slixfeed/config.py @@ -59,8 +59,9 @@ def get_value(filename, section, keys): for key in keys: try: value = section_res[key] - logging.debug("Found value {} for key {}".format( - value, key)) + logging.debug( + "Found value {} for key {}".format(value, key) + ) except: value = '' logging.error("Missing key:", key) @@ -70,7 +71,8 @@ def get_value(filename, section, keys): try: result = section_res[key] logging.debug( - "Found value {} for key {}".format(result, key)) + "Found value {} for key {}".format(result, key) + ) except: result = '' # logging.error("Missing key:", key) @@ -78,7 +80,8 @@ def get_value(filename, section, keys): logging.error( "Check configuration file {}.ini for " "missing key(s) \"{}\" under section [{}].".format( - filename, keys, section)) + filename, keys, section) + ) else: return result @@ -171,7 +174,9 @@ def get_default_dbdir(): else: return os.path.abspath('.') else: - data_home = os.path.join(os.environ.get('HOME'), '.local', 'share') + data_home = os.path.join( + os.environ.get('HOME'), '.local', 'share' + ) return os.path.join(data_home, 'slixfeed') @@ -200,7 +205,9 @@ def get_default_confdir(): else: return os.path.abspath('.') else: - config_home = os.path.join(os.environ.get('HOME'), '.config') + config_home = os.path.join( + os.environ.get('HOME'), '.config' + ) return os.path.join(config_home, 'slixfeed') diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py new file mode 100644 index 0000000..75caf4e --- /dev/null +++ b/slixfeed/crawl.py @@ -0,0 +1,382 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" + +TODO + +1.1) Do not compose messages. + +1.2) Return URLs, nothing else other (e.g. processed messages). + +1.3) Correction of URLs is aceptable. + +""" + +from aiohttp import ClientError, ClientSession, ClientTimeout +from feedparser import parse +from lxml import html +import slixfeed.config as config +from slixfeed.fetch import download_feed +from slixfeed.url import complete_url, join_url, trim_url +from urllib.parse import urlsplit, urlunsplit + + +# TODO Use boolean as a flag to determine whether a single URL was found +# async def probe_page( +# callback, url, document, num=None, db_file=None): +# result = None +# try: +# # tree = etree.fromstring(res[0]) # etree is for xml +# tree = html.fromstring(document) +# except: +# result = ( +# "> {}\nFailed to parse URL as feed." +# ).format(url) +# if not result: +# print("RSS Auto-Discovery Engaged") +# result = await feed_mode_auto_discovery(url, tree) +# if not result: +# print("RSS Scan Mode Engaged") +# result = await feed_mode_scan(url, tree) +# if not result: +# print("RSS Arbitrary Mode Engaged") +# result = await feed_mode_request(url, tree) +# if not result: +# result = ( +# "> {}\nNo news feeds were found for URL." +# ).format(url) +# # elif msg: +# else: +# if isinstance(result, str): +# return result +# elif isinstance(result, list): +# url = result[0] +# if db_file: +# # print("if db_file", db_file) +# return await callback(db_file, url) +# elif num: +# return await callback(url, num) +# else: +# return await callback(url) + + +async def probe_page(url, document): + """ + Parameters + ---------- + url : str + URL. + document : TYPE + DESCRIPTION. + + Returns + ------- + result : list or str + Single URL as list or selection of URLs as str. + """ + result = None + try: + # tree = etree.fromstring(res[0]) # etree is for xml + tree = html.fromstring(document) + except: + result = ( + "> {}\nFailed to parse URL as feed." + ).format(url) + if not result: + print("RSS Auto-Discovery Engaged") + result = await feed_mode_auto_discovery(url, tree) + if not result: + print("RSS Scan Mode Engaged") + result = await feed_mode_scan(url, tree) + if not result: + print("RSS Arbitrary Mode Engaged") + result = await feed_mode_request(url, tree) + if not result: + result = ( + "> {}\nNo news feeds were found for URL." + ).format(url) + return result + + +# TODO Improve scan by gradual decreasing of path +async def feed_mode_request(url, tree): + """ + Lookup for feeds by pathname using HTTP Requests. + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. + """ + feeds = {} + parted_url = urlsplit(url) + paths = config.get_list("lists.yaml") + paths = paths["pathnames"] + for path in paths: + address = urlunsplit([ + parted_url.scheme, + parted_url.netloc, + path, + None, + None + ]) + res = await download_feed(address) + if res[1] == 200: + # print(parse(res[0])["feed"]["title"]) + # feeds[address] = parse(res[0])["feed"]["title"] + try: + title = parse(res[0])["feed"]["title"] + except: + title = '*** No Title ***' + feeds[address] = title + # Check whether URL has path (i.e. not root) + # Check parted_url.path to avoid error in case root wasn't given + # TODO Make more tests + if parted_url.path and parted_url.path.split('/')[1]: + paths.extend( + [".atom", ".feed", ".rdf", ".rss"] + ) if '.rss' not in paths else -1 + # if paths.index('.rss'): + # paths.extend([".atom", ".feed", ".rdf", ".rss"]) + address = urlunsplit([ + parted_url.scheme, + parted_url.netloc, + parted_url.path.split('/')[1] + path, + None, + None + ]) + res = await download_feed(address) + if res[1] == 200: + try: + feeds[address] = parse(res[0]) + # print(feeds) + except: + continue + if len(feeds) > 1: + counter = 0 + msg = ( + "RSS URL discovery has found {} feeds:\n\n```\n" + ).format(len(feeds)) + feed_mark = 0 + for feed in feeds: + try: + feed_name = feeds[feed]["feed"]["title"] + except: + feed_name = urlsplit(feed).netloc + feed_addr = feed + # AttributeError: 'str' object has no attribute 'entries' + try: + feed_amnt = len(feeds[feed].entries) + except: + continue + if feed_amnt: + # NOTE Because there could be many false positives + # which are revealed in second phase of scan, we + # could end with a single feed, which would be + # listed instead of fetched, so feed_mark is + # utilized in order to make fetch possible. + feed_mark = [feed_addr] + counter += 1 + msg += ( + "Title: {}\n" + "Link : {}\n" + "Items: {}\n" + "\n" + ).format(feed_name, feed_addr, feed_amnt) + if counter > 1: + msg += ( + "```\nThe above feeds were extracted from\n{}" + ).format(url) + elif feed_mark: + return feed_mark + else: + msg = ( + "No feeds were found for {}" + ).format(url) + return msg + elif feeds: + return feeds + + +async def feed_mode_scan(url, tree): + """ + Scan page for potential feeds by pathname. + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. + """ + feeds = {} + # paths = [] + # TODO Test + paths = config.get_list("lists.yaml") + paths = paths["pathnames"] + for path in paths: + # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) + # xpath_query = "//a[contains(@href,'{}')]".format(path) + num = 5 + xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num) + addresses = tree.xpath(xpath_query) + xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num) + addresses += tree.xpath(xpath_query) + parted_url = urlsplit(url) + # NOTE Should number of addresses be limited or + # perhaps be N from the start and N from the end + for address in addresses: + # print(address.xpath('@href')[0]) + # print(addresses) + address = address.xpath('@href')[0] + if "/" not in address: + protocol = parted_url.scheme + hostname = parted_url.netloc + pathname = address + address = urlunsplit([ + protocol, + hostname, + pathname, + None, + None + ]) + if address.startswith('/'): + protocol = parted_url.scheme + hostname = parted_url.netloc + pathname = address + address = urlunsplit([ + protocol, + hostname, + pathname, + None, + None + ]) + res = await download_feed(address) + if res[1] == 200: + try: + feeds[address] = parse(res[0]) + # print(feeds[address]) + # breakpoint() + # print(feeds) + except: + continue + if len(feeds) > 1: + # print(feeds) + # breakpoint() + counter = 0 + msg = ( + "RSS URL scan has found {} feeds:\n\n```\n" + ).format(len(feeds)) + feed_mark = 0 + for feed in feeds: + # try: + # res = await download_feed(feed) + # except: + # continue + try: + feed_name = feeds[feed]["feed"]["title"] + except: + feed_name = urlsplit(feed).netloc + feed_addr = feed + feed_amnt = len(feeds[feed].entries) + if feed_amnt: + # NOTE Because there could be many false positives + # which are revealed in second phase of scan, we + # could end with a single feed, which would be + # listed instead of fetched, so feed_mark is + # utilized in order to make fetch possible. + feed_mark = [feed_addr] + counter += 1 + msg += ( + "Title : {}\n" + "Link : {}\n" + "Count : {}\n" + "\n" + ).format(feed_name, feed_addr, feed_amnt) + if counter > 1: + msg += ( + "```\nThe above feeds were extracted from\n{}" + ).format(url) + elif feed_mark: + return feed_mark + else: + msg = ( + "No feeds were found for {}" + ).format(url) + return msg + elif feeds: + return feeds + + +async def feed_mode_auto_discovery(url, tree): + """ + Lookup for feeds using RSS autodiscovery technique. + + See: https://www.rssboard.org/rss-autodiscovery + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. + """ + xpath_query = ( + '//link[(@rel="alternate") and ' + '(@type="application/atom+xml" or ' + '@type="application/rdf+xml" or ' + '@type="application/rss+xml")]' + ) + # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href""" + # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href" + feeds = tree.xpath(xpath_query) + if len(feeds) > 1: + msg = ( + "RSS Auto-Discovery has found {} feeds:\n\n```\n" + ).format(len(feeds)) + for feed in feeds: + # # The following code works; + # # The following code will catch + # # only valid resources (i.e. not 404); + # # The following code requires more bandwidth. + # res = await download_feed(feed) + # if res[0]: + # disco = parse(res[0]) + # title = disco["feed"]["title"] + # msg += "{} \n {} \n\n".format(title, feed) + feed_name = feed.xpath('@title')[0] + feed_addr = join_url(url, feed.xpath('@href')[0]) + # if feed_addr.startswith("/"): + # feed_addr = url + feed_addr + msg += "{}\n{}\n\n".format(feed_name, feed_addr) + msg += ( + "```\nThe above feeds were extracted from\n{}" + ).format(url) + return msg + elif feeds: + feed_addr = join_url(url, feeds[0].xpath('@href')[0]) + return [feed_addr] diff --git a/slixfeed/fetch.py b/slixfeed/fetch.py index fe47f19..9cadd22 100644 --- a/slixfeed/fetch.py +++ b/slixfeed/fetch.py @@ -33,454 +33,24 @@ from http.client import IncompleteRead from lxml import html import slixfeed.config as config from slixfeed.datetime import now, rfc2822_to_iso8601 -import slixfeed.utility as utility import slixfeed.sqlite as sqlite from slixfeed.url import complete_url, join_url, trim_url from urllib import error # from xml.etree.ElementTree import ElementTree, ParseError from urllib.parse import urlsplit, urlunsplit -# NOTE Why (if res[0]) and (if res[1] == 200)? -async def download_updates(db_file, url=None): - """ - Check feeds for new entries. - Parameters - ---------- - db_file : str - Path to database file. - url : str, optional - URL. The default is None. - """ - if url: - urls = [url] # Valid [url] and [url,] and (url,) - else: - urls = await sqlite.get_feeds_url(db_file) - for url in urls: - # print(os.path.basename(db_file), url[0]) - source = url[0] - res = await download_feed(source) - # TypeError: 'NoneType' object is not subscriptable - if res is None: - # Skip to next feed - # urls.next() - # next(urls) - continue - await sqlite.update_source_status( - db_file, res[1], source) - if res[0]: - try: - feed = parse(res[0]) - if feed.bozo: - # bozo = ( - # "WARNING: Bozo detected for feed: {}\n" - # "For more information, visit " - # "https://pythonhosted.org/feedparser/bozo.html" - # ).format(source) - # print(bozo) - valid = 0 - else: - valid = 1 - await sqlite.update_source_validity( - db_file, source, valid) - except ( - IncompleteReadError, - IncompleteRead, - error.URLError - ) as e: - # print(e) - # TODO Print error to log - None - # NOTE I don't think there should be "return" - # because then we might stop scanning next URLs - # return - # TODO Place these couple of lines back down - # NOTE Need to correct the SQL statement to do so - # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW - if res[1] == 200: - # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW - # TODO Place these couple of lines back down - # NOTE Need to correct the SQL statement to do so - entries = feed.entries - # length = len(entries) - # await remove_entry(db_file, source, length) - await sqlite.remove_nonexistent_entries( - db_file, feed, source) - # new_entry = 0 - for entry in entries: - # TODO Pass date too for comparion check - if entry.has_key("published"): - date = entry.published - date = rfc2822_to_iso8601(date) - elif entry.has_key("updated"): - date = entry.updated - date = rfc2822_to_iso8601(date) - else: - # TODO Just set date = "*** No date ***" - # date = await datetime.now().isoformat() - date = now() - # NOTE Would seconds result in better database performance - # date = datetime.datetime(date) - # date = (date-datetime.datetime(1970,1,1)).total_seconds() - if entry.has_key("title"): - title = entry.title - # title = "{}: *{}*".format(feed["feed"]["title"], entry.title) - else: - title = date - # title = feed["feed"]["title"] - if entry.has_key("link"): - # link = complete_url(source, entry.link) - link = join_url(source, entry.link) - link = trim_url(link) - else: - link = source - if entry.has_key("id"): - eid = entry.id - else: - eid = link - exist = await sqlite.check_entry_exist( - db_file, source, eid=eid, - title=title, link=link, date=date) - if not exist: - # new_entry = new_entry + 1 - # TODO Enhance summary - if entry.has_key("summary"): - summary = entry.summary - # # Remove HTML tags - # summary = BeautifulSoup(summary, "lxml").text - # # TODO Limit text length - # summary = summary.replace("\n\n\n", "\n\n") - # summary = summary[:300] + " […]‍⃨" - # summary = summary.strip().split('\n') - # summary = ["> " + line for line in summary] - # summary = "\n".join(summary) - else: - summary = "> *** No summary ***" - read_status = 0 - pathname = urlsplit(link).path - string = ( - "{} {} {}" - ).format( - title, - summary, - pathname - ) - allow_list = await config.is_listed( - db_file, "filter-allow", string) - if not allow_list: - reject_list = await config.is_listed( - db_file, "filter-deny", string) - if reject_list: - # print(">>> REJECTED", title) - summary = ( - "REJECTED {}".format( - reject_list.upper() - ) - ) - # summary = "" - read_status = 1 - entry = ( - title, link, eid, source, date, read_status) - if isinstance(date, int): - print("PROBLEM: date is int") - print(date) - # breakpoint() - # print(source) - # print(date) - await sqlite.add_entry_and_set_date( - db_file, source, entry) - # print(current_time(), entry, title) - # else: - # print(current_time(), exist, title) +# async def dat(): +# async def ftp(): + +# async def gemini(): -# NOTE Why (if result[0]) and (if result[1] == 200)? -async def view_feed(url): - """ - Check feeds for new entries. +# async def gopher(): - Parameters - ---------- - db_file : str - Path to database file. - url : str, optional - URL. The default is None. - - Returns - ------- - msg : str - Feed content or error message. - """ - result = await download_feed(url) - if result[0]: - try: - feed = parse(result[0]) - if feed.bozo: - # msg = ( - # ">{}\n" - # "WARNING: Bozo detected!\n" - # "For more information, visit " - # "https://pythonhosted.org/feedparser/bozo.html" - # ).format(url) - msg = await probe_page(view_feed, url, result[0]) - return msg - except ( - IncompleteReadError, - IncompleteRead, - error.URLError - ) as e: - # print(e) - # TODO Print error to log - msg = ( - "> {}\n" - "Error: {}" - ).format(url, e) - # breakpoint() - if result[1] == 200: - feed = parse(result[0]) - title = utility.get_title(url, feed) - entries = feed.entries - msg = "Preview of {}:\n\n```\n".format(title) - counter = 0 - for entry in entries: - counter += 1 - if entry.has_key("title"): - title = entry.title - else: - title = "*** No title ***" - if entry.has_key("link"): - # link = complete_url(source, entry.link) - link = join_url(url, entry.link) - link = trim_url(link) - else: - link = "*** No link ***" - if entry.has_key("published"): - date = entry.published - date = rfc2822_to_iso8601(date) - elif entry.has_key("updated"): - date = entry.updated - date = rfc2822_to_iso8601(date) - else: - date = "*** No date ***" - msg += ( - "Title : {}\n" - "Date : {}\n" - "Link : {}\n" - "Count : {}\n" - "\n" - ).format(title, date, link, counter) - if counter > 4: - break - msg += ( - "```\nSource: {}" - ).format(url) - else: - msg = ( - ">{}\nFailed to load URL. Reason: {}" - ).format(url, result[1]) - return msg - - -# NOTE Why (if result[0]) and (if result[1] == 200)? -async def view_entry(url, num): - result = await download_feed(url) - if result[0]: - try: - feed = parse(result[0]) - if feed.bozo: - # msg = ( - # ">{}\n" - # "WARNING: Bozo detected!\n" - # "For more information, visit " - # "https://pythonhosted.org/feedparser/bozo.html" - # ).format(url) - msg = await probe_page(view_entry, url, result[0], num=num) - return msg - except ( - IncompleteReadError, - IncompleteRead, - error.URLError - ) as e: - # print(e) - # TODO Print error to log - msg = ( - "> {}\n" - "Error: {}" - ).format(url, e) - # breakpoint() - if result[1] == 200: - feed = parse(result[0]) - title = utility.get_title(url, result[0]) - entries = feed.entries - num = int(num) - 1 - entry = entries[num] - if entry.has_key("title"): - title = entry.title - else: - title = "*** No title ***" - if entry.has_key("published"): - date = entry.published - date = rfc2822_to_iso8601(date) - elif entry.has_key("updated"): - date = entry.updated - date = rfc2822_to_iso8601(date) - else: - date = "*** No date ***" - if entry.has_key("summary"): - summary = entry.summary - # Remove HTML tags - summary = BeautifulSoup(summary, "lxml").text - # TODO Limit text length - summary = summary.replace("\n\n\n", "\n\n") - else: - summary = "*** No summary ***" - if entry.has_key("link"): - # link = complete_url(source, entry.link) - link = join_url(url, entry.link) - link = trim_url(link) - else: - link = "*** No link ***" - msg = ( - "{}\n" - "\n" - "> {}\n" - "\n" - "{}\n" - "\n" - ).format(title, summary, link) - else: - msg = ( - ">{}\n" - "Failed to load URL. Reason: {}\n" - "Try again momentarily." - ).format(url, result[1]) - return msg - - -async def add_feed_no_check(db_file, data): - """ - Add given feed without validity check. - - Parameters - ---------- - db_file : str - Path to database file. - data : str - URL or URL and Title. - - Returns - ------- - msg : str - Status message. - """ - url = data[0] - title = data[1] - url = trim_url(url) - exist = await sqlite.is_feed_exist(db_file, url) - if not exist: - msg = await sqlite.insert_feed(db_file, url, title) - await download_updates(db_file, [url]) - else: - ix = exist[0] - name = exist[1] - msg = ( - "> {}\nNews source \"{}\" is already " - "listed in the subscription list at " - "index {}".format(url, name, ix) - ) - return msg - - -async def add_feed(db_file, url): - """ - Check whether feed exist, otherwise process it. - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - - Returns - ------- - msg : str - Status message. - """ - msg = None - url = trim_url(url) - exist = await sqlite.is_feed_exist(db_file, url) - if not exist: - res = await download_feed(url) - if res[0]: - feed = parse(res[0]) - title = utility.get_title(url, feed) - if utility.is_feed(url, feed): - status = res[1] - await sqlite.insert_feed( - db_file, url, title, status) - await download_updates(db_file, [url]) - title = title if title else url - msg = ( - "> {}\nNews source \"{}\" has been added " - "to subscription list." - ).format(url, title) - else: - msg = await probe_page( - add_feed, url, res[0], db_file=db_file) - else: - status = res[1] - msg = ( - "> {}\nFailed to load URL. Reason: {}" - ).format(url, status) - else: - ix = exist[0] - name = exist[1] - msg = ( - "> {}\nNews source \"{}\" is already " - "listed in the subscription list at " - "index {}".format(url, name, ix) - ) - return msg - - -# TODO callback for use with add_feed and view_feed -async def probe_page(callback, url, doc, num=None, db_file=None): - msg = None - try: - # tree = etree.fromstring(res[0]) # etree is for xml - tree = html.fromstring(doc) - except: - msg = ( - "> {}\nFailed to parse URL as feed." - ).format(url) - if not msg: - print("RSS Auto-Discovery Engaged") - msg = await feed_mode_auto_discovery(url, tree) - if not msg: - print("RSS Scan Mode Engaged") - msg = await feed_mode_scan(url, tree) - if not msg: - print("RSS Arbitrary Mode Engaged") - msg = await feed_mode_request(url, tree) - if not msg: - msg = ( - "> {}\nNo news feeds were found for URL." - ).format(url) - # elif msg: - else: - if isinstance(msg, str): - return msg - elif isinstance(msg, list): - url = msg[0] - if db_file: - # print("if db_file", db_file) - return await callback(db_file, url) - elif num: - return await callback(url, num) - else: - return await callback(url) +# async def http(): +# async def ipfs(): async def download_feed(url): """ @@ -488,7 +58,7 @@ async def download_feed(url): Parameters ---------- - url : str + url : list URL. Returns @@ -502,27 +72,23 @@ async def download_feed(url): user_agent = "Slixfeed/0.1" if not len(user_agent): user_agent = "Slixfeed/0.1" + headers = {'User-Agent': user_agent} + url = url[0] proxy = (config.get_value("settings", "Network", "http_proxy")) or '' timeout = ClientTimeout(total=10) - headers = {'User-Agent': user_agent} async with ClientSession(headers=headers) as session: # async with ClientSession(trust_env=True) as session: try: - async with session.get( - url, - proxy=proxy, - # proxy_auth=(proxy_username, proxy_password) - timeout=timeout - ) as response: + async with session.get(url, proxy=proxy, + # proxy_auth=(proxy_username, proxy_password), + timeout=timeout + ) as response: status = response.status if response.status == 200: try: doc = await response.text() # print (response.content_type) - msg = [ - doc, - status - ] + msg = [doc, status] except: # msg = [ # False, @@ -531,307 +97,20 @@ async def download_feed(url): # ) # ] msg = [ - False, - "Document is too large or is not textual." + False, "Document is too large or is not textual." ] else: msg = [ - False, - "HTTP Error: " + str(status) + False, "HTTP Error: " + str(status) ] except ClientError as e: # print('Error', str(e)) msg = [ - False, - "Error: " + str(e) + False, "Error: " + str(e) ] except TimeoutError as e: # print('Timeout:', str(e)) msg = [ - False, - "Timeout: " + str(e) + False, "Timeout: " + str(e) ] return msg - - -# TODO Improve scan by gradual decreasing of path -async def feed_mode_request(url, tree): - """ - Lookup for feeds by pathname using HTTP Requests. - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - feeds = {} - parted_url = urlsplit(url) - paths = config.get_list("lists.yaml") - paths = paths["pathnames"] - for path in paths: - address = urlunsplit([ - parted_url.scheme, - parted_url.netloc, - path, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - # print(parse(res[0])["feed"]["title"]) - # feeds[address] = parse(res[0])["feed"]["title"] - try: - title = parse(res[0])["feed"]["title"] - except: - title = '*** No Title ***' - feeds[address] = title - # Check whether URL has path (i.e. not root) - # Check parted_url.path to avoid error in case root wasn't given - # TODO Make more tests - if parted_url.path and parted_url.path.split('/')[1]: - paths.extend( - [".atom", ".feed", ".rdf", ".rss"] - ) if '.rss' not in paths else -1 - # if paths.index('.rss'): - # paths.extend([".atom", ".feed", ".rdf", ".rss"]) - address = urlunsplit([ - parted_url.scheme, - parted_url.netloc, - parted_url.path.split('/')[1] + path, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - try: - feeds[address] = parse(res[0]) - # print(feeds) - except: - continue - if len(feeds) > 1: - counter = 0 - msg = ( - "RSS URL discovery has found {} feeds:\n\n```\n" - ).format(len(feeds)) - feed_mark = 0 - for feed in feeds: - try: - feed_name = feeds[feed]["feed"]["title"] - except: - feed_name = urlsplit(feed).netloc - feed_addr = feed - # AttributeError: 'str' object has no attribute 'entries' - try: - feed_amnt = len(feeds[feed].entries) - except: - continue - if feed_amnt: - # NOTE Because there could be many false positives - # which are revealed in second phase of scan, we - # could end with a single feed, which would be - # listed instead of fetched, so feed_mark is - # utilized in order to make fetch possible. - feed_mark = [feed_addr] - counter += 1 - msg += ( - "Title: {}\n" - "Link : {}\n" - "Items: {}\n" - "\n" - ).format(feed_name, feed_addr, feed_amnt) - if counter > 1: - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - elif feed_mark: - return feed_mark - else: - msg = ( - "No feeds were found for {}" - ).format(url) - return msg - elif feeds: - return feeds - - -async def feed_mode_scan(url, tree): - """ - Scan page for potential feeds by pathname. - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - feeds = {} - # paths = [] - # TODO Test - paths = config.get_list("lists.yaml") - paths = paths["pathnames"] - for path in paths: - # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) - # xpath_query = "//a[contains(@href,'{}')]".format(path) - num = 5 - xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num) - addresses = tree.xpath(xpath_query) - xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num) - addresses += tree.xpath(xpath_query) - parted_url = urlsplit(url) - # NOTE Should number of addresses be limited or - # perhaps be N from the start and N from the end - for address in addresses: - # print(address.xpath('@href')[0]) - # print(addresses) - address = address.xpath('@href')[0] - if "/" not in address: - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = address - address = urlunsplit([ - protocol, - hostname, - pathname, - None, - None - ]) - if address.startswith('/'): - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = address - address = urlunsplit([ - protocol, - hostname, - pathname, - None, - None - ]) - res = await download_feed(address) - if res[1] == 200: - try: - feeds[address] = parse(res[0]) - # print(feeds[address]) - # breakpoint() - # print(feeds) - except: - continue - if len(feeds) > 1: - # print(feeds) - # breakpoint() - counter = 0 - msg = ( - "RSS URL scan has found {} feeds:\n\n```\n" - ).format(len(feeds)) - feed_mark = 0 - for feed in feeds: - # try: - # res = await download_feed(feed) - # except: - # continue - try: - feed_name = feeds[feed]["feed"]["title"] - except: - feed_name = urlsplit(feed).netloc - feed_addr = feed - feed_amnt = len(feeds[feed].entries) - if feed_amnt: - # NOTE Because there could be many false positives - # which are revealed in second phase of scan, we - # could end with a single feed, which would be - # listed instead of fetched, so feed_mark is - # utilized in order to make fetch possible. - feed_mark = [feed_addr] - counter += 1 - msg += ( - "Title : {}\n" - "Link : {}\n" - "Count : {}\n" - "\n" - ).format(feed_name, feed_addr, feed_amnt) - if counter > 1: - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - elif feed_mark: - return feed_mark - else: - msg = ( - "No feeds were found for {}" - ).format(url) - return msg - elif feeds: - return feeds - - -async def feed_mode_auto_discovery(url, tree): - """ - Lookup for feeds using RSS autodiscovery technique. - - See: https://www.rssboard.org/rss-autodiscovery - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - xpath_query = ( - '//link[(@rel="alternate") and ' - '(@type="application/atom+xml" or ' - '@type="application/rdf+xml" or ' - '@type="application/rss+xml")]' - ) - # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href""" - # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href" - feeds = tree.xpath(xpath_query) - if len(feeds) > 1: - msg = ( - "RSS Auto-Discovery has found {} feeds:\n\n```\n" - ).format(len(feeds)) - for feed in feeds: - # # The following code works; - # # The following code will catch - # # only valid resources (i.e. not 404); - # # The following code requires more bandwidth. - # res = await download_feed(feed) - # if res[0]: - # disco = parse(res[0]) - # title = disco["feed"]["title"] - # msg += "{} \n {} \n\n".format(title, feed) - feed_name = feed.xpath('@title')[0] - feed_addr = join_url(url, feed.xpath('@href')[0]) - # if feed_addr.startswith("/"): - # feed_addr = url + feed_addr - msg += "{}\n{}\n\n".format(feed_name, feed_addr) - msg += ( - "```\nThe above feeds were extracted from\n{}" - ).format(url) - return msg - elif feeds: - feed_addr = join_url(url, feeds[0].xpath('@href')[0]) - return [feed_addr] diff --git a/slixfeed/log.py b/slixfeed/log.py new file mode 100644 index 0000000..0a1032a --- /dev/null +++ b/slixfeed/log.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + + +def markdown(timestamp, filename, jid, message): + """ + Log message to file. + + Parameters + ---------- + timestamp : str + Time stamp. + filename : str + Jabber ID as name of file. + jid : str + Jabber ID. + message : str + Message content. + + Returns + ------- + None. + + """ + with open(filename + '.md', 'a') as file: + # entry = "{} {}:\n{}\n\n".format(timestamp, jid, message) + entry = ( + "## {}\n" + "### {}\n\n" + "{}\n\n").format(jid, timestamp, message) + file.write(entry) + + diff --git a/slixfeed/read.py b/slixfeed/read.py new file mode 100644 index 0000000..b35ad02 --- /dev/null +++ b/slixfeed/read.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" + +TODO + +1) is_feed: Look into the type ("atom", "rss2" etc.) + +""" + + +def title(feed): + """ + Get title of feed. + + Parameters + ---------- + url : str + URL. + feed : dict + Parsed feed document. + + Returns + ------- + title : str + Title or None. + """ + try: + title = feed["feed"]["title"] + except: + title = None + return title + + +def is_feed(feed): + """ + Determine whether document is feed or not. + + Parameters + ---------- + feed : dict + Parsed feed. + + Returns + ------- + val : boolean + True or False. + """ + msg = None + if not feed.entries: + try: + feed["feed"]["title"] + val = True + # msg = ( + # "Empty feed for {}" + # ).format(url) + except: + val = False + # msg = ( + # "No entries nor title for {}" + # ).format(url) + elif feed.bozo: + val = False + # msg = ( + # "Bozo detected for {}" + # ).format(url) + else: + val = True + # msg = ( + # "Good feed for {}" + # ).format(url) + print(msg) + return val diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py index e4a514f..42b1696 100644 --- a/slixfeed/sqlite.py +++ b/slixfeed/sqlite.py @@ -222,9 +222,6 @@ async def remove_feed(db_file, ix): "FROM feeds " "WHERE id = ?" ) - # cur - # for i in url: - # url = i[0] url = cur.execute(sql, (ix,)).fetchone()[0] # NOTE Should we move DBLOCK to this line? 2022-12-23 sql = ( @@ -246,8 +243,10 @@ async def remove_feed(db_file, ix): cur.execute(sql, (ix,)) +# TODO Rename function name async def is_feed_exist(db_file, url): """ + Get Id and Name of feed. Check whether a feed exists. Query for feeds by given url. @@ -270,8 +269,7 @@ async def is_feed_exist(db_file, url): "WHERE address = ?" ) result = cur.execute(sql, (url,)).fetchone() - if result: - return True + return result async def get_number_of_items(db_file, table): diff --git a/slixfeed/task.py b/slixfeed/task.py index c4f5c65..1604065 100644 --- a/slixfeed/task.py +++ b/slixfeed/task.py @@ -49,13 +49,14 @@ from slixfeed.config import ( get_default_dbdir, get_value_default) from slixfeed.datetime import current_time -from slixfeed.fetch import download_updates +from slixfeed.action import organize_items from slixfeed.sqlite import ( - get_unread_entries, get_feed_title, - get_settings_value, + get_feeds_url, get_number_of_items, get_number_of_entries_unread, + get_settings_value, + get_unread_entries, mark_as_read, mark_entry_as_read, delete_archived_entry @@ -329,7 +330,9 @@ async def refresh_task(self, jid, callback, key, val=None): val : str, optional Value. The default is None. """ - logging.debug("Refreshing task {} for JID {}".format(callback, jid)) + logging.debug( + "Refreshing task {} for JID {}".format(callback, jid) + ) if not val: db_file = get_pathname_to_database(jid) val = await get_settings_value(db_file, key) @@ -340,7 +343,8 @@ async def refresh_task(self, jid, callback, key, val=None): except: logging.debug( "No task of type {} to cancel for " - "JID {} (clean_tasks)".format(key, jid)) + "JID {} (clean_tasks)".format(key, jid) + ) # task_manager[jid][key] = loop.call_at( # loop.time() + 60 * float(val), # loop.create_task, @@ -378,10 +382,13 @@ async def check_updates(jid): jid : str Jabber ID. """ - logging.debug("Scanning for updates for JID {}".format(jid)) + logging.debug( + "Scanning for updates for JID {}".format(jid) + ) while True: db_file = get_pathname_to_database(jid) - await download_updates(db_file) + urls = await get_feeds_url(db_file) + await organize_items(db_file, urls) val = get_value_default("settings", "Settings", "check") await asyncio.sleep(60 * float(val)) # Schedule to call this function again in 90 minutes @@ -394,12 +401,16 @@ async def check_updates(jid): async def start_tasks(self, presence): jid = presence["from"].bare - logging.debug("Beginning tasks for JID {}".format(jid)) + logging.debug( + "Beginning tasks for JID {}".format(jid) + ) if jid not in self.boundjid.bare: await clean_tasks_xmpp( - jid, ["interval", "status", "check"]) + jid, ["interval", "status", "check"] + ) await start_tasks_xmpp( - self, jid, ["interval", "status", "check"]) + self, jid, ["interval", "status", "check"] + ) # await task_jid(self, jid) # main_task.extend([asyncio.create_task(task_jid(jid))]) # print(main_task) @@ -408,9 +419,12 @@ async def start_tasks(self, presence): async def stop_tasks(self, presence): if not self.boundjid.bare: jid = presence["from"].bare - logging.debug("Stopping tasks for JID {}".format(jid)) + logging.debug( + "Stopping tasks for JID {}".format(jid) + ) await clean_tasks_xmpp( - jid, ["interval", "status", "check"]) + jid, ["interval", "status", "check"] + ) async def check_readiness(self, presence): @@ -434,7 +448,9 @@ async def check_readiness(self, presence): jid = presence["from"].bare if presence["show"] in ("away", "dnd", "xa"): - logging.debug("Stopping updates for JID {}".format(jid)) + logging.debug( + "Stopping updates for JID {}".format(jid) + ) await clean_tasks_xmpp( jid, ["interval"]) await start_tasks_xmpp( @@ -477,7 +493,9 @@ async def select_file(self): if (file.endswith(".db") and not file.endswith(".db-jour.db")): jid = file[:-3] - main_task.extend([tg.create_task(self.task_jid(jid))]) + main_task.extend( + [tg.create_task(self.task_jid(jid))] + ) # main_task = [tg.create_task(self.task_jid(jid))] # task_manager.update({jid: tg}) diff --git a/slixfeed/url.py b/slixfeed/url.py index db52dd2..7d82402 100644 --- a/slixfeed/url.py +++ b/slixfeed/url.py @@ -21,7 +21,7 @@ from urllib.parse import ( parse_qs, urlencode, urljoin, - urlparse, + # urlparse, urlsplit, urlunsplit ) diff --git a/slixfeed/utility.py b/slixfeed/utility.py deleted file mode 100644 index 07d0cfa..0000000 --- a/slixfeed/utility.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -TODO - -1) is_feed: Look into the type ("atom", "rss2" etc.) - -""" - -from urllib.parse import urlsplit - - -def log_as_markdown(timestamp, filename, jid, message): - """ - Log message to file. - - Parameters - ---------- - timestamp : str - Time stamp. - filename : str - Jabber ID as name of file. - jid : str - Jabber ID. - message : str - Message content. - - Returns - ------- - None. - - """ - with open(filename + '.md', 'a') as file: - # entry = "{} {}:\n{}\n\n".format(timestamp, jid, message) - entry = ( - "## {}\n" - "### {}\n\n" - "{}\n\n").format(jid, timestamp, message) - file.write(entry) - - -def get_title(url, feed): - """ - Get title of feed. - - Parameters - ---------- - url : str - URL. - feed : dict - Parsed feed document. - - Returns - ------- - title : str - Title or URL hostname. - """ - try: - title = feed["feed"]["title"] - except: - title = urlsplit(url).netloc - if not title: - title = urlsplit(url).netloc - return title - - -def is_feed(url, feed): - """ - Determine whether document is feed or not. - - Parameters - ---------- - url : str - URL. - feed : dict - Parsed feed. - - Returns - ------- - val : boolean - True or False. - """ - msg = None - if not feed.entries: - try: - feed["feed"]["title"] - val = True - msg = ( - "Empty feed for {}" - ).format(url) - except: - val = False - msg = ( - "No entries nor title for {}" - ).format(url) - elif feed.bozo: - val = False - msg = ( - "Bozo detected for {}" - ).format(url) - else: - val = True - msg = ( - "Good feed for {}" - ).format(url) - print(msg) - return val diff --git a/slixfeed/xmpp/client.py b/slixfeed/xmpp/client.py index aed6668..68421fc 100644 --- a/slixfeed/xmpp/client.py +++ b/slixfeed/xmpp/client.py @@ -48,26 +48,20 @@ NOTE """ import asyncio -from slixfeed.config import add_to_list, get_list, remove_from_list -import slixfeed.fetch as fetcher -from slixfeed.datetime import current_time import logging # import os from random import randrange import slixmpp -from slixmpp.exceptions import IqError, IqTimeout -import slixfeed.sqlite as sqlite import slixfeed.task as task -import slixfeed.url as urlfixer from time import sleep from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference from slixmpp.plugins.xep_0048.stanza import Bookmarks -import xmltodict -import xml.etree.ElementTree as ET -from lxml import etree +# import xmltodict +# import xml.etree.ElementTree as ET +# from lxml import etree import slixfeed.xmpp.connect as connect import slixfeed.xmpp.process as process diff --git a/slixfeed/xmpp/connect.py b/slixfeed/xmpp/connect.py index 60f1c72..be8e37e 100644 --- a/slixfeed/xmpp/connect.py +++ b/slixfeed/xmpp/connect.py @@ -17,7 +17,8 @@ async def recover_connection(self, event, message): # print(current_time(),"Maximum connection attempts exceeded.") # logging.error("Maximum connection attempts exceeded.") print(current_time(), "Attempt number", self.connection_attempts) - seconds = (get_value("accounts", "XMPP Connect", "reconnect_timeout")) or 30 + seconds = (get_value( + "accounts", "XMPP Connect", "reconnect_timeout")) or 30 seconds = int(seconds) print(current_time(), "Next attempt within", seconds, "seconds") # NOTE asyncio.sleep doesn't interval as expected diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py index e5fdc42..7df17c1 100644 --- a/slixfeed/xmpp/process.py +++ b/slixfeed/xmpp/process.py @@ -19,19 +19,22 @@ TODO """ import os +import slixfeed.action as action from slixfeed.config import ( add_to_list, get_default_dbdir, get_value, get_pathname_to_database, remove_from_list) +import slixfeed.crawl as crawl from slixfeed.datetime import current_time, timestamp import slixfeed.export as export -import slixfeed.fetch as fetcher +import slixfeed.fetch as fetch import slixfeed.opml as opml import slixfeed.sqlite as sqlite import slixfeed.task as task -import slixfeed.utility as utility +import slixfeed.log as log +import slixfeed.read as read import slixfeed.url as uri import slixfeed.xmpp.bookmark as bookmark import slixfeed.xmpp.compose as compose @@ -40,6 +43,7 @@ import slixfeed.xmpp.status as status import slixfeed.xmpp.text as text import slixfeed.xmpp.upload as upload from slixfeed.xmpp.utility import jid_type +from urllib.parse import urlsplit, urlunsplit async def event(self, event): @@ -210,20 +214,35 @@ async def message(self, message): # else: # response = "This command is valid for groupchat only." case _ if message_lowercase.startswith("add"): + # Add given feed without validity check. message_text = message_text[4:] url = message_text.split(" ")[0] title = " ".join(message_text.split(" ")[1:]) if url.startswith("http"): db_file = get_pathname_to_database(jid) - response = await fetcher.add_feed_no_check(db_file, [url, title]) - old = await sqlite.get_settings_value(db_file, "old") - if old: - await task.clean_tasks_xmpp(jid, ["status"]) - # await send_status(jid) - await task.start_tasks_xmpp(self, jid, ["status"]) + exist = await sqlite.is_feed_exist(db_file, url) + if not exist: + await sqlite.insert_feed(db_file, url, title) + await action.organize_items(db_file, [url]) + old = await sqlite.get_settings_value(db_file, "old") + if old: + await task.clean_tasks_xmpp(jid, ["status"]) + # await send_status(jid) + await task.start_tasks_xmpp(self, jid, ["status"]) + else: + await sqlite.mark_source_as_read(db_file, url) + response = ( + "> {}\nNews source has been " + "added to subscription list." + ).format(url) else: - db_file = get_pathname_to_database(jid) - await sqlite.mark_source_as_read(db_file, url) + ix = exist[0] + name = exist[1] + response = ( + "> {}\nNews source \"{}\" is already " + "listed in the subscription list at " + "index {}".format(url, name, ix) + ) else: response = "Missing URL." send_reply_message(self, message, response) @@ -388,31 +407,13 @@ async def message(self, message): send_status_message(self, jid, status_type, status_message) if url.startswith("feed:"): url = uri.feed_to_http(url) - # url_alt = await uri.replace_hostname(url, "feed") - # if url_alt: - # url = url_alt url = (uri.replace_hostname(url, "feed")) or url db_file = get_pathname_to_database(jid) - response = await fetcher.add_feed(db_file, url) - await task.start_tasks_xmpp(self, jid, ["status"]) - # response = "> " + message + "\n" + response - # FIXME Make the taskhandler to update status message - # await refresh_task( - # self, - # jid, - # send_status, - # "status", - # 20 - # ) - # NOTE This would show the number of new unread entries - old = await sqlite.get_settings_value(db_file, "old") - if old: - await task.clean_tasks_xmpp(jid, ["status"]) - # await send_status(jid) - await task.start_tasks_xmpp(self, jid, ["status"]) - else: - db_file = get_pathname_to_database(jid) - await sqlite.mark_source_as_read(db_file, url) + response = await action.add_feed(db_file, url) + await task.clean_tasks_xmpp( + jid, ["status"]) + await task.start_tasks_xmpp( + self, jid, ["status"]) send_reply_message(self, message, response) case _ if message_lowercase.startswith("feeds"): query = message_text[6:] @@ -521,7 +522,7 @@ async def message(self, message): send_reply_message(self, message, response) case "new": db_file = get_pathname_to_database(jid) - sqlite.set_settings_value(db_file, ["old", 0]) + await sqlite.set_settings_value(db_file, ["old", 0]) response = ( "Only new items of newly added feeds will be sent." ) @@ -581,7 +582,8 @@ async def message(self, message): data = message_text[5:] data = data.split() url = data[0] - await task.clean_tasks_xmpp(jid, ["status"]) + await task.clean_tasks_xmpp( + jid, ["status"]) status_type = "dnd" status_message = ( "📫️ Processing request to fetch data from {}" @@ -593,13 +595,13 @@ async def message(self, message): match len(data): case 1: if url.startswith("http"): - response = await fetcher.view_feed(url) + response = await action.view_feed(url) else: response = "Missing URL." case 2: num = data[1] if url.startswith("http"): - response = await fetcher.view_entry(url, num) + response = await action.view_entry(url, num) else: response = "Missing URL." case _: @@ -627,15 +629,15 @@ async def message(self, message): response = "Missing value." send_reply_message(self, message, response) # NOTE Should people be asked for numeric value? - case _ if message_lowercase.startswith("remove"): + case _ if message_lowercase.startswith("remove "): ix = message_text[7:] if ix: db_file = get_pathname_to_database(jid) try: await sqlite.remove_feed(db_file, ix) response = ( - "> {}\nNews source has been removed " - "from subscription list.").format(url) + "News source {} has been removed " + "from subscription list.").format(ix) # await refresh_task( # self, # jid, @@ -643,10 +645,13 @@ async def message(self, message): # "status", # 20 # ) - await task.clean_tasks_xmpp(jid, ["status"]) - await task.start_tasks_xmpp(self, jid, ["status"]) + await task.clean_tasks_xmpp( + jid, ["status"]) + await task.start_tasks_xmpp( + self, jid, ["status"]) except: - response = "No news source with ID {}.".format(ix) + response = ( + "No news source with ID {}.".format(ix)) else: response = "Missing feed ID." send_reply_message(self, message, response) @@ -655,7 +660,8 @@ async def message(self, message): await task.clean_tasks_xmpp(jid, ["status"]) status_type = "dnd" status_message = "📫️ Marking entries as read..." - send_status_message(self, jid, status_type, status_message) + send_status_message( + self, jid, status_type, status_message) if source: db_file = get_pathname_to_database(jid) await sqlite.mark_source_as_read(db_file, source) @@ -688,9 +694,11 @@ async def message(self, message): key = "enabled" val = 1 db_file = get_pathname_to_database(jid) - await sqlite.set_settings_value(db_file, [key, val]) + await sqlite.set_settings_value( + db_file, [key, val]) # asyncio.create_task(task_jid(self, jid)) - await task.start_tasks_xmpp(self, jid, ["interval", "status", "check"]) + await task.start_tasks_xmpp( + self, jid, ["interval", "status", "check"]) response = "Updates are enabled." # print(current_time(), "task_manager[jid]") # print(task_manager[jid]) @@ -747,13 +755,17 @@ async def message(self, message): key = "enabled" val = 0 db_file = get_pathname_to_database(jid) - await sqlite.set_settings_value(db_file, [key, val]) - await task.clean_tasks_xmpp(jid, ["interval", "status"]) + await sqlite.set_settings_value( + db_file, [key, val]) + await task.clean_tasks_xmpp( + jid, ["interval", "status"]) response = "Updates are disabled." send_reply_message(self, message, response) status_type = "xa" - status_message = "💡️ Send \"Start\" to receive Jabber updates" - send_status_message(self, jid, status_type, status_message) + status_message = ( + "💡️ Send \"Start\" to receive Jabber updates") + send_status_message( + self, jid, status_type, status_message) case "support": # TODO Send an invitation. response = ( @@ -789,10 +801,10 @@ async def message(self, message): os.mkdir(data_dir) if not os.path.isdir(data_dir + '/logs/'): os.mkdir(data_dir + '/logs/') - utility.log_as_markdown( + log.markdown( current_time(), os.path.join(data_dir, "logs", jid), jid, message_text) - utility.log_as_markdown( + log.markdown( current_time(), os.path.join(data_dir, "logs", jid), self.boundjid.bare, response)