From 93ea8a9fabda77784210268307d8faa595e179d6 Mon Sep 17 00:00:00 2001 From: "Schimon Jehudah, Adv." Date: Sun, 16 Jun 2024 11:55:22 +0300 Subject: [PATCH] Fix command export; Restructure code. --- slixfeed/crawl.py | 436 --------------------------------- slixfeed/dt.py | 114 --------- slixfeed/format.py | 19 -- slixfeed/read.py | 74 ------ slixfeed/sqlite.py | 7 +- slixfeed/syndication.py | 483 +++++++++++++++++++++++++++++++++++-- slixfeed/url.py | 352 --------------------------- slixfeed/utilities.py | 465 ++++++++++++++++++++++++++++++++++- slixfeed/version.py | 4 +- slixfeed/xmpp/chat.py | 29 +-- slixfeed/xmpp/client.py | 118 +++++---- slixfeed/xmpp/commands.py | 35 ++- slixfeed/xmpp/connect.py | 8 +- slixfeed/xmpp/privilege.py | 49 ---- slixfeed/xmpp/publish.py | 5 +- slixfeed/xmpp/upload.py | 4 +- slixfeed/xmpp/utilities.py | 48 ++++ 17 files changed, 1066 insertions(+), 1184 deletions(-) delete mode 100644 slixfeed/crawl.py delete mode 100644 slixfeed/dt.py delete mode 100644 slixfeed/format.py delete mode 100644 slixfeed/read.py delete mode 100644 slixfeed/url.py delete mode 100644 slixfeed/xmpp/privilege.py diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py deleted file mode 100644 index abdb2bf..0000000 --- a/slixfeed/crawl.py +++ /dev/null @@ -1,436 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -FIXME - -1) https://wiki.pine64.org - File "/slixfeed/crawl.py", line 178, in feed_mode_guess - address = join_url(url, parted_url.path.split('/')[1] + path) - ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^ - IndexError: list index out of range - -TODO - -1.1) Attempt to scan more paths: /blog/, /news/ etc., including root / - Attempt to scan sub domains - https://esmailelbob.xyz/en/ - https://blog.esmailelbob.xyz/feed/ - -1.2) Consider utilizing fetch.http_response - -2) Consider merging with module fetch.py - -FEEDS CRAWLER PROJECT - -3) Mark redirects for manual check - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/atom.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/feed.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/news.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/news.xml.php - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/rdf.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/rss.xml - -Title : JSON Feed -Link : https://www.jsonfeed.org/feed.json/videos.xml - - -""" - -from aiohttp import ClientError, ClientSession, ClientTimeout -from feedparser import parse -import logging -from lxml import etree -from lxml import html -from lxml.etree import fromstring -import slixfeed.config as config -import slixfeed.fetch as fetch -from slixfeed.log import Logger -from slixfeed.url import complete_url, join_url, trim_url -from urllib.parse import urlsplit, urlunsplit - - -# TODO Use boolean as a flag to determine whether a single URL was found -# async def probe_page( -# callback, url, document, num=None, db_file=None): -# result = None -# try: -# # tree = etree.fromstring(res[0]) # etree is for xml -# tree = html.fromstring(document) -# except: -# result = ( -# "> {}\nFailed to parse URL as feed." 
-# ).format(url) -# if not result: -# print("RSS Auto-Discovery Engaged") -# result = await feed_mode_auto_discovery(url, tree) -# if not result: -# print("RSS Scan Mode Engaged") -# result = await feed_mode_scan(url, tree) -# if not result: -# print("RSS Arbitrary Mode Engaged") -# result = await feed_mode_request(url, tree) -# if not result: -# result = ( -# "> {}\nNo news feeds were found for URL." -# ).format(url) -# # elif msg: -# else: -# if isinstance(result, str): -# return result -# elif isinstance(result, list): -# url = result[0] -# if db_file: -# # print("if db_file", db_file) -# return await callback(db_file, url) -# elif num: -# return await callback(url, num) -# else: -# return await callback(url) - -logger = Logger(__name__) - -async def probe_page(url, document=None): - """ - Parameters - ---------- - url : str - URL. - document : TYPE - DESCRIPTION. - - Returns - ------- - result : list or str - Single URL as list or selection of URLs as str. - """ - if not document: - response = await fetch.http(url) - if not response['error']: - document = response['content'] - try: - # tree = etree.fromstring(res[0]) # etree is for xml - tree = html.fromstring(document) - result = None - except Exception as e: - logger.error(str(e)) - try: - # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - # xml = html.fromstring(document.encode('utf-8')) - # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') - # tree = fromstring(xml, parser=parser) - - # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported - #tree = html.fromstring(bytes(document, encoding='utf8')) - - # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html - #parser = etree.XMLParser(recover=True) - #tree = etree.fromstring(document, parser) - - tree = html.fromstring(document.encode('utf-8')) - result = None - except Exception as e: - logger.error(str(e)) - logger.warning("Failed to parse URL as feed for {}.".format(url)) - result = {'link' : None, - 'index' : None, - 'name' : None, - 'code' : None, - 'error' : True, - 'exist' : None} - if not result: - logger.debug("Feed auto-discovery engaged for {}".format(url)) - result = await feed_mode_auto_discovery(url, tree) - if not result: - logger.debug("Feed link scan mode engaged for {}".format(url)) - result = await feed_mode_scan(url, tree) - if not result: - logger.debug("Feed arbitrary mode engaged for {}".format(url)) - result = await feed_mode_guess(url, tree) - if not result: - logger.debug("No feeds were found for {}".format(url)) - result = None - return result - - -# TODO Improve scan by gradual decreasing of path -async def feed_mode_guess(url, tree): - """ - Lookup for feeds by pathname using HTTP Requests. - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - urls = [] - parted_url = urlsplit(url) - paths = config.open_config_file("lists.toml")["pathnames"] - # Check whether URL has path (i.e. 
not root) - # Check parted_url.path to avoid error in case root wasn't given - # TODO Make more tests - if parted_url.path and parted_url.path.split('/')[1]: - paths.extend( - [".atom", ".feed", ".rdf", ".rss"] - ) if '.rss' not in paths else -1 - # if paths.index('.rss'): - # paths.extend([".atom", ".feed", ".rdf", ".rss"]) - parted_url_path = parted_url.path if parted_url.path else '/' - for path in paths: - address = join_url(url, parted_url_path.split('/')[1] + path) - if address not in urls: - urls.extend([address]) - # breakpoint() - # print("feed_mode_guess") - urls = await process_feed_selection(url, urls) - return urls - - -async def feed_mode_scan(url, tree): - """ - Scan page for potential feeds by pathname. - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - urls = [] - paths = config.open_config_file("lists.toml")["pathnames"] - for path in paths: - # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) - # xpath_query = "//a[contains(@href,'{}')]".format(path) - num = 5 - xpath_query = ( - "(//a[contains(@href,'{}')])[position()<={}]" - ).format(path, num) - addresses = tree.xpath(xpath_query) - xpath_query = ( - "(//a[contains(@href,'{}')])[position()>last()-{}]" - ).format(path, num) - addresses += tree.xpath(xpath_query) - # NOTE Should number of addresses be limited or - # perhaps be N from the start and N from the end - for address in addresses: - address = join_url(url, address.xpath('@href')[0]) - if address not in urls: - urls.extend([address]) - # breakpoint() - # print("feed_mode_scan") - urls = await process_feed_selection(url, urls) - return urls - - -async def feed_mode_auto_discovery(url, tree): - """ - Lookup for feeds using RSS autodiscovery technique. - - See: https://www.rssboard.org/rss-autodiscovery - - Parameters - ---------- - db_file : str - Path to database file. - url : str - URL. - tree : TYPE - DESCRIPTION. - - Returns - ------- - msg : str - Message with URLs. - """ - xpath_query = ( - '//link[(@rel="alternate") and ' - '(@type="application/atom+xml" or ' - '@type="application/rdf+xml" or ' - '@type="application/rss+xml")]' - ) - # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href""" - # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href" - feeds = tree.xpath(xpath_query) - if feeds: - urls = [] - for feed in feeds: - # # The following code works; - # # The following code will catch - # # only valid resources (i.e. not 404); - # # The following code requires more bandwidth. - # res = await fetch.http(feed) - # if res[0]: - # disco = parse(res[0]) - # title = disco["feed"]["title"] - # msg += "{} \n {} \n\n".format(title, feed) - - # feed_name = feed.xpath('@title')[0] - # feed_addr = join_url(url, feed.xpath('@href')[0]) - - # if feed_addr.startswith("/"): - # feed_addr = url + feed_addr - address = join_url(url, feed.xpath('@href')[0]) - if address not in urls: - urls.extend([address]) - # breakpoint() - # print("feed_mode_auto_discovery") - urls = await process_feed_selection(url, urls) - return urls - - -# TODO Segregate function into function that returns -# URLs (string) and Feeds (dict) and function that -# composes text message (string). -# Maybe that's not necessary. 
-async def process_feed_selection(url, urls): - feeds = {} - for i in urls: - result = await fetch.http(i) - if not result['error']: - document = result['content'] - status_code = result['status_code'] - if status_code == 200: # NOTE This line might be redundant - try: - feeds[i] = [parse(document)] - except: - continue - message = ( - "Web feeds found for {}\n\n```\n" - ).format(url) - urls = [] - for feed_url in feeds: - # try: - # res = await fetch.http(feed) - # except: - # continue - feed_name = None - if "title" in feeds[feed_url][0]["feed"].keys(): - feed_name = feeds[feed_url][0].feed.title - feed_name = feed_name if feed_name else "Untitled" - # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc - # AttributeError: 'str' object has no attribute 'entries' - if "entries" in feeds[feed_url][0].keys(): - feed_amnt = feeds[feed_url][0].entries - else: - continue - if feed_amnt: - # NOTE Because there could be many false positives - # which are revealed in second phase of scan, we - # could end with a single feed, which would be - # listed instead of fetched, so feed_url_mark is - # utilized in order to make fetch possible. - # NOTE feed_url_mark was a variable which stored - # single URL (probably first accepted as valid) - # in order to get an indication whether a single - # URL has been fetched, so that the receiving - # function will scan that single URL instead of - # listing it as a message. - url = {'link' : feed_url, - 'index' : None, - 'name' : feed_name, - 'code' : status_code, - 'error' : False, - 'exist' : None} - urls.extend([url]) - count = len(urls) - if count > 1: - result = urls - elif count: - result = urls[0] - else: - result = None - return result - - -# def get_discovered_feeds(url, urls): -# message = ( -# "Found {} web feeds:\n\n```\n" -# ).format(len(urls)) -# if len(urls) > 1: -# for urls in urls: -# message += ( -# "Title : {}\n" -# "Link : {}\n" -# "\n" -# ).format(url, url.title) -# message += ( -# "```\nThe above feeds were extracted from\n{}" -# ).format(url) -# elif len(urls) > 0: -# result = urls -# else: -# message = ( -# "No feeds were found for {}" -# ).format(url) -# return result - - -# Test module -# TODO ModuleNotFoundError: No module named 'slixfeed' -# import slixfeed.fetch as fetch -# from slixfeed.action import is_feed, process_feed_selection - -# async def start(url): -# while True: -# result = await fetch.http(url) -# document = result[0] -# status = result[1] -# if document: -# feed = parse(document) -# if is_feed(feed): -# print(url) -# else: -# urls = await probe_page( -# url, document) -# if len(urls) > 1: -# await process_feed_selection(urls) -# elif urls: -# url = urls[0] -# else: -# response = ( -# "> {}\nFailed to load URL. Reason: {}" -# ).format(url, status) -# break -# return response - -# url = "https://www.smh.com.au/rssheadlines" -# start(url) diff --git a/slixfeed/dt.py b/slixfeed/dt.py deleted file mode 100644 index d9f388d..0000000 --- a/slixfeed/dt.py +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" -https://feedparser.readthedocs.io/en/latest/date-parsing.html -""" - -from datetime import datetime -from dateutil.parser import parse -from email.utils import parsedate, parsedate_to_datetime - -def now(): - """ - ISO 8601 Timestamp. - - Returns - ------- - date : ??? - ISO 8601 Timestamp. 
- """ - date = datetime.now().isoformat() - return date - - -def convert_struct_time_to_iso8601(struct_time): - date = datetime(*struct_time[:6]) - date = date.isoformat() - return date - - -def current_date(): - """ - Print MM DD, YYYY (Weekday Time) timestamp. - - Returns - ------- - date : str - MM DD, YYYY (Weekday Time) timestamp. - """ - now = datetime.now() - time = now.strftime("%B %d, %Y (%A %T)") - return time - - -def current_time(): - """ - Print HH:MM:SS timestamp. - - Returns - ------- - date : str - HH:MM:SS timestamp. - """ - now = datetime.now() - time = now.strftime("%H:%M:%S") - return time - - -def timestamp(): - """ - Print time stamp to be used in filename. - - Returns - ------- - formatted_time : str - %Y%m%d-%H%M%S timestamp. - """ - now = datetime.now() - formatted_time = now.strftime("%Y%m%d-%H%M%S") - return formatted_time - - -def validate(date): - """ - Validate date format. - - Parameters - ---------- - date : str - Timestamp. - - Returns - ------- - date : str - Timestamp. - """ - try: - parse(date) - except: - date = now() - return date - - -def rfc2822_to_iso8601(date): - """ - Convert RFC 2822 into ISO 8601. - - Parameters - ---------- - date : str - RFC 2822 Timestamp. - - Returns - ------- - date : str - ISO 8601 Timestamp. - """ - if parsedate(date): - try: - date = parsedate_to_datetime(date) - date = date.isoformat() - except: - date = now() - return date diff --git a/slixfeed/format.py b/slixfeed/format.py deleted file mode 100644 index 61e3983..0000000 --- a/slixfeed/format.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -TODO - -Move code from sqlite.get_entry_unread - -if num > 1: - news_list += ( - "\n{}\n{}\n{}\n" - ).format(str(title), str(link), str(feed_title)) -else: - news_list = ( - "{}\n{}\n{}" - ).format(str(title), str(link), str(feed_title)) - -""" diff --git a/slixfeed/read.py b/slixfeed/read.py deleted file mode 100644 index b35ad02..0000000 --- a/slixfeed/read.py +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -TODO - -1) is_feed: Look into the type ("atom", "rss2" etc.) - -""" - - -def title(feed): - """ - Get title of feed. - - Parameters - ---------- - url : str - URL. - feed : dict - Parsed feed document. - - Returns - ------- - title : str - Title or None. - """ - try: - title = feed["feed"]["title"] - except: - title = None - return title - - -def is_feed(feed): - """ - Determine whether document is feed or not. - - Parameters - ---------- - feed : dict - Parsed feed. - - Returns - ------- - val : boolean - True or False. 
- """ - msg = None - if not feed.entries: - try: - feed["feed"]["title"] - val = True - # msg = ( - # "Empty feed for {}" - # ).format(url) - except: - val = False - # msg = ( - # "No entries nor title for {}" - # ).format(url) - elif feed.bozo: - val = False - # msg = ( - # "Bozo detected for {}" - # ).format(url) - else: - val = True - # msg = ( - # "Good feed for {}" - # ).format(url) - print(msg) - return val diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py index 5fbb6fd..f883ea2 100644 --- a/slixfeed/sqlite.py +++ b/slixfeed/sqlite.py @@ -20,9 +20,8 @@ TODO """ from asyncio import Lock -import slixfeed.dt as dt from slixfeed.log import Logger -from slixfeed.url import join_url +from slixfeed.utilities import DateAndTime, Url from sqlite3 import connect, Error, IntegrityError import sys import time @@ -2736,7 +2735,7 @@ def get_invalid_entries(db_file, url, feed): title = feed["feed"]["title"] # Prepare a link to compare if entry.has_key("link"): - link = join_url(url, entry.link) + link = Url.join_url(url, entry.link) else: link = url # Compare date, link and title @@ -2745,7 +2744,7 @@ def get_invalid_entries(db_file, url, feed): # print("compare published:", title, link, time) # print("compare published:", entry_title, entry_link, timestamp) # print("============") - time = dt.rfc2822_to_iso8601(entry.published) + time = DateAndTime.rfc2822_to_iso8601(entry.published) if (entry_title == title and entry_link == link and timestamp == time): diff --git a/slixfeed/syndication.py b/slixfeed/syndication.py index 8aa262a..52c74ff 100644 --- a/slixfeed/syndication.py +++ b/slixfeed/syndication.py @@ -29,12 +29,10 @@ from feedparser import parse import os import slixfeed.config as config from slixfeed.config import Config -import slixfeed.crawl as crawl -import slixfeed.dt as dt import slixfeed.fetch as fetch from slixfeed.log import Logger import slixfeed.sqlite as sqlite -from slixfeed.url import join_url, trim_url +from slixfeed.utilities import DateAndTime, Url from slixfeed.utilities import Html, MD from slixmpp.xmlstream import ET import sys @@ -56,7 +54,7 @@ class Feed: if not os.path.isdir(cache_dir + '/' + ext): os.mkdir(cache_dir + '/' + ext) filename = os.path.join( - cache_dir, ext, 'slixfeed_' + dt.timestamp() + '.' + ext) + cache_dir, ext, 'slixfeed_' + DateAndTime.timestamp() + '.' + ext) db_file = config.get_pathname_to_database(jid_bare) results = sqlite.get_feeds(db_file) match ext: @@ -220,6 +218,7 @@ class Feed: return node_entry + # Look into the type ("atom", "rss2" etc.) def is_feed(url, feed): """ Determine whether document is feed or not. @@ -301,7 +300,7 @@ class Feed: if "updated_parsed" in feed["feed"].keys(): updated = feed["feed"]["updated_parsed"] try: - updated = dt.convert_struct_time_to_iso8601(updated) + updated = DateAndTime.convert_struct_time_to_iso8601(updated) except Exception as e: logger.error(str(e)) updated = '' @@ -325,7 +324,7 @@ class Feed: if feed.has_key('updated_parsed'): feed_updated = feed.updated_parsed try: - feed_updated = dt.convert_struct_time_to_iso8601(feed_updated) + feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated) except Exception as e: logger.error(str(e)) feed_updated = None @@ -357,7 +356,7 @@ class Feed: # NOTE Do not be tempted to return a compact dictionary. # That is, dictionary within dictionary # Return multiple dictionaries in a list or tuple. 
- result = await crawl.probe_page(url, document) + result = await FeedDiscovery.probe_page(url, document) if not result: # Get out of the loop with dict indicating error. result_final = {'link' : url, @@ -437,16 +436,16 @@ class Feed: title = "*** No title ***" if entry.has_key("link"): # link = complete_url(source, entry.link) - link = join_url(url, entry.link) - link = trim_url(link) + link = Url.join_url(url, entry.link) + link = Url.trim_url(link) else: link = "*** No link ***" if entry.has_key("published"): date = entry.published - date = dt.rfc2822_to_iso8601(date) + date = DateAndTime.rfc2822_to_iso8601(date) elif entry.has_key("updated"): date = entry.updated - date = dt.rfc2822_to_iso8601(date) + date = DateAndTime.rfc2822_to_iso8601(date) else: date = "*** No date ***" response += ("Title : {}\n" @@ -481,10 +480,10 @@ class Feed: title = '*** No title ***' if entry.has_key("published"): date = entry.published - date = dt.rfc2822_to_iso8601(date) + date = DateAndTime.rfc2822_to_iso8601(date) elif entry.has_key("updated"): date = entry.updated - date = dt.rfc2822_to_iso8601(date) + date = DateAndTime.rfc2822_to_iso8601(date) else: date = '*** No date ***' if entry.has_key("summary"): @@ -500,8 +499,8 @@ class Feed: summary = '*** No summary ***' if entry.has_key("link"): # link = complete_url(source, entry.link) - link = join_url(url, entry.link) - link = trim_url(link) + link = Url.join_url(url, entry.link) + link = Url.trim_url(link) else: link = '*** No link ***' response = ("{}\n" @@ -543,7 +542,7 @@ class Feed: if feed.has_key('updated_parsed'): feed_updated = feed.updated_parsed try: - feed_updated = dt.convert_struct_time_to_iso8601(feed_updated) + feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated) except: feed_updated = '' else: @@ -598,18 +597,18 @@ class Feed: logger.debug('{}: entry: {}'.format(function_name, entry.link)) if entry.has_key("published"): entry_published = entry.published - entry_published = dt.rfc2822_to_iso8601(entry_published) + entry_published = DateAndTime.rfc2822_to_iso8601(entry_published) else: entry_published = '' if entry.has_key("updated"): entry_updated = entry.updated - entry_updated = dt.rfc2822_to_iso8601(entry_updated) + entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated) else: - entry_updated = dt.now() + entry_updated = DateAndTime.now() if entry.has_key("link"): # link = complete_url(source, entry.link) - entry_link = join_url(feed_url, entry.link) - entry_link = trim_url(entry_link) + entry_link = Url.join_url(feed_url, entry.link) + entry_link = Url.trim_url(entry_link) else: entry_link = feed_url # title = feed["feed"]["title"] @@ -783,8 +782,8 @@ class Feed: # if (e_link.rel == "enclosure" and # media_type in ("audio", "image", "video")): # media_link = e_link.href - # media_link = join_url(url, e_link.href) - # media_link = trim_url(media_link) + # media_link = Url.join_url(url, e_link.href) + # media_link = Url.trim_url(media_link) ########################################################### @@ -821,6 +820,442 @@ class Feed: return new_entries +""" + +FIXME + +1) https://wiki.pine64.org + File "/slixfeed/crawl.py", line 178, in feed_mode_guess + address = Url.join_url(url, parted_url.path.split('/')[1] + path) + ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^ + IndexError: list index out of range + +TODO + +1.1) Attempt to scan more paths: /blog/, /news/ etc., including root / + Attempt to scan sub domains + https://esmailelbob.xyz/en/ + https://blog.esmailelbob.xyz/feed/ + +1.2) Consider utilizing fetch.http_response + 
+2) DeviantArt + https://www.deviantart.com/nedesem/gallery + https://backend.deviantart.com/rss.xml?q=gallery:nedesem + https://backend.deviantart.com/rss.xml?q=nedesem + + https://www.deviantart.com/search?q= + https://backend.deviantart.com/rss.xml?q=search: + +FEEDS CRAWLER PROJECT + +3) Mark redirects for manual check + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/atom.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/feed.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/news.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/news.xml.php + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/rdf.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/rss.xml + +Title : JSON Feed +Link : https://www.jsonfeed.org/feed.json/videos.xml + + +""" + +from aiohttp import ClientError, ClientSession, ClientTimeout +from lxml import etree +from lxml import html +from lxml.etree import fromstring + + +class FeedDiscovery: + + +# TODO Use boolean as a flag to determine whether a single URL was found +# async def probe_page( +# callback, url, document, num=None, db_file=None): +# result = None +# try: +# # tree = etree.fromstring(res[0]) # etree is for xml +# tree = html.fromstring(document) +# except: +# result = ( +# "> {}\nFailed to parse URL as feed." +# ).format(url) +# if not result: +# print("RSS Auto-Discovery Engaged") +# result = await feed_mode_auto_discovery(url, tree) +# if not result: +# print("RSS Scan Mode Engaged") +# result = await feed_mode_scan(url, tree) +# if not result: +# print("RSS Arbitrary Mode Engaged") +# result = await feed_mode_request(url, tree) +# if not result: +# result = ( +# "> {}\nNo news feeds were found for URL." +# ).format(url) +# # elif msg: +# else: +# if isinstance(result, str): +# return result +# elif isinstance(result, list): +# url = result[0] +# if db_file: +# # print("if db_file", db_file) +# return await callback(db_file, url) +# elif num: +# return await callback(url, num) +# else: +# return await callback(url) + + async def probe_page(url, document=None): + """ + Parameters + ---------- + url : str + URL. + document : TYPE + DESCRIPTION. + + Returns + ------- + result : list or str + Single URL as list or selection of URLs as str. 
+ """ + if not document: + response = await fetch.http(url) + if not response['error']: + document = response['content'] + try: + # tree = etree.fromstring(res[0]) # etree is for xml + tree = html.fromstring(document) + result = None + except Exception as e: + logger.error(str(e)) + try: + # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + # xml = html.fromstring(document.encode('utf-8')) + # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') + # tree = fromstring(xml, parser=parser) + + # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported + #tree = html.fromstring(bytes(document, encoding='utf8')) + + # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html + #parser = etree.XMLParser(recover=True) + #tree = etree.fromstring(document, parser) + + tree = html.fromstring(document.encode('utf-8')) + result = None + except Exception as e: + logger.error(str(e)) + logger.warning("Failed to parse URL as feed for {}.".format(url)) + result = {'link' : None, + 'index' : None, + 'name' : None, + 'code' : None, + 'error' : True, + 'exist' : None} + if not result: + logger.debug("Feed auto-discovery engaged for {}".format(url)) + result = FeedDiscovery.feed_mode_auto_discovery(url, tree) + if not result: + logger.debug("Feed link scan mode engaged for {}".format(url)) + result = FeedDiscovery.feed_mode_scan(url, tree) + if not result: + logger.debug("Feed arbitrary mode engaged for {}".format(url)) + result = FeedDiscovery.feed_mode_guess(url, tree) + if not result: + logger.debug("No feeds were found for {}".format(url)) + result = None + result = await FeedDiscovery.process_feed_selection(url, result) + return result + + + # TODO Improve scan by gradual decreasing of path + def feed_mode_guess(url, tree): + """ + Lookup for feeds by pathname using HTTP Requests. + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. + """ + urls = [] + parted_url = urlsplit(url) + paths = config.open_config_file("lists.toml")["pathnames"] + # Check whether URL has path (i.e. not root) + # Check parted_url.path to avoid error in case root wasn't given + # TODO Make more tests + if parted_url.path and parted_url.path.split('/')[1]: + paths.extend( + [".atom", ".feed", ".rdf", ".rss"] + ) if '.rss' not in paths else -1 + # if paths.index('.rss'): + # paths.extend([".atom", ".feed", ".rdf", ".rss"]) + parted_url_path = parted_url.path if parted_url.path else '/' + for path in paths: + address = Url.join_url(url, parted_url_path.split('/')[1] + path) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_guess") + return urls + + + def feed_mode_scan(url, tree): + """ + Scan page for potential feeds by pathname. + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. 
+ """ + urls = [] + paths = config.open_config_file("lists.toml")["pathnames"] + for path in paths: + # xpath_query = "//*[@*[contains(.,'{}')]]".format(path) + # xpath_query = "//a[contains(@href,'{}')]".format(path) + num = 5 + xpath_query = ( + "(//a[contains(@href,'{}')])[position()<={}]" + ).format(path, num) + addresses = tree.xpath(xpath_query) + xpath_query = ( + "(//a[contains(@href,'{}')])[position()>last()-{}]" + ).format(path, num) + addresses += tree.xpath(xpath_query) + # NOTE Should number of addresses be limited or + # perhaps be N from the start and N from the end + for address in addresses: + address = Url.join_url(url, address.xpath('@href')[0]) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_scan") + return urls + + + def feed_mode_auto_discovery(url, tree): + """ + Lookup for feeds using RSS autodiscovery technique. + + See: https://www.rssboard.org/rss-autodiscovery + + Parameters + ---------- + db_file : str + Path to database file. + url : str + URL. + tree : TYPE + DESCRIPTION. + + Returns + ------- + msg : str + Message with URLs. + """ + xpath_query = ( + '//link[(@rel="alternate") and ' + '(@type="application/atom+xml" or ' + '@type="application/rdf+xml" or ' + '@type="application/rss+xml")]' + ) + # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href""" + # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href" + feeds = tree.xpath(xpath_query) + if feeds: + urls = [] + for feed in feeds: + # # The following code works; + # # The following code will catch + # # only valid resources (i.e. not 404); + # # The following code requires more bandwidth. + # res = await fetch.http(feed) + # if res[0]: + # disco = parse(res[0]) + # title = disco["feed"]["title"] + # msg += "{} \n {} \n\n".format(title, feed) + + # feed_name = feed.xpath('@title')[0] + # feed_addr = Url.join_url(url, feed.xpath('@href')[0]) + + # if feed_addr.startswith("/"): + # feed_addr = url + feed_addr + address = Url.join_url(url, feed.xpath('@href')[0]) + if address not in urls: + urls.extend([address]) + # breakpoint() + # print("feed_mode_auto_discovery") + return urls + + + # TODO Segregate function into function that returns + # URLs (string) and Feeds (dict) and function that + # composes text message (string). + # Maybe that's not necessary. 
+ async def process_feed_selection(url, urls): + feeds = {} + for i in urls: + result = await fetch.http(i) + if not result['error']: + document = result['content'] + status_code = result['status_code'] + if status_code == 200: # NOTE This line might be redundant + try: + feeds[i] = [parse(document)] + except: + continue + message = ( + "Web feeds found for {}\n\n```\n" + ).format(url) + urls = [] + for feed_url in feeds: + # try: + # res = await fetch.http(feed) + # except: + # continue + feed_name = None + if "title" in feeds[feed_url][0]["feed"].keys(): + feed_name = feeds[feed_url][0].feed.title + feed_name = feed_name if feed_name else "Untitled" + # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc + # AttributeError: 'str' object has no attribute 'entries' + if "entries" in feeds[feed_url][0].keys(): + feed_amnt = feeds[feed_url][0].entries + else: + continue + if feed_amnt: + # NOTE Because there could be many false positives + # which are revealed in second phase of scan, we + # could end with a single feed, which would be + # listed instead of fetched, so feed_url_mark is + # utilized in order to make fetch possible. + # NOTE feed_url_mark was a variable which stored + # single URL (probably first accepted as valid) + # in order to get an indication whether a single + # URL has been fetched, so that the receiving + # function will scan that single URL instead of + # listing it as a message. + url = {'link' : feed_url, + 'index' : None, + 'name' : feed_name, + 'code' : status_code, + 'error' : False, + 'exist' : None} + urls.extend([url]) + count = len(urls) + if count > 1: + result = urls + elif count: + result = urls[0] + else: + result = None + return result + + + # def get_discovered_feeds(url, urls): + # message = ( + # "Found {} web feeds:\n\n```\n" + # ).format(len(urls)) + # if len(urls) > 1: + # for urls in urls: + # message += ( + # "Title : {}\n" + # "Link : {}\n" + # "\n" + # ).format(url, url.title) + # message += ( + # "```\nThe above feeds were extracted from\n{}" + # ).format(url) + # elif len(urls) > 0: + # result = urls + # else: + # message = ( + # "No feeds were found for {}" + # ).format(url) + # return result + + + # Test module + # TODO ModuleNotFoundError: No module named 'slixfeed' + # import slixfeed.fetch as fetch + # from slixfeed.action import is_feed, process_feed_selection + + # async def start(url): + # while True: + # result = await fetch.http(url) + # document = result[0] + # status = result[1] + # if document: + # feed = parse(document) + # if is_feed(feed): + # print(url) + # else: + # urls = await probe_page( + # url, document) + # if len(urls) > 1: + # await process_feed_selection(urls) + # elif urls: + # url = urls[0] + # else: + # response = ( + # "> {}\nFailed to load URL. 
Reason: {}" + # ).format(url, status) + # break + # return response + + # url = "https://www.smh.com.au/rssheadlines" + # start(url) + + + + + class FeedTask: @@ -921,7 +1356,7 @@ class Opml: ETR.SubElement(head, "generator").text = "Slixfeed" ETR.SubElement(head, "urlPublic").text = ( "https://slixfeed.woodpeckersnest.space/") - time_stamp = dt.current_time() + time_stamp = DateAndTime.current_time() ETR.SubElement(head, "dateCreated").text = time_stamp ETR.SubElement(head, "dateModified").text = time_stamp body = ETR.SubElement(root, "body") diff --git a/slixfeed/url.py b/slixfeed/url.py deleted file mode 100644 index 4fef810..0000000 --- a/slixfeed/url.py +++ /dev/null @@ -1,352 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -""" - -FIXME - -1) Do not handle base64 - https://www.lilithsaintcrow.com/2024/02/love-anonymous/ - data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC - https://www.lilithsaintcrow.com/2024/02/love-anonymous//image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC - -TODO - -1) ActivityPub URL revealer activitypub_to_http. - -2) SQLite preference "instance" for preferred instances. - -""" - -from email.utils import parseaddr -import os -import random -import slixfeed.config as config -import slixfeed.fetch as fetch -from slixfeed.log import Logger -from urllib.parse import ( - parse_qs, - urlencode, - urljoin, - # urlparse, - urlsplit, - urlunsplit - ) - -logger = Logger(__name__) - - -# NOTE -# hostname and protocol are listed as one in file proxies.toml. -# Perhaps a better practice would be to have them separated. - -# NOTE -# File proxies.toml will remain as it is, in order to be -# coordinated with the dataset of project LibRedirect, even -# though rule-sets might be adopted (see )Privacy Redirect). - -def get_hostname(url): - parted_url = urlsplit(url) - hostname = parted_url.netloc - if hostname.startswith('www.'): hostname = hostname.replace('www.', '') - return hostname - - -async def replace_hostname(url, url_type): - """ - Replace hostname. - - Parameters - ---------- - url : str - URL. - url_type : str - "feed" or "link". - - Returns - ------- - url : str - URL. 
- """ - url_new = None - parted_url = urlsplit(url) - # protocol = parted_url.scheme - hostname = parted_url.netloc - hostname = hostname.replace('www.','') - pathname = parted_url.path - queries = parted_url.query - fragment = parted_url.fragment - proxies = config.open_config_file('proxies.toml')['proxies'] - for proxy_name in proxies: - proxy = proxies[proxy_name] - if hostname in proxy['hostname'] and url_type in proxy['type']: - while not url_new: - print('>>>') - print(url_new) - proxy_type = 'clearnet' - proxy_list = proxy[proxy_type] - if len(proxy_list): - # proxy_list = proxies[proxy_name][proxy_type] - proxy_url = random.choice(proxy_list) - parted_proxy_url = urlsplit(proxy_url) - protocol_new = parted_proxy_url.scheme - hostname_new = parted_proxy_url.netloc - url_new = urlunsplit([protocol_new, hostname_new, - pathname, queries, fragment]) - print(proxy_url) - print(url_new) - print('>>>') - response = await fetch.http(url_new) - if (response and - response['status_code'] == 200 and - # response.reason == 'OK' and - url_new.startswith(proxy_url)): - break - else: - config_dir = config.get_default_config_directory() - proxies_obsolete_file = config_dir + '/proxies_obsolete.toml' - proxies_file = config_dir + '/proxies.toml' - if not os.path.isfile(proxies_obsolete_file): - config.create_skeleton(proxies_file) - config.backup_obsolete(proxies_obsolete_file, - proxy_name, proxy_type, - proxy_url) - try: - config.update_proxies(proxies_file, proxy_name, - proxy_type, proxy_url) - except ValueError as e: - logger.error([str(e), proxy_url]) - url_new = None - else: - logger.warning('No proxy URLs for {}. ' - 'Please update proxies.toml' - .format(proxy_name)) - url_new = url - break - return url_new - - -def remove_tracking_parameters(url): - """ - Remove queries with tracking parameters. - - Parameters - ---------- - url : str - URL. - - Returns - ------- - url : str - URL. - """ - if url.startswith('data:') and ';base64,' in url: - return url - parted_url = urlsplit(url) - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = parted_url.path - queries = parse_qs(parted_url.query) - fragment = parted_url.fragment - trackers = config.open_config_file('queries.toml')['trackers'] - for tracker in trackers: - if tracker in queries: del queries[tracker] - queries_new = urlencode(queries, doseq=True) - url = urlunsplit([protocol, hostname, pathname, queries_new, fragment]) - return url - - -def feed_to_http(url): - """ - Replace scheme FEED by HTTP. - - Parameters - ---------- - url : str - URL. - - Returns - ------- - new_url : str - URL. - """ - par_url = urlsplit(url) - new_url = urlunsplit(['http', par_url.netloc, par_url.path, par_url.query, - par_url.fragment]) - return new_url - - -def check_xmpp_uri(uri): - """ - Check validity of XMPP URI. - - Parameters - ---------- - uri : str - URI. - - Returns - ------- - jid : str - JID or None. - """ - jid = urlsplit(uri).path - if parseaddr(jid)[1] != jid: - jid = False - return jid - - -# NOTE Read the documentation -# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin -def complete_url(source, link): - """ - Check if URL is pathname and complete it into URL. - - Parameters - ---------- - source : str - Feed URL. - link : str - Link URL or pathname. - - Returns - ------- - str - URL. 
- """ - if link.startswith('data:') and ';base64,' in link: - return link - if link.startswith('www.'): - return 'http://' + link - parted_link = urlsplit(link) - parted_feed = urlsplit(source) - if parted_link.scheme == 'magnet' and parted_link.query: - return link - if parted_link.scheme and parted_link.netloc: - return link - if link.startswith('//'): - if parted_link.netloc and parted_link.path: - new_link = urlunsplit([parted_feed.scheme, parted_link.netloc, - parted_link.path, parted_link.query, - parted_link.fragment]) - elif link.startswith('/'): - new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, - parted_link.path, parted_link.query, - parted_link.fragment]) - elif link.startswith('../'): - pathlink = parted_link.path.split('/') - pathfeed = parted_feed.path.split('/') - for i in pathlink: - if i == '..': - if pathlink.index('..') == 0: - pathfeed.pop() - else: - break - while pathlink.count('..'): - if pathlink.index('..') == 0: - pathlink.remove('..') - else: - break - pathlink = '/'.join(pathlink) - pathfeed.extend([pathlink]) - new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, - '/'.join(pathfeed), parted_link.query, - parted_link.fragment]) - else: - pathlink = parted_link.path.split('/') - pathfeed = parted_feed.path.split('/') - if link.startswith('./'): - pathlink.remove('.') - if not source.endswith('/'): - pathfeed.pop() - pathlink = '/'.join(pathlink) - pathfeed.extend([pathlink]) - new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, - '/'.join(pathfeed), parted_link.query, - parted_link.fragment]) - return new_link - - - -# TODO - -# Feed https://www.ocaml.org/feed.xml -# Link %20https://frama-c.com/fc-versions/cobalt.html%20 - -# FIXME - -# Feed https://cyber.dabamos.de/blog/feed.rss -# Link https://cyber.dabamos.de/blog/#article-2022-07-15 - -def join_url(source, link): - """ - Join base URL with given pathname. - - Parameters - ---------- - source : str - Feed URL. - link : str - Link URL or pathname. - - Returns - ------- - str - URL. - """ - if link.startswith('data:') and ';base64,' in link: - return link - if link.startswith('www.'): - new_link = 'http://' + link - elif link.startswith('%20') and link.endswith('%20'): - old_link = link.split('%20') - del old_link[0] - old_link.pop() - new_link = ''.join(old_link) - else: - new_link = urljoin(source, link) - return new_link - - -def trim_url(url): - """ - Check URL pathname for double slash. - - Parameters - ---------- - url : str - URL. - - Returns - ------- - url : str - URL. - """ - if url.startswith('data:') and ';base64,' in url: - return url - parted_url = urlsplit(url) - protocol = parted_url.scheme - hostname = parted_url.netloc - pathname = parted_url.path - queries = parted_url.query - fragment = parted_url.fragment - while '//' in pathname: - pathname = pathname.replace('//', '/') - url = urlunsplit([protocol, hostname, pathname, queries, fragment]) - return url - - -def activitypub_to_http(namespace): - """ - Replace ActivityPub namespace by HTTP. - - Parameters - ---------- - namespace : str - Namespace. - - Returns - ------- - new_url : str - URL. 
- """ diff --git a/slixfeed/utilities.py b/slixfeed/utilities.py index 2bf4092..3492a60 100644 --- a/slixfeed/utilities.py +++ b/slixfeed/utilities.py @@ -39,16 +39,27 @@ TODO """ +from datetime import datetime +from email.utils import parseaddr +from dateutil.parser import parse +from email.utils import parsedate, parsedate_to_datetime import hashlib +import os +import random import slixfeed.config as config -from slixfeed.config import Config from lxml import etree, html import slixfeed.dt as dt import slixfeed.fetch as fetch from slixfeed.log import Logger -import slixfeed.sqlite as sqlite -from slixfeed.url import join_url, complete_url import sys +from urllib.parse import ( + parse_qs, + urlencode, + urljoin, + # urlparse, + urlsplit, + urlunsplit + ) try: import tomllib @@ -58,6 +69,115 @@ except: logger = Logger(__name__) +class DateAndTime: + +#https://feedparser.readthedocs.io/en/latest/date-parsing.html + + def now(): + """ + ISO 8601 Timestamp. + + Returns + ------- + date : ??? + ISO 8601 Timestamp. + """ + date = datetime.now().isoformat() + return date + + + def convert_struct_time_to_iso8601(struct_time): + date = datetime(*struct_time[:6]) + date = date.isoformat() + return date + + + def current_date(): + """ + Print MM DD, YYYY (Weekday Time) timestamp. + + Returns + ------- + date : str + MM DD, YYYY (Weekday Time) timestamp. + """ + now = datetime.now() + time = now.strftime("%B %d, %Y (%A %T)") + return time + + + def current_time(): + """ + Print HH:MM:SS timestamp. + + Returns + ------- + date : str + HH:MM:SS timestamp. + """ + now = datetime.now() + time = now.strftime("%H:%M:%S") + return time + + + def timestamp(): + """ + Print time stamp to be used in filename. + + Returns + ------- + formatted_time : str + %Y%m%d-%H%M%S timestamp. + """ + now = datetime.now() + formatted_time = now.strftime("%Y%m%d-%H%M%S") + return formatted_time + + + def validate(date): + """ + Validate date format. + + Parameters + ---------- + date : str + Timestamp. + + Returns + ------- + date : str + Timestamp. + """ + try: + parse(date) + except: + date = DateAndTime.now() + return date + + + def rfc2822_to_iso8601(date): + """ + Convert RFC 2822 into ISO 8601. + + Parameters + ---------- + date : str + RFC 2822 Timestamp. + + Returns + ------- + date : str + ISO 8601 Timestamp. 
+ """ + if parsedate(date): + try: + date = parsedate_to_datetime(date) + date = date.isoformat() + except: + date = DateAndTime.now() + return date + + class Documentation: @@ -120,7 +240,7 @@ class Html: if len(images): image = images[0] image = str(image) - image_url = complete_url(url, image) + image_url = Url.complete_url(url, image) return image_url @@ -224,6 +344,343 @@ class Task: .format(task, jid_bare)) +""" + +FIXME + +1) Do not handle base64 + https://www.lilithsaintcrow.com/2024/02/love-anonymous/ + data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC + https://www.lilithsaintcrow.com/2024/02/love-anonymous//image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC + +TODO + +1) ActivityPub URL revealer activitypub_to_http. + +2) SQLite preference "instance" for preferred instances. + +""" + + +class Url: + +# NOTE +# hostname and protocol are listed as one in file proxies.toml. +# Perhaps a better practice would be to have them separated. + +# NOTE +# File proxies.toml will remain as it is, in order to be +# coordinated with the dataset of project LibRedirect, even +# though rule-sets might be adopted (see )Privacy Redirect). + + def get_hostname(url): + parted_url = urlsplit(url) + hostname = parted_url.netloc + if hostname.startswith('www.'): hostname = hostname.replace('www.', '') + return hostname + + + async def replace_hostname(url, url_type): + """ + Replace hostname. + + Parameters + ---------- + url : str + URL. + url_type : str + "feed" or "link". + + Returns + ------- + url : str + URL. 
+ """ + url_new = None + parted_url = urlsplit(url) + # protocol = parted_url.scheme + hostname = parted_url.netloc + hostname = hostname.replace('www.','') + pathname = parted_url.path + queries = parted_url.query + fragment = parted_url.fragment + proxies = config.open_config_file('proxies.toml')['proxies'] + for proxy_name in proxies: + proxy = proxies[proxy_name] + if hostname in proxy['hostname'] and url_type in proxy['type']: + while not url_new: + print('>>>') + print(url_new) + proxy_type = 'clearnet' + proxy_list = proxy[proxy_type] + if len(proxy_list): + # proxy_list = proxies[proxy_name][proxy_type] + proxy_url = random.choice(proxy_list) + parted_proxy_url = urlsplit(proxy_url) + protocol_new = parted_proxy_url.scheme + hostname_new = parted_proxy_url.netloc + url_new = urlunsplit([protocol_new, hostname_new, + pathname, queries, fragment]) + print(proxy_url) + print(url_new) + print('>>>') + response = await fetch.http(url_new) + if (response and + response['status_code'] == 200 and + # response.reason == 'OK' and + url_new.startswith(proxy_url)): + break + else: + config_dir = config.get_default_config_directory() + proxies_obsolete_file = config_dir + '/proxies_obsolete.toml' + proxies_file = config_dir + '/proxies.toml' + if not os.path.isfile(proxies_obsolete_file): + config.create_skeleton(proxies_file) + config.backup_obsolete(proxies_obsolete_file, + proxy_name, proxy_type, + proxy_url) + try: + config.update_proxies(proxies_file, proxy_name, + proxy_type, proxy_url) + except ValueError as e: + logger.error([str(e), proxy_url]) + url_new = None + else: + logger.warning('No proxy URLs for {}. ' + 'Please update proxies.toml' + .format(proxy_name)) + url_new = url + break + return url_new + + + def remove_tracking_parameters(url): + """ + Remove queries with tracking parameters. + + Parameters + ---------- + url : str + URL. + + Returns + ------- + url : str + URL. + """ + if url.startswith('data:') and ';base64,' in url: + return url + parted_url = urlsplit(url) + protocol = parted_url.scheme + hostname = parted_url.netloc + pathname = parted_url.path + queries = parse_qs(parted_url.query) + fragment = parted_url.fragment + trackers = config.open_config_file('queries.toml')['trackers'] + for tracker in trackers: + if tracker in queries: del queries[tracker] + queries_new = urlencode(queries, doseq=True) + url = urlunsplit([protocol, hostname, pathname, queries_new, fragment]) + return url + + + def feed_to_http(url): + """ + Replace scheme FEED by HTTP. + + Parameters + ---------- + url : str + URL. + + Returns + ------- + new_url : str + URL. + """ + par_url = urlsplit(url) + new_url = urlunsplit(['http', par_url.netloc, par_url.path, par_url.query, + par_url.fragment]) + return new_url + + + def check_xmpp_uri(uri): + """ + Check validity of XMPP URI. + + Parameters + ---------- + uri : str + URI. + + Returns + ------- + jid : str + JID or None. + """ + jid = urlsplit(uri).path + if parseaddr(jid)[1] != jid: + jid = False + return jid + + + # NOTE Read the documentation + # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin + def complete_url(source, link): + """ + Check if URL is pathname and complete it into URL. + + Parameters + ---------- + source : str + Feed URL. + link : str + Link URL or pathname. + + Returns + ------- + str + URL. 
+ """ + if link.startswith('data:') and ';base64,' in link: + return link + if link.startswith('www.'): + return 'http://' + link + parted_link = urlsplit(link) + parted_feed = urlsplit(source) + if parted_link.scheme == 'magnet' and parted_link.query: + return link + if parted_link.scheme and parted_link.netloc: + return link + if link.startswith('//'): + if parted_link.netloc and parted_link.path: + new_link = urlunsplit([parted_feed.scheme, parted_link.netloc, + parted_link.path, parted_link.query, + parted_link.fragment]) + elif link.startswith('/'): + new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, + parted_link.path, parted_link.query, + parted_link.fragment]) + elif link.startswith('../'): + pathlink = parted_link.path.split('/') + pathfeed = parted_feed.path.split('/') + for i in pathlink: + if i == '..': + if pathlink.index('..') == 0: + pathfeed.pop() + else: + break + while pathlink.count('..'): + if pathlink.index('..') == 0: + pathlink.remove('..') + else: + break + pathlink = '/'.join(pathlink) + pathfeed.extend([pathlink]) + new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, + '/'.join(pathfeed), parted_link.query, + parted_link.fragment]) + else: + pathlink = parted_link.path.split('/') + pathfeed = parted_feed.path.split('/') + if link.startswith('./'): + pathlink.remove('.') + if not source.endswith('/'): + pathfeed.pop() + pathlink = '/'.join(pathlink) + pathfeed.extend([pathlink]) + new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc, + '/'.join(pathfeed), parted_link.query, + parted_link.fragment]) + return new_link + + + + # TODO + + # Feed https://www.ocaml.org/feed.xml + # Link %20https://frama-c.com/fc-versions/cobalt.html%20 + + # FIXME + + # Feed https://cyber.dabamos.de/blog/feed.rss + # Link https://cyber.dabamos.de/blog/#article-2022-07-15 + + def join_url(source, link): + """ + Join base URL with given pathname. + + Parameters + ---------- + source : str + Feed URL. + link : str + Link URL or pathname. + + Returns + ------- + str + URL. + """ + if link.startswith('data:') and ';base64,' in link: + return link + if link.startswith('www.'): + new_link = 'http://' + link + elif link.startswith('%20') and link.endswith('%20'): + old_link = link.split('%20') + del old_link[0] + old_link.pop() + new_link = ''.join(old_link) + else: + new_link = urljoin(source, link) + return new_link + + + def trim_url(url): + """ + Check URL pathname for double slash. + + Parameters + ---------- + url : str + URL. + + Returns + ------- + url : str + URL. + """ + if url.startswith('data:') and ';base64,' in url: + return url + parted_url = urlsplit(url) + protocol = parted_url.scheme + hostname = parted_url.netloc + pathname = parted_url.path + queries = parted_url.query + fragment = parted_url.fragment + while '//' in pathname: + pathname = pathname.replace('//', '/') + url = urlunsplit([protocol, hostname, pathname, queries, fragment]) + return url + + + def activitypub_to_http(namespace): + """ + Replace ActivityPub namespace by HTTP. + + Parameters + ---------- + namespace : str + Namespace. + + Returns + ------- + new_url : str + URL. 
+ """ + + + class Utilities: diff --git a/slixfeed/version.py b/slixfeed/version.py index 473ee71..5c7be88 100644 --- a/slixfeed/version.py +++ b/slixfeed/version.py @@ -1,2 +1,2 @@ -__version__ = '0.1.81' -__version_info__ = (0, 1, 81) +__version__ = '0.1.82' +__version_info__ = (0, 1, 82) diff --git a/slixfeed/xmpp/chat.py b/slixfeed/xmpp/chat.py index 60b54c7..51abb55 100644 --- a/slixfeed/xmpp/chat.py +++ b/slixfeed/xmpp/chat.py @@ -29,16 +29,11 @@ import slixfeed.config as config from slixfeed.config import Config from slixfeed.log import Logger import slixfeed.sqlite as sqlite -from slixfeed.url import ( - remove_tracking_parameters, - replace_hostname, - ) from slixfeed.syndication import FeedTask -from slixfeed.utilities import Documentation, Html, MD, Task +from slixfeed.utilities import Documentation, Html, MD, Task, Url from slixfeed.xmpp.commands import XmppCommands from slixfeed.xmpp.message import XmppMessage from slixfeed.xmpp.presence import XmppPresence -from slixfeed.xmpp.privilege import is_operator, is_moderator from slixfeed.xmpp.status import XmppStatusTask from slixfeed.xmpp.upload import XmppUpload from slixfeed.xmpp.utilities import XmppUtilities @@ -89,7 +84,7 @@ class XmppChat: if (message['muc']['nick'] == self.alias): return jid_full = str(message['from']) - if not is_moderator(self, jid_bare, jid_full): + if not XmppUtilities.is_moderator(self, jid_bare, jid_full): return if message['type'] == 'groupchat': @@ -115,7 +110,7 @@ class XmppChat: # return # approved = False jid_full = str(message['from']) - if not is_moderator(self, jid_bare, jid_full): + if not XmppUtilities.is_moderator(self, jid_bare, jid_full): return # if role == 'moderator': # approved = True @@ -257,7 +252,7 @@ class XmppChat: response = 'Current value for archive: ' response += XmppCommands.get_archive(self, jid_bare) case _ if command_lowercase.startswith('bookmark +'): - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): muc_jid = command[11:] response = await XmppCommands.bookmark_add( self, muc_jid) @@ -265,7 +260,7 @@ class XmppChat: response = ('This action is restricted. ' 'Type: adding bookmarks.') case _ if command_lowercase.startswith('bookmark -'): - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): muc_jid = command[11:] response = await XmppCommands.bookmark_del( self, muc_jid) @@ -273,7 +268,7 @@ class XmppChat: response = ('This action is restricted. ' 'Type: removing bookmarks.') case 'bookmarks': - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): response = await XmppCommands.print_bookmarks(self) else: response = ('This action is restricted. 
' @@ -333,7 +328,7 @@ class XmppChat: XmppPresence.send(self, jid_bare, status_message, status_type=status_type) filename, response = XmppCommands.export_feeds( - self, jid_bare, ext) + jid_bare, ext) url = await XmppUpload.start(self, jid_bare, filename) # response = ( # 'Feeds exported successfully to {}.\n{}' @@ -388,7 +383,7 @@ class XmppChat: response = await XmppCommands.pubsub_list(self, jid) response += '```' case _ if command_lowercase.startswith('pubsub send'): - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): info = command[12:] info = info.split(' ') jid = info[0] @@ -461,7 +456,7 @@ class XmppChat: await XmppChatAction.send_unread_items(self, jid_bare, num) XmppStatusTask.restart_task(self, jid_bare) case _ if command_lowercase.startswith('node delete'): - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): info = command[12:] info = info.split(' ') response = XmppCommands.node_delete(self, info) @@ -469,7 +464,7 @@ class XmppChat: response = ('This action is restricted. ' 'Type: sending news to PubSub.') case _ if command_lowercase.startswith('node purge'): - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): info = command[11:] info = info.split(' ') response = XmppCommands.node_purge(self, info) @@ -770,8 +765,8 @@ class XmppChatAction: else: summary = '*** No summary ***' link = result[2] - link = remove_tracking_parameters(link) - link = await replace_hostname(link, "link") or link + link = Url.remove_tracking_parameters(link) + link = await Url.replace_hostname(link, "link") or link feed_id = result[4] # news_item = ("\n{}\n{}\n{} [{}]\n").format(str(title), str(link), # str(feed_title), str(ix)) diff --git a/slixfeed/xmpp/client.py b/slixfeed/xmpp/client.py index 304d15c..fe160bf 100644 --- a/slixfeed/xmpp/client.py +++ b/slixfeed/xmpp/client.py @@ -44,14 +44,11 @@ import slixmpp import slixfeed.config as config from slixfeed.config import Config -import slixfeed.crawl as crawl -import slixfeed.dt as dt import slixfeed.fetch as fetch from slixfeed.log import Logger import slixfeed.sqlite as sqlite -from slixfeed.syndication import Feed, FeedTask, Opml -import slixfeed.url as uri -from slixfeed.utilities import Html, Task, Utilities +from slixfeed.syndication import Feed, FeedDiscovery, FeedTask, Opml +from slixfeed.utilities import DateAndTime, Html, Task, Url, Utilities from slixfeed.version import __version__ from slixfeed.xmpp.bookmark import XmppBookmark from slixfeed.xmpp.chat import XmppChat, XmppChatTask @@ -62,7 +59,6 @@ from slixfeed.xmpp.message import XmppMessage from slixfeed.xmpp.muc import XmppMuc from slixfeed.xmpp.groupchat import XmppGroupchat from slixfeed.xmpp.presence import XmppPresence -from slixfeed.xmpp.privilege import is_operator, is_access import slixfeed.xmpp.profile as profile from slixfeed.xmpp.publish import XmppPubsub, XmppPubsubAction, XmppPubsubTask from slixfeed.xmpp.roster import XmppRoster @@ -791,7 +787,7 @@ class XmppClient(slixmpp.ClientXMPP): # ) # NOTE https://codeberg.org/poezio/slixmpp/issues/3515 - # if is_operator(self, jid_bare): + # if XmppUtilities.is_operator(self, jid_bare): self['xep_0050'].add_command(node='subscription', name='🪶️ Subscribe', handler=self._handle_subscription_add) @@ -842,7 +838,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if 
XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'PubSub') form['instructions'] = 'Publish news items to PubSub nodes.' options = form.add_field(desc='From which medium source do you ' @@ -863,7 +859,7 @@ class XmppClient(slixmpp.ClientXMPP): session['prev'] = None session['payload'] = form else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -883,7 +879,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): values = payload['values'] form = self['xep_0004'].make_form('form', 'Publish') form['instructions'] = ('Choose a PubSub Jabber ID and verify ' @@ -971,7 +967,7 @@ class XmppClient(slixmpp.ClientXMPP): session['has_next'] = True session['prev'] = self._handle_publish else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -994,7 +990,7 @@ class XmppClient(slixmpp.ClientXMPP): print(values['jid']) jid = values['jid'] if 'jid' in values else None jid_bare = session['from'].bare - if jid != jid_bare and not is_operator(self, jid_bare): + if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare): text_warn = ('Posting to {} is restricted to operators only.' .format(jid_bare)) # Should not this be self.boundjid.bare? session['allow_prev'] = False @@ -1065,7 +1061,7 @@ class XmppClient(slixmpp.ClientXMPP): ixs = values['entries'] #if jid: jid = jid[0] if isinstance(jid, list) else jid jid_bare = session['from'].bare - if jid != jid_bare and not is_operator(self, jid_bare): + if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare): # TODO Report incident text_warn = 'You are not suppose to be here.' session['allow_prev'] = False @@ -1100,7 +1096,7 @@ class XmppClient(slixmpp.ClientXMPP): values = payload['values'] jid = values['jid'] if 'jid' in values else None jid_bare = session['from'].bare - if jid != jid_bare and not is_operator(self, jid_bare): + if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare): # TODO Report incident text_warn = 'You are not suppose to be here.' # text_warn = ('Posting to {} is restricted to operators only.' 
@@ -1119,7 +1115,7 @@ class XmppClient(slixmpp.ClientXMPP): if jid == self.boundjid.bare: node = 'urn:xmpp:microblog:0' else: - node = uri.get_hostname(url) + node = Url.get_hostname(url) form = self['xep_0004'].make_form('form', 'Publish') while True: result = await fetch.http(url) @@ -1137,7 +1133,7 @@ class XmppClient(slixmpp.ClientXMPP): if "title" in feed["feed"].keys(): title = feed["feed"]["title"] else: - title = uri.get_hostname(url) + title = Url.get_hostname(url) entries = feed.entries entry_ix = 0 for entry in entries: @@ -1146,10 +1142,10 @@ class XmppClient(slixmpp.ClientXMPP): else: if entry.has_key("published"): title = entry.published - title = dt.rfc2822_to_iso8601(title) + title = DateAndTime.rfc2822_to_iso8601(title) elif entry.has_key("updated"): title = entry.updated - title = dt.rfc2822_to_iso8601(title) + title = DateAndTime.rfc2822_to_iso8601(title) else: title = "*** No title ***" options.addOption(title, str(entry_ix)) @@ -1164,7 +1160,7 @@ class XmppClient(slixmpp.ClientXMPP): session['payload'] = form break else: - result = await crawl.probe_page(url, document) + result = await FeedDiscovery.probe_page(url, document) if isinstance(result, list): results = result form['instructions'] = ('Discovered {} subscriptions ' @@ -1225,7 +1221,7 @@ class XmppClient(slixmpp.ClientXMPP): jid = values['jid'][0] if 'jid' in values else None #if jid: jid = jid[0] if isinstance(jid, list) else jid jid_bare = session['from'].bare - if jid != jid_bare and not is_operator(self, jid_bare): + if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare): # TODO Report incident text_warn = 'You are not suppose to be here.' session['allow_prev'] = False @@ -1262,10 +1258,10 @@ class XmppClient(slixmpp.ClientXMPP): # else: # if feed.entries[entry].has_key("published"): # title = feed.entries[entry].published - # title = dt.rfc2822_to_iso8601(title) + # title = DateAndTime.rfc2822_to_iso8601(title) # elif feed.entries[entry].has_key("updated"): # title = feed.entries[entry].updated - # title = dt.rfc2822_to_iso8601(title) + # title = DateAndTime.rfc2822_to_iso8601(title) # else: # title = "*** No title ***" # if feed.entries[entry].has_key("summary"): @@ -1393,7 +1389,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): jid = session['from'].bare db_file = config.get_pathname_to_database(jid_bare) form = self['xep_0004'].make_form('form', 'Filters') @@ -1432,7 +1428,7 @@ class XmppClient(slixmpp.ClientXMPP): session['next'] = self._handle_filters_complete session['payload'] = form else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -1502,7 +1498,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'Subscribe') # form['instructions'] = 'Add a new custom subscription.' 
form.add_field(desc='Enter a URL.', @@ -1517,7 +1513,7 @@ class XmppClient(slixmpp.ClientXMPP): required=True, value='http://', var='subscription') - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): # form['instructions'] = ('Special section for operators:\n' # 'This section allows you to add ' # 'subscriptions for a JID of your ' @@ -1544,7 +1540,7 @@ class XmppClient(slixmpp.ClientXMPP): session['prev'] = None session['payload'] = form else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -1576,7 +1572,7 @@ class XmppClient(slixmpp.ClientXMPP): # options.addOption('News by tag', 'tag') options.addOption('Rejected', 'reject') options.addOption('Unread', 'unread') - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): # form['instructions'] = ('Special section for operators:\n' # 'This section allows you to view news items ' # 'of a JID of your choice.') @@ -1617,7 +1613,7 @@ class XmppClient(slixmpp.ClientXMPP): jid_bare = session['from'].bare values = payload['values'] form = self['xep_0004'].make_form('form', 'Updates') - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'] form.add_field(var='jid', ftype='hidden', @@ -1675,7 +1671,7 @@ class XmppClient(slixmpp.ClientXMPP): ix = values['update'] jid_bare = session['from'].bare form = self['xep_0004'].make_form('form', 'Article') - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid = values['jid'] jid_bare = jid[0] if isinstance(jid, list) else jid form.add_field(var='jid', @@ -1688,9 +1684,9 @@ class XmppClient(slixmpp.ClientXMPP): url = sqlite.get_entry_url(db_file, ix) url = url[0] # TODO Handle a situation when index is no longer exist logger.debug('Original URL: {}'.format(url)) - url = uri.remove_tracking_parameters(url) + url = Url.remove_tracking_parameters(url) logger.debug('Processed URL (tracker removal): {}'.format(url)) - url = (await uri.replace_hostname(url, 'link')) or url + url = (await Url.replace_hostname(url, 'link')) or url logger.debug('Processed URL (replace hostname): {}'.format(url)) # result = await fetch.http(url) # if 'content' in result: @@ -1750,7 +1746,7 @@ class XmppClient(slixmpp.ClientXMPP): identifier = values['identifier'] if 'identifier' in values else None url = values['subscription'] jid_bare = session['from'].bare - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: custom_jid = values['jid'] jid_bare = custom_jid[0] if isinstance(custom_jid, list) else jid_bare # jid_bare = custom_jid[0] if custom_jid else jid_bare @@ -1780,7 +1776,7 @@ class XmppClient(slixmpp.ClientXMPP): session['prev'] = None # elif not identifier: # counter = 0 - # hostname = uri.get_hostname(url) + # hostname = Url.get_hostname(url) # identifier = hostname + ':' + str(counter) # while True: # if sqlite.check_identifier_exist(db_file, identifier): @@ -1797,7 +1793,7 @@ class XmppClient(slixmpp.ClientXMPP): exist_count = 0 for url in urls: counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) identifier = hostname + ':' + str(counter) while True: if sqlite.check_identifier_exist(db_file, identifier): @@ -1830,7 +1826,7 
@@ class XmppClient(slixmpp.ClientXMPP): if isinstance(url, list): url = url[0] counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) identifier = hostname + ':' + str(counter) while True: if sqlite.check_identifier_exist(db_file, identifier): @@ -1956,7 +1952,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare values = payload['values'] - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'][0] del values['jid'] db_file = config.get_pathname_to_database(jid_bare) @@ -1981,7 +1977,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare values = payload['values'] - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'][0] del values['jid'] db_file = config.get_pathname_to_database(jid_bare) @@ -2022,7 +2018,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'Discover & Search') form['instructions'] = 'Discover news subscriptions of all kinds' options = form.add_field(desc='Select type of search.', @@ -2039,7 +2035,7 @@ class XmppClient(slixmpp.ClientXMPP): session['payload'] = form session['prev'] = None else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -2146,7 +2142,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'Subscriptions') form['instructions'] = ('Browse, view, toggle or remove ' 'tags and subscriptions.') @@ -2160,7 +2156,7 @@ class XmppClient(slixmpp.ClientXMPP): options.addOption('Browse tags', 'tag') options.addOption('Remove subscriptions', 'delete') options.addOption('Toggle subscriptions', 'toggle') - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): form['instructions'] = None # form['instructions'] = ('Special section for operators:\n' # 'This section allows you to change ' @@ -2190,7 +2186,7 @@ class XmppClient(slixmpp.ClientXMPP): session['next'] = self._handle_subscriptions_result session['has_next'] = True else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' 
@@ -2212,7 +2208,7 @@ class XmppClient(slixmpp.ClientXMPP): values = payload['values'] jid_bare = session['from'].bare form = self['xep_0004'].make_form('form', 'Subscriptions') - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'] form.add_field(ftype='hidden', value=jid_bare, @@ -2306,7 +2302,7 @@ class XmppClient(slixmpp.ClientXMPP): form = self['xep_0004'].make_form('form', 'Subscriptions') jid_bare = session['from'].bare values = payload['values'] - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'][0] form.add_field(ftype='hidden', value=jid_bare, @@ -2344,7 +2340,7 @@ class XmppClient(slixmpp.ClientXMPP): form = self['xep_0004'].make_form('form', 'Subscription') jid_bare = session['from'].bare values = payload['values'] - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'][0] if values['jid'] else jid_bare form.add_field(ftype='hidden', value=jid_bare, @@ -2440,7 +2436,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare values = payload['values'] - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid_bare = values['jid'][0] db_file = config.get_pathname_to_database(jid_bare) # url = values['url'] @@ -2506,14 +2502,14 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'Advanced') form['instructions'] = 'Extended options' options = form.add_field(ftype='list-single', label='Choose', required=True, var='option') - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): options.addOption('Administration', 'admin') # options.addOption('Activity', 'activity') # options.addOption('Filters', 'filter') @@ -2527,7 +2523,7 @@ class XmppClient(slixmpp.ClientXMPP): session['next'] = self._handle_advanced_result session['prev'] = self._handle_advanced else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -2556,7 +2552,7 @@ class XmppClient(slixmpp.ClientXMPP): case 'admin': # NOTE Even though this check is already conducted on previous # form, this check is being done just in case. - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): if self.is_component: # NOTE This will be changed with XEP-0222 XEP-0223 text_info = ('Subscriber management options are ' @@ -2589,7 +2585,7 @@ class XmppClient(slixmpp.ClientXMPP): else: logger.warning('An unauthorized attempt to access ' 'bookmarks has been detected for JID {} at ' - '{}'.format(jid_bare, dt.timestamp())) + '{}'.format(jid_bare, DateAndTime.timestamp())) text_warn = 'This resource is restricted.' 
session['notes'] = [['warn', text_warn]] session['has_next'] = False @@ -2617,7 +2613,7 @@ class XmppClient(slixmpp.ClientXMPP): required=True, var='url') url['validate']['datatype'] = 'xs:anyURI' - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): form.add_field(ftype='fixed', label='* Operators', desc='This section allows you to import ' @@ -2651,7 +2647,7 @@ class XmppClient(slixmpp.ClientXMPP): options.addOption('OPML', 'opml') # options.addOption('HTML', 'html') # options.addOption('XBEL', 'xbel') - if is_operator(self, jid_bare): + if XmppUtilities.is_operator(self, jid_bare): # form['instructions'] = ('Special section for operators:\n' # 'This section allows you to ' # 'import and export subscriptions ' @@ -2841,7 +2837,7 @@ class XmppClient(slixmpp.ClientXMPP): url = values['url'] if url.startswith('http') and url.endswith('.opml'): jid_bare = session['from'].bare - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid = values['jid'] jid_bare = jid[0] if isinstance(jid, list) else jid db_file = config.get_pathname_to_database(jid_bare) @@ -2882,7 +2878,7 @@ class XmppClient(slixmpp.ClientXMPP): # form['type'] = 'result' values = payload['values'] jid_bare = session['from'].bare - if is_operator(self, jid_bare) and 'jid' in values: + if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values: jid = values['jid'] jid_bare = jid[0] if isinstance(jid, list) else jid # form = self['xep_0004'].make_form('result', 'Done') @@ -2915,7 +2911,7 @@ class XmppClient(slixmpp.ClientXMPP): jid_bare = session['from'].bare jid_full = str(session['from']) chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): form = self['xep_0004'].make_form('form', 'Subscribe') # NOTE Refresh button would be of use form['instructions'] = 'Featured subscriptions' @@ -2938,7 +2934,7 @@ class XmppClient(slixmpp.ClientXMPP): if '@' in jid_bare: hostname = jid_bare.split('@')[1] url = 'http://' + hostname - result = await crawl.probe_page(url) + result = await FeedDiscovery.probe_page(url) if not result: url = {'url' : url, 'index' : None, @@ -2966,7 +2962,7 @@ class XmppClient(slixmpp.ClientXMPP): session['payload'] = form session['prev'] = self._handle_promoted else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' @@ -3620,7 +3616,7 @@ class XmppClient(slixmpp.ClientXMPP): .format(function_name, jid_full)) jid_bare = session['from'].bare chat_type = await XmppUtilities.get_chat_type(self, jid_bare) - if is_access(self, jid_bare, jid_full, chat_type): + if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type): db_file = config.get_pathname_to_database(jid_bare) if jid_bare not in self.settings: Config.add_settings_jid(self.settings, jid_bare, db_file) @@ -3718,7 +3714,7 @@ class XmppClient(slixmpp.ClientXMPP): session['next'] = self._handle_settings_complete session['payload'] = form else: - if not is_operator(self, jid_bare): + if not XmppUtilities.is_operator(self, jid_bare): text_warn = 'This resource is restricted to operators.' elif chat_type == 'groupchat': text_warn = ('This resource is restricted to moderators of {}.' 
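# Illustrative sketch (annotation, not part of the patch): the hunks above and
# below repeatedly derive a feed identifier from the hostname plus a numeric
# suffix. Url.get_hostname and sqlite.check_identifier_exist are used with the
# same signatures as in the hunks; the helper name make_unique_identifier and
# the loop body beyond the truncated lines are assumptions.
import slixfeed.sqlite as sqlite
from slixfeed.utilities import Url

def make_unique_identifier(db_file, url):
    counter = 0
    hostname = Url.get_hostname(url)
    hostname = hostname.replace('.', '-')
    identifier = hostname + ':' + str(counter)
    # Bump the suffix until the identifier is unused in this subscriber database.
    while sqlite.check_identifier_exist(db_file, identifier):
        counter += 1
        identifier = hostname + ':' + str(counter)
    return identifier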
diff --git a/slixfeed/xmpp/commands.py b/slixfeed/xmpp/commands.py index fcd242b..93fcbe1 100644 --- a/slixfeed/xmpp/commands.py +++ b/slixfeed/xmpp/commands.py @@ -5,14 +5,11 @@ from feedparser import parse from random import randrange import slixfeed.config as config from slixfeed.config import Config -import slixfeed.crawl as crawl -import slixfeed.dt as dt import slixfeed.fetch as fetch from slixfeed.log import Logger import slixfeed.sqlite as sqlite -from slixfeed.syndication import Feed, Opml -import slixfeed.url as uri -from slixfeed.utilities import Documentation, Utilities +from slixfeed.syndication import Feed, FeedDiscovery, Opml +from slixfeed.utilities import DateAndTime, Documentation, Url, Utilities from slixfeed.version import __version__ from slixfeed.xmpp.bookmark import XmppBookmark from slixfeed.xmpp.muc import XmppMuc @@ -121,9 +118,9 @@ class XmppCommands: """ if url.startswith('http'): if not title: - title = uri.get_hostname(url) + title = Url.get_hostname(url) counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) hostname = hostname.replace('.','-') identifier = hostname + ':' + str(counter) while True: @@ -148,7 +145,7 @@ class XmppCommands: if feed.has_key('updated_parsed'): feed_updated = feed.updated_parsed try: - feed_updated = dt.convert_struct_time_to_iso8601( + feed_updated = DateAndTime.convert_struct_time_to_iso8601( feed_updated) except: feed_updated = None @@ -393,7 +390,7 @@ class XmppCommands: identifier = info[2] else: counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) hostname = hostname.replace('.','-') identifier = hostname + ':' + str(counter) while True: @@ -417,8 +414,8 @@ class XmppCommands: if (url.startswith('feed:/') or url.startswith('itpc:/') or url.startswith('rss:/')): - url = uri.feed_to_http(url) - url = (await uri.replace_hostname(url, 'feed')) or url + url = Url.feed_to_http(url) + url = (await Url.replace_hostname(url, 'feed')) or url result = await Feed.add_feed(self, jid_bare, db_file, url, identifier) if isinstance(result, list): @@ -479,10 +476,10 @@ class XmppCommands: # both interfaces Chat and IPC async def fetch_http(self, url, db_file, jid_bare): if url.startswith('feed:/') or url.startswith('rss:/'): - url = uri.feed_to_http(url) - url = (await uri.replace_hostname(url, 'feed')) or url + url = Url.feed_to_http(url) + url = (await Url.replace_hostname(url, 'feed')) or url counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) hostname = hostname.replace('.','-') identifier = hostname + ':' + str(counter) while True: @@ -581,7 +578,7 @@ class XmppCommands: async def muc_join(self, command): if command: - muc_jid = uri.check_xmpp_uri(command) + muc_jid = Url.check_xmpp_uri(command) if muc_jid: # TODO probe JID and confirm it's a groupchat result = await XmppMuc.join(self, muc_jid) @@ -735,8 +732,8 @@ class XmppCommands: async def feed_read(self, jid_bare, data, url): if url.startswith('feed:/') or url.startswith('rss:/'): - url = uri.feed_to_http(url) - url = (await uri.replace_hostname(url, 'feed')) or url + url = Url.feed_to_http(url) + url = (await Url.replace_hostname(url, 'feed')) or url match len(data): case 1: if url.startswith('http'): @@ -750,7 +747,7 @@ class XmppCommands: message = Feed.view_feed(url, feed) break else: - result = await crawl.probe_page(url, document) + result = await FeedDiscovery.probe_page(url, document) if isinstance(result, list): results = result message = ("Syndication feeds found for {}\n\n```\n" @@ -786,7 
+783,7 @@ class XmppCommands: message = Feed.view_entry(url, feed, num) break else: - result = await crawl.probe_page(url, document) + result = await FeedDiscovery.probe_page(url, document) if isinstance(result, list): results = result message = ("Syndication feeds found for {}\n\n```\n" diff --git a/slixfeed/xmpp/connect.py b/slixfeed/xmpp/connect.py index 0978ad4..73820f7 100644 --- a/slixfeed/xmpp/connect.py +++ b/slixfeed/xmpp/connect.py @@ -14,7 +14,7 @@ TODO """ import asyncio -from slixfeed.dt import current_time +from slixfeed.utilities import DateAndTime from slixfeed.log import Logger from slixmpp.exceptions import IqTimeout, IqError from time import sleep @@ -62,17 +62,17 @@ class XmppConnect: def recover(self, message): logger.warning(message) - print(current_time(), message, 'Attempting to reconnect.') + print(DateAndTime.current_time(), message, 'Attempting to reconnect.') self.connection_attempts += 1 # if self.connection_attempts <= self.max_connection_attempts: # self.reconnect(wait=5.0) # wait a bit before attempting to reconnect # else: # print(current_time(),"Maximum connection attempts exceeded.") # logging.error("Maximum connection attempts exceeded.") - print(current_time(), 'Attempt number', self.connection_attempts) + print(DateAndTime.current_time(), 'Attempt number', self.connection_attempts) seconds = self.reconnect_timeout or 30 seconds = int(seconds) - print(current_time(), 'Next attempt within', seconds, 'seconds') + print(DateAndTime.current_time(), 'Next attempt within', seconds, 'seconds') # NOTE asyncio.sleep doesn't interval as expected # await asyncio.sleep(seconds) sleep(seconds) diff --git a/slixfeed/xmpp/privilege.py b/slixfeed/xmpp/privilege.py deleted file mode 100644 index b39f3de..0000000 --- a/slixfeed/xmpp/privilege.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - - -def is_access(self, jid_bare, jid_full, chat_type): - """Determine access privilege""" - operator = is_operator(self, jid_bare) - if operator: - if chat_type == 'groupchat': - if is_moderator(self, jid_bare, jid_full): - access = True - else: - access = True - else: - access = False - return access - - -def is_operator(self, jid_bare): - """Check if given JID is an operator""" - result = False - for operator in self.operators: - if jid_bare == operator['jid']: - result = True - # operator_name = operator['name'] - break - return result - - -def is_moderator(self, jid_bare, jid_full): - """Check if given JID is a moderator""" - alias = jid_full[jid_full.index('/')+1:] - role = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'role') - if role == 'moderator': - result = True - else: - result = False - return result - - -def is_member(self, jid_bare, jid_full): - """Check if given JID is a member""" - alias = jid_full[jid_full.index('/')+1:] - affiliation = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'affiliation') - if affiliation == 'member': - result = True - else: - result = False - return result \ No newline at end of file diff --git a/slixfeed/xmpp/publish.py b/slixfeed/xmpp/publish.py index 2588bd7..d238133 100644 --- a/slixfeed/xmpp/publish.py +++ b/slixfeed/xmpp/publish.py @@ -16,8 +16,7 @@ from slixfeed.config import Config from slixfeed.log import Logger import slixfeed.sqlite as sqlite from slixfeed.syndication import Feed -import slixfeed.url as uri -from slixfeed.utilities import Utilities +from slixfeed.utilities import Url, Utilities from slixfeed.xmpp.iq import XmppIQ import sys @@ -337,7 +336,7 @@ class 
XmppPubsubAction: node_id = node_id[0] if not node_id: counter = 0 - hostname = uri.get_hostname(url) + hostname = Url.get_hostname(url) hostname = hostname.replace('.','-') identifier = hostname + ':' + str(counter) while True: diff --git a/slixfeed/xmpp/upload.py b/slixfeed/xmpp/upload.py index c548d6c..bb5638d 100644 --- a/slixfeed/xmpp/upload.py +++ b/slixfeed/xmpp/upload.py @@ -16,7 +16,7 @@ logger = Logger(__name__) class XmppUpload: async def start(self, jid, filename, domain=None): - logger.info('Uploading file %s...', filename) + logger.info(['Uploading file %s...', filename]) try: upload_file = self['xep_0363'].upload_file # if self.encrypted and not self['xep_0454']: @@ -34,7 +34,7 @@ class XmppUpload: filename, domain, timeout=10, ) logger.info('Upload successful!') - logger.info('Sending file to %s', jid) + logger.info(['Sending file to %s', jid]) except HTTPError: url = ('Error: It appears that this server does not support ' 'HTTP File Upload.') diff --git a/slixfeed/xmpp/utilities.py b/slixfeed/xmpp/utilities.py index c1dc3e1..9b2c0a3 100644 --- a/slixfeed/xmpp/utilities.py +++ b/slixfeed/xmpp/utilities.py @@ -58,3 +58,51 @@ class XmppUtilities: # finally: # logger.info('Chat type is:', chat_type) return result + + + + def is_access(self, jid_bare, jid_full, chat_type): + """Determine access privilege""" + operator = XmppUtilities.is_operator(self, jid_bare) + if operator: + if chat_type == 'groupchat': + if XmppUtilities.is_moderator(self, jid_bare, jid_full): + access = True + else: + access = True + else: + access = False + return access + + + def is_operator(self, jid_bare): + """Check if given JID is an operator""" + result = False + for operator in self.operators: + if jid_bare == operator['jid']: + result = True + # operator_name = operator['name'] + break + return result + + + def is_moderator(self, jid_bare, jid_full): + """Check if given JID is a moderator""" + alias = jid_full[jid_full.index('/')+1:] + role = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'role') + if role == 'moderator': + result = True + else: + result = False + return result + + + def is_member(self, jid_bare, jid_full): + """Check if given JID is a member""" + alias = jid_full[jid_full.index('/')+1:] + affiliation = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'affiliation') + if affiliation == 'member': + result = True + else: + result = False + return result \ No newline at end of file
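# Illustrative sketch (annotation, not part of the patch): the relocated access
# helpers above are invoked through the class, with the client instance passed
# explicitly, mirroring the call sites changed in chat.py and client.py. The
# wrapper name can_use_command and the 'message' stanza argument are assumptions
# for this example; get_chat_type and is_access are called with the same
# signatures as in the hunks above.
from slixfeed.xmpp.utilities import XmppUtilities

async def can_use_command(self, message):
    jid_bare = message['from'].bare
    jid_full = str(message['from'])
    chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
    # Only operators pass; in a groupchat the operator must also hold the
    # moderator role.
    return XmppUtilities.is_access(self, jid_bare, jid_full, chat_type)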