Fix command export; Restructure code.

This commit is contained in:
Schimon Jehudah, Adv. 2024-06-16 11:55:22 +03:00
parent 15e6a1de66
commit 93ea8a9fab
17 changed files with 1066 additions and 1184 deletions

@ -1,436 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1) https://wiki.pine64.org
File "/slixfeed/crawl.py", line 178, in feed_mode_guess
address = join_url(url, parted_url.path.split('/')[1] + path)
~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range
TODO
1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /
Attempt to scan sub domains
https://esmailelbob.xyz/en/
https://blog.esmailelbob.xyz/feed/
1.2) Consider utilizing fetch.http_response
2) Consider merging with module fetch.py
FEEDS CRAWLER PROJECT
3) Mark redirects for manual check
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit
# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
# callback, url, document, num=None, db_file=None):
# result = None
# try:
# # tree = etree.fromstring(res[0]) # etree is for xml
# tree = html.fromstring(document)
# except:
# result = (
# "> {}\nFailed to parse URL as feed."
# ).format(url)
# if not result:
# print("RSS Auto-Discovery Engaged")
# result = await feed_mode_auto_discovery(url, tree)
# if not result:
# print("RSS Scan Mode Engaged")
# result = await feed_mode_scan(url, tree)
# if not result:
# print("RSS Arbitrary Mode Engaged")
# result = await feed_mode_request(url, tree)
# if not result:
# result = (
# "> {}\nNo news feeds were found for URL."
# ).format(url)
# # elif msg:
# else:
# if isinstance(result, str):
# return result
# elif isinstance(result, list):
# url = result[0]
# if db_file:
# # print("if db_file", db_file)
# return await callback(db_file, url)
# elif num:
# return await callback(url, num)
# else:
# return await callback(url)
logger = Logger(__name__)
async def probe_page(url, document=None):
"""
Parameters
----------
url : str
URL.
document : str, optional
HTML document; fetched from the URL when not provided.
Returns
-------
result : dict or list or None
A single feed as dict, a selection of candidate feeds as list, or None.
"""
if not document:
response = await fetch.http(url)
if not response['error']:
document = response['content']
try:
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(document)
result = None
except Exception as e:
logger.error(str(e))
try:
# /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
# xml = html.fromstring(document.encode('utf-8'))
# parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
# tree = fromstring(xml, parser=parser)
# /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
#tree = html.fromstring(bytes(document, encoding='utf8'))
# https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
#parser = etree.XMLParser(recover=True)
#tree = etree.fromstring(document, parser)
tree = html.fromstring(document.encode('utf-8'))
result = None
except Exception as e:
logger.error(str(e))
logger.warning("Failed to parse URL as feed for {}.".format(url))
result = {'link' : None,
'index' : None,
'name' : None,
'code' : None,
'error' : True,
'exist' : None}
if not result:
logger.debug("Feed auto-discovery engaged for {}".format(url))
result = await feed_mode_auto_discovery(url, tree)
if not result:
logger.debug("Feed link scan mode engaged for {}".format(url))
result = await feed_mode_scan(url, tree)
if not result:
logger.debug("Feed arbitrary mode engaged for {}".format(url))
result = await feed_mode_guess(url, tree)
if not result:
logger.debug("No feeds were found for {}".format(url))
result = None
return result
# TODO Improve scan by gradual decreasing of path
async def feed_mode_guess(url, tree):
"""
Look up feeds by pathname using HTTP requests.
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : dict or list or None
A single feed as dict, a selection of feeds as list, or None.
"""
urls = []
parted_url = urlsplit(url)
paths = config.open_config_file("lists.toml")["pathnames"]
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
if '.rss' not in paths:
    paths.extend([".atom", ".feed", ".rdf", ".rss"])
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
parted_url_path = parted_url.path if parted_url.path else '/'
for path in paths:
address = join_url(url, parted_url_path.split('/')[1] + path)
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_guess")
urls = await process_feed_selection(url, urls)
return urls
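A rough, self-contained sketch of the path-guessing step above; the pathname list is a stand-in for the "pathnames" table of lists.toml, and the URL is made up:
from urllib.parse import urljoin, urlsplit

url = 'https://example.org/news'
paths = ['/feed/', '/rss.xml', '/atom.xml', '.rss']
first_segment = urlsplit(url).path.split('/')[1]  # 'news'
candidates = [urljoin(url, first_segment + path) for path in paths]
print(candidates)
# ['https://example.org/news/feed/', 'https://example.org/news/rss.xml',
#  'https://example.org/news/atom.xml', 'https://example.org/news.rss']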
async def feed_mode_scan(url, tree):
"""
Scan page for potential feeds by pathname.
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : dict or list or None
A single feed as dict, a selection of feeds as list, or None.
"""
urls = []
paths = config.open_config_file("lists.toml")["pathnames"]
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = (
"(//a[contains(@href,'{}')])[position()<={}]"
).format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = (
"(//a[contains(@href,'{}')])[position()>last()-{}]"
).format(path, num)
addresses += tree.xpath(xpath_query)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
address = join_url(url, address.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_scan")
urls = await process_feed_selection(url, urls)
return urls
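A minimal sketch of the anchor-scanning XPath used above, assuming only lxml is installed; the HTML sample is made up:
from lxml import html

page = html.fromstring(
    '<html><body>'
    '<a href="/feed/">Feed</a>'
    '<a href="/about">About</a>'
    '<a href="/category/feed/">Category feed</a>'
    '</body></html>')
query = "(//a[contains(@href,'{}')])[position()<={}]".format('/feed/', 5)
print([a.get('href') for a in page.xpath(query)])
# ['/feed/', '/category/feed/']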
async def feed_mode_auto_discovery(url, tree):
"""
Look up feeds using the RSS autodiscovery technique.
See: https://www.rssboard.org/rss-autodiscovery
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : dict or list or None
A single feed as dict, a selection of feeds as list, or None.
"""
xpath_query = (
'//link[(@rel="alternate") and '
'(@type="application/atom+xml" or '
'@type="application/rdf+xml" or '
'@type="application/rss+xml")]'
)
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
if feeds:
urls = []
for feed in feeds:
# # The following code works;
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await fetch.http(feed)
# if res[0]:
# disco = parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
# feed_name = feed.xpath('@title')[0]
# feed_addr = join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
address = join_url(url, feed.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_auto_discovery")
urls = await process_feed_selection(url, urls)
return urls
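A self-contained sketch of the autodiscovery query above, assuming only lxml; the HTML sample is made up:
from lxml import html

head = html.fromstring(
    '<html><head>'
    '<link rel="alternate" type="application/atom+xml" href="/posts.atom"/>'
    '<link rel="stylesheet" href="/style.css"/>'
    '</head><body></body></html>')
links = head.xpath(
    '//link[(@rel="alternate") and '
    '(@type="application/atom+xml" or '
    '@type="application/rdf+xml" or '
    '@type="application/rss+xml")]')
print([link.get('href') for link in links])  # ['/posts.atom']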
# TODO Segregate function into function that returns
# URLs (string) and Feeds (dict) and function that
# composes text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
feeds = {}
for i in urls:
result = await fetch.http(i)
if not result['error']:
document = result['content']
status_code = result['status_code']
if status_code == 200: # NOTE This line might be redundant
try:
feeds[i] = [parse(document)]
except:
continue
message = (
"Web feeds found for {}\n\n```\n"
).format(url)
urls = []
for feed_url in feeds:
# try:
# res = await fetch.http(feed)
# except:
# continue
feed_name = None
if "title" in feeds[feed_url][0]["feed"].keys():
feed_name = feeds[feed_url][0].feed.title
feed_name = feed_name if feed_name else "Untitled"
# feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
# AttributeError: 'str' object has no attribute 'entries'
if "entries" in feeds[feed_url][0].keys():
feed_amnt = feeds[feed_url][0].entries
else:
continue
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_url_mark is
# utilized in order to make fetch possible.
# NOTE feed_url_mark was a variable which stored
# single URL (probably first accepted as valid)
# in order to get an indication whether a single
# URL has been fetched, so that the receiving
# function will scan that single URL instead of
# listing it as a message.
url = {'link' : feed_url,
'index' : None,
'name' : feed_name,
'code' : status_code,
'error' : False,
'exist' : None}
urls.extend([url])
count = len(urls)
if count > 1:
result = urls
elif count:
result = urls[0]
else:
result = None
return result
# def get_discovered_feeds(url, urls):
# message = (
# "Found {} web feeds:\n\n```\n"
# ).format(len(urls))
# if len(urls) > 1:
# for urls in urls:
# message += (
# "Title : {}\n"
# "Link : {}\n"
# "\n"
# ).format(url, url.title)
# message += (
# "```\nThe above feeds were extracted from\n{}"
# ).format(url)
# elif len(urls) > 0:
# result = urls
# else:
# message = (
# "No feeds were found for {}"
# ).format(url)
# return result
# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection
# async def start(url):
# while True:
# result = await fetch.http(url)
# document = result[0]
# status = result[1]
# if document:
# feed = parse(document)
# if is_feed(feed):
# print(url)
# else:
# urls = await probe_page(
# url, document)
# if len(urls) > 1:
# await process_feed_selection(urls)
# elif urls:
# url = urls[0]
# else:
# response = (
# "> {}\nFailed to load URL. Reason: {}"
# ).format(url, status)
# break
# return response
# url = "https://www.smh.com.au/rssheadlines"
# start(url)

@ -1,114 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
https://feedparser.readthedocs.io/en/latest/date-parsing.html
"""
from datetime import datetime
from dateutil.parser import parse
from email.utils import parsedate, parsedate_to_datetime
def now():
"""
ISO 8601 Timestamp.
Returns
-------
date : str
ISO 8601 Timestamp.
"""
date = datetime.now().isoformat()
return date
def convert_struct_time_to_iso8601(struct_time):
date = datetime(*struct_time[:6])
date = date.isoformat()
return date
def current_date():
"""
Print MM DD, YYYY (Weekday Time) timestamp.
Returns
-------
date : str
MM DD, YYYY (Weekday Time) timestamp.
"""
now = datetime.now()
time = now.strftime("%B %d, %Y (%A %T)")
return time
def current_time():
"""
Print HH:MM:SS timestamp.
Returns
-------
date : str
HH:MM:SS timestamp.
"""
now = datetime.now()
time = now.strftime("%H:%M:%S")
return time
def timestamp():
"""
Print time stamp to be used in filename.
Returns
-------
formatted_time : str
%Y%m%d-%H%M%S timestamp.
"""
now = datetime.now()
formatted_time = now.strftime("%Y%m%d-%H%M%S")
return formatted_time
def validate(date):
"""
Validate date format.
Parameters
----------
date : str
Timestamp.
Returns
-------
date : str
Timestamp.
"""
try:
parse(date)
except:
date = now()
return date
def rfc2822_to_iso8601(date):
"""
Convert RFC 2822 into ISO 8601.
Parameters
----------
date : str
RFC 2822 Timestamp.
Returns
-------
date : str
ISO 8601 Timestamp.
"""
if parsedate(date):
try:
date = parsedate_to_datetime(date)
date = date.isoformat()
except:
date = now()
return date
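A brief usage sketch of the converters above; the date values are arbitrary:
print(rfc2822_to_iso8601('Sun, 16 Jun 2024 11:55:22 +0300'))
# 2024-06-16T11:55:22+03:00
print(convert_struct_time_to_iso8601((2024, 6, 16, 11, 55, 22, 6, 168, 0)))
# 2024-06-16T11:55:22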

@ -1,19 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
Move code from sqlite.get_entry_unread
if num > 1:
news_list += (
"\n{}\n{}\n{}\n"
).format(str(title), str(link), str(feed_title))
else:
news_list = (
"{}\n{}\n{}"
).format(str(title), str(link), str(feed_title))
"""

@ -1,74 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) is_feed: Look into the type ("atom", "rss2" etc.)
"""
def title(feed):
"""
Get title of feed.
Parameters
----------
feed : dict
Parsed feed document.
Returns
-------
title : str
Title or None.
"""
try:
title = feed["feed"]["title"]
except:
title = None
return title
def is_feed(feed):
"""
Determine whether the document is a feed.
Parameters
----------
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
msg = None
if not feed.entries:
try:
feed["feed"]["title"]
val = True
# msg = (
# "Empty feed for {}"
# ).format(url)
except:
val = False
# msg = (
# "No entries nor title for {}"
# ).format(url)
elif feed.bozo:
val = False
# msg = (
# "Bozo detected for {}"
# ).format(url)
else:
val = True
# msg = (
# "Good feed for {}"
# ).format(url)
if msg: print(msg)
return val
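A minimal sketch of how is_feed behaves, assuming feedparser is installed; the XML and HTML samples are made up:
from feedparser import parse

rss = ('<?xml version="1.0"?><rss version="2.0"><channel>'
       '<title>Example</title><item><title>One</title></item>'
       '</channel></rss>')
print(is_feed(parse(rss)))                                       # True
print(is_feed(parse('<html><body>No feed here</body></html>')))  # False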

@ -20,9 +20,8 @@ TODO
"""
from asyncio import Lock
import slixfeed.dt as dt
from slixfeed.log import Logger
from slixfeed.url import join_url
from slixfeed.utilities import DateAndTime, Url
from sqlite3 import connect, Error, IntegrityError
import sys
import time
@ -2736,7 +2735,7 @@ def get_invalid_entries(db_file, url, feed):
title = feed["feed"]["title"]
# Prepare a link to compare
if entry.has_key("link"):
link = join_url(url, entry.link)
link = Url.join_url(url, entry.link)
else:
link = url
# Compare date, link and title
@ -2745,7 +2744,7 @@ def get_invalid_entries(db_file, url, feed):
# print("compare published:", title, link, time)
# print("compare published:", entry_title, entry_link, timestamp)
# print("============")
time = dt.rfc2822_to_iso8601(entry.published)
time = DateAndTime.rfc2822_to_iso8601(entry.published)
if (entry_title == title and
entry_link == link and
timestamp == time):

@ -29,12 +29,10 @@ from feedparser import parse
import os
import slixfeed.config as config
from slixfeed.config import Config
import slixfeed.crawl as crawl
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.url import join_url, trim_url
from slixfeed.utilities import DateAndTime, Url
from slixfeed.utilities import Html, MD
from slixmpp.xmlstream import ET
import sys
@ -56,7 +54,7 @@ class Feed:
if not os.path.isdir(cache_dir + '/' + ext):
os.mkdir(cache_dir + '/' + ext)
filename = os.path.join(
cache_dir, ext, 'slixfeed_' + dt.timestamp() + '.' + ext)
cache_dir, ext, 'slixfeed_' + DateAndTime.timestamp() + '.' + ext)
db_file = config.get_pathname_to_database(jid_bare)
results = sqlite.get_feeds(db_file)
match ext:
@ -220,6 +218,7 @@ class Feed:
return node_entry
# Look into the type ("atom", "rss2" etc.)
def is_feed(url, feed):
"""
Determine whether the document is a feed.
@ -301,7 +300,7 @@ class Feed:
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
try:
updated = dt.convert_struct_time_to_iso8601(updated)
updated = DateAndTime.convert_struct_time_to_iso8601(updated)
except Exception as e:
logger.error(str(e))
updated = ''
@ -325,7 +324,7 @@ class Feed:
if feed.has_key('updated_parsed'):
feed_updated = feed.updated_parsed
try:
feed_updated = dt.convert_struct_time_to_iso8601(feed_updated)
feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated)
except Exception as e:
logger.error(str(e))
feed_updated = None
@ -357,7 +356,7 @@ class Feed:
# NOTE Do not be tempted to return a compact dictionary.
# That is, dictionary within dictionary
# Return multiple dictionaries in a list or tuple.
result = await crawl.probe_page(url, document)
result = await FeedDiscovery.probe_page(url, document)
if not result:
# Get out of the loop with dict indicating error.
result_final = {'link' : url,
@ -437,16 +436,16 @@ class Feed:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
link = Url.join_url(url, entry.link)
link = Url.trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = dt.rfc2822_to_iso8601(date)
date = DateAndTime.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = dt.rfc2822_to_iso8601(date)
date = DateAndTime.rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
response += ("Title : {}\n"
@ -481,10 +480,10 @@ class Feed:
title = '*** No title ***'
if entry.has_key("published"):
date = entry.published
date = dt.rfc2822_to_iso8601(date)
date = DateAndTime.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = dt.rfc2822_to_iso8601(date)
date = DateAndTime.rfc2822_to_iso8601(date)
else:
date = '*** No date ***'
if entry.has_key("summary"):
@ -500,8 +499,8 @@ class Feed:
summary = '*** No summary ***'
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
link = Url.join_url(url, entry.link)
link = Url.trim_url(link)
else:
link = '*** No link ***'
response = ("{}\n"
@ -543,7 +542,7 @@ class Feed:
if feed.has_key('updated_parsed'):
feed_updated = feed.updated_parsed
try:
feed_updated = dt.convert_struct_time_to_iso8601(feed_updated)
feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated)
except:
feed_updated = ''
else:
@ -598,18 +597,18 @@ class Feed:
logger.debug('{}: entry: {}'.format(function_name, entry.link))
if entry.has_key("published"):
entry_published = entry.published
entry_published = dt.rfc2822_to_iso8601(entry_published)
entry_published = DateAndTime.rfc2822_to_iso8601(entry_published)
else:
entry_published = ''
if entry.has_key("updated"):
entry_updated = entry.updated
entry_updated = dt.rfc2822_to_iso8601(entry_updated)
entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated)
else:
entry_updated = dt.now()
entry_updated = DateAndTime.now()
if entry.has_key("link"):
# link = complete_url(source, entry.link)
entry_link = join_url(feed_url, entry.link)
entry_link = trim_url(entry_link)
entry_link = Url.join_url(feed_url, entry.link)
entry_link = Url.trim_url(entry_link)
else:
entry_link = feed_url
# title = feed["feed"]["title"]
@ -783,8 +782,8 @@ class Feed:
# if (e_link.rel == "enclosure" and
# media_type in ("audio", "image", "video")):
# media_link = e_link.href
# media_link = join_url(url, e_link.href)
# media_link = trim_url(media_link)
# media_link = Url.join_url(url, e_link.href)
# media_link = Url.trim_url(media_link)
###########################################################
@ -821,6 +820,442 @@ class Feed:
return new_entries
"""
FIXME
1) https://wiki.pine64.org
File "/slixfeed/crawl.py", line 178, in feed_mode_guess
address = Url.join_url(url, parted_url.path.split('/')[1] + path)
~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range
TODO
1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /
Attempt to scan sub domains
https://esmailelbob.xyz/en/
https://blog.esmailelbob.xyz/feed/
1.2) Consider utilizing fetch.http_response
2) DeviantArt
https://www.deviantart.com/nedesem/gallery
https://backend.deviantart.com/rss.xml?q=gallery:nedesem
https://backend.deviantart.com/rss.xml?q=nedesem
https://www.deviantart.com/search?q=
https://backend.deviantart.com/rss.xml?q=search:
FEEDS CRAWLER PROJECT
3) Mark redirects for manual check
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from lxml import etree
from lxml import html
from lxml.etree import fromstring
class FeedDiscovery:
# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
# callback, url, document, num=None, db_file=None):
# result = None
# try:
# # tree = etree.fromstring(res[0]) # etree is for xml
# tree = html.fromstring(document)
# except:
# result = (
# "> {}\nFailed to parse URL as feed."
# ).format(url)
# if not result:
# print("RSS Auto-Discovery Engaged")
# result = await feed_mode_auto_discovery(url, tree)
# if not result:
# print("RSS Scan Mode Engaged")
# result = await feed_mode_scan(url, tree)
# if not result:
# print("RSS Arbitrary Mode Engaged")
# result = await feed_mode_request(url, tree)
# if not result:
# result = (
# "> {}\nNo news feeds were found for URL."
# ).format(url)
# # elif msg:
# else:
# if isinstance(result, str):
# return result
# elif isinstance(result, list):
# url = result[0]
# if db_file:
# # print("if db_file", db_file)
# return await callback(db_file, url)
# elif num:
# return await callback(url, num)
# else:
# return await callback(url)
async def probe_page(url, document=None):
"""
Parameters
----------
url : str
URL.
document : str, optional
HTML document; fetched from the URL when not provided.
Returns
-------
result : dict or list or None
A single feed as dict, a selection of candidate feeds as list, or None.
"""
if not document:
response = await fetch.http(url)
if not response['error']:
document = response['content']
try:
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(document)
result = None
except Exception as e:
logger.error(str(e))
try:
# /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
# xml = html.fromstring(document.encode('utf-8'))
# parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
# tree = fromstring(xml, parser=parser)
# /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
#tree = html.fromstring(bytes(document, encoding='utf8'))
# https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
#parser = etree.XMLParser(recover=True)
#tree = etree.fromstring(document, parser)
tree = html.fromstring(document.encode('utf-8'))
result = None
except Exception as e:
logger.error(str(e))
logger.warning("Failed to parse URL as feed for {}.".format(url))
result = {'link' : None,
'index' : None,
'name' : None,
'code' : None,
'error' : True,
'exist' : None}
if not result:
logger.debug("Feed auto-discovery engaged for {}".format(url))
result = FeedDiscovery.feed_mode_auto_discovery(url, tree)
if not result:
logger.debug("Feed link scan mode engaged for {}".format(url))
result = FeedDiscovery.feed_mode_scan(url, tree)
if not result:
logger.debug("Feed arbitrary mode engaged for {}".format(url))
result = FeedDiscovery.feed_mode_guess(url, tree)
if not result:
logger.debug("No feeds were found for {}".format(url))
result = None
if result and not isinstance(result, dict):
    result = await FeedDiscovery.process_feed_selection(url, result)
return result
# TODO Improve scan by gradual decreasing of path
def feed_mode_guess(url, tree):
"""
Look up feeds by pathname using HTTP requests.
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : list
List of candidate feed URLs.
"""
urls = []
parted_url = urlsplit(url)
paths = config.open_config_file("lists.toml")["pathnames"]
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
if '.rss' not in paths:
    paths.extend([".atom", ".feed", ".rdf", ".rss"])
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
parted_url_path = parted_url.path if parted_url.path else '/'
for path in paths:
address = Url.join_url(url, parted_url_path.split('/')[1] + path)
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_guess")
return urls
def feed_mode_scan(url, tree):
"""
Scan page for potential feeds by pathname.
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : list
List of candidate feed URLs.
"""
urls = []
paths = config.open_config_file("lists.toml")["pathnames"]
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = (
"(//a[contains(@href,'{}')])[position()<={}]"
).format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = (
"(//a[contains(@href,'{}')])[position()>last()-{}]"
).format(path, num)
addresses += tree.xpath(xpath_query)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
address = Url.join_url(url, address.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_scan")
return urls
def feed_mode_auto_discovery(url, tree):
"""
Look up feeds using the RSS autodiscovery technique.
See: https://www.rssboard.org/rss-autodiscovery
Parameters
----------
url : str
URL.
tree : lxml.html.HtmlElement
Parsed HTML document tree.
Returns
-------
urls : list or None
List of candidate feed URLs, or None when no alternate links are found.
"""
xpath_query = (
'//link[(@rel="alternate") and '
'(@type="application/atom+xml" or '
'@type="application/rdf+xml" or '
'@type="application/rss+xml")]'
)
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
if feeds:
urls = []
for feed in feeds:
# # The following code works;
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await fetch.http(feed)
# if res[0]:
# disco = parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
# feed_name = feed.xpath('@title')[0]
# feed_addr = Url.join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
address = Url.join_url(url, feed.xpath('@href')[0])
if address not in urls:
urls.extend([address])
# breakpoint()
# print("feed_mode_auto_discovery")
return urls
# TODO Segregate function into function that returns
# URLs (string) and Feeds (dict) and function that
# composes text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
feeds = {}
for i in urls:
result = await fetch.http(i)
if not result['error']:
document = result['content']
status_code = result['status_code']
if status_code == 200: # NOTE This line might be redundant
try:
feeds[i] = [parse(document)]
except:
continue
message = (
"Web feeds found for {}\n\n```\n"
).format(url)
urls = []
for feed_url in feeds:
# try:
# res = await fetch.http(feed)
# except:
# continue
feed_name = None
if "title" in feeds[feed_url][0]["feed"].keys():
feed_name = feeds[feed_url][0].feed.title
feed_name = feed_name if feed_name else "Untitled"
# feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
# AttributeError: 'str' object has no attribute 'entries'
if "entries" in feeds[feed_url][0].keys():
feed_amnt = feeds[feed_url][0].entries
else:
continue
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_url_mark is
# utilized in order to make fetch possible.
# NOTE feed_url_mark was a variable which stored
# single URL (probably first accepted as valid)
# in order to get an indication whether a single
# URL has been fetched, so that the receiving
# function will scan that single URL instead of
# listing it as a message.
url = {'link' : feed_url,
'index' : None,
'name' : feed_name,
'code' : status_code,
'error' : False,
'exist' : None}
urls.extend([url])
count = len(urls)
if count > 1:
result = urls
elif count:
result = urls[0]
else:
result = None
return result
# def get_discovered_feeds(url, urls):
# message = (
# "Found {} web feeds:\n\n```\n"
# ).format(len(urls))
# if len(urls) > 1:
# for urls in urls:
# message += (
# "Title : {}\n"
# "Link : {}\n"
# "\n"
# ).format(url, url.title)
# message += (
# "```\nThe above feeds were extracted from\n{}"
# ).format(url)
# elif len(urls) > 0:
# result = urls
# else:
# message = (
# "No feeds were found for {}"
# ).format(url)
# return result
# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection
# async def start(url):
# while True:
# result = await fetch.http(url)
# document = result[0]
# status = result[1]
# if document:
# feed = parse(document)
# if is_feed(feed):
# print(url)
# else:
# urls = await probe_page(
# url, document)
# if len(urls) > 1:
# await process_feed_selection(urls)
# elif urls:
# url = urls[0]
# else:
# response = (
# "> {}\nFailed to load URL. Reason: {}"
# ).format(url, status)
# break
# return response
# url = "https://www.smh.com.au/rssheadlines"
# start(url)
class FeedTask:
@ -921,7 +1356,7 @@ class Opml:
ETR.SubElement(head, "generator").text = "Slixfeed"
ETR.SubElement(head, "urlPublic").text = (
"https://slixfeed.woodpeckersnest.space/")
time_stamp = dt.current_time()
time_stamp = DateAndTime.current_time()
ETR.SubElement(head, "dateCreated").text = time_stamp
ETR.SubElement(head, "dateModified").text = time_stamp
body = ETR.SubElement(root, "body")

@ -1,352 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1) Do not handle base64
https://www.lilithsaintcrow.com/2024/02/love-anonymous/
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC
https://www.lilithsaintcrow.com/2024/02/love-anonymous//image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC
TODO
1) ActivityPub URL revealer activitypub_to_http.
2) SQLite preference "instance" for preferred instances.
"""
from email.utils import parseaddr
import os
import random
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from urllib.parse import (
parse_qs,
urlencode,
urljoin,
# urlparse,
urlsplit,
urlunsplit
)
logger = Logger(__name__)
# NOTE
# hostname and protocol are listed as one in file proxies.toml.
# Perhaps a better practice would be to have them separated.
# NOTE
# File proxies.toml will remain as it is, in order to be
# coordinated with the dataset of project LibRedirect, even
# though rule-sets might be adopted (see Privacy Redirect).
def get_hostname(url):
parted_url = urlsplit(url)
hostname = parted_url.netloc
if hostname.startswith('www.'): hostname = hostname.replace('www.', '')
return hostname
async def replace_hostname(url, url_type):
"""
Replace hostname.
Parameters
----------
url : str
URL.
url_type : str
"feed" or "link".
Returns
-------
url : str
URL.
"""
url_new = None
parted_url = urlsplit(url)
# protocol = parted_url.scheme
hostname = parted_url.netloc
hostname = hostname.replace('www.','')
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
proxies = config.open_config_file('proxies.toml')['proxies']
for proxy_name in proxies:
proxy = proxies[proxy_name]
if hostname in proxy['hostname'] and url_type in proxy['type']:
while not url_new:
print('>>>')
print(url_new)
proxy_type = 'clearnet'
proxy_list = proxy[proxy_type]
if len(proxy_list):
# proxy_list = proxies[proxy_name][proxy_type]
proxy_url = random.choice(proxy_list)
parted_proxy_url = urlsplit(proxy_url)
protocol_new = parted_proxy_url.scheme
hostname_new = parted_proxy_url.netloc
url_new = urlunsplit([protocol_new, hostname_new,
pathname, queries, fragment])
print(proxy_url)
print(url_new)
print('>>>')
response = await fetch.http(url_new)
if (response and
response['status_code'] == 200 and
# response.reason == 'OK' and
url_new.startswith(proxy_url)):
break
else:
config_dir = config.get_default_config_directory()
proxies_obsolete_file = config_dir + '/proxies_obsolete.toml'
proxies_file = config_dir + '/proxies.toml'
if not os.path.isfile(proxies_obsolete_file):
config.create_skeleton(proxies_file)
config.backup_obsolete(proxies_obsolete_file,
proxy_name, proxy_type,
proxy_url)
try:
config.update_proxies(proxies_file, proxy_name,
proxy_type, proxy_url)
except ValueError as e:
logger.error([str(e), proxy_url])
url_new = None
else:
logger.warning('No proxy URLs for {}. '
'Please update proxies.toml'
.format(proxy_name))
url_new = url
break
return url_new
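A self-contained sketch of the hostname swap performed above, with a hard-coded mirror standing in for an entry of proxies.toml; the mirror hostname is hypothetical:
from urllib.parse import urlsplit, urlunsplit

url = 'https://www.youtube.com/watch?v=abc123'
proxy_url = 'https://invidious.example.net'  # hypothetical clearnet mirror
parts = urlsplit(url)
mirror = urlsplit(proxy_url)
url_new = urlunsplit([mirror.scheme, mirror.netloc,
                      parts.path, parts.query, parts.fragment])
print(url_new)  # https://invidious.example.net/watch?v=abc123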
def remove_tracking_parameters(url):
"""
Remove queries with tracking parameters.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
if url.startswith('data:') and ';base64,' in url:
return url
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parse_qs(parted_url.query)
fragment = parted_url.fragment
trackers = config.open_config_file('queries.toml')['trackers']
for tracker in trackers:
if tracker in queries: del queries[tracker]
queries_new = urlencode(queries, doseq=True)
url = urlunsplit([protocol, hostname, pathname, queries_new, fragment])
return url
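A self-contained sketch of the same idea, with a hard-coded tracker list standing in for the "trackers" table of queries.toml:
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def strip_trackers(url, trackers=('utm_source', 'utm_medium', 'utm_campaign')):
    parts = urlsplit(url)
    queries = parse_qs(parts.query)
    for tracker in trackers:
        queries.pop(tracker, None)
    return urlunsplit([parts.scheme, parts.netloc, parts.path,
                       urlencode(queries, doseq=True), parts.fragment])

print(strip_trackers('https://example.org/post?id=7&utm_source=newsletter'))
# https://example.org/post?id=7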
def feed_to_http(url):
"""
Replace scheme FEED by HTTP.
Parameters
----------
url : str
URL.
Returns
-------
new_url : str
URL.
"""
par_url = urlsplit(url)
new_url = urlunsplit(['http', par_url.netloc, par_url.path, par_url.query,
par_url.fragment])
return new_url
def check_xmpp_uri(uri):
"""
Check validity of XMPP URI.
Parameters
----------
uri : str
URI.
Returns
-------
jid : str
JID or None.
"""
jid = urlsplit(uri).path
if parseaddr(jid)[1] != jid:
jid = False
return jid
# NOTE Read the documentation
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
def complete_url(source, link):
"""
Check whether the URL is a pathname and, if so, complete it into a full URL.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith('data:') and ';base64,' in link:
return link
if link.startswith('www.'):
return 'http://' + link
parted_link = urlsplit(link)
parted_feed = urlsplit(source)
if parted_link.scheme == 'magnet' and parted_link.query:
return link
if parted_link.scheme and parted_link.netloc:
return link
if link.startswith('//'):
if parted_link.netloc and parted_link.path:
new_link = urlunsplit([parted_feed.scheme, parted_link.netloc,
parted_link.path, parted_link.query,
parted_link.fragment])
elif link.startswith('/'):
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
parted_link.path, parted_link.query,
parted_link.fragment])
elif link.startswith('../'):
pathlink = parted_link.path.split('/')
pathfeed = parted_feed.path.split('/')
for i in pathlink:
if i == '..':
if pathlink.index('..') == 0:
pathfeed.pop()
else:
break
while pathlink.count('..'):
if pathlink.index('..') == 0:
pathlink.remove('..')
else:
break
pathlink = '/'.join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
'/'.join(pathfeed), parted_link.query,
parted_link.fragment])
else:
pathlink = parted_link.path.split('/')
pathfeed = parted_feed.path.split('/')
if link.startswith('./'):
pathlink.remove('.')
if not source.endswith('/'):
pathfeed.pop()
pathlink = '/'.join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
'/'.join(pathfeed), parted_link.query,
parted_link.fragment])
return new_link
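A brief usage sketch of complete_url; the URLs are illustrative only:
print(complete_url('https://example.org/blog/feed.xml', '/images/logo.png'))
# https://example.org/images/logo.png
print(complete_url('https://example.org/blog/feed.xml', './images/logo.png'))
# https://example.org/blog/images/logo.png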
# TODO
# Feed https://www.ocaml.org/feed.xml
# Link %20https://frama-c.com/fc-versions/cobalt.html%20
# FIXME
# Feed https://cyber.dabamos.de/blog/feed.rss
# Link https://cyber.dabamos.de/blog/#article-2022-07-15
def join_url(source, link):
"""
Join base URL with given pathname.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith('data:') and ';base64,' in link:
return link
if link.startswith('www.'):
new_link = 'http://' + link
elif link.startswith('%20') and link.endswith('%20'):
old_link = link.split('%20')
del old_link[0]
old_link.pop()
new_link = ''.join(old_link)
else:
new_link = urljoin(source, link)
return new_link
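A brief usage sketch of join_url; the URLs are illustrative only:
print(join_url('https://example.org/blog/', 'feed.xml'))
# https://example.org/blog/feed.xml
print(join_url('https://example.org/', 'www.example.net/rss'))
# http://www.example.net/rss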
def trim_url(url):
"""
Collapse duplicate slashes in the URL pathname.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
if url.startswith('data:') and ';base64,' in url:
return url
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
while '//' in pathname:
pathname = pathname.replace('//', '/')
url = urlunsplit([protocol, hostname, pathname, queries, fragment])
return url
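A brief usage sketch of trim_url; the URL is illustrative only:
print(trim_url('https://example.org//news//feed.xml'))
# https://example.org/news/feed.xml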
def activitypub_to_http(namespace):
"""
Replace ActivityPub namespace by HTTP.
Parameters
----------
namespace : str
Namespace.
Returns
-------
new_url : str
URL.
"""

@ -39,16 +39,27 @@ TODO
"""
from datetime import datetime
from email.utils import parseaddr
from dateutil.parser import parse
from email.utils import parsedate, parsedate_to_datetime
import hashlib
import os
import random
import slixfeed.config as config
from slixfeed.config import Config
from lxml import etree, html
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.url import join_url, complete_url
import sys
from urllib.parse import (
parse_qs,
urlencode,
urljoin,
# urlparse,
urlsplit,
urlunsplit
)
try:
import tomllib
@ -58,6 +69,115 @@ except:
logger = Logger(__name__)
class DateAndTime:
#https://feedparser.readthedocs.io/en/latest/date-parsing.html
def now():
"""
ISO 8601 Timestamp.
Returns
-------
date : str
ISO 8601 Timestamp.
"""
date = datetime.now().isoformat()
return date
def convert_struct_time_to_iso8601(struct_time):
date = datetime(*struct_time[:6])
date = date.isoformat()
return date
def current_date():
"""
Print MM DD, YYYY (Weekday Time) timestamp.
Returns
-------
date : str
MM DD, YYYY (Weekday Time) timestamp.
"""
now = datetime.now()
time = now.strftime("%B %d, %Y (%A %T)")
return time
def current_time():
"""
Print HH:MM:SS timestamp.
Returns
-------
date : str
HH:MM:SS timestamp.
"""
now = datetime.now()
time = now.strftime("%H:%M:%S")
return time
def timestamp():
"""
Print time stamp to be used in filename.
Returns
-------
formatted_time : str
%Y%m%d-%H%M%S timestamp.
"""
now = datetime.now()
formatted_time = now.strftime("%Y%m%d-%H%M%S")
return formatted_time
def validate(date):
"""
Validate date format.
Parameters
----------
date : str
Timestamp.
Returns
-------
date : str
Timestamp.
"""
try:
parse(date)
except:
date = DateAndTime.now()
return date
def rfc2822_to_iso8601(date):
"""
Convert RFC 2822 into ISO 8601.
Parameters
----------
date : str
RFC 2822 Timestamp.
Returns
-------
date : str
ISO 8601 Timestamp.
"""
if parsedate(date):
try:
date = parsedate_to_datetime(date)
date = date.isoformat()
except:
date = DateAndTime.now()
return date
class Documentation:
@ -120,7 +240,7 @@ class Html:
if len(images):
image = images[0]
image = str(image)
image_url = complete_url(url, image)
image_url = Url.complete_url(url, image)
return image_url
@ -224,6 +344,343 @@ class Task:
.format(task, jid_bare))
"""
FIXME
1) Do not handle base64
https://www.lilithsaintcrow.com/2024/02/love-anonymous/
data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC
https://www.lilithsaintcrow.com/2024/02/love-anonymous//image/png;base64,iVBORw0KGgoAAAANSUhEUgAABaAAAAeAAQAAAAAQ6M16AAAAAnRSTlMAAHaTzTgAAAFmSURBVBgZ7cEBAQAAAIKg/q92SMAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADgWE3LAAGyZmPPAAAAAElFTkSuQmCC
TODO
1) ActivityPub URL revealer activitypub_to_http.
2) SQLite preference "instance" for preferred instances.
"""
class Url:
# NOTE
# hostname and protocol are listed as one in file proxies.toml.
# Perhaps a better practice would be to have them separated.
# NOTE
# File proxies.toml will remain as it is, in order to be
# coordinated with the dataset of project LibRedirect, even
# though rule-sets might be adopted (see Privacy Redirect).
def get_hostname(url):
parted_url = urlsplit(url)
hostname = parted_url.netloc
if hostname.startswith('www.'): hostname = hostname.replace('www.', '')
return hostname
async def replace_hostname(url, url_type):
"""
Replace hostname.
Parameters
----------
url : str
URL.
url_type : str
"feed" or "link".
Returns
-------
url : str
URL.
"""
url_new = None
parted_url = urlsplit(url)
# protocol = parted_url.scheme
hostname = parted_url.netloc
hostname = hostname.replace('www.','')
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
proxies = config.open_config_file('proxies.toml')['proxies']
for proxy_name in proxies:
proxy = proxies[proxy_name]
if hostname in proxy['hostname'] and url_type in proxy['type']:
while not url_new:
print('>>>')
print(url_new)
proxy_type = 'clearnet'
proxy_list = proxy[proxy_type]
if len(proxy_list):
# proxy_list = proxies[proxy_name][proxy_type]
proxy_url = random.choice(proxy_list)
parted_proxy_url = urlsplit(proxy_url)
protocol_new = parted_proxy_url.scheme
hostname_new = parted_proxy_url.netloc
url_new = urlunsplit([protocol_new, hostname_new,
pathname, queries, fragment])
print(proxy_url)
print(url_new)
print('>>>')
response = await fetch.http(url_new)
if (response and
response['status_code'] == 200 and
# response.reason == 'OK' and
url_new.startswith(proxy_url)):
break
else:
config_dir = config.get_default_config_directory()
proxies_obsolete_file = config_dir + '/proxies_obsolete.toml'
proxies_file = config_dir + '/proxies.toml'
if not os.path.isfile(proxies_obsolete_file):
config.create_skeleton(proxies_file)
config.backup_obsolete(proxies_obsolete_file,
proxy_name, proxy_type,
proxy_url)
try:
config.update_proxies(proxies_file, proxy_name,
proxy_type, proxy_url)
except ValueError as e:
logger.error([str(e), proxy_url])
url_new = None
else:
logger.warning('No proxy URLs for {}. '
'Please update proxies.toml'
.format(proxy_name))
url_new = url
break
return url_new
def remove_tracking_parameters(url):
"""
Remove queries with tracking parameters.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
if url.startswith('data:') and ';base64,' in url:
return url
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parse_qs(parted_url.query)
fragment = parted_url.fragment
trackers = config.open_config_file('queries.toml')['trackers']
for tracker in trackers:
if tracker in queries: del queries[tracker]
queries_new = urlencode(queries, doseq=True)
url = urlunsplit([protocol, hostname, pathname, queries_new, fragment])
return url
def feed_to_http(url):
"""
Replace scheme FEED by HTTP.
Parameters
----------
url : str
URL.
Returns
-------
new_url : str
URL.
"""
par_url = urlsplit(url)
new_url = urlunsplit(['http', par_url.netloc, par_url.path, par_url.query,
par_url.fragment])
return new_url
def check_xmpp_uri(uri):
"""
Check validity of XMPP URI.
Parameters
----------
uri : str
URI.
Returns
-------
jid : str
JID or None.
"""
jid = urlsplit(uri).path
if parseaddr(jid)[1] != jid:
jid = False
return jid
# NOTE Read the documentation
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
def complete_url(source, link):
"""
Check whether the URL is a pathname and, if so, complete it into a full URL.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith('data:') and ';base64,' in link:
return link
if link.startswith('www.'):
return 'http://' + link
parted_link = urlsplit(link)
parted_feed = urlsplit(source)
if parted_link.scheme == 'magnet' and parted_link.query:
return link
if parted_link.scheme and parted_link.netloc:
return link
if link.startswith('//'):
if parted_link.netloc and parted_link.path:
new_link = urlunsplit([parted_feed.scheme, parted_link.netloc,
parted_link.path, parted_link.query,
parted_link.fragment])
elif link.startswith('/'):
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
parted_link.path, parted_link.query,
parted_link.fragment])
elif link.startswith('../'):
pathlink = parted_link.path.split('/')
pathfeed = parted_feed.path.split('/')
for i in pathlink:
if i == '..':
if pathlink.index('..') == 0:
pathfeed.pop()
else:
break
while pathlink.count('..'):
if pathlink.index('..') == 0:
pathlink.remove('..')
else:
break
pathlink = '/'.join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
'/'.join(pathfeed), parted_link.query,
parted_link.fragment])
else:
pathlink = parted_link.path.split('/')
pathfeed = parted_feed.path.split('/')
if link.startswith('./'):
pathlink.remove('.')
if not source.endswith('/'):
pathfeed.pop()
pathlink = '/'.join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([parted_feed.scheme, parted_feed.netloc,
'/'.join(pathfeed), parted_link.query,
parted_link.fragment])
return new_link
# TODO
# Feed https://www.ocaml.org/feed.xml
# Link %20https://frama-c.com/fc-versions/cobalt.html%20
# FIXME
# Feed https://cyber.dabamos.de/blog/feed.rss
# Link https://cyber.dabamos.de/blog/#article-2022-07-15
def join_url(source, link):
"""
Join base URL with given pathname.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith('data:') and ';base64,' in link:
return link
if link.startswith('www.'):
new_link = 'http://' + link
elif link.startswith('%20') and link.endswith('%20'):
old_link = link.split('%20')
del old_link[0]
old_link.pop()
new_link = ''.join(old_link)
else:
new_link = urljoin(source, link)
return new_link
def trim_url(url):
"""
Collapse duplicate slashes in the URL pathname.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
if url.startswith('data:') and ';base64,' in url:
return url
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
while '//' in pathname:
pathname = pathname.replace('//', '/')
url = urlunsplit([protocol, hostname, pathname, queries, fragment])
return url
def activitypub_to_http(namespace):
"""
Replace ActivityPub namespace by HTTP.
Parameters
----------
namespace : str
Namespace.
Returns
-------
new_url : str
URL.
"""
class Utilities:

@ -1,2 +1,2 @@
__version__ = '0.1.81'
__version_info__ = (0, 1, 81)
__version__ = '0.1.82'
__version_info__ = (0, 1, 82)

@ -29,16 +29,11 @@ import slixfeed.config as config
from slixfeed.config import Config
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.url import (
remove_tracking_parameters,
replace_hostname,
)
from slixfeed.syndication import FeedTask
from slixfeed.utilities import Documentation, Html, MD, Task
from slixfeed.utilities import Documentation, Html, MD, Task, Url
from slixfeed.xmpp.commands import XmppCommands
from slixfeed.xmpp.message import XmppMessage
from slixfeed.xmpp.presence import XmppPresence
from slixfeed.xmpp.privilege import is_operator, is_moderator
from slixfeed.xmpp.status import XmppStatusTask
from slixfeed.xmpp.upload import XmppUpload
from slixfeed.xmpp.utilities import XmppUtilities
@ -89,7 +84,7 @@ class XmppChat:
if (message['muc']['nick'] == self.alias):
return
jid_full = str(message['from'])
if not is_moderator(self, jid_bare, jid_full):
if not XmppUtilities.is_moderator(self, jid_bare, jid_full):
return
if message['type'] == 'groupchat':
@ -115,7 +110,7 @@ class XmppChat:
# return
# approved = False
jid_full = str(message['from'])
if not is_moderator(self, jid_bare, jid_full):
if not XmppUtilities.is_moderator(self, jid_bare, jid_full):
return
# if role == 'moderator':
# approved = True
@ -257,7 +252,7 @@ class XmppChat:
response = 'Current value for archive: '
response += XmppCommands.get_archive(self, jid_bare)
case _ if command_lowercase.startswith('bookmark +'):
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
muc_jid = command[11:]
response = await XmppCommands.bookmark_add(
self, muc_jid)
@ -265,7 +260,7 @@ class XmppChat:
response = ('This action is restricted. '
'Type: adding bookmarks.')
case _ if command_lowercase.startswith('bookmark -'):
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
muc_jid = command[11:]
response = await XmppCommands.bookmark_del(
self, muc_jid)
@ -273,7 +268,7 @@ class XmppChat:
response = ('This action is restricted. '
'Type: removing bookmarks.')
case 'bookmarks':
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
response = await XmppCommands.print_bookmarks(self)
else:
response = ('This action is restricted. '
@ -333,7 +328,7 @@ class XmppChat:
XmppPresence.send(self, jid_bare, status_message,
status_type=status_type)
filename, response = XmppCommands.export_feeds(
self, jid_bare, ext)
jid_bare, ext)
url = await XmppUpload.start(self, jid_bare, filename)
# response = (
# 'Feeds exported successfully to {}.\n{}'
@ -388,7 +383,7 @@ class XmppChat:
response = await XmppCommands.pubsub_list(self, jid)
response += '```'
case _ if command_lowercase.startswith('pubsub send'):
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
info = command[12:]
info = info.split(' ')
jid = info[0]
@ -461,7 +456,7 @@ class XmppChat:
await XmppChatAction.send_unread_items(self, jid_bare, num)
XmppStatusTask.restart_task(self, jid_bare)
case _ if command_lowercase.startswith('node delete'):
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
info = command[12:]
info = info.split(' ')
response = XmppCommands.node_delete(self, info)
@ -469,7 +464,7 @@ class XmppChat:
response = ('This action is restricted. '
'Type: sending news to PubSub.')
case _ if command_lowercase.startswith('node purge'):
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
info = command[11:]
info = info.split(' ')
response = XmppCommands.node_purge(self, info)
@ -770,8 +765,8 @@ class XmppChatAction:
else:
summary = '*** No summary ***'
link = result[2]
link = remove_tracking_parameters(link)
link = await replace_hostname(link, "link") or link
link = Url.remove_tracking_parameters(link)
link = await Url.replace_hostname(link, "link") or link
feed_id = result[4]
# news_item = ("\n{}\n{}\n{} [{}]\n").format(str(title), str(link),
# str(feed_title), str(ix))

@ -44,14 +44,11 @@ import slixmpp
import slixfeed.config as config
from slixfeed.config import Config
import slixfeed.crawl as crawl
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed, FeedTask, Opml
import slixfeed.url as uri
from slixfeed.utilities import Html, Task, Utilities
from slixfeed.syndication import Feed, FeedDiscovery, FeedTask, Opml
from slixfeed.utilities import DateAndTime, Html, Task, Url, Utilities
from slixfeed.version import __version__
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.chat import XmppChat, XmppChatTask
@ -62,7 +59,6 @@ from slixfeed.xmpp.message import XmppMessage
from slixfeed.xmpp.muc import XmppMuc
from slixfeed.xmpp.groupchat import XmppGroupchat
from slixfeed.xmpp.presence import XmppPresence
from slixfeed.xmpp.privilege import is_operator, is_access
import slixfeed.xmpp.profile as profile
from slixfeed.xmpp.publish import XmppPubsub, XmppPubsubAction, XmppPubsubTask
from slixfeed.xmpp.roster import XmppRoster
@ -791,7 +787,7 @@ class XmppClient(slixmpp.ClientXMPP):
# )
# NOTE https://codeberg.org/poezio/slixmpp/issues/3515
# if is_operator(self, jid_bare):
# if XmppUtilities.is_operator(self, jid_bare):
self['xep_0050'].add_command(node='subscription',
name='🪶️ Subscribe',
handler=self._handle_subscription_add)
@ -842,7 +838,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'PubSub')
form['instructions'] = 'Publish news items to PubSub nodes.'
options = form.add_field(desc='From which medium source do you '
@ -863,7 +859,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['prev'] = None
session['payload'] = form
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -883,7 +879,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
values = payload['values']
form = self['xep_0004'].make_form('form', 'Publish')
form['instructions'] = ('Choose a PubSub Jabber ID and verify '
@ -971,7 +967,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['has_next'] = True
session['prev'] = self._handle_publish
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -994,7 +990,7 @@ class XmppClient(slixmpp.ClientXMPP):
print(values['jid'])
jid = values['jid'] if 'jid' in values else None
jid_bare = session['from'].bare
if jid != jid_bare and not is_operator(self, jid_bare):
if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare):
text_warn = ('Posting to {} is restricted to operators only.'
.format(jid_bare)) # Should not this be self.boundjid.bare?
session['allow_prev'] = False
@ -1065,7 +1061,7 @@ class XmppClient(slixmpp.ClientXMPP):
ixs = values['entries']
#if jid: jid = jid[0] if isinstance(jid, list) else jid
jid_bare = session['from'].bare
if jid != jid_bare and not is_operator(self, jid_bare):
if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare):
# TODO Report incident
text_warn = 'You are not supposed to be here.'
session['allow_prev'] = False
@ -1100,7 +1096,7 @@ class XmppClient(slixmpp.ClientXMPP):
values = payload['values']
jid = values['jid'] if 'jid' in values else None
jid_bare = session['from'].bare
if jid != jid_bare and not is_operator(self, jid_bare):
if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare):
# TODO Report incident
text_warn = 'You are not supposed to be here.'
# text_warn = ('Posting to {} is restricted to operators only.'
@ -1119,7 +1115,7 @@ class XmppClient(slixmpp.ClientXMPP):
if jid == self.boundjid.bare:
node = 'urn:xmpp:microblog:0'
else:
node = uri.get_hostname(url)
node = Url.get_hostname(url)
form = self['xep_0004'].make_form('form', 'Publish')
while True:
result = await fetch.http(url)
@ -1137,7 +1133,7 @@ class XmppClient(slixmpp.ClientXMPP):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = uri.get_hostname(url)
title = Url.get_hostname(url)
entries = feed.entries
entry_ix = 0
for entry in entries:
@ -1146,10 +1142,10 @@ class XmppClient(slixmpp.ClientXMPP):
else:
if entry.has_key("published"):
title = entry.published
title = dt.rfc2822_to_iso8601(title)
title = DateAndTime.rfc2822_to_iso8601(title)
elif entry.has_key("updated"):
title = entry.updated
title = dt.rfc2822_to_iso8601(title)
title = DateAndTime.rfc2822_to_iso8601(title)
else:
title = "*** No title ***"
options.addOption(title, str(entry_ix))
@ -1164,7 +1160,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['payload'] = form
break
else:
result = await crawl.probe_page(url, document)
result = await FeedDiscovery.probe_page(url, document)
if isinstance(result, list):
results = result
form['instructions'] = ('Discovered {} subscriptions '
@ -1225,7 +1221,7 @@ class XmppClient(slixmpp.ClientXMPP):
jid = values['jid'][0] if 'jid' in values else None
#if jid: jid = jid[0] if isinstance(jid, list) else jid
jid_bare = session['from'].bare
if jid != jid_bare and not is_operator(self, jid_bare):
if jid != jid_bare and not XmppUtilities.is_operator(self, jid_bare):
# TODO Report incident
text_warn = 'You are not supposed to be here.'
session['allow_prev'] = False
@ -1262,10 +1258,10 @@ class XmppClient(slixmpp.ClientXMPP):
# else:
# if feed.entries[entry].has_key("published"):
# title = feed.entries[entry].published
# title = dt.rfc2822_to_iso8601(title)
# title = DateAndTime.rfc2822_to_iso8601(title)
# elif feed.entries[entry].has_key("updated"):
# title = feed.entries[entry].updated
# title = dt.rfc2822_to_iso8601(title)
# title = DateAndTime.rfc2822_to_iso8601(title)
# else:
# title = "*** No title ***"
# if feed.entries[entry].has_key("summary"):
@ -1393,7 +1389,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
jid = session['from'].bare
db_file = config.get_pathname_to_database(jid_bare)
form = self['xep_0004'].make_form('form', 'Filters')
@ -1432,7 +1428,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['next'] = self._handle_filters_complete
session['payload'] = form
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -1502,7 +1498,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'Subscribe')
# form['instructions'] = 'Add a new custom subscription.'
form.add_field(desc='Enter a URL.',
@ -1517,7 +1513,7 @@ class XmppClient(slixmpp.ClientXMPP):
required=True,
value='http://',
var='subscription')
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
# form['instructions'] = ('Special section for operators:\n'
# 'This section allows you to add '
# 'subscriptions for a JID of your '
@ -1544,7 +1540,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['prev'] = None
session['payload'] = form
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -1576,7 +1572,7 @@ class XmppClient(slixmpp.ClientXMPP):
# options.addOption('News by tag', 'tag')
options.addOption('Rejected', 'reject')
options.addOption('Unread', 'unread')
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
# form['instructions'] = ('Special section for operators:\n'
# 'This section allows you to view news items '
# 'of a JID of your choice.')
@ -1617,7 +1613,7 @@ class XmppClient(slixmpp.ClientXMPP):
jid_bare = session['from'].bare
values = payload['values']
form = self['xep_0004'].make_form('form', 'Updates')
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid']
form.add_field(var='jid',
ftype='hidden',
@ -1675,7 +1671,7 @@ class XmppClient(slixmpp.ClientXMPP):
ix = values['update']
jid_bare = session['from'].bare
form = self['xep_0004'].make_form('form', 'Article')
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid = values['jid']
jid_bare = jid[0] if isinstance(jid, list) else jid
form.add_field(var='jid',
@ -1688,9 +1684,9 @@ class XmppClient(slixmpp.ClientXMPP):
url = sqlite.get_entry_url(db_file, ix)
url = url[0] # TODO Handle the situation where the index no longer exists
logger.debug('Original URL: {}'.format(url))
url = uri.remove_tracking_parameters(url)
url = Url.remove_tracking_parameters(url)
logger.debug('Processed URL (tracker removal): {}'.format(url))
url = (await uri.replace_hostname(url, 'link')) or url
url = (await Url.replace_hostname(url, 'link')) or url
logger.debug('Processed URL (replace hostname): {}'.format(url))
# result = await fetch.http(url)
# if 'content' in result:
@ -1750,7 +1746,7 @@ class XmppClient(slixmpp.ClientXMPP):
identifier = values['identifier'] if 'identifier' in values else None
url = values['subscription']
jid_bare = session['from'].bare
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
custom_jid = values['jid']
jid_bare = custom_jid[0] if isinstance(custom_jid, list) else jid_bare
# jid_bare = custom_jid[0] if custom_jid else jid_bare
@ -1780,7 +1776,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['prev'] = None
# elif not identifier:
# counter = 0
# hostname = uri.get_hostname(url)
# hostname = Url.get_hostname(url)
# identifier = hostname + ':' + str(counter)
# while True:
# if sqlite.check_identifier_exist(db_file, identifier):
@ -1797,7 +1793,7 @@ class XmppClient(slixmpp.ClientXMPP):
exist_count = 0
for url in urls:
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
identifier = hostname + ':' + str(counter)
while True:
if sqlite.check_identifier_exist(db_file, identifier):
@ -1830,7 +1826,7 @@ class XmppClient(slixmpp.ClientXMPP):
if isinstance(url, list):
url = url[0]
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
identifier = hostname + ':' + str(counter)
while True:
if sqlite.check_identifier_exist(db_file, identifier):
@ -1956,7 +1952,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
values = payload['values']
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid'][0]
del values['jid']
db_file = config.get_pathname_to_database(jid_bare)
@ -1981,7 +1977,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
values = payload['values']
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid'][0]
del values['jid']
db_file = config.get_pathname_to_database(jid_bare)
@ -2022,7 +2018,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'Discover & Search')
form['instructions'] = 'Discover news subscriptions of all kinds'
options = form.add_field(desc='Select type of search.',
@ -2039,7 +2035,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['payload'] = form
session['prev'] = None
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -2146,7 +2142,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'Subscriptions')
form['instructions'] = ('Browse, view, toggle or remove '
'tags and subscriptions.')
@ -2160,7 +2156,7 @@ class XmppClient(slixmpp.ClientXMPP):
options.addOption('Browse tags', 'tag')
options.addOption('Remove subscriptions', 'delete')
options.addOption('Toggle subscriptions', 'toggle')
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
form['instructions'] = None
# form['instructions'] = ('Special section for operators:\n'
# 'This section allows you to change '
@ -2190,7 +2186,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['next'] = self._handle_subscriptions_result
session['has_next'] = True
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -2212,7 +2208,7 @@ class XmppClient(slixmpp.ClientXMPP):
values = payload['values']
jid_bare = session['from'].bare
form = self['xep_0004'].make_form('form', 'Subscriptions')
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid']
form.add_field(ftype='hidden',
value=jid_bare,
@ -2306,7 +2302,7 @@ class XmppClient(slixmpp.ClientXMPP):
form = self['xep_0004'].make_form('form', 'Subscriptions')
jid_bare = session['from'].bare
values = payload['values']
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid'][0]
form.add_field(ftype='hidden',
value=jid_bare,
@ -2344,7 +2340,7 @@ class XmppClient(slixmpp.ClientXMPP):
form = self['xep_0004'].make_form('form', 'Subscription')
jid_bare = session['from'].bare
values = payload['values']
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid'][0] if values['jid'] else jid_bare
form.add_field(ftype='hidden',
value=jid_bare,
@ -2440,7 +2436,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
values = payload['values']
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid_bare = values['jid'][0]
db_file = config.get_pathname_to_database(jid_bare)
# url = values['url']
@ -2506,14 +2502,14 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'Advanced')
form['instructions'] = 'Extended options'
options = form.add_field(ftype='list-single',
label='Choose',
required=True,
var='option')
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
options.addOption('Administration', 'admin')
# options.addOption('Activity', 'activity')
# options.addOption('Filters', 'filter')
@ -2527,7 +2523,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['next'] = self._handle_advanced_result
session['prev'] = self._handle_advanced
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -2556,7 +2552,7 @@ class XmppClient(slixmpp.ClientXMPP):
case 'admin':
# NOTE Even though this check is already conducted on previous
# form, this check is being done just in case.
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
if self.is_component:
# NOTE This will be changed with XEP-0222 XEP-0223
text_info = ('Subscriber management options are '
@ -2589,7 +2585,7 @@ class XmppClient(slixmpp.ClientXMPP):
else:
logger.warning('An unauthorized attempt to access '
'bookmarks has been detected for JID {} at '
'{}'.format(jid_bare, dt.timestamp()))
'{}'.format(jid_bare, DateAndTime.timestamp()))
text_warn = 'This resource is restricted.'
session['notes'] = [['warn', text_warn]]
session['has_next'] = False
@ -2617,7 +2613,7 @@ class XmppClient(slixmpp.ClientXMPP):
required=True,
var='url')
url['validate']['datatype'] = 'xs:anyURI'
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
form.add_field(ftype='fixed',
label='* Operators',
desc='This section allows you to import '
@ -2651,7 +2647,7 @@ class XmppClient(slixmpp.ClientXMPP):
options.addOption('OPML', 'opml')
# options.addOption('HTML', 'html')
# options.addOption('XBEL', 'xbel')
if is_operator(self, jid_bare):
if XmppUtilities.is_operator(self, jid_bare):
# form['instructions'] = ('Special section for operators:\n'
# 'This section allows you to '
# 'import and export subscriptions '
@ -2841,7 +2837,7 @@ class XmppClient(slixmpp.ClientXMPP):
url = values['url']
if url.startswith('http') and url.endswith('.opml'):
jid_bare = session['from'].bare
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid = values['jid']
jid_bare = jid[0] if isinstance(jid, list) else jid
db_file = config.get_pathname_to_database(jid_bare)
@ -2882,7 +2878,7 @@ class XmppClient(slixmpp.ClientXMPP):
# form['type'] = 'result'
values = payload['values']
jid_bare = session['from'].bare
if is_operator(self, jid_bare) and 'jid' in values:
if XmppUtilities.is_operator(self, jid_bare) and 'jid' in values:
jid = values['jid']
jid_bare = jid[0] if isinstance(jid, list) else jid
# form = self['xep_0004'].make_form('result', 'Done')
@ -2915,7 +2911,7 @@ class XmppClient(slixmpp.ClientXMPP):
jid_bare = session['from'].bare
jid_full = str(session['from'])
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
form = self['xep_0004'].make_form('form', 'Subscribe')
# NOTE Refresh button would be of use
form['instructions'] = 'Featured subscriptions'
@ -2938,7 +2934,7 @@ class XmppClient(slixmpp.ClientXMPP):
if '@' in jid_bare:
hostname = jid_bare.split('@')[1]
url = 'http://' + hostname
result = await crawl.probe_page(url)
result = await FeedDiscovery.probe_page(url)
if not result:
url = {'url' : url,
'index' : None,
@ -2966,7 +2962,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['payload'] = form
session['prev'] = self._handle_promoted
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
@ -3620,7 +3616,7 @@ class XmppClient(slixmpp.ClientXMPP):
.format(function_name, jid_full))
jid_bare = session['from'].bare
chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
if is_access(self, jid_bare, jid_full, chat_type):
if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
db_file = config.get_pathname_to_database(jid_bare)
if jid_bare not in self.settings:
Config.add_settings_jid(self.settings, jid_bare, db_file)
@ -3718,7 +3714,7 @@ class XmppClient(slixmpp.ClientXMPP):
session['next'] = self._handle_settings_complete
session['payload'] = form
else:
if not is_operator(self, jid_bare):
if not XmppUtilities.is_operator(self, jid_bare):
text_warn = 'This resource is restricted to operators.'
elif chat_type == 'groupchat':
text_warn = ('This resource is restricted to moderators of {}.'
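The client.py hunks above consistently swap the module-level helpers is_operator/is_access from slixfeed.xmpp.privilege for static-style methods on XmppUtilities, and move the URL, date and crawling helpers onto Url, DateAndTime and FeedDiscovery. A minimal sketch of the call pattern these hunks converge on follows; the XmppUtilities import path and the handler name are assumptions, while the individual calls mirror the hunks:

    from slixfeed.utilities import DateAndTime, Url
    from slixfeed.xmpp.utilities import XmppUtilities  # assumed import path, not shown in these hunks

    async def _handle_example(self, session):
        # Illustrative ad-hoc command handler fragment, written the way the
        # XmppClient methods above are written (self is the ClientXMPP instance).
        jid_full = str(session['from'])
        jid_bare = session['from'].bare
        chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
        if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
            url = 'https://www.jsonfeed.org/feed.json'
            node = Url.get_hostname(url)          # was uri.get_hostname(url)
            logged_at = DateAndTime.timestamp()   # was dt.timestamp()
        elif not XmppUtilities.is_operator(self, jid_bare):
            session['notes'] = [['warn', 'This resource is restricted to operators.']]
        return session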

View file

@ -5,14 +5,11 @@ from feedparser import parse
from random import randrange
import slixfeed.config as config
from slixfeed.config import Config
import slixfeed.crawl as crawl
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed, Opml
import slixfeed.url as uri
from slixfeed.utilities import Documentation, Utilities
from slixfeed.syndication import Feed, FeedDiscovery, Opml
from slixfeed.utilities import DateAndTime, Documentation, Url, Utilities
from slixfeed.version import __version__
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.muc import XmppMuc
@ -121,9 +118,9 @@ class XmppCommands:
"""
if url.startswith('http'):
if not title:
title = uri.get_hostname(url)
title = Url.get_hostname(url)
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
@ -148,7 +145,7 @@ class XmppCommands:
if feed.has_key('updated_parsed'):
feed_updated = feed.updated_parsed
try:
feed_updated = dt.convert_struct_time_to_iso8601(
feed_updated = DateAndTime.convert_struct_time_to_iso8601(
feed_updated)
except:
feed_updated = None
@ -393,7 +390,7 @@ class XmppCommands:
identifier = info[2]
else:
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
@ -417,8 +414,8 @@ class XmppCommands:
if (url.startswith('feed:/') or
url.startswith('itpc:/') or
url.startswith('rss:/')):
url = uri.feed_to_http(url)
url = (await uri.replace_hostname(url, 'feed')) or url
url = Url.feed_to_http(url)
url = (await Url.replace_hostname(url, 'feed')) or url
result = await Feed.add_feed(self, jid_bare, db_file, url,
identifier)
if isinstance(result, list):
@ -479,10 +476,10 @@ class XmppCommands:
# both interfaces Chat and IPC
async def fetch_http(self, url, db_file, jid_bare):
if url.startswith('feed:/') or url.startswith('rss:/'):
url = uri.feed_to_http(url)
url = (await uri.replace_hostname(url, 'feed')) or url
url = Url.feed_to_http(url)
url = (await Url.replace_hostname(url, 'feed')) or url
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
@ -581,7 +578,7 @@ class XmppCommands:
async def muc_join(self, command):
if command:
muc_jid = uri.check_xmpp_uri(command)
muc_jid = Url.check_xmpp_uri(command)
if muc_jid:
# TODO probe JID and confirm it's a groupchat
result = await XmppMuc.join(self, muc_jid)
@ -735,8 +732,8 @@ class XmppCommands:
async def feed_read(self, jid_bare, data, url):
if url.startswith('feed:/') or url.startswith('rss:/'):
url = uri.feed_to_http(url)
url = (await uri.replace_hostname(url, 'feed')) or url
url = Url.feed_to_http(url)
url = (await Url.replace_hostname(url, 'feed')) or url
match len(data):
case 1:
if url.startswith('http'):
@ -750,7 +747,7 @@ class XmppCommands:
message = Feed.view_feed(url, feed)
break
else:
result = await crawl.probe_page(url, document)
result = await FeedDiscovery.probe_page(url, document)
if isinstance(result, list):
results = result
message = ("Syndication feeds found for {}\n\n```\n"
@ -786,7 +783,7 @@ class XmppCommands:
message = Feed.view_entry(url, feed, num)
break
else:
result = await crawl.probe_page(url, document)
result = await FeedDiscovery.probe_page(url, document)
if isinstance(result, list):
results = result
message = ("Syndication feeds found for {}\n\n```\n"
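The hostname-based identifier pattern in these commands.py hunks also recurs in the client.py and pubsub hunks; the while-loop bodies are cut off at the hunk boundaries, so the following is only a sketch of the apparent intent, with a hypothetical helper name and with the increment-and-retry step assumed rather than taken from the diff:

    import slixfeed.sqlite as sqlite
    from slixfeed.utilities import Url

    def make_identifier(db_file, url):
        # Build an identifier such as 'example-com:0' and bump the counter until
        # it is not already present in the database. Only the duplicate check
        # (sqlite.check_identifier_exist) is visible in the diff; the retry step
        # here is an assumption.
        counter = 0
        hostname = Url.get_hostname(url).replace('.', '-')
        identifier = hostname + ':' + str(counter)
        while sqlite.check_identifier_exist(db_file, identifier):
            counter += 1
            identifier = hostname + ':' + str(counter)
        return identifier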

View file

@ -14,7 +14,7 @@ TODO
"""
import asyncio
from slixfeed.dt import current_time
from slixfeed.utilities import DateAndTime
from slixfeed.log import Logger
from slixmpp.exceptions import IqTimeout, IqError
from time import sleep
@ -62,17 +62,17 @@ class XmppConnect:
def recover(self, message):
logger.warning(message)
print(current_time(), message, 'Attempting to reconnect.')
print(DateAndTime.current_time(), message, 'Attempting to reconnect.')
self.connection_attempts += 1
# if self.connection_attempts <= self.max_connection_attempts:
# self.reconnect(wait=5.0) # wait a bit before attempting to reconnect
# else:
# print(current_time(),"Maximum connection attempts exceeded.")
# logging.error("Maximum connection attempts exceeded.")
print(current_time(), 'Attempt number', self.connection_attempts)
print(DateAndTime.current_time(), 'Attempt number', self.connection_attempts)
seconds = self.reconnect_timeout or 30
seconds = int(seconds)
print(current_time(), 'Next attempt within', seconds, 'seconds')
print(DateAndTime.current_time(), 'Next attempt within', seconds, 'seconds')
# NOTE asyncio.sleep doesn't interval as expected
# await asyncio.sleep(seconds)
sleep(seconds)

View file

@ -1,49 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
def is_access(self, jid_bare, jid_full, chat_type):
"""Determine access privilege"""
operator = is_operator(self, jid_bare)
if operator:
if chat_type == 'groupchat':
if is_moderator(self, jid_bare, jid_full):
access = True
else:
access = True
else:
access = False
return access
def is_operator(self, jid_bare):
"""Check if given JID is an operator"""
result = False
for operator in self.operators:
if jid_bare == operator['jid']:
result = True
# operator_name = operator['name']
break
return result
def is_moderator(self, jid_bare, jid_full):
"""Check if given JID is a moderator"""
alias = jid_full[jid_full.index('/')+1:]
role = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'role')
if role == 'moderator':
result = True
else:
result = False
return result
def is_member(self, jid_bare, jid_full):
"""Check if given JID is a member"""
alias = jid_full[jid_full.index('/')+1:]
affiliation = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'affiliation')
if affiliation == 'member':
result = True
else:
result = False
return result

View file

@ -16,8 +16,7 @@ from slixfeed.config import Config
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed
import slixfeed.url as uri
from slixfeed.utilities import Utilities
from slixfeed.utilities import Url, Utilities
from slixfeed.xmpp.iq import XmppIQ
import sys
@ -337,7 +336,7 @@ class XmppPubsubAction:
node_id = node_id[0]
if not node_id:
counter = 0
hostname = uri.get_hostname(url)
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:

View file

@ -16,7 +16,7 @@ logger = Logger(__name__)
class XmppUpload:
async def start(self, jid, filename, domain=None):
logger.info('Uploading file %s...', filename)
logger.info(['Uploading file %s...', filename])
try:
upload_file = self['xep_0363'].upload_file
# if self.encrypted and not self['xep_0454']:
@ -34,7 +34,7 @@ class XmppUpload:
filename, domain, timeout=10,
)
logger.info('Upload successful!')
logger.info('Sending file to %s', jid)
logger.info(['Sending file to %s', jid])
except HTTPError:
url = ('Error: It appears that this server does not support '
'HTTP File Upload.')

View file

@ -58,3 +58,51 @@ class XmppUtilities:
# finally:
# logger.info('Chat type is:', chat_type)
return result
def is_access(self, jid_bare, jid_full, chat_type):
"""Determine access privilege"""
operator = XmppUtilities.is_operator(self, jid_bare)
if operator:
if chat_type == 'groupchat':
if XmppUtilities.is_moderator(self, jid_bare, jid_full):
access = True
else:
access = True
else:
access = False
return access
def is_operator(self, jid_bare):
"""Check if given JID is an operator"""
result = False
for operator in self.operators:
if jid_bare == operator['jid']:
result = True
# operator_name = operator['name']
break
return result
def is_moderator(self, jid_bare, jid_full):
"""Check if given JID is a moderator"""
alias = jid_full[jid_full.index('/')+1:]
role = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'role')
if role == 'moderator':
result = True
else:
result = False
return result
def is_member(self, jid_bare, jid_full):
"""Check if given JID is a member"""
alias = jid_full[jid_full.index('/')+1:]
affiliation = self.plugin['xep_0045'].get_jid_property(jid_bare, alias, 'affiliation')
if affiliation == 'member':
result = True
else:
result = False
return result
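These four checks are the ones dropped from slixfeed/xmpp/privilege.py earlier in this commit; they keep their signatures and still receive the ClientXMPP instance as an explicit first argument rather than becoming bound methods. Call sites throughout the updated handlers therefore read like this fragment, which mirrors the client.py hunks (the surrounding variables are assumed to be in scope):

    chat_type = await XmppUtilities.get_chat_type(self, jid_bare)
    if XmppUtilities.is_access(self, jid_bare, jid_full, chat_type):
        # Proceed with the command.
        form = self['xep_0004'].make_form('form', 'Subscribe')
    elif not XmppUtilities.is_operator(self, jid_bare):
        text_warn = 'This resource is restricted to operators.'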