#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ TODO 1) ActivityPub URL revealer activitypub_to_http. 2) Remove tracking queries. 3) Redirect to Invidious, Librarian, Nitter, ProxiTok etc. because they provide RSS. """ from email.utils import parseaddr from urllib.parse import urljoin, urlsplit, urlunsplit def feed_to_http(url): """ Replace scheme FEED by HTTP. Parameters ---------- url : str URL. Returns ------- new_url : str URL. """ par_url = urlsplit(url) new_url = urlunsplit([ "http", par_url.netloc, par_url.path, par_url.query, par_url.fragment ]) return new_url def activitypub_to_http(namespace): """ Replace ActivityPub namespace by HTTP. Parameters ---------- namespace : str Namespace. Returns ------- new_url : str URL. """ def check_xmpp_uri(uri): """ Check validity of XMPP URI. Parameters ---------- uri : str URI. Returns ------- jid : str JID or None. """ jid = urlsplit(uri).path if parseaddr(jid)[1] != jid: jid = False return jid # NOTE Read the documentation # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin def complete_url(source, link): """ Check if URL is pathname and complete it into URL. Parameters ---------- source : str Feed URL. link : str Link URL or pathname. Returns ------- str URL. """ if link.startswith("www."): return "http://" + link parted_link = urlsplit(link) parted_feed = urlsplit(source) if parted_link.scheme == "magnet" and parted_link.query: return link if parted_link.scheme and parted_link.netloc: return link if link.startswith("//"): if parted_link.netloc and parted_link.path: new_link = urlunsplit([ parted_feed.scheme, parted_link.netloc, parted_link.path, parted_link.query, parted_link.fragment ]) elif link.startswith("/"): new_link = urlunsplit([ parted_feed.scheme, parted_feed.netloc, parted_link.path, parted_link.query, parted_link.fragment ]) elif link.startswith("../"): pathlink = parted_link.path.split("/") pathfeed = parted_feed.path.split("/") for i in pathlink: if i == "..": if pathlink.index("..") == 0: pathfeed.pop() else: break while pathlink.count(".."): if pathlink.index("..") == 0: pathlink.remove("..") else: break pathlink = "/".join(pathlink) pathfeed.extend([pathlink]) new_link = urlunsplit([ parted_feed.scheme, parted_feed.netloc, "/".join(pathfeed), parted_link.query, parted_link.fragment ]) else: pathlink = parted_link.path.split("/") pathfeed = parted_feed.path.split("/") if link.startswith("./"): pathlink.remove(".") if not source.endswith("/"): pathfeed.pop() pathlink = "/".join(pathlink) pathfeed.extend([pathlink]) new_link = urlunsplit([ parted_feed.scheme, parted_feed.netloc, "/".join(pathfeed), parted_link.query, parted_link.fragment ]) return new_link """ TODO Feed https://www.ocaml.org/feed.xml Link %20https://frama-c.com/fc-versions/cobalt.html%20 FIXME Feed https://cyber.dabamos.de/blog/feed.rss Link https://cyber.dabamos.de/blog/#article-2022-07-15 """ def join_url(source, link): """ Join base URL with given pathname. Parameters ---------- source : str Feed URL. link : str Link URL or pathname. Returns ------- str URL. """ if link.startswith("www."): new_link = "http://" + link elif link.startswith("%20") and link.endswith("%20"): old_link = link.split("%20") del old_link[0] old_link.pop() new_link = "".join(old_link) else: new_link = urljoin(source, link) return new_link def trim_url(url): """ Check URL pathname for double slash. Parameters ---------- url : str URL. Returns ------- url : str URL. """ parted_url = urlsplit(url) protocol = parted_url.scheme hostname = parted_url.netloc pathname = parted_url.path queries = parted_url.query fragment = parted_url.fragment while "//" in pathname: pathname = pathname.replace("//", "/") url = urlunsplit([ protocol, hostname, pathname, queries, fragment ]) return url