Slixfeed/slixfeed/url.py
2024-01-04 01:16:24 +00:00

322 lines
No EOL
7.1 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) ActivityPub URL revealer activitypub_to_http.
2) Remove tracking queries.
3) Redirect to Invidious, Librarian, Nitter, ProxiTok etc.
because they provide RSS.
"""
import slixfeed.config as config
from email.utils import parseaddr
import random
from urllib.parse import (
parse_qs,
urlencode,
urljoin,
# urlparse,
urlsplit,
urlunsplit
)
# NOTE hostname and protocol are listed as one in file
# proxies.yaml. Perhaps a better practice would be to have
# them separated. File proxies.yaml will remainas is in order
# to be coordinated with the dataset of project LibRedirect.
def replace_hostname(url, url_type):
"""
Replace hostname.
Parameters
----------
url : str
URL.
url_type : str
"feed" or "link".
Returns
-------
url : str
URL.
"""
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
hostname = hostname.replace("www.","")
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
proxies = config.get_list("proxies.yaml")
for proxy in proxies:
proxy = proxies[proxy]
if hostname in proxy["hostname"] and url_type in proxy["type"]:
select_proxy = random.choice(proxy["clearnet"])
parted_proxy = urlsplit(select_proxy)
protocol_new = parted_proxy.scheme
hostname_new = parted_proxy.netloc
url = urlunsplit([
protocol_new,
hostname_new,
pathname,
queries,
fragment
])
return url
def remove_tracking_parameters(url):
"""
Remove queries with tracking parameters.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parse_qs(parted_url.query)
fragment = parted_url.fragment
trackers = config.get_list("queries.yaml")
trackers = trackers["trackers"]
for tracker in trackers:
if tracker in queries: del queries[tracker]
queries_new = urlencode(queries, doseq=True)
url = urlunsplit([
protocol,
hostname,
pathname,
queries_new,
fragment
])
return url
def feed_to_http(url):
"""
Replace scheme FEED by HTTP.
Parameters
----------
url : str
URL.
Returns
-------
new_url : str
URL.
"""
par_url = urlsplit(url)
new_url = urlunsplit([
"http",
par_url.netloc,
par_url.path,
par_url.query,
par_url.fragment
])
return new_url
def activitypub_to_http(namespace):
"""
Replace ActivityPub namespace by HTTP.
Parameters
----------
namespace : str
Namespace.
Returns
-------
new_url : str
URL.
"""
def check_xmpp_uri(uri):
"""
Check validity of XMPP URI.
Parameters
----------
uri : str
URI.
Returns
-------
jid : str
JID or None.
"""
jid = urlsplit(uri).path
if parseaddr(jid)[1] != jid:
jid = False
return jid
# NOTE Read the documentation
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
def complete_url(source, link):
"""
Check if URL is pathname and complete it into URL.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith("www."):
return "http://" + link
parted_link = urlsplit(link)
parted_feed = urlsplit(source)
if parted_link.scheme == "magnet" and parted_link.query:
return link
if parted_link.scheme and parted_link.netloc:
return link
if link.startswith("//"):
if parted_link.netloc and parted_link.path:
new_link = urlunsplit([
parted_feed.scheme,
parted_link.netloc,
parted_link.path,
parted_link.query,
parted_link.fragment
])
elif link.startswith("/"):
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
parted_link.path,
parted_link.query,
parted_link.fragment
])
elif link.startswith("../"):
pathlink = parted_link.path.split("/")
pathfeed = parted_feed.path.split("/")
for i in pathlink:
if i == "..":
if pathlink.index("..") == 0:
pathfeed.pop()
else:
break
while pathlink.count(".."):
if pathlink.index("..") == 0:
pathlink.remove("..")
else:
break
pathlink = "/".join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
"/".join(pathfeed),
parted_link.query,
parted_link.fragment
])
else:
pathlink = parted_link.path.split("/")
pathfeed = parted_feed.path.split("/")
if link.startswith("./"):
pathlink.remove(".")
if not source.endswith("/"):
pathfeed.pop()
pathlink = "/".join(pathlink)
pathfeed.extend([pathlink])
new_link = urlunsplit([
parted_feed.scheme,
parted_feed.netloc,
"/".join(pathfeed),
parted_link.query,
parted_link.fragment
])
return new_link
"""
TODO
Feed https://www.ocaml.org/feed.xml
Link %20https://frama-c.com/fc-versions/cobalt.html%20
FIXME
Feed https://cyber.dabamos.de/blog/feed.rss
Link https://cyber.dabamos.de/blog/#article-2022-07-15
"""
def join_url(source, link):
"""
Join base URL with given pathname.
Parameters
----------
source : str
Feed URL.
link : str
Link URL or pathname.
Returns
-------
str
URL.
"""
if link.startswith("www."):
new_link = "http://" + link
elif link.startswith("%20") and link.endswith("%20"):
old_link = link.split("%20")
del old_link[0]
old_link.pop()
new_link = "".join(old_link)
else:
new_link = urljoin(source, link)
return new_link
def trim_url(url):
"""
Check URL pathname for double slash.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
while "//" in pathname:
pathname = pathname.replace("//", "/")
url = urlunsplit([
protocol,
hostname,
pathname,
queries,
fragment
])
return url