Add ClearURLs functionality.

Fix Proxy functionality (remove www).
Schimon Jehudah 2023-12-27 22:48:31 +00:00
parent 4d03d6e16e
commit 8fbe97e357
5 changed files with 382 additions and 51 deletions

assets/queries.yaml Normal file
View file

@@ -0,0 +1,235 @@
# The purpose of this list is to remove queries that
# are mostly utilized as trackers for advertising.
trackers:
- ad
- ad_medium
- ad_name
- ad_pvid
- ad_sub
# ad_tags
- advertising-id
# aem_p4p_detail
- af
- aff
- aff_fcid
- aff_fsk
- aff_platform
- aff_trace_key
- affparams
- afSmartRedirect
- afftrack
# aid
- algo_exp_id
- algo_pvid
- ar
# ascsubtag
# asc_contentid
- asgtbndr
- atc
- ats
- autostart
# b64e# breaks yandex
- bizType
# block
- bta
- businessType
- campaign
- campaignId
# __cf_chl_rt_tk
# cid# breaks sacred magick
- ck
# clickid
# client_id
# cm_ven
# cmd
- content-id
- crid
- cst
- cts
- curPageLogUid
# data# breaks yandex
# dchild
# dclid
- deals-widget
- dgcid
- dicbo
# dt
- edd
- edm_click_module
# ei
# embed
# _encoding
# etext# breaks yandex
- eventSource
- fbclid
- feature
- field-lbr_brands_browse-bin
- forced_click
# fr
- frs
# from# breaks yandex
- _ga
- ga_order
- ga_search_query
- ga_search_type
- ga_view_type
- gatewayAdapt
# gclid
# gclsrc
- gh_jid
- gps-id
# gs_lcp
- gt
- guccounter
- hdtime
- hosted_button_id
- ICID
- ico
- ig_rid
# idzone
# iflsig
- intcmp
- irclickid
# irgwc
# irpid
- is_from_webapp
- itid
# itok
# katds_labels
# keywords
- keyno
- l10n
- linkCode
- mc
- mid
- __mk_de_DE
- mp
- nats
- nci
- obOrigUrl
- offer_id
- optout
- oq
- organic_search_click
- pa
- Partner
- partner
- partner_id
- partner_ID
- pcampaignid
- pd_rd_i
- pd_rd_r
- pd_rd_w
- pd_rd_wg
- pdp_npi
- pf_rd_i
- pf_rd_m
- pf_rd_p
- pf_rd_r
- pf_rd_s
- pf_rd_t
- pg
- PHPSESSID
- pk_campaign
- pdp_ext_f
- pkey
- platform
- plkey
- pqr
- pr
- pro
- prod
- prom
- promo
- promocode
- promoid
- psc
- psprogram
- pvid
- qid
# r
- realDomain
- recruiter_id
- redirect
- ref
- ref_
- ref_src
- refcode
- referrer
- refinements
- reftag
- rf
- rnid
- rowan_id1
- rowan_msg_id
# rss
# sCh
- sclient
- scm
- scm_id
- scm-url
# sd
- sender_device
- sh
- shareId
- showVariations
- si
# sid# breaks whatsup.org.il
- ___SID
# site_id
- sk
- smid
- social_params
- source
- sourceId
- sp_csd
- spLa
- spm
- spreadType
# sprefix
- sr
- src
- _src
- src_cmp
- src_player
- src_src
- srcSns
- su
# sxin_0_pb
- _t
# tag
- tcampaign
- td
- terminal_id
# text
- th # Sometimes restored after page load
# title
- tracelog
- traffic_id
- traffic_source
- traffic_type
- tt
- uact
- ug_edm_item_id
- utm
# utm1
# utm2
# utm3
# utm4
# utm5
# utm6
# utm7
# utm8
# utm9
- utm_campaign
- utm_content
- utm_medium
- utm_source
- utm_term
- uuid
# utype
# ve
# ved
# zone
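
For illustration only, a minimal sketch of how a list like the one above can be applied to a URL. It hard-codes a small subset of the trackers (an assumption, to stay self-contained); the helper actually added by this commit, remove_tracking_parameters in the urlhandler diff below, loads the full list from queries.yaml via confighandler.get_list.

# Minimal sketch, not the commit's code: strip a few known tracking
# parameters from a URL's query string and keep everything else intact.
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

TRACKERS = ["fbclid", "utm_source", "utm_medium", "utm_campaign"]  # small subset

def strip_trackers(url):
    parts = urlsplit(url)
    queries = parse_qs(parts.query)
    for tracker in TRACKERS:
        if tracker in queries:
            del queries[tracker]
    return urlunsplit([parts.scheme, parts.netloc, parts.path,
                       urlencode(queries, doseq=True), parts.fragment])

print(strip_trackers("https://example.com/article?id=42&utm_source=rss&fbclid=IwAB12"))
# https://example.com/article?id=42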

View file

@@ -14,6 +14,10 @@ TODO
 2) Check also for HTML, not only feed.bozo.
+3) Add "if is_feed(url, feed)" to view_entry and view_feed
+4) Refactor view_entry and view_feed - Why "if" twice?
 """
 from aiohttp import ClientError, ClientSession, ClientTimeout
@@ -193,7 +197,6 @@ async def download_updates(db_file, url=None):
                 read_status = 1
             entry = (
                 title,
-                summary,
                 link,
                 eid,
                 source,
@@ -446,18 +449,7 @@ async def add_feed(db_file, url):
     if res[0]:
         feed = parse(res[0])
         title = get_title(url, feed)
-        if not feed.entries:
-            try:
-                feed["feed"]["title"]
-            except:
-                msg = await probe_page(add_feed, url, res[0], db_file=db_file)
-        elif feed.bozo:
-            bozo = (
-                "Bozo detected. Failed to load: {}"
-                ).format(url)
-            print(bozo)
-            msg = await probe_page(add_feed, url, res[0], db_file=db_file)
-        else:
+        if is_feed(url, feed):
             status = res[1]
             msg = await sqlite.insert_feed(
                 db_file,
@@ -466,6 +458,13 @@ async def add_feed(db_file, url):
                 status
                 )
             await download_updates(db_file, [url])
+        else:
+            msg = await probe_page(
+                add_feed,
+                url,
+                res[0],
+                db_file=db_file
+                )
     else:
         status = res[1]
         msg = (
@@ -673,7 +672,7 @@ async def feed_mode_request(url, tree):
         except:
             continue
     if len(feeds) > 1:
-        positive = 0
+        counter = 0
         msg = (
             "RSS URL discovery has found {} feeds:\n```\n"
             ).format(len(feeds))
@@ -689,7 +688,13 @@ async def feed_mode_request(url, tree):
            except:
                continue
            if feed_amnt:
-               positive = 1
+               # NOTE Because there could be many false positives
+               # which are revealed in second phase of scan, we
+               # could end with a single feed, which would be
+               # listed instead of fetched, so feed_mark is
+               # utilized in order to make fetch possible.
+               feed_mark = [feed_addr]
+               counter += 1
                msg += (
                    "Title: {}\n"
                    "Link : {}\n"
@@ -700,10 +705,13 @@ async def feed_mode_request(url, tree):
                    feed_addr,
                    feed_amnt
                    )
+        if counter > 1:
            msg += (
                "```\nThe above feeds were extracted from\n{}"
                ).format(url)
-        if not positive:
+        elif feed_mark:
+            return feed_mark
+        else:
            msg = (
                "No feeds were found for {}"
                ).format(url)
@@ -887,3 +895,41 @@ async def feed_mode_auto_discovery(url, tree):
     elif feeds:
         feed_addr = join_url(url, feeds[0].xpath('@href')[0])
         return [feed_addr]
+
+
+def is_feed(url, feed):
+    """
+    Determine whether document is feed or not.
+
+    Parameters
+    ----------
+    url : str
+        URL.
+    feed : dict
+        Parsed feed.
+
+    Returns
+    -------
+    val : boolean
+        True or False.
+    """
+    if not feed.entries:
+        try:
+            feed["feed"]["title"]
+        except:
+            val = False
+            msg = (
+                "No entries nor title for {}"
+                ).format(url)
+    elif feed.bozo:
+        val = False
+        msg = (
+            "Bozo detected for {}"
+            ).format(url)
+    else:
+        val = True
+        msg = (
+            "Good feed for {}"
+            ).format(url)
+    print(msg)
+    return val
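
A rough usage sketch of the new is_feed helper, not part of the commit. Assumptions: the file above is datahandler.py, as the "import datahandler" line and the "# See datahandler.view_entry" note elsewhere in this commit suggest, and feedparser supplies the .entries and .bozo attributes the helper checks.

from feedparser import parse        # same parser used by add_feed above
from datahandler import is_feed     # assumed module name for the file above

rss = ('<?xml version="1.0"?>'
       '<rss version="2.0"><channel><title>Demo</title>'
       '<item><title>Hello</title><link>https://example.com/1</link></item>'
       '</channel></rss>')
html = '<html><body>Not a feed</body></html>'

print(is_feed("https://example.com/feed.xml", parse(rss)))    # expected: True
print(is_feed("https://example.com/page.html", parse(html)))  # expected: False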

View file

@@ -24,6 +24,7 @@ import confighandler as config
 import datahandler as datahandler
 from datetimehandler import current_time, rfc2822_to_iso8601
 from sqlite3 import connect, Error
+from urlhandler import remove_tracking_parameters
 # from eliot import start_action, to_file
 # # with start_action(action_type="list_feeds()", db=db_file):
@@ -88,7 +89,6 @@ def create_tables(db_file):
         "CREATE TABLE IF NOT EXISTS entries ("
         "id INTEGER PRIMARY KEY,"
         "title TEXT NOT NULL,"
-        "summary TEXT NOT NULL,"
         "link TEXT NOT NULL,"
         "entry_id TEXT,"
         "source TEXT NOT NULL,"
@@ -100,7 +100,6 @@ def create_tables(db_file):
         "CREATE TABLE IF NOT EXISTS archive ("
         "id INTEGER PRIMARY KEY,"
         "title TEXT NOT NULL,"
-        "summary TEXT NOT NULL,"
         "link TEXT NOT NULL,"
         "entry_id TEXT,"
         "source TEXT NOT NULL,"
@@ -434,11 +433,11 @@ async def get_entry_unread(db_file, num=None):
     #     "DESC LIMIT :num"
     #     )
     sql = (
-        "SELECT id, title, summary, link, source, timestamp "
+        "SELECT id, title, link, source, timestamp "
         "FROM entries "
         "WHERE read = 0 "
         "UNION ALL "
-        "SELECT id, title, summary, link, source, timestamp "
+        "SELECT id, title, link, source, timestamp "
         "FROM archive "
         "ORDER BY timestamp "
         "DESC LIMIT :num"
@@ -469,41 +468,45 @@ async def get_entry_unread(db_file, num=None):
     for result in results:
         ix = result[0]
         title = result[1]
-        summary = result[2]
-        # Remove HTML tags
-        try:
-            summary = BeautifulSoup(summary, "lxml").text
-        except:
-            print(result[2])
-        # TODO Limit text length
-        summary = summary.replace("\n\n\n", "\n\n")
-        length = await get_settings_value(db_file, "length")
-        summary = summary[:length] + " […]"
-        summary = summary.strip().split('\n')
-        summary = ["> " + line for line in summary]
-        summary = "\n".join(summary)
-        link = result[3]
+        # # TODO Retrieve summary from feed
+        # # See datahandler.view_entry
+        # summary = result[2]
+        # # Remove HTML tags
+        # try:
+        #     summary = BeautifulSoup(summary, "lxml").text
+        # except:
+        #     print(result[2])
+        #     breakpoint()
+        # # TODO Limit text length
+        # summary = summary.replace("\n\n\n", "\n\n")
+        # length = await get_settings_value(db_file, "length")
+        # summary = summary[:length] + " […]"
+        # summary = summary.strip().split('\n')
+        # summary = ["> " + line for line in summary]
+        # summary = "\n".join(summary)
+        link = result[2]
+        link = await remove_tracking_parameters(link)
         sql = (
             "SELECT name "
             "FROM feeds "
             "WHERE address = :source "
             )
-        source = result[4]
+        source = result[3]
         feed = cur.execute(sql, (source,))
        feed = feed.fetchone()[0]
        if num > 1:
            news_list += (
-                "\n{}\n{}\n"
+                "\n{}\n{}\n{}\n"
                ).format(
                    str(title),
-                    str(link)
+                    str(link),
+                    str(feed)
                    )
        else:
            news_list = (
-                "{}\n\n{}\n\n{}\n{}"
+                "{}\n{}\n{}"
                ).format(
                    str(title),
-                    str(summary),
                    str(link),
                    str(feed)
                    )
@@ -532,7 +535,7 @@ async def mark_entry_as_read(cur, ix):
     """
     sql = (
         "UPDATE entries "
-        "SET summary = '', read = 1 "
+        "SET read = 1 "
         "WHERE id = ?"
         )
     cur.execute(sql, (ix,))
@@ -554,7 +557,7 @@ async def mark_source_as_read(db_file, source):
         cur = conn.cursor()
         sql = (
             "UPDATE entries "
-            "SET summary = '', read = 1 "
+            "SET read = 1 "
             "WHERE source = ?"
             )
         cur.execute(sql, (source,))
@@ -574,7 +577,7 @@ async def mark_all_as_read(db_file):
         cur = conn.cursor()
         sql = (
             "UPDATE entries "
-            "SET summary = '', read = 1 "
+            "SET read = 1 "
            )
        cur.execute(sql)
        sql = (
@@ -892,23 +895,23 @@ async def add_entry(cur, entry):
         "INSERT "
         "INTO entries("
         "title, "
-        "summary, "
         "link, "
         "entry_id, "
         "source, "
         "timestamp, "
         "read"
         ") "
-        "VALUES(?, ?, ?, ?, ?, ?, ?)"
+        "VALUES(?, ?, ?, ?, ?, ?)"
         )
     try:
         cur.execute(sql, entry)
     except:
         print(current_time(), "COROUTINE OBJECT NOW")
-        print(entry[6])
-        print(type(entry[6]))
+        # for i in entry:
+        #     print(type(i))
+        #     print(i)
+        # print(type(entry))
         print(entry)
-        print(type(entry))
         print(current_time(), "COROUTINE OBJECT NOW")
         # breakpoint()

View file

@@ -17,7 +17,14 @@ TODO
 from confighandler import get_list
 from email.utils import parseaddr
 import random
-from urllib.parse import urljoin, urlsplit, urlunsplit
+from urllib.parse import (
+    parse_qs,
+    urlencode,
+    urljoin,
+    urlparse,
+    urlsplit,
+    urlunsplit
+    )
 # NOTE hostname and protocol are listed as one in file
@@ -41,6 +48,7 @@ async def replace_hostname(url):
     parted_url = urlsplit(url)
     protocol = parted_url.scheme
     hostname = parted_url.netloc
+    hostname = hostname.replace("www.","")
     pathname = parted_url.path
     queries = parted_url.query
     fragment = parted_url.fragment
@@ -62,6 +70,41 @@ async def replace_hostname(url):
     return url
+
+
+async def remove_tracking_parameters(url):
+    """
+    Remove queries with tracking parameters.
+
+    Parameters
+    ----------
+    url : str
+        URL.
+
+    Returns
+    -------
+    url : str
+        URL.
+    """
+    parted_url = urlsplit(url)
+    protocol = parted_url.scheme
+    hostname = parted_url.netloc
+    pathname = parted_url.path
+    queries = parse_qs(parted_url.query)
+    fragment = parted_url.fragment
+    trackers = await get_list("queries.yaml")
+    trackers = trackers["trackers"]
+    for tracker in trackers:
+        if tracker in queries: del queries[tracker]
+    queries_new = urlencode(queries, doseq=True)
+    url = urlunsplit([
+        protocol,
+        hostname,
+        pathname,
+        queries_new,
+        fragment
+        ])
+    return url
+
+
 def feed_to_http(url):
     """
     Replace scheme FEED by HTTP.
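
A hypothetical usage example for the new helper (assumptions: urlhandler is importable as a top-level module and confighandler.get_list finds queries.yaml as in the diff above). Note that remove_tracking_parameters only cleans the query string; the www. prefix is stripped separately by replace_hostname when a proxy hostname is substituted.

import asyncio
from urlhandler import remove_tracking_parameters  # module shown above

async def demo():
    url = "https://www.example.com/item?id=7&utm_campaign=sale&spm=a2g0o.detail"
    print(await remove_tracking_parameters(url))
    # expected: https://www.example.com/item?id=7

asyncio.run(demo())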

View file

@@ -3,6 +3,10 @@
 """
+TODO
+1) Split into modules (e.g. slixfeed/xmpp/bookmarks.py)
 FIXME
 1) Function check_readiness or event "changed_status" is causing for