Add ClearURLs functionality.

Fix Proxy functionality (remove www).
Schimon Jehudah 2023-12-27 22:48:31 +00:00
parent 4d03d6e16e
commit 8fbe97e357
5 changed files with 382 additions and 51 deletions
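Taken together, the commit strips known tracking parameters from links (ClearURLs-style) and drops the "www." prefix before proxy matching. A minimal, self-contained sketch of the idea, not the project's code (it uses str.removeprefix and a hard-coded sample of the tracker list; the real changes live in the diffs below):

```python
# Illustrative sketch of what this commit does to a link:
# 1) drop the "www." prefix before proxy matching, and
# 2) strip known tracking parameters (ClearURLs-style).
# TRACKERS is a hard-coded sample of assets/queries.yaml.
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

TRACKERS = {"utm_source", "utm_medium", "fbclid"}

def clean(url: str) -> str:
    parts = urlsplit(url)
    host = parts.netloc.removeprefix("www.")
    queries = {k: v for k, v in parse_qs(parts.query).items()
               if k not in TRACKERS}
    query = urlencode(queries, doseq=True)
    return urlunsplit((parts.scheme, host, parts.path, query, parts.fragment))

print(clean("https://www.example.com/post?id=7&utm_source=feed&fbclid=x1"))
# https://example.com/post?id=7
```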

assets/queries.yaml Normal file
View file

@@ -0,0 +1,235 @@
# The purpose of this list is to remove query parameters
# that are mostly utilized as trackers for advertising.
trackers:
- ad
- ad_medium
- ad_name
- ad_pvid
- ad_sub
# ad_tags
- advertising-id
# aem_p4p_detail
- af
- aff
- aff_fcid
- aff_fsk
- aff_platform
- aff_trace_key
- affparams
- afSmartRedirect
- afftrack
# aid
- algo_exp_id
- algo_pvid
- ar
# ascsubtag
# asc_contentid
- asgtbndr
- atc
- ats
- autostart
# b64e # breaks yandex
- bizType
# block
- bta
- businessType
- campaign
- campaignId
# __cf_chl_rt_tk
# cid # breaks sacred magick
- ck
# clickid
# client_id
# cm_ven
# cmd
- content-id
- crid
- cst
- cts
- curPageLogUid
# data # breaks yandex
# dchild
# dclid
- deals-widget
- dgcid
- dicbo
# dt
- edd
- edm_click_module
# ei
# embed
# _encoding
# etext # breaks yandex
- eventSource
- fbclid
- feature
- field-lbr_brands_browse-bin
- forced_click
# fr
- frs
# from # breaks yandex
- _ga
- ga_order
- ga_search_query
- ga_search_type
- ga_view_type
- gatewayAdapt
# gclid
# gclsrc
- gh_jid
- gps-id
# gs_lcp
- gt
- guccounter
- hdtime
- hosted_button_id
- ICID
- ico
- ig_rid
# idzone
# iflsig
- intcmp
- irclickid
# irgwc
# irpid
- is_from_webapp
- itid
# itok
# katds_labels
# keywords
- keyno
- l10n
- linkCode
- mc
- mid
- __mk_de_DE
- mp
- nats
- nci
- obOrigUrl
- offer_id
- optout
- oq
- organic_search_click
- pa
- Partner
- partner
- partner_id
- partner_ID
- pcampaignid
- pd_rd_i
- pd_rd_r
- pd_rd_w
- pd_rd_wg
- pdp_npi
- pf_rd_i
- pf_rd_m
- pf_rd_p
- pf_rd_r
- pf_rd_s
- pf_rd_t
- pg
- PHPSESSID
- pk_campaign
- pdp_ext_f
- pkey
- platform
- plkey
- pqr
- pr
- pro
- prod
- prom
- promo
- promocode
- promoid
- psc
- psprogram
- pvid
- qid
# r
- realDomain
- recruiter_id
- redirect
- ref
- ref_
- ref_src
- refcode
- referrer
- refinements
- reftag
- rf
- rnid
- rowan_id1
- rowan_msg_id
# rss
# sCh
- sclient
- scm
- scm_id
- scm-url
# sd
- sender_device
- sh
- shareId
- showVariations
- si
# sid # breaks whatsup.org.il
- ___SID
# site_id
- sk
- smid
- social_params
- source
- sourceId
- sp_csd
- spLa
- spm
- spreadType
# sprefix
- sr
- src
- _src
- src_cmp
- src_player
- src_src
- srcSns
- su
# sxin_0_pb
- _t
# tag
- tcampaign
- td
- terminal_id
# text
- th # Sometimes restored after page load
# title
- tracelog
- traffic_id
- traffic_source
- traffic_type
- tt
- uact
- ug_edm_item_id
- utm
# utm1
# utm2
# utm3
# utm4
# utm5
# utm6
# utm7
# utm8
# utm9
- utm_campaign
- utm_content
- utm_medium
- utm_source
- utm_term
- uuid
# utype
# ve
# ved
# zone
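For reference, a minimal sketch of how a list in this shape can be loaded and applied; the project reads it through confighandler.get_list, so PyYAML and the literal file path here are assumptions:

```python
# Load the trackers list and drop matching parameters from a parsed query.
# Assumes PyYAML is installed and the file sits at assets/queries.yaml;
# the project itself loads it via confighandler.get_list.
from urllib.parse import parse_qs
import yaml

with open("assets/queries.yaml") as f:
    trackers = yaml.safe_load(f)["trackers"]

queries = parse_qs("id=42&utm_campaign=promo&spm=a2g0o")
queries = {k: v for k, v in queries.items() if k not in trackers}
print(queries)  # {'id': ['42']}
```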

View file

@@ -14,6 +14,10 @@ TODO
2) Check also for HTML, not only feed.bozo.
3) Add "if is_feed(url, feed)" to view_entry and view_feed
4) Refactor view_entry and view_feed - Why "if" twice?
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
@@ -193,7 +197,6 @@ async def download_updates(db_file, url=None):
read_status = 1
entry = (
title,
summary,
link,
eid,
source,
@@ -446,18 +449,7 @@ async def add_feed(db_file, url):
if res[0]:
feed = parse(res[0])
title = get_title(url, feed)
if not feed.entries:
try:
feed["feed"]["title"]
except:
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
elif feed.bozo:
bozo = (
"Bozo detected. Failed to load: {}"
).format(url)
print(bozo)
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
else:
if is_feed(url, feed):
status = res[1]
msg = await sqlite.insert_feed(
db_file,
@@ -466,6 +458,13 @@ async def add_feed(db_file, url):
status
)
await download_updates(db_file, [url])
else:
msg = await probe_page(
add_feed,
url,
res[0],
db_file=db_file
)
else:
status = res[1]
msg = (
@@ -673,7 +672,7 @@ async def feed_mode_request(url, tree):
except:
continue
if len(feeds) > 1:
positive = 0
counter = 0
msg = (
"RSS URL discovery has found {} feeds:\n```\n"
).format(len(feeds))
@@ -689,7 +688,13 @@ async def feed_mode_request(url, tree):
except:
continue
if feed_amnt:
positive = 1
# NOTE Because many false positives may be revealed
# in the second phase of the scan, we could end up
# with a single feed, which would then be listed
# instead of fetched; feed_mark is kept so that a
# single remaining feed can still be fetched.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title: {}\n"
"Link : {}\n"
@@ -700,10 +705,13 @@ async def feed_mode_request(url, tree):
feed_addr,
feed_amnt
)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
if not positive:
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
@@ -887,3 +895,41 @@ async def feed_mode_auto_discovery(url, tree):
elif feeds:
feed_addr = join_url(url, feeds[0].xpath('@href')[0])
return [feed_addr]
def is_feed(url, feed):
"""
Determine whether the document is a feed or not.
Parameters
----------
url : str
URL.
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
if not feed.entries:
try:
feed["feed"]["title"]
# Has a title even without entries, so treat it as a feed
val = True
msg = (
"Good feed for {}"
).format(url)
except:
val = False
msg = (
"No entries nor title for {}"
).format(url)
elif feed.bozo:
val = False
msg = (
"Bozo detected for {}"
).format(url)
else:
val = True
msg = (
"Good feed for {}"
).format(url)
print(msg)
return val
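For reference, a quick usage sketch of the new is_feed() helper; it assumes the feed object comes from feedparser.parse(), which is consistent with the bozo and entries attributes used above (the sample XML and URL are made up):

```python
# Hypothetical check of a small RSS document with the new helper.
from feedparser import parse

rss = (
    '<?xml version="1.0"?>'
    '<rss version="2.0"><channel><title>Demo</title>'
    '<item><title>First post</title><link>http://example.org/1</link></item>'
    '</channel></rss>'
)
print(is_feed("http://example.org/feed.xml", parse(rss)))
# True (and "Good feed for http://example.org/feed.xml" is printed)
```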

View file

@@ -24,6 +24,7 @@ import confighandler as config
import datahandler as datahandler
from datetimehandler import current_time, rfc2822_to_iso8601
from sqlite3 import connect, Error
from urlhandler import remove_tracking_parameters
# from eliot import start_action, to_file
# # with start_action(action_type="list_feeds()", db=db_file):
@@ -88,7 +89,6 @@ def create_tables(db_file):
"CREATE TABLE IF NOT EXISTS entries ("
"id INTEGER PRIMARY KEY,"
"title TEXT NOT NULL,"
"summary TEXT NOT NULL,"
"link TEXT NOT NULL,"
"entry_id TEXT,"
"source TEXT NOT NULL,"
@@ -100,7 +100,6 @@ def create_tables(db_file):
"CREATE TABLE IF NOT EXISTS archive ("
"id INTEGER PRIMARY KEY,"
"title TEXT NOT NULL,"
"summary TEXT NOT NULL,"
"link TEXT NOT NULL,"
"entry_id TEXT,"
"source TEXT NOT NULL,"
@@ -434,11 +433,11 @@ async def get_entry_unread(db_file, num=None):
# "DESC LIMIT :num"
# )
sql = (
"SELECT id, title, summary, link, source, timestamp "
"SELECT id, title, link, source, timestamp "
"FROM entries "
"WHERE read = 0 "
"UNION ALL "
"SELECT id, title, summary, link, source, timestamp "
"SELECT id, title, link, source, timestamp "
"FROM archive "
"ORDER BY timestamp "
"DESC LIMIT :num"
@@ -469,41 +468,45 @@ async def get_entry_unread(db_file, num=None):
for result in results:
ix = result[0]
title = result[1]
summary = result[2]
# Remove HTML tags
try:
summary = BeautifulSoup(summary, "lxml").text
except:
print(result[2])
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
length = await get_settings_value(db_file, "length")
summary = summary[:length] + " […]"
summary = summary.strip().split('\n')
summary = ["> " + line for line in summary]
summary = "\n".join(summary)
link = result[3]
# # TODO Retrieve summary from feed
# # See datahandler.view_entry
# summary = result[2]
# # Remove HTML tags
# try:
# summary = BeautifulSoup(summary, "lxml").text
# except:
# print(result[2])
# breakpoint()
# # TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
# length = await get_settings_value(db_file, "length")
# summary = summary[:length] + " […]"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
link = result[2]
link = await remove_tracking_parameters(link)
sql = (
"SELECT name "
"FROM feeds "
"WHERE address = :source "
)
source = result[4]
source = result[3]
feed = cur.execute(sql, (source,))
feed = feed.fetchone()[0]
if num > 1:
news_list += (
"\n{}\n{}\n"
"\n{}\n{}\n{}\n"
).format(
str(title),
str(link)
str(link),
str(feed)
)
else:
news_list = (
"{}\n\n{}\n\n{}\n{}"
"{}\n{}\n{}"
).format(
str(title),
str(summary),
str(link),
str(feed)
)
@@ -532,7 +535,7 @@ async def mark_entry_as_read(cur, ix):
"""
sql = (
"UPDATE entries "
"SET summary = '', read = 1 "
"SET read = 1 "
"WHERE id = ?"
)
cur.execute(sql, (ix,))
@@ -554,7 +557,7 @@ async def mark_source_as_read(db_file, source):
cur = conn.cursor()
sql = (
"UPDATE entries "
"SET summary = '', read = 1 "
"SET read = 1 "
"WHERE source = ?"
)
cur.execute(sql, (source,))
@@ -574,7 +577,7 @@ async def mark_all_as_read(db_file):
cur = conn.cursor()
sql = (
"UPDATE entries "
"SET summary = '', read = 1 "
"SET read = 1 "
)
cur.execute(sql)
sql = (
@@ -892,23 +895,23 @@ async def add_entry(cur, entry):
"INSERT "
"INTO entries("
"title, "
"summary, "
"link, "
"entry_id, "
"source, "
"timestamp, "
"read"
") "
"VALUES(?, ?, ?, ?, ?, ?, ?)"
"VALUES(?, ?, ?, ?, ?, ?)"
)
try:
cur.execute(sql, entry)
except:
print(current_time(), "COROUTINE OBJECT NOW")
print(entry[6])
print(type(entry[6]))
# for i in entry:
# print(type(i))
# print(i)
# print(type(entry))
print(entry)
print(type(entry))
print(current_time(), "COROUTINE OBJECT NOW")
# breakpoint()

View file

@@ -17,7 +17,14 @@ TODO
from confighandler import get_list
from email.utils import parseaddr
import random
from urllib.parse import urljoin, urlsplit, urlunsplit
from urllib.parse import (
parse_qs,
urlencode,
urljoin,
urlparse,
urlsplit,
urlunsplit
)
# NOTE hostname and protocol are listed as one in file
@@ -41,6 +48,7 @@ async def replace_hostname(url):
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
hostname = hostname.replace("www.","")
pathname = parted_url.path
queries = parted_url.query
fragment = parted_url.fragment
@@ -62,6 +70,41 @@ async def replace_hostname(url):
return url
async def remove_tracking_parameters(url):
"""
Remove tracking parameters from the query string.
Parameters
----------
url : str
URL.
Returns
-------
url : str
URL.
"""
parted_url = urlsplit(url)
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = parted_url.path
queries = parse_qs(parted_url.query)
fragment = parted_url.fragment
trackers = await get_list("queries.yaml")
trackers = trackers["trackers"]
for tracker in trackers:
if tracker in queries: del queries[tracker]
queries_new = urlencode(queries, doseq=True)
url = urlunsplit([
protocol,
hostname,
pathname,
queries_new,
fragment
])
return url
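A hedged usage sketch for remove_tracking_parameters(); it assumes urlhandler is importable and that confighandler.get_list can resolve queries.yaml as it does elsewhere in the project (the sample URL is made up):

```python
# Strip the advertising parameters listed in queries.yaml from a URL.
import asyncio
from urlhandler import remove_tracking_parameters

url = ("https://example.com/item?id=42"
       "&utm_source=newsletter&utm_medium=email&fbclid=abc123")
print(asyncio.run(remove_tracking_parameters(url)))
# Expected: https://example.com/item?id=42
```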
def feed_to_http(url):
"""
Replace scheme FEED by HTTP.

View file

@@ -3,6 +3,10 @@
"""
TODO
1) Split into modules (e.g. slixfeed/xmpp/bookmarks.py)
FIXME
1) Function check_readiness or event "changed_status" is causing for