From 8fbe97e357986fb4f791472061ea0007babd64f2 Mon Sep 17 00:00:00 2001
From: Schimon Jehudah
Date: Wed, 27 Dec 2023 22:48:31 +0000
Subject: [PATCH] Add ClearURLs functionality. Fix Proxy functionality (remove www).

---
 assets/queries.yaml       | 235 ++++++++++++++++++++++++++++++++++++++
 slixfeed/datahandler.py   |  84 +++++++++++---
 slixfeed/sqlitehandler.py |  65 ++++++-----
 slixfeed/urlhandler.py    |  45 +++++++-
 slixfeed/xmpphandler.py   |   4 +
 5 files changed, 382 insertions(+), 51 deletions(-)
 create mode 100644 assets/queries.yaml

diff --git a/assets/queries.yaml b/assets/queries.yaml
new file mode 100644
index 0000000..f39c66b
--- /dev/null
+++ b/assets/queries.yaml
@@ -0,0 +1,235 @@
+# The purpose of this list is to remove queries that
+# are mostly utilized as trackers for advertising.
+
+trackers:
+  - ad
+  - ad_medium
+  - ad_name
+  - ad_pvid
+  - ad_sub
+  # ad_tags
+  - advertising-id
+  # aem_p4p_detail
+  - af
+  - aff
+  - aff_fcid
+  - aff_fsk
+  - aff_platform
+  - aff_trace_key
+  - affparams
+  - afSmartRedirect
+  - afftrack
+  - affparams
+  # aid
+  - algo_exp_id
+  - algo_pvid
+  - ar
+  # ascsubtag
+  # asc_contentid
+  - asgtbndr
+  - atc
+  - ats
+  - autostart
+  # b64e# breaks yandex
+  - bizType
+  # block
+  - bta
+  - businessType
+  - campaign
+  - campaignId
+  # __cf_chl_rt_tk
+  # cid# breaks sacred magick
+  - ck
+  # clickid
+  # client_id
+  # cm_ven
+  # cmd
+  - content-id
+  - crid
+  - cst
+  - cts
+  - curPageLogUid
+  # data# breaks yandex
+  # dchild
+  # dclid
+  - deals-widget
+  - dgcid
+  - dicbo
+  # dt
+  - edd
+  - edm_click_module
+  # ei
+  # embed
+  # _encoding
+  # etext# breaks yandex
+  - eventSource
+  - fbclid
+  - feature
+  - field-lbr_brands_browse-bin
+  - forced_click
+  # fr
+  - frs
+  # from# breaks yandex
+  - _ga
+  - ga_order
+  - ga_search_query
+  - ga_search_type
+  - ga_view_type
+  - gatewayAdapt
+  # gclid
+  # gclsrc
+  - gh_jid
+  - gps-id
+  # gs_lcp
+  - gt
+  - guccounter
+  - hdtime
+  - hosted_button_id
+  - ICID
+  - ico
+  - ig_rid
+  # idzone
+  # iflsig
+  - intcmp
+  - irclickid
+  # irgwc
+  # irpid
+  - is_from_webapp
+  - itid
+  # itok
+  # katds_labels
+  # keywords
+  - keyno
+  - l10n
+  - linkCode
+  - mc
+  - mid
+  - __mk_de_DE
+  - mp
+  - nats
+  - nci
+  - obOrigUrl
+  - offer_id
+  - optout
+  - oq
+  - organic_search_click
+  - pa
+  - Partner
+  - partner
+  - partner_id
+  - partner_ID
+  - pcampaignid
+  - pd_rd_i
+  - pd_rd_r
+  - pd_rd_w
+  - pd_rd_wg
+  - pdp_npi
+  - pf_rd_i
+  - pf_rd_m
+  - pf_rd_p
+  - pf_rd_r
+  - pf_rd_s
+  - pf_rd_t
+  - pg
+  - PHPSESSID
+  - pk_campaign
+  - pdp_ext_f
+  - pkey
+  - platform
+  - plkey
+  - pqr
+  - pr
+  - pro
+  - prod
+  - prom
+  - promo
+  - promocode
+  - promoid
+  - psc
+  - psprogram
+  - pvid
+  - qid
+  # r
+  - realDomain
+  - recruiter_id
+  - redirect
+  - ref
+  - ref_
+  - ref_src
+  - refcode
+  - referrer
+  - refinements
+  - reftag
+  - rf
+  - rnid
+  - rowan_id1
+  - rowan_msg_id
+  # rss
+  # sCh
+  - sclient
+  - scm
+  - scm_id
+  - scm-url
+  # sd
+  - sender_device
+  - sh
+  - shareId
+  - showVariations
+  - si
+  # sid# breaks whatsup.org.il
+  - ___SID
+  # site_id
+  - sk
+  - smid
+  - social_params
+  - source
+  - sourceId
+  - sp_csd
+  - spLa
+  - spm
+  - spreadType
+  # sprefix
+  - sr
+  - src
+  - _src
+  - src_cmp
+  - src_player
+  - src_src
+  - srcSns
+  - su
+  # sxin_0_pb
+  - _t
+  # tag
+  - tcampaign
+  - td
+  - terminal_id
+  # text
+  - th# Sometimes restored after page load
+  # title
+  - tracelog
+  - traffic_id
+  - traffic_source
+  - traffic_type
+  - tt
+  - uact
+  - ug_edm_item_id
+  - utm
+  # utm1
+  # utm2
+  # utm3
+  # utm4
+  # utm5
+  # utm6
+  # utm7
+  # utm8
+  # utm9
+  - utm_campaign
+  - utm_content
+  - utm_medium
+  - utm_source
+  - utm_term
+  - uuid
+  # utype
+  # ve
+  # ved
+  # zone'
diff --git a/slixfeed/datahandler.py b/slixfeed/datahandler.py
index afdb84c..9da9e49 100644
--- a/slixfeed/datahandler.py
+++ b/slixfeed/datahandler.py
@@ -14,6 +14,10 @@ TODO
 
 2) Check also for HTML, not only feed.bozo.
 
+3) Add "if is_feed(url, feed)" to view_entry and view_feed
+
+4) Refactor view_entry and view_feed - Why "if" twice?
+
 """
 
 from aiohttp import ClientError, ClientSession, ClientTimeout
@@ -193,7 +197,6 @@ async def download_updates(db_file, url=None):
                     read_status = 1
                 entry = (
                     title,
-                    summary,
                     link,
                     eid,
                     source,
@@ -446,18 +449,7 @@ async def add_feed(db_file, url):
     if res[0]:
         feed = parse(res[0])
         title = get_title(url, feed)
-        if not feed.entries:
-            try:
-                feed["feed"]["title"]
-            except:
-                msg = await probe_page(add_feed, url, res[0], db_file=db_file)
-        elif feed.bozo:
-            bozo = (
-                "Bozo detected. Failed to load: {}"
-                ).format(url)
-            print(bozo)
-            msg = await probe_page(add_feed, url, res[0], db_file=db_file)
-        else:
+        if is_feed(url, feed):
             status = res[1]
             msg = await sqlite.insert_feed(
                 db_file,
@@ -466,6 +458,13 @@ async def add_feed(db_file, url):
                 status
                 )
             await download_updates(db_file, [url])
+        else:
+            msg = await probe_page(
+                add_feed,
+                url,
+                res[0],
+                db_file=db_file
+                )
     else:
         status = res[1]
         msg = (
@@ -673,7 +672,7 @@ async def feed_mode_request(url, tree):
         except:
             continue
     if len(feeds) > 1:
-        positive = 0
+        counter = 0
         msg = (
             "RSS URL discovery has found {} feeds:\n```\n"
             ).format(len(feeds))
@@ -689,7 +688,13 @@ async def feed_mode_request(url, tree):
             except:
                 continue
             if feed_amnt:
-                positive = 1
+                # NOTE Because there could be many false positives
+                # which are revealed in second phase of scan, we
+                # could end with a single feed, which would be
+                # listed instead of fetched, so feed_mark is
+                # utilized in order to make fetch possible.
+                feed_mark = [feed_addr]
+                counter += 1
                 msg += (
                     "Title: {}\n"
                     "Link : {}\n"
@@ -700,10 +705,13 @@ async def feed_mode_request(url, tree):
                         feed_addr,
                         feed_amnt
                         )
-        msg += (
-            "```\nThe above feeds were extracted from\n{}"
-            ).format(url)
-        if not positive:
+        if counter > 1:
+            msg += (
+                "```\nThe above feeds were extracted from\n{}"
+                ).format(url)
+        elif feed_mark:
+            return feed_mark
+        else:
             msg = (
                 "No feeds were found for {}"
                 ).format(url)
@@ -887,3 +895,41 @@ async def feed_mode_auto_discovery(url, tree):
     elif feeds:
         feed_addr = join_url(url, feeds[0].xpath('@href')[0])
         return [feed_addr]
+
+
+def is_feed(url, feed):
+    """
+    Determine whether document is feed or not.
+
+    Parameters
+    ----------
+    url : str
+        URL.
+    feed : dict
+        Parsed feed.
+
+    Returns
+    -------
+    val : boolean
+        True or False.
+ """ + if not feed.entries: + try: + feed["feed"]["title"] + except: + val = False + msg = ( + "No entries nor title for {}" + ).format(url) + elif feed.bozo: + val = False + msg = ( + "Bozo detected for {}" + ).format(url) + else: + val = True + msg = ( + "Good feed for {}" + ).format(url) + print(msg) + return val diff --git a/slixfeed/sqlitehandler.py b/slixfeed/sqlitehandler.py index f460ce5..6996a88 100644 --- a/slixfeed/sqlitehandler.py +++ b/slixfeed/sqlitehandler.py @@ -24,6 +24,7 @@ import confighandler as config import datahandler as datahandler from datetimehandler import current_time, rfc2822_to_iso8601 from sqlite3 import connect, Error +from urlhandler import remove_tracking_parameters # from eliot import start_action, to_file # # with start_action(action_type="list_feeds()", db=db_file): @@ -88,7 +89,6 @@ def create_tables(db_file): "CREATE TABLE IF NOT EXISTS entries (" "id INTEGER PRIMARY KEY," "title TEXT NOT NULL," - "summary TEXT NOT NULL," "link TEXT NOT NULL," "entry_id TEXT," "source TEXT NOT NULL," @@ -100,7 +100,6 @@ def create_tables(db_file): "CREATE TABLE IF NOT EXISTS archive (" "id INTEGER PRIMARY KEY," "title TEXT NOT NULL," - "summary TEXT NOT NULL," "link TEXT NOT NULL," "entry_id TEXT," "source TEXT NOT NULL," @@ -434,11 +433,11 @@ async def get_entry_unread(db_file, num=None): # "DESC LIMIT :num" # ) sql = ( - "SELECT id, title, summary, link, source, timestamp " + "SELECT id, title, link, source, timestamp " "FROM entries " "WHERE read = 0 " "UNION ALL " - "SELECT id, title, summary, link, source, timestamp " + "SELECT id, title, link, source, timestamp " "FROM archive " "ORDER BY timestamp " "DESC LIMIT :num" @@ -469,41 +468,45 @@ async def get_entry_unread(db_file, num=None): for result in results: ix = result[0] title = result[1] - summary = result[2] - # Remove HTML tags - try: - summary = BeautifulSoup(summary, "lxml").text - except: - print(result[2]) - # TODO Limit text length - summary = summary.replace("\n\n\n", "\n\n") - length = await get_settings_value(db_file, "length") - summary = summary[:length] + " […]" - summary = summary.strip().split('\n') - summary = ["> " + line for line in summary] - summary = "\n".join(summary) - link = result[3] + # # TODO Retrieve summary from feed + # # See datahandler.view_entry + # summary = result[2] + # # Remove HTML tags + # try: + # summary = BeautifulSoup(summary, "lxml").text + # except: + # print(result[2]) + # breakpoint() + # # TODO Limit text length + # summary = summary.replace("\n\n\n", "\n\n") + # length = await get_settings_value(db_file, "length") + # summary = summary[:length] + " […]" + # summary = summary.strip().split('\n') + # summary = ["> " + line for line in summary] + # summary = "\n".join(summary) + link = result[2] + link = await remove_tracking_parameters(link) sql = ( "SELECT name " "FROM feeds " "WHERE address = :source " ) - source = result[4] + source = result[3] feed = cur.execute(sql, (source,)) feed = feed.fetchone()[0] if num > 1: news_list += ( - "\n{}\n{}\n" + "\n{}\n{}\n{}\n" ).format( str(title), - str(link) + str(link), + str(feed) ) else: news_list = ( - "{}\n\n{}\n\n{}\n{}" + "{}\n{}\n{}" ).format( str(title), - str(summary), str(link), str(feed) ) @@ -532,7 +535,7 @@ async def mark_entry_as_read(cur, ix): """ sql = ( "UPDATE entries " - "SET summary = '', read = 1 " + "SET read = 1 " "WHERE id = ?" 
         )
     cur.execute(sql, (ix,))
@@ -554,7 +557,7 @@ async def mark_source_as_read(db_file, source):
         cur = conn.cursor()
         sql = (
             "UPDATE entries "
-            "SET summary = '', read = 1 "
+            "SET read = 1 "
             "WHERE source = ?"
             )
         cur.execute(sql, (source,))
@@ -574,7 +577,7 @@ async def mark_all_as_read(db_file):
         cur = conn.cursor()
         sql = (
             "UPDATE entries "
-            "SET summary = '', read = 1 "
+            "SET read = 1 "
             )
         cur.execute(sql)
         sql = (
@@ -892,23 +895,23 @@ async def add_entry(cur, entry):
         "INSERT "
         "INTO entries("
         "title, "
-        "summary, "
         "link, "
         "entry_id, "
        "source, "
        "timestamp, "
        "read"
        ") "
-        "VALUES(?, ?, ?, ?, ?, ?, ?)"
+        "VALUES(?, ?, ?, ?, ?, ?)"
         )
     try:
         cur.execute(sql, entry)
     except:
         print(current_time(), "COROUTINE OBJECT NOW")
-        print(entry[6])
-        print(type(entry[6]))
+        # for i in entry:
+        #     print(type(i))
+        #     print(i)
+        # print(type(entry))
         print(entry)
-        print(type(entry))
         print(current_time(), "COROUTINE OBJECT NOW")
         # breakpoint()
 
diff --git a/slixfeed/urlhandler.py b/slixfeed/urlhandler.py
index 00bdac8..9b6dac1 100644
--- a/slixfeed/urlhandler.py
+++ b/slixfeed/urlhandler.py
@@ -17,7 +17,14 @@ TODO
 from confighandler import get_list
 from email.utils import parseaddr
 import random
-from urllib.parse import urljoin, urlsplit, urlunsplit
+from urllib.parse import (
+    parse_qs,
+    urlencode,
+    urljoin,
+    urlparse,
+    urlsplit,
+    urlunsplit
+    )
 
 
 # NOTE hostname and protocol are listed as one in file
@@ -41,6 +48,7 @@ async def replace_hostname(url):
     parted_url = urlsplit(url)
     protocol = parted_url.scheme
     hostname = parted_url.netloc
+    hostname = hostname.replace("www.","")
     pathname = parted_url.path
     queries = parted_url.query
     fragment = parted_url.fragment
@@ -62,6 +70,41 @@ async def replace_hostname(url):
     return url
 
 
+async def remove_tracking_parameters(url):
+    """
+    Remove queries with tracking parameters.
+
+    Parameters
+    ----------
+    url : str
+        URL.
+
+    Returns
+    -------
+    url : str
+        URL.
+    """
+    parted_url = urlsplit(url)
+    protocol = parted_url.scheme
+    hostname = parted_url.netloc
+    pathname = parted_url.path
+    queries = parse_qs(parted_url.query)
+    fragment = parted_url.fragment
+    trackers = await get_list("queries.yaml")
+    trackers = trackers["trackers"]
+    for tracker in trackers:
+        if tracker in queries: del queries[tracker]
+    queries_new = urlencode(queries, doseq=True)
+    url = urlunsplit([
+        protocol,
+        hostname,
+        pathname,
+        queries_new,
+        fragment
+        ])
+    return url
+
+
 def feed_to_http(url):
     """
     Replace scheme FEED by HTTP.
diff --git a/slixfeed/xmpphandler.py b/slixfeed/xmpphandler.py
index d36513b..6f22893 100644
--- a/slixfeed/xmpphandler.py
+++ b/slixfeed/xmpphandler.py
@@ -3,6 +3,10 @@
 
 """
 
+TODO
+
+1) Split into modules (e.g. slixfeed/xmpp/bookmarks.py)
+
 FIXME
 
 1) Function check_readiness or event "changed_status" is causing for