forked from sch/Slixfeed
Add ClearURLs functionality.
Fix Proxy functionality (remove www).
This commit is contained in:
parent
4d03d6e16e
commit
8fbe97e357
5 changed files with 382 additions and 51 deletions
235
assets/queries.yaml
Normal file
235
assets/queries.yaml
Normal file
|
@ -0,0 +1,235 @@
|
|||
# The purpose of this list is to remove queries that
|
||||
# are mostly utilized as trackers for advertising.
|
||||
|
||||
trackers:
|
||||
- ad
|
||||
- ad_medium
|
||||
- ad_name
|
||||
- ad_pvid
|
||||
- ad_sub
|
||||
# ad_tags
|
||||
- advertising-id
|
||||
# aem_p4p_detail
|
||||
- af
|
||||
- aff
|
||||
- aff_fcid
|
||||
- aff_fsk
|
||||
- aff_platform
|
||||
- aff_trace_key
|
||||
- affparams
|
||||
- afSmartRedirect
|
||||
- afftrack
|
||||
- affparams
|
||||
# aid
|
||||
- algo_exp_id
|
||||
- algo_pvid
|
||||
- ar
|
||||
# ascsubtag
|
||||
# asc_contentid
|
||||
- asgtbndr
|
||||
- atc
|
||||
- ats
|
||||
- autostart
|
||||
# b64e# breaks yandex
|
||||
- bizType
|
||||
# block
|
||||
- bta
|
||||
- businessType
|
||||
- campaign
|
||||
- campaignId
|
||||
# __cf_chl_rt_tk
|
||||
# cid# breaks sacred magick
|
||||
- ck
|
||||
# clickid
|
||||
# client_id
|
||||
# cm_ven
|
||||
# cmd
|
||||
- content-id
|
||||
- crid
|
||||
- cst
|
||||
- cts
|
||||
- curPageLogUid
|
||||
# data# breaks yandex
|
||||
# dchild
|
||||
# dclid
|
||||
- deals-widget
|
||||
- dgcid
|
||||
- dicbo
|
||||
# dt
|
||||
- edd
|
||||
- edm_click_module
|
||||
# ei
|
||||
# embed
|
||||
# _encoding
|
||||
# etext# breaks yandex
|
||||
- eventSource
|
||||
- fbclid
|
||||
- feature
|
||||
- field-lbr_brands_browse-bin
|
||||
- forced_click
|
||||
# fr
|
||||
- frs
|
||||
# from# breaks yandex
|
||||
- _ga
|
||||
- ga_order
|
||||
- ga_search_query
|
||||
- ga_search_type
|
||||
- ga_view_type
|
||||
- gatewayAdapt
|
||||
# gclid
|
||||
# gclsrc
|
||||
- gh_jid
|
||||
- gps-id
|
||||
# gs_lcp
|
||||
- gt
|
||||
- guccounter
|
||||
- hdtime
|
||||
- hosted_button_id
|
||||
- ICID
|
||||
- ico
|
||||
- ig_rid
|
||||
# idzone
|
||||
# iflsig
|
||||
- intcmp
|
||||
- irclickid
|
||||
# irgwc
|
||||
# irpid
|
||||
- is_from_webapp
|
||||
- itid
|
||||
# itok
|
||||
# katds_labels
|
||||
# keywords
|
||||
- keyno
|
||||
- l10n
|
||||
- linkCode
|
||||
- mc
|
||||
- mid
|
||||
- __mk_de_DE
|
||||
- mp
|
||||
- nats
|
||||
- nci
|
||||
- obOrigUrl
|
||||
- offer_id
|
||||
- optout
|
||||
- oq
|
||||
- organic_search_click
|
||||
- pa
|
||||
- Partner
|
||||
- partner
|
||||
- partner_id
|
||||
- partner_ID
|
||||
- pcampaignid
|
||||
- pd_rd_i
|
||||
- pd_rd_r
|
||||
- pd_rd_w
|
||||
- pd_rd_wg
|
||||
- pdp_npi
|
||||
- pf_rd_i
|
||||
- pf_rd_m
|
||||
- pf_rd_p
|
||||
- pf_rd_r
|
||||
- pf_rd_s
|
||||
- pf_rd_t
|
||||
- pg
|
||||
- PHPSESSID
|
||||
- pk_campaign
|
||||
- pdp_ext_f
|
||||
- pkey
|
||||
- platform
|
||||
- plkey
|
||||
- pqr
|
||||
- pr
|
||||
- pro
|
||||
- prod
|
||||
- prom
|
||||
- promo
|
||||
- promocode
|
||||
- promoid
|
||||
- psc
|
||||
- psprogram
|
||||
- pvid
|
||||
- qid
|
||||
# r
|
||||
- realDomain
|
||||
- recruiter_id
|
||||
- redirect
|
||||
- ref
|
||||
- ref_
|
||||
- ref_src
|
||||
- refcode
|
||||
- referrer
|
||||
- refinements
|
||||
- reftag
|
||||
- rf
|
||||
- rnid
|
||||
- rowan_id1
|
||||
- rowan_msg_id
|
||||
# rss
|
||||
# sCh
|
||||
- sclient
|
||||
- scm
|
||||
- scm_id
|
||||
- scm-url
|
||||
# sd
|
||||
- sender_device
|
||||
- sh
|
||||
- shareId
|
||||
- showVariations
|
||||
- si
|
||||
# sid# breaks whatsup.org.il
|
||||
- ___SID
|
||||
# site_id
|
||||
- sk
|
||||
- smid
|
||||
- social_params
|
||||
- source
|
||||
- sourceId
|
||||
- sp_csd
|
||||
- spLa
|
||||
- spm
|
||||
- spreadType
|
||||
# sprefix
|
||||
- sr
|
||||
- src
|
||||
- _src
|
||||
- src_cmp
|
||||
- src_player
|
||||
- src_src
|
||||
- srcSns
|
||||
- su
|
||||
# sxin_0_pb
|
||||
- _t
|
||||
# tag
|
||||
- tcampaign
|
||||
- td
|
||||
- terminal_id
|
||||
# text
|
||||
- th # Sometimes restored after page load
|
||||
# title
|
||||
- tracelog
|
||||
- traffic_id
|
||||
- traffic_source
|
||||
- traffic_type
|
||||
- tt
|
||||
- uact
|
||||
- ug_edm_item_id
|
||||
- utm
|
||||
# utm1
|
||||
# utm2
|
||||
# utm3
|
||||
# utm4
|
||||
# utm5
|
||||
# utm6
|
||||
# utm7
|
||||
# utm8
|
||||
# utm9
|
||||
- utm_campaign
|
||||
- utm_content
|
||||
- utm_medium
|
||||
- utm_source
|
||||
- utm_term
|
||||
- uuid
|
||||
# utype
|
||||
# ve
|
||||
# ved
|
||||
# zone
|
|
@ -14,6 +14,10 @@ TODO
|
|||
|
||||
2) Check also for HTML, not only feed.bozo.
|
||||
|
||||
3) Add "if is_feed(url, feed)" to view_entry and view_feed
|
||||
|
||||
4) Refactor view_entry and view_feed - Why "if" twice?
|
||||
|
||||
"""
|
||||
|
||||
from aiohttp import ClientError, ClientSession, ClientTimeout
|
||||
|
@ -193,7 +197,6 @@ async def download_updates(db_file, url=None):
|
|||
read_status = 1
|
||||
entry = (
|
||||
title,
|
||||
summary,
|
||||
link,
|
||||
eid,
|
||||
source,
|
||||
|
@ -446,18 +449,7 @@ async def add_feed(db_file, url):
|
|||
if res[0]:
|
||||
feed = parse(res[0])
|
||||
title = get_title(url, feed)
|
||||
if not feed.entries:
|
||||
try:
|
||||
feed["feed"]["title"]
|
||||
except:
|
||||
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
|
||||
elif feed.bozo:
|
||||
bozo = (
|
||||
"Bozo detected. Failed to load: {}"
|
||||
).format(url)
|
||||
print(bozo)
|
||||
msg = await probe_page(add_feed, url, res[0], db_file=db_file)
|
||||
else:
|
||||
if is_feed(url, feed):
|
||||
status = res[1]
|
||||
msg = await sqlite.insert_feed(
|
||||
db_file,
|
||||
|
@ -466,6 +458,13 @@ async def add_feed(db_file, url):
|
|||
status
|
||||
)
|
||||
await download_updates(db_file, [url])
|
||||
else:
|
||||
msg = await probe_page(
|
||||
add_feed,
|
||||
url,
|
||||
res[0],
|
||||
db_file=db_file
|
||||
)
|
||||
else:
|
||||
status = res[1]
|
||||
msg = (
|
||||
|
@ -673,7 +672,7 @@ async def feed_mode_request(url, tree):
|
|||
except:
|
||||
continue
|
||||
if len(feeds) > 1:
|
||||
positive = 0
|
||||
counter = 0
|
||||
msg = (
|
||||
"RSS URL discovery has found {} feeds:\n```\n"
|
||||
).format(len(feeds))
|
||||
|
@ -689,7 +688,13 @@ async def feed_mode_request(url, tree):
|
|||
except:
|
||||
continue
|
||||
if feed_amnt:
|
||||
positive = 1
|
||||
# NOTE Because there could be many false positives
|
||||
# which are revealed in second phase of scan, we
|
||||
# could end with a single feed, which would be
|
||||
# listed instead of fetched, so feed_mark is
|
||||
# utilized in order to make fetch possible.
|
||||
feed_mark = [feed_addr]
|
||||
counter += 1
|
||||
msg += (
|
||||
"Title: {}\n"
|
||||
"Link : {}\n"
|
||||
|
@ -700,10 +705,13 @@ async def feed_mode_request(url, tree):
|
|||
feed_addr,
|
||||
feed_amnt
|
||||
)
|
||||
msg += (
|
||||
"```\nThe above feeds were extracted from\n{}"
|
||||
).format(url)
|
||||
if not positive:
|
||||
if counter > 1:
|
||||
msg += (
|
||||
"```\nThe above feeds were extracted from\n{}"
|
||||
).format(url)
|
||||
elif feed_mark:
|
||||
return feed_mark
|
||||
else:
|
||||
msg = (
|
||||
"No feeds were found for {}"
|
||||
).format(url)
|
||||
|
@ -887,3 +895,41 @@ async def feed_mode_auto_discovery(url, tree):
|
|||
elif feeds:
|
||||
feed_addr = join_url(url, feeds[0].xpath('@href')[0])
|
||||
return [feed_addr]
|
||||
|
||||
|
||||
def is_feed(url, feed):
    """
    Determine whether document is feed or not.

    A document is accepted as a feed when it has entries and the parser
    did not flag it as malformed ("bozo"), or when it has no entries
    but still declares a feed title (i.e. a valid, currently empty feed).

    Parameters
    ----------
    url : str
        URL.
    feed : dict
        Parsed feed.

    Returns
    -------
    val : boolean
        True or False.
    """
    if not feed.entries:
        try:
            # Only the presence of the title matters, not its value.
            feed["feed"]["title"]
        except (KeyError, TypeError, AttributeError):
            val = False
            msg = (
                "No entries nor title for {}"
                ).format(url)
        else:
            # An empty feed that has a title is still a feed.  Without
            # this branch, val and msg were left unassigned and the
            # function crashed with UnboundLocalError.
            val = True
            msg = (
                "Good feed for {}"
                ).format(url)
    elif feed.bozo:
        val = False
        msg = (
            "Bozo detected for {}"
            ).format(url)
    else:
        val = True
        msg = (
            "Good feed for {}"
            ).format(url)
    print(msg)
    return val
|
||||
|
|
|
@ -24,6 +24,7 @@ import confighandler as config
|
|||
import datahandler as datahandler
|
||||
from datetimehandler import current_time, rfc2822_to_iso8601
|
||||
from sqlite3 import connect, Error
|
||||
from urlhandler import remove_tracking_parameters
|
||||
|
||||
# from eliot import start_action, to_file
|
||||
# # with start_action(action_type="list_feeds()", db=db_file):
|
||||
|
@ -88,7 +89,6 @@ def create_tables(db_file):
|
|||
"CREATE TABLE IF NOT EXISTS entries ("
|
||||
"id INTEGER PRIMARY KEY,"
|
||||
"title TEXT NOT NULL,"
|
||||
"summary TEXT NOT NULL,"
|
||||
"link TEXT NOT NULL,"
|
||||
"entry_id TEXT,"
|
||||
"source TEXT NOT NULL,"
|
||||
|
@ -100,7 +100,6 @@ def create_tables(db_file):
|
|||
"CREATE TABLE IF NOT EXISTS archive ("
|
||||
"id INTEGER PRIMARY KEY,"
|
||||
"title TEXT NOT NULL,"
|
||||
"summary TEXT NOT NULL,"
|
||||
"link TEXT NOT NULL,"
|
||||
"entry_id TEXT,"
|
||||
"source TEXT NOT NULL,"
|
||||
|
@ -434,11 +433,11 @@ async def get_entry_unread(db_file, num=None):
|
|||
# "DESC LIMIT :num"
|
||||
# )
|
||||
sql = (
|
||||
"SELECT id, title, summary, link, source, timestamp "
|
||||
"SELECT id, title, link, source, timestamp "
|
||||
"FROM entries "
|
||||
"WHERE read = 0 "
|
||||
"UNION ALL "
|
||||
"SELECT id, title, summary, link, source, timestamp "
|
||||
"SELECT id, title, link, source, timestamp "
|
||||
"FROM archive "
|
||||
"ORDER BY timestamp "
|
||||
"DESC LIMIT :num"
|
||||
|
@ -469,41 +468,45 @@ async def get_entry_unread(db_file, num=None):
|
|||
for result in results:
|
||||
ix = result[0]
|
||||
title = result[1]
|
||||
summary = result[2]
|
||||
# Remove HTML tags
|
||||
try:
|
||||
summary = BeautifulSoup(summary, "lxml").text
|
||||
except:
|
||||
print(result[2])
|
||||
# TODO Limit text length
|
||||
summary = summary.replace("\n\n\n", "\n\n")
|
||||
length = await get_settings_value(db_file, "length")
|
||||
summary = summary[:length] + " […]"
|
||||
summary = summary.strip().split('\n')
|
||||
summary = ["> " + line for line in summary]
|
||||
summary = "\n".join(summary)
|
||||
link = result[3]
|
||||
# # TODO Retrieve summary from feed
|
||||
# # See datahandler.view_entry
|
||||
# summary = result[2]
|
||||
# # Remove HTML tags
|
||||
# try:
|
||||
# summary = BeautifulSoup(summary, "lxml").text
|
||||
# except:
|
||||
# print(result[2])
|
||||
# breakpoint()
|
||||
# # TODO Limit text length
|
||||
# summary = summary.replace("\n\n\n", "\n\n")
|
||||
# length = await get_settings_value(db_file, "length")
|
||||
# summary = summary[:length] + " […]"
|
||||
# summary = summary.strip().split('\n')
|
||||
# summary = ["> " + line for line in summary]
|
||||
# summary = "\n".join(summary)
|
||||
link = result[2]
|
||||
link = await remove_tracking_parameters(link)
|
||||
sql = (
|
||||
"SELECT name "
|
||||
"FROM feeds "
|
||||
"WHERE address = :source "
|
||||
)
|
||||
source = result[4]
|
||||
source = result[3]
|
||||
feed = cur.execute(sql, (source,))
|
||||
feed = feed.fetchone()[0]
|
||||
if num > 1:
|
||||
news_list += (
|
||||
"\n{}\n{}\n"
|
||||
"\n{}\n{}\n{}\n"
|
||||
).format(
|
||||
str(title),
|
||||
str(link)
|
||||
str(link),
|
||||
str(feed)
|
||||
)
|
||||
else:
|
||||
news_list = (
|
||||
"{}\n\n{}\n\n{}\n{}"
|
||||
"{}\n{}\n{}"
|
||||
).format(
|
||||
str(title),
|
||||
str(summary),
|
||||
str(link),
|
||||
str(feed)
|
||||
)
|
||||
|
@ -532,7 +535,7 @@ async def mark_entry_as_read(cur, ix):
|
|||
"""
|
||||
sql = (
|
||||
"UPDATE entries "
|
||||
"SET summary = '', read = 1 "
|
||||
"SET read = 1 "
|
||||
"WHERE id = ?"
|
||||
)
|
||||
cur.execute(sql, (ix,))
|
||||
|
@ -554,7 +557,7 @@ async def mark_source_as_read(db_file, source):
|
|||
cur = conn.cursor()
|
||||
sql = (
|
||||
"UPDATE entries "
|
||||
"SET summary = '', read = 1 "
|
||||
"SET read = 1 "
|
||||
"WHERE source = ?"
|
||||
)
|
||||
cur.execute(sql, (source,))
|
||||
|
@ -574,7 +577,7 @@ async def mark_all_as_read(db_file):
|
|||
cur = conn.cursor()
|
||||
sql = (
|
||||
"UPDATE entries "
|
||||
"SET summary = '', read = 1 "
|
||||
"SET read = 1 "
|
||||
)
|
||||
cur.execute(sql)
|
||||
sql = (
|
||||
|
@ -892,23 +895,23 @@ async def add_entry(cur, entry):
|
|||
"INSERT "
|
||||
"INTO entries("
|
||||
"title, "
|
||||
"summary, "
|
||||
"link, "
|
||||
"entry_id, "
|
||||
"source, "
|
||||
"timestamp, "
|
||||
"read"
|
||||
") "
|
||||
"VALUES(?, ?, ?, ?, ?, ?, ?)"
|
||||
"VALUES(?, ?, ?, ?, ?, ?)"
|
||||
)
|
||||
try:
|
||||
cur.execute(sql, entry)
|
||||
except:
|
||||
print(current_time(), "COROUTINE OBJECT NOW")
|
||||
print(entry[6])
|
||||
print(type(entry[6]))
|
||||
# for i in entry:
|
||||
# print(type(i))
|
||||
# print(i)
|
||||
# print(type(entry))
|
||||
print(entry)
|
||||
print(type(entry))
|
||||
print(current_time(), "COROUTINE OBJECT NOW")
|
||||
# breakpoint()
|
||||
|
||||
|
|
|
@ -17,7 +17,14 @@ TODO
|
|||
from confighandler import get_list
|
||||
from email.utils import parseaddr
|
||||
import random
|
||||
from urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
from urllib.parse import (
|
||||
parse_qs,
|
||||
urlencode,
|
||||
urljoin,
|
||||
urlparse,
|
||||
urlsplit,
|
||||
urlunsplit
|
||||
)
|
||||
|
||||
|
||||
# NOTE hostname and protocol are listed as one in file
|
||||
|
@ -41,6 +48,7 @@ async def replace_hostname(url):
|
|||
parted_url = urlsplit(url)
|
||||
protocol = parted_url.scheme
|
||||
hostname = parted_url.netloc
|
||||
hostname = hostname.replace("www.","")
|
||||
pathname = parted_url.path
|
||||
queries = parted_url.query
|
||||
fragment = parted_url.fragment
|
||||
|
@ -62,6 +70,41 @@ async def replace_hostname(url):
|
|||
return url
|
||||
|
||||
|
||||
async def remove_tracking_parameters(url):
    """
    Remove queries with tracking parameters.

    The query string is parsed, every parameter whose name appears in
    the "trackers" list of queries.yaml is dropped, and the URL is
    reassembled.  Scheme, host, path and fragment are passed through
    unchanged.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    url : str
        URL.
    """
    parted_url = urlsplit(url)
    protocol = parted_url.scheme
    hostname = parted_url.netloc
    pathname = parted_url.path
    # keep_blank_values=True: by default parse_qs discards parameters
    # with an empty value, which would silently strip legitimate
    # non-tracking parameters (e.g. "?download=") from the URL.
    queries = parse_qs(parted_url.query, keep_blank_values=True)
    fragment = parted_url.fragment
    trackers = await get_list("queries.yaml")
    trackers = trackers["trackers"]
    for tracker in trackers:
        # pop with a default is a no-op when the parameter is absent.
        queries.pop(tracker, None)
    # doseq=True because parse_qs maps each name to a list of values.
    queries_new = urlencode(queries, doseq=True)
    url = urlunsplit([
        protocol,
        hostname,
        pathname,
        queries_new,
        fragment
        ])
    return url
|
||||
|
||||
|
||||
def feed_to_http(url):
|
||||
"""
|
||||
Replace scheme FEED by HTTP.
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
|
||||
"""
|
||||
|
||||
TODO
|
||||
|
||||
1) Split into modules (e.g. slixfeed/xmpp/bookmarks.py)
|
||||
|
||||
FIXME
|
||||
|
||||
1) Function check_readiness or event "changed_status" is causing for
|
||||
|
|
Loading…
Reference in a new issue