Fix archiving functionality

Schimon Jehudah 2024-01-14 21:43:23 +00:00
parent c04a1b6534
commit 80e49a8d38
3 changed files with 105 additions and 268 deletions
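In short, this commit makes the archive workflow operate on feed ids instead of URLs: sqlite.get_feed_id becomes a coroutine that opens its own connection, call sites resolve the id once and pass it to the sqlite helpers (update_feed_validity, update_feed_properties, check_entry_exist, get_entries_of_feed, add_entries_and_update_timestamp), the dead organize_items duplicate of scan is removed, and settings lookups go through the consolidated get_setting_value. A minimal sketch of the resulting call pattern, using only names that appear in the hunks below (illustrative, not a verbatim excerpt):

    feed_id = await sqlite.get_feed_id(db_file, url)
    await sqlite.update_feed_validity(db_file, feed_id, valid)
    await sqlite.update_feed_properties(
        db_file, feed_id, len(feed["entries"]), updated)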

View file

@@ -459,18 +459,12 @@ async def add_feed(db_file, url):
                 )
             await scan(
                 db_file, url)
-            old = (
-                await sqlite.get_settings_value(
-                    db_file, "old")
-                ) or (
-                config.get_value_default(
-                    "settings", "Settings", "old")
-                )
+            old = await get_setting_value(db_file, "old")
             if not old:
                 await sqlite.mark_feed_as_read(
                     db_file, url)
             response = (
-                "> {}\nNews source {} has been "
+                "> {}\nNews source \"{}\" has been "
                 "added to subscription list."
                 ).format(url, title)
             break
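Here and in the archive hunk further down, the two-step fallback (stored setting, else configuration default) collapses into a single get_setting_value call. Its implementation is not part of this diff; a plausible shape, assuming it merely folds the old fallback chain:

    async def get_setting_value(db_file, key):
        # Hypothetical wrapper, not shown in this commit:
        # stored setting first, config default otherwise.
        return (
            await sqlite.get_settings_value(db_file, key)
            ) or (
            config.get_value_default("settings", "Settings", key)
            )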
@@ -657,7 +651,7 @@ async def scan(db_file, url):
         entries = feed.entries
         # length = len(entries)
         await remove_nonexistent_entries(
-            db_file, feed, url)
+            db_file, url, feed)
         try:
             if feed.bozo:
                 # bozo = (
@@ -669,15 +663,17 @@ async def scan(db_file, url):
                 valid = 0
             else:
                 valid = 1
+            feed_id = await sqlite.get_feed_id(db_file, url)
             await sqlite.update_feed_validity(
-                db_file, url, valid)
+                db_file, feed_id, valid)
             if "updated_parsed" in feed["feed"].keys():
                 updated = feed["feed"]["updated_parsed"]
                 updated = convert_struct_time_to_iso8601(updated)
             else:
                 updated = ''
+            feed_id = await sqlite.get_feed_id(db_file, url)
             await sqlite.update_feed_properties(
-                db_file, url, len(feed["entries"]), updated)
+                db_file, feed_id, len(feed["entries"]), updated)
             # await update_feed_status
         except (
             IncompleteReadError,
@@ -706,8 +702,9 @@ async def scan(db_file, url):
             # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
             title = entry.title if entry.has_key("title") else date
             entry_id = entry.id if entry.has_key("id") else link
+            feed_id = await sqlite.get_feed_id(db_file, url)
             exist = await sqlite.check_entry_exist(
-                db_file, url, entry_id=entry_id,
+                db_file, feed_id, entry_id=entry_id,
                 title=title, link=link, date=date)
             if not exist:
                 summary = entry.summary if entry.has_key("summary") else ''
@@ -760,7 +757,6 @@ async def scan(db_file, url):
                     "link": link,
                     "enclosure": media_link,
                     "entry_id": entry_id,
-                    "url": url,
                     "date": date,
                     "read_status": read_status
                     }
@@ -770,8 +766,9 @@ async def scan(db_file, url):
     #         url, date, read_status)
     # await sqlite.set_date(db_file, url)
     if len(new_entries):
+        feed_id = await sqlite.get_feed_id(db_file, url)
         await sqlite.add_entries_and_update_timestamp(
-            db_file, new_entries)
+            db_file, feed_id, new_entries)


 def get_document_title(data):
@@ -912,163 +909,7 @@ async def get_magnet(link):
     return torrent


-# NOTE Why (if res[0]) and (if res[1] == 200)?
-async def organize_items(db_file, urls):
-    """
-    Check feeds for new entries.
-
-    Parameters
-    ----------
-    db_file : str
-        Path to database file.
-    url : str, optional
-        URL. The default is None.
-    """
-    for url in urls:
-        # print(os.path.basename(db_file), url[0])
-        url = url[0]
-        res = await fetch.http(url)
-        # TypeError: 'NoneType' object is not subscriptable
-        if res is None:
-            # Skip to next feed
-            # urls.next()
-            # next(urls)
-            continue
-        status = res[1]
-        await sqlite.update_feed_status(
-            db_file, url, status)
-        if res[0]:
-            try:
-                feed = parse(res[0])
-                if feed.bozo:
-                    # bozo = (
-                    #     "WARNING: Bozo detected for feed: {}\n"
-                    #     "For more information, visit "
-                    #     "https://pythonhosted.org/feedparser/bozo.html"
-                    #     ).format(url)
-                    # print(bozo)
-                    valid = 0
-                else:
-                    valid = 1
-                await sqlite.update_feed_validity(
-                    db_file, url, valid)
-                if "updated_parsed" in feed["feed"].keys():
-                    updated = feed["feed"]["updated_parsed"]
-                    updated = convert_struct_time_to_iso8601(updated)
-                else:
-                    updated = ''
-                entries = len(feed["entries"])
-                await sqlite.update_feed_properties(
-                    db_file, url, entries, updated)
-            except (
-                IncompleteReadError,
-                IncompleteRead,
-                error.URLError
-                ) as e:
-                logging.error(e)
-                # TODO Print error to log
-                # None
-                # NOTE I don't think there should be "return"
-                # because then we might stop scanning next URLs
-                # return
-            # TODO Place these couple of lines back down
-            # NOTE Need to correct the SQL statement to do so
-            # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
-            if status == 200:
-                # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
-                # TODO Place these couple of lines back down
-                # NOTE Need to correct the SQL statement to do so
-                entries = feed.entries
-                # length = len(entries)
-                # await remove_entry(db_file, source, length)
-                await remove_nonexistent_entries(
-                    db_file, feed, url)
-                # new_entry = 0
-                for entry in entries:
-                    # TODO Pass date too for comparion check
-                    if entry.has_key("published"):
-                        date = entry.published
-                        date = rfc2822_to_iso8601(date)
-                    elif entry.has_key("updated"):
-                        date = entry.updated
-                        date = rfc2822_to_iso8601(date)
-                    else:
-                        # TODO Just set date = "*** No date ***"
-                        # date = await datetime.now().isoformat()
-                        date = now()
-                        # NOTE Would seconds result in better database performance
-                        # date = datetime.datetime(date)
-                        # date = (date-datetime.datetime(1970,1,1)).total_seconds()
-                    if entry.has_key("title"):
-                        title = entry.title
-                        # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
-                    else:
-                        title = date
-                        # title = feed["feed"]["title"]
-                    if entry.has_key("link"):
-                        # link = complete_url(source, entry.link)
-                        link = join_url(url, entry.link)
-                        link = trim_url(link)
-                    else:
-                        link = url
-                    if entry.has_key("id"):
-                        eid = entry.id
-                    else:
-                        eid = link
-                    exist = await sqlite.check_entry_exist(
-                        db_file, url, eid=eid,
-                        title=title, link=link, date=date)
-                    if not exist:
-                        # new_entry = new_entry + 1
-                        # TODO Enhance summary
-                        if entry.has_key("summary"):
-                            summary = entry.summary
-                            # # Remove HTML tags
-                            # summary = BeautifulSoup(summary, "lxml").text
-                            # # TODO Limit text length
-                            # summary = summary.replace("\n\n\n", "\n\n")
-                            # summary = summary[:300] + " […]‍⃨"
-                            # summary = summary.strip().split('\n')
-                            # summary = ["> " + line for line in summary]
-                            # summary = "\n".join(summary)
-                        else:
-                            summary = "> *** No summary ***"
-                        read_status = 0
-                        pathname = urlsplit(link).path
-                        string = ("{} {} {}"
-                                  ).format(
-                                      title, summary, pathname
-                                      )
-                        allow_list = await config.is_include_keyword(
-                            db_file, "filter-allow", string)
-                        if not allow_list:
-                            reject_list = await config.is_include_keyword(
-                                db_file, "filter-deny", string)
-                            if reject_list:
-                                # print(">>> REJECTED", title)
-                                summary = (
-                                    "REJECTED {}".format(
-                                        reject_list.upper()
-                                        )
-                                    )
-                                # summary = ""
-                                read_status = 1
-                        entry = (
-                            title, link, eid, url, date, read_status)
-                        if isinstance(date, int):
-                            print("PROBLEM: date is int")
-                            print(date)
-                            # breakpoint()
-                            # print(source)
-                            # print(date)
-                        await sqlite.add_entry_and_set_date(
-                            db_file, url, entry)
-                        # print(current_time(), entry, title)
-                    # else:
-                    #     print(current_time(), exist, title)
-
-
-async def remove_nonexistent_entries(db_file, feed, url):
+async def remove_nonexistent_entries(db_file, url, feed):
     """
     Remove entries that don't exist in a given parsed feed.
     Check the entries returned from feed and delete read non
@@ -1078,15 +919,21 @@ async def remove_nonexistent_entries(db_file, feed, url):
     ----------
     db_file : str
         Path to database file.
-    url : str
-        Feed URL.
     feed : list
         Parsed feed document.
+    url : str
+        URL of associated feed.
     """
-    items = await sqlite.get_entries_of_feed(db_file, feed, url)
+    feed_id = await sqlite.get_feed_id(db_file, url)
+    items = await sqlite.get_entries_of_feed(db_file, feed_id)
     entries = feed.entries
-    # breakpoint()
     for item in items:
+        ix = item[0]
+        entry_title = item[1]
+        entry_link = item[2]
+        entry_id = item[3]
+        timestamp = item[4]
+        read_status = item[5]
         valid = False
         for entry in entries:
             title = None
@@ -1094,10 +941,10 @@ async def remove_nonexistent_entries(db_file, feed, url):
             time = None
             # valid = False
             # TODO better check and don't repeat code
-            if entry.has_key("id") and item[3]:
-                if entry.id == item[3]:
+            if entry.has_key("id") and entry_id:
+                if entry.id == entry_id:
                     # print("compare1:", entry.id)
-                    # print("compare2:", item[3])
+                    # print("compare2:", entry_id)
                     # print("============")
                     valid = True
                     break
@@ -1110,61 +957,57 @@ async def remove_nonexistent_entries(db_file, feed, url):
                 link = join_url(url, entry.link)
             else:
                 link = url
-            if entry.has_key("published") and item[4]:
+            if entry.has_key("published") and timestamp:
                 # print("compare11:", title, link, time)
-                # print("compare22:", item[1], item[2], item[4])
+                # print("compare22:", entry_title, entry_link, timestamp)
                 # print("============")
                 time = rfc2822_to_iso8601(entry.published)
-                if (item[1] == title and
-                    item[2] == link and
-                    item[4] == time):
+                if (entry_title == title and
+                    entry_link == link and
+                    timestamp == time):
                     valid = True
                     break
             else:
-                if (item[1] == title and
-                    item[2] == link):
+                if (entry_title == title and
+                    entry_link == link):
                     # print("compare111:", title, link)
-                    # print("compare222:", item[1], item[2])
+                    # print("compare222:", entry_title, entry_link)
                     # print("============")
                     valid = True
                     break
         # TODO better check and don't repeat code
         if not valid:
-            # print("id: ", item[0])
+            # print("id: ", ix)
             # if title:
             #     print("title: ", title)
-            #     print("item[1]: ", item[1])
+            #     print("entry_title: ", entry_title)
             # if link:
             #     print("link: ", link)
-            #     print("item[2]: ", item[2])
+            #     print("entry_link: ", entry_link)
             # if entry.id:
            #     print("last_entry:", entry.id)
-            #     print("item[3]: ", item[3])
+            #     print("entry_id: ", entry_id)
             # if time:
             #     print("time: ", time)
-            #     print("item[4]: ", item[4])
-            # print("read: ", item[5])
+            #     print("timestamp: ", timestamp)
+            # print("read: ", read_status)
             # breakpoint()
             # TODO Send to table archive
             # TODO Also make a regular/routine check for sources that
             # have been changed (though that can only happen when
            # manually editing)
-            ix = item[0]
+            # ix = item[0]
             # print(">>> SOURCE: ", source)
-            # print(">>> INVALID:", item[1])
-            # print("title:", item[1])
-            # print("link :", item[2])
-            # print("id :", item[3])
-            if item[5] == 1:
+            # print(">>> INVALID:", entry_title)
+            # print("title:", entry_title)
+            # print("link :", entry_link)
+            # print("id :", entry_id)
+            if read_status == 1:
                 await sqlite.delete_entry_by_id(db_file, ix)
-                # print(">>> DELETING:", item[1])
+                # print(">>> DELETING:", entry_title)
             else:
-                # print(">>> ARCHIVING:", item[1])
+                # print(">>> ARCHIVING:", entry_title)
                 await sqlite.archive_entry(db_file, ix)
-    limit = (
-        await sqlite.get_settings_value(db_file, "archive")
-        ) or (
-        config.get_value_default("settings", "Settings", "archive")
-        )
+    limit = await get_setting_value(db_file, "archive")
     await sqlite.maintain_archive(db_file, limit)
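The hunk above is the core of the archiving fix: entries that no longer appear in the parsed feed are split by read state inside the loop, and the archive is trimmed once afterwards. Condensed from the new code (illustrative):

    if read_status == 1:
        await sqlite.delete_entry_by_id(db_file, ix)   # read: delete outright
    else:
        await sqlite.archive_entry(db_file, ix)        # unread: move to archive
    # after the loop: keep the archive within the configured limit
    limit = await get_setting_value(db_file, "archive")
    await sqlite.maintain_archive(db_file, limit)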

View file

@@ -467,7 +467,15 @@ async def insert_feed_(
             title, url
             )
         cur.execute(sql, par)
-        feed_id = get_feed_id(cur, url)
+        sql = (
+            """
+            SELECT id
+            FROM feeds
+            WHERE url = :url
+            """
+            )
+        par = (url,)
+        feed_id = cur.execute(sql, par).fetchone()[0]
         insert_feed_properties(
             cur, feed_id, entries=None,
             version=None, encoding=None, language=None)
@@ -705,14 +713,14 @@ async def get_unread_entries(db_file, num):
     return results


-def get_feed_id(cur, url):
+async def get_feed_id(db_file, url):
     """
     Get index of given feed.

     Parameters
     ----------
-    cur : object
-        Cursor object.
+    db_file : str
+        Path to database file.
     url : str
         URL.
@@ -721,16 +729,18 @@ def get_feed_id(cur, url):
     feed_id : str
         Feed index.
     """
-    sql = (
-        """
-        SELECT id
-        FROM feeds
-        WHERE url = :url
-        """
-        )
-    par = (url,)
-    feed_id = cur.execute(sql, par).fetchone()[0]
-    return feed_id
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = (
+            """
+            SELECT id
+            FROM feeds
+            WHERE url = :url
+            """
+            )
+        par = (url,)
+        feed_id = cur.execute(sql, par).fetchone()[0]
+        return feed_id


 async def mark_entry_as_read(cur, ix):
@@ -1050,7 +1060,7 @@ were async, there were errors of coroutines
 """


 async def add_entry(
-        db_file, title, link, entry_id, url, date, read_status):
+        db_file, title, link, entry_id, feed_id, date, read_status):
     """
     Add a new entry row into the entries table.
@@ -1064,8 +1074,8 @@ async def add_entry(
         Link.
     entry_id : str
         Entry index.
-    url : str
-        URL.
+    feed_id : str
+        Feed Id.
     date : str
         Date.
     read_status : str
@@ -1074,7 +1084,6 @@ async def add_entry(
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 INSERT
@@ -1110,7 +1119,7 @@ async def add_entry(
             # # breakpoint()


-async def add_entries_and_update_timestamp(db_file, new_entries):
+async def add_entries_and_update_timestamp(db_file, feed_id, new_entries):
     """
     Add new entries.
@@ -1124,10 +1133,7 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feeds = []
             for entry in new_entries:
-                url = entry["url"]
-                feed_id = get_feed_id(cur, url)
                 sql = (
                     """
                     INSERT
@@ -1147,26 +1153,21 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
                     "read": entry["read_status"]
                     }
                 cur.execute(sql, par)
-                if url not in feeds:
-                    feeds.extend([url])
-            for feed in feeds:
-                url = feed
-                feed_id = get_feed_id(cur, url)
-                sql = (
-                    """
-                    UPDATE status
-                    SET renewed = :today
-                    WHERE feed_id = :feed_id
-                    """
-                    )
-                par = {
-                    "today": date.today(),
-                    "feed_id": feed_id
-                    }
-                cur.execute(sql, par)
+            sql = (
+                """
+                UPDATE status
+                SET renewed = :today
+                WHERE feed_id = :feed_id
+                """
+                )
+            par = {
+                "today": date.today(),
+                "feed_id": feed_id
+                }
+            cur.execute(sql, par)


-async def set_date(db_file, url):
+async def set_date(db_file, feed_id):
     """
     Set renewed date of given feed.
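With feed_id now supplied by the caller, the per-entry bookkeeping on the left (feeds = [], url = entry["url"], a second loop resolving ids) disappears: a batch belongs to one feed, so status.renewed is written once after the insert loop. The matching call site from the first file's scan hunk:

    feed_id = await sqlite.get_feed_id(db_file, url)
    await sqlite.add_entries_and_update_timestamp(
        db_file, feed_id, new_entries)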
@@ -1174,13 +1175,12 @@ async def set_date(db_file, url):
     ----------
     db_file : str
         Path to database file.
-    url : str
-        URL.
+    feed_id : str
+        Feed Id.
     """
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1196,7 +1196,7 @@ async def set_date(db_file, url):
             cur.execute(sql, par)


-async def update_feed_status(db_file, url, status_code):
+async def update_feed_status(db_file, feed_id, status_code):
     """
     Set status_code of feed_id in table status.
@@ -1212,7 +1212,6 @@ async def update_feed_status(db_file, url, status_code):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1228,7 +1227,7 @@ async def update_feed_status(db_file, url, status_code):
             cur.execute(sql, par)


-async def update_feed_validity(db_file, url, valid):
+async def update_feed_validity(db_file, feed_id, valid):
     """
     Set validity status of feed_id in table status.
@@ -1244,7 +1243,6 @@ async def update_feed_validity(db_file, url, valid):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1259,7 +1257,7 @@ async def update_feed_validity(db_file, url, valid):
             cur.execute(sql, par)


-async def update_feed_properties(db_file, url, entries, updated):
+async def update_feed_properties(db_file, feed_id, entries, updated):
     """
     Update properties of url in table feeds.
@@ -1277,7 +1275,6 @@ async def update_feed_properties(db_file, url, entries, updated):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE properties
@@ -1343,7 +1340,7 @@ async def maintain_archive(db_file, limit):
 # NOTE Entries that are read from archive are deleted.
 # NOTE Unlike entries from table entries, entries from
 # table archive are not marked as read.
-async def get_entries_of_feed(db_file, feed, url):
+async def get_entries_of_feed(db_file, feed_id):
     """
     Remove entries that don't exist in a given parsed feed.
     Check the entries returned from feed and delete read non
@@ -1353,10 +1350,8 @@ async def get_entries_of_feed(db_file, feed, url):
     ----------
     db_file : str
         Path to database file.
-    feed : list
-        Parsed feed document.
-    url : str
-        Feed URL.
+    feed_id : str
+        Feed Id.
     """
     with create_connection(db_file) as conn:
         cur = conn.cursor()
@@ -1367,7 +1362,7 @@ async def get_entries_of_feed(db_file, feed, url):
             WHERE feed_id = ?
             """
             )
-        par = (url,)
+        par = (feed_id,)
         items = cur.execute(sql, par).fetchall()
         return items
@@ -1587,7 +1582,7 @@ ERROR DATE: result = https://blog.heckel.io/feed/
 """
 async def check_entry_exist(
-        db_file, url, entry_id=None, title=None, link=None, date=None):
+        db_file, feed_id, entry_id=None, title=None, link=None, date=None):
     """
     Check whether an entry exists.
     If entry has an ID, check by ID.
@@ -1598,8 +1593,8 @@ async def check_entry_exist(
     ----------
     db_file : str
         Path to database file.
-    source : str
-        Feed URL.
+    feed_id : str
+        Feed Id.
     entry_id : str, optional
         Entry ID. The default is None.
     title : str, optional
@@ -1618,7 +1613,6 @@ async def check_entry_exist(
         cur = conn.cursor()
         exist = False
         if entry_id:
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 SELECT id
@@ -1643,13 +1637,13 @@ async def check_entry_exist(
             par = {
                 "title": title,
                 "link": link,
-                "timestamp": date
+                "date": date
                 }
             try:
                 result = cur.execute(sql, par).fetchone()
                 if result: exist = True
             except:
-                print(current_time(), "ERROR DATE: source =", url)
+                print(current_time(), "ERROR DATE: source =", feed_id)
                 print(current_time(), "ERROR DATE: date =", date)
             else:
                 sql = (
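The one-word change from "timestamp": date to "date": date is a real bug fix: with sqlite3 named placeholders, the keys of the parameter mapping must match the :name placeholders in the SQL (here presumably :date), and a mismatched key raises sqlite3.ProgrammingError, which this function's bare except turns into the "ERROR DATE" prints seen in the hunk header. A standalone illustration (not code from this repository):

    import sqlite3

    conn = sqlite3.connect(":memory:")  # throwaway in-memory database
    cur = conn.cursor()
    cur.execute("CREATE TABLE entries (title TEXT, timestamp TEXT)")
    sql = "SELECT * FROM entries WHERE timestamp = :date"
    cur.execute(sql, {"date": "2024-01-14"})        # key matches :date, works
    # cur.execute(sql, {"timestamp": "2024-01-14"})
    #   -> sqlite3.ProgrammingError: no value supplied for :date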

View file

@@ -838,8 +838,8 @@ async def message(self, message):
                     url = await sqlite.remove_feed_by_index(
                         db_file, ix)
                     response = (
-                        "> {}\nNews source {} has been removed "
-                        "from subscription list."
+                        "> {}\nNews source \"{}\" has been "
+                        "removed from subscription list."
                         ).format(url, ix)
                 except:
                     response = (