From 80e49a8d38cdd7fb481e4de6b70acd51fb47b736 Mon Sep 17 00:00:00 2001
From: Schimon Jehudah
Date: Sun, 14 Jan 2024 21:43:23 +0000
Subject: [PATCH] Fix archiving functionality

---
 slixfeed/action.py       | 253 ++++++++-------------------------
 slixfeed/sqlite.py       | 116 +++++++++---------
 slixfeed/xmpp/process.py |   4 +-
 3 files changed, 105 insertions(+), 268 deletions(-)

diff --git a/slixfeed/action.py b/slixfeed/action.py
index 7454358..4e7e853 100644
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@@ -459,18 +459,12 @@ async def add_feed(db_file, url):
                     )
                 await scan(
                     db_file, url)
-                old = (
-                    await sqlite.get_settings_value(
-                        db_file, "old")
-                    ) or (
-                    config.get_value_default(
-                        "settings", "Settings", "old")
-                    )
+                old = await get_setting_value(db_file, "old")
                 if not old:
                     await sqlite.mark_feed_as_read(
                         db_file, url)
                 response = (
-                    "> {}\nNews source {} has been "
+                    "> {}\nNews source \"{}\" has been "
                     "added to subscription list."
                     ).format(url, title)
                 break
@@ -657,7 +651,7 @@ async def scan(db_file, url):
         entries = feed.entries
         # length = len(entries)
         await remove_nonexistent_entries(
-            db_file, feed, url)
+            db_file, url, feed)
         try:
             if feed.bozo:
                 # bozo = (
@@ -669,15 +663,17 @@ async def scan(db_file, url):
                 valid = 0
             else:
                 valid = 1
+            feed_id = await sqlite.get_feed_id(db_file, url)
             await sqlite.update_feed_validity(
-                db_file, url, valid)
+                db_file, feed_id, valid)
             if "updated_parsed" in feed["feed"].keys():
                 updated = feed["feed"]["updated_parsed"]
                 updated = convert_struct_time_to_iso8601(updated)
             else:
                 updated = ''
+            feed_id = await sqlite.get_feed_id(db_file, url)
             await sqlite.update_feed_properties(
-                db_file, url, len(feed["entries"]), updated)
+                db_file, feed_id, len(feed["entries"]), updated)
             # await update_feed_status
         except (
             IncompleteReadError,
@@ -706,8 +702,9 @@ async def scan(db_file, url):
             # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
             title = entry.title if entry.has_key("title") else date
             entry_id = entry.id if entry.has_key("id") else link
+            feed_id = await sqlite.get_feed_id(db_file, url)
             exist = await sqlite.check_entry_exist(
-                db_file, url, entry_id=entry_id,
+                db_file, feed_id, entry_id=entry_id,
                 title=title, link=link, date=date)
             if not exist:
                 summary = entry.summary if entry.has_key("summary") else ''
@@ -760,7 +757,6 @@ async def scan(db_file, url):
                     "link": link,
                     "enclosure": media_link,
                     "entry_id": entry_id,
-                    "url": url,
                     "date": date,
                     "read_status": read_status
                     }
@@ -770,8 +766,9 @@ async def scan(db_file, url):
             #         url, date, read_status)
             # await sqlite.set_date(db_file, url)
     if len(new_entries):
+        feed_id = await sqlite.get_feed_id(db_file, url)
         await sqlite.add_entries_and_update_timestamp(
-            db_file, new_entries)
+            db_file, feed_id, new_entries)
 
 
 def get_document_title(data):
@@ -912,163 +909,7 @@ async def get_magnet(link):
     return torrent
 
 
-# NOTE Why (if res[0]) and (if res[1] == 200)?
-async def organize_items(db_file, urls):
-    """
-    Check feeds for new entries.
-
-    Parameters
-    ----------
-    db_file : str
-        Path to database file.
-    url : str, optional
-        URL. The default is None.
-    """
-    for url in urls:
-        # print(os.path.basename(db_file), url[0])
-        url = url[0]
-        res = await fetch.http(url)
-        # TypeError: 'NoneType' object is not subscriptable
-        if res is None:
-            # Skip to next feed
-            # urls.next()
-            # next(urls)
-            continue
-        status = res[1]
-        await sqlite.update_feed_status(
-            db_file, url, status)
-        if res[0]:
-            try:
-                feed = parse(res[0])
-                if feed.bozo:
-                    # bozo = (
-                    #     "WARNING: Bozo detected for feed: {}\n"
-                    #     "For more information, visit "
-                    #     "https://pythonhosted.org/feedparser/bozo.html"
-                    #     ).format(url)
-                    # print(bozo)
-                    valid = 0
-                else:
-                    valid = 1
-                await sqlite.update_feed_validity(
-                    db_file, url, valid)
-                if "updated_parsed" in feed["feed"].keys():
-                    updated = feed["feed"]["updated_parsed"]
-                    updated = convert_struct_time_to_iso8601(updated)
-                else:
-                    updated = ''
-                entries = len(feed["entries"])
-                await sqlite.update_feed_properties(
-                    db_file, url, entries, updated)
-            except (
-                IncompleteReadError,
-                IncompleteRead,
-                error.URLError
-                ) as e:
-                logging.error(e)
-                # TODO Print error to log
-                # None
-                # NOTE I don't think there should be "return"
-                #      because then we might stop scanning next URLs
-                # return
-            # TODO Place these couple of lines back down
-            # NOTE Need to correct the SQL statement to do so
-            # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
-            if status == 200:
-                # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
-                # TODO Place these couple of lines back down
-                # NOTE Need to correct the SQL statement to do so
-                entries = feed.entries
-                # length = len(entries)
-                # await remove_entry(db_file, source, length)
-                await remove_nonexistent_entries(
-                    db_file, feed, url)
-                # new_entry = 0
-                for entry in entries:
-                    # TODO Pass date too for comparion check
-                    if entry.has_key("published"):
-                        date = entry.published
-                        date = rfc2822_to_iso8601(date)
-                    elif entry.has_key("updated"):
-                        date = entry.updated
-                        date = rfc2822_to_iso8601(date)
-                    else:
-                        # TODO Just set date = "*** No date ***"
-                        # date = await datetime.now().isoformat()
-                        date = now()
-                        # NOTE Would seconds result in better database performance
-                        # date = datetime.datetime(date)
-                        # date = (date-datetime.datetime(1970,1,1)).total_seconds()
-                    if entry.has_key("title"):
-                        title = entry.title
-                        # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
-                    else:
-                        title = date
-                        # title = feed["feed"]["title"]
-                    if entry.has_key("link"):
-                        # link = complete_url(source, entry.link)
-                        link = join_url(url, entry.link)
-                        link = trim_url(link)
-                    else:
-                        link = url
-                    if entry.has_key("id"):
-                        eid = entry.id
-                    else:
-                        eid = link
-                    exist = await sqlite.check_entry_exist(
-                        db_file, url, eid=eid,
-                        title=title, link=link, date=date)
-                    if not exist:
-                        # new_entry = new_entry + 1
-                        # TODO Enhance summary
-                        if entry.has_key("summary"):
-                            summary = entry.summary
-                            # # Remove HTML tags
-                            # summary = BeautifulSoup(summary, "lxml").text
-                            # # TODO Limit text length
-                            # summary = summary.replace("\n\n\n", "\n\n")
-                            # summary = summary[:300] + " […]‍⃨"
-                            # summary = summary.strip().split('\n')
-                            # summary = ["> " + line for line in summary]
-                            # summary = "\n".join(summary)
-                        else:
-                            summary = "> *** No summary ***"
-                        read_status = 0
-                        pathname = urlsplit(link).path
-                        string = ("{} {} {}"
-                                  ).format(
-                                      title, summary, pathname
-                                      )
-                        allow_list = await config.is_include_keyword(
-                            db_file, "filter-allow", string)
-                        if not allow_list:
-                            reject_list = await config.is_include_keyword(
-                                db_file, "filter-deny", string)
-                            if reject_list:
-                                # print(">>> REJECTED", title)
-                                summary = (
-                                    "REJECTED {}".format(
-                                        reject_list.upper()
-                                        )
-                                    )
-                                # summary = ""
-                                read_status = 1
-                        entry = (
-                            title, link, eid, url, date, read_status)
-                        if isinstance(date, int):
-                            print("PROBLEM: date is int")
-                            print(date)
-                            # breakpoint()
-                            # print(source)
-                            # print(date)
-                        await sqlite.add_entry_and_set_date(
-                            db_file, url, entry)
-                        # print(current_time(), entry, title)
-                    # else:
-                    #     print(current_time(), exist, title)
-
-
-async def remove_nonexistent_entries(db_file, feed, url):
+async def remove_nonexistent_entries(db_file, url, feed):
     """
     Remove entries that don't exist in a given parsed feed.
     Check the entries returned from feed and delete read non
@@ -1078,15 +919,21 @@ async def remove_nonexistent_entries(db_file, feed, url):
     ----------
     db_file : str
         Path to database file.
+    url : str
+        Feed URL.
     feed : list
         Parsed feed document.
-    url : str
-        Feed URL. URL of associated feed.
     """
-    items = await sqlite.get_entries_of_feed(db_file, feed, url)
+    feed_id = await sqlite.get_feed_id(db_file, url)
+    items = await sqlite.get_entries_of_feed(db_file, feed_id)
     entries = feed.entries
-    # breakpoint()
     for item in items:
+        ix = item[0]
+        entry_title = item[1]
+        entry_link = item[2]
+        entry_id = item[3]
+        timestamp = item[4]
+        read_status = item[5]
         valid = False
         for entry in entries:
             title = None
@@ -1094,10 +941,10 @@ async def remove_nonexistent_entries(db_file, feed, url):
             time = None
             # valid = False
             # TODO better check and don't repeat code
-            if entry.has_key("id") and item[3]:
-                if entry.id == item[3]:
+            if entry.has_key("id") and entry_id:
+                if entry.id == entry_id:
                 # print("compare1:", entry.id)
-                # print("compare2:", item[3])
+                # print("compare2:", entry_id)
                 # print("============")
                     valid = True
                     break
@@ -1110,61 +957,57 @@ async def remove_nonexistent_entries(db_file, feed, url):
                 link = join_url(url, entry.link)
             else:
                 link = url
-            if entry.has_key("published") and item[4]:
+            if entry.has_key("published") and timestamp:
                 # print("compare11:", title, link, time)
-                # print("compare22:", item[1], item[2], item[4])
+                # print("compare22:", entry_title, entry_link, timestamp)
                 # print("============")
                 time = rfc2822_to_iso8601(entry.published)
-                if (item[1] == title and
-                    item[2] == link and
-                    item[4] == time):
+                if (entry_title == title and
+                    entry_link == link and
+                    timestamp == time):
                     valid = True
                     break
             else:
-                if (item[1] == title and
-                    item[2] == link):
+                if (entry_title == title and
+                    entry_link == link):
                     # print("compare111:", title, link)
-                    # print("compare222:", item[1], item[2])
+                    # print("compare222:", entry_title, entry_link)
                     # print("============")
                     valid = True
                     break
         # TODO better check and don't repeat code
         if not valid:
-            # print("id: ", item[0])
+            # print("id: ", ix)
             # if title:
             #     print("title: ", title)
-            #     print("item[1]: ", item[1])
+            #     print("entry_title: ", entry_title)
            # if link:
            #     print("link: ", link)
-            #     print("item[2]: ", item[2])
+            #     print("entry_link: ", entry_link)
            # if entry.id:
            #     print("last_entry:", entry.id)
-            #     print("item[3]: ", item[3])
+            #     print("entry_id: ", entry_id)
            # if time:
            #     print("time: ", time)
-            #     print("item[4]: ", item[4])
-            #     print("read: ", item[5])
+            #     print("timestamp: ", timestamp)
+            #     print("read: ", read_status)
            # breakpoint()
            # TODO Send to table archive
            # TODO Also make a regular/routine check for sources that
            #      have been changed (though that can only happen when
            #      manually editing)
-            ix = item[0]
+            # ix = item[0]
            # print(">>> SOURCE: ", source)
-            # print(">>> INVALID:", item[1])
-            # print("title:", item[1])
-            # print("link :", item[2])
-            # print("id   :", item[3])
-            if item[5] == 1:
+            # print(">>> INVALID:", entry_title)
+            # print("title:", entry_title)
+            # print("link :", entry_link)
+            # print("id   :", entry_id)
+            if read_status == 1:
                 await sqlite.delete_entry_by_id(db_file, ix)
-                # print(">>> DELETING:", item[1])
+                # print(">>> DELETING:", entry_title)
            else:
-                # print(">>> ARCHIVING:", item[1])
+                # print(">>> ARCHIVING:", entry_title)
                 await sqlite.archive_entry(db_file, ix)
-    limit = (
-        await sqlite.get_settings_value(db_file, "archive")
-        ) or (
-        config.get_value_default("settings", "Settings", "archive")
-        )
+    limit = await get_setting_value(db_file, "archive")
     await sqlite.maintain_archive(db_file, limit)
 
diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py
index 84117be..aeceeb9 100644
--- a/slixfeed/sqlite.py
+++ b/slixfeed/sqlite.py
@@ -467,7 +467,15 @@ async def insert_feed_(
             title, url
             )
         cur.execute(sql, par)
-        feed_id = get_feed_id(cur, url)
+        sql = (
+            """
+            SELECT id
+            FROM feeds
+            WHERE url = :url
+            """
+            )
+        par = (url,)
+        feed_id = cur.execute(sql, par).fetchone()[0]
         insert_feed_properties(
             cur, feed_id, entries=None,
             version=None, encoding=None, language=None)
@@ -705,14 +713,14 @@ async def get_unread_entries(db_file, num):
     return results
 
 
-def get_feed_id(cur, url):
+async def get_feed_id(db_file, url):
     """
     Get index of given feed.
 
     Parameters
     ----------
-    cur : object
-        Cursor object.
+    db_file : str
+        Path to database file.
     url : str
         URL.
 
@@ -721,16 +729,18 @@ def get_feed_id(cur, url):
     feed_id : str
         Feed index.
     """
-    sql = (
-        """
-        SELECT id
-        FROM feeds
-        WHERE url = :url
-        """
-        )
-    par = (url,)
-    feed_id = cur.execute(sql, par).fetchone()[0]
-    return feed_id
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = (
+            """
+            SELECT id
+            FROM feeds
+            WHERE url = :url
+            """
+            )
+        par = (url,)
+        feed_id = cur.execute(sql, par).fetchone()[0]
+        return feed_id
 
 
 async def mark_entry_as_read(cur, ix):
@@ -1050,7 +1060,7 @@ were async, there were errors of coroutines
 """
 
 async def add_entry(
-        db_file, title, link, entry_id, url, date, read_status):
+        db_file, title, link, entry_id, feed_id, date, read_status):
     """
     Add a new entry row into the entries table.
 
@@ -1064,8 +1074,8 @@ async def add_entry(
         Link.
     entry_id : str
         Entry index.
-    url : str
-        URL.
+    feed_id : str
+        Feed Id.
     date : str
         Date.
     read_status : str
@@ -1074,7 +1084,6 @@ async def add_entry(
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 INSERT
@@ -1110,7 +1119,7 @@ async def add_entry(
 #     # breakpoint()
 
 
-async def add_entries_and_update_timestamp(db_file, new_entries):
+async def add_entries_and_update_timestamp(db_file, feed_id, new_entries):
     """
     Add new entries.
 
@@ -1124,10 +1133,7 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feeds = []
             for entry in new_entries:
-                url = entry["url"]
-                feed_id = get_feed_id(cur, url)
                 sql = (
                     """
                     INSERT
@@ -1147,26 +1153,21 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
                     "read": entry["read_status"]
                     }
                 cur.execute(sql, par)
-                if url not in feeds:
-                    feeds.extend([url])
-            for feed in feeds:
-                url = feed
-                feed_id = get_feed_id(cur, url)
-                sql = (
-                    """
-                    UPDATE status
-                    SET renewed = :today
-                    WHERE feed_id = :feed_id
-                    """
-                    )
-                par = {
-                    "today": date.today(),
-                    "feed_id": feed_id
-                    }
-                cur.execute(sql, par)
+            sql = (
+                """
+                UPDATE status
+                SET renewed = :today
+                WHERE feed_id = :feed_id
+                """
+                )
+            par = {
+                "today": date.today(),
+                "feed_id": feed_id
+                }
+            cur.execute(sql, par)
 
 
-async def set_date(db_file, url):
+async def set_date(db_file, feed_id):
     """
     Set renewed date of given feed.
 
@@ -1174,13 +1175,12 @@ async def set_date(db_file, url):
     ----------
     db_file : str
         Path to database file.
-    url : str
-        URL.
+    feed_id : str
+        Feed Id.
     """
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1196,7 +1196,7 @@ async def set_date(db_file, url):
             cur.execute(sql, par)
 
 
-async def update_feed_status(db_file, url, status_code):
+async def update_feed_status(db_file, feed_id, status_code):
     """
     Set status_code of feed_id in table status.
 
@@ -1212,7 +1212,6 @@ async def update_feed_status(db_file, url, status_code):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1228,7 +1227,7 @@ async def update_feed_status(db_file, url, status_code):
             cur.execute(sql, par)
 
 
-async def update_feed_validity(db_file, url, valid):
+async def update_feed_validity(db_file, feed_id, valid):
     """
     Set validity status of feed_id in table status.
 
@@ -1244,7 +1243,6 @@ async def update_feed_validity(db_file, url, valid):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE status
@@ -1259,7 +1257,7 @@ async def update_feed_validity(db_file, url, valid):
             cur.execute(sql, par)
 
 
-async def update_feed_properties(db_file, url, entries, updated):
+async def update_feed_properties(db_file, feed_id, entries, updated):
     """
     Update properties of url in table feeds.
 
@@ -1277,7 +1275,6 @@ async def update_feed_properties(db_file, url, entries, updated):
     async with DBLOCK:
         with create_connection(db_file) as conn:
             cur = conn.cursor()
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 UPDATE properties
@@ -1343,7 +1340,7 @@ async def maintain_archive(db_file, limit):
 # NOTE Entries that are read from archive are deleted.
 # NOTE Unlike entries from table entries, entries from
 #      table archive are not marked as read.
-async def get_entries_of_feed(db_file, feed, url):
+async def get_entries_of_feed(db_file, feed_id):
     """
     Remove entries that don't exist in a given parsed feed.
     Check the entries returned from feed and delete read non
@@ -1353,10 +1350,8 @@ async def get_entries_of_feed(db_file, feed, url):
     ----------
     db_file : str
         Path to database file.
-    feed : list
-        Parsed feed document.
-    url : str
-        Feed URL. URL of associated feed.
+    feed_id : str
+        Feed Id.
     """
     with create_connection(db_file) as conn:
         cur = conn.cursor()
@@ -1367,7 +1362,7 @@ async def get_entries_of_feed(db_file, feed, url):
             WHERE feed_id = ?
             """
             )
-        par = (url,)
+        par = (feed_id,)
         items = cur.execute(sql, par).fetchall()
         return items
 
@@ -1587,7 +1582,7 @@ ERROR DATE: result = https://blog.heckel.io/feed/
 """
 
 async def check_entry_exist(
-        db_file, url, entry_id=None, title=None, link=None, date=None):
+        db_file, feed_id, entry_id=None, title=None, link=None, date=None):
     """
     Check whether an entry exists.
     If entry has an ID, check by ID.
@@ -1598,8 +1593,8 @@ async def check_entry_exist(
     ----------
     db_file : str
         Path to database file.
-    source : str
-        Feed URL. URL of associated feed.
+    feed_id : str
+        Feed Id.
     entry_id : str, optional
         Entry ID. The default is None.
     title : str, optional
@@ -1618,7 +1613,6 @@ async def check_entry_exist(
         cur = conn.cursor()
         exist = False
         if entry_id:
-            feed_id = get_feed_id(cur, url)
             sql = (
                 """
                 SELECT id
@@ -1643,13 +1637,13 @@ async def check_entry_exist(
             par = {
                 "title": title,
                 "link": link,
-                "timestamp": date
+                "date": date
                 }
             try:
                 result = cur.execute(sql, par).fetchone()
                 if result:
                     exist = True
             except:
-                print(current_time(), "ERROR DATE: source =", url)
+                print(current_time(), "ERROR DATE: source =", feed_id)
                 print(current_time(), "ERROR DATE: date =", date)
         else:
             sql = (
diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py
index a7794b5..80ce9a8 100644
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@@ -838,8 +838,8 @@ async def message(self, message):
                     url = await sqlite.remove_feed_by_index(
                         db_file, ix)
                     response = (
-                        "> {}\nNews source {} has been removed "
-                        "from subscription list."
+                        "> {}\nNews source \"{}\" has been "
+                        "removed from subscription list."
                         ).format(url, ix)
                 except:
                     response = (