From 43fa1a463ccc5037a609f9d2d026dc397e87c2fb Mon Sep 17 00:00:00 2001
From: Schimon Jehudah
Date: Sat, 13 Jan 2024 17:17:43 +0000
Subject: [PATCH] Save enclosures

Send new message upon media detection
---
 slixfeed/action.py       | 170 +++++++++++++++--------
 slixfeed/fetch.py        |  17 ++-
 slixfeed/sqlite.py       | 293 +++++++++++++++++++--------------------
 slixfeed/task.py         |  66 +++++----
 slixfeed/xmpp/process.py |  51 ++++---
 5 files changed, 348 insertions(+), 249 deletions(-)

diff --git a/slixfeed/action.py b/slixfeed/action.py
index 0d68792..dac7cb1 100644
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@@ -42,7 +42,7 @@ from slixfeed.url import (
 )
 import slixfeed.xmpp.bookmark as bookmark
 from urllib import error
-from urllib.parse import urlsplit
+from urllib.parse import parse_qs, urlsplit
 import xml.etree.ElementTree as ET

 try:
@@ -688,9 +688,34 @@ async def scan(db_file, url):
             if isinstance(date, int):
                 logging.error(
                     "Variable 'date' is int: {}".format(date))
+            media_link = ''
+            if entry.has_key("links"):
+                for e_link in entry.links:
+                    try:
+                        # if (link.rel == "enclosure" and
+                        #     (link.type.startswith("audio/") or
+                        #      link.type.startswith("image/") or
+                        #      link.type.startswith("video/"))
+                        #     ):
+                        media_type = e_link.type[:e_link.type.index("/")]
+                        if e_link.has_key("rel"):
+                            if (e_link.rel == "enclosure" and
+                                media_type in ("audio", "image", "video")):
+                                media_link = e_link.href
+                                media_link = join_url(url, e_link.href)
+                                media_link = trim_url(media_link)
+                                break
+                    except:
+                        logging.error(
+                            "KeyError: 'href'\n"
+                            "Missing 'href' attribute for {}".format(url))
+                        logging.info(
+                            "Continue scanning for next potential "
+                            "enclosure of {}".format(link))
             entry = {
                 "title": title,
                 "link": link,
+                "enclosure": media_link,
                 "entry_id": entry_id,
                 "url": url,
                 "date": date,
@@ -706,42 +731,47 @@ async def scan(db_file, url):
                 db_file, new_entries)


-async def generate_document(url, ext, filename):
-    result = await fetch.http(url)
-    data = result[0]
-    code = result[1]
-    status = None
-    if data:
-        try:
-            document = Document(data)
-            content = document.summary()
-        except:
-            logging.warning(
-                "Check that package readability is installed.")
-        match ext:
-            case "html":
-                generate_html(content, filename)
-            case "md":
-                try:
-                    generate_markdown(content, filename)
-                except:
-                    logging.warning(
-                        "Check that package html2text is installed.")
-                    status = (
-                        "Package html2text was not found.")
-            case "pdf":
-                try:
-                    generate_pdf(content, filename)
-                except:
-                    logging.warning(
-                        "Check that packages pdfkit and wkhtmltopdf "
-                        "are installed.")
-                    status = (
-                        "Package pdfkit or wkhtmltopdf was not found.")
-    else:
-        status = code
-    if status:
-        return status
+def get_document_title(data):
+    try:
+        document = Document(data)
+        title = document.short_title()
+    except:
+        document = BeautifulSoup(data, 'html.parser')
+        title = document.title.string
+    return title
+
+
+def generate_document(data, url, ext, filename):
+    error = None
+    try:
+        document = Document(data)
+        content = document.summary()
+    except:
+        content = data
+        logging.warning(
+            "Check that package readability is installed.")
+    match ext:
+        case "html":
+            generate_html(content, filename)
+        case "md":
+            try:
+                generate_markdown(content, filename)
+            except:
+                logging.warning(
+                    "Check that package html2text is installed.")
+                error = (
+                    "Package html2text was not found.")
+        case "pdf":
+            try:
+                generate_pdf(content, filename)
+            except:
+                logging.warning(
+                    "Check that packages pdfkit and wkhtmltopdf "
+                    "are installed.")
+                error = (
+                    "Package pdfkit or wkhtmltopdf was not found.")
+    if error:
+        return error

 # TODO Either adapt it to filename
 # or change it to something else
@@ -751,28 +781,25 @@ async def generate_document(url, ext, filename):
 #     file.write(html_doc)


-async def extract_image_from_feed(db_file, ix, url):
-    feed_url = sqlite.get_feed_url(db_file, ix)
+async def extract_image_from_feed(db_file, feed_id, url):
+    feed_url = sqlite.get_feed_url(db_file, feed_id)
     result = await fetch.http(feed_url)
     document = result[0]
-    # breakpoint()
-    print("extract_image_from_feed")
     if document:
         feed = parse(document)
         for entry in feed.entries:
-            print(len(feed.entries))
-            print(entry.link)
-            print(url)
-            if entry.link == url:
-                for link in entry.links:
-                    if (link.rel == "enclosure" and
-                        link.type.startswith("image/")):
-                        # if link.type.startswith("image/"):
-                        image_url = link.href
-                        print("found")
-                        print(image_url)
-                        break
-    return image_url
+            try:
+                if entry.link == url:
+                    for link in entry.links:
+                        if (link.rel == "enclosure" and
+                            link.type.startswith("image/")):
+                            image_url = link.href
+                            return image_url
+            except:
+                logging.error(url)
+                logging.error(
+                    "AttributeError: object has no attribute 'link'")
+                breakpoint()


 async def extract_image_from_html(url):
@@ -783,17 +810,17 @@ async def extract_image_from_html(url):
         document = Document(data)
         content = document.summary()
     except:
+        content = data
         logging.warning(
             "Check that package readability is installed.")
     tree = html.fromstring(content)
+    # TODO Exclude banners, class="share" links etc.
     images = tree.xpath('//img/@src')
     if len(images):
         image = images[0]
         image = str(image)
         image_url = complete_url(url, image)
-    else:
-        image_url = None
-    return image_url
+    return image_url


 def generate_html(text, filename):
@@ -813,6 +840,35 @@ def generate_markdown(text, filename):
         file.write(markdown)


+# TODO Add support for eDonkey, Gnutella, Soulseek
+async def get_magnet(link):
+    parted_link = urlsplit(link)
+    queries = parse_qs(parted_link.query)
+    query_xt = queries["xt"][0]
+    if query_xt.startswith("urn:btih:"):
+        filename = queries["dn"][0]
+        checksum = query_xt[len("urn:btih:"):]
+        torrent = await fetch.magnet(link)
+        logging.debug(
+            "Attempting to retrieve {} ({})".format(
+                filename, checksum))
+        if not torrent:
+            logging.debug(
+                "Attempting to retrieve {} from HTTP caching service".format(
+                    filename))
+            urls = [
+                'https://watercache.libertycorp.org/get/{}/{}',
+                'https://itorrents.org/torrent/{}.torrent?title={}',
+                'https://firecache.libertycorp.org/get/{}/{}',
+                'http://fcache63sakpihd44kxdduy6kgpdhgejgp323wci435zwy6kiylcnfad.onion/get/{}/{}'
+            ]
+            for url in urls:
+                torrent = fetch.http(url.format(checksum, filename))
+                if torrent:
+                    break
+    return torrent
+
+
 # NOTE Why (if res[0]) and (if res[1] == 200)?
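# A minimal sketch (not part of the patch) of the query parsing that the new
# get_magnet() relies on; the magnet URI and info-hash below are arbitrary
# sample values chosen for illustration, not taken from this commit.
from urllib.parse import parse_qs, urlsplit

sample_link = ("magnet:?xt=urn:btih:c12fe1c06bba254a9dc9f519b335aa7c1367a88a"
               "&dn=example.ogg")
queries = parse_qs(urlsplit(sample_link).query)
query_xt = queries["xt"][0]                  # "urn:btih:c12fe1c06bba254a..."
if query_xt.startswith("urn:btih:"):
    filename = queries["dn"][0]              # "example.ogg"
    checksum = query_xt[len("urn:btih:"):]   # 40-character BitTorrent info-hash
    print(filename, checksum)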
async def organize_items(db_file, urls): """ diff --git a/slixfeed/fetch.py b/slixfeed/fetch.py index 8195266..5fca196 100644 --- a/slixfeed/fetch.py +++ b/slixfeed/fetch.py @@ -28,9 +28,16 @@ from asyncio import TimeoutError # from asyncio.exceptions import IncompleteReadError # from bs4 import BeautifulSoup # from http.client import IncompleteRead +import logging # from lxml import html -import slixfeed.config as config # from xml.etree.ElementTree import ElementTree, ParseError +import slixfeed.config as config +try: + from magnet2torrent import Magnet2Torrent, FailedToFetchException +except: + logging.info( + "Package magnet2torrent was not found.\n" + "BitTorrent is disabled.") # async def dat(): @@ -105,3 +112,11 @@ async def http(url): False, "Timeout: " + str(e) ] return msg + + +async def magnet(link): + m2t = Magnet2Torrent(link) + try: + filename, torrent_data = await m2t.retrieve_torrent() + except FailedToFetchException: + logging.debug("Failed") diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py index 160bc9a..a77aa6e 100644 --- a/slixfeed/sqlite.py +++ b/slixfeed/sqlite.py @@ -129,6 +129,7 @@ def create_tables(db_file): id INTEGER NOT NULL, title TEXT NOT NULL, link TEXT NOT NULL, + enclosure TEXT, entry_id TEXT NOT NULL, feed_id INTEGER NOT NULL, timestamp TEXT, @@ -146,6 +147,7 @@ def create_tables(db_file): id INTEGER NOT NULL, title TEXT NOT NULL, link TEXT NOT NULL, + enclosure TEXT, entry_id TEXT NOT NULL, feed_id INTEGER NOT NULL, timestamp TEXT, @@ -486,7 +488,8 @@ async def remove_feed_by_url(db_file, url): cur = conn.cursor() sql = ( """ - DELETE FROM feeds + DELETE + FROM feeds WHERE url = ? """ ) @@ -556,16 +559,17 @@ async def get_feed_id_and_name(db_file, url): result : list List of ID and Name of feed. """ - cur = get_cursor(db_file) - sql = ( - """ - SELECT id, name - FROM feeds - WHERE url = ? - """ - ) - result = cur.execute(sql, (url,)).fetchone() - return result + with create_connection(db_file) as conn: + cur = conn.cursor() + sql = ( + """ + SELECT id, name + FROM feeds + WHERE url = ? 
+ """ + ) + result = cur.execute(sql, (url,)).fetchone() + return result async def get_number_of_items(db_file, table): @@ -677,11 +681,11 @@ async def get_unread_entries(db_file, num): cur = conn.cursor() sql = ( """ - SELECT id, title, link, feed_id, timestamp + SELECT id, title, link, enclosure, feed_id, timestamp FROM entries WHERE read = 0 UNION ALL - SELECT id, title, link, feed_id, timestamp + SELECT id, title, link, enclosure, feed_id, timestamp FROM archive ORDER BY timestamp DESC LIMIT :num @@ -861,17 +865,9 @@ def get_entry_url(db_file, ix): return url -def get_feed_url(db_file, ix): +def get_feed_url(db_file, feed_id): with create_connection(db_file) as conn: cur = conn.cursor() - sql = ( # TODO Handletable archive too - """ - SELECT feed_id - FROM entries - WHERE id = :ix - """ - ) - feed_id = cur.execute(sql, (ix,)).fetchone()[0] sql = ( """ SELECT url @@ -1152,14 +1148,15 @@ async def add_entries_and_update_timestamp(db_file, new_entries): """ INSERT INTO entries( - title, link, entry_id, feed_id, timestamp, read) + title, link, enclosure, entry_id, feed_id, timestamp, read) VALUES( - :title, :link, :entry_id, :feed_id, :timestamp, :read) + :title, :link, :enclosure, :entry_id, :feed_id, :timestamp, :read) """ ) cur.execute(sql, { "title": entry["title"], "link": entry["link"], + "enclosure": entry["enclosure"], "entry_id": entry["entry_id"], "feed_id": feed_id, "timestamp": entry["date"], @@ -1338,10 +1335,12 @@ async def maintain_archive(db_file, limit): """ DELETE FROM archive WHERE id - IN (SELECT id - FROM archive - ORDER BY timestamp ASC - LIMIT :difference) + IN ( + SELECT id + FROM archive + ORDER BY timestamp ASC + LIMIT :difference + ) """ ) cur.execute(sql, { @@ -1452,15 +1451,16 @@ async def get_feeds(db_file): # Select name, url (feeds) updated, enabled, feed_id (status) # 2) Sort feeds by id. Sort status by feed_id # results += cur.execute(sql).fetchall() - cur = get_cursor(db_file) - sql = ( - """ - SELECT name, url, id - FROM feeds - """ - ) - results = cur.execute(sql).fetchall() - return results + with create_connection(db_file) as conn: + cur = conn.cursor() + sql = ( + """ + SELECT name, url, id + FROM feeds + """ + ) + results = cur.execute(sql).fetchall() + return results async def last_entries(db_file, num): @@ -1479,29 +1479,30 @@ async def last_entries(db_file, num): titles_list : str List of recent N entries as message. """ - cur = get_cursor(db_file) - # sql = ( - # "SELECT title, link " - # "FROM entries " - # "ORDER BY ROWID DESC " - # "LIMIT :num" - # ) - sql = ( - """ - SELECT title, link, timestamp - FROM entries - WHERE read = 0 - UNION ALL - SELECT title, link, timestamp - FROM archive - WHERE read = 0 - ORDER BY timestamp DESC - LIMIT :num - """ - ) - results = cur.execute( - sql, (num,)).fetchall() - return results + with create_connection(db_file) as conn: + cur = conn.cursor() + # sql = ( + # "SELECT title, link " + # "FROM entries " + # "ORDER BY ROWID DESC " + # "LIMIT :num" + # ) + sql = ( + """ + SELECT title, link, timestamp + FROM entries + WHERE read = 0 + UNION ALL + SELECT title, link, timestamp + FROM archive + WHERE read = 0 + ORDER BY timestamp DESC + LIMIT :num + """ + ) + results = cur.execute( + sql, (num,)).fetchall() + return results async def search_feeds(db_file, query): @@ -1520,19 +1521,20 @@ async def search_feeds(db_file, query): titles_list : str Feeds of specified keywords as message. """ - cur = get_cursor(db_file) - sql = ( - """ - SELECT name, id, url - FROM feeds - WHERE name LIKE ? - OR url LIKE ? 
- LIMIT 50 - """ - ) - results = cur.execute( - sql, [f'%{query}%', f'%{query}%']).fetchall() - return results + with create_connection(db_file) as conn: + cur = conn.cursor() + sql = ( + """ + SELECT name, id, url + FROM feeds + WHERE name LIKE ? + OR url LIKE ? + LIMIT 50 + """ + ) + results = cur.execute( + sql, [f'%{query}%', f'%{query}%']).fetchall() + return results async def search_entries(db_file, query): @@ -1551,22 +1553,23 @@ async def search_entries(db_file, query): titles_list : str Entries of specified keywords as message. """ - cur = get_cursor(db_file) - sql = ( - """ - SELECT title, link - FROM entries - WHERE title LIKE ? - UNION ALL - SELECT title, link - FROM archive - WHERE title LIKE ? - LIMIT 50 - """ - ) - results = cur.execute( - sql, (f'%{query}%', f'%{query}%')).fetchall() - return results + with create_connection(db_file) as conn: + cur = conn.cursor() + sql = ( + """ + SELECT title, link + FROM entries + WHERE title LIKE ? + UNION ALL + SELECT title, link + FROM archive + WHERE title LIKE ? + LIMIT 50 + """ + ) + results = cur.execute( + sql, (f'%{query}%', f'%{query}%')).fetchall() + return results """ @@ -1619,68 +1622,62 @@ async def check_entry_exist( bool True or None. """ - cur = get_cursor(db_file) - exist = False - if entry_id: - feed_id = get_feed_id(cur, url) - sql = ( - """ - SELECT id - FROM entries - WHERE - entry_id = :entry_id and - feed_id = :feed_id - """ - ) - result = cur.execute(sql, { - "entry_id": entry_id, - "feed_id": feed_id - }).fetchone() - if result: exist = True - elif date: - sql = ( - """ - SELECT id - FROM entries - WHERE - title = :title and - link = :link and - timestamp = :date - """ - ) - try: + with create_connection(db_file) as conn: + cur = conn.cursor() + exist = False + if entry_id: + feed_id = get_feed_id(cur, url) + sql = ( + """ + SELECT id + FROM entries + WHERE entry_id = :entry_id and feed_id = :feed_id + """ + ) result = cur.execute(sql, { - "title": title, - "link": link, - "timestamp": date + "entry_id": entry_id, + "feed_id": feed_id }).fetchone() if result: exist = True - except: - print(current_time(), "ERROR DATE: source =", url) - print(current_time(), "ERROR DATE: date =", date) - else: - sql = ( - """ - SELECT id - FROM entries - WHERE - title = :title and - link = :link - """ - ) - result = cur.execute(sql, { - "title": title, - "link": link - }).fetchone() - if result: exist = True - # try: - # if result: - # return True - # else: - # return None - # except: - # print(current_time(), "ERROR DATE: result =", url) - return exist + elif date: + sql = ( + """ + SELECT id + FROM entries + WHERE title = :title and link = :link and timestamp = :date + """ + ) + try: + result = cur.execute(sql, { + "title": title, + "link": link, + "timestamp": date + }).fetchone() + if result: exist = True + except: + print(current_time(), "ERROR DATE: source =", url) + print(current_time(), "ERROR DATE: date =", date) + else: + sql = ( + """ + SELECT id + FROM entries + WHERE title = :title and link = :link + """ + ) + result = cur.execute(sql, { + "title": title, + "link": link + }).fetchone() + if result: exist = True + # try: + # if result: + # return True + # else: + # return None + # except: + # print(current_time(), "ERROR DATE: result =", url) + return exist async def set_settings_value(db_file, key_value): diff --git a/slixfeed/task.py b/slixfeed/task.py index 66e4388..cab015e 100644 --- a/slixfeed/task.py +++ b/slixfeed/task.py @@ -227,46 +227,60 @@ async def send_update(self, jid, num=None): num = int(num) 
news_digest = [] results = await get_unread_entries(db_file, num) - image_url = None + news_digest = '' + media = None + chat_type = await utility.jid_type(self, jid) for result in results: ix = result[0] title_e = result[1] url = result[2] - feed_id = result[3] - date = result[4] + enclosure = result[3] + feed_id = result[4] + date = result[5] title_f = get_feed_title(db_file, feed_id) - news_item = action.list_unread_entries(result, title_f) - news_digest.extend([news_item]) + news_digest += action.list_unread_entries(result, title_f) # print(db_file) # print(result[0]) # breakpoint() - await mark_as_read(db_file, result[0]) - if not image_url: - image_url = await action.extract_image_from_feed( - db_file, ix, url) - if not image_url: - image_url = await action.extract_image_from_html(url) - print("image_url") - print(image_url) - new = " ".join(news_digest) - # breakpoint() - if new: + await mark_as_read(db_file, ix) + + # Find media + if url.startswith("magnet:"): + media = action.get_magnet(url) + elif enclosure.startswith("magnet:"): + media = action.get_magnet(enclosure) + elif enclosure: + media = enclosure + else: + media = await action.extract_image_from_html(url) + + if media and news_digest: + # Send textual message + xmpp.Slixfeed.send_message( + self, mto=jid, mbody=news_digest, mtype=chat_type) + news_digest = '' + # Send media + message = xmpp.Slixfeed.make_message( + self, mto=jid, mbody=media, mtype=chat_type) + message['oob']['url'] = media + message.send() + media = None + + if news_digest: # TODO Add while loop to assure delivery. # print(await current_time(), ">>> ACT send_message",jid) - chat_type = await utility.jid_type(self, jid) # NOTE Do we need "if statement"? See NOTE at is_muc. if chat_type in ("chat", "groupchat"): # TODO Provide a choice (with or without images) xmpp.Slixfeed.send_message( - self, mto=jid, mbody=new, mtype=chat_type) - if image_url: - # message = xmpp.Slixfeed.make_message( - # self, mto=jid, mbody=new, mtype=chat_type) - message = xmpp.Slixfeed.make_message( - self, mto=jid, mbody=image_url, mtype=chat_type) - message['oob']['url'] = image_url - print(image_url) - message.send() + self, mto=jid, mbody=news_digest, mtype=chat_type) + # if media: + # # message = xmpp.Slixfeed.make_message( + # # self, mto=jid, mbody=new, mtype=chat_type) + # message = xmpp.Slixfeed.make_message( + # self, mto=jid, mbody=media, mtype=chat_type) + # message['oob']['url'] = media + # message.send() # TODO Do not refresh task before # verifying that it was completed. diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py index 09a42ab..0192710 100644 --- a/slixfeed/xmpp/process.py +++ b/slixfeed/xmpp/process.py @@ -18,6 +18,7 @@ TODO """ +import slixfeed.fetch as fetch import logging import os import slixfeed.action as action @@ -335,7 +336,7 @@ async def message(self, message): else: response = "Missing value." send_reply_message(self, message, response) - case _ if message_lowercase.startswith("bookmark - "): + case _ if message_lowercase.startswith("bookmark -"): if jid == get_value("accounts", "XMPP", "operator"): muc_jid = message_text[11:] await bookmark.remove(self, muc_jid) @@ -394,7 +395,7 @@ async def message(self, message): else: response = "Missing keywords." 
send_reply_message(self, message, response) - case _ if message_lowercase.startswith("export "): + case _ if message_lowercase.startswith("export"): ex = message_text[7:] if ex in ("opml", "html", "md", "xbel"): status_type = "dnd" @@ -440,7 +441,7 @@ async def message(self, message): response = "Gemini and Gopher are not supported yet." send_reply_message(self, message, response) # TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT - case _ if (message_lowercase.startswith("get ")): + case _ if (message_lowercase.startswith("get")): message_text = message_text[4:] ix_url = message_text.split(" ")[0] ext = " ".join(message_text.split(" ")[1:]) @@ -450,8 +451,8 @@ async def message(self, message): if ext in ("html", "md", "pdf"): status_type = "dnd" status_message = ( - "📃️ Procesing request to produce {} document ..." - ).format(ext) + "📃️ Procesing request to produce {} document..." + ).format(ext.upper()) send_status_message( self, jid, status_type, status_message) db_file = get_pathname_to_database(jid) @@ -461,27 +462,43 @@ async def message(self, message): os.mkdir(data_dir) if not os.path.isdir(data_dir + '/readability'): os.mkdir(data_dir + '/readability') - filename = os.path.join( - data_dir, "readability", "saved_article_" + timestamp() + "." + ext) try: ix = int(ix_url) try: url = sqlite.get_entry_url(db_file, ix) except: - response = "No entry Id with {}".format(ix) + response = "No entry with Id {}".format(ix) except: url = ix_url if url: url = uri.remove_tracking_parameters(url) url = (uri.replace_hostname(url, "link")) or url - status = await action.generate_document(url, ext, filename) - if status: - response = ( - "Failed to export {}. Reason: {}" - ).format(ext, status) + result = await fetch.http(url) + data = result[0] + code = result[1] + if data: + title = action.get_document_title(data) + title = title.strip().lower() + for i in (" ", "-"): + title = title.replace(i, "_") + for i in ("?", "'", "!"): + title = title.replace(i, "") + filename = os.path.join( + data_dir, "readability", + title + "_" + timestamp() + "." + ext) + error = action.generate_document( + data, url, ext, filename) + if status: + response = ( + "Failed to export {}. Reason: {}" + ).format(ext.upper(), error) + else: + url = await upload.start(self, jid, filename) + await send_oob_message(self, jid, url) else: - url = await upload.start(self, jid, filename) - await send_oob_message(self, jid, url) + response = ( + "Failed to fetch {}. Reason: {}" + ).format(url, code) await task.start_tasks_xmpp( self, jid, ["status"]) else: @@ -769,7 +786,7 @@ async def message(self, message): else: response = "Missing value." send_reply_message(self, message, response) - case _ if message_lowercase.startswith("remove "): + case _ if message_lowercase.startswith("remove"): ix_url = message_text[7:] if ix_url: db_file = get_pathname_to_database(jid) @@ -873,7 +890,7 @@ async def message(self, message): except: response = "No news source with ID {}.".format(ix) send_reply_message(self, message, response) - case _ if message_lowercase.startswith("enable "): + case _ if message_lowercase.startswith("enable"): ix = message_text[7:] db_file = get_pathname_to_database(jid) try: