From ec82aeb3ccf3db184052c5e5df6b1582ee2b704d Mon Sep 17 00:00:00 2001
From: Schimon Jehudah
Date: Thu, 11 Jan 2024 10:55:42 +0000
Subject: [PATCH] Detect image from xml enclosure in addition to html img

---
 slixfeed/action.py       | 97 +++++++++++++++++++++++++++++++---------
 slixfeed/crawl.py        | 18 +++++---
 slixfeed/fetch.py        |  2 +-
 slixfeed/sqlite.py       | 25 ++++++++++-
 slixfeed/task.py         | 11 ++---
 slixfeed/xmpp/process.py | 45 +++++++------------
 6 files changed, 131 insertions(+), 67 deletions(-)

diff --git a/slixfeed/action.py b/slixfeed/action.py
index 396ec34..0d68792 100644
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
 
 
 async def import_opml(db_file, url):
-    result = await fetch.download_feed(url)
+    result = await fetch.http(url)
     document = result[0]
     if document:
         root = ET.fromstring(document)
@@ -378,7 +378,7 @@ async def add_feed(db_file, url):
     while True:
         exist = await sqlite.get_feed_id_and_name(db_file, url)
         if not exist:
-            result = await fetch.download_feed(url)
+            result = await fetch.http(url)
             document = result[0]
             status_code = result[1]
             if document:
@@ -458,7 +458,7 @@ async def add_feed(db_file, url):
 
 async def view_feed(url):
     while True:
-        result = await fetch.download_feed(url)
+        result = await fetch.http(url)
         document = result[0]
         status = result[1]
         if document:
@@ -523,7 +523,7 @@ async def view_feed(url):
 
 async def view_entry(url, num):
     while True:
-        result = await fetch.download_feed(url)
+        result = await fetch.http(url)
         document = result[0]
         status = result[1]
         if document:
@@ -602,7 +602,7 @@ async def scan(db_file, url):
         URL. The default is None.
     """
     if isinstance(url, tuple): url = url[0]
-    result = await fetch.download_feed(url)
+    result = await fetch.http(url)
     try:
         document = result[0]
         status = result[1]
@@ -704,34 +704,87 @@ async def scan(db_file, url):
         if len(new_entries):
             await sqlite.add_entries_and_update_timestamp(
                 db_file, new_entries)
-
-async def get_content(url):
-    result = await fetch.download_feed(url)
+async def generate_document(url, ext, filename):
+    result = await fetch.http(url)
     data = result[0]
     code = result[1]
+    status = None
     if data:
         try:
-            document = Document(result[0])
+            document = Document(data)
             content = document.summary()
-            info = [content, code]
         except:
             logging.warning(
-                "Install package readability.")
-            info = result
+                "Check that package readability is installed.")
+        match ext:
+            case "html":
+                generate_html(content, filename)
+            case "md":
+                try:
+                    generate_markdown(content, filename)
+                except:
+                    logging.warning(
+                        "Check that package html2text is installed.")
+                    status = (
+                        "Package html2text was not found.")
+            case "pdf":
+                try:
+                    generate_pdf(content, filename)
+                except:
+                    logging.warning(
+                        "Check that packages pdfkit and wkhtmltopdf "
+                        "are installed.")
+                    status = (
+                        "Package pdfkit or wkhtmltopdf was not found.")
     else:
-        info = [None, code]
-    return info
-    # TODO Either adapt it to filename
-    # or change it to something else
-    #filename = document.title()
-    # with open(filename, 'w') as file:
-    #     html_doc = document.summary()
-    #     file.write(html_doc)
+        status = code
+    if status:
+        return status
+
+    # TODO Either adapt it to filename
+    # or change it to something else
+    #filename = document.title()
+    # with open(filename, 'w') as file:
+    #     html_doc = document.summary()
+    #     file.write(html_doc)
 
 
-def extract_first_image(url, content):
+async def extract_image_from_feed(db_file, ix, url):
+    feed_url = sqlite.get_feed_url(db_file, ix)
+    result = await fetch.http(feed_url)
+    document = result[0]
+    image_url = None
+    print("extract_image_from_feed")
+    if document:
+        feed = parse(document)
+        for entry in feed.entries:
+            print(len(feed.entries))
+            print(entry.link)
+            print(url)
+            if entry.link == url:
+                for link in entry.links:
+                    if (link.rel == "enclosure" and
+                        link.type.startswith("image/")):
+                        # if link.type.startswith("image/"):
+                        image_url = link.href
+                        print("found")
+                        print(image_url)
+                        break
+                return image_url
+
+
+async def extract_image_from_html(url):
+    result = await fetch.http(url)
+    data = result[0]
+    if data:
+        try:
+            document = Document(data)
+            content = document.summary()
+        except:
+            logging.warning(
+                "Check that package readability is installed.")
     tree = html.fromstring(content)
     images = tree.xpath('//img/@src')
     if len(images):
@@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
     for url in urls:
         # print(os.path.basename(db_file), url[0])
         url = url[0]
-        res = await fetch.download_feed(url)
+        res = await fetch.http(url)
         # TypeError: 'NoneType' object is not subscriptable
         if res is None:
             # Skip to next feed
diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py
index 7fada28..af576e6 100644
--- a/slixfeed/crawl.py
+++ b/slixfeed/crawl.py
@@ -22,7 +22,7 @@ from feedparser import parse
 import logging
 from lxml import html
 import slixfeed.config as config
-from slixfeed.fetch import download_feed
+import slixfeed.fetch as fetch
 from slixfeed.url import complete_url, join_url, trim_url
 from urllib.parse import urlsplit, urlunsplit
 
@@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
         # xpath_query = "//a[contains(@href,'{}')]".format(path)
         num = 5
-        xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
+        xpath_query = (
+            "(//a[contains(@href,'{}')])[position()<={}]"
+            ).format(path, num)
         addresses = tree.xpath(xpath_query)
-        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
+        xpath_query = (
+            "(//a[contains(@href,'{}')])[position()>last()-{}]"
+            ).format(path, num)
         addresses += tree.xpath(xpath_query)
         # NOTE Should number of addresses be limited or
         # perhaps be N from the start and N from the end
@@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
             # # The following code will catch
             # # only valid resources (i.e. not 404);
             # # The following code requires more bandwidth.
-            # res = await download_feed(feed)
+            # res = await fetch.http(feed)
             # if res[0]:
             #     disco = parse(res[0])
             #     title = disco["feed"]["title"]
@@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
 async def process_feed_selection(url, urls):
     feeds = {}
     for i in urls:
-        res = await download_feed(i)
+        res = await fetch.http(i)
         if res[1] == 200:
             try:
                 feeds[i] = [parse(res[0])]
@@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
     feed_url_mark = 0
     for feed_url in feeds:
         # try:
-        #     res = await download_feed(feed)
+        #     res = await fetch.http(feed)
         # except:
         #     continue
         feed_name = None
@@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
 
 # async def start(url):
 #     while True:
-#         result = await fetch.download_feed(url)
+#         result = await fetch.http(url)
 #         document = result[0]
 #         status = result[1]
 #         if document:
diff --git a/slixfeed/fetch.py b/slixfeed/fetch.py
index a024667..8195266 100644
--- a/slixfeed/fetch.py
+++ b/slixfeed/fetch.py
@@ -45,7 +45,7 @@ import slixfeed.config as config
 # async def ipfs():
 
 
-async def download_feed(url):
+async def http(url):
     """
     Download content of given URL.
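
Below is a minimal, self-contained sketch of the enclosure lookup that extract_image_from_feed introduces above. It only illustrates the idea; find_enclosure_image, feed_xml and entry_url are illustrative names and are not part of this patch.

from feedparser import parse

def find_enclosure_image(feed_xml, entry_url):
    # Parse the feed document and locate the entry whose link matches.
    feed = parse(feed_xml)
    for entry in feed.entries:
        if entry.get("link") != entry_url:
            continue
        # An image enclosure is a link with rel="enclosure" and an
        # image/* media type, as checked in the patch above.
        for link in entry.get("links", []):
            if (link.get("rel") == "enclosure"
                    and link.get("type", "").startswith("image/")):
                return link.get("href")
    return None
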
diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py
index 628205e..160bc9a 100644
--- a/slixfeed/sqlite.py
+++ b/slixfeed/sqlite.py
@@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
         return title
 
 
-# TODO Handletable archive too
 def get_entry_url(db_file, ix):
     with create_connection(db_file) as conn:
         cur = conn.cursor()
-        sql = (
+        sql = ( # TODO Handle table archive too
             """
             SELECT link
             FROM entries
@@ -862,6 +861,28 @@
         return url
 
 
+def get_feed_url(db_file, ix):
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = ( # TODO Handle table archive too
+            """
+            SELECT feed_id
+            FROM entries
+            WHERE id = :ix
+            """
+            )
+        feed_id = cur.execute(sql, (ix,)).fetchone()[0]
+        sql = (
+            """
+            SELECT url
+            FROM feeds
+            WHERE id = :feed_id
+            """
+            )
+        url = cur.execute(sql, (feed_id,)).fetchone()[0]
+        return url
+
+
 async def mark_as_read(db_file, ix):
     async with DBLOCK:
         with create_connection(db_file) as conn:
diff --git a/slixfeed/task.py b/slixfeed/task.py
index 94d073a..66e4388 100644
--- a/slixfeed/task.py
+++ b/slixfeed/task.py
@@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
             # breakpoint()
             await mark_as_read(db_file, result[0])
             if not image_url:
-                info = await action.get_content(url)
-                content = info[1]
-                status = info[0]
-                if status == 200:
-                    image_url = action.extract_first_image(url, content)
+                image_url = await action.extract_image_from_feed(
+                    db_file, ix, url)
+                if not image_url:
+                    image_url = await action.extract_image_from_html(url)
+                print("image_url")
+                print(image_url)
         new = " ".join(news_digest)
         # breakpoint()
         if new:
diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py
index 4e03d4a..09a42ab 100644
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@@ -445,6 +445,8 @@ async def message(self, message):
                     ix_url = message_text.split(" ")[0]
                     ext = " ".join(message_text.split(" ")[1:])
                     ext = ext if ext else 'pdf'
+                    url = None
+                    status = None
                     if ext in ("html", "md", "pdf"):
                         status_type = "dnd"
                         status_message = (
@@ -469,42 +471,25 @@ async def message(self, message):
                                 response = "No entry Id with {}".format(ix)
                         except:
                             url = ix_url
-                        url = uri.remove_tracking_parameters(url)
-                        url = (uri.replace_hostname(url, "link")) or url
-                        info = await action.get_content(url)
-                        content = info[0]
-                        status = info[1]
-                        if content:
-                            try:
-                                match ext:
-                                    case "html":
-                                        action.generate_html(content, filename)
-                                    case "md":
-                                        action.generate_markdown(content, filename)
-                                    case "pdf":
-                                        action.generate_pdf(content, filename)
-                                url = await upload.start(
-                                    self, jid, filename)
-                                await send_oob_message(
-                                    self, jid, url)
-                            except:
-                                logging.warning(
-                                    "Check that packages html2text, pdfkit "
-                                    "and wkhtmltopdf are installed")
+                        if url:
+                            url = uri.remove_tracking_parameters(url)
+                            url = (uri.replace_hostname(url, "link")) or url
+                            status = await action.generate_document(url, ext, filename)
+                            if status:
                                 response = (
-                                    "Failed to export to {}"
-                                ).format(ext)
-                                await task.start_tasks_xmpp(
-                                    self, jid, ["status"])
-                        else:
-                            response = (
-                                "Failed to fetch resource. Reason: {}"
-                            ).format(status)
+                                    "Failed to export {}. Reason: {}"
+                                ).format(ext, status)
+                            else:
+                                url = await upload.start(self, jid, filename)
+                                await send_oob_message(self, jid, url)
+                            await task.start_tasks_xmpp(
+                                self, jid, ["status"])
                     else:
                         response = "Missing entry Id."
                 else:
                     response = "Unsupported filetype."
                 if response:
+                    print(response)
                     send_reply_message(self, message, response)
             # case _ if (message_lowercase.startswith("http")) and(
             #         message_lowercase.endswith(".opml")):
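
As a side note for review, the two SELECT statements in the new get_feed_url could probably be collapsed into a single JOIN. A sketch under the schema implied above (entries.feed_id referencing feeds.id), reusing the module's create_connection helper; this is an illustration, not part of the patch:

def get_feed_url_joined(db_file, ix):
    # Return the URL of the feed that entry `ix` belongs to, in one query.
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT feeds.url
            FROM entries
            INNER JOIN feeds ON feeds.id = entries.feed_id
            WHERE entries.id = ?
            """
            )
        return cur.execute(sql, (ix,)).fetchone()[0]
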