Detect image from xml enclosure in addition to html img

2024-01-11 10:55:42 +00:00 · 2024-01-11 10:55:42 +00:00 · ec82aeb3cc
commit ec82aeb3cc
parent b675618b95
6 changed files with 131 additions and 67 deletions
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
 async def import_opml(db_file, url):
-    result = await fetch.download_feed(url)
+    result = await fetch.http(url)
    document = result[0]
    if document:
        root = ET.fromstring(document)
@ -378,7 +378,7 @@ async def add_feed(db_file, url):
    while True:
        exist = await sqlite.get_feed_id_and_name(db_file, url)
        if not exist:
-            result = await fetch.download_feed(url)
+            result = await fetch.http(url)
            document = result[0]
            status_code = result[1]
            if document:
@ -458,7 +458,7 @@ async def add_feed(db_file, url):
 async def view_feed(url):
    while True:
-        result = await fetch.download_feed(url)
+        result = await fetch.http(url)
        document = result[0]
        status = result[1]
        if document:
@ -523,7 +523,7 @@ async def view_feed(url):
 async def view_entry(url, num):
    while True:
-        result = await fetch.download_feed(url)
+        result = await fetch.http(url)
        document = result[0]
        status = result[1]
        if document:
@ -602,7 +602,7 @@ async def scan(db_file, url):
        URL. The default is None.
    """
    if isinstance(url, tuple): url = url[0]
-    result = await fetch.download_feed(url)
+    result = await fetch.http(url)
    try:
        document = result[0]
        status = result[1]
@ -706,23 +706,43 @@ async def scan(db_file, url):
            db_file, new_entries)
-
+async def generate_document(url, ext, filename):
-async def get_content(url):
+    result = await fetch.http(url)
    result = await fetch.download_feed(url)
    data = result[0]
    code = result[1]
    status = None
    if data:
        try:
-            document = Document(result[0])
+            document = Document(data)
            content = document.summary()
            info = [content, code]
        except:
            logging.warning(
-                "Install package readability.")
+                "Check that package readability is installed.")
-            info = result
+        match ext:
            case "html":
                generate_html(content, filename)
            case "md":
                try:
                    generate_markdown(content, filename)
                except:
                    logging.warning(
                        "Check that package html2text is installed.")
                    status = (
                        "Package html2text was not found.")
            case "pdf":
                try:
                    generate_pdf(content, filename)
                except:
                    logging.warning(
                        "Check that packages pdfkit and wkhtmltopdf "
                        "are installed.")
                    status = (
                        "Package pdfkit or wkhtmltopdf was not found.")
    else:
-        info = [None, code]
+        status = code
-    return info
+    if status:
        return status
    # TODO Either adapt it to filename
    # or change it to something else
    #filename = document.title()
@ -731,7 +751,40 @@ async def get_content(url):
    #     file.write(html_doc)
-def extract_first_image(url, content):
async def extract_image_from_feed(db_file, ix, url):
    """
    Look up an image enclosure for an entry by re-reading its feed.

    The feed URL is resolved with sqlite.get_feed_url, the feed is
    downloaded and parsed, and the entry whose link equals the given
    URL is searched for a link with rel "enclosure" and an "image/*"
    type.

    Parameters
    ----------
    db_file : str
        Path to database file (passed as is to sqlite.get_feed_url).
    ix : str
        Index of entry (passed as is to sqlite.get_feed_url).
    url : str
        Link of the entry to look for inside the feed.

    Returns
    -------
    image_url : str or None
        Address of the first matching image enclosure, or None when
        the feed could not be fetched, the entry was not found, or it
        has no image enclosure.  Callers test the result for
        truthiness before falling back to HTML extraction.
    """
    feed_url = sqlite.get_feed_url(db_file, ix)
    if not feed_url:
        return None
    result = await fetch.http(feed_url)
    # NOTE(review): other call sites guard against fetch.http
    # returning None; do the same here instead of subscripting blindly.
    document = result[0] if result else None
    if not document:
        return None
    feed = parse(document)
    for entry in feed.entries:
        # Entries without a link can never match; .get avoids an
        # AttributeError on such entries.
        if entry.get("link") != url:
            continue
        for link in entry.get("links", []):
            # Not every link carries a "type"; treat a missing one as
            # a non-image instead of raising.
            link_type = link.get("type") or ""
            if (link.get("rel") == "enclosure" and
                link_type.startswith("image/")):
                image_url = link.get("href")
                if image_url:
                    # Stop at the first usable enclosure.
                    return image_url
    return None
 async def extract_image_from_html(url):
    result = await fetch.http(url)
    data = result[0]
    if data:
        try:
            document = Document(data)
            content = document.summary()
        except:
            logging.warning(
                "Check that package readability is installed.")
    tree = html.fromstring(content)
    images = tree.xpath('//img/@src')
    if len(images):
@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
    for url in urls:
        # print(os.path.basename(db_file), url[0])
        url = url[0]
-        res = await fetch.download_feed(url)
+        res = await fetch.http(url)
        # TypeError: 'NoneType' object is not subscriptable
        if res is None:
            # Skip to next feed
--- a/slixfeed/crawl.py
+++ b/slixfeed/crawl.py
@ -22,7 +22,7 @@ from feedparser import parse
 import logging
 from lxml import html
 import slixfeed.config as config
-from slixfeed.fetch import download_feed
+import slixfeed.fetch as fetch
 from slixfeed.url import complete_url, join_url, trim_url
 from urllib.parse import urlsplit, urlunsplit
@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
-        xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
+        xpath_query = (
            "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
-        xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
+        xpath_query = (
            "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should number of addresses be limited or
        # perhaps be N from the start and N from the end
@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
            # # The following code will catch
            # # only valid resources (i.e. not 404);
            # # The following code requires more bandwidth.
-            # res = await download_feed(feed)
+            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
 async def process_feed_selection(url, urls):
    feeds = {}
    for i in urls:
-        res = await download_feed(i)
+        res = await fetch.http(i)
        if res[1] == 200:
            try:
                feeds[i] = [parse(res[0])]
@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
    feed_url_mark = 0
    for feed_url in feeds:
        # try:
-        #     res = await download_feed(feed)
+        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
 # async def start(url):
 #     while True:
-#         result = await fetch.download_feed(url)
+#         result = await fetch.http(url)
 #         document = result[0]
 #         status = result[1]
 #         if document:
--- a/slixfeed/fetch.py
+++ b/slixfeed/fetch.py
@ -45,7 +45,7 @@ import slixfeed.config as config
 # async def ipfs():
-async def download_feed(url):
+async def http(url):
    """
    Download content of given URL.
--- a/slixfeed/sqlite.py
+++ b/slixfeed/sqlite.py
@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
        return title
 # TODO Handletable archive too
 def get_entry_url(db_file, ix):
    with create_connection(db_file) as conn:
        cur = conn.cursor()
-        sql = (
+        sql = ( # TODO Handletable archive too
            """
            SELECT link
            FROM entries
@ -862,6 +861,28 @@ def get_entry_url(db_file, ix):
        return url
 def get_feed_url(db_file, ix):
     """
     Get the URL of the feed that a given entry belongs to.

     Parameters
     ----------
     db_file : str
         Path to database file.
     ix : str
         Index of entry (matched against column id of table entries).

     Returns
     -------
     url : str or None
         URL of the feed, or None when either the entry or its feed
         is not found.
     """
     with create_connection(db_file) as conn:
         cur = conn.cursor()
         sql = ( # TODO Handle table archive too
             """
             SELECT feed_id
             FROM entries
             WHERE id = :ix
             """
             )
         # Named placeholders must be bound with a mapping; binding
         # them with a sequence is deprecated in sqlite3.
         row = cur.execute(sql, {"ix": ix}).fetchone()
         if row is None:
             # No such entry; nothing to resolve.
             return None
         feed_id = row[0]
         sql = (
             """
             SELECT url
             FROM feeds
             WHERE id = :feed_id
             """
             )
         row = cur.execute(sql, {"feed_id": feed_id}).fetchone()
         if row is None:
             # Entry refers to a feed that is no longer present.
             return None
         url = row[0]
         return url
 async def mark_as_read(db_file, ix):
    async with DBLOCK:
        with create_connection(db_file) as conn:
--- a/slixfeed/task.py
+++ b/slixfeed/task.py
@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
            # breakpoint()
            await mark_as_read(db_file, result[0])
            if not image_url:
-                info = await action.get_content(url)
+                image_url = await action.extract_image_from_feed(
-                content = info[1]
+                    db_file, ix, url)
-                status = info[0]
+            if not image_url:
-                if status == 200:
+                image_url = await action.extract_image_from_html(url)
-                    image_url = action.extract_first_image(url, content)
+            print("image_url")
            print(image_url)
        new = " ".join(news_digest)
        # breakpoint()
        if new:
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@ -445,6 +445,8 @@ async def message(self, message):
                ix_url = message_text.split(" ")[0]
                ext = " ".join(message_text.split(" ")[1:])
                ext = ext if ext else 'pdf'
                url = None
                status = None
                if ext in ("html", "md", "pdf"):
                    status_type = "dnd"
                    status_message = (
@ -469,42 +471,25 @@ async def message(self, message):
                                response = "No entry Id with {}".format(ix)
                        except:
                            url = ix_url
                        if url:
                            url = uri.remove_tracking_parameters(url)
                            url = (uri.replace_hostname(url, "link")) or url
-                        info = await action.get_content(url)
+                            status = await action.generate_document(url, ext, filename)
-                        content = info[0]
+                            if status:
                        status = info[1]
                        if content:
                            try:
                                match ext:
                                    case "html":
                                        action.generate_html(content, filename)
                                    case "md":
                                        action.generate_markdown(content, filename)
                                    case "pdf":
                                        action.generate_pdf(content, filename)
                                url = await upload.start(
                                    self, jid, filename)
                                await send_oob_message(
                                    self, jid, url)
                            except:
                                logging.warning(
                                    "Check that packages html2text, pdfkit "
                                    "and wkhtmltopdf are installed")
                                response = (
-                                    "Failed to export to {}"
+                                    "Failed to export {}.  Reason: {}"
-                                    ).format(ext)
+                                    ).format(ext, status)
                            else:
                                url = await upload.start(self, jid, filename)
                                await send_oob_message(self, jid, url)
                        await task.start_tasks_xmpp(
                            self, jid, ["status"])
                        else:
                            response = (
                                "Failed to fetch resource.  Reason: {}"
                                ).format(status)
                    else:
                        response = "Missing entry Id."
                else:
                    response = "Unsupported filetype."
                if response:
                    print(response)
                    send_reply_message(self, message, response)
            # case _ if (message_lowercase.startswith("http")) and(
            #     message_lowercase.endswith(".opml")):