Save enclosures

Send new message upon media detection
2024-01-13 17:17:43 +00:00 · 2024-01-13 17:17:43 +00:00 · 43fa1a463c
commit 43fa1a463c
parent ec82aeb3cc
5 changed files with 348 additions and 249 deletions
--- a/slixfeed/action.py
+++ b/slixfeed/action.py
@ -42,7 +42,7 @@ from slixfeed.url import (
    )
 import slixfeed.xmpp.bookmark as bookmark
 from urllib import error
-from urllib.parse import urlsplit
+from urllib.parse import parse_qs, urlsplit
 import xml.etree.ElementTree as ET
 try:
@ -688,9 +688,34 @@ async def scan(db_file, url):
                if isinstance(date, int):
                    logging.error(
                        "Variable 'date' is int: {}".format(date))
                media_link = ''
                if entry.has_key("links"):
                    for e_link in entry.links:
                        try:
                            # if (link.rel == "enclosure" and
                            #     (link.type.startswith("audio/") or
                            #      link.type.startswith("image/") or
                            #      link.type.startswith("video/"))
                            #     ):
                            media_type = e_link.type[:e_link.type.index("/")]
                            if e_link.has_key("rel"):
                                if (e_link.rel == "enclosure" and
                                    media_type in ("audio", "image", "video")):
                                    media_link = e_link.href
                                    media_link = join_url(url, e_link.href)
                                    media_link = trim_url(media_link)
                                    break
                        except:
                            logging.error(
                                "KeyError: 'href'\n"
                                "Missing 'href' attribute for {}".format(url))
                            logging.info(
                                "Continue scanning for next potential "
                                "enclosure of {}".format(link))
                entry = {
                    "title": title,
                    "link": link,
                    "enclosure": media_link,
                    "entry_id": entry_id,
                    "url": url,
                    "date": date,
@ -706,16 +731,23 @@ async def scan(db_file, url):
            db_file, new_entries)
-async def generate_document(url, ext, filename):
+def get_document_title(data):
-    result = await fetch.http(url)
+    try:
-    data = result[0]
+        document = Document(data)
-    code = result[1]
+        title = document.short_title()
-    status = None
+    except:
-    if data:
+        document = BeautifulSoup(data, 'html.parser')
        title = document.title.string
    return title
 def generate_document(data, url, ext, filename):
    error = None
    try:
        document = Document(data)
        content = document.summary()
    except:
        content = data
        logging.warning(
            "Check that package readability is installed.")
    match ext:
@ -727,7 +759,7 @@ async def generate_document(url, ext, filename):
            except:
                logging.warning(
                    "Check that package html2text is installed.")
-                    status = (
+                error = (
                    "Package html2text was not found.")
        case "pdf":
            try:
@ -736,12 +768,10 @@ async def generate_document(url, ext, filename):
                logging.warning(
                    "Check that packages pdfkit and wkhtmltopdf "
                    "are installed.")
-                    status = (
+                error = (
                    "Package pdfkit or wkhtmltopdf was not found.")
-    else:
+    if error:
-        status = code
+        return error
    if status:
        return status
    # TODO Either adapt it to filename
    # or change it to something else
@ -751,28 +781,25 @@ async def generate_document(url, ext, filename):
    #     file.write(html_doc)
-async def extract_image_from_feed(db_file, ix, url):
+async def extract_image_from_feed(db_file, feed_id, url):
-    feed_url = sqlite.get_feed_url(db_file, ix)
+    feed_url = sqlite.get_feed_url(db_file, feed_id)
    result = await fetch.http(feed_url)
    document = result[0]
    # breakpoint()
    print("extract_image_from_feed")
    if document:
        feed = parse(document)
        for entry in feed.entries:
-            print(len(feed.entries))
+            try:
            print(entry.link)
            print(url)
                if entry.link == url:
                    for link in entry.links:
                        if (link.rel == "enclosure" and
                            link.type.startswith("image/")):
                    # if link.type.startswith("image/"):
                            image_url = link.href
                        print("found")
                        print(image_url)
                        break
                            return image_url
            except:
                logging.error(url)
                logging.error(
                    "AttributeError: object has no attribute 'link'")
                breakpoint()
 async def extract_image_from_html(url):
@ -783,16 +810,16 @@ async def extract_image_from_html(url):
            document = Document(data)
            content = document.summary()
        except:
            content = data
            logging.warning(
                "Check that package readability is installed.")
    tree = html.fromstring(content)
    # TODO Exclude banners, class="share" links etc.
    images = tree.xpath('//img/@src')
    if len(images):
        image = images[0]
        image = str(image)
        image_url = complete_url(url, image)
    else:
        image_url = None
        return image_url
@ -813,6 +840,35 @@ def generate_markdown(text, filename):
        file.write(markdown)
 # TODO Add support for eDonkey, Gnutella, Soulseek
 async def get_magnet(link):
    """
    Retrieve torrent data for a BitTorrent magnet link.

    The link is first resolved via fetch.magnet; if that yields nothing,
    a list of HTTP torrent caching services is tried in order.

    Parameters
    ----------
    link : str
        Magnet URI. Only "xt=urn:btih:" (BitTorrent info-hash) links
        are supported.

    Returns
    -------
    torrent
        Retrieved torrent data, or None if the link is not a BitTorrent
        magnet link or nothing could be retrieved.
    """
    # Bind the result up front so that unsupported links return None
    # instead of raising UnboundLocalError at the return statement.
    torrent = None
    parted_link = urlsplit(link)
    queries = parse_qs(parted_link.query)
    # parse_qs omits absent parameters; do not assume "xt" exists.
    query_xt = queries.get("xt", [""])[0]
    if not query_xt.startswith("urn:btih:"):
        return torrent
    checksum = query_xt[len("urn:btih:"):]
    # "dn" (display name) is optional; fall back to the checksum.
    filename = queries.get("dn", [checksum])[0]
    logging.debug(
        "Attempting to retrieve {} ({})".format(
            filename, checksum))
    torrent = await fetch.magnet(link)
    if not torrent:
        logging.debug(
            "Attempting to retrieve {} from HTTP caching service".format(
                filename))
        urls = [
            'https://watercache.libertycorp.org/get/{}/{}',
            'https://itorrents.org/torrent/{}.torrent?title={}',
            'https://firecache.libertycorp.org/get/{}/{}',
            'http://fcache63sakpihd44kxdduy6kgpdhgejgp323wci435zwy6kiylcnfad.onion/get/{}/{}'
            ]
        for url in urls:
            # fetch.http is a coroutine function and must be awaited;
            # an un-awaited coroutine object is always truthy.
            result = await fetch.http(url.format(checksum, filename))
            # fetch.http returns [data, status]; data is falsy on failure.
            # NOTE(review): data is presumably the response body - confirm
            # that it is usable as torrent content by the caller.
            torrent = result[0] if result else None
            if torrent:
                break
    return torrent
 # NOTE Why (if res[0]) and (if res[1] == 200)?
 async def organize_items(db_file, urls):
    """
--- a/slixfeed/fetch.py
+++ b/slixfeed/fetch.py
@ -28,9 +28,16 @@ from asyncio import TimeoutError
 # from asyncio.exceptions import IncompleteReadError
 # from bs4 import BeautifulSoup
 # from http.client import IncompleteRead
 import logging
 # from lxml import html
 import slixfeed.config as config
 # from xml.etree.ElementTree import ElementTree, ParseError
 import slixfeed.config as config
 try:
    from magnet2torrent import Magnet2Torrent, FailedToFetchException
 except:
    logging.info(
        "Package magnet2torrent was not found.\n"
        "BitTorrent is disabled.")
 # async def dat():
@ -105,3 +112,11 @@ async def http(url):
                False, "Timeout: " + str(e)
                ]
    return msg
 async def magnet(link):
    """
    Retrieve torrent data for a magnet link using magnet2torrent.

    Parameters
    ----------
    link : str
        Magnet URI.

    Returns
    -------
    torrent_data
        Torrent data as returned by Magnet2Torrent.retrieve_torrent,
        or None if retrieval failed or package magnet2torrent is not
        available.
    """
    # The import of magnet2torrent is optional (see top of module);
    # when it failed, BitTorrent is disabled rather than an error.
    if "Magnet2Torrent" not in globals():
        logging.debug(
            "Package magnet2torrent was not found.")
        return None
    m2t = Magnet2Torrent(link)
    try:
        _filename, torrent_data = await m2t.retrieve_torrent()
    except FailedToFetchException:
        logging.debug("Failed")
        return None
    # Hand the data back to the caller; previously it was discarded.
    return torrent_data
--- a/slixfeed/sqlite.py
+++ b/slixfeed/sqlite.py
@ -129,6 +129,7 @@ def create_tables(db_file):
                id INTEGER NOT NULL,
                title TEXT NOT NULL,
                link TEXT NOT NULL,
                enclosure TEXT,
                entry_id TEXT NOT NULL,
                feed_id INTEGER NOT NULL,
                timestamp TEXT,
@ -146,6 +147,7 @@ def create_tables(db_file):
                id INTEGER NOT NULL,
                title TEXT NOT NULL,
                link TEXT NOT NULL,
                enclosure TEXT,
                entry_id TEXT NOT NULL,
                feed_id INTEGER NOT NULL,
                timestamp TEXT,
@ -486,7 +488,8 @@ async def remove_feed_by_url(db_file, url):
            cur = conn.cursor()
            sql = (
                """
-                DELETE FROM feeds
+                DELETE
                FROM feeds
                WHERE url = ?
                """
                )
@ -556,7 +559,8 @@ async def get_feed_id_and_name(db_file, url):
    result : list
        List of ID and Name of feed.
    """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT id, name
@ -677,11 +681,11 @@ async def get_unread_entries(db_file, num):
        cur = conn.cursor()
        sql = (
            """
-            SELECT id, title, link, feed_id, timestamp
+            SELECT id, title, link, enclosure, feed_id, timestamp
            FROM entries
            WHERE read = 0
            UNION ALL
-            SELECT id, title, link, feed_id, timestamp
+            SELECT id, title, link, enclosure, feed_id, timestamp
            FROM archive
            ORDER BY timestamp
            DESC LIMIT :num
@ -861,17 +865,9 @@ def get_entry_url(db_file, ix):
        return url
-def get_feed_url(db_file, ix):
+def get_feed_url(db_file, feed_id):
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = ( # TODO Handletable archive too
            """
            SELECT feed_id
            FROM entries
            WHERE id = :ix
            """
            )
        feed_id = cur.execute(sql, (ix,)).fetchone()[0]
        sql = (
            """
            SELECT url
@ -1152,14 +1148,15 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
                    """
                    INSERT
                    INTO entries(
-                        title, link, entry_id, feed_id, timestamp, read)
+                        title, link, enclosure, entry_id, feed_id, timestamp, read)
                    VALUES(
-                        :title, :link, :entry_id, :feed_id, :timestamp, :read)
+                        :title, :link, :enclosure, :entry_id, :feed_id, :timestamp, :read)
                    """
                    )
                cur.execute(sql, {
                    "title": entry["title"],
                    "link": entry["link"],
                    "enclosure": entry["enclosure"],
                    "entry_id": entry["entry_id"],
                    "feed_id": feed_id,
                    "timestamp": entry["date"],
@ -1338,10 +1335,12 @@ async def maintain_archive(db_file, limit):
                    """
                    DELETE FROM archive
                    WHERE id
-                    IN (SELECT id
+                    IN (
                        SELECT id
                        FROM archive
                        ORDER BY timestamp ASC
-                    LIMIT :difference)
+                        LIMIT :difference
                        )
                    """
                    )
                cur.execute(sql, {
@ -1452,7 +1451,8 @@ async def get_feeds(db_file):
    #    Select name, url (feeds) updated, enabled, feed_id (status)
    # 2) Sort feeds by id. Sort status by feed_id
    # results += cur.execute(sql).fetchall()
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT name, url, id
@ -1479,7 +1479,8 @@ async def last_entries(db_file, num):
    titles_list : str
        List of recent N entries as message.
    """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        # sql = (
        #     "SELECT title, link "
        #     "FROM entries "
@ -1520,7 +1521,8 @@ async def search_feeds(db_file, query):
    titles_list : str
        Feeds of specified keywords as message.
    """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT name, id, url
@ -1551,7 +1553,8 @@ async def search_entries(db_file, query):
    titles_list : str
        Entries of specified keywords as message.
    """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT title, link
@ -1619,7 +1622,8 @@ async def check_entry_exist(
    bool
        True or None.
    """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
        cur = conn.cursor()
        exist = False
        if entry_id:
            feed_id = get_feed_id(cur, url)
@ -1627,9 +1631,7 @@ async def check_entry_exist(
                """
                SELECT id
                FROM entries
-            WHERE
+                WHERE entry_id = :entry_id and feed_id = :feed_id
            entry_id = :entry_id and
            feed_id = :feed_id
                """
                )
            result = cur.execute(sql, {
@ -1642,10 +1644,7 @@ async def check_entry_exist(
                """
                SELECT id
                FROM entries
-            WHERE
+                WHERE title = :title and link = :link and timestamp = :date
            title = :title and
            link = :link and
            timestamp = :date
                """
                )
            try:
@ -1663,9 +1662,7 @@ async def check_entry_exist(
                """
                SELECT id
                FROM entries
-            WHERE
+                WHERE title = :title and link = :link
            title = :title and
            link = :link
                """
                )
            result = cur.execute(sql, {
--- a/slixfeed/task.py
+++ b/slixfeed/task.py
@ -227,46 +227,60 @@ async def send_update(self, jid, num=None):
            num = int(num)
        news_digest = []
        results = await get_unread_entries(db_file, num)
-        image_url = None
+        news_digest = ''
        media = None
        chat_type = await utility.jid_type(self, jid)
        for result in results:
            ix = result[0]
            title_e = result[1]
            url = result[2]
-            feed_id = result[3]
+            enclosure = result[3]
-            date = result[4]
+            feed_id = result[4]
            date = result[5]
            title_f = get_feed_title(db_file, feed_id)
-            news_item = action.list_unread_entries(result, title_f)
+            news_digest += action.list_unread_entries(result, title_f)
            news_digest.extend([news_item])
            # print(db_file)
            # print(result[0])
            # breakpoint()
-            await mark_as_read(db_file, result[0])
+            await mark_as_read(db_file, ix)
-            if not image_url:
+
-                image_url = await action.extract_image_from_feed(
+            # Find media
-                    db_file, ix, url)
+            if url.startswith("magnet:"):
-            if not image_url:
+                media = action.get_magnet(url)
-                image_url = await action.extract_image_from_html(url)
+            elif enclosure.startswith("magnet:"):
-            print("image_url")
+                media = action.get_magnet(enclosure)
-            print(image_url)
+            elif enclosure:
-        new = " ".join(news_digest)
+                media = enclosure
-        # breakpoint()
+            else:
-        if new:
+                media = await action.extract_image_from_html(url)
            if media and news_digest:
                # Send textual message
                xmpp.Slixfeed.send_message(
                    self, mto=jid, mbody=news_digest, mtype=chat_type)
                news_digest = ''
                # Send media
                message = xmpp.Slixfeed.make_message(
                    self, mto=jid, mbody=media, mtype=chat_type)
                message['oob']['url'] = media
                message.send()
                media = None
        if news_digest:
            # TODO Add while loop to assure delivery.
            # print(await current_time(), ">>> ACT send_message",jid)
            chat_type = await utility.jid_type(self, jid)
            # NOTE Do we need "if statement"? See NOTE at is_muc.
            if chat_type in ("chat", "groupchat"):
                # TODO Provide a choice (with or without images)
                xmpp.Slixfeed.send_message(
-                    self, mto=jid, mbody=new, mtype=chat_type)
+                    self, mto=jid, mbody=news_digest, mtype=chat_type)
-                if image_url:
+        # if media:
        #     # message = xmpp.Slixfeed.make_message(
        #     #     self, mto=jid, mbody=new, mtype=chat_type)
        #     message = xmpp.Slixfeed.make_message(
-                    #     self, mto=jid, mbody=new, mtype=chat_type)
+        #         self, mto=jid, mbody=media, mtype=chat_type)
-                    message = xmpp.Slixfeed.make_message(
+        #     message['oob']['url'] = media
-                        self, mto=jid, mbody=image_url, mtype=chat_type)
+        #     message.send()
                    message['oob']['url'] = image_url
                    print(image_url)
                    message.send()
        # TODO Do not refresh task before
        # verifying that it was completed.
--- a/slixfeed/xmpp/process.py
+++ b/slixfeed/xmpp/process.py
@ -18,6 +18,7 @@ TODO
 """
 import slixfeed.fetch as fetch
 import logging
 import os
 import slixfeed.action as action
@ -451,7 +452,7 @@ async def message(self, message):
                    status_type = "dnd"
                    status_message = (
                        "📃️ Procesing request to produce {} document..."
-                        ).format(ext)
+                        ).format(ext.upper())
                    send_status_message(
                        self, jid, status_type, status_message)
                    db_file = get_pathname_to_database(jid)
@ -461,27 +462,43 @@ async def message(self, message):
                            os.mkdir(data_dir)
                        if not os.path.isdir(data_dir + '/readability'):
                            os.mkdir(data_dir + '/readability')
                        filename = os.path.join(
                            data_dir, "readability", "saved_article_" + timestamp() + "." + ext)
                        try:
                            ix = int(ix_url)
                            try:
                                url = sqlite.get_entry_url(db_file, ix)
                            except:
-                                response = "No entry Id with {}".format(ix)
+                                response = "No entry with Id {}".format(ix)
                        except:
                            url = ix_url
                        if url:
                            url = uri.remove_tracking_parameters(url)
                            url = (uri.replace_hostname(url, "link")) or url
-                            status = await action.generate_document(url, ext, filename)
+                            result = await fetch.http(url)
                            data = result[0]
                            code = result[1]
                            if data:
                                title = action.get_document_title(data)
                                title = title.strip().lower()
                                for i in (" ", "-"):
                                    title = title.replace(i, "_")
                                for i in ("?", "'", "!"):
                                    title = title.replace(i, "")
                                filename = os.path.join(
                                    data_dir, "readability",
                                    title + "_" + timestamp() + "." + ext)
                                error = action.generate_document(
                                    data, url, ext, filename)
                                if status:
                                    response = (
                                        "Failed to export {}.  Reason: {}"
-                                    ).format(ext, status)
+                                        ).format(ext.upper(), error)
                                else:
                                    url = await upload.start(self, jid, filename)
                                    await send_oob_message(self, jid, url)
                            else:
                                response = (
                                    "Failed to fetch {}.  Reason: {}"
                                    ).format(url, code)
                        await task.start_tasks_xmpp(
                            self, jid, ["status"])
                    else: