Add functionality to download articles

This commit is contained in:
Schimon Jehudah 2024-01-09 15:53:19 +00:00
parent 9709c052ee
commit b06e1bc693
6 changed files with 120 additions and 24 deletions

View file

@ -16,7 +16,10 @@ setup(
'aiohttp',
'bs4',
'feedparser',
'html2text',
'lxml',
'pdfkit',
'readability-lxml',
'slixmpp'
],
classifiers=[

View file

@ -42,33 +42,33 @@ TODO
9.1) IDEA: Bot to display Title and Excerpt
(including sending a PDF version of it) of posted link
10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT).
11) Fetch summary from URL, instead of storing summary, or
10) Fetch summary from URL, instead of storing summary, or
Store 5 upcoming summaries.
This would help making the database files smaller.
12) Support protocol Gopher
11) Support protocol Gopher
See project /michael-lazar/pygopherd
See project /gopherball/gb
13) Support ActivityPub @person@domain (see Tip Of The Day).
12) Support ActivityPub @person@domain (see Tip Of The Day).
14) Tip Of The Day.
13) Tip Of The Day.
Did you know that you can follow your favorite Mastodon feeds by just
sending the URL address?
Supported fediverse websites are:
Akkoma, Firefish (Calckey), Friendica, HubZilla,
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
16) See project /offpunk/offblocklist.py
15) See project /offpunk/offblocklist.py
18) Search messages of government regulated publishers, and promote other sources.
16) Search messages of government regulated publishers, and promote other sources.
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
However, you might want to get news from (1) (2) and (3) instead!
17) Make the program portable (directly use the directory assets) -- Thorsten
"""
# vars and their meanings:

View file

@ -12,9 +12,12 @@ TODO
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
import html2text
from http.client import IncompleteRead
from feedparser import parse
import logging
import pdfkit
from readability import Document
import slixfeed.config as config
import slixfeed.crawl as crawl
from slixfeed.datetime import (
@ -656,9 +659,38 @@ async def scan(db_file, url):
db_file, title, link, entry_id,
url, date, read_status)
await sqlite.set_date(db_file, url)
async def get_content(db_file, ix):
    """
    Fetch and extract the readable article content of an entry.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str or int
        Id of the entry whose link will be downloaded.

    Returns
    -------
    str or None
        Extracted HTML of the article body, or None when the
        download yielded no data.
    """
    url = sqlite.get_entry_url(db_file, ix)
    result = await fetch.download_feed(url)
    if result[0]:
        # readability's Document isolates the main article body
        # from the surrounding page chrome.
        document = Document(result[0])
        return document.summary()
    # Download failed or returned no payload: signal it explicitly.
    # TODO Save the extracted article to a file named after
    #      document.title() (adapted to a safe filename).
    return None
def generate_html(text, filename):
    """
    Write HTML text to a file.

    Parameters
    ----------
    text : str
        HTML markup to save.
    filename : str
        Destination path.
    """
    # Explicit UTF-8: article content is arbitrary Unicode, and the
    # locale-dependent default encoding can raise UnicodeEncodeError.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
def generate_pdf(text, filename):
    # Render HTML text to a PDF file via pdfkit (wkhtmltopdf wrapper).
    # NOTE(review): requires the wkhtmltopdf binary at runtime — pdfkit
    # raises OSError when it is not installed; confirm deployment docs.
    pdfkit.from_string(text, filename)
def generate_markdown(text, filename):
    """
    Convert HTML text to Markdown and write it to a file.

    Parameters
    ----------
    text : str
        HTML markup to convert.
    filename : str
        Destination path.
    """
    h2m = html2text.HTML2Text()
    # Convert HTML to Markdown
    markdown = h2m.handle(text)
    # Explicit UTF-8: keeps output readable regardless of the
    # process locale (matches generate_html).
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls):

View file

@ -700,7 +700,7 @@ async def archive_entry(db_file, ix):
"ERROR DB deleting items from "
"table entries at index", ix
)
def get_feed_title(db_file, ix):
with create_connection(db_file) as conn:
@ -716,6 +716,21 @@ def get_feed_title(db_file, ix):
return title
# TODO Handle table archive too
def get_entry_url(db_file, ix):
    """
    Get URL of entry of given id.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str or int
        Id of entry in table entries.

    Returns
    -------
    str or None
        Link of the entry, or None when no entry has that id.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT link
            FROM entries
            WHERE id = :ix
            """
        )
        # Bind by name: supplying a plain tuple for a named placeholder
        # is deprecated since Python 3.12 and an error in 3.14.
        row = cur.execute(sql, {"ix": ix}).fetchone()
        # fetchone() returns None when no row matches; avoid TypeError
        # from indexing None.
        return row[0] if row else None
async def mark_as_read(db_file, ix):
async with DBLOCK:
with create_connection(db_file) as conn:

View file

@ -62,10 +62,10 @@ def print_info():
" Thorsten Mühlfelder (SalixOS, Germany),"
"\n"
" Yann Leboulanger (Gajim, France).\n"
"\n"
"COPYRIGHT\n"
" Slixfeed is free software; you can redistribute it and/or\n"
" modify it under the terms of the GNU General Public License\n"
" as published by the Free Software Foundation; version 3 only\n"
" modify it under the terms of the MIT License.\n"
"\n"
" Slixfeed is distributed in the hope that it will be useful,\n"
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
@ -117,6 +117,9 @@ def print_help():
" Add <url> to subscription list.\n"
" add <url> TITLE\n"
" Add <url> to subscription list (without validity check).\n"
" get <id> <type>\n"
" Send an article as file. Specify <id> and <type>."
" Supported types are HTML, MD and PDF (default).\n"
" join <muc>\n"
" Join specified groupchat.\n"
" read <url>\n"
@ -177,8 +180,10 @@ def print_help():
"EDIT OPTIONS\n"
" remove <id>\n"
" Remove feed of <id> from subscription list.\n"
" status <id>\n"
" Toggle update status of feed of <id>.\n"
" disable <id>\n"
" Disable updates for feed of <id>.\n"
" enable <id>\n"
" Enable updates for feed of <id>.\n"
"\n"
"SEARCH OPTIONS\n"
" feeds\n"
@ -198,16 +203,16 @@ def print_help():
# " unread\n"
# " Print number of unread news items.\n"
# "\n"
# "BACKUP OPTIONS\n"
# " export opml\n"
# " Send an OPML file with your feeds.\n"
"BACKUP OPTIONS\n"
" export opml\n"
" Send an OPML file with feeds.\n"
# " backup news html\n"
# " Send an HTML formatted file of your news items.\n"
# " backup news md\n"
# " Send a Markdown file of your news items.\n"
# " backup news text\n"
# " Send a Plain Text file of your news items.\n"
# "\n"
"\n"
"SUPPORT\n"
" commands\n"
" Print list of commands.\n"
@ -247,8 +252,12 @@ def print_cmd():
"allow - : Delete keywords from allow list (comma separates).\n"
"deny + : Keywords to block (comma separates).\n"
"deny - : Delete keywords from deny list (comma separates).\n"
"disable <id> : Disable updates for feed of <id>.\n"
"enable <id> : Enable updates for feed of <id>.\n"
"export opml : Send an OPML file with feeds.\n"
"feeds : List all subscriptions.\n"
"feeds <text> : Search subscriptions by given <text>.\n"
"get <id> <type> : Send an article as file. Specify <id> and <type>. Supported types are HTML, MD and PDF (default).\n"
"interval <n> : Set interval update to every <n> minutes.\n"
"join <muc> : Join specified groupchat.\n"
"length : Set maximum length of news item description. (0 for no limit)\n"
@ -264,7 +273,6 @@ def print_cmd():
"remove <id> : Remove feed from subscription list.\n"
"search <text> : Search news items by given <text>.\n"
"start : Enable bot and send updates.\n"
"status <id> : Toggle update status of feed.\n"
"stop : Disable bot and stop updates.\n"
"```"
)

View file

@ -34,7 +34,7 @@ import slixfeed.task as task
import slixfeed.url as uri
import slixfeed.xmpp.bookmark as bookmark
import slixfeed.xmpp.muc as groupchat
import slixfeed.xmpp.text as text
import slixfeed.xmpp.manual as manual
import slixfeed.xmpp.upload as upload
from slixfeed.xmpp.utility import jid_type
@ -195,13 +195,13 @@ async def message(self, message):
# )
# send_reply_message(self, message, response)
case "commands":
response = text.print_cmd()
response = manual.print_cmd()
send_reply_message(self, message, response)
case "help":
response = text.print_help()
response = manual.print_help()
send_reply_message(self, message, response)
case "info":
response = text.print_info()
response = manual.print_info()
send_reply_message(self, message, response)
case _ if message_lowercase in [
"greetings", "hallo", "hello", "hey",
@ -438,6 +438,44 @@ async def message(self, message):
message_lowercase.startswith("gopher:")):
response = "Gemini and Gopher are not supported yet."
send_reply_message(self, message, response)
# TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT
case _ if (message_lowercase.startswith("get ")):
message_text = message_text[4:]
ix = message_text.split(" ")[0]
ex = " ".join(message_text.split(" ")[1:])
ex = ex if ex else 'pdf'
db_file = get_pathname_to_database(jid)
data_dir = get_default_data_directory()
if ix:
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
if not os.path.isdir(data_dir + '/readability'):
os.mkdir(data_dir + '/readability')
filename = os.path.join(
data_dir, "readability", "saved_article_" + timestamp() + "." + ex)
try:
text = await action.get_content(db_file, ix)
except:
response = "No entry Id with {}".format(ix)
if text:
match ex:
case "html":
action.generate_html(text, filename)
case "md":
action.generate_markdown(text, filename)
case "pdf":
action.generate_pdf(text, filename)
url = await upload.start(
self, jid, filename)
print(url)
await send_oob_message(
self, jid, url)
else:
response = "Failed to fetch resource."
else:
response = "Missing entry Id."
if response:
send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")):
# url = message_text