From b06e1bc693b4f082361d5293350197299924b6b7 Mon Sep 17 00:00:00 2001 From: Schimon Jehudah Date: Tue, 9 Jan 2024 15:53:19 +0000 Subject: [PATCH] Add functionality to download articles --- setup.py | 3 ++ slixfeed/__main__.py | 18 +++++------ slixfeed/action.py | 34 +++++++++++++++++++- slixfeed/sqlite.py | 17 +++++++++- slixfeed/xmpp/{text.py => manual.py} | 26 ++++++++++------ slixfeed/xmpp/process.py | 46 +++++++++++++++++++++++++--- 6 files changed, 120 insertions(+), 24 deletions(-) rename slixfeed/xmpp/{text.py => manual.py} (92%) diff --git a/setup.py b/setup.py index 9d012fe..13e4313 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,10 @@ setup( 'aiohttp', 'bs4', 'feedparser', + 'html2text', 'lxml', + 'pdfkit', + 'readability-lxml', 'slixmpp' ], classifiers=[ diff --git a/slixfeed/__main__.py b/slixfeed/__main__.py index cb134c8..2fbef5e 100644 --- a/slixfeed/__main__.py +++ b/slixfeed/__main__.py @@ -42,33 +42,33 @@ TODO 9.1) IDEA: Bot to display Title and Excerpt (including sending a PDF version of it) of posted link -10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT). - -11) Fetch summary from URL, instead of storing summary, or +10) Fetch summary from URL, instead of storing summary, or Store 5 upcoming summaries. This would help making the database files smaller. -12) Support protocol Gopher +11) Support protocol Gopher See project /michael-lazar/pygopherd See project /gopherball/gb -13) Support ActivityPub @person@domain (see Tip Of The Day). +12) Support ActivityPub @person@domain (see Tip Of The Day). -14) Tip Of The Day. +13) Tip Of The Day. Did you know that you can follow you favorite Mastodon feeds by just sending the URL address? Supported fediverse websites are: Akkoma, Firefish (Calckey), Friendica, HubZilla, Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox. 
-15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger +14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger -16) See project /offpunk/offblocklist.py +15) See project /offpunk/offblocklist.py -18) Search messages of government regulated publishers, and promote other sources. +16) Search messages of government regulated publishers, and promote other sources. Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds. However, you might want to get news from (1) (2) and (3) instead! +17) Make the program portable (directly use the directory assets) -- Thorsten + """ # vars and their meanings: diff --git a/slixfeed/action.py b/slixfeed/action.py index 084f0b1..b278c8c 100644 --- a/slixfeed/action.py +++ b/slixfeed/action.py @@ -12,9 +12,12 @@ TODO from asyncio.exceptions import IncompleteReadError from bs4 import BeautifulSoup +import html2text from http.client import IncompleteRead from feedparser import parse import logging +import pdfkit +from readability import Document import slixfeed.config as config import slixfeed.crawl as crawl from slixfeed.datetime import ( @@ -656,9 +659,38 @@ async def scan(db_file, url): db_file, title, link, entry_id, url, date, read_status) await sqlite.set_date(db_file, url) - +async def get_content(db_file, ix): + url = sqlite.get_entry_url(db_file, ix) + result = await fetch.download_feed(url) + if result[0]: + document = Document(result[0]) + return document.summary() + # TODO Either adapt it to filename + # or change it to something else + #filename = document.title() + # with open(filename, 'w') as file: + # html_doc = document.summary() + # file.write(html_doc) + + +def generate_html(text, filename): + with open(filename, 'w') as file: + file.write(text) + + +def generate_pdf(text, filename): + pdfkit.from_string(text, filename) + + +def generate_markdown(text, filename): + h2m = html2text.HTML2Text() + # Convert HTML to Markdown + markdown = h2m.handle(text) + with open(filename, 'w') as file: + 
file.write(markdown) + # NOTE Why (if res[0]) and (if res[1] == 200)? async def organize_items(db_file, urls): diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py index f634306..d571baa 100644 --- a/slixfeed/sqlite.py +++ b/slixfeed/sqlite.py @@ -700,7 +700,7 @@ async def archive_entry(db_file, ix): "ERROR DB deleting items from " "table entries at index", ix ) - + def get_feed_title(db_file, ix): with create_connection(db_file) as conn: @@ -716,6 +716,21 @@ def get_feed_title(db_file, ix): return title +# TODO Handle table archive too +def get_entry_url(db_file, ix): + with create_connection(db_file) as conn: + cur = conn.cursor() + sql = ( + """ + SELECT link + FROM entries + WHERE id = :ix + """ + ) + url = cur.execute(sql, (ix,)).fetchone()[0] + return url + + async def mark_as_read(db_file, ix): async with DBLOCK: with create_connection(db_file) as conn: diff --git a/slixfeed/xmpp/text.py b/slixfeed/xmpp/manual.py similarity index 92% rename from slixfeed/xmpp/text.py rename to slixfeed/xmpp/manual.py index 8afd583..01c79c3 100644 --- a/slixfeed/xmpp/text.py +++ b/slixfeed/xmpp/manual.py @@ -62,10 +62,10 @@ def print_info(): " Thorsten Mühlfelder (SalixOS, Germany)," "\n" " Yann Leboulanger (Gajim, France).\n" + "\n" "COPYRIGHT\n" " Slixfeed is free software; you can redistribute it and/or\n" - " modify it under the terms of the GNU General Public License\n" - " as published by the Free Software Foundation; version 3 only\n" + " modify it under the terms of the MIT License.\n" "\n" " Slixfeed is distributed in the hope that it will be useful,\n" " but WITHOUT ANY WARRANTY; without even the implied warranty of\n" @@ -117,6 +117,9 @@ def print_help(): " Add to subscription list.\n" " add TITLE\n" " Add to subscription list (without validity check).\n" + " get \n" + " Send an article as file. Specify and ." 
+ " Supported types are HTML, MD and PDF (default).\n" " join \n" " Join specified groupchat.\n" " read \n" @@ -177,8 +180,10 @@ def print_help(): "EDIT OPTIONS\n" " remove \n" " Remove feed of from subscription list.\n" - " status \n" - " Toggle update status of feed of .\n" + " disable \n" + " Disable updates for feed of .\n" + " enable \n" + " Enable updates for feed of .\n" "\n" "SEARCH OPTIONS\n" " feeds\n" @@ -198,16 +203,16 @@ def print_help(): # " unread\n" # " Print number of unread news items.\n" # "\n" - # "BACKUP OPTIONS\n" - # " export opml\n" - # " Send an OPML file with your feeds.\n" + "BACKUP OPTIONS\n" + " export opml\n" + " Send an OPML file with feeds.\n" # " backup news html\n" # " Send an HTML formatted file of your news items.\n" # " backup news md\n" # " Send a Markdown file of your news items.\n" # " backup news text\n" # " Send a Plain Text file of your news items.\n" - # "\n" + "\n" "SUPPORT\n" " commands\n" " Print list of commands.\n" @@ -247,8 +252,12 @@ def print_cmd(): "allow - : Delete keywords from allow list (comma separates).\n" "deny + : Keywords to block (comma separates).\n" "deny - : Delete keywords from deny list (comma separates).\n" + "disable : Disable updates for feed of .\n" + "enable : Enable updates for feed of .\n" + "export opml : Send an OPML file with feeds.\n" "feeds : List all subscriptions.\n" "feeds : Search subscriptions by given .\n" + "get : Send an article as file. Specify and . Supported types are HTML, MD and PDF (default).\n" "interval : Set interval update to every minutes.\n" "join : Join specified groupchat.\n" "length : Set maximum length of news item description. 
(0 for no limit)\n" @@ -264,7 +273,6 @@ def print_cmd(): "remove : Remove feed from subscription list.\n" "search : Search news items by given .\n" "start : Enable bot and send updates.\n" - "status : Toggle update status of feed.\n" "stop : Disable bot and stop updates.\n" "```" ) diff --git a/slixfeed/xmpp/process.py b/slixfeed/xmpp/process.py index 30459b1..954f953 100644 --- a/slixfeed/xmpp/process.py +++ b/slixfeed/xmpp/process.py @@ -34,7 +34,7 @@ import slixfeed.task as task import slixfeed.url as uri import slixfeed.xmpp.bookmark as bookmark import slixfeed.xmpp.muc as groupchat -import slixfeed.xmpp.text as text +import slixfeed.xmpp.manual as manual import slixfeed.xmpp.upload as upload from slixfeed.xmpp.utility import jid_type @@ -195,13 +195,13 @@ async def message(self, message): # ) # send_reply_message(self, message, response) case "commands": - response = text.print_cmd() + response = manual.print_cmd() send_reply_message(self, message, response) case "help": - response = text.print_help() + response = manual.print_help() send_reply_message(self, message, response) case "info": - response = text.print_info() + response = manual.print_info() send_reply_message(self, message, response) case _ if message_lowercase in [ "greetings", "hallo", "hello", "hey", @@ -438,6 +438,44 @@ async def message(self, message): message_lowercase.startswith("gopher:")): response = "Gemini and Gopher are not supported yet." 
send_reply_message(self, message, response) + # TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT + case _ if (message_lowercase.startswith("get ")): + message_text = message_text[4:] + ix = message_text.split(" ")[0] + ex = " ".join(message_text.split(" ")[1:]) + ex = ex if ex else 'pdf' + db_file = get_pathname_to_database(jid) + data_dir = get_default_data_directory() + if ix: + if not os.path.isdir(data_dir): + os.mkdir(data_dir) + if not os.path.isdir(data_dir + '/readability'): + os.mkdir(data_dir + '/readability') + filename = os.path.join( + data_dir, "readability", "saved_article_" + timestamp() + "." + ex) + try: + text = await action.get_content(db_file, ix) + except: + response = "No entry Id with {}".format(ix) + if text: + match ex: + case "html": + action.generate_html(text, filename) + case "md": + action.generate_markdown(text, filename) + case "pdf": + action.generate_pdf(text, filename) + url = await upload.start( + self, jid, filename) + print(url) + await send_oob_message( + self, jid, url) + else: + response = "Failed to fetch resource." + else: + response = "Missing entry Id." + if response: + send_reply_message(self, message, response) # case _ if (message_lowercase.startswith("http")) and( # message_lowercase.endswith(".opml")): # url = message_text