Add functionality to download articles

This commit is contained in:
Schimon Jehudah 2024-01-09 15:53:19 +00:00
parent 9709c052ee
commit b06e1bc693
6 changed files with 120 additions and 24 deletions

View file

@ -16,7 +16,10 @@ setup(
'aiohttp',
'bs4',
'feedparser',
'html2text',
'lxml',
'pdfkit',
'readability-lxml',
'slixmpp'
],
classifiers=[

View file

@ -42,33 +42,33 @@ TODO
9.1) IDEA: Bot to display Title and Excerpt
(including sending a PDF version of it) of posted link
10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT).
11) Fetch summary from URL, instead of storing summary, or
10) Fetch summary from URL, instead of storing summary, or
Store 5 upcoming summaries.
This would help making the database files smaller.
12) Support protocol Gopher
11) Support protocol Gopher
See project /michael-lazar/pygopherd
See project /gopherball/gb
13) Support ActivityPub @person@domain (see Tip Of The Day).
12) Support ActivityPub @person@domain (see Tip Of The Day).
14) Tip Of The Day.
13) Tip Of The Day.
Did you know that you can follow your favorite Mastodon feeds by just
sending the URL address?
Supported fediverse websites are:
Akkoma, Firefish (Calckey), Friendica, HubZilla,
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
16) See project /offpunk/offblocklist.py
15) See project /offpunk/offblocklist.py
18) Search messages of government regulated publishers, and promote other sources.
16) Search messages of government regulated publishers, and promote other sources.
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
However, you might want to get news from (1) (2) and (3) instead!
17) Make the program portable (directly use the directory assets) -- Thorsten
"""
# vars and their meanings:

View file

@ -12,9 +12,12 @@ TODO
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
import html2text
from http.client import IncompleteRead
from feedparser import parse
import logging
import pdfkit
from readability import Document
import slixfeed.config as config
import slixfeed.crawl as crawl
from slixfeed.datetime import (
@ -656,9 +659,38 @@ async def scan(db_file, url):
db_file, title, link, entry_id,
url, date, read_status)
await sqlite.set_date(db_file, url)
async def get_content(db_file, ix):
    """
    Fetch and extract the readable article content of an entry.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str or int
        Id of the entry whose link will be downloaded.

    Returns
    -------
    str or None
        Extracted HTML of the article body, or None when the
        download yielded no data.
    """
    url = sqlite.get_entry_url(db_file, ix)
    result = await fetch.download_feed(url)
    if result[0]:
        # readability's Document isolates the main article body
        # from the surrounding page chrome.
        document = Document(result[0])
        return document.summary()
    # Download failed or returned no payload: signal it explicitly.
    # TODO Save the extracted article to a file named after
    #      document.title() (adapted to a safe filename).
    return None
def generate_html(text, filename):
    """
    Write HTML text to a file.

    Parameters
    ----------
    text : str
        HTML markup to save.
    filename : str
        Destination path.
    """
    # Explicit UTF-8: article content is arbitrary Unicode, and the
    # locale-dependent default encoding can raise UnicodeEncodeError.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
def generate_pdf(text, filename):
    # Render HTML text to a PDF file via pdfkit (wkhtmltopdf wrapper).
    # NOTE(review): requires the wkhtmltopdf binary at runtime — pdfkit
    # raises OSError when it is not installed; confirm deployment docs.
    pdfkit.from_string(text, filename)
def generate_markdown(text, filename):
    """
    Convert HTML text to Markdown and write it to a file.

    Parameters
    ----------
    text : str
        HTML markup to convert.
    filename : str
        Destination path.
    """
    h2m = html2text.HTML2Text()
    # Convert HTML to Markdown
    markdown = h2m.handle(text)
    # Explicit UTF-8: keeps output readable regardless of the
    # process locale (matches generate_html).
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls):

View file

@ -700,7 +700,7 @@ async def archive_entry(db_file, ix):
"ERROR DB deleting items from "
"table entries at index", ix
)
def get_feed_title(db_file, ix):
with create_connection(db_file) as conn:
@ -716,6 +716,21 @@ def get_feed_title(db_file, ix):
return title
# TODO Handle table archive too
def get_entry_url(db_file, ix):
    """
    Get URL of entry of given id.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str or int
        Id of entry in table entries.

    Returns
    -------
    str or None
        Link of the entry, or None when no entry has that id.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT link
            FROM entries
            WHERE id = :ix
            """
        )
        # Bind by name: supplying a plain tuple for a named placeholder
        # is deprecated since Python 3.12 and an error in 3.14.
        row = cur.execute(sql, {"ix": ix}).fetchone()
        # fetchone() returns None when no row matches; avoid TypeError
        # from indexing None.
        return row[0] if row else None
async def mark_as_read(db_file, ix):
async with DBLOCK:
with create_connection(db_file) as conn:

View file

@ -62,10 +62,10 @@ def print_info():
" Thorsten Mühlfelder (SalixOS, Germany),"
"\n"
" Yann Leboulanger (Gajim, France).\n"
"\n"
"COPYRIGHT\n"
" Slixfeed is free software; you can redistribute it and/or\n"
" modify it under the terms of the GNU General Public License\n"
" as published by the Free Software Foundation; version 3 only\n"
" modify it under the terms of the MIT License.\n"
"\n"
" Slixfeed is distributed in the hope that it will be useful,\n"
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
@ -117,6 +117,9 @@ def print_help():
" Add <url> to subscription list.\n"
" add <url> TITLE\n"
" Add <url> to subscription list (without validity check).\n"
" get <id> <type>\n"
" Send an article as file. Specify <id> and <type>."
" Supported types are HTML, MD and PDF (default).\n"
" join <muc>\n"
" Join specified groupchat.\n"
" read <url>\n"
@ -177,8 +180,10 @@ def print_help():
"EDIT OPTIONS\n"
" remove <id>\n"
" Remove feed of <id> from subscription list.\n"
" status <id>\n"
" Toggle update status of feed of <id>.\n"
" disable <id>\n"
" Disable updates for feed of <id>.\n"
" enable <id>\n"
" Enable updates for feed of <id>.\n"
"\n"
"SEARCH OPTIONS\n"
" feeds\n"
@ -198,16 +203,16 @@ def print_help():
# " unread\n"
# " Print number of unread news items.\n"
# "\n"
# "BACKUP OPTIONS\n"
# " export opml\n"
# " Send an OPML file with your feeds.\n"
"BACKUP OPTIONS\n"
" export opml\n"
" Send an OPML file with feeds.\n"
# " backup news html\n"
# " Send an HTML formatted file of your news items.\n"
# " backup news md\n"
# " Send a Markdown file of your news items.\n"
# " backup news text\n"
# " Send a Plain Text file of your news items.\n"
# "\n"
"\n"
"SUPPORT\n"
" commands\n"
" Print list of commands.\n"
@ -247,8 +252,12 @@ def print_cmd():
"allow - : Delete keywords from allow list (comma separates).\n"
"deny + : Keywords to block (comma separates).\n"
"deny - : Delete keywords from deny list (comma separates).\n"
"disable <id> : Disable updates for feed of <id>.\n"
"enable <id> : Enable updates for feed of <id>.\n"
"export opml : Send an OPML file with feeds.\n"
"feeds : List all subscriptions.\n"
"feeds <text> : Search subscriptions by given <text>.\n"
"get <id> <type> : Send an article as file. Specify <id> and <type>. Supported types are HTML, MD and PDF (default).\n"
"interval <n> : Set interval update to every <n> minutes.\n"
"join <muc> : Join specified groupchat.\n"
"length : Set maximum length of news item description. (0 for no limit)\n"
@ -264,7 +273,6 @@ def print_cmd():
"remove <id> : Remove feed from subscription list.\n"
"search <text> : Search news items by given <text>.\n"
"start : Enable bot and send updates.\n"
"status <id> : Toggle update status of feed.\n"
"stop : Disable bot and stop updates.\n"
"```"
)

View file

@ -34,7 +34,7 @@ import slixfeed.task as task
import slixfeed.url as uri
import slixfeed.xmpp.bookmark as bookmark
import slixfeed.xmpp.muc as groupchat
import slixfeed.xmpp.text as text
import slixfeed.xmpp.manual as manual
import slixfeed.xmpp.upload as upload
from slixfeed.xmpp.utility import jid_type
@ -195,13 +195,13 @@ async def message(self, message):
# )
# send_reply_message(self, message, response)
case "commands":
response = text.print_cmd()
response = manual.print_cmd()
send_reply_message(self, message, response)
case "help":
response = text.print_help()
response = manual.print_help()
send_reply_message(self, message, response)
case "info":
response = text.print_info()
response = manual.print_info()
send_reply_message(self, message, response)
case _ if message_lowercase in [
"greetings", "hallo", "hello", "hey",
@ -438,6 +438,44 @@ async def message(self, message):
message_lowercase.startswith("gopher:")):
response = "Gemini and Gopher are not supported yet."
send_reply_message(self, message, response)
# TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT
case _ if (message_lowercase.startswith("get ")):
message_text = message_text[4:]
ix = message_text.split(" ")[0]
ex = " ".join(message_text.split(" ")[1:])
ex = ex if ex else 'pdf'
db_file = get_pathname_to_database(jid)
data_dir = get_default_data_directory()
if ix:
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
if not os.path.isdir(data_dir + '/readability'):
os.mkdir(data_dir + '/readability')
filename = os.path.join(
data_dir, "readability", "saved_article_" + timestamp() + "." + ex)
try:
text = await action.get_content(db_file, ix)
except:
response = "No entry Id with {}".format(ix)
if text:
match ex:
case "html":
action.generate_html(text, filename)
case "md":
action.generate_markdown(text, filename)
case "pdf":
action.generate_pdf(text, filename)
url = await upload.start(
self, jid, filename)
print(url)
await send_oob_message(
self, jid, url)
else:
response = "Failed to fetch resource."
else:
response = "Missing entry Id."
if response:
send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")):
# url = message_text