Add functionality to download articles

This commit is contained in:
Schimon Jehudah 2024-01-09 15:53:19 +00:00
parent 9709c052ee
commit b06e1bc693
6 changed files with 120 additions and 24 deletions

View file

@ -16,7 +16,10 @@ setup(
'aiohttp', 'aiohttp',
'bs4', 'bs4',
'feedparser', 'feedparser',
'html2text',
'lxml', 'lxml',
'pdfkit',
'readability-lxml',
'slixmpp' 'slixmpp'
], ],
classifiers=[ classifiers=[

View file

@ -42,33 +42,33 @@ TODO
9.1) IDEA: Bot to display Title and Excerpt 9.1) IDEA: Bot to display Title and Excerpt
(including sending a PDF version of it) of posted link (including sending a PDF version of it) of posted link
10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT). 10) Fetch summary from URL, instead of storing summary, or
11) Fetch summary from URL, instead of storing summary, or
Store 5 upcoming summaries. Store 5 upcoming summaries.
This would help making the database files smaller. This would help making the database files smaller.
12) Support protocol Gopher 11) Support protocol Gopher
See project /michael-lazar/pygopherd See project /michael-lazar/pygopherd
See project /gopherball/gb See project /gopherball/gb
13) Support ActivityPub @person@domain (see Tip Of The Day). 12) Support ActivityPub @person@domain (see Tip Of The Day).
14) Tip Of The Day. 13) Tip Of The Day.
Did you know that you can follow your favorite Mastodon feeds by just Did you know that you can follow your favorite Mastodon feeds by just
sending the URL address? sending the URL address?
Supported fediverse websites are: Supported fediverse websites are:
Akkoma, Firefish (Calckey), Friendica, HubZilla, Akkoma, Firefish (Calckey), Friendica, HubZilla,
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox. Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger 14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
16) See project /offpunk/offblocklist.py 15) See project /offpunk/offblocklist.py
18) Search messages of government regulated publishers, and promote other sources. 16) Search messages of government regulated publishers, and promote other sources.
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds. Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
However, you might want to get news from (1) (2) and (3) instead! However, you might want to get news from (1) (2) and (3) instead!
17) Make the program portable (directly use the directory assets) -- Thorsten
""" """
# vars and their meanings: # vars and their meanings:

View file

@ -12,9 +12,12 @@ TODO
from asyncio.exceptions import IncompleteReadError from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import html2text
from http.client import IncompleteRead from http.client import IncompleteRead
from feedparser import parse from feedparser import parse
import logging import logging
import pdfkit
from readability import Document
import slixfeed.config as config import slixfeed.config as config
import slixfeed.crawl as crawl import slixfeed.crawl as crawl
from slixfeed.datetime import ( from slixfeed.datetime import (
@ -656,9 +659,38 @@ async def scan(db_file, url):
db_file, title, link, entry_id, db_file, title, link, entry_id,
url, date, read_status) url, date, read_status)
await sqlite.set_date(db_file, url) await sqlite.set_date(db_file, url)
async def get_content(db_file, ix):
    """
    Fetch an entry's linked page and return its readable content.

    The link of entry ``ix`` is looked up with ``sqlite.get_entry_url``
    and downloaded with ``fetch.download_feed``.  When the first element
    of the download result is truthy, it is handed to readability's
    ``Document`` and the value of ``Document.summary()`` is returned.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index (id) of the entry.

    Returns
    -------
    The value of ``Document.summary()``, or None when the first element
    of the download result is falsy.
    """
    entry_url = sqlite.get_entry_url(db_file, ix)
    download = await fetch.download_feed(entry_url)
    payload = download[0]
    if not payload:
        return None
    # TODO Either adapt it to filename
    # or change it to something else
    #filename = document.title()
    # with open(filename, 'w') as file:
    #     html_doc = document.summary()
    #     file.write(html_doc)
    return Document(payload).summary()
def generate_html(text, filename):
    """
    Write an HTML document to a file.

    Parameters
    ----------
    text : str
        HTML content to write.
    filename : str
        Path of the file to create (an existing file is overwritten).
    """
    # Articles routinely contain non-ASCII characters; write UTF-8
    # explicitly instead of relying on the locale's default encoding,
    # which may fail with UnicodeEncodeError on some systems.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
def generate_pdf(text, filename):
    """
    Render an HTML string into a PDF file using pdfkit.

    Parameters
    ----------
    text : str
        HTML content to render.
    filename : str
        Path of the PDF file to create.
    """
    pdfkit.from_string(text, output_path=filename)
def generate_markdown(text, filename):
    """
    Convert an HTML string to Markdown and write it to a file.

    Parameters
    ----------
    text : str
        HTML content to convert.
    filename : str
        Path of the file to create (an existing file is overwritten).
    """
    h2m = html2text.HTML2Text()
    # Convert HTML to Markdown
    markdown = h2m.handle(text)
    # Write UTF-8 explicitly: the converted article may contain any
    # Unicode character, and the locale's default encoding may not be
    # able to represent it.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
# NOTE Why (if res[0]) and (if res[1] == 200)? # NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls): async def organize_items(db_file, urls):

View file

@ -700,7 +700,7 @@ async def archive_entry(db_file, ix):
"ERROR DB deleting items from " "ERROR DB deleting items from "
"table entries at index", ix "table entries at index", ix
) )
def get_feed_title(db_file, ix): def get_feed_title(db_file, ix):
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
@ -716,6 +716,21 @@ def get_feed_title(db_file, ix):
return title return title
# TODO Handle table archive too
def get_entry_url(db_file, ix):
    """
    Get the link (URL) of an entry from table entries.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index (id) of the entry.

    Returns
    -------
    url : str
        Value of column ``link`` for the matching row.

    Raises
    ------
    TypeError
        When no row matches ``ix`` (``fetchone()`` yields None, which is
        not subscriptable).
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT link
            FROM entries
            WHERE id = :ix
            """
            )
        # The statement uses the named placeholder ":ix", so bind it with
        # a mapping. Supplying a sequence for named placeholders is
        # deprecated since Python 3.12 and documented to raise
        # ProgrammingError starting with Python 3.14.
        url = cur.execute(sql, {"ix": ix}).fetchone()[0]
        return url
async def mark_as_read(db_file, ix): async def mark_as_read(db_file, ix):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:

View file

@ -62,10 +62,10 @@ def print_info():
" Thorsten Mühlfelder (SalixOS, Germany)," " Thorsten Mühlfelder (SalixOS, Germany),"
"\n" "\n"
" Yann Leboulanger (Gajim, France).\n" " Yann Leboulanger (Gajim, France).\n"
"\n"
"COPYRIGHT\n" "COPYRIGHT\n"
" Slixfeed is free software; you can redistribute it and/or\n" " Slixfeed is free software; you can redistribute it and/or\n"
" modify it under the terms of the GNU General Public License\n" " modify it under the terms of the MIT License.\n"
" as published by the Free Software Foundation; version 3 only\n"
"\n" "\n"
" Slixfeed is distributed in the hope that it will be useful,\n" " Slixfeed is distributed in the hope that it will be useful,\n"
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n" " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
@ -117,6 +117,9 @@ def print_help():
" Add <url> to subscription list.\n" " Add <url> to subscription list.\n"
" add <url> TITLE\n" " add <url> TITLE\n"
" Add <url> to subscription list (without validity check).\n" " Add <url> to subscription list (without validity check).\n"
" get <id> <type>\n"
" Send an article as file. Specify <id> and <type>."
" Supported types are HTML, MD and PDF (default).\n"
" join <muc>\n" " join <muc>\n"
" Join specified groupchat.\n" " Join specified groupchat.\n"
" read <url>\n" " read <url>\n"
@ -177,8 +180,10 @@ def print_help():
"EDIT OPTIONS\n" "EDIT OPTIONS\n"
" remove <id>\n" " remove <id>\n"
" Remove feed of <id> from subscription list.\n" " Remove feed of <id> from subscription list.\n"
" status <id>\n" " disable <id>\n"
" Toggle update status of feed of <id>.\n" " Disable updates for feed of <id>.\n"
" enable <id>\n"
" Enable updates for feed of <id>.\n"
"\n" "\n"
"SEARCH OPTIONS\n" "SEARCH OPTIONS\n"
" feeds\n" " feeds\n"
@ -198,16 +203,16 @@ def print_help():
# " unread\n" # " unread\n"
# " Print number of unread news items.\n" # " Print number of unread news items.\n"
# "\n" # "\n"
# "BACKUP OPTIONS\n" "BACKUP OPTIONS\n"
# " export opml\n" " export opml\n"
# " Send an OPML file with your feeds.\n" " Send an OPML file with feeds.\n"
# " backup news html\n" # " backup news html\n"
# " Send an HTML formatted file of your news items.\n" # " Send an HTML formatted file of your news items.\n"
# " backup news md\n" # " backup news md\n"
# " Send a Markdown file of your news items.\n" # " Send a Markdown file of your news items.\n"
# " backup news text\n" # " backup news text\n"
# " Send a Plain Text file of your news items.\n" # " Send a Plain Text file of your news items.\n"
# "\n" "\n"
"SUPPORT\n" "SUPPORT\n"
" commands\n" " commands\n"
" Print list of commands.\n" " Print list of commands.\n"
@ -247,8 +252,12 @@ def print_cmd():
"allow - : Delete keywords from allow list (comma separates).\n" "allow - : Delete keywords from allow list (comma separates).\n"
"deny + : Keywords to block (comma separates).\n" "deny + : Keywords to block (comma separates).\n"
"deny - : Delete keywords from deny list (comma separates).\n" "deny - : Delete keywords from deny list (comma separates).\n"
"disable <id> : Disable updates for feed of <id>.\n"
"enable <id> : Enable updates for feed of <id>.\n"
"export opml : Send an OPML file with feeds.\n"
"feeds : List all subscriptions.\n" "feeds : List all subscriptions.\n"
"feeds <text> : Search subscriptions by given <text>.\n" "feeds <text> : Search subscriptions by given <text>.\n"
"get <id> <type> : Send an article as file. Specify <id> and <type>. Supported types are HTML, MD and PDF (default).\n"
"interval <n> : Set interval update to every <n> minutes.\n" "interval <n> : Set interval update to every <n> minutes.\n"
"join <muc> : Join specified groupchat.\n" "join <muc> : Join specified groupchat.\n"
"length : Set maximum length of news item description. (0 for no limit)\n" "length : Set maximum length of news item description. (0 for no limit)\n"
@ -264,7 +273,6 @@ def print_cmd():
"remove <id> : Remove feed from subscription list.\n" "remove <id> : Remove feed from subscription list.\n"
"search <text> : Search news items by given <text>.\n" "search <text> : Search news items by given <text>.\n"
"start : Enable bot and send updates.\n" "start : Enable bot and send updates.\n"
"status <id> : Toggle update status of feed.\n"
"stop : Disable bot and stop updates.\n" "stop : Disable bot and stop updates.\n"
"```" "```"
) )

View file

@ -34,7 +34,7 @@ import slixfeed.task as task
import slixfeed.url as uri import slixfeed.url as uri
import slixfeed.xmpp.bookmark as bookmark import slixfeed.xmpp.bookmark as bookmark
import slixfeed.xmpp.muc as groupchat import slixfeed.xmpp.muc as groupchat
import slixfeed.xmpp.text as text import slixfeed.xmpp.manual as manual
import slixfeed.xmpp.upload as upload import slixfeed.xmpp.upload as upload
from slixfeed.xmpp.utility import jid_type from slixfeed.xmpp.utility import jid_type
@ -195,13 +195,13 @@ async def message(self, message):
# ) # )
# send_reply_message(self, message, response) # send_reply_message(self, message, response)
case "commands": case "commands":
response = text.print_cmd() response = manual.print_cmd()
send_reply_message(self, message, response) send_reply_message(self, message, response)
case "help": case "help":
response = text.print_help() response = manual.print_help()
send_reply_message(self, message, response) send_reply_message(self, message, response)
case "info": case "info":
response = text.print_info() response = manual.print_info()
send_reply_message(self, message, response) send_reply_message(self, message, response)
case _ if message_lowercase in [ case _ if message_lowercase in [
"greetings", "hallo", "hello", "hey", "greetings", "hallo", "hello", "hey",
@ -438,6 +438,44 @@ async def message(self, message):
message_lowercase.startswith("gopher:")): message_lowercase.startswith("gopher:")):
response = "Gemini and Gopher are not supported yet." response = "Gemini and Gopher are not supported yet."
send_reply_message(self, message, response) send_reply_message(self, message, response)
# TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT
case _ if (message_lowercase.startswith("get ")):
message_text = message_text[4:]
ix = message_text.split(" ")[0]
ex = " ".join(message_text.split(" ")[1:])
ex = ex if ex else 'pdf'
db_file = get_pathname_to_database(jid)
data_dir = get_default_data_directory()
if ix:
if not os.path.isdir(data_dir):
os.mkdir(data_dir)
if not os.path.isdir(data_dir + '/readability'):
os.mkdir(data_dir + '/readability')
filename = os.path.join(
data_dir, "readability", "saved_article_" + timestamp() + "." + ex)
try:
text = await action.get_content(db_file, ix)
except:
response = "No entry Id with {}".format(ix)
if text:
match ex:
case "html":
action.generate_html(text, filename)
case "md":
action.generate_markdown(text, filename)
case "pdf":
action.generate_pdf(text, filename)
url = await upload.start(
self, jid, filename)
print(url)
await send_oob_message(
self, jid, url)
else:
response = "Failed to fetch resource."
else:
response = "Missing entry Id."
if response:
send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and( # case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")): # message_lowercase.endswith(".opml")):
# url = message_text # url = message_text