forked from sch/Slixfeed
Add functionality to download articles
This commit is contained in:
parent
9709c052ee
commit
b06e1bc693
6 changed files with 120 additions and 24 deletions
3
setup.py
3
setup.py
|
@ -16,7 +16,10 @@ setup(
|
|||
'aiohttp',
|
||||
'bs4',
|
||||
'feedparser',
|
||||
'html2text',
|
||||
'lxml',
|
||||
'pdfkit',
|
||||
'readability-lxml',
|
||||
'slixmpp'
|
||||
],
|
||||
classifiers=[
|
||||
|
|
|
@ -42,33 +42,33 @@ TODO
|
|||
9.1) IDEA: Bot to display Title and Excerpt
|
||||
(including sending a PDF version of it) of posted link
|
||||
|
||||
10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT).
|
||||
|
||||
11) Fetch summary from URL, instead of storing summary, or
|
||||
10) Fetch summary from URL, instead of storing summary, or
|
||||
Store 5 upcoming summaries.
|
||||
This would help making the database files smaller.
|
||||
|
||||
12) Support protocol Gopher
|
||||
11) Support protocol Gopher
|
||||
See project /michael-lazar/pygopherd
|
||||
See project /gopherball/gb
|
||||
|
||||
13) Support ActivityPub @person@domain (see Tip Of The Day).
|
||||
12) Support ActivityPub @person@domain (see Tip Of The Day).
|
||||
|
||||
14) Tip Of The Day.
|
||||
13) Tip Of The Day.
|
||||
Did you know that you can follow your favorite Mastodon feeds by just
|
||||
sending the URL address?
|
||||
Supported fediverse websites are:
|
||||
Akkoma, Firefish (Calckey), Friendica, HubZilla,
|
||||
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
|
||||
|
||||
15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
|
||||
14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
|
||||
|
||||
16) See project /offpunk/offblocklist.py
|
||||
15) See project /offpunk/offblocklist.py
|
||||
|
||||
18) Search messages of government regulated publishers, and promote other sources.
|
||||
16) Search messages of government regulated publishers, and promote other sources.
|
||||
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
|
||||
However, you might want to get news from (1) (2) and (3) instead!
|
||||
|
||||
17) Make the program portable (directly use the directory assets) -- Thorsten
|
||||
|
||||
"""
|
||||
|
||||
# vars and their meanings:
|
||||
|
|
|
@ -12,9 +12,12 @@ TODO
|
|||
|
||||
from asyncio.exceptions import IncompleteReadError
|
||||
from bs4 import BeautifulSoup
|
||||
import html2text
|
||||
from http.client import IncompleteRead
|
||||
from feedparser import parse
|
||||
import logging
|
||||
import pdfkit
|
||||
from readability import Document
|
||||
import slixfeed.config as config
|
||||
import slixfeed.crawl as crawl
|
||||
from slixfeed.datetime import (
|
||||
|
@ -658,6 +661,35 @@ async def scan(db_file, url):
|
|||
await sqlite.set_date(db_file, url)
|
||||
|
||||
|
||||
async def get_content(db_file, ix):
    """
    Download the page an entry links to and return its readable content.

    The entry link is looked up with sqlite.get_entry_url, downloaded
    with fetch.download_feed and passed through readability's Document.

    :param db_file: Database file handed to sqlite.get_entry_url.
    :param ix: Id of the entry whose link is downloaded.
    :return: Result of Document.summary() for the downloaded content,
        or None when the first element of the download result is falsy.
    """
    link = sqlite.get_entry_url(db_file, ix)
    response = await fetch.download_feed(link)
    payload = response[0]
    if not payload:
        return None
    # TODO Either adapt it to filename
    #      or change it to something else
    #      (the document title was considered as a filename, and the
    #      summary was to be written to that file from here).
    return Document(payload).summary()
|
||||
|
||||
|
||||
def generate_html(text, filename):
    """
    Write HTML markup to a file.

    :param text: Markup to write.
    :param filename: Path of the file to create or overwrite.
    """
    # Encode explicitly as UTF-8: the text is arbitrary article content
    # and the platform default encoding may be unable to represent it.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
|
||||
|
||||
|
||||
def generate_pdf(text, filename):
    """
    Render markup to a PDF file by handing it to pdfkit.

    :param text: Markup passed to pdfkit as the input string.
    :param filename: Path passed to pdfkit as the output file.
    """
    pdfkit.from_string(input=text, output_path=filename)
|
||||
|
||||
|
||||
def generate_markdown(text, filename):
    """
    Convert HTML markup to Markdown and write the result to a file.

    :param text: HTML markup to convert.
    :param filename: Path of the file to create or overwrite.
    """
    # Convert HTML to Markdown
    converter = html2text.HTML2Text()
    markdown = converter.handle(text)
    # Encode explicitly as UTF-8: the text is arbitrary article content
    # and the platform default encoding may be unable to represent it.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
|
||||
|
||||
|
||||
# NOTE Why (if res[0]) and (if res[1] == 200)?
|
||||
|
|
|
@ -716,6 +716,21 @@ def get_feed_title(db_file, ix):
|
|||
return title
|
||||
|
||||
|
||||
# TODO Handle table archive too
|
||||
def get_entry_url(db_file, ix):
    """
    Return the link of the entry with the given id.

    Only table ``entries`` is queried.

    :param db_file: Database file passed to create_connection.
    :param ix: Id of the entry to look up.
    :return: Value of column ``link`` of the matching row.
    :raises TypeError: When no row matches, because the fetched row is
        None. This is kept as is; the visible ``get`` command handler
        turns any exception into its "No entry Id" reply.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT link
            FROM entries
            WHERE id = :ix
            """
            )
        # A named placeholder must be bound from a mapping. Binding it
        # from a sequence is deprecated by sqlite3 since Python 3.12
        # and is rejected from Python 3.14 on.
        row = cur.execute(sql, {"ix": ix}).fetchone()
        url = row[0]
        return url
|
||||
|
||||
|
||||
async def mark_as_read(db_file, ix):
|
||||
async with DBLOCK:
|
||||
with create_connection(db_file) as conn:
|
||||
|
|
|
@ -62,10 +62,10 @@ def print_info():
|
|||
" Thorsten Mühlfelder (SalixOS, Germany),"
|
||||
"\n"
|
||||
" Yann Leboulanger (Gajim, France).\n"
|
||||
"\n"
|
||||
"COPYRIGHT\n"
|
||||
" Slixfeed is free software; you can redistribute it and/or\n"
|
||||
" modify it under the terms of the GNU General Public License\n"
|
||||
" as published by the Free Software Foundation; version 3 only\n"
|
||||
" modify it under the terms of the MIT License.\n"
|
||||
"\n"
|
||||
" Slixfeed is distributed in the hope that it will be useful,\n"
|
||||
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
|
||||
|
@ -117,6 +117,9 @@ def print_help():
|
|||
" Add <url> to subscription list.\n"
|
||||
" add <url> TITLE\n"
|
||||
" Add <url> to subscription list (without validity check).\n"
|
||||
" get <id> <type>\n"
|
||||
" Send an article as file. Specify <id> and <type>."
|
||||
" Supported types are HTML, MD and PDF (default).\n"
|
||||
" join <muc>\n"
|
||||
" Join specified groupchat.\n"
|
||||
" read <url>\n"
|
||||
|
@ -177,8 +180,10 @@ def print_help():
|
|||
"EDIT OPTIONS\n"
|
||||
" remove <id>\n"
|
||||
" Remove feed of <id> from subscription list.\n"
|
||||
" status <id>\n"
|
||||
" Toggle update status of feed of <id>.\n"
|
||||
" disable <id>\n"
|
||||
" Disable updates for feed of <id>.\n"
|
||||
" enable <id>\n"
|
||||
" Enable updates for feed of <id>.\n"
|
||||
"\n"
|
||||
"SEARCH OPTIONS\n"
|
||||
" feeds\n"
|
||||
|
@ -198,16 +203,16 @@ def print_help():
|
|||
# " unread\n"
|
||||
# " Print number of unread news items.\n"
|
||||
# "\n"
|
||||
# "BACKUP OPTIONS\n"
|
||||
# " export opml\n"
|
||||
# " Send an OPML file with your feeds.\n"
|
||||
"BACKUP OPTIONS\n"
|
||||
" export opml\n"
|
||||
" Send an OPML file with feeds.\n"
|
||||
# " backup news html\n"
|
||||
# " Send an HTML formatted file of your news items.\n"
|
||||
# " backup news md\n"
|
||||
# " Send a Markdown file of your news items.\n"
|
||||
# " backup news text\n"
|
||||
# " Send a Plain Text file of your news items.\n"
|
||||
# "\n"
|
||||
"\n"
|
||||
"SUPPORT\n"
|
||||
" commands\n"
|
||||
" Print list of commands.\n"
|
||||
|
@ -247,8 +252,12 @@ def print_cmd():
|
|||
"allow - : Delete keywords from allow list (comma separates).\n"
|
||||
"deny + : Keywords to block (comma separates).\n"
|
||||
"deny - : Delete keywords from deny list (comma separates).\n"
|
||||
"disable <id> : Disable updates for feed of <id>.\n"
|
||||
"enable <id> : Enable updates for feed of <id>.\n"
|
||||
"export opml : Send an OPML file with feeds.\n"
|
||||
"feeds : List all subscriptions.\n"
|
||||
"feeds <text> : Search subscriptions by given <text>.\n"
|
||||
"get <id> <type> : Send an article as file. Specify <id> and <type>. Supported types are HTML, MD and PDF (default).\n"
|
||||
"interval <n> : Set interval update to every <n> minutes.\n"
|
||||
"join <muc> : Join specified groupchat.\n"
|
||||
"length : Set maximum length of news item description. (0 for no limit)\n"
|
||||
|
@ -264,7 +273,6 @@ def print_cmd():
|
|||
"remove <id> : Remove feed from subscription list.\n"
|
||||
"search <text> : Search news items by given <text>.\n"
|
||||
"start : Enable bot and send updates.\n"
|
||||
"status <id> : Toggle update status of feed.\n"
|
||||
"stop : Disable bot and stop updates.\n"
|
||||
"```"
|
||||
)
|
|
@ -34,7 +34,7 @@ import slixfeed.task as task
|
|||
import slixfeed.url as uri
|
||||
import slixfeed.xmpp.bookmark as bookmark
|
||||
import slixfeed.xmpp.muc as groupchat
|
||||
import slixfeed.xmpp.text as text
|
||||
import slixfeed.xmpp.manual as manual
|
||||
import slixfeed.xmpp.upload as upload
|
||||
from slixfeed.xmpp.utility import jid_type
|
||||
|
||||
|
@ -195,13 +195,13 @@ async def message(self, message):
|
|||
# )
|
||||
# send_reply_message(self, message, response)
|
||||
case "commands":
|
||||
response = text.print_cmd()
|
||||
response = manual.print_cmd()
|
||||
send_reply_message(self, message, response)
|
||||
case "help":
|
||||
response = text.print_help()
|
||||
response = manual.print_help()
|
||||
send_reply_message(self, message, response)
|
||||
case "info":
|
||||
response = text.print_info()
|
||||
response = manual.print_info()
|
||||
send_reply_message(self, message, response)
|
||||
case _ if message_lowercase in [
|
||||
"greetings", "hallo", "hello", "hey",
|
||||
|
@ -438,6 +438,44 @@ async def message(self, message):
|
|||
message_lowercase.startswith("gopher:")):
|
||||
response = "Gemini and Gopher are not supported yet."
|
||||
send_reply_message(self, message, response)
|
||||
# TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT
|
||||
case _ if (message_lowercase.startswith("get ")):
|
||||
message_text = message_text[4:]
|
||||
ix = message_text.split(" ")[0]
|
||||
ex = " ".join(message_text.split(" ")[1:])
|
||||
ex = ex if ex else 'pdf'
|
||||
db_file = get_pathname_to_database(jid)
|
||||
data_dir = get_default_data_directory()
|
||||
if ix:
|
||||
if not os.path.isdir(data_dir):
|
||||
os.mkdir(data_dir)
|
||||
if not os.path.isdir(data_dir + '/readability'):
|
||||
os.mkdir(data_dir + '/readability')
|
||||
filename = os.path.join(
|
||||
data_dir, "readability", "saved_article_" + timestamp() + "." + ex)
|
||||
try:
|
||||
text = await action.get_content(db_file, ix)
|
||||
except:
|
||||
response = "No entry Id with {}".format(ix)
|
||||
if text:
|
||||
match ex:
|
||||
case "html":
|
||||
action.generate_html(text, filename)
|
||||
case "md":
|
||||
action.generate_markdown(text, filename)
|
||||
case "pdf":
|
||||
action.generate_pdf(text, filename)
|
||||
url = await upload.start(
|
||||
self, jid, filename)
|
||||
print(url)
|
||||
await send_oob_message(
|
||||
self, jid, url)
|
||||
else:
|
||||
response = "Failed to fetch resource."
|
||||
else:
|
||||
response = "Missing entry Id."
|
||||
if response:
|
||||
send_reply_message(self, message, response)
|
||||
# case _ if (message_lowercase.startswith("http")) and(
|
||||
# message_lowercase.endswith(".opml")):
|
||||
# url = message_text
|
||||
|
|
Loading…
Reference in a new issue