Add functionality to download articles
This commit is contained in:
parent
9709c052ee
commit
b06e1bc693
6 changed files with 120 additions and 24 deletions
3
setup.py
3
setup.py
|
@ -16,7 +16,10 @@ setup(
|
||||||
'aiohttp',
|
'aiohttp',
|
||||||
'bs4',
|
'bs4',
|
||||||
'feedparser',
|
'feedparser',
|
||||||
|
'html2text',
|
||||||
'lxml',
|
'lxml',
|
||||||
|
'pdfkit',
|
||||||
|
'readability-lxml',
|
||||||
'slixmpp'
|
'slixmpp'
|
||||||
],
|
],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
|
|
@ -42,33 +42,33 @@ TODO
|
||||||
9.1) IDEA: Bot to display Title and Excerpt
|
9.1) IDEA: Bot to display Title and Excerpt
|
||||||
(including sending a PDF version of it) of posted link
|
(including sending a PDF version of it) of posted link
|
||||||
|
|
||||||
10) Download and upload/send article (xHTML, HTMLZ, Markdown, MHTML, PDF, TXT).
|
10) Fetch summary from URL, instead of storing summary, or
|
||||||
|
|
||||||
11) Fetch summary from URL, instead of storing summary, or
|
|
||||||
Store 5 upcoming summaries.
|
Store 5 upcoming summaries.
|
||||||
This would help making the database files smaller.
|
This would help making the database files smaller.
|
||||||
|
|
||||||
12) Support protocol Gopher
|
11) Support protocol Gopher
|
||||||
See project /michael-lazar/pygopherd
|
See project /michael-lazar/pygopherd
|
||||||
See project /gopherball/gb
|
See project /gopherball/gb
|
||||||
|
|
||||||
13) Support ActivityPub @person@domain (see Tip Of The Day).
|
12) Support ActivityPub @person@domain (see Tip Of The Day).
|
||||||
|
|
||||||
14) Tip Of The Day.
|
13) Tip Of The Day.
|
||||||
Did you know that you can follow your favorite Mastodon feeds by just
|
Did you know that you can follow your favorite Mastodon feeds by just
|
||||||
sending the URL address?
|
sending the URL address?
|
||||||
Supported fediverse websites are:
|
Supported fediverse websites are:
|
||||||
Akkoma, Firefish (Calckey), Friendica, HubZilla,
|
Akkoma, Firefish (Calckey), Friendica, HubZilla,
|
||||||
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
|
Mastodon, Misskey, Pixelfed, Pleroma, Socialhome, Soapbox.
|
||||||
|
|
||||||
15) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
|
14) Brand: News Broker, Newsman, Newsdealer, Laura Harbinger
|
||||||
|
|
||||||
16) See project /offpunk/offblocklist.py
|
15) See project /offpunk/offblocklist.py
|
||||||
|
|
||||||
18) Search messages of government regulated publishers, and promote other sources.
|
16) Search messages of government regulated publishers, and promote other sources.
|
||||||
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
|
Dear reader, we couldn't get news from XYZ as they don't provide RSS feeds.
|
||||||
However, you might want to get news from (1) (2) and (3) instead!
|
However, you might want to get news from (1) (2) and (3) instead!
|
||||||
|
|
||||||
|
17) Make the program portable (directly use the directory assets) -- Thorsten
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# vars and their meanings:
|
# vars and their meanings:
|
||||||
|
|
|
@ -12,9 +12,12 @@ TODO
|
||||||
|
|
||||||
from asyncio.exceptions import IncompleteReadError
|
from asyncio.exceptions import IncompleteReadError
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
import html2text
|
||||||
from http.client import IncompleteRead
|
from http.client import IncompleteRead
|
||||||
from feedparser import parse
|
from feedparser import parse
|
||||||
import logging
|
import logging
|
||||||
|
import pdfkit
|
||||||
|
from readability import Document
|
||||||
import slixfeed.config as config
|
import slixfeed.config as config
|
||||||
import slixfeed.crawl as crawl
|
import slixfeed.crawl as crawl
|
||||||
from slixfeed.datetime import (
|
from slixfeed.datetime import (
|
||||||
|
@ -658,6 +661,35 @@ async def scan(db_file, url):
|
||||||
await sqlite.set_date(db_file, url)
|
await sqlite.set_date(db_file, url)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_content(db_file, ix):
    """
    Download the page an entry links to and extract its main content.

    Parameters
    ----------
    db_file : str
        Path to database file; handed to sqlite.get_entry_url.
    ix : str
        Id of the entry whose link is to be fetched.

    Returns
    -------
    str or None
        Result of readability's Document.summary() for the downloaded
        payload, or None when the first item of the download result
        is empty.
    """
    url = sqlite.get_entry_url(db_file, ix)
    result = await fetch.download_feed(url)
    # Presumably result is (payload, status); only the payload is used here.
    if not result[0]:
        return None
    # TODO Either derive a filename from document.title() and write the
    #      summary to disk here, or change it to something else.
    return Document(result[0]).summary()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_html(text, filename):
    """
    Write an HTML document to a file.

    Parameters
    ----------
    text : str
        HTML content to write.
    filename : str
        Path of the file to create (an existing file is overwritten).
    """
    # Be explicit about the encoding: the platform default may not be able
    # to represent every character found in an article.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_pdf(text, filename):
    """
    Render an HTML document to a PDF file using pdfkit.

    Parameters
    ----------
    text : str
        HTML content to render.
    filename : str
        Path of the PDF file to create.
    """
    # The HTML handed in may lack a charset declaration; tell wkhtmltopdf
    # to read it as UTF-8 so that non-ASCII characters are rendered
    # correctly instead of being guessed.
    pdfkit.from_string(text, filename, options={'encoding': 'UTF-8'})
|
||||||
|
|
||||||
|
|
||||||
|
def generate_markdown(text, filename):
    """
    Convert an HTML document to Markdown and write it to a file.

    Parameters
    ----------
    text : str
        HTML content to convert.
    filename : str
        Path of the file to create (an existing file is overwritten).
    """
    h2m = html2text.HTML2Text()
    # Convert HTML to Markdown
    markdown = h2m.handle(text)
    # Be explicit about the encoding: the platform default may not be able
    # to represent every character found in an article.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
|
||||||
|
|
||||||
|
|
||||||
# NOTE Why (if res[0]) and (if res[1] == 200)?
|
# NOTE Why (if res[0]) and (if res[1] == 200)?
|
||||||
|
|
|
@ -716,6 +716,21 @@ def get_feed_title(db_file, ix):
|
||||||
return title
|
return title
|
||||||
|
|
||||||
|
|
||||||
|
# TODO Handle table archive too
|
||||||
|
def get_entry_url(db_file, ix):
    """
    Get the link of an entry by its id.

    Parameters
    ----------
    db_file : str
        Path to database file; handed to create_connection.
    ix : str
        Value matched against column id of table entries.

    Returns
    -------
    url : str
        Value of column link of the first matching row.

    Raises
    ------
    TypeError
        If no row matches (fetchone() yields None). This is kept on
        purpose: the handler of command "get" reports an unknown entry
        id when fetching the content raises.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (
            """
            SELECT link
            FROM entries
            WHERE id = :ix
            """
            )
        # A named placeholder (:ix) has to be bound by a mapping; binding
        # it by a sequence is deprecated since Python 3.12.
        url = cur.execute(sql, {"ix": ix}).fetchone()[0]
        return url
|
||||||
|
|
||||||
|
|
||||||
async def mark_as_read(db_file, ix):
|
async def mark_as_read(db_file, ix):
|
||||||
async with DBLOCK:
|
async with DBLOCK:
|
||||||
with create_connection(db_file) as conn:
|
with create_connection(db_file) as conn:
|
||||||
|
|
|
@ -62,10 +62,10 @@ def print_info():
|
||||||
" Thorsten Mühlfelder (SalixOS, Germany),"
|
" Thorsten Mühlfelder (SalixOS, Germany),"
|
||||||
"\n"
|
"\n"
|
||||||
" Yann Leboulanger (Gajim, France).\n"
|
" Yann Leboulanger (Gajim, France).\n"
|
||||||
|
"\n"
|
||||||
"COPYRIGHT\n"
|
"COPYRIGHT\n"
|
||||||
" Slixfeed is free software; you can redistribute it and/or\n"
|
" Slixfeed is free software; you can redistribute it and/or\n"
|
||||||
" modify it under the terms of the GNU General Public License\n"
|
" modify it under the terms of the MIT License.\n"
|
||||||
" as published by the Free Software Foundation; version 3 only\n"
|
|
||||||
"\n"
|
"\n"
|
||||||
" Slixfeed is distributed in the hope that it will be useful,\n"
|
" Slixfeed is distributed in the hope that it will be useful,\n"
|
||||||
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
|
" but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
|
||||||
|
@ -117,6 +117,9 @@ def print_help():
|
||||||
" Add <url> to subscription list.\n"
|
" Add <url> to subscription list.\n"
|
||||||
" add <url> TITLE\n"
|
" add <url> TITLE\n"
|
||||||
" Add <url> to subscription list (without validity check).\n"
|
" Add <url> to subscription list (without validity check).\n"
|
||||||
|
" get <id> <type>\n"
|
||||||
|
" Send an article as file. Specify <id> and <type>."
|
||||||
|
" Supported types are HTML, MD and PDF (default).\n"
|
||||||
" join <muc>\n"
|
" join <muc>\n"
|
||||||
" Join specified groupchat.\n"
|
" Join specified groupchat.\n"
|
||||||
" read <url>\n"
|
" read <url>\n"
|
||||||
|
@ -177,8 +180,10 @@ def print_help():
|
||||||
"EDIT OPTIONS\n"
|
"EDIT OPTIONS\n"
|
||||||
" remove <id>\n"
|
" remove <id>\n"
|
||||||
" Remove feed of <id> from subscription list.\n"
|
" Remove feed of <id> from subscription list.\n"
|
||||||
" status <id>\n"
|
" disable <id>\n"
|
||||||
" Toggle update status of feed of <id>.\n"
|
" Disable updates for feed of <id>.\n"
|
||||||
|
" enable <id>\n"
|
||||||
|
" Enable updates for feed of <id>.\n"
|
||||||
"\n"
|
"\n"
|
||||||
"SEARCH OPTIONS\n"
|
"SEARCH OPTIONS\n"
|
||||||
" feeds\n"
|
" feeds\n"
|
||||||
|
@ -198,16 +203,16 @@ def print_help():
|
||||||
# " unread\n"
|
# " unread\n"
|
||||||
# " Print number of unread news items.\n"
|
# " Print number of unread news items.\n"
|
||||||
# "\n"
|
# "\n"
|
||||||
# "BACKUP OPTIONS\n"
|
"BACKUP OPTIONS\n"
|
||||||
# " export opml\n"
|
" export opml\n"
|
||||||
# " Send an OPML file with your feeds.\n"
|
" Send an OPML file with feeds.\n"
|
||||||
# " backup news html\n"
|
# " backup news html\n"
|
||||||
# " Send an HTML formatted file of your news items.\n"
|
# " Send an HTML formatted file of your news items.\n"
|
||||||
# " backup news md\n"
|
# " backup news md\n"
|
||||||
# " Send a Markdown file of your news items.\n"
|
# " Send a Markdown file of your news items.\n"
|
||||||
# " backup news text\n"
|
# " backup news text\n"
|
||||||
# " Send a Plain Text file of your news items.\n"
|
# " Send a Plain Text file of your news items.\n"
|
||||||
# "\n"
|
"\n"
|
||||||
"SUPPORT\n"
|
"SUPPORT\n"
|
||||||
" commands\n"
|
" commands\n"
|
||||||
" Print list of commands.\n"
|
" Print list of commands.\n"
|
||||||
|
@ -247,8 +252,12 @@ def print_cmd():
|
||||||
"allow - : Delete keywords from allow list (comma separates).\n"
|
"allow - : Delete keywords from allow list (comma separates).\n"
|
||||||
"deny + : Keywords to block (comma separates).\n"
|
"deny + : Keywords to block (comma separates).\n"
|
||||||
"deny - : Delete keywords from deny list (comma separates).\n"
|
"deny - : Delete keywords from deny list (comma separates).\n"
|
||||||
|
"disable <id> : Disable updates for feed of <id>.\n"
|
||||||
|
"enable <id> : Enable updates for feed of <id>.\n"
|
||||||
|
"export opml : Send an OPML file with feeds.\n"
|
||||||
"feeds : List all subscriptions.\n"
|
"feeds : List all subscriptions.\n"
|
||||||
"feeds <text> : Search subscriptions by given <text>.\n"
|
"feeds <text> : Search subscriptions by given <text>.\n"
|
||||||
|
"get <id> <type> : Send an article as file. Specify <id> and <type>. Supported types are HTML, MD and PDF (default).\n"
|
||||||
"interval <n> : Set interval update to every <n> minutes.\n"
|
"interval <n> : Set interval update to every <n> minutes.\n"
|
||||||
"join <muc> : Join specified groupchat.\n"
|
"join <muc> : Join specified groupchat.\n"
|
||||||
"length : Set maximum length of news item description. (0 for no limit)\n"
|
"length : Set maximum length of news item description. (0 for no limit)\n"
|
||||||
|
@ -264,7 +273,6 @@ def print_cmd():
|
||||||
"remove <id> : Remove feed from subscription list.\n"
|
"remove <id> : Remove feed from subscription list.\n"
|
||||||
"search <text> : Search news items by given <text>.\n"
|
"search <text> : Search news items by given <text>.\n"
|
||||||
"start : Enable bot and send updates.\n"
|
"start : Enable bot and send updates.\n"
|
||||||
"status <id> : Toggle update status of feed.\n"
|
|
||||||
"stop : Disable bot and stop updates.\n"
|
"stop : Disable bot and stop updates.\n"
|
||||||
"```"
|
"```"
|
||||||
)
|
)
|
|
@ -34,7 +34,7 @@ import slixfeed.task as task
|
||||||
import slixfeed.url as uri
|
import slixfeed.url as uri
|
||||||
import slixfeed.xmpp.bookmark as bookmark
|
import slixfeed.xmpp.bookmark as bookmark
|
||||||
import slixfeed.xmpp.muc as groupchat
|
import slixfeed.xmpp.muc as groupchat
|
||||||
import slixfeed.xmpp.text as text
|
import slixfeed.xmpp.manual as manual
|
||||||
import slixfeed.xmpp.upload as upload
|
import slixfeed.xmpp.upload as upload
|
||||||
from slixfeed.xmpp.utility import jid_type
|
from slixfeed.xmpp.utility import jid_type
|
||||||
|
|
||||||
|
@ -195,13 +195,13 @@ async def message(self, message):
|
||||||
# )
|
# )
|
||||||
# send_reply_message(self, message, response)
|
# send_reply_message(self, message, response)
|
||||||
case "commands":
|
case "commands":
|
||||||
response = text.print_cmd()
|
response = manual.print_cmd()
|
||||||
send_reply_message(self, message, response)
|
send_reply_message(self, message, response)
|
||||||
case "help":
|
case "help":
|
||||||
response = text.print_help()
|
response = manual.print_help()
|
||||||
send_reply_message(self, message, response)
|
send_reply_message(self, message, response)
|
||||||
case "info":
|
case "info":
|
||||||
response = text.print_info()
|
response = manual.print_info()
|
||||||
send_reply_message(self, message, response)
|
send_reply_message(self, message, response)
|
||||||
case _ if message_lowercase in [
|
case _ if message_lowercase in [
|
||||||
"greetings", "hallo", "hello", "hey",
|
"greetings", "hallo", "hello", "hey",
|
||||||
|
@ -438,6 +438,44 @@ async def message(self, message):
|
||||||
message_lowercase.startswith("gopher:")):
|
message_lowercase.startswith("gopher:")):
|
||||||
response = "Gemini and Gopher are not supported yet."
|
response = "Gemini and Gopher are not supported yet."
|
||||||
send_reply_message(self, message, response)
|
send_reply_message(self, message, response)
|
||||||
|
# TODO xHTML, HTMLZ, Markdown, MHTML, PDF, TXT
|
||||||
|
case _ if (message_lowercase.startswith("get ")):
|
||||||
|
message_text = message_text[4:]
|
||||||
|
ix = message_text.split(" ")[0]
|
||||||
|
ex = " ".join(message_text.split(" ")[1:])
|
||||||
|
ex = ex if ex else 'pdf'
|
||||||
|
db_file = get_pathname_to_database(jid)
|
||||||
|
data_dir = get_default_data_directory()
|
||||||
|
if ix:
|
||||||
|
if not os.path.isdir(data_dir):
|
||||||
|
os.mkdir(data_dir)
|
||||||
|
if not os.path.isdir(data_dir + '/readability'):
|
||||||
|
os.mkdir(data_dir + '/readability')
|
||||||
|
filename = os.path.join(
|
||||||
|
data_dir, "readability", "saved_article_" + timestamp() + "." + ex)
|
||||||
|
try:
|
||||||
|
text = await action.get_content(db_file, ix)
|
||||||
|
except:
|
||||||
|
response = "No entry Id with {}".format(ix)
|
||||||
|
if text:
|
||||||
|
match ex:
|
||||||
|
case "html":
|
||||||
|
action.generate_html(text, filename)
|
||||||
|
case "md":
|
||||||
|
action.generate_markdown(text, filename)
|
||||||
|
case "pdf":
|
||||||
|
action.generate_pdf(text, filename)
|
||||||
|
url = await upload.start(
|
||||||
|
self, jid, filename)
|
||||||
|
print(url)
|
||||||
|
await send_oob_message(
|
||||||
|
self, jid, url)
|
||||||
|
else:
|
||||||
|
response = "Failed to fetch resource."
|
||||||
|
else:
|
||||||
|
response = "Missing entry Id."
|
||||||
|
if response:
|
||||||
|
send_reply_message(self, message, response)
|
||||||
# case _ if (message_lowercase.startswith("http")) and(
|
# case _ if (message_lowercase.startswith("http")) and(
|
||||||
# message_lowercase.endswith(".opml")):
|
# message_lowercase.endswith(".opml")):
|
||||||
# url = message_text
|
# url = message_text
|
||||||
|
|
Loading…
Reference in a new issue