Save enclosures

Send new message upon media detection
Schimon Jehudah 2024-01-13 17:17:43 +00:00
parent ec82aeb3cc
commit 43fa1a463c
5 changed files with 348 additions and 249 deletions


@@ -42,7 +42,7 @@ from slixfeed.url import (
     )
 import slixfeed.xmpp.bookmark as bookmark
 from urllib import error
-from urllib.parse import urlsplit
+from urllib.parse import parse_qs, urlsplit
 import xml.etree.ElementTree as ET
 try:
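
The next hunk adds enclosure scanning to scan(): feedparser exposes each entry's links as a list of dict-like objects with rel, type and href keys, which is what the new loop filters on. A standalone sketch of the same idea, outside the commit (the function name and feed URL below are illustrative only):

    import feedparser

    def first_enclosure(entry):
        # Return the href of the first audio/image/video enclosure, or ''.
        for link in entry.get("links", []):
            media_type = link.get("type", "").split("/")[0]
            if link.get("rel") == "enclosure" and media_type in ("audio", "image", "video"):
                return link.get("href", "")
        return ""

    feed = feedparser.parse("https://example.org/feed.xml")
    for entry in feed.entries:
        print(entry.get("title", ""), first_enclosure(entry))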
@@ -688,9 +688,34 @@ async def scan(db_file, url):
         if isinstance(date, int):
             logging.error(
                 "Variable 'date' is int: {}".format(date))
+        media_link = ''
+        if entry.has_key("links"):
+            for e_link in entry.links:
+                try:
+                    # if (link.rel == "enclosure" and
+                    #     (link.type.startswith("audio/") or
+                    #      link.type.startswith("image/") or
+                    #      link.type.startswith("video/"))
+                    #     ):
+                    media_type = e_link.type[:e_link.type.index("/")]
+                    if e_link.has_key("rel"):
+                        if (e_link.rel == "enclosure" and
+                            media_type in ("audio", "image", "video")):
+                            media_link = e_link.href
+                            media_link = join_url(url, e_link.href)
+                            media_link = trim_url(media_link)
+                            break
+                except:
+                    logging.error(
+                        "KeyError: 'href'\n"
+                        "Missing 'href' attribute for {}".format(url))
+                    logging.info(
+                        "Continue scanning for next potential "
+                        "enclosure of {}".format(link))
         entry = {
             "title": title,
             "link": link,
+            "enclosure": media_link,
             "entry_id": entry_id,
             "url": url,
             "date": date,
@@ -706,16 +731,23 @@ async def scan(db_file, url):
         db_file, new_entries)


-async def generate_document(url, ext, filename):
-    result = await fetch.http(url)
-    data = result[0]
-    code = result[1]
-    status = None
-    if data:
+def get_document_title(data):
+    try:
+        document = Document(data)
+        title = document.short_title()
+    except:
+        document = BeautifulSoup(data, 'html.parser')
+        title = document.title.string
+    return title
+
+
+def generate_document(data, url, ext, filename):
+    error = None
     try:
         document = Document(data)
         content = document.summary()
     except:
+        content = data
         logging.warning(
             "Check that package readability is installed.")
     match ext:
@@ -727,7 +759,7 @@ async def generate_document(url, ext, filename):
             except:
                 logging.warning(
                     "Check that package html2text is installed.")
-                status = (
+                error = (
                     "Package html2text was not found.")
         case "pdf":
             try:
@@ -736,12 +768,10 @@ async def generate_document(url, ext, filename):
                 logging.warning(
                     "Check that packages pdfkit and wkhtmltopdf "
                     "are installed.")
-                status = (
+                error = (
                     "Package pdfkit or wkhtmltopdf was not found.")
-    else:
-        status = code
-    if status:
-        return status
+    if error:
+        return error

     # TODO Either adapt it to filename
     # or change it to something else
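
Both new helpers lean on readability-lxml's Document class, with BeautifulSoup as a fallback for the title. A minimal standalone sketch of that pattern, assuming the readability-lxml and beautifulsoup4 packages (the helper name is illustrative, not part of the commit):

    from bs4 import BeautifulSoup
    from readability import Document

    def title_of(html_text):
        # Prefer readability's cleaned-up title; fall back to the raw <title> tag.
        try:
            return Document(html_text).short_title()
        except Exception:
            soup = BeautifulSoup(html_text, "html.parser")
            return soup.title.string if soup.title else ""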
@@ -751,28 +781,25 @@ async def generate_document(url, ext, filename):
     # file.write(html_doc)


-async def extract_image_from_feed(db_file, ix, url):
-    feed_url = sqlite.get_feed_url(db_file, ix)
+async def extract_image_from_feed(db_file, feed_id, url):
+    feed_url = sqlite.get_feed_url(db_file, feed_id)
     result = await fetch.http(feed_url)
     document = result[0]
-    # breakpoint()
-    print("extract_image_from_feed")
     if document:
         feed = parse(document)
         for entry in feed.entries:
-            print(len(feed.entries))
-            print(entry.link)
-            print(url)
+            try:
                 if entry.link == url:
                     for link in entry.links:
                         if (link.rel == "enclosure" and
                             link.type.startswith("image/")):
-                            # if link.type.startswith("image/"):
                             image_url = link.href
-                            print("found")
-                            print(image_url)
-                            break
                             return image_url
+            except:
+                logging.error(url)
+                logging.error(
+                    "AttributeError: object has no attribute 'link'")
+                breakpoint()
async def extract_image_from_html(url): async def extract_image_from_html(url):
@@ -783,16 +810,16 @@ async def extract_image_from_html(url):
         document = Document(data)
         content = document.summary()
     except:
+        content = data
         logging.warning(
             "Check that package readability is installed.")
     tree = html.fromstring(content)
+    # TODO Exclude banners, class="share" links etc.
     images = tree.xpath('//img/@src')
     if len(images):
         image = images[0]
         image = str(image)
         image_url = complete_url(url, image)
+    else:
+        image_url = None
     return image_url
@@ -813,6 +840,35 @@ def generate_markdown(text, filename):
         file.write(markdown)


+# TODO Add support for eDonkey, Gnutella, Soulseek
+async def get_magnet(link):
+    parted_link = urlsplit(link)
+    queries = parse_qs(parted_link.query)
+    query_xt = queries["xt"][0]
+    if query_xt.startswith("urn:btih:"):
+        filename = queries["dn"][0]
+        checksum = query_xt[len("urn:btih:"):]
+        torrent = await fetch.magnet(link)
+        logging.debug(
+            "Attempting to retrieve {} ({})".format(
+                filename, checksum))
+        if not torrent:
+            logging.debug(
+                "Attempting to retrieve {} from HTTP caching service".format(
+                    filename))
+            urls = [
+                'https://watercache.libertycorp.org/get/{}/{}',
+                'https://itorrents.org/torrent/{}.torrent?title={}',
+                'https://firecache.libertycorp.org/get/{}/{}',
+                'http://fcache63sakpihd44kxdduy6kgpdhgejgp323wci435zwy6kiylcnfad.onion/get/{}/{}'
+                ]
+            for url in urls:
+                torrent = fetch.http(url.format(checksum, filename))
+                if torrent:
+                    break
+    return torrent
+
+
 # NOTE Why (if res[0]) and (if res[1] == 200)?
 async def organize_items(db_file, urls):
     """

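The new get_magnet() relies on urllib.parse to take a magnet URI apart. A short sketch of how parse_qs yields the BitTorrent info-hash (xt) and display name (dn); the hash below is a placeholder value, not taken from the commit:

    from urllib.parse import parse_qs, urlsplit

    magnet = ("magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567"
              "&dn=example.iso")
    queries = parse_qs(urlsplit(magnet).query)
    info_hash = queries["xt"][0][len("urn:btih:"):]
    file_name = queries["dn"][0]
    print(info_hash, file_name)
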

@@ -28,9 +28,16 @@ from asyncio import TimeoutError
 # from asyncio.exceptions import IncompleteReadError
 # from bs4 import BeautifulSoup
 # from http.client import IncompleteRead
+import logging
 # from lxml import html
-import slixfeed.config as config
 # from xml.etree.ElementTree import ElementTree, ParseError
+import slixfeed.config as config
+try:
+    from magnet2torrent import Magnet2Torrent, FailedToFetchException
+except:
+    logging.info(
+        "Package magnet2torrent was not found.\n"
+        "BitTorrent is disabled.")


 # async def dat():
@@ -105,3 +112,11 @@ async def http(url):
             False, "Timeout: " + str(e)
             ]
     return msg
+
+
+async def magnet(link):
+    m2t = Magnet2Torrent(link)
+    try:
+        filename, torrent_data = await m2t.retrieve_torrent()
+    except FailedToFetchException:
+        logging.debug("Failed")
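
As written, magnet() logs the failure but falls through without an explicit return. A hedged sketch of the same call that surfaces a result, based only on the magnet2torrent API used above (retrieve_torrent() yielding a filename and the torrent bytes; the helper name is illustrative):

    import logging

    from magnet2torrent import Magnet2Torrent, FailedToFetchException

    async def magnet_to_torrent(link):
        # Returns (filename, torrent_data) on success, None on failure.
        try:
            return await Magnet2Torrent(link).retrieve_torrent()
        except FailedToFetchException:
            logging.debug("Failed to fetch torrent for %s", link)
            return None

    # Usage (inside an event loop): await magnet_to_torrent("magnet:?xt=urn:btih:...")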


@@ -129,6 +129,7 @@ def create_tables(db_file):
                 id INTEGER NOT NULL,
                 title TEXT NOT NULL,
                 link TEXT NOT NULL,
+                enclosure TEXT,
                 entry_id TEXT NOT NULL,
                 feed_id INTEGER NOT NULL,
                 timestamp TEXT,
@@ -146,6 +147,7 @@ def create_tables(db_file):
                 id INTEGER NOT NULL,
                 title TEXT NOT NULL,
                 link TEXT NOT NULL,
+                enclosure TEXT,
                 entry_id TEXT NOT NULL,
                 feed_id INTEGER NOT NULL,
                 timestamp TEXT,
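
The new enclosure column only appears in freshly created databases; an existing database would need a one-off migration along these lines (not part of the commit; table and column names are taken from the schema above):

    import sqlite3

    def add_enclosure_column(db_file):
        # Add the column to both tables unless it already exists.
        with sqlite3.connect(db_file) as conn:
            for table in ("entries", "archive"):
                columns = [row[1] for row in conn.execute(
                    "PRAGMA table_info({})".format(table))]
                if "enclosure" not in columns:
                    conn.execute(
                        "ALTER TABLE {} ADD COLUMN enclosure TEXT".format(table))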
@@ -486,7 +488,8 @@ async def remove_feed_by_url(db_file, url):
         cur = conn.cursor()
         sql = (
             """
-            DELETE FROM feeds
+            DELETE
+            FROM feeds
             WHERE url = ?
             """
             )
@@ -556,7 +559,8 @@ async def get_feed_id_and_name(db_file, url):
     result : list
         List of ID and Name of feed.
     """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     sql = (
         """
         SELECT id, name
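
This hunk and several below replace the old get_cursor(db_file) helper with a connection context manager. The pattern in isolation, assuming create_connection is a thin wrapper around sqlite3.connect (the query function here is illustrative):

    import sqlite3

    def create_connection(db_file):
        # The real project configures the connection further; this is the gist.
        return sqlite3.connect(db_file)

    def get_feed_names(db_file):
        with create_connection(db_file) as conn:
            cur = conn.cursor()
            return cur.execute("SELECT id, name FROM feeds").fetchall()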
@@ -677,11 +681,11 @@ async def get_unread_entries(db_file, num):
         cur = conn.cursor()
         sql = (
             """
-            SELECT id, title, link, feed_id, timestamp
+            SELECT id, title, link, enclosure, feed_id, timestamp
             FROM entries
             WHERE read = 0
             UNION ALL
-            SELECT id, title, link, feed_id, timestamp
+            SELECT id, title, link, enclosure, feed_id, timestamp
             FROM archive
             ORDER BY timestamp
             DESC LIMIT :num
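
For reference, the :num placeholder is bound with a mapping at execution time; a minimal sqlite3 sketch of the widened query (helper name illustrative):

    def get_unread(cur, num):
        sql = (
            "SELECT id, title, link, enclosure, feed_id, timestamp "
            "FROM entries WHERE read = 0 "
            "UNION ALL "
            "SELECT id, title, link, enclosure, feed_id, timestamp "
            "FROM archive ORDER BY timestamp DESC LIMIT :num")
        return cur.execute(sql, {"num": num}).fetchall()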
@@ -861,17 +865,9 @@ def get_entry_url(db_file, ix):
         return url


-def get_feed_url(db_file, ix):
+def get_feed_url(db_file, feed_id):
     with create_connection(db_file) as conn:
         cur = conn.cursor()
-        sql = (
-            """
-            SELECT feed_id
-            FROM entries
-            WHERE id = :ix
-            """
-            )
-        feed_id = cur.execute(sql, (ix,)).fetchone()[0]
+        # TODO Handletable archive too
         sql = (
             """
             SELECT url
@@ -1152,14 +1148,15 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
             """
             INSERT
             INTO entries(
-                title, link, entry_id, feed_id, timestamp, read)
+                title, link, enclosure, entry_id, feed_id, timestamp, read)
             VALUES(
-                :title, :link, :entry_id, :feed_id, :timestamp, :read)
+                :title, :link, :enclosure, :entry_id, :feed_id, :timestamp, :read)
             """
             )
         cur.execute(sql, {
             "title": entry["title"],
             "link": entry["link"],
+            "enclosure": entry["enclosure"],
             "entry_id": entry["entry_id"],
             "feed_id": feed_id,
             "timestamp": entry["date"],
@@ -1338,10 +1335,12 @@ async def maintain_archive(db_file, limit):
             """
             DELETE FROM archive
             WHERE id
-            IN (SELECT id
+            IN (
+                SELECT id
                 FROM archive
                 ORDER BY timestamp ASC
-                LIMIT :difference)
+                LIMIT :difference
+                )
             """
             )
         cur.execute(sql, {
@@ -1452,7 +1451,8 @@ async def get_feeds(db_file):
     # Select name, url (feeds) updated, enabled, feed_id (status)
     # 2) Sort feeds by id. Sort status by feed_id
     # results += cur.execute(sql).fetchall()
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     sql = (
         """
         SELECT name, url, id
@@ -1479,7 +1479,8 @@ async def last_entries(db_file, num):
     titles_list : str
         List of recent N entries as message.
     """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     # sql = (
     #     "SELECT title, link "
     #     "FROM entries "
@@ -1520,7 +1521,8 @@ async def search_feeds(db_file, query):
     titles_list : str
         Feeds of specified keywords as message.
     """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     sql = (
         """
         SELECT name, id, url
@@ -1551,7 +1553,8 @@ async def search_entries(db_file, query):
     titles_list : str
         Entries of specified keywords as message.
     """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     sql = (
         """
         SELECT title, link
@@ -1619,7 +1622,8 @@ async def check_entry_exist(
     bool
         True or None.
     """
-    cur = get_cursor(db_file)
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
     exist = False
     if entry_id:
         feed_id = get_feed_id(cur, url)
@@ -1627,9 +1631,7 @@ async def check_entry_exist(
             """
             SELECT id
             FROM entries
-            WHERE
-            entry_id = :entry_id and
-            feed_id = :feed_id
+            WHERE entry_id = :entry_id and feed_id = :feed_id
             """
             )
         result = cur.execute(sql, {
@@ -1642,10 +1644,7 @@ async def check_entry_exist(
             """
             SELECT id
             FROM entries
-            WHERE
-            title = :title and
-            link = :link and
-            timestamp = :date
+            WHERE title = :title and link = :link and timestamp = :date
             """
             )
         try:
@@ -1663,9 +1662,7 @@ async def check_entry_exist(
             """
             SELECT id
             FROM entries
-            WHERE
-            title = :title and
-            link = :link
+            WHERE title = :title and link = :link
             """
             )
         result = cur.execute(sql, {


@@ -227,46 +227,60 @@ async def send_update(self, jid, num=None):
         num = int(num)
     news_digest = []
     results = await get_unread_entries(db_file, num)
-    image_url = None
+    news_digest = ''
+    media = None
+    chat_type = await utility.jid_type(self, jid)
     for result in results:
         ix = result[0]
         title_e = result[1]
         url = result[2]
-        feed_id = result[3]
-        date = result[4]
+        enclosure = result[3]
+        feed_id = result[4]
+        date = result[5]
         title_f = get_feed_title(db_file, feed_id)
-        news_item = action.list_unread_entries(result, title_f)
-        news_digest.extend([news_item])
+        news_digest += action.list_unread_entries(result, title_f)
         # print(db_file)
         # print(result[0])
         # breakpoint()
-        await mark_as_read(db_file, result[0])
-        if not image_url:
-            image_url = await action.extract_image_from_feed(
-                db_file, ix, url)
-        if not image_url:
-            image_url = await action.extract_image_from_html(url)
-    print("image_url")
-    print(image_url)
-    new = " ".join(news_digest)
-    # breakpoint()
-    if new:
+        await mark_as_read(db_file, ix)
+
+        # Find media
+        if url.startswith("magnet:"):
+            media = action.get_magnet(url)
+        elif enclosure.startswith("magnet:"):
+            media = action.get_magnet(enclosure)
+        elif enclosure:
+            media = enclosure
+        else:
+            media = await action.extract_image_from_html(url)
+
+        if media and news_digest:
+            # Send textual message
+            xmpp.Slixfeed.send_message(
+                self, mto=jid, mbody=news_digest, mtype=chat_type)
+            news_digest = ''
+            # Send media
+            message = xmpp.Slixfeed.make_message(
+                self, mto=jid, mbody=media, mtype=chat_type)
+            message['oob']['url'] = media
+            message.send()
+            media = None
+
+    if news_digest:
         # TODO Add while loop to assure delivery.
         # print(await current_time(), ">>> ACT send_message",jid)
-        chat_type = await utility.jid_type(self, jid)
         # NOTE Do we need "if statement"? See NOTE at is_muc.
         if chat_type in ("chat", "groupchat"):
             # TODO Provide a choice (with or without images)
             xmpp.Slixfeed.send_message(
-                self, mto=jid, mbody=new, mtype=chat_type)
-            if image_url:
-                # message = xmpp.Slixfeed.make_message(
-                #     self, mto=jid, mbody=new, mtype=chat_type)
-                message = xmpp.Slixfeed.make_message(
-                    self, mto=jid, mbody=image_url, mtype=chat_type)
-                message['oob']['url'] = image_url
-                print(image_url)
-                message.send()
+                self, mto=jid, mbody=news_digest, mtype=chat_type)
+            # if media:
+            #     # message = xmpp.Slixfeed.make_message(
+            #     #     self, mto=jid, mbody=new, mtype=chat_type)
+            #     message = xmpp.Slixfeed.make_message(
+            #         self, mto=jid, mbody=media, mtype=chat_type)
+            #     message['oob']['url'] = media
+            #     message.send()
     # TODO Do not refresh task before
     # verifying that it was completed.
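
Attaching the enclosure to its own message with an out-of-band (XEP-0066) URL follows the usual slixmpp pattern; a minimal sketch, assuming the bot has the xep_0066 plugin registered (the helper name, jid and URL are placeholders):

    def send_media(xmpp_client, jid, media_url, chat_type="chat"):
        # Body carries the URL too, so clients without OOB support still get a link.
        message = xmpp_client.make_message(
            mto=jid, mbody=media_url, mtype=chat_type)
        message["oob"]["url"] = media_url
        message.send()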


@@ -18,6 +18,7 @@ TODO
 """

+import slixfeed.fetch as fetch
 import logging
 import os
 import slixfeed.action as action
@@ -451,7 +452,7 @@ async def message(self, message):
             status_type = "dnd"
             status_message = (
                 "📃️ Procesing request to produce {} document..."
-                ).format(ext)
+                ).format(ext.upper())
             send_status_message(
                 self, jid, status_type, status_message)
             db_file = get_pathname_to_database(jid)
@@ -461,27 +462,43 @@ async def message(self, message):
                 os.mkdir(data_dir)
             if not os.path.isdir(data_dir + '/readability'):
                 os.mkdir(data_dir + '/readability')
-            filename = os.path.join(
-                data_dir, "readability", "saved_article_" + timestamp() + "." + ext)
             try:
                 ix = int(ix_url)
                 try:
                     url = sqlite.get_entry_url(db_file, ix)
                 except:
-                    response = "No entry Id with {}".format(ix)
+                    response = "No entry with Id {}".format(ix)
             except:
                 url = ix_url
             if url:
                 url = uri.remove_tracking_parameters(url)
                 url = (uri.replace_hostname(url, "link")) or url
-                status = await action.generate_document(url, ext, filename)
+                result = await fetch.http(url)
+                data = result[0]
+                code = result[1]
+                if data:
+                    title = action.get_document_title(data)
+                    title = title.strip().lower()
+                    for i in (" ", "-"):
+                        title = title.replace(i, "_")
+                    for i in ("?", "'", "!"):
+                        title = title.replace(i, "")
+                    filename = os.path.join(
+                        data_dir, "readability",
+                        title + "_" + timestamp() + "." + ext)
+                    error = action.generate_document(
+                        data, url, ext, filename)
                     if status:
                         response = (
                             "Failed to export {}. Reason: {}"
-                            ).format(ext, status)
+                            ).format(ext.upper(), error)
                     else:
                         url = await upload.start(self, jid, filename)
                         await send_oob_message(self, jid, url)
+                else:
+                    response = (
+                        "Failed to fetch {}. Reason: {}"
+                        ).format(url, code)
                 await task.start_tasks_xmpp(
                     self, jid, ["status"])
             else:
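
The export filename is now derived from the article title; the same sanitisation in isolation (function name illustrative):

    def sanitize_title(title):
        # Lowercase, replace separators with underscores, drop punctuation.
        title = title.strip().lower()
        for ch in (" ", "-"):
            title = title.replace(ch, "_")
        for ch in ("?", "'", "!"):
            title = title.replace(ch, "")
        return title

    # sanitize_title("Isn't this a Title?")  ->  "isnt_this_a_title"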