Save enclosures

Send new message upon media detection
Schimon Jehudah 2024-01-13 17:17:43 +00:00
parent ec82aeb3cc
commit 43fa1a463c
5 changed files with 348 additions and 249 deletions


@@ -42,7 +42,7 @@ from slixfeed.url import (
)
import slixfeed.xmpp.bookmark as bookmark
from urllib import error
from urllib.parse import urlsplit
from urllib.parse import parse_qs, urlsplit
import xml.etree.ElementTree as ET
try:
@@ -688,9 +688,34 @@ async def scan(db_file, url):
if isinstance(date, int):
logging.error(
"Variable 'date' is int: {}".format(date))
media_link = ''
if entry.has_key("links"):
for e_link in entry.links:
try:
# if (link.rel == "enclosure" and
# (link.type.startswith("audio/") or
# link.type.startswith("image/") or
# link.type.startswith("video/"))
# ):
media_type = e_link.type[:e_link.type.index("/")]
if e_link.has_key("rel"):
if (e_link.rel == "enclosure" and
media_type in ("audio", "image", "video")):
media_link = e_link.href
media_link = join_url(url, e_link.href)
media_link = trim_url(media_link)
break
except:
logging.error(
"KeyError: 'href'\n"
"Missing 'href' attribute for {}".format(url))
logging.info(
"Continue scanning for next potential "
"enclosure of {}".format(link))
entry = {
"title": title,
"link": link,
"enclosure": media_link,
"entry_id": entry_id,
"url": url,
"date": date,
@@ -706,16 +731,23 @@ async def scan(db_file, url):
db_file, new_entries)
async def generate_document(url, ext, filename):
result = await fetch.http(url)
data = result[0]
code = result[1]
status = None
if data:
def get_document_title(data):
try:
document = Document(data)
title = document.short_title()
except:
document = BeautifulSoup(data, 'html.parser')
title = document.title.string
return title
def generate_document(data, url, ext, filename):
error = None
try:
document = Document(data)
content = document.summary()
except:
content = data
logging.warning(
"Check that package readability is installed.")
match ext:
@@ -727,7 +759,7 @@ async def generate_document(url, ext, filename):
except:
logging.warning(
"Check that package html2text is installed.")
status = (
error = (
"Package html2text was not found.")
case "pdf":
try:
@@ -736,12 +768,10 @@ async def generate_document(url, ext, filename):
logging.warning(
"Check that packages pdfkit and wkhtmltopdf "
"are installed.")
status = (
error = (
"Package pdfkit or wkhtmltopdf was not found.")
else:
status = code
if status:
return status
if error:
return error
# TODO Either adapt it to filename
# or change it to something else
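
A self-contained sketch of the title and summary extraction shown above, assuming the readability-lxml and beautifulsoup4 packages are installed; the helper name is illustrative.

from bs4 import BeautifulSoup
from readability import Document  # package: readability-lxml

def title_and_summary(data):
    # Prefer readability's extraction; fall back to the raw document.
    try:
        document = Document(data)
        return document.short_title(), document.summary()
    except Exception:
        soup = BeautifulSoup(data, "html.parser")
        return soup.title.string, data
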
@@ -751,28 +781,25 @@ async def generate_document(url, ext, filename):
# file.write(html_doc)
async def extract_image_from_feed(db_file, ix, url):
feed_url = sqlite.get_feed_url(db_file, ix)
async def extract_image_from_feed(db_file, feed_id, url):
feed_url = sqlite.get_feed_url(db_file, feed_id)
result = await fetch.http(feed_url)
document = result[0]
# breakpoint()
print("extract_image_from_feed")
if document:
feed = parse(document)
for entry in feed.entries:
print(len(feed.entries))
print(entry.link)
print(url)
try:
if entry.link == url:
for link in entry.links:
if (link.rel == "enclosure" and
link.type.startswith("image/")):
# if link.type.startswith("image/"):
image_url = link.href
print("found")
print(image_url)
break
return image_url
except:
logging.error(url)
logging.error(
"AttributeError: object has no attribute 'link'")
breakpoint()
async def extract_image_from_html(url):
@@ -783,16 +810,16 @@ async def extract_image_from_html(url):
document = Document(data)
content = document.summary()
except:
content = data
logging.warning(
"Check that package readability is installed.")
tree = html.fromstring(content)
# TODO Exclude banners, class="share" links etc.
images = tree.xpath('//img/@src')
if len(images):
image = images[0]
image = str(image)
image_url = complete_url(url, image)
else:
image_url = None
return image_url
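
A minimal sketch of the image extraction above, assuming lxml; urljoin stands in for the project's complete_url helper, and the function name is illustrative.

from urllib.parse import urljoin
from lxml import html

def first_image(content, base_url):
    tree = html.fromstring(content)
    # TODO Exclude banners, class="share" links, etc.
    images = tree.xpath('//img/@src')
    return urljoin(base_url, str(images[0])) if images else None
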
@@ -813,6 +840,35 @@ def generate_markdown(text, filename):
file.write(markdown)
# TODO Add support for eDonkey, Gnutella, Soulseek
async def get_magnet(link):
parted_link = urlsplit(link)
queries = parse_qs(parted_link.query)
query_xt = queries["xt"][0]
if query_xt.startswith("urn:btih:"):
filename = queries["dn"][0]
checksum = query_xt[len("urn:btih:"):]
torrent = await fetch.magnet(link)
logging.debug(
"Attempting to retrieve {} ({})".format(
filename, checksum))
if not torrent:
logging.debug(
"Attempting to retrieve {} from HTTP caching service".format(
filename))
urls = [
'https://watercache.libertycorp.org/get/{}/{}',
'https://itorrents.org/torrent/{}.torrent?title={}',
'https://firecache.libertycorp.org/get/{}/{}',
'http://fcache63sakpihd44kxdduy6kgpdhgejgp323wci435zwy6kiylcnfad.onion/get/{}/{}'
]
for url in urls:
torrent = fetch.http(url.format(checksum, filename))
if torrent:
break
return torrent
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls):
"""

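A short sketch of the magnet URI parsing used by get_magnet above; the magnet link is a made-up example, and only the xt and dn query fields are read.

from urllib.parse import parse_qs, urlsplit

link = ("magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567"
        "&dn=example.iso")  # hypothetical magnet URI
queries = parse_qs(urlsplit(link).query)
query_xt = queries["xt"][0]
if query_xt.startswith("urn:btih:"):
    checksum = query_xt[len("urn:btih:"):]  # BitTorrent info-hash
    filename = queries["dn"][0]             # display name
    print(checksum, filename)
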

@@ -28,9 +28,16 @@ from asyncio import TimeoutError
# from asyncio.exceptions import IncompleteReadError
# from bs4 import BeautifulSoup
# from http.client import IncompleteRead
import logging
# from lxml import html
import slixfeed.config as config
# from xml.etree.ElementTree import ElementTree, ParseError
import slixfeed.config as config
try:
from magnet2torrent import Magnet2Torrent, FailedToFetchException
except:
logging.info(
"Package magnet2torrent was not found.\n"
"BitTorrent is disabled.")
# async def dat():
@@ -105,3 +112,11 @@ async def http(url):
False, "Timeout: " + str(e)
]
return msg
async def magnet(link):
m2t = Magnet2Torrent(link)
try:
filename, torrent_data = await m2t.retrieve_torrent()
except FailedToFetchException:
logging.debug("Failed")
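
For reference, a minimal sketch of magnet2torrent's documented usage, wrapped in an illustrative coroutine that also returns the result on success; names other than Magnet2Torrent and retrieve_torrent are assumptions.

import logging
from magnet2torrent import Magnet2Torrent, FailedToFetchException

async def fetch_torrent(link):
    m2t = Magnet2Torrent(link)
    try:
        filename, torrent_data = await m2t.retrieve_torrent()
        return filename, torrent_data
    except FailedToFetchException:
        logging.debug("Failed to fetch torrent for %s", link)
        return None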


@@ -129,6 +129,7 @@ def create_tables(db_file):
id INTEGER NOT NULL,
title TEXT NOT NULL,
link TEXT NOT NULL,
enclosure TEXT,
entry_id TEXT NOT NULL,
feed_id INTEGER NOT NULL,
timestamp TEXT,
@@ -146,6 +147,7 @@ def create_tables(db_file):
id INTEGER NOT NULL,
title TEXT NOT NULL,
link TEXT NOT NULL,
enclosure TEXT,
entry_id TEXT NOT NULL,
feed_id INTEGER NOT NULL,
timestamp TEXT,
@@ -486,7 +488,8 @@ async def remove_feed_by_url(db_file, url):
cur = conn.cursor()
sql = (
"""
DELETE FROM feeds
DELETE
FROM feeds
WHERE url = ?
"""
)
@@ -556,7 +559,8 @@ async def get_feed_id_and_name(db_file, url):
result : list
List of ID and Name of feed.
"""
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT id, name
@@ -677,11 +681,11 @@ async def get_unread_entries(db_file, num):
cur = conn.cursor()
sql = (
"""
SELECT id, title, link, feed_id, timestamp
SELECT id, title, link, enclosure, feed_id, timestamp
FROM entries
WHERE read = 0
UNION ALL
SELECT id, title, link, feed_id, timestamp
SELECT id, title, link, enclosure, feed_id, timestamp
FROM archive
ORDER BY timestamp
DESC LIMIT :num
@@ -861,17 +865,9 @@ def get_entry_url(db_file, ix):
return url
def get_feed_url(db_file, ix):
def get_feed_url(db_file, feed_id):
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = ( # TODO Handle table archive too
"""
SELECT feed_id
FROM entries
WHERE id = :ix
"""
)
feed_id = cur.execute(sql, (ix,)).fetchone()[0]
sql = (
"""
SELECT url
@@ -1152,14 +1148,15 @@ async def add_entries_and_update_timestamp(db_file, new_entries):
"""
INSERT
INTO entries(
title, link, entry_id, feed_id, timestamp, read)
title, link, enclosure, entry_id, feed_id, timestamp, read)
VALUES(
:title, :link, :entry_id, :feed_id, :timestamp, :read)
:title, :link, :enclosure, :entry_id, :feed_id, :timestamp, :read)
"""
)
cur.execute(sql, {
"title": entry["title"],
"link": entry["link"],
"enclosure": entry["enclosure"],
"entry_id": entry["entry_id"],
"feed_id": feed_id,
"timestamp": entry["date"],
@@ -1338,10 +1335,12 @@ async def maintain_archive(db_file, limit):
"""
DELETE FROM archive
WHERE id
IN (SELECT id
IN (
SELECT id
FROM archive
ORDER BY timestamp ASC
LIMIT :difference)
LIMIT :difference
)
"""
)
cur.execute(sql, {
@@ -1452,7 +1451,8 @@ async def get_feeds(db_file):
# Select name, url (feeds) updated, enabled, feed_id (status)
# 2) Sort feeds by id. Sort status by feed_id
# results += cur.execute(sql).fetchall()
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT name, url, id
@@ -1479,7 +1479,8 @@ async def last_entries(db_file, num):
titles_list : str
List of recent N entries as message.
"""
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
# sql = (
# "SELECT title, link "
# "FROM entries "
@@ -1520,7 +1521,8 @@ async def search_feeds(db_file, query):
titles_list : str
Feeds of specified keywords as message.
"""
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT name, id, url
@@ -1551,7 +1553,8 @@ async def search_entries(db_file, query):
titles_list : str
Entries of specified keywords as message.
"""
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT title, link
@@ -1619,7 +1622,8 @@ async def check_entry_exist(
bool
True or None.
"""
cur = get_cursor(db_file)
with create_connection(db_file) as conn:
cur = conn.cursor()
exist = False
if entry_id:
feed_id = get_feed_id(cur, url)
@@ -1627,9 +1631,7 @@ async def check_entry_exist(
"""
SELECT id
FROM entries
WHERE
entry_id = :entry_id and
feed_id = :feed_id
WHERE entry_id = :entry_id and feed_id = :feed_id
"""
)
result = cur.execute(sql, {
@@ -1642,10 +1644,7 @@ async def check_entry_exist(
"""
SELECT id
FROM entries
WHERE
title = :title and
link = :link and
timestamp = :date
WHERE title = :title and link = :link and timestamp = :date
"""
)
try:
@@ -1663,9 +1662,7 @@ async def check_entry_exist(
"""
SELECT id
FROM entries
WHERE
title = :title and
link = :link
WHERE title = :title and link = :link
"""
)
result = cur.execute(sql, {

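A self-contained sketch of the new enclosure column in use, with a simplified single-table schema (the project also keeps an archive table and foreign keys); the file name and values are illustrative.

import sqlite3

con = sqlite3.connect("feeds.db")  # hypothetical database file
cur = con.cursor()
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS entries (
        id INTEGER PRIMARY KEY,
        title TEXT NOT NULL,
        link TEXT NOT NULL,
        enclosure TEXT,
        entry_id TEXT NOT NULL,
        feed_id INTEGER NOT NULL,
        timestamp TEXT,
        read INTEGER NOT NULL DEFAULT 0
    )
    """)
cur.execute(
    """
    INSERT INTO entries (title, link, enclosure, entry_id, feed_id, timestamp, read)
    VALUES (:title, :link, :enclosure, :entry_id, :feed_id, :timestamp, :read)
    """,
    {"title": "Example post", "link": "https://example.org/post",
     "enclosure": "https://example.org/audio.ogg", "entry_id": "tag:example,2024:1",
     "feed_id": 1, "timestamp": "2024-01-13T17:17:43+00:00", "read": 0})
con.commit()
row = cur.execute(
    """
    SELECT id, title, link, enclosure, feed_id, timestamp
    FROM entries
    WHERE read = 0
    ORDER BY timestamp DESC
    LIMIT 1
    """).fetchone()
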

@@ -227,46 +227,60 @@ async def send_update(self, jid, num=None):
num = int(num)
news_digest = []
results = await get_unread_entries(db_file, num)
image_url = None
news_digest = ''
media = None
chat_type = await utility.jid_type(self, jid)
for result in results:
ix = result[0]
title_e = result[1]
url = result[2]
feed_id = result[3]
date = result[4]
enclosure = result[3]
feed_id = result[4]
date = result[5]
title_f = get_feed_title(db_file, feed_id)
news_item = action.list_unread_entries(result, title_f)
news_digest.extend([news_item])
news_digest += action.list_unread_entries(result, title_f)
# print(db_file)
# print(result[0])
# breakpoint()
await mark_as_read(db_file, result[0])
if not image_url:
image_url = await action.extract_image_from_feed(
db_file, ix, url)
if not image_url:
image_url = await action.extract_image_from_html(url)
print("image_url")
print(image_url)
new = " ".join(news_digest)
# breakpoint()
if new:
await mark_as_read(db_file, ix)
# Find media
if url.startswith("magnet:"):
media = action.get_magnet(url)
elif enclosure.startswith("magnet:"):
media = action.get_magnet(enclosure)
elif enclosure:
media = enclosure
else:
media = await action.extract_image_from_html(url)
if media and news_digest:
# Send textual message
xmpp.Slixfeed.send_message(
self, mto=jid, mbody=news_digest, mtype=chat_type)
news_digest = ''
# Send media
message = xmpp.Slixfeed.make_message(
self, mto=jid, mbody=media, mtype=chat_type)
message['oob']['url'] = media
message.send()
media = None
if news_digest:
# TODO Add while loop to assure delivery.
# print(await current_time(), ">>> ACT send_message",jid)
chat_type = await utility.jid_type(self, jid)
# NOTE Do we need "if statement"? See NOTE at is_muc.
if chat_type in ("chat", "groupchat"):
# TODO Provide a choice (with or without images)
xmpp.Slixfeed.send_message(
self, mto=jid, mbody=new, mtype=chat_type)
if image_url:
self, mto=jid, mbody=news_digest, mtype=chat_type)
# if media:
# # message = xmpp.Slixfeed.make_message(
# # self, mto=jid, mbody=new, mtype=chat_type)
# message = xmpp.Slixfeed.make_message(
# self, mto=jid, mbody=new, mtype=chat_type)
message = xmpp.Slixfeed.make_message(
self, mto=jid, mbody=image_url, mtype=chat_type)
message['oob']['url'] = image_url
print(image_url)
message.send()
# self, mto=jid, mbody=media, mtype=chat_type)
# message['oob']['url'] = media
# message.send()
# TODO Do not refresh task before
# verifying that it was completed.
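
A minimal sketch of the digest-plus-media pattern above with slixmpp, assuming the xep_0066 (Out of Band Data) plugin is registered; the class and method names are illustrative.

import slixmpp

class Bot(slixmpp.ClientXMPP):
    def __init__(self, jid, password):
        super().__init__(jid, password)
        self.register_plugin('xep_0066')  # Out of Band Data

    def send_digest_and_media(self, jid, digest, media_url, chat_type='chat'):
        # Send the textual digest first.
        self.send_message(mto=jid, mbody=digest, mtype=chat_type)
        # Send the media URL as its own message so clients can render a preview.
        message = self.make_message(mto=jid, mbody=media_url, mtype=chat_type)
        message['oob']['url'] = media_url
        message.send()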


@@ -18,6 +18,7 @@ TODO
"""
import slixfeed.fetch as fetch
import logging
import os
import slixfeed.action as action
@@ -451,7 +452,7 @@ async def message(self, message):
status_type = "dnd"
status_message = (
"📃️ Procesing request to produce {} document..."
).format(ext)
).format(ext.upper())
send_status_message(
self, jid, status_type, status_message)
db_file = get_pathname_to_database(jid)
@@ -461,27 +462,43 @@ async def message(self, message):
os.mkdir(data_dir)
if not os.path.isdir(data_dir + '/readability'):
os.mkdir(data_dir + '/readability')
filename = os.path.join(
data_dir, "readability", "saved_article_" + timestamp() + "." + ext)
try:
ix = int(ix_url)
try:
url = sqlite.get_entry_url(db_file, ix)
except:
response = "No entry Id with {}".format(ix)
response = "No entry with Id {}".format(ix)
except:
url = ix_url
if url:
url = uri.remove_tracking_parameters(url)
url = (uri.replace_hostname(url, "link")) or url
status = await action.generate_document(url, ext, filename)
result = await fetch.http(url)
data = result[0]
code = result[1]
if data:
title = action.get_document_title(data)
title = title.strip().lower()
for i in (" ", "-"):
title = title.replace(i, "_")
for i in ("?", "'", "!"):
title = title.replace(i, "")
filename = os.path.join(
data_dir, "readability",
title + "_" + timestamp() + "." + ext)
error = action.generate_document(
data, url, ext, filename)
if status:
response = (
"Failed to export {}. Reason: {}"
).format(ext, status)
).format(ext.upper(), error)
else:
url = await upload.start(self, jid, filename)
await send_oob_message(self, jid, url)
else:
response = (
"Failed to fetch {}. Reason: {}"
).format(url, code)
await task.start_tasks_xmpp(
self, jid, ["status"])
else:
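
Finally, a small sketch of the filename-from-title cleanup performed above; the helper name and the timestamp format are illustrative (the project uses its own timestamp() helper).

from datetime import datetime, timezone

def title_to_filename(title, ext):
    title = title.strip().lower()
    for char in (" ", "-"):
        title = title.replace(char, "_")
    for char in ("?", "'", "!"):
        title = title.replace(char, "")
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    return "{}_{}.{}".format(title, stamp, ext)

print(title_to_filename("Isn't this a test - really?", "pdf"))
# e.g. isnt_this_a_test___really_20240113T171743.pdf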