Detect image from xml enclosure in addition to html img

This commit is contained in:
Schimon Jehudah 2024-01-11 10:55:42 +00:00
parent b675618b95
commit ec82aeb3cc
6 changed files with 131 additions and 67 deletions

View file

@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
async def import_opml(db_file, url):
result = await fetch.download_feed(url)
result = await fetch.http(url)
document = result[0]
if document:
root = ET.fromstring(document)
@ -378,7 +378,7 @@ async def add_feed(db_file, url):
while True:
exist = await sqlite.get_feed_id_and_name(db_file, url)
if not exist:
result = await fetch.download_feed(url)
result = await fetch.http(url)
document = result[0]
status_code = result[1]
if document:
@ -458,7 +458,7 @@ async def add_feed(db_file, url):
async def view_feed(url):
while True:
result = await fetch.download_feed(url)
result = await fetch.http(url)
document = result[0]
status = result[1]
if document:
@ -523,7 +523,7 @@ async def view_feed(url):
async def view_entry(url, num):
while True:
result = await fetch.download_feed(url)
result = await fetch.http(url)
document = result[0]
status = result[1]
if document:
@ -602,7 +602,7 @@ async def scan(db_file, url):
URL. The default is None.
"""
if isinstance(url, tuple): url = url[0]
result = await fetch.download_feed(url)
result = await fetch.http(url)
try:
document = result[0]
status = result[1]
@ -706,32 +706,85 @@ async def scan(db_file, url):
db_file, new_entries)
async def get_content(url):
result = await fetch.download_feed(url)
async def generate_document(url, ext, filename):
result = await fetch.http(url)
data = result[0]
code = result[1]
status = None
if data:
try:
document = Document(result[0])
document = Document(data)
content = document.summary()
info = [content, code]
except:
logging.warning(
"Install package readability.")
info = result
"Check that package readability is installed.")
match ext:
case "html":
generate_html(content, filename)
case "md":
try:
generate_markdown(content, filename)
except:
logging.warning(
"Check that package html2text is installed.")
status = (
"Package html2text was not found.")
case "pdf":
try:
generate_pdf(content, filename)
except:
logging.warning(
"Check that packages pdfkit and wkhtmltopdf "
"are installed.")
status = (
"Package pdfkit or wkhtmltopdf was not found.")
else:
info = [None, code]
return info
# TODO Either adapt it to filename
# or change it to something else
#filename = document.title()
# with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
status = code
if status:
return status
# TODO Either adapt it to filename
# or change it to something else
#filename = document.title()
# with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
def extract_first_image(url, content):
async def extract_image_from_feed(db_file, ix, url):
    """
    Look up an image enclosure for an entry inside its source feed.

    The feed that owns entry ``ix`` is downloaded and parsed again, the
    entry whose link equals ``url`` is located, and the first of its
    links that is an enclosure with an ``image/*`` media type is
    returned.

    Parameters
    ----------
    db_file : str
        Database reference, passed as is to sqlite.get_feed_url.
    ix : str
        Index of entry, passed as is to sqlite.get_feed_url.
    url : str
        Link of the entry to match against the links of feed entries.

    Returns
    -------
    image_url : str or None
        URL of the first matching image enclosure, or None when the
        feed could not be retrieved or no image enclosure was found.
    """
    feed_url = sqlite.get_feed_url(db_file, ix)
    if not feed_url:
        return None
    result = await fetch.http(feed_url)
    document = result[0]
    if not document:
        return None
    feed = parse(document)
    for entry in feed.entries:
        # Entries of a parsed feed behave like dictionaries; use .get()
        # because "link", "links", "rel" and "type" are all optional.
        if entry.get("link") != url:
            continue
        for link in entry.get("links", []):
            media_type = link.get("type") or ""
            if (link.get("rel") == "enclosure" and
                media_type.startswith("image/")):
                image_url = link.get("href")
                logging.debug(
                    "Found image enclosure %s for %s", image_url, url)
                # Stop at the first match instead of scanning the
                # remaining entries of the feed.
                return image_url
    return None
async def extract_image_from_html(url):
result = await fetch.http(url)
data = result[0]
if data:
try:
document = Document(data)
content = document.summary()
except:
logging.warning(
"Check that package readability is installed.")
tree = html.fromstring(content)
images = tree.xpath('//img/@src')
if len(images):
@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
for url in urls:
# print(os.path.basename(db_file), url[0])
url = url[0]
res = await fetch.download_feed(url)
res = await fetch.http(url)
# TypeError: 'NoneType' object is not subscriptable
if res is None:
# Skip to next feed

View file

@ -22,7 +22,7 @@ from feedparser import parse
import logging
from lxml import html
import slixfeed.config as config
from slixfeed.fetch import download_feed
import slixfeed.fetch as fetch
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit
@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
xpath_query = (
"(//a[contains(@href,'{}')])[position()<={}]"
).format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
xpath_query = (
"(//a[contains(@href,'{}')])[position()>last()-{}]"
).format(path, num)
addresses += tree.xpath(xpath_query)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await download_feed(feed)
# res = await fetch.http(feed)
# if res[0]:
# disco = parse(res[0])
# title = disco["feed"]["title"]
@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
async def process_feed_selection(url, urls):
feeds = {}
for i in urls:
res = await download_feed(i)
res = await fetch.http(i)
if res[1] == 200:
try:
feeds[i] = [parse(res[0])]
@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
feed_url_mark = 0
for feed_url in feeds:
# try:
# res = await download_feed(feed)
# res = await fetch.http(feed)
# except:
# continue
feed_name = None
@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
# async def start(url):
# while True:
# result = await fetch.download_feed(url)
# result = await fetch.http(url)
# document = result[0]
# status = result[1]
# if document:

View file

@ -45,7 +45,7 @@ import slixfeed.config as config
# async def ipfs():
async def download_feed(url):
async def http(url):
"""
Download content of given URL.

View file

@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
return title
# TODO Handle table archive too
def get_entry_url(db_file, ix):
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
sql = ( # TODO Handletable archive too
"""
SELECT link
FROM entries
@ -862,6 +861,28 @@ def get_entry_url(db_file, ix):
return url
def get_feed_url(db_file, ix):
    """
    Get the URL of the feed to which a given entry belongs.

    Parameters
    ----------
    db_file : str
        Database reference, passed as is to create_connection.
    ix : str
        Index of entry (column ``id`` of table ``entries``).

    Returns
    -------
    url : str or None
        URL of the owning feed, or None when no entry with the given
        index exists or its feed is missing.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = ( # TODO Handle table archive too
            """
            SELECT feeds.url
            FROM feeds
            INNER JOIN entries ON feeds.id = entries.feed_id
            WHERE entries.id = :ix
            """
            )
        # Named placeholders must be bound with a mapping; binding them
        # with a sequence is deprecated by the sqlite3 module.
        par = {"ix": ix}
        row = cur.execute(sql, par).fetchone()
    # fetchone() yields None when nothing matched; do not subscript it.
    return row[0] if row else None
async def mark_as_read(db_file, ix):
async with DBLOCK:
with create_connection(db_file) as conn:

View file

@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
# breakpoint()
await mark_as_read(db_file, result[0])
if not image_url:
info = await action.get_content(url)
content = info[1]
status = info[0]
if status == 200:
image_url = action.extract_first_image(url, content)
image_url = await action.extract_image_from_feed(
db_file, ix, url)
if not image_url:
image_url = await action.extract_image_from_html(url)
print("image_url")
print(image_url)
new = " ".join(news_digest)
# breakpoint()
if new:

View file

@ -445,6 +445,8 @@ async def message(self, message):
ix_url = message_text.split(" ")[0]
ext = " ".join(message_text.split(" ")[1:])
ext = ext if ext else 'pdf'
url = None
status = None
if ext in ("html", "md", "pdf"):
status_type = "dnd"
status_message = (
@ -469,42 +471,25 @@ async def message(self, message):
response = "No entry Id with {}".format(ix)
except:
url = ix_url
url = uri.remove_tracking_parameters(url)
url = (uri.replace_hostname(url, "link")) or url
info = await action.get_content(url)
content = info[0]
status = info[1]
if content:
try:
match ext:
case "html":
action.generate_html(content, filename)
case "md":
action.generate_markdown(content, filename)
case "pdf":
action.generate_pdf(content, filename)
url = await upload.start(
self, jid, filename)
await send_oob_message(
self, jid, url)
except:
logging.warning(
"Check that packages html2text, pdfkit "
"and wkhtmltopdf are installed")
if url:
url = uri.remove_tracking_parameters(url)
url = (uri.replace_hostname(url, "link")) or url
status = await action.generate_document(url, ext, filename)
if status:
response = (
"Failed to export to {}"
).format(ext)
await task.start_tasks_xmpp(
self, jid, ["status"])
else:
response = (
"Failed to fetch resource. Reason: {}"
).format(status)
"Failed to export {}. Reason: {}"
).format(ext, status)
else:
url = await upload.start(self, jid, filename)
await send_oob_message(self, jid, url)
await task.start_tasks_xmpp(
self, jid, ["status"])
else:
response = "Missing entry Id."
else:
response = "Unsupported filetype."
if response:
print(response)
send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")):