Detect image from xml enclosure in addition to html img

This commit is contained in:
Schimon Jehudah 2024-01-11 10:55:42 +00:00
parent b675618b95
commit ec82aeb3cc
6 changed files with 131 additions and 67 deletions

View file

@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
async def import_opml(db_file, url): async def import_opml(db_file, url):
result = await fetch.download_feed(url) result = await fetch.http(url)
document = result[0] document = result[0]
if document: if document:
root = ET.fromstring(document) root = ET.fromstring(document)
@ -378,7 +378,7 @@ async def add_feed(db_file, url):
while True: while True:
exist = await sqlite.get_feed_id_and_name(db_file, url) exist = await sqlite.get_feed_id_and_name(db_file, url)
if not exist: if not exist:
result = await fetch.download_feed(url) result = await fetch.http(url)
document = result[0] document = result[0]
status_code = result[1] status_code = result[1]
if document: if document:
@ -458,7 +458,7 @@ async def add_feed(db_file, url):
async def view_feed(url): async def view_feed(url):
while True: while True:
result = await fetch.download_feed(url) result = await fetch.http(url)
document = result[0] document = result[0]
status = result[1] status = result[1]
if document: if document:
@ -523,7 +523,7 @@ async def view_feed(url):
async def view_entry(url, num): async def view_entry(url, num):
while True: while True:
result = await fetch.download_feed(url) result = await fetch.http(url)
document = result[0] document = result[0]
status = result[1] status = result[1]
if document: if document:
@ -602,7 +602,7 @@ async def scan(db_file, url):
URL. The default is None. URL. The default is None.
""" """
if isinstance(url, tuple): url = url[0] if isinstance(url, tuple): url = url[0]
result = await fetch.download_feed(url) result = await fetch.http(url)
try: try:
document = result[0] document = result[0]
status = result[1] status = result[1]
@ -706,32 +706,85 @@ async def scan(db_file, url):
db_file, new_entries) db_file, new_entries)
async def generate_document(url, ext, filename):
async def get_content(url): result = await fetch.http(url)
result = await fetch.download_feed(url)
data = result[0] data = result[0]
code = result[1] code = result[1]
status = None
if data: if data:
try: try:
document = Document(result[0]) document = Document(data)
content = document.summary() content = document.summary()
info = [content, code]
except: except:
logging.warning( logging.warning(
"Install package readability.") "Check that package readability is installed.")
info = result match ext:
case "html":
generate_html(content, filename)
case "md":
try:
generate_markdown(content, filename)
except:
logging.warning(
"Check that package html2text is installed.")
status = (
"Package html2text was not found.")
case "pdf":
try:
generate_pdf(content, filename)
except:
logging.warning(
"Check that packages pdfkit and wkhtmltopdf "
"are installed.")
status = (
"Package pdfkit or wkhtmltopdf was not found.")
else: else:
info = [None, code] status = code
return info if status:
# TODO Either adapt it to filename return status
# or change it to something else
#filename = document.title() # TODO Either adapt it to filename
# with open(filename, 'w') as file: # or change it to something else
# html_doc = document.summary() #filename = document.title()
# file.write(html_doc) # with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
async def extract_image_from_feed(db_file, ix, url):
    """
    Find an image enclosure of an entry inside its feed document.

    The feed of the entry is downloaded and parsed, and the entry
    whose link equals the given URL is searched for a link of
    relation "enclosure" with a media type starting with "image/".

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index of entry.
    url : str
        Link of the entry, as it appears in the feed.

    Returns
    -------
    image_url : str or None
        Address of the first matching image enclosure, or None if
        the feed could not be retrieved or no enclosure matched.
    """
    feed_url = sqlite.get_feed_url(db_file, ix)
    if not feed_url:
        # Nothing to download without an address of a feed.
        return None
    result = await fetch.http(feed_url)
    document = result[0]
    if not document:
        return None
    feed = parse(document)
    for entry in feed.entries:
        # Entries and links are accessed as mappings, because not
        # every one of them carries "link", "rel", "type" or "href".
        if entry.get("link") != url:
            continue
        for link in entry.get("links", []):
            media_type = link.get("type") or ""
            image_url = link.get("href")
            if (link.get("rel") == "enclosure" and
                    media_type.startswith("image/") and
                    image_url):
                return image_url
    return None
async def extract_image_from_html(url):
result = await fetch.http(url)
data = result[0]
if data:
try:
document = Document(data)
content = document.summary()
except:
logging.warning(
"Check that package readability is installed.")
tree = html.fromstring(content) tree = html.fromstring(content)
images = tree.xpath('//img/@src') images = tree.xpath('//img/@src')
if len(images): if len(images):
@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
for url in urls: for url in urls:
# print(os.path.basename(db_file), url[0]) # print(os.path.basename(db_file), url[0])
url = url[0] url = url[0]
res = await fetch.download_feed(url) res = await fetch.http(url)
# TypeError: 'NoneType' object is not subscriptable # TypeError: 'NoneType' object is not subscriptable
if res is None: if res is None:
# Skip to next feed # Skip to next feed

View file

@ -22,7 +22,7 @@ from feedparser import parse
import logging import logging
from lxml import html from lxml import html
import slixfeed.config as config import slixfeed.config as config
from slixfeed.fetch import download_feed import slixfeed.fetch as fetch
from slixfeed.url import complete_url, join_url, trim_url from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit from urllib.parse import urlsplit, urlunsplit
@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path) # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path) # xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5 num = 5
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num) xpath_query = (
"(//a[contains(@href,'{}')])[position()<={}]"
).format(path, num)
addresses = tree.xpath(xpath_query) addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num) xpath_query = (
"(//a[contains(@href,'{}')])[position()>last()-{}]"
).format(path, num)
addresses += tree.xpath(xpath_query) addresses += tree.xpath(xpath_query)
# NOTE Should number of addresses be limited or # NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end # perhaps be N from the start and N from the end
@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
# # The following code will catch # # The following code will catch
# # only valid resources (i.e. not 404); # # only valid resources (i.e. not 404);
# # The following code requires more bandwidth. # # The following code requires more bandwidth.
# res = await download_feed(feed) # res = await fetch.http(feed)
# if res[0]: # if res[0]:
# disco = parse(res[0]) # disco = parse(res[0])
# title = disco["feed"]["title"] # title = disco["feed"]["title"]
@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
async def process_feed_selection(url, urls): async def process_feed_selection(url, urls):
feeds = {} feeds = {}
for i in urls: for i in urls:
res = await download_feed(i) res = await fetch.http(i)
if res[1] == 200: if res[1] == 200:
try: try:
feeds[i] = [parse(res[0])] feeds[i] = [parse(res[0])]
@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
feed_url_mark = 0 feed_url_mark = 0
for feed_url in feeds: for feed_url in feeds:
# try: # try:
# res = await download_feed(feed) # res = await fetch.http(feed)
# except: # except:
# continue # continue
feed_name = None feed_name = None
@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
# async def start(url): # async def start(url):
# while True: # while True:
# result = await fetch.download_feed(url) # result = await fetch.http(url)
# document = result[0] # document = result[0]
# status = result[1] # status = result[1]
# if document: # if document:

View file

@ -45,7 +45,7 @@ import slixfeed.config as config
# async def ipfs(): # async def ipfs():
async def download_feed(url): async def http(url):
""" """
Download content of given URL. Download content of given URL.

View file

@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
return title return title
# TODO Handle table archive too
def get_entry_url(db_file, ix): def get_entry_url(db_file, ix):
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( sql = ( # TODO Handletable archive too
""" """
SELECT link SELECT link
FROM entries FROM entries
@ -862,6 +861,28 @@ def get_entry_url(db_file, ix):
return url return url
def get_feed_url(db_file, ix):
    """
    Get the URL of the feed to which a given entry belongs.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index of entry.

    Returns
    -------
    url : str or None
        URL of the feed, or None if there is no entry of the given
        index or its feed is not listed in table feeds.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        sql = (  # TODO Handle table archive too
            """
            SELECT feeds.url
            FROM feeds
            INNER JOIN entries ON entries.feed_id = feeds.id
            WHERE entries.id = :ix
            """
            )
        # Named placeholders must be bound by a mapping; binding
        # them by a sequence is deprecated since Python 3.12.
        row = cur.execute(sql, {"ix": ix}).fetchone()
    return row[0] if row else None
async def mark_as_read(db_file, ix): async def mark_as_read(db_file, ix):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:

View file

@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
# breakpoint() # breakpoint()
await mark_as_read(db_file, result[0]) await mark_as_read(db_file, result[0])
if not image_url: if not image_url:
info = await action.get_content(url) image_url = await action.extract_image_from_feed(
content = info[1] db_file, ix, url)
status = info[0] if not image_url:
if status == 200: image_url = await action.extract_image_from_html(url)
image_url = action.extract_first_image(url, content) print("image_url")
print(image_url)
new = " ".join(news_digest) new = " ".join(news_digest)
# breakpoint() # breakpoint()
if new: if new:

View file

@ -445,6 +445,8 @@ async def message(self, message):
ix_url = message_text.split(" ")[0] ix_url = message_text.split(" ")[0]
ext = " ".join(message_text.split(" ")[1:]) ext = " ".join(message_text.split(" ")[1:])
ext = ext if ext else 'pdf' ext = ext if ext else 'pdf'
url = None
status = None
if ext in ("html", "md", "pdf"): if ext in ("html", "md", "pdf"):
status_type = "dnd" status_type = "dnd"
status_message = ( status_message = (
@ -469,42 +471,25 @@ async def message(self, message):
response = "No entry Id with {}".format(ix) response = "No entry Id with {}".format(ix)
except: except:
url = ix_url url = ix_url
url = uri.remove_tracking_parameters(url) if url:
url = (uri.replace_hostname(url, "link")) or url url = uri.remove_tracking_parameters(url)
info = await action.get_content(url) url = (uri.replace_hostname(url, "link")) or url
content = info[0] status = await action.generate_document(url, ext, filename)
status = info[1] if status:
if content:
try:
match ext:
case "html":
action.generate_html(content, filename)
case "md":
action.generate_markdown(content, filename)
case "pdf":
action.generate_pdf(content, filename)
url = await upload.start(
self, jid, filename)
await send_oob_message(
self, jid, url)
except:
logging.warning(
"Check that packages html2text, pdfkit "
"and wkhtmltopdf are installed")
response = ( response = (
"Failed to export to {}" "Failed to export {}. Reason: {}"
).format(ext) ).format(ext, status)
await task.start_tasks_xmpp( else:
self, jid, ["status"]) url = await upload.start(self, jid, filename)
else: await send_oob_message(self, jid, url)
response = ( await task.start_tasks_xmpp(
"Failed to fetch resource. Reason: {}" self, jid, ["status"])
).format(status)
else: else:
response = "Missing entry Id." response = "Missing entry Id."
else: else:
response = "Unsupported filetype." response = "Unsupported filetype."
if response: if response:
print(response)
send_reply_message(self, message, response) send_reply_message(self, message, response)
# case _ if (message_lowercase.startswith("http")) and( # case _ if (message_lowercase.startswith("http")) and(
# message_lowercase.endswith(".opml")): # message_lowercase.endswith(".opml")):