Detect image from xml enclosure in addition to html img
This commit is contained in:
parent
b675618b95
commit
ec82aeb3cc
6 changed files with 131 additions and 67 deletions
|
@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
|
|||
|
||||
|
||||
async def import_opml(db_file, url):
|
||||
result = await fetch.download_feed(url)
|
||||
result = await fetch.http(url)
|
||||
document = result[0]
|
||||
if document:
|
||||
root = ET.fromstring(document)
|
||||
|
@ -378,7 +378,7 @@ async def add_feed(db_file, url):
|
|||
while True:
|
||||
exist = await sqlite.get_feed_id_and_name(db_file, url)
|
||||
if not exist:
|
||||
result = await fetch.download_feed(url)
|
||||
result = await fetch.http(url)
|
||||
document = result[0]
|
||||
status_code = result[1]
|
||||
if document:
|
||||
|
@ -458,7 +458,7 @@ async def add_feed(db_file, url):
|
|||
|
||||
async def view_feed(url):
|
||||
while True:
|
||||
result = await fetch.download_feed(url)
|
||||
result = await fetch.http(url)
|
||||
document = result[0]
|
||||
status = result[1]
|
||||
if document:
|
||||
|
@ -523,7 +523,7 @@ async def view_feed(url):
|
|||
|
||||
async def view_entry(url, num):
|
||||
while True:
|
||||
result = await fetch.download_feed(url)
|
||||
result = await fetch.http(url)
|
||||
document = result[0]
|
||||
status = result[1]
|
||||
if document:
|
||||
|
@ -602,7 +602,7 @@ async def scan(db_file, url):
|
|||
URL. The default is None.
|
||||
"""
|
||||
if isinstance(url, tuple): url = url[0]
|
||||
result = await fetch.download_feed(url)
|
||||
result = await fetch.http(url)
|
||||
try:
|
||||
document = result[0]
|
||||
status = result[1]
|
||||
|
@ -706,23 +706,43 @@ async def scan(db_file, url):
|
|||
db_file, new_entries)
|
||||
|
||||
|
||||
|
||||
async def get_content(url):
|
||||
result = await fetch.download_feed(url)
|
||||
async def generate_document(url, ext, filename):
|
||||
result = await fetch.http(url)
|
||||
data = result[0]
|
||||
code = result[1]
|
||||
status = None
|
||||
if data:
|
||||
try:
|
||||
document = Document(result[0])
|
||||
document = Document(data)
|
||||
content = document.summary()
|
||||
info = [content, code]
|
||||
except:
|
||||
logging.warning(
|
||||
"Install package readability.")
|
||||
info = result
|
||||
"Check that package readability is installed.")
|
||||
match ext:
|
||||
case "html":
|
||||
generate_html(content, filename)
|
||||
case "md":
|
||||
try:
|
||||
generate_markdown(content, filename)
|
||||
except:
|
||||
logging.warning(
|
||||
"Check that package html2text is installed.")
|
||||
status = (
|
||||
"Package html2text was not found.")
|
||||
case "pdf":
|
||||
try:
|
||||
generate_pdf(content, filename)
|
||||
except:
|
||||
logging.warning(
|
||||
"Check that packages pdfkit and wkhtmltopdf "
|
||||
"are installed.")
|
||||
status = (
|
||||
"Package pdfkit or wkhtmltopdf was not found.")
|
||||
else:
|
||||
info = [None, code]
|
||||
return info
|
||||
status = code
|
||||
if status:
|
||||
return status
|
||||
|
||||
# TODO Either adapt it to filename
|
||||
# or change it to something else
|
||||
#filename = document.title()
|
||||
|
@ -731,7 +751,40 @@ async def get_content(url):
|
|||
# file.write(html_doc)
|
||||
|
||||
|
||||
def extract_first_image(url, content):
|
||||
async def extract_image_from_feed(db_file, ix, url):
    """
    Look up an image enclosure for an entry by re-reading its feed.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index of the entry; used to resolve the URL of its feed.
    url : str
        Link of the entry to search for among the feed entries.

    Returns
    -------
    image_url : str or None
        Address of the first enclosure link whose type starts with
        "image/", or None if no such link was found.
    """
    feed_url = sqlite.get_feed_url(db_file, ix)
    if not feed_url:
        return None
    result = await fetch.http(feed_url)
    document = result[0]
    if not document:
        return None
    feed = parse(document)
    for entry in feed.entries:
        if entry.get("link") != url:
            continue
        for link in entry.get("links", []):
            # "rel", "type" and "href" are optional on a link, so read
            # them defensively instead of via attribute access.
            is_enclosure = link.get("rel") == "enclosure"
            is_image = (link.get("type") or "").startswith("image/")
            image_url = link.get("href")
            if is_enclosure and is_image and image_url:
                return image_url
    # No matching entry or no image enclosure in it.
    return None
|
||||
|
||||
|
||||
async def extract_image_from_html(url):
|
||||
result = await fetch.http(url)
|
||||
data = result[0]
|
||||
if data:
|
||||
try:
|
||||
document = Document(data)
|
||||
content = document.summary()
|
||||
except:
|
||||
logging.warning(
|
||||
"Check that package readability is installed.")
|
||||
tree = html.fromstring(content)
|
||||
images = tree.xpath('//img/@src')
|
||||
if len(images):
|
||||
|
@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
|
|||
for url in urls:
|
||||
# print(os.path.basename(db_file), url[0])
|
||||
url = url[0]
|
||||
res = await fetch.download_feed(url)
|
||||
res = await fetch.http(url)
|
||||
# TypeError: 'NoneType' object is not subscriptable
|
||||
if res is None:
|
||||
# Skip to next feed
|
||||
|
|
|
@ -22,7 +22,7 @@ from feedparser import parse
|
|||
import logging
|
||||
from lxml import html
|
||||
import slixfeed.config as config
|
||||
from slixfeed.fetch import download_feed
|
||||
import slixfeed.fetch as fetch
|
||||
from slixfeed.url import complete_url, join_url, trim_url
|
||||
from urllib.parse import urlsplit, urlunsplit
|
||||
|
||||
|
@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
|
|||
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
|
||||
# xpath_query = "//a[contains(@href,'{}')]".format(path)
|
||||
num = 5
|
||||
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
|
||||
xpath_query = (
|
||||
"(//a[contains(@href,'{}')])[position()<={}]"
|
||||
).format(path, num)
|
||||
addresses = tree.xpath(xpath_query)
|
||||
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
|
||||
xpath_query = (
|
||||
"(//a[contains(@href,'{}')])[position()>last()-{}]"
|
||||
).format(path, num)
|
||||
addresses += tree.xpath(xpath_query)
|
||||
# NOTE Should number of addresses be limited or
|
||||
# perhaps be N from the start and N from the end
|
||||
|
@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
|
|||
# # The following code will catch
|
||||
# # only valid resources (i.e. not 404);
|
||||
# # The following code requires more bandwidth.
|
||||
# res = await download_feed(feed)
|
||||
# res = await fetch.http(feed)
|
||||
# if res[0]:
|
||||
# disco = parse(res[0])
|
||||
# title = disco["feed"]["title"]
|
||||
|
@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
|
|||
async def process_feed_selection(url, urls):
|
||||
feeds = {}
|
||||
for i in urls:
|
||||
res = await download_feed(i)
|
||||
res = await fetch.http(i)
|
||||
if res[1] == 200:
|
||||
try:
|
||||
feeds[i] = [parse(res[0])]
|
||||
|
@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
|
|||
feed_url_mark = 0
|
||||
for feed_url in feeds:
|
||||
# try:
|
||||
# res = await download_feed(feed)
|
||||
# res = await fetch.http(feed)
|
||||
# except:
|
||||
# continue
|
||||
feed_name = None
|
||||
|
@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
|
|||
|
||||
# async def start(url):
|
||||
# while True:
|
||||
# result = await fetch.download_feed(url)
|
||||
# result = await fetch.http(url)
|
||||
# document = result[0]
|
||||
# status = result[1]
|
||||
# if document:
|
||||
|
|
|
@ -45,7 +45,7 @@ import slixfeed.config as config
|
|||
|
||||
# async def ipfs():
|
||||
|
||||
async def download_feed(url):
|
||||
async def http(url):
|
||||
"""
|
||||
Download content of given URL.
|
||||
|
||||
|
|
|
@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
|
|||
return title
|
||||
|
||||
|
||||
# TODO Handle table archive too
|
||||
def get_entry_url(db_file, ix):
|
||||
with create_connection(db_file) as conn:
|
||||
cur = conn.cursor()
|
||||
sql = (
|
||||
sql = ( # TODO Handletable archive too
|
||||
"""
|
||||
SELECT link
|
||||
FROM entries
|
||||
|
@ -862,6 +861,28 @@ def get_entry_url(db_file, ix):
|
|||
return url
|
||||
|
||||
|
||||
def get_feed_url(db_file, ix):
    """
    Get the URL of the feed that a given entry belongs to.

    Parameters
    ----------
    db_file : str
        Path to database file.
    ix : str
        Index of entry (column "id" of table "entries").

    Returns
    -------
    url : str or None
        URL of the feed, or None if the entry or its feed was not found.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        # A single join replaces the two consecutive lookups
        # (entries.feed_id, then feeds.url).
        sql = (  # TODO Handle table archive too
            """
            SELECT feeds.url
            FROM feeds
            INNER JOIN entries ON entries.feed_id = feeds.id
            WHERE entries.id = :ix
            """
            )
        # Named placeholders must be bound with a mapping; binding them
        # with a sequence is deprecated as of Python 3.12.
        row = cur.execute(sql, {"ix": ix}).fetchone()
    return row[0] if row else None
|
||||
|
||||
|
||||
async def mark_as_read(db_file, ix):
|
||||
async with DBLOCK:
|
||||
with create_connection(db_file) as conn:
|
||||
|
|
|
@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
|
|||
# breakpoint()
|
||||
await mark_as_read(db_file, result[0])
|
||||
if not image_url:
|
||||
info = await action.get_content(url)
|
||||
content = info[1]
|
||||
status = info[0]
|
||||
if status == 200:
|
||||
image_url = action.extract_first_image(url, content)
|
||||
image_url = await action.extract_image_from_feed(
|
||||
db_file, ix, url)
|
||||
if not image_url:
|
||||
image_url = await action.extract_image_from_html(url)
|
||||
print("image_url")
|
||||
print(image_url)
|
||||
new = " ".join(news_digest)
|
||||
# breakpoint()
|
||||
if new:
|
||||
|
|
|
@ -445,6 +445,8 @@ async def message(self, message):
|
|||
ix_url = message_text.split(" ")[0]
|
||||
ext = " ".join(message_text.split(" ")[1:])
|
||||
ext = ext if ext else 'pdf'
|
||||
url = None
|
||||
status = None
|
||||
if ext in ("html", "md", "pdf"):
|
||||
status_type = "dnd"
|
||||
status_message = (
|
||||
|
@ -469,42 +471,25 @@ async def message(self, message):
|
|||
response = "No entry Id with {}".format(ix)
|
||||
except:
|
||||
url = ix_url
|
||||
if url:
|
||||
url = uri.remove_tracking_parameters(url)
|
||||
url = (uri.replace_hostname(url, "link")) or url
|
||||
info = await action.get_content(url)
|
||||
content = info[0]
|
||||
status = info[1]
|
||||
if content:
|
||||
try:
|
||||
match ext:
|
||||
case "html":
|
||||
action.generate_html(content, filename)
|
||||
case "md":
|
||||
action.generate_markdown(content, filename)
|
||||
case "pdf":
|
||||
action.generate_pdf(content, filename)
|
||||
url = await upload.start(
|
||||
self, jid, filename)
|
||||
await send_oob_message(
|
||||
self, jid, url)
|
||||
except:
|
||||
logging.warning(
|
||||
"Check that packages html2text, pdfkit "
|
||||
"and wkhtmltopdf are installed")
|
||||
status = await action.generate_document(url, ext, filename)
|
||||
if status:
|
||||
response = (
|
||||
"Failed to export to {}"
|
||||
).format(ext)
|
||||
"Failed to export {}. Reason: {}"
|
||||
).format(ext, status)
|
||||
else:
|
||||
url = await upload.start(self, jid, filename)
|
||||
await send_oob_message(self, jid, url)
|
||||
await task.start_tasks_xmpp(
|
||||
self, jid, ["status"])
|
||||
else:
|
||||
response = (
|
||||
"Failed to fetch resource. Reason: {}"
|
||||
).format(status)
|
||||
else:
|
||||
response = "Missing entry Id."
|
||||
else:
|
||||
response = "Unsupported filetype."
|
||||
if response:
|
||||
print(response)
|
||||
send_reply_message(self, message, response)
|
||||
# case _ if (message_lowercase.startswith("http")) and(
|
||||
# message_lowercase.endswith(".opml")):
|
||||
|
|
Loading…
Reference in a new issue