Detect image from xml enclosure in addition to html img
This commit is contained in:
parent
b675618b95
commit
ec82aeb3cc
6 changed files with 131 additions and 67 deletions
|
@ -353,7 +353,7 @@ def export_to_opml(jid, filename, results):
|
||||||
|
|
||||||
|
|
||||||
async def import_opml(db_file, url):
|
async def import_opml(db_file, url):
|
||||||
result = await fetch.download_feed(url)
|
result = await fetch.http(url)
|
||||||
document = result[0]
|
document = result[0]
|
||||||
if document:
|
if document:
|
||||||
root = ET.fromstring(document)
|
root = ET.fromstring(document)
|
||||||
|
@ -378,7 +378,7 @@ async def add_feed(db_file, url):
|
||||||
while True:
|
while True:
|
||||||
exist = await sqlite.get_feed_id_and_name(db_file, url)
|
exist = await sqlite.get_feed_id_and_name(db_file, url)
|
||||||
if not exist:
|
if not exist:
|
||||||
result = await fetch.download_feed(url)
|
result = await fetch.http(url)
|
||||||
document = result[0]
|
document = result[0]
|
||||||
status_code = result[1]
|
status_code = result[1]
|
||||||
if document:
|
if document:
|
||||||
|
@ -458,7 +458,7 @@ async def add_feed(db_file, url):
|
||||||
|
|
||||||
async def view_feed(url):
|
async def view_feed(url):
|
||||||
while True:
|
while True:
|
||||||
result = await fetch.download_feed(url)
|
result = await fetch.http(url)
|
||||||
document = result[0]
|
document = result[0]
|
||||||
status = result[1]
|
status = result[1]
|
||||||
if document:
|
if document:
|
||||||
|
@ -523,7 +523,7 @@ async def view_feed(url):
|
||||||
|
|
||||||
async def view_entry(url, num):
|
async def view_entry(url, num):
|
||||||
while True:
|
while True:
|
||||||
result = await fetch.download_feed(url)
|
result = await fetch.http(url)
|
||||||
document = result[0]
|
document = result[0]
|
||||||
status = result[1]
|
status = result[1]
|
||||||
if document:
|
if document:
|
||||||
|
@ -602,7 +602,7 @@ async def scan(db_file, url):
|
||||||
URL. The default is None.
|
URL. The default is None.
|
||||||
"""
|
"""
|
||||||
if isinstance(url, tuple): url = url[0]
|
if isinstance(url, tuple): url = url[0]
|
||||||
result = await fetch.download_feed(url)
|
result = await fetch.http(url)
|
||||||
try:
|
try:
|
||||||
document = result[0]
|
document = result[0]
|
||||||
status = result[1]
|
status = result[1]
|
||||||
|
@ -706,23 +706,43 @@ async def scan(db_file, url):
|
||||||
db_file, new_entries)
|
db_file, new_entries)
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_document(url, ext, filename):
|
||||||
async def get_content(url):
|
result = await fetch.http(url)
|
||||||
result = await fetch.download_feed(url)
|
|
||||||
data = result[0]
|
data = result[0]
|
||||||
code = result[1]
|
code = result[1]
|
||||||
|
status = None
|
||||||
if data:
|
if data:
|
||||||
try:
|
try:
|
||||||
document = Document(result[0])
|
document = Document(data)
|
||||||
content = document.summary()
|
content = document.summary()
|
||||||
info = [content, code]
|
|
||||||
except:
|
except:
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"Install package readability.")
|
"Check that package readability is installed.")
|
||||||
info = result
|
match ext:
|
||||||
|
case "html":
|
||||||
|
generate_html(content, filename)
|
||||||
|
case "md":
|
||||||
|
try:
|
||||||
|
generate_markdown(content, filename)
|
||||||
|
except:
|
||||||
|
logging.warning(
|
||||||
|
"Check that package html2text is installed.")
|
||||||
|
status = (
|
||||||
|
"Package html2text was not found.")
|
||||||
|
case "pdf":
|
||||||
|
try:
|
||||||
|
generate_pdf(content, filename)
|
||||||
|
except:
|
||||||
|
logging.warning(
|
||||||
|
"Check that packages pdfkit and wkhtmltopdf "
|
||||||
|
"are installed.")
|
||||||
|
status = (
|
||||||
|
"Package pdfkit or wkhtmltopdf was not found.")
|
||||||
else:
|
else:
|
||||||
info = [None, code]
|
status = code
|
||||||
return info
|
if status:
|
||||||
|
return status
|
||||||
|
|
||||||
# TODO Either adapt it to filename
|
# TODO Either adapt it to filename
|
||||||
# or change it to something else
|
# or change it to something else
|
||||||
#filename = document.title()
|
#filename = document.title()
|
||||||
|
@ -731,7 +751,40 @@ async def get_content(url):
|
||||||
# file.write(html_doc)
|
# file.write(html_doc)
|
||||||
|
|
||||||
|
|
||||||
def extract_first_image(url, content):
|
async def extract_image_from_feed(db_file, ix, url):
    """
    Look up an image enclosure of an entry by re-reading its feed.

    Parameters
    ----------
    db_file : str
        Database file, passed to sqlite.get_feed_url.
    ix : str
        Index of the entry, passed to sqlite.get_feed_url.
    url : str
        Link of the entry; compared against the link of each entry
        of the downloaded feed.

    Returns
    -------
    image_url : str
        The href of the first link of a matching entry with rel
        "enclosure" and a type starting with "image/", or None when
        the feed could not be read or no such link exists.
    """
    feed_url = sqlite.get_feed_url(db_file, ix)
    if not feed_url:
        return None
    result = await fetch.http(feed_url)
    document = result[0] if result else None
    if not document:
        return None
    feed = parse(document)
    for entry in feed.entries:
        # Entries and links are read with .get() because feeds are free
        # to omit link, rel or type.
        if entry.get("link") != url:
            continue
        for link in entry.get("links", []):
            link_type = link.get("type") or ""
            if (link.get("rel") == "enclosure" and
                link_type.startswith("image/")):
                return link.get("href")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_image_from_html(url):
|
||||||
|
result = await fetch.http(url)
|
||||||
|
data = result[0]
|
||||||
|
if data:
|
||||||
|
try:
|
||||||
|
document = Document(data)
|
||||||
|
content = document.summary()
|
||||||
|
except:
|
||||||
|
logging.warning(
|
||||||
|
"Check that package readability is installed.")
|
||||||
tree = html.fromstring(content)
|
tree = html.fromstring(content)
|
||||||
images = tree.xpath('//img/@src')
|
images = tree.xpath('//img/@src')
|
||||||
if len(images):
|
if len(images):
|
||||||
|
@ -775,7 +828,7 @@ async def organize_items(db_file, urls):
|
||||||
for url in urls:
|
for url in urls:
|
||||||
# print(os.path.basename(db_file), url[0])
|
# print(os.path.basename(db_file), url[0])
|
||||||
url = url[0]
|
url = url[0]
|
||||||
res = await fetch.download_feed(url)
|
res = await fetch.http(url)
|
||||||
# TypeError: 'NoneType' object is not subscriptable
|
# TypeError: 'NoneType' object is not subscriptable
|
||||||
if res is None:
|
if res is None:
|
||||||
# Skip to next feed
|
# Skip to next feed
|
||||||
|
|
|
@ -22,7 +22,7 @@ from feedparser import parse
|
||||||
import logging
|
import logging
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import slixfeed.config as config
|
import slixfeed.config as config
|
||||||
from slixfeed.fetch import download_feed
|
import slixfeed.fetch as fetch
|
||||||
from slixfeed.url import complete_url, join_url, trim_url
|
from slixfeed.url import complete_url, join_url, trim_url
|
||||||
from urllib.parse import urlsplit, urlunsplit
|
from urllib.parse import urlsplit, urlunsplit
|
||||||
|
|
||||||
|
@ -174,9 +174,13 @@ async def feed_mode_scan(url, tree):
|
||||||
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
|
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
|
||||||
# xpath_query = "//a[contains(@href,'{}')]".format(path)
|
# xpath_query = "//a[contains(@href,'{}')]".format(path)
|
||||||
num = 5
|
num = 5
|
||||||
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
|
xpath_query = (
|
||||||
|
"(//a[contains(@href,'{}')])[position()<={}]"
|
||||||
|
).format(path, num)
|
||||||
addresses = tree.xpath(xpath_query)
|
addresses = tree.xpath(xpath_query)
|
||||||
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
|
xpath_query = (
|
||||||
|
"(//a[contains(@href,'{}')])[position()>last()-{}]"
|
||||||
|
).format(path, num)
|
||||||
addresses += tree.xpath(xpath_query)
|
addresses += tree.xpath(xpath_query)
|
||||||
# NOTE Should number of addresses be limited or
|
# NOTE Should number of addresses be limited or
|
||||||
# perhaps be N from the start and N from the end
|
# perhaps be N from the start and N from the end
|
||||||
|
@ -226,7 +230,7 @@ async def feed_mode_auto_discovery(url, tree):
|
||||||
# # The following code will catch
|
# # The following code will catch
|
||||||
# # only valid resources (i.e. not 404);
|
# # only valid resources (i.e. not 404);
|
||||||
# # The following code requires more bandwidth.
|
# # The following code requires more bandwidth.
|
||||||
# res = await download_feed(feed)
|
# res = await fetch.http(feed)
|
||||||
# if res[0]:
|
# if res[0]:
|
||||||
# disco = parse(res[0])
|
# disco = parse(res[0])
|
||||||
# title = disco["feed"]["title"]
|
# title = disco["feed"]["title"]
|
||||||
|
@ -253,7 +257,7 @@ async def feed_mode_auto_discovery(url, tree):
|
||||||
async def process_feed_selection(url, urls):
|
async def process_feed_selection(url, urls):
|
||||||
feeds = {}
|
feeds = {}
|
||||||
for i in urls:
|
for i in urls:
|
||||||
res = await download_feed(i)
|
res = await fetch.http(i)
|
||||||
if res[1] == 200:
|
if res[1] == 200:
|
||||||
try:
|
try:
|
||||||
feeds[i] = [parse(res[0])]
|
feeds[i] = [parse(res[0])]
|
||||||
|
@ -266,7 +270,7 @@ async def process_feed_selection(url, urls):
|
||||||
feed_url_mark = 0
|
feed_url_mark = 0
|
||||||
for feed_url in feeds:
|
for feed_url in feeds:
|
||||||
# try:
|
# try:
|
||||||
# res = await download_feed(feed)
|
# res = await fetch.http(feed)
|
||||||
# except:
|
# except:
|
||||||
# continue
|
# continue
|
||||||
feed_name = None
|
feed_name = None
|
||||||
|
@ -334,7 +338,7 @@ async def process_feed_selection(url, urls):
|
||||||
|
|
||||||
# async def start(url):
|
# async def start(url):
|
||||||
# while True:
|
# while True:
|
||||||
# result = await fetch.download_feed(url)
|
# result = await fetch.http(url)
|
||||||
# document = result[0]
|
# document = result[0]
|
||||||
# status = result[1]
|
# status = result[1]
|
||||||
# if document:
|
# if document:
|
||||||
|
|
|
@ -45,7 +45,7 @@ import slixfeed.config as config
|
||||||
|
|
||||||
# async def ipfs():
|
# async def ipfs():
|
||||||
|
|
||||||
async def download_feed(url):
|
async def http(url):
|
||||||
"""
|
"""
|
||||||
Download content of given URL.
|
Download content of given URL.
|
||||||
|
|
||||||
|
|
|
@ -847,11 +847,10 @@ def get_feed_title(db_file, ix):
|
||||||
return title
|
return title
|
||||||
|
|
||||||
|
|
||||||
# TODO Handle table archive too
|
|
||||||
def get_entry_url(db_file, ix):
|
def get_entry_url(db_file, ix):
|
||||||
with create_connection(db_file) as conn:
|
with create_connection(db_file) as conn:
|
||||||
cur = conn.cursor()
|
cur = conn.cursor()
|
||||||
sql = (
|
sql = ( # TODO Handle table archive too
|
||||||
"""
|
"""
|
||||||
SELECT link
|
SELECT link
|
||||||
FROM entries
|
FROM entries
|
||||||
|
@ -862,6 +861,28 @@ def get_entry_url(db_file, ix):
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
def get_feed_url(db_file, ix):
    """
    Get the URL of the feed to which a given entry belongs.

    Parameters
    ----------
    db_file : str
        Database file, passed to create_connection.
    ix : str
        Index (column id) of a row in table entries.

    Returns
    -------
    url : str
        Value of column url of the feed referenced by the entry,
        or None when the entry or its feed does not exist.
    """
    with create_connection(db_file) as conn:
        cur = conn.cursor()
        # A single JOIN replaces the two consecutive lookups
        # (entries.feed_id, then feeds.url).
        sql = ( # TODO Handle table archive too
            """
            SELECT feeds.url
            FROM entries
            INNER JOIN feeds ON feeds.id = entries.feed_id
            WHERE entries.id = :ix
            """
            )
        # Named placeholders are bound with a mapping; binding them
        # with a sequence is deprecated by the sqlite3 module.
        row = cur.execute(sql, {"ix": ix}).fetchone()
        url = row[0] if row else None
        return url
|
||||||
|
|
||||||
|
|
||||||
async def mark_as_read(db_file, ix):
|
async def mark_as_read(db_file, ix):
|
||||||
async with DBLOCK:
|
async with DBLOCK:
|
||||||
with create_connection(db_file) as conn:
|
with create_connection(db_file) as conn:
|
||||||
|
|
|
@ -242,11 +242,12 @@ async def send_update(self, jid, num=None):
|
||||||
# breakpoint()
|
# breakpoint()
|
||||||
await mark_as_read(db_file, result[0])
|
await mark_as_read(db_file, result[0])
|
||||||
if not image_url:
|
if not image_url:
|
||||||
info = await action.get_content(url)
|
image_url = await action.extract_image_from_feed(
|
||||||
content = info[1]
|
db_file, ix, url)
|
||||||
status = info[0]
|
if not image_url:
|
||||||
if status == 200:
|
image_url = await action.extract_image_from_html(url)
|
||||||
image_url = action.extract_first_image(url, content)
|
print("image_url")
|
||||||
|
print(image_url)
|
||||||
new = " ".join(news_digest)
|
new = " ".join(news_digest)
|
||||||
# breakpoint()
|
# breakpoint()
|
||||||
if new:
|
if new:
|
||||||
|
|
|
@ -445,6 +445,8 @@ async def message(self, message):
|
||||||
ix_url = message_text.split(" ")[0]
|
ix_url = message_text.split(" ")[0]
|
||||||
ext = " ".join(message_text.split(" ")[1:])
|
ext = " ".join(message_text.split(" ")[1:])
|
||||||
ext = ext if ext else 'pdf'
|
ext = ext if ext else 'pdf'
|
||||||
|
url = None
|
||||||
|
status = None
|
||||||
if ext in ("html", "md", "pdf"):
|
if ext in ("html", "md", "pdf"):
|
||||||
status_type = "dnd"
|
status_type = "dnd"
|
||||||
status_message = (
|
status_message = (
|
||||||
|
@ -469,42 +471,25 @@ async def message(self, message):
|
||||||
response = "No entry Id with {}".format(ix)
|
response = "No entry Id with {}".format(ix)
|
||||||
except:
|
except:
|
||||||
url = ix_url
|
url = ix_url
|
||||||
|
if url:
|
||||||
url = uri.remove_tracking_parameters(url)
|
url = uri.remove_tracking_parameters(url)
|
||||||
url = (uri.replace_hostname(url, "link")) or url
|
url = (uri.replace_hostname(url, "link")) or url
|
||||||
info = await action.get_content(url)
|
status = await action.generate_document(url, ext, filename)
|
||||||
content = info[0]
|
if status:
|
||||||
status = info[1]
|
|
||||||
if content:
|
|
||||||
try:
|
|
||||||
match ext:
|
|
||||||
case "html":
|
|
||||||
action.generate_html(content, filename)
|
|
||||||
case "md":
|
|
||||||
action.generate_markdown(content, filename)
|
|
||||||
case "pdf":
|
|
||||||
action.generate_pdf(content, filename)
|
|
||||||
url = await upload.start(
|
|
||||||
self, jid, filename)
|
|
||||||
await send_oob_message(
|
|
||||||
self, jid, url)
|
|
||||||
except:
|
|
||||||
logging.warning(
|
|
||||||
"Check that packages html2text, pdfkit "
|
|
||||||
"and wkhtmltopdf are installed")
|
|
||||||
response = (
|
response = (
|
||||||
"Failed to export to {}"
|
"Failed to export {}. Reason: {}"
|
||||||
).format(ext)
|
).format(ext, status)
|
||||||
|
else:
|
||||||
|
url = await upload.start(self, jid, filename)
|
||||||
|
await send_oob_message(self, jid, url)
|
||||||
await task.start_tasks_xmpp(
|
await task.start_tasks_xmpp(
|
||||||
self, jid, ["status"])
|
self, jid, ["status"])
|
||||||
else:
|
|
||||||
response = (
|
|
||||||
"Failed to fetch resource. Reason: {}"
|
|
||||||
).format(status)
|
|
||||||
else:
|
else:
|
||||||
response = "Missing entry Id."
|
response = "Missing entry Id."
|
||||||
else:
|
else:
|
||||||
response = "Unsupported filetype."
|
response = "Unsupported filetype."
|
||||||
if response:
|
if response:
|
||||||
|
print(response)
|
||||||
send_reply_message(self, message, response)
|
send_reply_message(self, message, response)
|
||||||
# case _ if (message_lowercase.startswith("http")) and(
|
# case _ if (message_lowercase.startswith("http")) and(
|
||||||
# message_lowercase.endswith(".opml")):
|
# message_lowercase.endswith(".opml")):
|
||||||
|
|
Loading…
Reference in a new issue