Remove HTML support;

Improve handling of errors.
This commit is contained in:
Schimon Jehudah 2024-05-20 14:23:58 +00:00
parent 4ac8e0836d
commit 7f0c4f4274
10 changed files with 17 additions and 360 deletions

View file

@ -70,30 +70,6 @@ except:
logger = Logger(__name__) logger = Logger(__name__)
try:
import xml2epub
except ImportError:
logger.error('Package xml2epub was not found.\n'
'ePUB support is disabled.')
try:
import html2text
except ImportError:
logger.error('Package html2text was not found.\n'
'Markdown support is disabled.')
try:
import pdfkit
except ImportError:
logger.error('Package pdfkit was not found.\n'
'PDF support is disabled.')
try:
from readability import Document
except ImportError:
logger.error('Package readability was not found.\n'
'Arc90 Lab algorithm is disabled.')
def export_feeds(self, jid, jid_file, ext): def export_feeds(self, jid, jid_file, ext):
function_name = sys._getframe().f_code.co_name function_name = sys._getframe().f_code.co_name
@ -129,6 +105,7 @@ async def xmpp_muc_autojoin(self, bookmarks):
alias = bookmark["nick"] alias = bookmark["nick"]
muc_jid = bookmark["jid"] muc_jid = bookmark["jid"]
result = await XmppGroupchat.join(self, muc_jid, alias) result = await XmppGroupchat.join(self, muc_jid, alias)
print(result)
if result == 'ban': if result == 'ban':
await XmppBookmark.remove(self, muc_jid) await XmppBookmark.remove(self, muc_jid)
logger.warning('{} is banned from {}'.format(self.alias, muc_jid)) logger.warning('{} is banned from {}'.format(self.alias, muc_jid))
@ -329,7 +306,9 @@ async def xmpp_pubsub_send_unread_items(self, jid_bare):
feed_entry = pack_entry_into_dict(db_file, entry) feed_entry = pack_entry_into_dict(db_file, entry)
node_entry = create_rfc4287_entry(feed_entry) node_entry = create_rfc4287_entry(feed_entry)
entry_url = feed_entry['link'] entry_url = feed_entry['link']
print(entry_url)
item_id = hash_url_to_md5(entry_url) item_id = hash_url_to_md5(entry_url)
print(item_id)
iq_create_entry = XmppPubsub.create_entry( iq_create_entry = XmppPubsub.create_entry(
self, jid_bare, node_id, item_id, node_entry) self, jid_bare, node_id, item_id, node_entry)
await XmppIQ.send(self, iq_create_entry) await XmppIQ.send(self, iq_create_entry)
@ -637,7 +616,7 @@ def manual(filename, section=None, command=None):
try: try:
cmd_list = cmds[section][command] cmd_list = cmds[section][command]
except KeyError as e: except KeyError as e:
logger.error(str(e)) logger.error(e)
cmd_list = None cmd_list = None
elif section: elif section:
try: try:
@ -1837,99 +1816,6 @@ def get_properties_of_entries(jid_bare, db_file, feed_url, feed_id, feed):
return new_entries return new_entries
def get_document_title(data):
    """
    Return the title of an HTML document.

    The short title computed by readability's ``Document`` is preferred.
    When that fails for any reason (including the readability package not
    being importable, in which case ``Document`` is an undefined name), the
    text of the ``<title>`` element parsed by BeautifulSoup is used instead.

    :param data: HTML source of the document.
    :return: The title, or an empty string when the document has no
             usable ``<title>``, so callers can safely call ``str``
             methods on the result.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        document = Document(data)
        title = document.short_title()
    except Exception:
        # Deliberate best-effort fallback; ``Exception`` (not a bare
        # ``except``) so KeyboardInterrupt/SystemExit still propagate.
        document = BeautifulSoup(data, 'html.parser')
        title_tag = document.title
        # A page may have no <title>, or a <title> without a single string.
        title = title_tag.string if title_tag is not None else None
    return title or ''
def get_document_content(data):
    """
    Return the main content of an HTML document.

    readability's ``Document.summary()`` is used when possible; if it fails
    for any reason (including readability not being importable) the input
    is returned unchanged.

    :param data: HTML source of the document.
    :return: The summarised HTML, or ``data`` itself as a fallback.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        content = Document(data).summary()
    except Exception:
        # Best-effort: fall back to the raw document. The previous fallback
        # also built a BeautifulSoup tree that was never used and could
        # itself raise; it has been dropped.
        content = data
    return content
def get_document_content_as_text(data):
    """
    Return the main content of an HTML document with HTML tags removed.

    readability's ``Document.summary()`` is used when possible; if it fails
    for any reason (including readability not being importable) the whole
    input is used instead. The result is passed through
    ``remove_html_tags``.

    :param data: HTML source of the document.
    :return: The value returned by ``remove_html_tags`` for the content.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        content = Document(data).summary()
    except Exception:
        # Best-effort: fall back to the raw document. The unused
        # BeautifulSoup parse of the previous fallback has been dropped.
        content = data
    return remove_html_tags(content)
def generate_document(data, url, ext, filename, readability=False):
function_name = sys._getframe().f_code.co_name
logger.debug('{}: url: {} ext: {} filename: {}'
.format(function_name, url, ext, filename))
error = None
if readability:
try:
document = Document(data)
content = document.summary()
except:
content = data
logger.warning('Check that package readability is installed.')
else:
content = data
match ext:
case "epub":
filename = filename.split('.')
filename.pop()
filename = '.'.join(filename)
error = generate_epub(content, filename)
if error:
logger.error(error)
# logger.error(
# "Check that packages xml2epub is installed, "
# "or try again.")
case "html":
generate_html(content, filename)
case "md":
try:
generate_markdown(content, filename)
except:
logger.warning('Check that package html2text '
'is installed, or try again.')
error = 'Package html2text was not found.'
case "pdf":
error = generate_pdf(content, filename)
if error:
logger.error(error)
# logger.warning(
# "Check that packages pdfkit and wkhtmltopdf "
# "are installed, or try again.")
# error = (
# "Package pdfkit or wkhtmltopdf was not found.")
case "txt":
generate_txt(content, filename)
if error:
return error
# TODO Either adapt it to filename
# or change it to something else
#filename = document.title()
# with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
async def extract_image_from_feed(db_file, feed_id, url): async def extract_image_from_feed(db_file, feed_id, url):
function_name = sys._getframe().f_code.co_name function_name = sys._getframe().f_code.co_name
logger.debug('{}: db_file: {} feed_id: {} url: {}' logger.debug('{}: db_file: {} feed_id: {} url: {}'
@ -1959,13 +1845,7 @@ async def extract_image_from_html(url):
result = await fetch.http(url) result = await fetch.http(url)
if not result['error']: if not result['error']:
data = result['content'] data = result['content']
try: tree = html.fromstring(data)
document = Document(data)
content = document.summary()
except:
content = data
logger.warning('Check that package readability is installed.')
tree = html.fromstring(content)
# TODO Exclude banners, class="share" links etc. # TODO Exclude banners, class="share" links etc.
images = tree.xpath( images = tree.xpath(
'//img[not(' '//img[not('
@ -1985,68 +1865,6 @@ async def extract_image_from_html(url):
return image_url return image_url
def generate_epub(text, filename):
    """
    Create an ePUB file with a single chapter built from ``text``.

    :param text: HTML content of the chapter.
    :param filename: Pathname of the book; the part after the last "/" is
                     used as the book title and the part before it as the
                     output directory.
    :return: ``None`` on success; an error message when xml2epub is not
             available, or the ``ValueError`` raised while adding the
             chapter or creating the file.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} pathname: {}'.format(function_name, text, filename))
    if 'xml2epub' not in globals():
        # xml2epub is an optional import; report its absence through the
        # function's error return instead of raising NameError.
        return 'Package xml2epub was not found.'
    # Same split as filename.split("/") + pop() + "/".join(...).
    directory, _, file_title = filename.rpartition('/')
    ## create an empty eBook
    book = xml2epub.Epub(file_title)
    ## create chapter from the given markup
    chapter0 = xml2epub.create_chapter_from_string(text, strict=False)
    try:
        ## add chapter to the eBook
        book.add_chapter(chapter0)
        ## generate epub file
        book.create_epub(directory, absolute_location=filename)
    except ValueError as error:
        return error
    return None
def generate_html(text, filename):
    """
    Write ``text`` to ``filename`` as is.

    :param text: HTML content to store.
    :param filename: Pathname of the file to create or overwrite.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename))
    # Explicit UTF-8: the locale default encoding may be unable to encode
    # characters that occur in fetched web pages.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
def generate_markdown(text, filename):
    """
    Convert HTML ``text`` to Markdown with html2text and write it to
    ``filename``.

    :param text: HTML content to convert.
    :param filename: Pathname of the file to create or overwrite.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename))
    # Convert HTML to Markdown
    markdown = html2text.HTML2Text().handle(text)
    # Explicit UTF-8: the locale default encoding may be unable to encode
    # characters that occur in fetched web pages.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
def generate_pdf(text, filename):
    """
    Render HTML ``text`` to a PDF file with pdfkit.

    :param text: HTML content to render.
    :param filename: Pathname of the PDF file to create.
    :return: ``None`` on success; an error message when pdfkit is not
             available, or the ``OSError`` raised by pdfkit.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename))
    if 'pdfkit' not in globals():
        # pdfkit is an optional import; report its absence through the
        # function's error return instead of raising NameError.
        return 'Package pdfkit was not found.'
    try:
        pdfkit.from_string(text, filename)
    except OSError as error:
        # IOError is an alias of OSError in Python 3, so one clause
        # covers both of the former handlers.
        return error
    return None
def generate_txt(text, filename):
    """
    Pass ``text`` through ``remove_html_tags`` and write the result to
    ``filename``.

    :param text: HTML content to convert.
    :param filename: Pathname of the file to create or overwrite.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename))
    text = remove_html_tags(text)
    # Explicit UTF-8: the locale default encoding may be unable to encode
    # characters that occur in fetched web pages.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
# This works too # This works too
# ''.join(xml.etree.ElementTree.fromstring(text).itertext()) # ''.join(xml.etree.ElementTree.fromstring(text).itertext())
def remove_html_tags(data): def remove_html_tags(data):

View file

@ -61,18 +61,6 @@ old
Send all items of newly added feeds. Send all items of newly added feeds.
""" """
[document]
content = """
content <id>/<url> <type>
Send a readability (arc90) version of an article as file. Specify <id> or <url> and <type>.
Supported types are ePUB, HTML, MD and PDF (default).
"""
page = """
page <id>/<url> <type>
Send an article as file. Specify <id> or <url> and <type>.
Supported types are ePUB, HTML, MD and PDF (default).
"""
[filters] [filters]
allow = """ allow = """
allow [+|-] <keyword> allow [+|-] <keyword>

View file

@ -131,7 +131,7 @@ async def probe_page(url, document=None):
tree = html.fromstring(document) tree = html.fromstring(document)
result = None result = None
except: except:
logging.debug("Failed to parse URL as feed for {}.".format(url)) logging.warning("Failed to parse URL as feed for {}.".format(url))
result = {'link' : None, result = {'link' : None,
'index' : None, 'index' : None,
'name' : None, 'name' : None,

View file

@ -107,7 +107,8 @@ def http_response(url):
# response = requests.head(url, headers=headers, allow_redirects=True) # response = requests.head(url, headers=headers, allow_redirects=True)
response = requests.get(url, headers=headers, allow_redirects=True) response = requests.get(url, headers=headers, allow_redirects=True)
except Exception as e: except Exception as e:
logging.error(str(e)) logging.warning('Error in HTTP response')
logging.error(e)
response = None response = None
return response return response

View file

@ -66,7 +66,8 @@ def create_connection(db_file):
conn.execute("PRAGMA foreign_keys = ON") conn.execute("PRAGMA foreign_keys = ON")
# return conn # return conn
except Error as e: except Error as e:
print(e) logger.warning('Error creating a connection to database {}.'.format(db_file))
logger.error(e)
time_end = time.time() time_end = time.time()
difference = time_end - time_begin difference = time_end - time_begin
if difference > 1: logger.warning('{} (time: {})'.format(function_name, if difference > 1: logger.warning('{} (time: {})'.format(function_name,

View file

@ -1,2 +1,2 @@
__version__ = '0.1.66' __version__ = '0.1.67'
__version_info__ = (0, 1, 66) __version_info__ = (0, 1, 67)

View file

@ -595,105 +595,6 @@ class Chat:
message_lowercase.startswith('gopher:')): message_lowercase.startswith('gopher:')):
response = 'Gemini and Gopher are not supported yet.' response = 'Gemini and Gopher are not supported yet.'
XmppMessage.send_reply(self, message, response) XmppMessage.send_reply(self, message, response)
# TODO xHTML, HTMLZ, MHTML
case _ if (message_lowercase.startswith('content') or
message_lowercase.startswith('page')):
if message_lowercase.startswith('content'):
message_text = message_text[8:]
readability = True
else:
message_text = message_text[5:]
readability = False
ix_url = message_text.split(' ')[0]
ext = ' '.join(message_text.split(' ')[1:])
ext = ext if ext else 'pdf'
url = None
error = None
response = None
if ext in ('epub', 'html', 'markdown', 'md', 'pdf', 'text',
'txt'):
match ext:
case 'markdown':
ext = 'md'
case 'text':
ext = 'txt'
status_type = 'dnd'
status_message = ('📃️ Procesing request to produce {} '
'document...'.format(ext.upper()))
# pending_tasks_num = len(self.pending_tasks[jid_bare])
pending_tasks_num = randrange(10000, 99999)
self.pending_tasks[jid_bare][pending_tasks_num] = status_message
# self.pending_tasks_counter += 1
# self.pending_tasks[jid_bare][self.pending_tasks_counter] = status_message
XmppPresence.send(self, jid_bare, status_message,
status_type=status_type)
db_file = config.get_pathname_to_database(jid_file)
cache_dir = config.get_default_cache_directory()
if not os.path.isdir(cache_dir):
os.mkdir(cache_dir)
if not os.path.isdir(cache_dir + '/readability'):
os.mkdir(cache_dir + '/readability')
if ix_url:
try:
ix = int(ix_url)
try:
url = sqlite.get_entry_url(db_file, ix)
url = url[0]
except:
response = 'No entry with index {}'.format(ix)
except:
url = ix_url
if url:
url = uri.remove_tracking_parameters(url)
url = (await uri.replace_hostname(url, 'link')) or url
result = await fetch.http(url)
if not result['error']:
data = result['content']
code = result['status_code']
title = action.get_document_title(data)
title = title.strip().lower()
for i in (' ', '-'):
title = title.replace(i, '_')
for i in ('?', '"', '\'', '!'):
title = title.replace(i, '')
filename = os.path.join(
cache_dir, 'readability',
title + '_' + dt.timestamp() + '.' + ext)
error = action.generate_document(data, url,
ext, filename,
readability)
if error:
response = ('> {}\n'
'Failed to export {}. '
'Reason: {}'.format(
url, ext.upper(), error))
else:
url = await XmppUpload.start(
self, jid_bare, filename)
chat_type = await get_chat_type(self,
jid_bare)
XmppMessage.send_oob(self, jid_bare, url,
chat_type)
else:
response = ('> {}\n'
'Failed to fetch URL. Reason: {}'
.format(url, code))
else:
response = ('No action has been taken.'
'\n'
'Missing argument. '
'Enter URL or entry index number.')
else:
response = ('Unsupported filetype.\n'
'Try: epub, html, md (markdown), '
'pdf, or txt (text)')
del self.pending_tasks[jid_bare][pending_tasks_num]
# del self.pending_tasks[jid_bare][self.pending_tasks_counter]
key_list = ['status']
await task.start_tasks_xmpp_chat(self, jid_bare, key_list)
if response:
logging.warning('Error for URL {}: {}'.format(url, error))
XmppMessage.send_reply(self, message, response)
case _ if (message_lowercase.startswith('http')) and( case _ if (message_lowercase.startswith('http')) and(
message_lowercase.endswith('.opml')): message_lowercase.endswith('.opml')):
url = message_text url = message_text

View file

@ -1650,6 +1650,7 @@ class Slixfeed(slixmpp.ClientXMPP):
return session return session
# FIXME
async def _handle_recent_select(self, payload, session): async def _handle_recent_select(self, payload, session):
jid_full = str(session['from']) jid_full = str(session['from'])
function_name = sys._getframe().f_code.co_name function_name = sys._getframe().f_code.co_name
@ -1724,60 +1725,6 @@ class Slixfeed(slixmpp.ClientXMPP):
return session return session
async def _handle_recent_action(self, payload, session):
    """
    Ad-hoc command step: export the selected article and offer it for
    download.

    Reads the file type and URL from ``payload['values']``, fetches the
    URL, generates a document in the cache directory, uploads it and sends
    the resulting link both as an out-of-band message and as a result
    form. Failures are reported through ``session['notes']``.

    :param payload: Submitted form; ``values`` must hold ``filetype`` and
                    ``url`` (a sequence whose first item is used).
    :param session: Ad-hoc command session; updated and returned.
    :return: The updated ``session``.
    """
    jid_full = str(session['from'])
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: jid_full: {}'
                 .format(function_name, jid_full))
    ext = payload['values']['filetype']
    url = payload['values']['url'][0]
    jid_bare = session['from'].bare
    cache_dir = config.get_default_cache_directory()
    # Creates both the cache directory and its "readability" subdirectory,
    # and tolerates them already existing (no check-then-create race).
    os.makedirs(os.path.join(cache_dir, 'readability'), exist_ok=True)
    url = uri.remove_tracking_parameters(url)
    url = (await uri.replace_hostname(url, 'link')) or url
    result = await fetch.http(url)
    if result['error']:
        # Previously a failed fetch ended the command without any feedback.
        session['notes'] = [['error', 'Failed to fetch {}'.format(url)]]
    else:
        data = result['content']
        title = action.get_document_title(data)
        title = title.strip().lower()
        # "/" is replaced too, so a title cannot introduce path components.
        for i in (' ', '-', '/'):
            title = title.replace(i, '_')
        for i in ('?', '"', '\'', '!'):
            title = title.replace(i, '')
        filename = os.path.join(
            cache_dir, 'readability',
            title + '_' + dt.timestamp() + '.' + ext)
        error = action.generate_document(data, url, ext, filename,
                                         readability=True)
        if error:
            text_error = ('Failed to export {} for {}'
                          '\n\n'
                          'Reason: {}'.format(ext.upper(), url, error))
            session['notes'] = [['error', text_error]]
        else:
            url = await XmppUpload.start(self, jid_bare, filename)
            chat_type = await get_chat_type(self, jid_bare)
            XmppMessage.send_oob(self, jid_bare, url, chat_type)
            form = self['xep_0004'].make_form('result', 'Download')
            form['instructions'] = ('Download {} document.'
                                    .format(ext.upper()))
            field_url = form.add_field(var='url',
                                       label='Link',
                                       ftype='text-single',
                                       value=url)
            field_url['validate']['datatype'] = 'xs:anyURI'
            session['payload'] = form
    session['allow_complete'] = True
    session['next'] = None
    session['prev'] = None
    return session
async def _handle_subscription_new(self, payload, session): async def _handle_subscription_new(self, payload, session):
jid_full = str(session['from']) jid_full = str(session['from'])
function_name = sys._getframe().f_code.co_name function_name = sys._getframe().f_code.co_name

View file

@ -44,7 +44,8 @@ class XmppPubsub:
async def get_node_configuration(self, jid, node_id): async def get_node_configuration(self, jid, node_id):
node = await self.plugin['xep_0060'].get_node_config(jid, node_id) node = await self.plugin['xep_0060'].get_node_config(jid, node_id)
print(node) if not node:
print('NODE CONFIG', node_id, str(node))
return node return node

View file

@ -46,8 +46,8 @@ async def get_chat_type(self, jid):
logging.info('Jabber ID: {}\n' logging.info('Jabber ID: {}\n'
'Chat Type: {}'.format(jid, result)) 'Chat Type: {}'.format(jid, result))
except (IqError, IqTimeout) as e: except (IqError, IqTimeout) as e:
logging.error(str(e)) logging.warning('Chat type could not be determined for {}'.format(jid))
logging.error(jid) logging.error(e)
result = 'error' result = 'error'
# except BaseException as e: # except BaseException as e:
# logging.error('BaseException', str(e)) # logging.error('BaseException', str(e))