From 7f0c4f4274e1bb4a2d3a0789b5ff82c1c9aa34c0 Mon Sep 17 00:00:00 2001 From: Schimon Jehudah Date: Mon, 20 May 2024 14:23:58 +0000 Subject: [PATCH] Remove HTML support; Improve handling of errors. --- slixfeed/action.py | 192 +--------------------------------- slixfeed/assets/commands.toml | 12 --- slixfeed/crawl.py | 2 +- slixfeed/fetch.py | 3 +- slixfeed/sqlite.py | 3 +- slixfeed/version.py | 4 +- slixfeed/xmpp/chat.py | 99 ------------------ slixfeed/xmpp/client.py | 55 +--------- slixfeed/xmpp/publish.py | 3 +- slixfeed/xmpp/utility.py | 4 +- 10 files changed, 17 insertions(+), 360 deletions(-) diff --git a/slixfeed/action.py b/slixfeed/action.py index 64c9b61..e0722ce 100644 --- a/slixfeed/action.py +++ b/slixfeed/action.py @@ -70,30 +70,6 @@ except: logger = Logger(__name__) -try: - import xml2epub -except ImportError: - logger.error('Package xml2epub was not found.\n' - 'ePUB support is disabled.') - -try: - import html2text -except ImportError: - logger.error('Package html2text was not found.\n' - 'Markdown support is disabled.') - -try: - import pdfkit -except ImportError: - logger.error('Package pdfkit was not found.\n' - 'PDF support is disabled.') - -try: - from readability import Document -except ImportError: - logger.error('Package readability was not found.\n' - 'Arc90 Lab algorithm is disabled.') - def export_feeds(self, jid, jid_file, ext): function_name = sys._getframe().f_code.co_name @@ -129,6 +105,7 @@ async def xmpp_muc_autojoin(self, bookmarks): alias = bookmark["nick"] muc_jid = bookmark["jid"] result = await XmppGroupchat.join(self, muc_jid, alias) + print(result) if result == 'ban': await XmppBookmark.remove(self, muc_jid) logger.warning('{} is banned from {}'.format(self.alias, muc_jid)) @@ -329,7 +306,9 @@ async def xmpp_pubsub_send_unread_items(self, jid_bare): feed_entry = pack_entry_into_dict(db_file, entry) node_entry = create_rfc4287_entry(feed_entry) entry_url = feed_entry['link'] + print(entry_url) item_id = hash_url_to_md5(entry_url) + print(item_id) iq_create_entry = XmppPubsub.create_entry( self, jid_bare, node_id, item_id, node_entry) await XmppIQ.send(self, iq_create_entry) @@ -637,7 +616,7 @@ def manual(filename, section=None, command=None): try: cmd_list = cmds[section][command] except KeyError as e: - logger.error(str(e)) + logger.error(e) cmd_list = None elif section: try: @@ -1837,99 +1816,6 @@ def get_properties_of_entries(jid_bare, db_file, feed_url, feed_id, feed): return new_entries -def get_document_title(data): - function_name = sys._getframe().f_code.co_name - logger.debug('{}'.format(function_name)) - try: - document = Document(data) - title = document.short_title() - except: - document = BeautifulSoup(data, 'html.parser') - title = document.title.string - return title - - -def get_document_content(data): - function_name = sys._getframe().f_code.co_name - logger.debug('{}'.format(function_name)) - try: - document = Document(data) - content = document.summary() - except: - document = BeautifulSoup(data, 'html.parser') - content = data - return content - - -def get_document_content_as_text(data): - function_name = sys._getframe().f_code.co_name - logger.debug('{}'.format(function_name)) - try: - document = Document(data) - content = document.summary() - except: - document = BeautifulSoup(data, 'html.parser') - content = data - text = remove_html_tags(content) - return text - - -def generate_document(data, url, ext, filename, readability=False): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: url: {} ext: {} filename: {}' - .format(function_name, url, ext, filename)) - error = None - if readability: - try: - document = Document(data) - content = document.summary() - except: - content = data - logger.warning('Check that package readability is installed.') - else: - content = data - match ext: - case "epub": - filename = filename.split('.') - filename.pop() - filename = '.'.join(filename) - error = generate_epub(content, filename) - if error: - logger.error(error) - # logger.error( - # "Check that packages xml2epub is installed, " - # "or try again.") - case "html": - generate_html(content, filename) - case "md": - try: - generate_markdown(content, filename) - except: - logger.warning('Check that package html2text ' - 'is installed, or try again.') - error = 'Package html2text was not found.' - case "pdf": - error = generate_pdf(content, filename) - if error: - logger.error(error) - # logger.warning( - # "Check that packages pdfkit and wkhtmltopdf " - # "are installed, or try again.") - # error = ( - # "Package pdfkit or wkhtmltopdf was not found.") - case "txt": - generate_txt(content, filename) - if error: - return error - - # TODO Either adapt it to filename - # or change it to something else - #filename = document.title() - # with open(filename, 'w') as file: - # html_doc = document.summary() - # file.write(html_doc) - - async def extract_image_from_feed(db_file, feed_id, url): function_name = sys._getframe().f_code.co_name logger.debug('{}: db_file: {} feed_id: {} url: {}' @@ -1959,13 +1845,7 @@ async def extract_image_from_html(url): result = await fetch.http(url) if not result['error']: data = result['content'] - try: - document = Document(data) - content = document.summary() - except: - content = data - logger.warning('Check that package readability is installed.') - tree = html.fromstring(content) + tree = html.fromstring(data) # TODO Exclude banners, class="share" links etc. images = tree.xpath( '//img[not(' @@ -1985,68 +1865,6 @@ async def extract_image_from_html(url): return image_url -def generate_epub(text, filename): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: text: {} pathname: {}'.format(function_name, text, filename)) - ## create an empty eBook - filename_list = filename.split("/") - file_title = filename_list.pop() - directory = "/".join(filename_list) - book = xml2epub.Epub(file_title) - ## create chapters by url - # chapter0 = xml2epub.create_chapter_from_string(text, title=filename, strict=False) - chapter0 = xml2epub.create_chapter_from_string(text, strict=False) - #### create chapter objects - # chapter1 = xml2epub.create_chapter_from_url("https://dev.to/devteam/top-7-featured-dev-posts-from-the-past-week-h6h") - # chapter2 = xml2epub.create_chapter_from_url("https://dev.to/ks1912/getting-started-with-docker-34g6") - ## add chapters to your eBook - try: - book.add_chapter(chapter0) - # book.add_chapter(chapter1) - # book.add_chapter(chapter2) - ## generate epub file - book.create_epub(directory, absolute_location=filename) - except ValueError as error: - return error - - - -def generate_html(text, filename): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename)) - with open(filename, 'w') as file: - file.write(text) - - -def generate_markdown(text, filename): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename)) - h2m = html2text.HTML2Text() - # Convert HTML to Markdown - markdown = h2m.handle(text) - with open(filename, 'w') as file: - file.write(markdown) - - -def generate_pdf(text, filename): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename)) - try: - pdfkit.from_string(text, filename) - except IOError as error: - return error - except OSError as error: - return error - - -def generate_txt(text, filename): - function_name = sys._getframe().f_code.co_name - logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename)) - text = remove_html_tags(text) - with open(filename, 'w') as file: - file.write(text) - - # This works too # ''.join(xml.etree.ElementTree.fromstring(text).itertext()) def remove_html_tags(data): diff --git a/slixfeed/assets/commands.toml b/slixfeed/assets/commands.toml index 0253783..d1e5115 100644 --- a/slixfeed/assets/commands.toml +++ b/slixfeed/assets/commands.toml @@ -61,18 +61,6 @@ old Send all items of newly added feeds. """ -[document] -content = """ -content / -Send a readability (arc90) version of an article as file. Specify or and . -Supported types are ePUB, HTML, MD and PDF (default). -""" -page = """ -page / -Send an article as file. Specify or and . -Supported types are ePUB, HTML, MD and PDF (default). -""" - [filters] allow = """ allow [+|-] diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py index 8eae143..09c0275 100644 --- a/slixfeed/crawl.py +++ b/slixfeed/crawl.py @@ -131,7 +131,7 @@ async def probe_page(url, document=None): tree = html.fromstring(document) result = None except: - logging.debug("Failed to parse URL as feed for {}.".format(url)) + logging.warning("Failed to parse URL as feed for {}.".format(url)) result = {'link' : None, 'index' : None, 'name' : None, diff --git a/slixfeed/fetch.py b/slixfeed/fetch.py index a399f09..01cafd4 100644 --- a/slixfeed/fetch.py +++ b/slixfeed/fetch.py @@ -107,7 +107,8 @@ def http_response(url): # response = requests.head(url, headers=headers, allow_redirects=True) response = requests.get(url, headers=headers, allow_redirects=True) except Exception as e: - logging.error(str(e)) + logging.warning('Error in HTTP response') + logging.error(e) response = None return response diff --git a/slixfeed/sqlite.py b/slixfeed/sqlite.py index 709faa7..a95c55c 100644 --- a/slixfeed/sqlite.py +++ b/slixfeed/sqlite.py @@ -66,7 +66,8 @@ def create_connection(db_file): conn.execute("PRAGMA foreign_keys = ON") # return conn except Error as e: - print(e) + logger.warning('Error creating a connection to database {}.'.format(db_file)) + logger.error(e) time_end = time.time() difference = time_end - time_begin if difference > 1: logger.warning('{} (time: {})'.format(function_name, diff --git a/slixfeed/version.py b/slixfeed/version.py index 57fe273..e480891 100644 --- a/slixfeed/version.py +++ b/slixfeed/version.py @@ -1,2 +1,2 @@ -__version__ = '0.1.66' -__version_info__ = (0, 1, 66) +__version__ = '0.1.67' +__version_info__ = (0, 1, 67) diff --git a/slixfeed/xmpp/chat.py b/slixfeed/xmpp/chat.py index 2dae019..b38dac0 100644 --- a/slixfeed/xmpp/chat.py +++ b/slixfeed/xmpp/chat.py @@ -595,105 +595,6 @@ class Chat: message_lowercase.startswith('gopher:')): response = 'Gemini and Gopher are not supported yet.' XmppMessage.send_reply(self, message, response) - # TODO xHTML, HTMLZ, MHTML - case _ if (message_lowercase.startswith('content') or - message_lowercase.startswith('page')): - if message_lowercase.startswith('content'): - message_text = message_text[8:] - readability = True - else: - message_text = message_text[5:] - readability = False - ix_url = message_text.split(' ')[0] - ext = ' '.join(message_text.split(' ')[1:]) - ext = ext if ext else 'pdf' - url = None - error = None - response = None - if ext in ('epub', 'html', 'markdown', 'md', 'pdf', 'text', - 'txt'): - match ext: - case 'markdown': - ext = 'md' - case 'text': - ext = 'txt' - status_type = 'dnd' - status_message = ('📃️ Procesing request to produce {} ' - 'document...'.format(ext.upper())) - # pending_tasks_num = len(self.pending_tasks[jid_bare]) - pending_tasks_num = randrange(10000, 99999) - self.pending_tasks[jid_bare][pending_tasks_num] = status_message - # self.pending_tasks_counter += 1 - # self.pending_tasks[jid_bare][self.pending_tasks_counter] = status_message - XmppPresence.send(self, jid_bare, status_message, - status_type=status_type) - db_file = config.get_pathname_to_database(jid_file) - cache_dir = config.get_default_cache_directory() - if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) - if not os.path.isdir(cache_dir + '/readability'): - os.mkdir(cache_dir + '/readability') - if ix_url: - try: - ix = int(ix_url) - try: - url = sqlite.get_entry_url(db_file, ix) - url = url[0] - except: - response = 'No entry with index {}'.format(ix) - except: - url = ix_url - if url: - url = uri.remove_tracking_parameters(url) - url = (await uri.replace_hostname(url, 'link')) or url - result = await fetch.http(url) - if not result['error']: - data = result['content'] - code = result['status_code'] - title = action.get_document_title(data) - title = title.strip().lower() - for i in (' ', '-'): - title = title.replace(i, '_') - for i in ('?', '"', '\'', '!'): - title = title.replace(i, '') - filename = os.path.join( - cache_dir, 'readability', - title + '_' + dt.timestamp() + '.' + ext) - error = action.generate_document(data, url, - ext, filename, - readability) - if error: - response = ('> {}\n' - 'Failed to export {}. ' - 'Reason: {}'.format( - url, ext.upper(), error)) - else: - url = await XmppUpload.start( - self, jid_bare, filename) - chat_type = await get_chat_type(self, - jid_bare) - XmppMessage.send_oob(self, jid_bare, url, - chat_type) - else: - response = ('> {}\n' - 'Failed to fetch URL. Reason: {}' - .format(url, code)) - else: - response = ('No action has been taken.' - '\n' - 'Missing argument. ' - 'Enter URL or entry index number.') - else: - response = ('Unsupported filetype.\n' - 'Try: epub, html, md (markdown), ' - 'pdf, or txt (text)') - del self.pending_tasks[jid_bare][pending_tasks_num] - # del self.pending_tasks[jid_bare][self.pending_tasks_counter] - key_list = ['status'] - await task.start_tasks_xmpp_chat(self, jid_bare, key_list) - if response: - logging.warning('Error for URL {}: {}'.format(url, error)) - XmppMessage.send_reply(self, message, response) case _ if (message_lowercase.startswith('http')) and( message_lowercase.endswith('.opml')): url = message_text diff --git a/slixfeed/xmpp/client.py b/slixfeed/xmpp/client.py index f066b68..977b8e7 100644 --- a/slixfeed/xmpp/client.py +++ b/slixfeed/xmpp/client.py @@ -1650,6 +1650,7 @@ class Slixfeed(slixmpp.ClientXMPP): return session + # FIXME async def _handle_recent_select(self, payload, session): jid_full = str(session['from']) function_name = sys._getframe().f_code.co_name @@ -1724,60 +1725,6 @@ class Slixfeed(slixmpp.ClientXMPP): return session - async def _handle_recent_action(self, payload, session): - jid_full = str(session['from']) - function_name = sys._getframe().f_code.co_name - logger.debug('{}: jid_full: {}' - .format(function_name, jid_full)) - ext = payload['values']['filetype'] - url = payload['values']['url'][0] - jid_bare = session['from'].bare - cache_dir = config.get_default_cache_directory() - if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) - if not os.path.isdir(cache_dir + '/readability'): - os.mkdir(cache_dir + '/readability') - url = uri.remove_tracking_parameters(url) - url = (await uri.replace_hostname(url, 'link')) or url - result = await fetch.http(url) - if not result['error']: - data = result['content'] - code = result['status_code'] - title = action.get_document_title(data) - title = title.strip().lower() - for i in (' ', '-'): - title = title.replace(i, '_') - for i in ('?', '"', '\'', '!'): - title = title.replace(i, '') - filename = os.path.join( - cache_dir, 'readability', - title + '_' + dt.timestamp() + '.' + ext) - error = action.generate_document(data, url, ext, filename, - readability=True) - if error: - text_error = ('Failed to export {} fot {}' - '\n\n' - 'Reason: {}'.format(ext.upper(), url, error)) - session['notes'] = [['error', text_error]] - else: - url = await XmppUpload.start(self, jid_bare, filename) - chat_type = await get_chat_type(self, jid_bare) - XmppMessage.send_oob(self, jid_bare, url, chat_type) - form = self['xep_0004'].make_form('result', 'Download') - form['instructions'] = ('Download {} document.' - .format(ext.upper())) - field_url = form.add_field(var='url', - label='Link', - ftype='text-single', - value=url) - field_url['validate']['datatype'] = 'xs:anyURI' - session['payload'] = form - session['allow_complete'] = True - session['next'] = None - session['prev'] = None - return session - - async def _handle_subscription_new(self, payload, session): jid_full = str(session['from']) function_name = sys._getframe().f_code.co_name diff --git a/slixfeed/xmpp/publish.py b/slixfeed/xmpp/publish.py index df7d009..d889357 100644 --- a/slixfeed/xmpp/publish.py +++ b/slixfeed/xmpp/publish.py @@ -44,7 +44,8 @@ class XmppPubsub: async def get_node_configuration(self, jid, node_id): node = await self.plugin['xep_0060'].get_node_config(jid, node_id) - print(node) + if not node: + print('NODE CONFIG', node_id, str(node)) return node diff --git a/slixfeed/xmpp/utility.py b/slixfeed/xmpp/utility.py index 2231cc8..3d20434 100644 --- a/slixfeed/xmpp/utility.py +++ b/slixfeed/xmpp/utility.py @@ -46,8 +46,8 @@ async def get_chat_type(self, jid): logging.info('Jabber ID: {}\n' 'Chat Type: {}'.format(jid, result)) except (IqError, IqTimeout) as e: - logging.error(str(e)) - logging.error(jid) + logging.warning('Chat type could not be determined for {}'.format(jid)) + logging.error(e) result = 'error' # except BaseException as e: # logging.error('BaseException', str(e))