Remove HTML support;
Improve handling of errors.
This commit is contained in:
parent
4ac8e0836d
commit
7f0c4f4274
10 changed files with 17 additions and 360 deletions
|
@ -70,30 +70,6 @@ except:
|
|||
|
||||
logger = Logger(__name__)
|
||||
|
||||
try:
|
||||
import xml2epub
|
||||
except ImportError:
|
||||
logger.error('Package xml2epub was not found.\n'
|
||||
'ePUB support is disabled.')
|
||||
|
||||
try:
|
||||
import html2text
|
||||
except ImportError:
|
||||
logger.error('Package html2text was not found.\n'
|
||||
'Markdown support is disabled.')
|
||||
|
||||
try:
|
||||
import pdfkit
|
||||
except ImportError:
|
||||
logger.error('Package pdfkit was not found.\n'
|
||||
'PDF support is disabled.')
|
||||
|
||||
try:
|
||||
from readability import Document
|
||||
except ImportError:
|
||||
logger.error('Package readability was not found.\n'
|
||||
'Arc90 Lab algorithm is disabled.')
|
||||
|
||||
|
||||
def export_feeds(self, jid, jid_file, ext):
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
|
@ -129,6 +105,7 @@ async def xmpp_muc_autojoin(self, bookmarks):
|
|||
alias = bookmark["nick"]
|
||||
muc_jid = bookmark["jid"]
|
||||
result = await XmppGroupchat.join(self, muc_jid, alias)
|
||||
print(result)
|
||||
if result == 'ban':
|
||||
await XmppBookmark.remove(self, muc_jid)
|
||||
logger.warning('{} is banned from {}'.format(self.alias, muc_jid))
|
||||
|
@ -329,7 +306,9 @@ async def xmpp_pubsub_send_unread_items(self, jid_bare):
|
|||
feed_entry = pack_entry_into_dict(db_file, entry)
|
||||
node_entry = create_rfc4287_entry(feed_entry)
|
||||
entry_url = feed_entry['link']
|
||||
print(entry_url)
|
||||
item_id = hash_url_to_md5(entry_url)
|
||||
print(item_id)
|
||||
iq_create_entry = XmppPubsub.create_entry(
|
||||
self, jid_bare, node_id, item_id, node_entry)
|
||||
await XmppIQ.send(self, iq_create_entry)
|
||||
|
@ -637,7 +616,7 @@ def manual(filename, section=None, command=None):
|
|||
try:
|
||||
cmd_list = cmds[section][command]
|
||||
except KeyError as e:
|
||||
logger.error(str(e))
|
||||
logger.error(e)
|
||||
cmd_list = None
|
||||
elif section:
|
||||
try:
|
||||
|
@ -1837,99 +1816,6 @@ def get_properties_of_entries(jid_bare, db_file, feed_url, feed_id, feed):
|
|||
return new_entries
|
||||
|
||||
|
||||
def get_document_title(data):
    """
    Extract a title from an HTML document.

    The title is taken from ``Document(data).short_title()``. If that
    fails for any reason (including ``Document`` being undefined because
    the optional readability import failed), the string of the ``<title>``
    element parsed by BeautifulSoup is used instead.

    Parameters
    ----------
    data : str
        HTML markup.

    Returns
    -------
    title : str
        The document title, or an empty string when no title is found.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        document = Document(data)
        title = document.short_title()
    # "Exception" still covers NameError (missing optional package) but,
    # unlike a bare "except", lets KeyboardInterrupt/SystemExit propagate.
    except Exception:
        document = BeautifulSoup(data, 'html.parser')
        title_tag = document.title
        # A page without <title> yields None here, and ".string" is None
        # when the element holds nested markup; neither must crash.
        title = title_tag.string if title_tag is not None else None
    # Callers invoke str methods on the result, so never hand back None.
    return title or ''
|
||||
|
||||
|
||||
def get_document_content(data):
    """
    Return the readable part of an HTML document.

    Parameters
    ----------
    data : str
        HTML markup.

    Returns
    -------
    content : str
        The result of ``Document(data).summary()``, or ``data`` unchanged
        when the summary can not be produced.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        document = Document(data)
        content = document.summary()
    # "Exception" still covers NameError (missing optional package) but,
    # unlike a bare "except", lets KeyboardInterrupt/SystemExit propagate.
    except Exception:
        # Fall back to the untouched markup. The BeautifulSoup tree that
        # used to be built here was never read, so it is no longer parsed.
        content = data
    return content
|
||||
|
||||
|
||||
def get_document_content_as_text(data):
    """
    Return the readable part of an HTML document with its tags removed.

    Parameters
    ----------
    data : str
        HTML markup.

    Returns
    -------
    text : str
        ``remove_html_tags`` applied to ``Document(data).summary()``, or
        to ``data`` itself when the summary can not be produced.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}'.format(function_name))
    try:
        document = Document(data)
        content = document.summary()
    # "Exception" still covers NameError (missing optional package) but,
    # unlike a bare "except", lets KeyboardInterrupt/SystemExit propagate.
    except Exception:
        # Fall back to the untouched markup. The BeautifulSoup tree that
        # used to be built here was never read, so it is no longer parsed.
        content = data
    text = remove_html_tags(content)
    return text
|
||||
|
||||
|
||||
def generate_document(data, url, ext, filename, readability=False):
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
logger.debug('{}: url: {} ext: {} filename: {}'
|
||||
.format(function_name, url, ext, filename))
|
||||
error = None
|
||||
if readability:
|
||||
try:
|
||||
document = Document(data)
|
||||
content = document.summary()
|
||||
except:
|
||||
content = data
|
||||
logger.warning('Check that package readability is installed.')
|
||||
else:
|
||||
content = data
|
||||
match ext:
|
||||
case "epub":
|
||||
filename = filename.split('.')
|
||||
filename.pop()
|
||||
filename = '.'.join(filename)
|
||||
error = generate_epub(content, filename)
|
||||
if error:
|
||||
logger.error(error)
|
||||
# logger.error(
|
||||
# "Check that packages xml2epub is installed, "
|
||||
# "or try again.")
|
||||
case "html":
|
||||
generate_html(content, filename)
|
||||
case "md":
|
||||
try:
|
||||
generate_markdown(content, filename)
|
||||
except:
|
||||
logger.warning('Check that package html2text '
|
||||
'is installed, or try again.')
|
||||
error = 'Package html2text was not found.'
|
||||
case "pdf":
|
||||
error = generate_pdf(content, filename)
|
||||
if error:
|
||||
logger.error(error)
|
||||
# logger.warning(
|
||||
# "Check that packages pdfkit and wkhtmltopdf "
|
||||
# "are installed, or try again.")
|
||||
# error = (
|
||||
# "Package pdfkit or wkhtmltopdf was not found.")
|
||||
case "txt":
|
||||
generate_txt(content, filename)
|
||||
if error:
|
||||
return error
|
||||
|
||||
# TODO Either adapt it to filename
|
||||
# or change it to something else
|
||||
#filename = document.title()
|
||||
# with open(filename, 'w') as file:
|
||||
# html_doc = document.summary()
|
||||
# file.write(html_doc)
|
||||
|
||||
|
||||
async def extract_image_from_feed(db_file, feed_id, url):
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
logger.debug('{}: db_file: {} feed_id: {} url: {}'
|
||||
|
@ -1959,13 +1845,7 @@ async def extract_image_from_html(url):
|
|||
result = await fetch.http(url)
|
||||
if not result['error']:
|
||||
data = result['content']
|
||||
try:
|
||||
document = Document(data)
|
||||
content = document.summary()
|
||||
except:
|
||||
content = data
|
||||
logger.warning('Check that package readability is installed.')
|
||||
tree = html.fromstring(content)
|
||||
tree = html.fromstring(data)
|
||||
# TODO Exclude banners, class="share" links etc.
|
||||
images = tree.xpath(
|
||||
'//img[not('
|
||||
|
@ -1985,68 +1865,6 @@ async def extract_image_from_html(url):
|
|||
return image_url
|
||||
|
||||
|
||||
def generate_epub(text, filename):
    """
    Create an ePUB file holding ``text`` as its only chapter.

    Parameters
    ----------
    text : str
        Markup handed to ``xml2epub.create_chapter_from_string``.
    filename : str
        "/"-separated pathname. Its last component is used as the book
        title, the part before it as the output directory, and the whole
        value as ``absolute_location``.

    Returns
    -------
    error : ValueError | None
        The ValueError raised while adding the chapter or creating the
        file, if any; otherwise None.
    """
    caller = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} pathname: {}'.format(caller, text, filename))
    # Split at the last "/" only: directory on the left, title on the right.
    directory, _, file_title = filename.rpartition("/")
    # Start from an empty book and a single chapter built from the text.
    book = xml2epub.Epub(file_title)
    chapter = xml2epub.create_chapter_from_string(text, strict=False)
    try:
        book.add_chapter(chapter)
        book.create_epub(directory, absolute_location=filename)
    except ValueError as error:
        return error
    return None
|
||||
|
||||
|
||||
|
||||
def generate_html(text, filename):
    """
    Write ``text`` to ``filename`` as is.

    Parameters
    ----------
    text : str
        Content to write.
    filename : str
        Pathname of the file; it is opened in mode 'w'.
    """
    caller = sys._getframe().f_code.co_name
    message = '{}: text: {} filename: {}'.format(caller, text, filename)
    logger.debug(message)
    with open(filename, 'w') as handle:
        handle.write(text)
|
||||
|
||||
|
||||
def generate_markdown(text, filename):
    """
    Convert HTML to Markdown with html2text and write it to ``filename``.

    Parameters
    ----------
    text : str
        HTML markup to convert.
    filename : str
        Pathname of the file; it is opened in mode 'w'.
    """
    caller = sys._getframe().f_code.co_name
    message = '{}: text: {} filename: {}'.format(caller, text, filename)
    logger.debug(message)
    # Convert before opening, so a failed conversion leaves no file behind.
    converted = html2text.HTML2Text().handle(text)
    with open(filename, 'w') as handle:
        handle.write(converted)
|
||||
|
||||
|
||||
def generate_pdf(text, filename):
    """
    Render ``text`` to a PDF file with ``pdfkit.from_string``.

    Parameters
    ----------
    text : str
        Markup handed to pdfkit.
    filename : str
        Pathname of the PDF file to create.

    Returns
    -------
    error : OSError | None
        The OSError raised by pdfkit, if any; otherwise None.
    """
    function_name = sys._getframe().f_code.co_name
    logger.debug('{}: text: {} filename: {}'.format(function_name, text, filename))
    try:
        pdfkit.from_string(text, filename)
    # IOError is an alias of OSError in Python 3, so the former separate
    # "except IOError" / "except OSError" handlers were one and the same.
    except OSError as error:
        return error
    return None
|
||||
|
||||
|
||||
def generate_txt(text, filename):
    """
    Strip HTML tags from ``text`` and write the result to ``filename``.

    Parameters
    ----------
    text : str
        Markup passed through ``remove_html_tags`` before writing.
    filename : str
        Pathname of the file; it is opened in mode 'w'.
    """
    caller = sys._getframe().f_code.co_name
    message = '{}: text: {} filename: {}'.format(caller, text, filename)
    logger.debug(message)
    plain = remove_html_tags(text)
    with open(filename, 'w') as handle:
        handle.write(plain)
|
||||
|
||||
|
||||
# This works too
|
||||
# ''.join(xml.etree.ElementTree.fromstring(text).itertext())
|
||||
def remove_html_tags(data):
|
||||
|
|
|
@ -61,18 +61,6 @@ old
|
|||
Send all items of newly added feeds.
|
||||
"""
|
||||
|
||||
[document]
|
||||
content = """
|
||||
content <id>/<url> <type>
|
||||
Send a readability (arc90) version of an article as file. Specify <id> or <url> and <type>.
|
||||
Supported types are ePUB, HTML, MD and PDF (default).
|
||||
"""
|
||||
page = """
|
||||
page <id>/<url> <type>
|
||||
Send an article as file. Specify <id> or <url> and <type>.
|
||||
Supported types are ePUB, HTML, MD and PDF (default).
|
||||
"""
|
||||
|
||||
[filters]
|
||||
allow = """
|
||||
allow [+|-] <keyword>
|
||||
|
|
|
@ -131,7 +131,7 @@ async def probe_page(url, document=None):
|
|||
tree = html.fromstring(document)
|
||||
result = None
|
||||
except:
|
||||
logging.debug("Failed to parse URL as feed for {}.".format(url))
|
||||
logging.warning("Failed to parse URL as feed for {}.".format(url))
|
||||
result = {'link' : None,
|
||||
'index' : None,
|
||||
'name' : None,
|
||||
|
|
|
@ -107,7 +107,8 @@ def http_response(url):
|
|||
# response = requests.head(url, headers=headers, allow_redirects=True)
|
||||
response = requests.get(url, headers=headers, allow_redirects=True)
|
||||
except Exception as e:
|
||||
logging.error(str(e))
|
||||
logging.warning('Error in HTTP response')
|
||||
logging.error(e)
|
||||
response = None
|
||||
return response
|
||||
|
||||
|
|
|
@ -66,7 +66,8 @@ def create_connection(db_file):
|
|||
conn.execute("PRAGMA foreign_keys = ON")
|
||||
# return conn
|
||||
except Error as e:
|
||||
print(e)
|
||||
logger.warning('Error creating a connection to database {}.'.format(db_file))
|
||||
logger.error(e)
|
||||
time_end = time.time()
|
||||
difference = time_end - time_begin
|
||||
if difference > 1: logger.warning('{} (time: {})'.format(function_name,
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
__version__ = '0.1.66'
|
||||
__version_info__ = (0, 1, 66)
|
||||
__version__ = '0.1.67'
|
||||
__version_info__ = (0, 1, 67)
|
||||
|
|
|
@ -595,105 +595,6 @@ class Chat:
|
|||
message_lowercase.startswith('gopher:')):
|
||||
response = 'Gemini and Gopher are not supported yet.'
|
||||
XmppMessage.send_reply(self, message, response)
|
||||
# TODO xHTML, HTMLZ, MHTML
|
||||
case _ if (message_lowercase.startswith('content') or
|
||||
message_lowercase.startswith('page')):
|
||||
if message_lowercase.startswith('content'):
|
||||
message_text = message_text[8:]
|
||||
readability = True
|
||||
else:
|
||||
message_text = message_text[5:]
|
||||
readability = False
|
||||
ix_url = message_text.split(' ')[0]
|
||||
ext = ' '.join(message_text.split(' ')[1:])
|
||||
ext = ext if ext else 'pdf'
|
||||
url = None
|
||||
error = None
|
||||
response = None
|
||||
if ext in ('epub', 'html', 'markdown', 'md', 'pdf', 'text',
|
||||
'txt'):
|
||||
match ext:
|
||||
case 'markdown':
|
||||
ext = 'md'
|
||||
case 'text':
|
||||
ext = 'txt'
|
||||
status_type = 'dnd'
|
||||
status_message = ('📃️ Procesing request to produce {} '
|
||||
'document...'.format(ext.upper()))
|
||||
# pending_tasks_num = len(self.pending_tasks[jid_bare])
|
||||
pending_tasks_num = randrange(10000, 99999)
|
||||
self.pending_tasks[jid_bare][pending_tasks_num] = status_message
|
||||
# self.pending_tasks_counter += 1
|
||||
# self.pending_tasks[jid_bare][self.pending_tasks_counter] = status_message
|
||||
XmppPresence.send(self, jid_bare, status_message,
|
||||
status_type=status_type)
|
||||
db_file = config.get_pathname_to_database(jid_file)
|
||||
cache_dir = config.get_default_cache_directory()
|
||||
if not os.path.isdir(cache_dir):
|
||||
os.mkdir(cache_dir)
|
||||
if not os.path.isdir(cache_dir + '/readability'):
|
||||
os.mkdir(cache_dir + '/readability')
|
||||
if ix_url:
|
||||
try:
|
||||
ix = int(ix_url)
|
||||
try:
|
||||
url = sqlite.get_entry_url(db_file, ix)
|
||||
url = url[0]
|
||||
except:
|
||||
response = 'No entry with index {}'.format(ix)
|
||||
except:
|
||||
url = ix_url
|
||||
if url:
|
||||
url = uri.remove_tracking_parameters(url)
|
||||
url = (await uri.replace_hostname(url, 'link')) or url
|
||||
result = await fetch.http(url)
|
||||
if not result['error']:
|
||||
data = result['content']
|
||||
code = result['status_code']
|
||||
title = action.get_document_title(data)
|
||||
title = title.strip().lower()
|
||||
for i in (' ', '-'):
|
||||
title = title.replace(i, '_')
|
||||
for i in ('?', '"', '\'', '!'):
|
||||
title = title.replace(i, '')
|
||||
filename = os.path.join(
|
||||
cache_dir, 'readability',
|
||||
title + '_' + dt.timestamp() + '.' + ext)
|
||||
error = action.generate_document(data, url,
|
||||
ext, filename,
|
||||
readability)
|
||||
if error:
|
||||
response = ('> {}\n'
|
||||
'Failed to export {}. '
|
||||
'Reason: {}'.format(
|
||||
url, ext.upper(), error))
|
||||
else:
|
||||
url = await XmppUpload.start(
|
||||
self, jid_bare, filename)
|
||||
chat_type = await get_chat_type(self,
|
||||
jid_bare)
|
||||
XmppMessage.send_oob(self, jid_bare, url,
|
||||
chat_type)
|
||||
else:
|
||||
response = ('> {}\n'
|
||||
'Failed to fetch URL. Reason: {}'
|
||||
.format(url, code))
|
||||
else:
|
||||
response = ('No action has been taken.'
|
||||
'\n'
|
||||
'Missing argument. '
|
||||
'Enter URL or entry index number.')
|
||||
else:
|
||||
response = ('Unsupported filetype.\n'
|
||||
'Try: epub, html, md (markdown), '
|
||||
'pdf, or txt (text)')
|
||||
del self.pending_tasks[jid_bare][pending_tasks_num]
|
||||
# del self.pending_tasks[jid_bare][self.pending_tasks_counter]
|
||||
key_list = ['status']
|
||||
await task.start_tasks_xmpp_chat(self, jid_bare, key_list)
|
||||
if response:
|
||||
logging.warning('Error for URL {}: {}'.format(url, error))
|
||||
XmppMessage.send_reply(self, message, response)
|
||||
case _ if (message_lowercase.startswith('http')) and(
|
||||
message_lowercase.endswith('.opml')):
|
||||
url = message_text
|
||||
|
|
|
@ -1650,6 +1650,7 @@ class Slixfeed(slixmpp.ClientXMPP):
|
|||
return session
|
||||
|
||||
|
||||
# FIXME
|
||||
async def _handle_recent_select(self, payload, session):
|
||||
jid_full = str(session['from'])
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
|
@ -1724,60 +1725,6 @@ class Slixfeed(slixmpp.ClientXMPP):
|
|||
return session
|
||||
|
||||
|
||||
async def _handle_recent_action(self, payload, session):
|
||||
jid_full = str(session['from'])
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
logger.debug('{}: jid_full: {}'
|
||||
.format(function_name, jid_full))
|
||||
ext = payload['values']['filetype']
|
||||
url = payload['values']['url'][0]
|
||||
jid_bare = session['from'].bare
|
||||
cache_dir = config.get_default_cache_directory()
|
||||
if not os.path.isdir(cache_dir):
|
||||
os.mkdir(cache_dir)
|
||||
if not os.path.isdir(cache_dir + '/readability'):
|
||||
os.mkdir(cache_dir + '/readability')
|
||||
url = uri.remove_tracking_parameters(url)
|
||||
url = (await uri.replace_hostname(url, 'link')) or url
|
||||
result = await fetch.http(url)
|
||||
if not result['error']:
|
||||
data = result['content']
|
||||
code = result['status_code']
|
||||
title = action.get_document_title(data)
|
||||
title = title.strip().lower()
|
||||
for i in (' ', '-'):
|
||||
title = title.replace(i, '_')
|
||||
for i in ('?', '"', '\'', '!'):
|
||||
title = title.replace(i, '')
|
||||
filename = os.path.join(
|
||||
cache_dir, 'readability',
|
||||
title + '_' + dt.timestamp() + '.' + ext)
|
||||
error = action.generate_document(data, url, ext, filename,
|
||||
readability=True)
|
||||
if error:
|
||||
text_error = ('Failed to export {} fot {}'
|
||||
'\n\n'
|
||||
'Reason: {}'.format(ext.upper(), url, error))
|
||||
session['notes'] = [['error', text_error]]
|
||||
else:
|
||||
url = await XmppUpload.start(self, jid_bare, filename)
|
||||
chat_type = await get_chat_type(self, jid_bare)
|
||||
XmppMessage.send_oob(self, jid_bare, url, chat_type)
|
||||
form = self['xep_0004'].make_form('result', 'Download')
|
||||
form['instructions'] = ('Download {} document.'
|
||||
.format(ext.upper()))
|
||||
field_url = form.add_field(var='url',
|
||||
label='Link',
|
||||
ftype='text-single',
|
||||
value=url)
|
||||
field_url['validate']['datatype'] = 'xs:anyURI'
|
||||
session['payload'] = form
|
||||
session['allow_complete'] = True
|
||||
session['next'] = None
|
||||
session['prev'] = None
|
||||
return session
|
||||
|
||||
|
||||
async def _handle_subscription_new(self, payload, session):
|
||||
jid_full = str(session['from'])
|
||||
function_name = sys._getframe().f_code.co_name
|
||||
|
|
|
@ -44,7 +44,8 @@ class XmppPubsub:
|
|||
|
||||
async def get_node_configuration(self, jid, node_id):
|
||||
node = await self.plugin['xep_0060'].get_node_config(jid, node_id)
|
||||
print(node)
|
||||
if not node:
|
||||
print('NODE CONFIG', node_id, str(node))
|
||||
return node
|
||||
|
||||
|
||||
|
|
|
@ -46,8 +46,8 @@ async def get_chat_type(self, jid):
|
|||
logging.info('Jabber ID: {}\n'
|
||||
'Chat Type: {}'.format(jid, result))
|
||||
except (IqError, IqTimeout) as e:
|
||||
logging.error(str(e))
|
||||
logging.error(jid)
|
||||
logging.warning('Chat type could not be determined for {}'.format(jid))
|
||||
logging.error(e)
|
||||
result = 'error'
|
||||
# except BaseException as e:
|
||||
# logging.error('BaseException', str(e))
|
||||
|
|
Loading…
Reference in a new issue