From 0a26ac163b6e57e17a83905919e3c27213649908 Mon Sep 17 00:00:00 2001 From: "Schimon Jehudah, Adv." Date: Mon, 10 Jun 2024 21:20:04 +0300 Subject: [PATCH] Do not disqualify subscriptions due to being not-well-formed --- slixfeed/action.py | 12 ++++------ slixfeed/crawl.py | 48 ++++++++++++++++++++++++++++----------- slixfeed/version.py | 4 ++-- slixfeed/xmpp/client.py | 5 ++-- slixfeed/xmpp/commands.py | 6 ++--- 5 files changed, 46 insertions(+), 29 deletions(-) diff --git a/slixfeed/action.py b/slixfeed/action.py index ddc072c..84bd8c2 100644 --- a/slixfeed/action.py +++ b/slixfeed/action.py @@ -700,7 +700,7 @@ def is_feed_json(document): return value -def is_feed(feed): +def is_feed(url, feed): """ Determine whether document is feed or not. @@ -737,10 +737,9 @@ def is_feed(feed): # "No entries nor title for {}" # ).format(url) elif feed.bozo: - value = False - # message = ( - # "Bozo detected for {}" - # ).format(url) + # NOTE Consider valid even when is not-well-formed + value = True + logger.warning('Bozo detected for {}'.format(url)) else: value = True # message = ( @@ -984,8 +983,7 @@ async def add_feed(self, jid_bare, db_file, url, identifier): document = result['content'] feed = parse(document) # if document and status_code == 200: - # if is_feed(url, feed): - if is_feed(feed): + if is_feed(url, feed): if "title" in feed["feed"].keys(): title = feed["feed"]["title"] else: diff --git a/slixfeed/crawl.py b/slixfeed/crawl.py index 09c0275..abdb2bf 100644 --- a/slixfeed/crawl.py +++ b/slixfeed/crawl.py @@ -62,9 +62,12 @@ Link : https://www.jsonfeed.org/feed.json/videos.xml from aiohttp import ClientError, ClientSession, ClientTimeout from feedparser import parse import logging +from lxml import etree from lxml import html +from lxml.etree import fromstring import slixfeed.config as config import slixfeed.fetch as fetch +from slixfeed.log import Logger from slixfeed.url import complete_url, join_url, trim_url from urllib.parse import urlsplit, urlunsplit @@ -107,6 +110,7 @@ from urllib.parse import urlsplit, urlunsplit # else: # return await callback(url) +logger = Logger(__name__) async def probe_page(url, document=None): """ @@ -130,25 +134,43 @@ async def probe_page(url, document=None): # tree = etree.fromstring(res[0]) # etree is for xml tree = html.fromstring(document) result = None - except: - logging.warning("Failed to parse URL as feed for {}.".format(url)) - result = {'link' : None, - 'index' : None, - 'name' : None, - 'code' : None, - 'error' : True, - 'exist' : None} + except Exception as e: + logger.error(str(e)) + try: + # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + # xml = html.fromstring(document.encode('utf-8')) + # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8') + # tree = fromstring(xml, parser=parser) + + # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported + #tree = html.fromstring(bytes(document, encoding='utf8')) + + # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html + #parser = etree.XMLParser(recover=True) + #tree = etree.fromstring(document, parser) + + tree = html.fromstring(document.encode('utf-8')) + result = None + except Exception as e: + logger.error(str(e)) + logger.warning("Failed to parse URL as feed for {}.".format(url)) + result = {'link' : None, + 'index' : None, + 'name' : None, + 'code' : None, + 'error' : True, + 'exist' : None} if not result: - logging.debug("Feed auto-discovery engaged for {}".format(url)) + logger.debug("Feed auto-discovery engaged for {}".format(url)) result = await feed_mode_auto_discovery(url, tree) if not result: - logging.debug("Feed link scan mode engaged for {}".format(url)) + logger.debug("Feed link scan mode engaged for {}".format(url)) result = await feed_mode_scan(url, tree) if not result: - logging.debug("Feed arbitrary mode engaged for {}".format(url)) + logger.debug("Feed arbitrary mode engaged for {}".format(url)) result = await feed_mode_guess(url, tree) if not result: - logging.debug("No feeds were found for {}".format(url)) + logger.debug("No feeds were found for {}".format(url)) result = None return result @@ -411,4 +433,4 @@ async def process_feed_selection(url, urls): # return response # url = "https://www.smh.com.au/rssheadlines" -# start(url) \ No newline at end of file +# start(url) diff --git a/slixfeed/version.py b/slixfeed/version.py index 42422f1..86cbb79 100644 --- a/slixfeed/version.py +++ b/slixfeed/version.py @@ -1,2 +1,2 @@ -__version__ = '0.1.73' -__version_info__ = (0, 1, 73) +__version__ = '0.1.74' +__version_info__ = (0, 1, 74) diff --git a/slixfeed/xmpp/client.py b/slixfeed/xmpp/client.py index 85156ff..761ce08 100644 --- a/slixfeed/xmpp/client.py +++ b/slixfeed/xmpp/client.py @@ -52,9 +52,9 @@ from slixfeed.config import Config import slixfeed.crawl as crawl import slixfeed.dt as dt import slixfeed.fetch as fetch +from slixfeed.log import Logger import slixfeed.sqlite as sqlite import slixfeed.url as uri -from slixfeed.log import Logger from slixfeed.version import __version__ from slixfeed.xmpp.bookmark import XmppBookmark from slixfeed.xmpp.chat import Chat @@ -1140,8 +1140,7 @@ class XmppClient(slixmpp.ClientXMPP): if not result['error']: document = result['content'] feed = parse(document) - # if is_feed(url, feed): - if action.is_feed(feed): + if action.is_feed(url, feed): form['instructions'] = 'Select entries to publish.' options = form.add_field(desc='Select entries to post.', ftype='list-multi', diff --git a/slixfeed/xmpp/commands.py b/slixfeed/xmpp/commands.py index 5a9c99f..4198642 100644 --- a/slixfeed/xmpp/commands.py +++ b/slixfeed/xmpp/commands.py @@ -754,8 +754,7 @@ class XmppCommands: if not result['error']: document = result['content'] feed = parse(document) - # if is_feed(url, feed): - if action.is_feed(feed): + if action.is_feed(url, feed): message = action.view_feed(url, feed) break else: @@ -791,8 +790,7 @@ class XmppCommands: document = result['content'] status = result['status_code'] feed = parse(document) - # if is_feed(url, feed): - if action.is_feed(feed): + if action.is_feed(url, feed): message = action.view_entry(url, feed, num) break else: