Do not disqualify subscriptions that are not well-formed

This commit is contained in:
Schimon Jehudah, Adv. 2024-06-10 21:20:04 +03:00
parent 245cd9832a
commit 0a26ac163b
5 changed files with 46 additions and 29 deletions

View file

@ -700,7 +700,7 @@ def is_feed_json(document):
return value
def is_feed(feed):
def is_feed(url, feed):
"""
Determine whether document is feed or not.
@ -737,10 +737,9 @@ def is_feed(feed):
# "No entries nor title for {}"
# ).format(url)
elif feed.bozo:
value = False
# message = (
# "Bozo detected for {}"
# ).format(url)
# NOTE Consider the feed valid even when it is not well-formed
value = True
logger.warning('Bozo detected for {}'.format(url))
else:
value = True
# message = (
@ -984,8 +983,7 @@ async def add_feed(self, jid_bare, db_file, url, identifier):
document = result['content']
feed = parse(document)
# if document and status_code == 200:
# if is_feed(url, feed):
if is_feed(feed):
if is_feed(url, feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:

View file

@ -62,9 +62,12 @@ Link : https://www.jsonfeed.org/feed.json/videos.xml
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit
@ -107,6 +110,7 @@ from urllib.parse import urlsplit, urlunsplit
# else:
# return await callback(url)
logger = Logger(__name__)
async def probe_page(url, document=None):
"""
@ -130,25 +134,43 @@ async def probe_page(url, document=None):
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(document)
result = None
except:
logging.warning("Failed to parse URL as feed for {}.".format(url))
result = {'link' : None,
'index' : None,
'name' : None,
'code' : None,
'error' : True,
'exist' : None}
except Exception as e:
logger.error(str(e))
try:
# /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
# xml = html.fromstring(document.encode('utf-8'))
# parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
# tree = fromstring(xml, parser=parser)
# /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
#tree = html.fromstring(bytes(document, encoding='utf8'))
# https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
#parser = etree.XMLParser(recover=True)
#tree = etree.fromstring(document, parser)
tree = html.fromstring(document.encode('utf-8'))
result = None
except Exception as e:
logger.error(str(e))
logger.warning("Failed to parse URL as feed for {}.".format(url))
result = {'link' : None,
'index' : None,
'name' : None,
'code' : None,
'error' : True,
'exist' : None}
if not result:
logging.debug("Feed auto-discovery engaged for {}".format(url))
logger.debug("Feed auto-discovery engaged for {}".format(url))
result = await feed_mode_auto_discovery(url, tree)
if not result:
logging.debug("Feed link scan mode engaged for {}".format(url))
logger.debug("Feed link scan mode engaged for {}".format(url))
result = await feed_mode_scan(url, tree)
if not result:
logging.debug("Feed arbitrary mode engaged for {}".format(url))
logger.debug("Feed arbitrary mode engaged for {}".format(url))
result = await feed_mode_guess(url, tree)
if not result:
logging.debug("No feeds were found for {}".format(url))
logger.debug("No feeds were found for {}".format(url))
result = None
return result

View file

@ -1,2 +1,2 @@
__version__ = '0.1.73'
__version_info__ = (0, 1, 73)
__version__ = '0.1.74'
__version_info__ = (0, 1, 74)

View file

@ -52,9 +52,9 @@ from slixfeed.config import Config
import slixfeed.crawl as crawl
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
import slixfeed.url as uri
from slixfeed.log import Logger
from slixfeed.version import __version__
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.chat import Chat
@ -1140,8 +1140,7 @@ class XmppClient(slixmpp.ClientXMPP):
if not result['error']:
document = result['content']
feed = parse(document)
# if is_feed(url, feed):
if action.is_feed(feed):
if action.is_feed(url, feed):
form['instructions'] = 'Select entries to publish.'
options = form.add_field(desc='Select entries to post.',
ftype='list-multi',

View file

@ -754,8 +754,7 @@ class XmppCommands:
if not result['error']:
document = result['content']
feed = parse(document)
# if is_feed(url, feed):
if action.is_feed(feed):
if action.is_feed(url, feed):
message = action.view_feed(url, feed)
break
else:
@ -791,8 +790,7 @@ class XmppCommands:
document = result['content']
status = result['status_code']
feed = parse(document)
# if is_feed(url, feed):
if action.is_feed(feed):
if action.is_feed(url, feed):
message = action.view_entry(url, feed, num)
break
else: