Do not disqualify subscriptions for not being well-formed

Schimon Jehudah, Adv. 2024-06-10 21:20:04 +03:00
parent 245cd9832a
commit 0a26ac163b
5 changed files with 46 additions and 29 deletions

View file

@@ -700,7 +700,7 @@ def is_feed_json(document):
     return value
 
 
-def is_feed(feed):
+def is_feed(url, feed):
     """
     Determine whether document is feed or not.
@@ -737,10 +737,9 @@ def is_feed(feed):
         #     "No entries nor title for {}"
         #     ).format(url)
     elif feed.bozo:
-        value = False
-        # message = (
-        #     "Bozo detected for {}"
-        #     ).format(url)
+        # NOTE Consider valid even when is not-well-formed
+        value = True
+        logger.warning('Bozo detected for {}'.format(url))
     else:
         value = True
         # message = (
@@ -984,8 +983,7 @@ async def add_feed(self, jid_bare, db_file, url, identifier):
             document = result['content']
             feed = parse(document)
             # if document and status_code == 200:
-            # if is_feed(url, feed):
-            if is_feed(feed):
+            if is_feed(url, feed):
                 if "title" in feed["feed"].keys():
                     title = feed["feed"]["title"]
                 else:
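
The change above hinges on how feedparser reports malformed documents: parse() sets the bozo flag whenever the input is not well-formed XML, but it falls back to a lenient parser and usually still recovers a usable feed object. A minimal sketch of that behaviour, using an illustrative malformed document rather than repository code:

from feedparser import parse

# An unescaped ampersand makes this document not well-formed XML.
document = """<?xml version="1.0"?>
<rss version="2.0"><channel>
<title>News & Views</title>
<item><title>First entry</title></item>
</channel></rss>"""

feed = parse(document)
print(feed.bozo)            # truthy: strict parsing failed
print(feed.bozo_exception)  # the underlying parse error
print(len(feed.entries))    # typically still 1 here: the entry survives

Rejecting every bozo feed therefore discards subscriptions that are readable in practice; the new branch accepts them and logs a warning keyed to the url argument that is_feed() now receives.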

View file

@@ -62,9 +62,12 @@ Link : https://www.jsonfeed.org/feed.json
 from aiohttp import ClientError, ClientSession, ClientTimeout
 from feedparser import parse
 import logging
+from lxml import etree
 from lxml import html
+from lxml.etree import fromstring
 import slixfeed.config as config
 import slixfeed.fetch as fetch
+from slixfeed.log import Logger
 from slixfeed.url import complete_url, join_url, trim_url
 from urllib.parse import urlsplit, urlunsplit
@@ -107,6 +110,7 @@ from urllib.parse import urlsplit, urlunsplit
 #     else:
 #         return await callback(url)
 
+logger = Logger(__name__)
 
 async def probe_page(url, document=None):
     """
@@ -130,25 +134,43 @@ async def probe_page(url, document=None):
         # tree = etree.fromstring(res[0]) # etree is for xml
         tree = html.fromstring(document)
         result = None
-    except:
-        logging.warning("Failed to parse URL as feed for {}.".format(url))
-        result = {'link' : None,
-                  'index' : None,
-                  'name' : None,
-                  'code' : None,
-                  'error' : True,
-                  'exist' : None}
+    except Exception as e:
+        logger.error(str(e))
+        try:
+            # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
+            # xml = html.fromstring(document.encode('utf-8'))
+            # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
+            # tree = fromstring(xml, parser=parser)
+            # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
+            #tree = html.fromstring(bytes(document, encoding='utf8'))
+            # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
+            #parser = etree.XMLParser(recover=True)
+            #tree = etree.fromstring(document, parser)
+            tree = html.fromstring(document.encode('utf-8'))
+            result = None
+        except Exception as e:
+            logger.error(str(e))
+            logger.warning("Failed to parse URL as feed for {}.".format(url))
+            result = {'link' : None,
+                      'index' : None,
+                      'name' : None,
+                      'code' : None,
+                      'error' : True,
+                      'exist' : None}
     if not result:
-        logging.debug("Feed auto-discovery engaged for {}".format(url))
+        logger.debug("Feed auto-discovery engaged for {}".format(url))
         result = await feed_mode_auto_discovery(url, tree)
     if not result:
-        logging.debug("Feed link scan mode engaged for {}".format(url))
+        logger.debug("Feed link scan mode engaged for {}".format(url))
         result = await feed_mode_scan(url, tree)
     if not result:
-        logging.debug("Feed arbitrary mode engaged for {}".format(url))
+        logger.debug("Feed arbitrary mode engaged for {}".format(url))
         result = await feed_mode_guess(url, tree)
     if not result:
-        logging.debug("No feeds were found for {}".format(url))
+        logger.debug("No feeds were found for {}".format(url))
         result = None
     return result
@@ -411,4 +433,4 @@ async def process_feed_selection(url, urls):
 #     return response
 # url = "https://www.smh.com.au/rssheadlines"
 # start(url)
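
The nested try/except added to probe_page() works around a known lxml restriction: fromstring() rejects a Python str whose text carries an XML encoding declaration, so the fallback re-encodes the document to bytes before parsing. A minimal sketch of the failure and the workaround, using an illustrative document rather than repository code:

from lxml import etree

document = '<?xml version="1.0" encoding="utf-8"?><feed><title>x</title></feed>'

try:
    tree = etree.fromstring(document)  # str input raises ValueError
except ValueError:
    # "Unicode strings with encoding declaration are not supported. ..."
    tree = etree.fromstring(document.encode('utf-8'))  # bytes input parses

print(tree.tag)  # feed

html.fromstring() can hit the same restriction when handed XHTML that opens with an XML declaration, which is why the retry in the hunk above encodes the document to UTF-8 first.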

View file

@@ -1,2 +1,2 @@
-__version__ = '0.1.73'
-__version_info__ = (0, 1, 73)
+__version__ = '0.1.74'
+__version_info__ = (0, 1, 74)

View file

@@ -52,9 +52,9 @@ from slixfeed.config import Config
 import slixfeed.crawl as crawl
 import slixfeed.dt as dt
 import slixfeed.fetch as fetch
+from slixfeed.log import Logger
 import slixfeed.sqlite as sqlite
 import slixfeed.url as uri
-from slixfeed.log import Logger
 from slixfeed.version import __version__
 from slixfeed.xmpp.bookmark import XmppBookmark
 from slixfeed.xmpp.chat import Chat
@@ -1140,8 +1140,7 @@ class XmppClient(slixmpp.ClientXMPP):
             if not result['error']:
                 document = result['content']
                 feed = parse(document)
-                # if is_feed(url, feed):
-                if action.is_feed(feed):
+                if action.is_feed(url, feed):
                     form['instructions'] = 'Select entries to publish.'
                     options = form.add_field(desc='Select entries to post.',
                                              ftype='list-multi',

View file

@@ -754,8 +754,7 @@ class XmppCommands:
             if not result['error']:
                 document = result['content']
                 feed = parse(document)
-                # if is_feed(url, feed):
-                if action.is_feed(feed):
+                if action.is_feed(url, feed):
                     message = action.view_feed(url, feed)
                     break
             else:
@@ -791,8 +790,7 @@ class XmppCommands:
                 document = result['content']
                 status = result['status_code']
                 feed = parse(document)
-                # if is_feed(url, feed):
-                if action.is_feed(feed):
+                if action.is_feed(url, feed):
                     message = action.view_entry(url, feed, num)
                     break
             else:
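
The last three hunks converge on one call-site pattern: parse the fetched document, then gate on is_feed() with the url passed alongside the feed so that bozo warnings can name the offending subscription. A condensed, hypothetical sketch of that shape (fetch_document is a stand-in, not the project's real fetch helper):

from feedparser import parse
import slixfeed.action as action

async def preview(url, fetch_document):
    result = await fetch_document(url)  # stand-in fetcher returning a dict
    if not result['error']:
        feed = parse(result['content'])
        # url travels with the parsed feed, so a not-well-formed document
        # is logged ('Bozo detected for ...') instead of being rejected.
        if action.is_feed(url, feed):
            return action.view_feed(url, feed)
    return None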