Do not disqualify subscriptions due to being not-well-formed
This commit is contained in:
parent
245cd9832a
commit
0a26ac163b
5 changed files with 46 additions and 29 deletions
|
@ -700,7 +700,7 @@ def is_feed_json(document):
|
|||
return value
|
||||
|
||||
|
||||
def is_feed(feed):
|
||||
def is_feed(url, feed):
|
||||
"""
|
||||
Determine whether the document is a feed or not.
|
||||
|
||||
|
@ -737,10 +737,9 @@ def is_feed(feed):
|
|||
# "No entries nor title for {}"
|
||||
# ).format(url)
|
||||
elif feed.bozo:
|
||||
value = False
|
||||
# message = (
|
||||
# "Bozo detected for {}"
|
||||
# ).format(url)
|
||||
# NOTE Consider the feed valid even when it is not well-formed
|
||||
value = True
|
||||
logger.warning('Bozo detected for {}'.format(url))
|
||||
else:
|
||||
value = True
|
||||
# message = (
|
||||
|
@ -984,8 +983,7 @@ async def add_feed(self, jid_bare, db_file, url, identifier):
|
|||
document = result['content']
|
||||
feed = parse(document)
|
||||
# if document and status_code == 200:
|
||||
# if is_feed(url, feed):
|
||||
if is_feed(feed):
|
||||
if is_feed(url, feed):
|
||||
if "title" in feed["feed"].keys():
|
||||
title = feed["feed"]["title"]
|
||||
else:
|
||||
|
|
|
@ -62,9 +62,12 @@ Link : https://www.jsonfeed.org/feed.json/videos.xml
|
|||
from aiohttp import ClientError, ClientSession, ClientTimeout
|
||||
from feedparser import parse
|
||||
import logging
|
||||
from lxml import etree
|
||||
from lxml import html
|
||||
from lxml.etree import fromstring
|
||||
import slixfeed.config as config
|
||||
import slixfeed.fetch as fetch
|
||||
from slixfeed.log import Logger
|
||||
from slixfeed.url import complete_url, join_url, trim_url
|
||||
from urllib.parse import urlsplit, urlunsplit
|
||||
|
||||
|
@ -107,6 +110,7 @@ from urllib.parse import urlsplit, urlunsplit
|
|||
# else:
|
||||
# return await callback(url)
|
||||
|
||||
logger = Logger(__name__)
|
||||
|
||||
async def probe_page(url, document=None):
|
||||
"""
|
||||
|
@ -130,25 +134,43 @@ async def probe_page(url, document=None):
|
|||
# tree = etree.fromstring(res[0]) # etree is for xml
|
||||
tree = html.fromstring(document)
|
||||
result = None
|
||||
except:
|
||||
logging.warning("Failed to parse URL as feed for {}.".format(url))
|
||||
result = {'link' : None,
|
||||
'index' : None,
|
||||
'name' : None,
|
||||
'code' : None,
|
||||
'error' : True,
|
||||
'exist' : None}
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
try:
|
||||
# /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
|
||||
# xml = html.fromstring(document.encode('utf-8'))
|
||||
# parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
|
||||
# tree = fromstring(xml, parser=parser)
|
||||
|
||||
# /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
||||
#tree = html.fromstring(bytes(document, encoding='utf8'))
|
||||
|
||||
# https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
|
||||
#parser = etree.XMLParser(recover=True)
|
||||
#tree = etree.fromstring(document, parser)
|
||||
|
||||
tree = html.fromstring(document.encode('utf-8'))
|
||||
result = None
|
||||
except Exception as e:
|
||||
logger.error(str(e))
|
||||
logger.warning("Failed to parse URL as feed for {}.".format(url))
|
||||
result = {'link' : None,
|
||||
'index' : None,
|
||||
'name' : None,
|
||||
'code' : None,
|
||||
'error' : True,
|
||||
'exist' : None}
|
||||
if not result:
|
||||
logging.debug("Feed auto-discovery engaged for {}".format(url))
|
||||
logger.debug("Feed auto-discovery engaged for {}".format(url))
|
||||
result = await feed_mode_auto_discovery(url, tree)
|
||||
if not result:
|
||||
logging.debug("Feed link scan mode engaged for {}".format(url))
|
||||
logger.debug("Feed link scan mode engaged for {}".format(url))
|
||||
result = await feed_mode_scan(url, tree)
|
||||
if not result:
|
||||
logging.debug("Feed arbitrary mode engaged for {}".format(url))
|
||||
logger.debug("Feed arbitrary mode engaged for {}".format(url))
|
||||
result = await feed_mode_guess(url, tree)
|
||||
if not result:
|
||||
logging.debug("No feeds were found for {}".format(url))
|
||||
logger.debug("No feeds were found for {}".format(url))
|
||||
result = None
|
||||
return result
|
||||
|
||||
|
@ -411,4 +433,4 @@ async def process_feed_selection(url, urls):
|
|||
# return response
|
||||
|
||||
# url = "https://www.smh.com.au/rssheadlines"
|
||||
# start(url)
|
||||
# start(url)
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
__version__ = '0.1.73'
|
||||
__version_info__ = (0, 1, 73)
|
||||
__version__ = '0.1.74'
|
||||
__version_info__ = (0, 1, 74)
|
||||
|
|
|
@ -52,9 +52,9 @@ from slixfeed.config import Config
|
|||
import slixfeed.crawl as crawl
|
||||
import slixfeed.dt as dt
|
||||
import slixfeed.fetch as fetch
|
||||
from slixfeed.log import Logger
|
||||
import slixfeed.sqlite as sqlite
|
||||
import slixfeed.url as uri
|
||||
from slixfeed.log import Logger
|
||||
from slixfeed.version import __version__
|
||||
from slixfeed.xmpp.bookmark import XmppBookmark
|
||||
from slixfeed.xmpp.chat import Chat
|
||||
|
@ -1140,8 +1140,7 @@ class XmppClient(slixmpp.ClientXMPP):
|
|||
if not result['error']:
|
||||
document = result['content']
|
||||
feed = parse(document)
|
||||
# if is_feed(url, feed):
|
||||
if action.is_feed(feed):
|
||||
if action.is_feed(url, feed):
|
||||
form['instructions'] = 'Select entries to publish.'
|
||||
options = form.add_field(desc='Select entries to post.',
|
||||
ftype='list-multi',
|
||||
|
|
|
@ -754,8 +754,7 @@ class XmppCommands:
|
|||
if not result['error']:
|
||||
document = result['content']
|
||||
feed = parse(document)
|
||||
# if is_feed(url, feed):
|
||||
if action.is_feed(feed):
|
||||
if action.is_feed(url, feed):
|
||||
message = action.view_feed(url, feed)
|
||||
break
|
||||
else:
|
||||
|
@ -791,8 +790,7 @@ class XmppCommands:
|
|||
document = result['content']
|
||||
status = result['status_code']
|
||||
feed = parse(document)
|
||||
# if is_feed(url, feed):
|
||||
if action.is_feed(feed):
|
||||
if action.is_feed(url, feed):
|
||||
message = action.view_entry(url, feed, num)
|
||||
break
|
||||
else:
|
||||
|
|
Loading…
Reference in a new issue