Do not disqualify subscriptions due to being not-well-formed
This commit is contained in:
parent
245cd9832a
commit
0a26ac163b
5 changed files with 46 additions and 29 deletions
|
@ -700,7 +700,7 @@ def is_feed_json(document):
|
||||||
return value
|
return value
|
||||||
|
|
||||||
|
|
||||||
def is_feed(feed):
|
def is_feed(url, feed):
|
||||||
"""
|
"""
|
||||||
Determine whether document is feed or not.
|
Determine whether document is feed or not.
|
||||||
|
|
||||||
|
@ -737,10 +737,9 @@ def is_feed(feed):
|
||||||
# "No entries nor title for {}"
|
# "No entries nor title for {}"
|
||||||
# ).format(url)
|
# ).format(url)
|
||||||
elif feed.bozo:
|
elif feed.bozo:
|
||||||
value = False
|
# NOTE Consider the feed valid even when it is not well-formed
|
||||||
# message = (
|
value = True
|
||||||
# "Bozo detected for {}"
|
logger.warning('Bozo detected for {}'.format(url))
|
||||||
# ).format(url)
|
|
||||||
else:
|
else:
|
||||||
value = True
|
value = True
|
||||||
# message = (
|
# message = (
|
||||||
|
@ -984,8 +983,7 @@ async def add_feed(self, jid_bare, db_file, url, identifier):
|
||||||
document = result['content']
|
document = result['content']
|
||||||
feed = parse(document)
|
feed = parse(document)
|
||||||
# if document and status_code == 200:
|
# if document and status_code == 200:
|
||||||
# if is_feed(url, feed):
|
if is_feed(url, feed):
|
||||||
if is_feed(feed):
|
|
||||||
if "title" in feed["feed"].keys():
|
if "title" in feed["feed"].keys():
|
||||||
title = feed["feed"]["title"]
|
title = feed["feed"]["title"]
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -62,9 +62,12 @@ Link : https://www.jsonfeed.org/feed.json/videos.xml
|
||||||
from aiohttp import ClientError, ClientSession, ClientTimeout
|
from aiohttp import ClientError, ClientSession, ClientTimeout
|
||||||
from feedparser import parse
|
from feedparser import parse
|
||||||
import logging
|
import logging
|
||||||
|
from lxml import etree
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
from lxml.etree import fromstring
|
||||||
import slixfeed.config as config
|
import slixfeed.config as config
|
||||||
import slixfeed.fetch as fetch
|
import slixfeed.fetch as fetch
|
||||||
|
from slixfeed.log import Logger
|
||||||
from slixfeed.url import complete_url, join_url, trim_url
|
from slixfeed.url import complete_url, join_url, trim_url
|
||||||
from urllib.parse import urlsplit, urlunsplit
|
from urllib.parse import urlsplit, urlunsplit
|
||||||
|
|
||||||
|
@ -107,6 +110,7 @@ from urllib.parse import urlsplit, urlunsplit
|
||||||
# else:
|
# else:
|
||||||
# return await callback(url)
|
# return await callback(url)
|
||||||
|
|
||||||
|
logger = Logger(__name__)
|
||||||
|
|
||||||
async def probe_page(url, document=None):
|
async def probe_page(url, document=None):
|
||||||
"""
|
"""
|
||||||
|
@ -130,25 +134,43 @@ async def probe_page(url, document=None):
|
||||||
# tree = etree.fromstring(res[0]) # etree is for xml
|
# tree = etree.fromstring(res[0]) # etree is for xml
|
||||||
tree = html.fromstring(document)
|
tree = html.fromstring(document)
|
||||||
result = None
|
result = None
|
||||||
except:
|
except Exception as e:
|
||||||
logging.warning("Failed to parse URL as feed for {}.".format(url))
|
logger.error(str(e))
|
||||||
result = {'link' : None,
|
try:
|
||||||
'index' : None,
|
# /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
|
||||||
'name' : None,
|
# xml = html.fromstring(document.encode('utf-8'))
|
||||||
'code' : None,
|
# parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
|
||||||
'error' : True,
|
# tree = fromstring(xml, parser=parser)
|
||||||
'exist' : None}
|
|
||||||
|
# /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
||||||
|
#tree = html.fromstring(bytes(document, encoding='utf8'))
|
||||||
|
|
||||||
|
# https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
|
||||||
|
#parser = etree.XMLParser(recover=True)
|
||||||
|
#tree = etree.fromstring(document, parser)
|
||||||
|
|
||||||
|
tree = html.fromstring(document.encode('utf-8'))
|
||||||
|
result = None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(str(e))
|
||||||
|
logger.warning("Failed to parse URL as feed for {}.".format(url))
|
||||||
|
result = {'link' : None,
|
||||||
|
'index' : None,
|
||||||
|
'name' : None,
|
||||||
|
'code' : None,
|
||||||
|
'error' : True,
|
||||||
|
'exist' : None}
|
||||||
if not result:
|
if not result:
|
||||||
logging.debug("Feed auto-discovery engaged for {}".format(url))
|
logger.debug("Feed auto-discovery engaged for {}".format(url))
|
||||||
result = await feed_mode_auto_discovery(url, tree)
|
result = await feed_mode_auto_discovery(url, tree)
|
||||||
if not result:
|
if not result:
|
||||||
logging.debug("Feed link scan mode engaged for {}".format(url))
|
logger.debug("Feed link scan mode engaged for {}".format(url))
|
||||||
result = await feed_mode_scan(url, tree)
|
result = await feed_mode_scan(url, tree)
|
||||||
if not result:
|
if not result:
|
||||||
logging.debug("Feed arbitrary mode engaged for {}".format(url))
|
logger.debug("Feed arbitrary mode engaged for {}".format(url))
|
||||||
result = await feed_mode_guess(url, tree)
|
result = await feed_mode_guess(url, tree)
|
||||||
if not result:
|
if not result:
|
||||||
logging.debug("No feeds were found for {}".format(url))
|
logger.debug("No feeds were found for {}".format(url))
|
||||||
result = None
|
result = None
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@ -411,4 +433,4 @@ async def process_feed_selection(url, urls):
|
||||||
# return response
|
# return response
|
||||||
|
|
||||||
# url = "https://www.smh.com.au/rssheadlines"
|
# url = "https://www.smh.com.au/rssheadlines"
|
||||||
# start(url)
|
# start(url)
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
__version__ = '0.1.73'
|
__version__ = '0.1.74'
|
||||||
__version_info__ = (0, 1, 73)
|
__version_info__ = (0, 1, 74)
|
||||||
|
|
|
@ -52,9 +52,9 @@ from slixfeed.config import Config
|
||||||
import slixfeed.crawl as crawl
|
import slixfeed.crawl as crawl
|
||||||
import slixfeed.dt as dt
|
import slixfeed.dt as dt
|
||||||
import slixfeed.fetch as fetch
|
import slixfeed.fetch as fetch
|
||||||
|
from slixfeed.log import Logger
|
||||||
import slixfeed.sqlite as sqlite
|
import slixfeed.sqlite as sqlite
|
||||||
import slixfeed.url as uri
|
import slixfeed.url as uri
|
||||||
from slixfeed.log import Logger
|
|
||||||
from slixfeed.version import __version__
|
from slixfeed.version import __version__
|
||||||
from slixfeed.xmpp.bookmark import XmppBookmark
|
from slixfeed.xmpp.bookmark import XmppBookmark
|
||||||
from slixfeed.xmpp.chat import Chat
|
from slixfeed.xmpp.chat import Chat
|
||||||
|
@ -1140,8 +1140,7 @@ class XmppClient(slixmpp.ClientXMPP):
|
||||||
if not result['error']:
|
if not result['error']:
|
||||||
document = result['content']
|
document = result['content']
|
||||||
feed = parse(document)
|
feed = parse(document)
|
||||||
# if is_feed(url, feed):
|
if action.is_feed(url, feed):
|
||||||
if action.is_feed(feed):
|
|
||||||
form['instructions'] = 'Select entries to publish.'
|
form['instructions'] = 'Select entries to publish.'
|
||||||
options = form.add_field(desc='Select entries to post.',
|
options = form.add_field(desc='Select entries to post.',
|
||||||
ftype='list-multi',
|
ftype='list-multi',
|
||||||
|
|
|
@ -754,8 +754,7 @@ class XmppCommands:
|
||||||
if not result['error']:
|
if not result['error']:
|
||||||
document = result['content']
|
document = result['content']
|
||||||
feed = parse(document)
|
feed = parse(document)
|
||||||
# if is_feed(url, feed):
|
if action.is_feed(url, feed):
|
||||||
if action.is_feed(feed):
|
|
||||||
message = action.view_feed(url, feed)
|
message = action.view_feed(url, feed)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
@ -791,8 +790,7 @@ class XmppCommands:
|
||||||
document = result['content']
|
document = result['content']
|
||||||
status = result['status_code']
|
status = result['status_code']
|
||||||
feed = parse(document)
|
feed = parse(document)
|
||||||
# if is_feed(url, feed):
|
if action.is_feed(url, feed):
|
||||||
if action.is_feed(feed):
|
|
||||||
message = action.view_entry(url, feed, num)
|
message = action.view_entry(url, feed, num)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in a new issue