#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""

FIXME

1) https://wiki.pine64.org

   File "/slixfeed/crawl.py", line 178, in feed_mode_guess
     address = join_url(url, parted_url.path.split('/')[1] + path)
               ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
   IndexError: list index out of range

TODO

1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /

     Attempt to scan subdomains, e.g.:
     https://esmailelbob.xyz/en/
     https://blog.esmailelbob.xyz/feed/

     (hedged sketches, feed_mode_guess_deep and feed_mode_guess_subdomain,
     are drafted further below)

1.2) Consider utilizing fetch.http_response

2) Consider merging with module fetch.py

FEEDS CRAWLER PROJECT

3) Mark redirects for manual check

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml

Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml

"""

from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit

# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
#         callback, url, document, num=None, db_file=None):
#     result = None
#     try:
#         # tree = etree.fromstring(res[0]) # etree is for xml
#         tree = html.fromstring(document)
#     except:
#         result = (
#             "> {}\nFailed to parse URL as feed."
#             ).format(url)
#     if not result:
#         print("RSS Auto-Discovery Engaged")
#         result = await feed_mode_auto_discovery(url, tree)
#     if not result:
#         print("RSS Scan Mode Engaged")
#         result = await feed_mode_scan(url, tree)
#     if not result:
#         print("RSS Arbitrary Mode Engaged")
#         result = await feed_mode_request(url, tree)
#     if not result:
#         result = (
#             "> {}\nNo news feeds were found for URL."
#             ).format(url)
#     # elif msg:
#     else:
#         if isinstance(result, str):
#             return result
#         elif isinstance(result, list):
#             url = result[0]
#             if db_file:
#                 # print("if db_file", db_file)
#                 return await callback(db_file, url)
#             elif num:
#                 return await callback(url, num)
#             else:
#                 return await callback(url)


logger = Logger(__name__)

async def probe_page(url, document=None):
    """
    Parse a page as HTML and try, in turn, feed auto-discovery,
    feed link scan and feed pathname guessing.

    Parameters
    ----------
    url : str
        URL.
    document : str, optional
        HTML document of the page. When omitted, the page is fetched.

    Returns
    -------
    result : dict or list or None
        A single feed as dict, a selection of feeds as a list of dicts,
        or None when no feed was found.
    """
    if not document:
        response = await fetch.http(url)
        if not response['error']:
            document = response['content']
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(document)
        result = None
    except Exception as e:
        logger.error(str(e))
        try:
            # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
            # xml = html.fromstring(document.encode('utf-8'))
            # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            # tree = fromstring(xml, parser=parser)

            # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
            # tree = html.fromstring(bytes(document, encoding='utf8'))

            # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
            # parser = etree.XMLParser(recover=True)
            # tree = etree.fromstring(document, parser)

            tree = html.fromstring(document.encode('utf-8'))
            result = None
        except Exception as e:
            logger.error(str(e))
            logger.warning("Failed to parse URL as feed for {}.".format(url))
            result = {'link' : None,
                      'index' : None,
                      'name' : None,
                      'code' : None,
                      'error' : True,
                      'exist' : None}
    if not result:
        logger.debug("Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
        logger.debug("Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
        logger.debug("Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logger.debug("No feeds were found for {}".format(url))
        result = None
    return result

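# NOTE Hedged usage sketch: the helper below only illustrates how a
#      caller might branch on the three return shapes of probe_page
#      (dict, list of dicts, or None), as implemented above. The name
#      print_probe_result is an illustrative assumption and is not part
#      of the existing slixfeed API.
#      Example: print_probe_result(await probe_page('https://www.jsonfeed.org'))
def print_probe_result(result):
    if result is None:
        print('No feed was found.')
    elif isinstance(result, list):
        # Several candidate feeds; let the user pick one.
        for feed in result:
            print('{} <{}>'.format(feed['name'], feed['link']))
    elif result['error']:
        print('Failed to parse page as feed.')
    else:
        # A single valid feed was found.
        print('Found {} <{}>'.format(result['name'], result['link']))

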
# TODO Improve scan by gradual decreasing of path
#      (see the hedged feed_mode_guess_deep sketch below)
async def feed_mode_guess(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page (currently unused).

    Returns
    -------
    urls : dict or list or None
        A single feed as dict, a selection of feeds as a list of dicts,
        or None when no feed was found.
    """
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Check whether URL has path (i.e. not root)
    # Check parted_url.path to avoid error in case root wasn't given
    # TODO Make more tests
    if parted_url.path and parted_url.path.split('/')[1]:
        if '.rss' not in paths:
            paths.extend([".atom", ".feed", ".rdf", ".rss"])
        # if paths.index('.rss'):
        #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
    parted_url_path = parted_url.path if parted_url.path else '/'
    for path in paths:
        address = join_url(url, parted_url_path.split('/')[1] + path)
        if address not in urls:
            urls.extend([address])
    # breakpoint()
    # print("feed_mode_guess")
    urls = await process_feed_selection(url, urls)
    return urls


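# NOTE Hedged sketch for the TODO above ("gradual decreasing of path"):
#      it walks from the deepest path segment up to the root and joins
#      every known pathname at each level, so https://example.org/en/news/
#      would also be probed at /en/ and at /. The helper name is an
#      assumption; nothing calls this function yet, and tree is accepted
#      only for symmetry with the other feed_mode_* helpers.
async def feed_mode_guess_deep(url, tree):
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # E.g. '/en/news/article' -> ['en', 'news', 'article']
    segments = [segment for segment in parted_url.path.split('/') if segment]
    while True:
        base = ('/' + '/'.join(segments)) if segments else ''
        for path in paths:
            # Pathnames may be directory-like ('/feed') or
            # extension-like ('.rss'); both are appended to the
            # current base, as feed_mode_guess does above.
            address = urlunsplit((parted_url.scheme, parted_url.netloc,
                                  base + path, '', ''))
            if address not in urls:
                urls.extend([address])
        if not segments:
            break
        segments.pop()
    return await process_feed_selection(url, urls)

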
async def feed_mode_scan(url, tree):
    """
    Scan page for potential feeds by pathname.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        A single feed as dict, a selection of feeds as a list of dicts,
        or None when no feed was found.
    """
    urls = []
    paths = config.open_config_file("lists.toml")["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should number of addresses be limited or
        #      perhaps be N from the start and N from the end
        for address in addresses:
            address = join_url(url, address.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
    # breakpoint()
    # print("feed_mode_scan")
    urls = await process_feed_selection(url, urls)
    return urls


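# NOTE Hedged sketch for TODO item 1.1 in the module docstring ("Attempt
#      to scan subdomains"): it rewrites the hostname with a few common
#      feed-hosting subdomains (e.g. blog.esmailelbob.xyz for
#      esmailelbob.xyz) and hands the candidates to
#      process_feed_selection. The helper name and the subdomain list
#      are assumptions; nothing calls this function yet.
async def feed_mode_guess_subdomain(url, tree):
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Reduce www.example.org (and a possible port) to the bare domain.
    hostname = parted_url.netloc.split(':')[0]
    if hostname.startswith('www.'):
        hostname = hostname[len('www.'):]
    for subdomain in ('blog', 'news', 'feeds', 'rss'):
        netloc = '{}.{}'.format(subdomain, hostname)
        for path in paths:
            # Only directory-like pathnames ('/feed') are meaningful at
            # the root of a guessed subdomain; skip '.rss' and friends.
            if not path.startswith('/'):
                continue
            address = urlunsplit((parted_url.scheme, netloc, path, '', ''))
            if address not in urls:
                urls.extend([address])
    return await process_feed_selection(url, urls)

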
async def feed_mode_auto_discovery(url, tree):
    """
    Lookup for feeds using RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        A single feed as dict, a selection of feeds as a list of dicts,
        or None when no feed was found.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if feeds:
        urls = []
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
            # # only valid resources (i.e. not 404);
            # # The following code requires more bandwidth.
            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)

            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])

            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            address = join_url(url, feed.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_auto_discovery")
        urls = await process_feed_selection(url, urls)
        return urls


# TODO Segregate function into function that returns
#      URLs (string) and Feeds (dict) and function that
#      composes text message (string).
#      Maybe that's not necessary.
async def process_feed_selection(url, urls):
    feeds = {}
    for i in urls:
        result = await fetch.http(i)
        if not result['error']:
            document = result['content']
            status_code = result['status_code']
            if status_code == 200:  # NOTE This line might be redundant
                try:
                    # Store the parsed document together with its status
                    # code, so that the code reported below always
                    # belongs to this particular URL.
                    feeds[i] = [parse(document), status_code]
                except Exception:
                    continue
    message = (
        "Web feeds found for {}\n\n```\n"
        ).format(url)
    urls = []
    for feed_url in feeds:
        # try:
        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
        if "title" in feeds[feed_url][0]["feed"].keys():
            feed_name = feeds[feed_url][0].feed.title
        feed_name = feed_name if feed_name else "Untitled"
        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
        # AttributeError: 'str' object has no attribute 'entries'
        if "entries" in feeds[feed_url][0].keys():
            feed_amnt = feeds[feed_url][0].entries
        else:
            continue
        if feed_amnt:
            # NOTE Because there could be many false positives
            #      which are revealed in second phase of scan, we
            #      could end with a single feed, which would be
            #      listed instead of fetched, so feed_url_mark is
            #      utilized in order to make fetch possible.
            # NOTE feed_url_mark was a variable which stored
            #      single URL (probably first accepted as valid)
            #      in order to get an indication whether a single
            #      URL has been fetched, so that the receiving
            #      function will scan that single URL instead of
            #      listing it as a message.
            url = {'link' : feed_url,
                   'index' : None,
                   'name' : feed_name,
                   'code' : feeds[feed_url][1],
                   'error' : False,
                   'exist' : None}
            urls.extend([url])
    count = len(urls)
    if count > 1:
        result = urls
    elif count:
        result = urls[0]
    else:
        result = None
    return result


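# NOTE Hedged sketch for TODO item 3 in the module docstring ("Mark
#      redirects for manual check"): guessed pathnames frequently
#      resolve to one and the same document (see the jsonfeed.org
#      examples above), so this helper re-fetches each accepted link,
#      groups links by response payload, and reports every link beyond
#      the first of each group as a candidate for manual review. The
#      helper name and its return value (a list of suspect URLs) are
#      assumptions; nothing calls it yet.
async def find_duplicate_feeds(feeds):
    suspects = []
    if not feeds:
        return suspects
    seen = {}
    for feed in (feeds if isinstance(feeds, list) else [feeds]):
        result = await fetch.http(feed['link'])
        if result['error']:
            continue
        document = result['content']
        if document in seen:
            # Same payload as an earlier link: most likely the same
            # feed served under several guessed pathnames.
            suspects.append(feed['link'])
        else:
            seen[document] = feed['link']
    return suspects

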
# def get_discovered_feeds(url, urls):
#     message = (
#         "Found {} web feeds:\n\n```\n"
#         ).format(len(urls))
#     if len(urls) > 1:
#         for urls in urls:
#             message += (
#                 "Title : {}\n"
#                 "Link : {}\n"
#                 "\n"
#                 ).format(url, url.title)
#         message += (
#             "```\nThe above feeds were extracted from\n{}"
#             ).format(url)
#     elif len(urls) > 0:
#         result = urls
#     else:
#         message = (
#             "No feeds were found for {}"
#             ).format(url)
#     return result


# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection

# async def start(url):
#     while True:
#         result = await fetch.http(url)
#         document = result[0]
#         status = result[1]
#         if document:
#             feed = parse(document)
#             if is_feed(feed):
#                 print(url)
#             else:
#                 urls = await probe_page(
#                     url, document)
#                 if len(urls) > 1:
#                     await process_feed_selection(urls)
#                 elif urls:
#                     url = urls[0]
#         else:
#             response = (
#                 "> {}\nFailed to load URL. Reason: {}"
#                 ).format(url, status)
#             break
#     return response

# url = "https://www.smh.com.au/rssheadlines"
# start(url)
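
# NOTE Hedged sketch addressing the ModuleNotFoundError noted above: a
#      plain `python crawl.py` cannot resolve the slixfeed package,
#      whereas running the file as a module from the project root
#      usually can, e.g. `python -m slixfeed.crawl`. The entry point
#      below simply probes one hard-coded URL (taken from the commented
#      test above) and prints the outcome; it is an illustrative
#      assumption, not an established slixfeed CLI.
if __name__ == '__main__':
    import asyncio

    async def _demo(url):
        # probe_page returns a dict, a list of dicts, or None.
        result = await probe_page(url)
        print(result)

    asyncio.run(_demo("https://www.smh.com.au/rssheadlines"))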