#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1) https://wiki.pine64.org
File "/slixfeed/crawl.py", line 178, in feed_mode_guess
address = join_url(url, parted_url.path.split('/')[1] + path)
~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range
TODO
1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /
Attempt to scan sub domains
https://esmailelbob.xyz/en/
https://blog.esmailelbob.xyz/feed/
1.2) Consider utilizing fetch.http_response
2) Consider merging with module fetch.py
3) Mark redirects for manual check
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit


# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
#         callback, url, document, num=None, db_file=None):
#     result = None
#     try:
#         # tree = etree.fromstring(res[0]) # etree is for xml
#         tree = html.fromstring(document)
#     except:
#         result = (
#             "> {}\nFailed to parse URL as feed."
#             ).format(url)
#     if not result:
#         print("RSS Auto-Discovery Engaged")
#         result = await feed_mode_auto_discovery(url, tree)
#     if not result:
#         print("RSS Scan Mode Engaged")
#         result = await feed_mode_scan(url, tree)
#     if not result:
#         print("RSS Arbitrary Mode Engaged")
#         result = await feed_mode_request(url, tree)
#     if not result:
#         result = (
#             "> {}\nNo news feeds were found for URL."
#             ).format(url)
#     # elif msg:
#     else:
#         if isinstance(result, str):
#             return result
#         elif isinstance(result, list):
#             url = result[0]
#             if db_file:
#                 # print("if db_file", db_file)
#                 return await callback(db_file, url)
#             elif num:
#                 return await callback(url, num)
#             else:
#                 return await callback(url)


logger = Logger(__name__)


async def probe_page(url, document=None):
    """
    Probe a page for web feeds.

    Parameters
    ----------
    url : str
        URL.
    document : str, optional
        Markup document. When not given, it is fetched from url.

    Returns
    -------
    result : dict or list or None
        A single feed as a dict, several candidate feeds as a list of
        dicts, or None when no feed was found.
    """
    if not document:
        response = await fetch.http(url)
        if not response['error']:
            document = response['content']
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(document)
        result = None
    except Exception as e:
        logger.error(str(e))
        try:
            # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
            # xml = html.fromstring(document.encode('utf-8'))
            # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            # tree = fromstring(xml, parser=parser)
            # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
            # tree = html.fromstring(bytes(document, encoding='utf8'))
            # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
            # parser = etree.XMLParser(recover=True)
            # tree = etree.fromstring(document, parser)
            tree = html.fromstring(document.encode('utf-8'))
            result = None
        except Exception as e:
            logger.error(str(e))
            logger.warning("Failed to parse URL as feed for {}.".format(url))
            result = {'link': None,
                      'index': None,
                      'name': None,
                      'code': None,
                      'error': True,
                      'exist': None}
    if not result:
        logger.debug("Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
        logger.debug("Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
        logger.debug("Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logger.debug("No feeds were found for {}".format(url))
        result = None
    return result
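

# NOTE A usage sketch (an assumption for illustration, not part of the module):
# probe_page returns either a dict describing a single feed, a list of such
# dicts when several candidates validate, or None.
#
# import asyncio
#
# async def demo():
#     result = await probe_page('https://www.jsonfeed.org')
#     if isinstance(result, list):
#         for feed in result:
#             print(feed['name'], feed['link'])
#     elif isinstance(result, dict) and not result['error']:
#         print('Single feed:', result['name'], result['link'])
#
# asyncio.run(demo())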


# TODO Improve scan by gradual decreasing of path
async def feed_mode_guess(url, tree):
    """
    Look up feeds by pathname using HTTP requests.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree (not used by this mode).

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection.
    """
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Check whether URL has a path (i.e. not root)
    # Check parted_url.path to avoid error in case root wasn't given
    # TODO Make more tests
    if parted_url.path and parted_url.path.split('/')[1]:
        if '.rss' not in paths:
            paths.extend([".atom", ".feed", ".rdf", ".rss"])
    # Fall back to root when no path was given
    parted_url_path = parted_url.path if parted_url.path else '/'
    for path in paths:
        address = join_url(url, parted_url_path.split('/')[1] + path)
        if address not in urls:
            urls.extend([address])
    # breakpoint()
    # print("feed_mode_guess")
    urls = await process_feed_selection(url, urls)
    return urls
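

# NOTE A minimal sketch (hypothetical values) of how the candidate names probed
# above are derived from the first path segment; join_url then resolves them
# against the page URL.
#
# from urllib.parse import urlsplit
#
# url = 'https://example.org/blog/post'       # hypothetical page
# paths = ['/feed/', '/rss.xml']              # hypothetical pathnames list
# segment = urlsplit(url).path.split('/')[1]  # -> 'blog'
# print([segment + path for path in paths])   # ['blog/feed/', 'blog/rss.xml']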


async def feed_mode_scan(url, tree):
    """
    Scan a page for potential feeds by pathname.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection.
    """
    urls = []
    paths = config.open_config_file("lists.toml")["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should the number of addresses be limited, or
        # perhaps taken as N from the start and N from the end?
        for address in addresses:
            address = join_url(url, address.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
    # breakpoint()
    # print("feed_mode_scan")
    urls = await process_feed_selection(url, urls)
    return urls
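

# NOTE A small sketch (hypothetical markup) of the XPath pattern used above:
# the first query keeps at most the first five matching anchors and the second
# keeps at most the last five.
#
# from lxml import html as lxml_html
#
# anchors = ''.join('<a href="/feed/{}">feed {}</a>'.format(i, i)
#                   for i in range(12))
# page = lxml_html.fromstring('<html><body>' + anchors + '</body></html>')
# first = page.xpath("(//a[contains(@href,'/feed/')])[position()<=5]")
# last = page.xpath("(//a[contains(@href,'/feed/')])[position()>last()-5]")
# print(len(first), len(last))  # 5 5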


async def feed_mode_auto_discovery(url, tree):
    """
    Look up feeds using the RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection, or
        None when the page advertises no feeds.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if feeds:
        urls = []
        for feed in feeds:
            # # The following code works;
            # # it catches only valid resources (i.e. not 404),
            # # but it requires more bandwidth.
            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            address = join_url(url, feed.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_auto_discovery")
        urls = await process_feed_selection(url, urls)
        return urls
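

# NOTE The <link> elements matched by the query above typically look like the
# following (illustrative markup per the RSS autodiscovery convention):
#
#     <link rel="alternate" type="application/rss+xml"
#           title="Example Feed" href="/feed.xml"/>
#     <link rel="alternate" type="application/atom+xml"
#           title="Example Feed (Atom)" href="/atom.xml"/>
#
# Relative href values are resolved against the page URL by join_url.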


# TODO Segregate this function into a function that returns
# URLs (string) and feeds (dict), and a function that
# composes a text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
    """
    Fetch and validate candidate feed URLs, and return the valid ones.
    """
    feeds = {}
    for i in urls:
        result = await fetch.http(i)
        if not result['error']:
            document = result['content']
            status_code = result['status_code']
            if status_code == 200: # NOTE This line might be redundant
                try:
                    # Store the parsed document together with the status
                    # code of its own response, reported per feed below.
                    feeds[i] = [parse(document), status_code]
                except Exception:
                    continue
    # NOTE message is currently unused; see the TODO above about
    # composing a text message.
    message = (
        "Web feeds found for {}\n\n```\n"
        ).format(url)
    urls = []
    for feed_url in feeds:
        # try:
        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
        if "title" in feeds[feed_url][0]["feed"].keys():
            feed_name = feeds[feed_url][0].feed.title
        feed_name = feed_name if feed_name else "Untitled"
        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
        # AttributeError: 'str' object has no attribute 'entries'
        if "entries" in feeds[feed_url][0].keys():
            feed_amnt = feeds[feed_url][0].entries
        else:
            continue
        if feed_amnt:
            # NOTE Because there could be many false positives
            # which are revealed in the second phase of the scan, we
            # could end up with a single feed, which would be
            # listed instead of fetched, so feed_url_mark was
            # utilized in order to make fetching possible.
            # NOTE feed_url_mark was a variable which stored a
            # single URL (probably the first accepted as valid)
            # in order to get an indication whether a single
            # URL has been fetched, so that the receiving
            # function would scan that single URL instead of
            # listing it as a message.
            url = {'link': feed_url,
                   'index': None,
                   'name': feed_name,
                   'code': feeds[feed_url][1],
                   'error': False,
                   'exist': None}
            urls.extend([url])
    count = len(urls)
    if count > 1:
        result = urls
    elif count:
        result = urls[0]
    else:
        result = None
    return result
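

# NOTE Shape of the value returned by process_feed_selection (illustrative
# values): a single dict when exactly one candidate validates, a list of such
# dicts when several do, and None when none does.
#
# {'link': 'https://example.org/feed.xml',
#  'index': None,
#  'name': 'Example Feed',
#  'code': 200,
#  'error': False,
#  'exist': None}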


# def get_discovered_feeds(url, urls):
#     message = (
#         "Found {} web feeds:\n\n```\n"
#         ).format(len(urls))
#     if len(urls) > 1:
#         for urls in urls:
#             message += (
#                 "Title : {}\n"
#                 "Link : {}\n"
#                 "\n"
#                 ).format(url, url.title)
#         message += (
#             "```\nThe above feeds were extracted from\n{}"
#             ).format(url)
#     elif len(urls) > 0:
#         result = urls
#     else:
#         message = (
#             "No feeds were found for {}"
#             ).format(url)
#     return result


# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'

# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection

# async def start(url):
#     while True:
#         result = await fetch.http(url)
#         document = result[0]
#         status = result[1]
#         if document:
#             feed = parse(document)
#             if is_feed(feed):
#                 print(url)
#             else:
#                 urls = await probe_page(
#                     url, document)
#                 if len(urls) > 1:
#                     await process_feed_selection(urls)
#                 elif urls:
#                     url = urls[0]
#         else:
#             response = (
#                 "> {}\nFailed to load URL. Reason: {}"
#                 ).format(url, status)
#             break
#     return response

# url = "https://www.smh.com.au/rssheadlines"
# start(url)