#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
2024-02-10 18:53:53 +01:00
FIXME
1 ) https : / / wiki . pine64 . org
File " /slixfeed/crawl.py " , line 178 , in feed_mode_guess
address = join_url ( url , parted_url . path . split ( ' / ' ) [ 1 ] + path )
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ^ ^ ^
IndexError : list index out of range
2024-01-04 02:16:24 +01:00
TODO
2024-02-04 18:08:12 +01:00
1.1 ) Attempt to scan more paths : / blog / , / news / etc . , including root /
Attempt to scan sub domains
https : / / esmailelbob . xyz / en /
https : / / blog . esmailelbob . xyz / feed /
2024-01-04 02:16:24 +01:00
2024-02-04 18:08:12 +01:00
1.2 ) Consider utilizing fetch . http_response
2024-01-06 23:03:08 +01:00
2 ) Consider merging with module fetch . py
2024-01-04 02:16:24 +01:00
2024-02-04 18:08:12 +01:00
FEEDS CRAWLER PROJECT
2024-01-20 18:28:31 +01:00
3 ) Mark redirects for manual check
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / atom . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / feed . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / feeds / rss / news . xml . php
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / jekyll / feed . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / news . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / news . xml . php
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / rdf . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / rss . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / videos . xml
2024-01-04 02:16:24 +01:00
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit

# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
#         callback, url, document, num=None, db_file=None):
#     result = None
#     try:
#         # tree = etree.fromstring(res[0]) # etree is for xml
#         tree = html.fromstring(document)
#     except:
#         result = (
#             "> {}\nFailed to parse URL as feed."
#             ).format(url)
#     if not result:
#         print("RSS Auto-Discovery Engaged")
#         result = await feed_mode_auto_discovery(url, tree)
#     if not result:
#         print("RSS Scan Mode Engaged")
#         result = await feed_mode_scan(url, tree)
#     if not result:
#         print("RSS Arbitrary Mode Engaged")
#         result = await feed_mode_request(url, tree)
#     if not result:
#         result = (
#             "> {}\nNo news feeds were found for URL."
#             ).format(url)
#     # elif msg:
#     else:
#         if isinstance(result, str):
#             return result
#         elif isinstance(result, list):
#             url = result[0]
#             if db_file:
#                 # print("if db_file", db_file)
#                 return await callback(db_file, url)
#             elif num:
#                 return await callback(url, num)
#             else:
#                 return await callback(url)

logger = Logger(__name__)


async def probe_page(url, document=None):
    """
    Probe a page for web feeds.

    Parameters
    ----------
    url : str
        URL.
    document : str, optional
        Document (HTML) of the URL. Fetched if not provided.

    Returns
    -------
    result : dict or list
        A single feed as dict, a selection of feeds as list of
        dicts, or None.
    """
    if not document:
        response = await fetch.http(url)
        if not response['error']:
            document = response['content']
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(document)
        result = None
    except Exception as e:
        logger.error(str(e))
        try:
            # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
            # xml = html.fromstring(document.encode('utf-8'))
            # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            # tree = fromstring(xml, parser=parser)
            # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
            #tree = html.fromstring(bytes(document, encoding='utf8'))
            # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
            #parser = etree.XMLParser(recover=True)
            #tree = etree.fromstring(document, parser)
            tree = html.fromstring(document.encode('utf-8'))
            result = None
        except Exception as e:
            logger.error(str(e))
            logger.warn("Failed to parse URL as feed for {}.".format(url))
            result = {'link': None,
                      'index': None,
                      'name': None,
                      'code': None,
                      'error': True,
                      'exist': None}
    if not result:
        logger.debug("Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
        logger.debug("Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
        logger.debug("Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logger.debug("No feeds were found for {}".format(url))
        result = None
    return result


# TODO Improve scan by gradual decreasing of path
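# NOTE A minimal sketch of the gradual path decrease mentioned in the TODO
#      above; it is not called by probe_page, and the function name is an
#      assumption.  Given https://example.tld/a/b/c it would probe the
#      configured pathnames against /a/b, /a and finally the root, and
#      reuse process_feed_selection for validation.
async def feed_mode_guess_upward(url):
    paths = config.open_config_file("lists.toml")["pathnames"]
    parted_url = urlsplit(url)
    segments = [segment for segment in parted_url.path.split('/') if segment]
    urls = []
    # Walk the path upward, one segment at a time
    while segments:
        segments.pop()
        base = '/' + '/'.join(segments)
        for path in paths:
            if base == '/' and not path.startswith('/'):
                # Extension-style entries (e.g. ".rss") are skipped at root
                continue
            address = urlunsplit((parted_url.scheme, parted_url.netloc,
                                  base.rstrip('/') + path, '', ''))
            if address not in urls:
                urls.append(address)
    return await process_feed_selection(url, urls)

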
async def feed_mode_guess(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    result : dict or list
        A single feed as dict, a selection of feeds as list of
        dicts, or None.
    """
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Check whether URL has a path segment (i.e. not root)
    # Check the number of segments to avoid IndexError in case
    # root or a path without a leading slash was given
    # TODO Make more tests
    path_segments = parted_url.path.split('/')
    first_segment = path_segments[1] if len(path_segments) > 1 else ''
    if first_segment and '.rss' not in paths:
        paths.extend([".atom", ".feed", ".rdf", ".rss"])
    # if paths.index('.rss'):
    #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
    for path in paths:
        address = join_url(url, first_segment + path)
        if address not in urls:
            urls.extend([address])
    # breakpoint()
    # print("feed_mode_guess")
    urls = await process_feed_selection(url, urls)
    return urls


async def feed_mode_scan(url, tree):
    """
    Scan page for potential feeds by pathname.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    result : dict or list
        A single feed as dict, a selection of feeds as list of
        dicts, or None.
    """
    urls = []
    paths = config.open_config_file("lists.toml")["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = (
            "(//a[contains(@href, '{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = (
            "(//a[contains(@href, '{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should number of addresses be limited or
        # perhaps be N from the start and N from the end
        for address in addresses:
            address = join_url(url, address.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
    # breakpoint()
    # print("feed_mode_scan")
    urls = await process_feed_selection(url, urls)
    return urls


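# NOTE A minimal sketch for TODO item 1.1 in the module docstring (see the
#      blog.esmailelbob.xyz example given there): probe a few common
#      feed-related subdomains of the given host.  The subdomain and path
#      candidates as well as the function name are assumptions; it is not
#      wired into probe_page.
async def feed_mode_guess_subdomain(url):
    parted_url = urlsplit(url)
    host = parted_url.netloc
    # Treat "www.example.tld" and "example.tld" alike
    if host.startswith('www.'):
        host = host[len('www.'):]
    urls = []
    for subdomain in ('blog', 'news', 'feeds', 'rss'):
        for path in ('/', '/feed/', '/rss.xml', '/atom.xml'):
            address = urlunsplit((parted_url.scheme,
                                  '{}.{}'.format(subdomain, host),
                                  path, '', ''))
            if address not in urls:
                urls.append(address)
    return await process_feed_selection(url, urls)

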
async def feed_mode_auto_discovery(url, tree):
    """
    Lookup for feeds using RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    result : dict or list
        A single feed as dict, a selection of feeds as list of
        dicts, or None.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if feeds:
        urls = []
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
            # # only valid resources (i.e. not 404);
            # # The following code requires more bandwidth.
            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            address = join_url(url, feed.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_auto_discovery")
        urls = await process_feed_selection(url, urls)
        return urls


# TODO Segregate function into function that returns
# URLs (string) and Feeds (dict) and function that
# composes text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
    feeds = {}
    for i in urls:
        result = await fetch.http(i)
        if not result['error']:
            document = result['content']
            status_code = result['status_code']
            if status_code == 200: # NOTE This line might be redundant
                try:
                    feeds[i] = [parse(document), status_code]
                except Exception:
                    continue
    message = (
        "Web feeds found for {}\n\n```\n"
        ).format(url)
    urls = []
    for feed_url in feeds:
        # try:
        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
        if "title" in feeds[feed_url][0]["feed"].keys():
            feed_name = feeds[feed_url][0].feed.title
        feed_name = feed_name if feed_name else "Untitled"
        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
        # AttributeError: 'str' object has no attribute 'entries'
        if "entries" in feeds[feed_url][0].keys():
            feed_amnt = feeds[feed_url][0].entries
        else:
            continue
        if feed_amnt:
            # NOTE Because there could be many false positives
            #      which are revealed in second phase of scan, we
            #      could end with a single feed, which would be
            #      listed instead of fetched, so feed_url_mark is
            #      utilized in order to make fetch possible.
            # NOTE feed_url_mark was a variable which stored
            #      single URL (probably first accepted as valid)
            #      in order to get an indication whether a single
            #      URL has been fetched, so that the receiving
            #      function will scan that single URL instead of
            #      listing it as a message.
            url = {'link': feed_url,
                   'index': None,
                   'name': feed_name,
                   'code': feeds[feed_url][1],
                   'error': False,
                   'exist': None}
            urls.extend([url])
    count = len(urls)
    if count > 1:
        result = urls
    elif count:
        result = urls[0]
    else:
        result = None
    return result


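# NOTE A minimal sketch for TODO item 3 in the module docstring (mark
#      redirects for manual check).  It only groups the dictionaries
#      produced by process_feed_selection by feed name; a name shared by
#      several discovered URLs (as in the JSON Feed example above) is a
#      likely redirect to one and the same resource.  The function name
#      and the returned mapping are assumptions.
def mark_suspected_redirects(feeds):
    names = {}
    for feed in feeds:
        names.setdefault(feed['name'], []).append(feed['link'])
    # Keep only names that are shared by more than one URL
    return {name: links for name, links in names.items() if len(links) > 1}

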
# def get_discovered_feeds(url, urls):
#     message = (
#         "Found {} web feeds:\n\n```\n"
#         ).format(len(urls))
#     if len(urls) > 1:
#         for urls in urls:
#             message += (
#                 "Title : {}\n"
#                 "Link : {}\n"
#                 "\n"
#                 ).format(url, url.title)
#         message += (
#             "```\nThe above feeds were extracted from\n{}"
#             ).format(url)
#     elif len(urls) > 0:
#         result = urls
#     else:
#         message = (
#             "No feeds were found for {}"
#             ).format(url)
#     return result

# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'

# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection

# async def start(url):
#     while True:
#         result = await fetch.http(url)
#         document = result[0]
#         status = result[1]
#         if document:
#             feed = parse(document)
#             if is_feed(feed):
#                 print(url)
#             else:
#                 urls = await probe_page(
#                     url, document)
#                 if len(urls) > 1:
#                     await process_feed_selection(urls)
#                 elif urls:
#                     url = urls[0]
#         else:
#             response = (
#                 "> {}\nFailed to load URL. Reason: {}"
#                 ).format(url, status)
#             break
#     return response

# url = "https://www.smh.com.au/rssheadlines"
# start(url)
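# NOTE The commented call above invokes the coroutine start(url) without an
#      event loop; once the import issue noted in the TODO is solved (for
#      instance by running the module from the project root), it would have
#      to be driven with asyncio, roughly as follows.  This is a suggestion,
#      not an active part of the module.
# import asyncio
# print(asyncio.run(start(url)))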