#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1 ) feed_mode_scan doesn ' t find feed for https://www.blender.org/
even though it should be according to the pathnames dictionary .
2023-11-23 17:55:36 +01:00
TODO
1 ) Support Gemini and Gopher .
2023-12-26 12:22:45 +01:00
2 ) Check also for HTML , not only feed . bozo .
2023-12-27 23:48:31 +01:00
3 ) Add " if is_feed(url, feed) " to view_entry and view_feed
4 ) Refactor view_entry and view_feed - Why " if " twice ?
2023-11-22 12:47:34 +01:00
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from asyncio import TimeoutError
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from confighandler import get_list, get_value_default
from datetimehandler import now, rfc2822_to_iso8601
from email.utils import parseaddr
from feedparser import parse
from http.client import IncompleteRead
from listhandler import is_listed
from lxml import html
import sqlitehandler as sqlite
from urlhandler import complete_url, join_url, trim_url
from urllib import error
# from xml.etree.ElementTree import ElementTree, ParseError
from urllib.parse import urljoin, urlsplit, urlunsplit


# NOTE Why (if res[0]) and (if res[1] == 200)?
async def download_updates(db_file, url=None):
    """
    Check feeds for new entries.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str, optional
        URL. The default is None.
    """
    if url:
        urls = [url]  # Valid [url] and [url,] and (url,)
    else:
        urls = await sqlite.get_feeds_url(db_file)
    for url in urls:
        # print(os.path.basename(db_file), url[0])
        source = url[0]
        res = await download_feed(source)
        # TypeError: 'NoneType' object is not subscriptable
        if res is None:
            # Skip to next feed
            # urls.next()
            # next(urls)
            continue
        await sqlite.update_source_status(
            db_file,
            res[1],
            source
        )
        if res[0]:
            try:
                feed = parse(res[0])
                if feed.bozo:
                    # bozo = (
                    #     "WARNING: Bozo detected for feed: {}\n"
                    #     "For more information, visit "
                    #     "https://pythonhosted.org/feedparser/bozo.html"
                    #     ).format(source)
                    # print(bozo)
                    valid = 0
                else:
                    valid = 1
                await sqlite.update_source_validity(
                    db_file,
                    source,
                    valid)
            except (
                    IncompleteReadError,
                    IncompleteRead,
                    error.URLError
                    ) as e:
                # print(e)
                # TODO Print error to log
                pass
                # NOTE I don't think there should be "return"
                #      because then we might stop scanning next URLs
                # return
        # TODO Place these couple of lines back down
        # NOTE Need to correct the SQL statement to do so
        # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
        if res[1] == 200:
            entries = feed.entries
            # length = len(entries)
            # await remove_entry(db_file, source, length)
            await sqlite.remove_nonexistent_entries(
                db_file,
                feed,
                source
            )
            # new_entry = 0
            for entry in entries:
                # TODO Pass date too for comparison check
                if entry.has_key("published"):
                    date = entry.published
                    date = rfc2822_to_iso8601(date)
                elif entry.has_key("updated"):
                    date = entry.updated
                    date = rfc2822_to_iso8601(date)
                else:
                    # TODO Just set date = "*** No date ***"
                    # date = await datetime.now().isoformat()
                    date = now()
                    # NOTE Would seconds result in better database performance
                    # date = datetime.datetime(date)
                    # date = (date-datetime.datetime(1970,1,1)).total_seconds()
                if entry.has_key("title"):
                    title = entry.title
                    # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
                else:
                    title = date
                    # title = feed["feed"]["title"]
                if entry.has_key("link"):
                    # link = complete_url(source, entry.link)
                    link = join_url(source, entry.link)
                    link = trim_url(link)
                else:
                    link = source
                if entry.has_key("id"):
                    eid = entry.id
                else:
                    eid = link
                exist = await sqlite.check_entry_exist(
                    db_file,
                    source,
                    eid=eid,
                    title=title,
                    link=link,
                    date=date
                )
                if not exist:
                    # new_entry = new_entry + 1
                    # TODO Enhance summary
                    if entry.has_key("summary"):
                        summary = entry.summary
                        # # Remove HTML tags
                        # summary = BeautifulSoup(summary, "lxml").text
                        # # TODO Limit text length
                        # summary = summary.replace("\n\n\n", "\n\n")
                        # summary = summary[:300] + " […]⃨"
                        # summary = summary.strip().split('\n')
                        # summary = ["> " + line for line in summary]
                        # summary = "\n".join(summary)
                    else:
                        summary = "> *** No summary ***"
                    read_status = 0
                    pathname = urlsplit(link).path
                    string = (
                        "{} {} {}"
                    ).format(
                        title,
                        summary,
                        pathname
                    )
                    allow_list = await is_listed(
                        db_file,
                        "filter-allow",
                        string
                    )
                    if not allow_list:
                        reject_list = await is_listed(
                            db_file,
                            "filter-deny",
                            string
                        )
                        if reject_list:
                            # print(">>> REJECTED", title)
                            summary = (
                                "REJECTED {}".format(
                                    reject_list.upper()
                                )
                            )
                            # summary = ""
                            read_status = 1
                    entry = (
                        title,
                        link,
                        eid,
                        source,
                        date,
                        read_status
                    )
                    if isinstance(date, int):
                        print("PROBLEM: date is int")
                        print(date)
                        # breakpoint()
                        # print(source)
                        # print(date)
                    await sqlite.add_entry_and_set_date(
                        db_file,
                        source,
                        entry
                    )
                    # print(current_time(), entry, title)
                # else:
                #     print(current_time(), exist, title)


# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_feed(url):
    """
    Preview content of given feed URL.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    msg : str
        Feed content or error message.
    """
    result = await download_feed(url)
    if result[0]:
        try:
            feed = parse(result[0])
            if feed.bozo:
                # msg = (
                #     ">{}\n"
                #     "WARNING: Bozo detected!\n"
                #     "For more information, visit "
                #     "https://pythonhosted.org/feedparser/bozo.html"
                #     ).format(url)
                msg = await probe_page(view_feed, url, result[0])
                return msg
        except (
                IncompleteReadError,
                IncompleteRead,
                error.URLError
                ) as e:
            # print(e)
            # TODO Print error to log
            msg = (
                ">{}\n"
                "Error: {}"
            ).format(url, e)
            # breakpoint()
    if result[1] == 200:
        feed = parse(result[0])
        title = get_title(url, feed)
        entries = feed.entries
        msg = "Preview of {}:\n```\n".format(title)
        counter = 0
        for entry in entries:
            counter += 1
            if entry.has_key("title"):
                title = entry.title
            else:
                title = "*** No title ***"
            if entry.has_key("link"):
                # link = complete_url(source, entry.link)
                link = join_url(url, entry.link)
                link = trim_url(link)
            else:
                link = "*** No link ***"
            if entry.has_key("published"):
                date = entry.published
                date = rfc2822_to_iso8601(date)
            elif entry.has_key("updated"):
                date = entry.updated
                date = rfc2822_to_iso8601(date)
            else:
                date = "*** No date ***"
            msg += (
                "Title : {}\n"
                "Date : {}\n"
                "Link : {}\n"
                "Count : {}\n"
                "\n"
            ).format(
                title,
                date,
                link,
                counter
            )
            if counter > 4:
                break
        msg += (
            "```\nSource: {}"
        ).format(url)
    else:
        msg = (
            ">{}\nFailed to load URL. Reason: {}"
        ).format(url, result[1])
    return msg


# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_entry(url, num):
    """
    Preview a single entry of given feed URL.

    Parameters
    ----------
    url : str
        URL.
    num : str or int
        Number of entry to preview.

    Returns
    -------
    msg : str
        Entry content or error message.
    """
    result = await download_feed(url)
    if result[0]:
        try:
            feed = parse(result[0])
            if feed.bozo:
                # msg = (
                #     ">{}\n"
                #     "WARNING: Bozo detected!\n"
                #     "For more information, visit "
                #     "https://pythonhosted.org/feedparser/bozo.html"
                #     ).format(url)
                msg = await probe_page(view_entry, url, result[0], num=num)
                return msg
        except (
                IncompleteReadError,
                IncompleteRead,
                error.URLError
                ) as e:
            # print(e)
            # TODO Print error to log
            msg = (
                ">{}\n"
                "Error: {}"
            ).format(url, e)
            # breakpoint()
    if result[1] == 200:
        feed = parse(result[0])
        title = get_title(url, feed)
        entries = feed.entries
        num = int(num) - 1
        entry = entries[num]
        if entry.has_key("title"):
            title = entry.title
        else:
            title = "*** No title ***"
        if entry.has_key("published"):
            date = entry.published
            date = rfc2822_to_iso8601(date)
        elif entry.has_key("updated"):
            date = entry.updated
            date = rfc2822_to_iso8601(date)
        else:
            date = "*** No date ***"
        if entry.has_key("summary"):
            summary = entry.summary
            # Remove HTML tags
            summary = BeautifulSoup(summary, "lxml").text
            # TODO Limit text length
            summary = summary.replace("\n\n\n", "\n\n")
        else:
            summary = "*** No summary ***"
        if entry.has_key("link"):
            # link = complete_url(source, entry.link)
            link = join_url(url, entry.link)
            link = trim_url(link)
        else:
            link = "*** No link ***"
        msg = (
            "{}\n"
            "\n"
            "> {}\n"
            "\n"
            "{}\n"
            "\n"
        ).format(
            title,
            summary,
            link
        )
    else:
        msg = (
            ">{}\n"
            "Failed to load URL. Reason: {}\n"
            "Try again momentarily."
        ).format(url, result[1])
    return msg


async def add_feed_no_check(db_file, data):
    """
    Add given feed without validity check.

    Parameters
    ----------
    db_file : str
        Path to database file.
    data : str
        URL or URL and Title.

    Returns
    -------
    msg : str
        Status message.
    """
    url = data[0]
    title = data[1]
    url = trim_url(url)
    exist = await sqlite.check_feed_exist(db_file, url)
    if not exist:
        msg = await sqlite.insert_feed(db_file, url, title)
        await download_updates(db_file, [url])
    else:
        ix = exist[0]
        name = exist[1]
        msg = (
            ">{}\nNews source \"{}\" is already "
            "listed in the subscription list at "
            "index {}".format(url, name, ix)
        )
    return msg


async def add_feed(db_file, url):
    """
    Check whether feed exists, otherwise process it.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.

    Returns
    -------
    msg : str
        Status message.
    """
    msg = None
    url = trim_url(url)
    exist = await sqlite.check_feed_exist(db_file, url)
    if not exist:
        res = await download_feed(url)
        if res[0]:
            feed = parse(res[0])
            title = get_title(url, feed)
            if is_feed(url, feed):
                status = res[1]
                msg = await sqlite.insert_feed(
                    db_file,
                    url,
                    title,
                    status
                )
                await download_updates(db_file, [url])
            else:
                msg = await probe_page(
                    add_feed,
                    url,
                    res[0],
                    db_file=db_file
                )
        else:
            status = res[1]
            msg = (
                ">{}\nFailed to load URL. Reason: {}"
            ).format(url, status)
    else:
        ix = exist[0]
        name = exist[1]
        msg = (
            ">{}\nNews source \"{}\" is already "
            "listed in the subscription list at "
            "index {}".format(url, name, ix)
        )
    return msg


# TODO callback for use with add_feed and view_feed
async def probe_page(callback, url, doc, num=None, db_file=None):
    """
    Parse document as HTML, attempt to discover feeds for given URL,
    and pass the result back to the given callback.
    """
    msg = None
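    # Added illustration (assumption, mirroring the call sites in this
    # module): the function that received the URL passes itself in as the
    # callback, so that a discovered feed address can be fed back to it, e.g.
    #
    #     msg = await probe_page(add_feed, url, res[0], db_file=db_file)
    #     msg = await probe_page(view_entry, url, result[0], num=num)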
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(doc)
    except:
        msg = (
            ">{}\nFailed to parse URL as feed."
        ).format(url)
    if not msg:
        print("RSS Auto-Discovery Engaged")
        msg = await feed_mode_auto_discovery(url, tree)
    if not msg:
        print("RSS Scan Mode Engaged")
        msg = await feed_mode_scan(url, tree)
    if not msg:
        print("RSS Arbitrary Mode Engaged")
        msg = await feed_mode_request(url, tree)
    if not msg:
        msg = (
            ">{}\nNo news feeds were found for URL."
        ).format(url)
    # elif msg:
    else:
        if isinstance(msg, str):
            return msg
        elif isinstance(msg, list):
            url = msg[0]
            if db_file:
                # print("if db_file", db_file)
                return await callback(db_file, url)
            elif num:
                return await callback(url, num)
            else:
                return await callback(url)


async def download_feed(url):
    """
    Download content of given URL.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    msg : list
        Document and status code, or error message.
    """
    try:
        user_agent = await get_value_default("user-agent", "Network")
    except:
        user_agent = "Slixfeed/0.1"
    if not len(user_agent):
        user_agent = "Slixfeed/0.1"
    timeout = ClientTimeout(total=10)
    headers = {'User-Agent': user_agent}
    async with ClientSession(headers=headers) as session:
    # async with ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if response.status == 200:
                    try:
                        doc = await response.text()
                        # print (response.content_type)
                        msg = [
                            doc,
                            status
                        ]
                    except:
                        # msg = [
                        #     False,
                        #     ("The content of this document "
                        #      "doesn't appear to be textual."
                        #      )
                        # ]
                        msg = [
                            False,
                            "Document is too large or is not textual."
                        ]
                else:
                    msg = [
                        False,
                        "HTTP Error: " + str(status)
                    ]
        except ClientError as e:
            # print('Error', str(e))
            msg = [
                False,
                "Error: " + str(e)
            ]
        except TimeoutError as e:
            # print('Timeout:', str(e))
            msg = [
                False,
                "Timeout: " + str(e)
            ]
    return msg


def get_title(url, feed):
    """
    Get title of feed.

    Parameters
    ----------
    url : str
        URL.
    feed : dict
        Parsed feed document.

    Returns
    -------
    title : str
        Title or URL hostname.
    """
    try:
        title = feed["feed"]["title"]
    except:
        title = urlsplit(url).netloc
    if not title:
        title = urlsplit(url).netloc
    return title


# TODO Improve scan by gradual decreasing of path
async def feed_mode_request(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML document.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    feeds = {}
    parted_url = urlsplit(url)
    paths = await get_list("lists.yaml")
    paths = paths["pathnames"]
    for path in paths:
        address = urlunsplit([
            parted_url.scheme,
            parted_url.netloc,
            path,
            None,
            None
        ])
        res = await download_feed(address)
        if res[1] == 200:
            # print(parse(res[0])["feed"]["title"])
            # feeds[address] = parse(res[0])["feed"]["title"]
            try:
                title = parse(res[0])["feed"]["title"]
            except:
                title = '*** No Title ***'
            feeds[address] = title
        # Check whether URL has path (i.e. not root)
        if parted_url.path.split('/')[1]:
            paths.extend(
                [".atom", ".feed", ".rdf", ".rss"]
            ) if '.rss' not in paths else -1
            # if paths.index('.rss'):
            #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
            address = urlunsplit([
                parted_url.scheme,
                parted_url.netloc,
                parted_url.path.split('/')[1] + path,
                None,
                None
            ])
            res = await download_feed(address)
            if res[1] == 200:
                try:
                    feeds[address] = parse(res[0])
                    # print(feeds)
                except:
                    continue
    if len(feeds) > 1:
        counter = 0
        msg = (
            "RSS URL discovery has found {} feeds:\n```\n"
        ).format(len(feeds))
        for feed in feeds:
            try:
                feed_name = feeds[feed]["feed"]["title"]
            except:
                feed_name = urlsplit(feed).netloc
            feed_addr = feed
            # AttributeError: 'str' object has no attribute 'entries'
            try:
                feed_amnt = len(feeds[feed].entries)
            except:
                continue
            if feed_amnt:
                # NOTE Because there could be many false positives
                #      which are revealed in second phase of scan, we
                #      could end with a single feed, which would be
                #      listed instead of fetched, so feed_mark is
                #      utilized in order to make fetch possible.
                feed_mark = [feed_addr]
                counter += 1
                msg += (
                    "Title: {}\n"
                    "Link : {}\n"
                    "Items: {}\n"
                    "\n"
                ).format(
                    feed_name,
                    feed_addr,
                    feed_amnt
                )
        if counter > 1:
            msg += (
                "```\nThe above feeds were extracted from\n{}"
            ).format(url)
        elif feed_mark:
            return feed_mark
        else:
            msg = (
                "No feeds were found for {}"
            ).format(url)
        return msg
    elif feeds:
        return feeds


async def feed_mode_scan(url, tree):
    """
    Scan page for potential feeds by pathname.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML document.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    feeds = {}
    # paths = []
    # TODO Test
    paths = await get_list("lists.yaml")
    paths = paths["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = "(//a[contains(@href, '{}')])[position()<={}]".format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = "(//a[contains(@href, '{}')])[position()>last()-{}]".format(path, num)
        addresses += tree.xpath(xpath_query)
        parted_url = urlsplit(url)
        # NOTE Should number of addresses be limited or
        #      perhaps be N from the start and N from the end
        for address in addresses:
            # print(address.xpath('@href')[0])
            # print(addresses)
            address = address.xpath('@href')[0]
            if "/" not in address:
                protocol = parted_url.scheme
                hostname = parted_url.netloc
                pathname = address
                address = urlunsplit([
                    protocol,
                    hostname,
                    pathname,
                    None,
                    None
                ])
            if address.startswith('/'):
                protocol = parted_url.scheme
                hostname = parted_url.netloc
                pathname = address
                address = urlunsplit([
                    protocol,
                    hostname,
                    pathname,
                    None,
                    None
                ])
            res = await download_feed(address)
            if res[1] == 200:
                try:
                    feeds[address] = parse(res[0])
                    # print(feeds[address])
                    # breakpoint()
                    # print(feeds)
                except:
                    continue
    if len(feeds) > 1:
        # print(feeds)
        # breakpoint()
        counter = 0
        msg = (
            "RSS URL scan has found {} feeds:\n```\n"
        ).format(len(feeds))
        for feed in feeds:
            # try:
            #     res = await download_feed(feed)
            # except:
            #     continue
            try:
                feed_name = feeds[feed]["feed"]["title"]
            except:
                feed_name = urlsplit(feed).netloc
            feed_addr = feed
            feed_amnt = len(feeds[feed].entries)
            if feed_amnt:
                # NOTE Because there could be many false positives
                #      which are revealed in second phase of scan, we
                #      could end with a single feed, which would be
                #      listed instead of fetched, so feed_mark is
                #      utilized in order to make fetch possible.
                feed_mark = [feed_addr]
                counter += 1
                msg += (
                    "Title : {}\n"
                    "Link : {}\n"
                    "Count : {}\n"
                    "\n"
                ).format(
                    feed_name,
                    feed_addr,
                    feed_amnt
                )
        if counter > 1:
            msg += (
                "```\nThe above feeds were extracted from\n{}"
            ).format(url)
        elif feed_mark:
            return feed_mark
        else:
            msg = (
                "No feeds were found for {}"
            ).format(url)
        return msg
    elif feeds:
        return feeds


async def feed_mode_auto_discovery(url, tree):
    """
    Lookup for feeds using RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML document.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
    )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if len(feeds) > 1:
        msg = (
            "RSS Auto-Discovery has found {} feeds:\n```\n"
        ).format(len(feeds))
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
            # # only valid resources (i.e. not 404);
            # # The following code requires more bandwidth.
            # res = await download_feed(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
            feed_name = feed.xpath('@title')[0]
            feed_addr = join_url(url, feed.xpath('@href')[0])
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            msg += "{}\n{}\n\n".format(feed_name, feed_addr)
        msg += (
            "```\nThe above feeds were extracted from\n{}"
        ).format(url)
        return msg
    elif feeds:
        feed_addr = join_url(url, feeds[0].xpath('@href')[0])
        return [feed_addr]


def is_feed(url, feed):
    """
    Determine whether document is feed or not.

    Parameters
    ----------
    url : str
        URL.
    feed : dict
        Parsed feed.

    Returns
    -------
    val : boolean
        True or False.
    """
    if not feed.entries:
        try:
            feed["feed"]["title"]
            # A feed with a title but no entries is still considered a feed,
            # so that val and msg are always defined
            val = True
            msg = (
                "Good feed for {}"
            ).format(url)
        except:
            val = False
            msg = (
                "No entries nor title for {}"
            ).format(url)
    elif feed.bozo:
        val = False
        msg = (
            "Bozo detected for {}"
        ).format(url)
    else:
        val = True
        msg = (
            "Good feed for {}"
        ).format(url)
    print(msg)
    return val