2023-10-24 16:43:14 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2023-11-22 12:47:34 +01:00
"""
FIXME
1) feed_mode_scan doesn't find a feed for https://www.blender.org/
   even though it should, according to the pathnames dictionary.

TODO
1) Support Gemini and Gopher.
2023-11-22 12:47:34 +01:00
"""
2023-10-24 16:43:14 +02:00
import aiohttp
import asyncio
2023-11-13 14:45:10 +01:00
import feedparser
2023-10-24 16:43:14 +02:00
import sqlitehandler
2023-11-02 06:17:04 +01:00
import confighandler
2023-11-13 14:45:10 +01:00
import datetimehandler
2023-11-26 16:23:52 +01:00
import listhandler
2023-10-24 16:43:14 +02:00
from asyncio . exceptions import IncompleteReadError
2023-11-13 14:45:10 +01:00
from http . client import IncompleteRead
2023-10-24 16:43:14 +02:00
from urllib import error
from bs4 import BeautifulSoup
# from xml.etree.ElementTree import ElementTree, ParseError
2023-11-13 14:45:10 +01:00
from urllib . parse import urljoin
from urllib . parse import urlsplit
from urllib . parse import urlunsplit
2023-10-24 16:43:14 +02:00
from lxml import html
2023-11-13 14:45:10 +01:00
2023-11-26 06:48:09 +01:00
# NOTE Why (if res[0]) and (if res[1] == 200)?
2023-11-13 14:45:10 +01:00
async def download_updates(db_file, url=None):
    """
    Check feeds for new entries.

    Each feed is downloaded, its status and validity are recorded,
    entries that are no longer part of the feed are removed and entries
    that are not stored yet are added, subject to the allow/deny filters.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str or list or tuple, optional
        URL of a single feed, either bare or wrapped as ``[url]`` or
        ``(url,)``. The default is None, in which case the feeds
        returned by sqlitehandler.get_feeds_url are checked.
    """
    if url:
        # A bare string is wrapped, otherwise row[0] below would yield
        # the first character of the URL instead of the URL.
        urls = [(url,)] if isinstance(url, str) else [url]
    else:
        urls = await sqlitehandler.get_feeds_url(db_file)
    for row in urls:
        source = row[0]
        res = await download_feed(source)
        if res is None:
            # Skip to next feed
            continue
        document, status = res[0], res[1]
        await sqlitehandler.update_source_status(
            db_file,
            status,
            source
            )
        if not document:
            continue
        try:
            feed = feedparser.parse(document)
        except (
                IncompleteReadError,
                IncompleteRead,
                error.URLError
                ):
            # TODO Print error to log
            # Without a parsed feed there is nothing to process; move
            # on to the next URL instead of using an unbound name.
            continue
        valid = 0 if feed.bozo else 1
        await sqlitehandler.update_source_validity(
            db_file,
            source,
            valid
            )
        if status != 200:
            continue
        await sqlitehandler.remove_nonexistent_entries(
            db_file,
            feed,
            source
            )
        for entry in feed.entries:
            if "title" in entry:
                title = entry.title
            else:
                # Fall back to the feed title, then to the hostname.
                title = feed["feed"].get("title", urlsplit(source).netloc)
            if "link" in entry:
                link = await join_url(source, entry.link)
                link = await trim_url(link)
            else:
                link = source
            eid = entry.id if "id" in entry else link
            # TODO Pass date too for comparison check
            if "published" in entry:
                date = await datetimehandler.rfc2822_to_iso8601(
                    entry.published
                    )
            elif "updated" in entry:
                date = await datetimehandler.rfc2822_to_iso8601(
                    entry.updated
                    )
            else:
                # TODO Just set date = "*** No date ***"
                date = await datetimehandler.now()
            exist = await sqlitehandler.check_entry_exist(
                db_file,
                source,
                eid=eid,
                title=title,
                link=link,
                date=date
                )
            if exist:
                continue
            # TODO Enhance summary
            if "summary" in entry:
                # Remove HTML tags
                summary = BeautifulSoup(entry.summary, "lxml").text
                summary = summary.replace("\n\n\n", "\n\n")
                # Limit text length and mark the cut
                summary = summary[:300] + " ⃨"
                # Prefix every line so the summary renders as a quote
                summary = "\n".join(
                    "> " + line for line in summary.strip().split("\n")
                    )
            else:
                summary = "> *** No summary ***"
            read_status = 0
            pathname = urlsplit(link).path
            string = "{} {} {}".format(title, summary, pathname)
            allow_list = await listhandler.is_listed(
                db_file,
                "filter-allow",
                string
                )
            if not allow_list:
                # NOTE(review): is_listed presumably returns the
                # matched keyword as a string -- confirm.
                reject_list = await listhandler.is_listed(
                    db_file,
                    "filter-deny",
                    string
                    )
                if reject_list:
                    summary = "REJECTED {}".format(reject_list.upper())
                    # Rejected entries are stored as already read
                    read_status = 1
            # A separate name keeps the loop variable intact.
            record = (
                title,
                summary,
                link,
                eid,
                source,
                date,
                read_status
                )
            await sqlitehandler.add_entry_and_set_date(
                db_file,
                source,
                record
                )
2023-11-26 06:48:09 +01:00
# NOTE Why (if result[0]) and (if result[1] == 200)?
2023-11-26 16:23:52 +01:00
async def view_feed(url):
    """
    Download a feed and build a preview of its first five entries.

    When the document is not a well-formed feed, probe_page is asked
    to discover a feed on the page and to call this function again.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    msg : str
        Feed content or error message.
    """
    result = await download_feed(url)
    document, status = result[0], result[1]
    failure = "> {}\nFailed to load URL. Reason: {}"
    if not document:
        return failure.format(url, status)
    try:
        feed = feedparser.parse(document)
    except (
            IncompleteReadError,
            IncompleteRead,
            error.URLError
            ) as e:
        # TODO Print error to log
        return "> {}\nError: {}".format(url, e)
    if feed.bozo:
        return await probe_page(view_feed, url, document)
    if status != 200:
        return failure.format(url, status)
    # The parsed feed (not the raw document) carries the title.
    title = await get_title(url, feed)
    parts = ["Preview of {}:\n```\n".format(title)]
    for count, entry in enumerate(feed.entries[:5], start=1):
        if "title" in entry:
            title = entry.title
        else:
            title = "*** No title ***"
        if "link" in entry:
            link = await join_url(url, entry.link)
            link = await trim_url(link)
        else:
            link = "*** No link ***"
        if "published" in entry:
            date = await datetimehandler.rfc2822_to_iso8601(
                entry.published
                )
        elif "updated" in entry:
            date = await datetimehandler.rfc2822_to_iso8601(
                entry.updated
                )
        else:
            date = "*** No date ***"
        parts.append(
            (
                "Title : {}\n"
                "Date : {}\n"
                "Link : {}\n"
                "Count : {}\n"
                "\n"
            ).format(title, date, link, count)
            )
    parts.append("```\nSource: {}".format(url))
    return "".join(parts)
2023-11-26 16:23:52 +01:00
# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_entry(url, num):
    """
    Download a feed and render a single entry of it.

    When the document is not a well-formed feed, probe_page is asked
    to discover a feed on the page and to call this function again.

    Parameters
    ----------
    url : str
        URL.
    num : int or str
        One-based position of the entry within the feed.

    Returns
    -------
    msg : str
        Entry content or error message.
    """
    result = await download_feed(url)
    document, status = result[0], result[1]
    failure = (
        "> {}\n"
        "Failed to load URL. Reason: {}\n"
        "Try again momentarily."
        )
    if not document:
        return failure.format(url, status)
    try:
        feed = feedparser.parse(document)
    except (
            IncompleteReadError,
            IncompleteRead,
            error.URLError
            ) as e:
        # TODO Print error to log
        return "> {}\nError: {}".format(url, e)
    if feed.bozo:
        return await probe_page(view_entry, url, document, num)
    if status != 200:
        return failure.format(url, status)
    entries = feed.entries
    try:
        index = int(num) - 1
    except (TypeError, ValueError):
        index = -1
    # Reject positions outside the feed; a plain entries[index] would
    # raise for large numbers and silently pick the last entry for 0.
    if not 0 <= index < len(entries):
        return (
            "> {}\n"
            "Entry {} was not found. The feed has {} entries."
            ).format(url, num, len(entries))
    entry = entries[index]
    if "title" in entry:
        title = entry.title
    else:
        title = "*** No title ***"
    if "summary" in entry:
        # Remove HTML tags
        summary = BeautifulSoup(entry.summary, "lxml").text
        # TODO Limit text length
        summary = summary.replace("\n\n\n", "\n\n")
    else:
        summary = "*** No summary ***"
    if "link" in entry:
        link = await join_url(url, entry.link)
        link = await trim_url(link)
    else:
        link = "*** No link ***"
    return (
        "{}\n"
        "\n"
        "{}\n"
        "\n"
        "{}\n"
        "\n"
        ).format(title, summary, link)
2023-11-13 14:45:10 +01:00
async def add_feed_no_check(db_file, data):
    """
    Add given feed without validity check.

    Parameters
    ----------
    db_file : str
        Path to database file.
    data : sequence
        URL at index 0 and title at index 1.

    Returns
    -------
    msg : str
        Status message.
    """
    url, title = data[0], data[1]
    url = await trim_url(url)
    known = await sqlitehandler.check_feed_exist(db_file, url)
    if known:
        # known holds the index and the name of the stored feed
        return (
            "> {}\nNews source \"{}\" is already "
            "listed in the subscription list at "
            "index {}".format(url, known[1], known[0])
            )
    msg = await sqlitehandler.add_feed(db_file, url, title)
    await download_updates(db_file, [url])
    return msg
2023-10-24 16:43:14 +02:00
async def add_feed(db_file, url):
    """
    Check whether feed exist, otherwise process it.

    A URL that is not subscribed yet is downloaded and parsed. A
    well-formed feed is stored and its entries are fetched; any other
    document is handed to probe_page for feed discovery.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.

    Returns
    -------
    msg : str
        Status message.
    """
    url = await trim_url(url)
    exist = await sqlitehandler.check_feed_exist(db_file, url)
    if exist:
        ix = exist[0]
        name = exist[1]
        return (
            "> {}\nNews source \"{}\" is already "
            "listed in the subscription list at "
            "index {}".format(url, name, ix)
            )
    res = await download_feed(url)
    document, status = res[0], res[1]
    if not document:
        return (
            "> {}\nFailed to load URL. Reason: {}"
            ).format(url, status)
    feed = feedparser.parse(document)
    if feed.bozo:
        print("Bozo detected. Failed to load: {}.".format(url))
        # db_file must be passed by keyword: the fourth positional
        # parameter of probe_page is num, and a positional db_file
        # makes the callback run with swapped arguments.
        return await probe_page(add_feed, url, document, db_file=db_file)
    title = await get_title(url, feed)
    msg = await sqlitehandler.add_feed(
        db_file,
        url,
        title,
        status
        )
    await download_updates(db_file, [url])
    return msg
2023-11-26 06:48:09 +01:00
# TODO callback for use with add_feed and view_feed
2023-11-26 16:23:52 +01:00
async def probe_page(callback, url, doc, num=None, db_file=None):
    """
    Look for feeds on a page and hand a single result to a callback.

    Auto-discovery, scan mode and request mode are tried in that
    order; the first mode that yields a result wins.

    Parameters
    ----------
    callback : coroutine function
        Called as callback(db_file, url) when db_file is given, as
        callback(url, num) when num is given, else as callback(url).
    url : str
        URL of the page.
    doc : str
        Content of the page.
    num : int or str, optional
        Entry number forwarded to the callback. The default is None.
    db_file : str, optional
        Path to database file forwarded to the callback.
        The default is None.

    Returns
    -------
    msg : str
        List of discovered feeds, an error message, or whatever the
        callback returns for a single discovered feed.
    """
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(doc)
    except Exception:
        # Not a bare except, so task cancellation is not swallowed.
        return "> {}\nFailed to parse URL as feed.".format(url)
    print("RSS Auto-Discovery Engaged")
    msg = await feed_mode_auto_discovery(url, tree)
    if not msg:
        print("RSS Scan Mode Engaged")
        msg = await feed_mode_scan(url, tree)
    if not msg:
        print("RSS Arbitrary Mode Engaged")
        msg = await feed_mode_request(url, tree)
    not_found = "> {}\nNo news feeds were found for URL.".format(url)
    if not msg:
        return not_found
    if isinstance(msg, str):
        return msg
    if not isinstance(msg, (list, tuple, dict)):
        return not_found
    # Auto-discovery yields a list of URLs, scan and request modes
    # yield a dict keyed by URL; either way the first item is the URL.
    feed_url = next(iter(msg))
    if feed_url == url:
        # Calling back with the page itself would recurse forever.
        return not_found
    if db_file:
        return await callback(db_file, feed_url)
    if num:
        return await callback(feed_url, num)
    return await callback(feed_url)
2023-10-24 16:43:14 +02:00
async def download_feed(url):
    """
    Download content of given URL.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    msg : list
        Two items: the document text and the HTTP status code on
        success, or False and an error message on failure.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        # async with aiohttp.ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if status != 200:
                    return [
                        False,
                        "HTTP Error: " + str(status)
                        ]
                try:
                    doc = await response.text()
                except Exception:
                    # Exception rather than a bare except, so that
                    # cancellation of the task is not swallowed here.
                    return [
                        False,
                        "Document is too large or is not textual."
                        ]
                return [
                    doc,
                    status
                    ]
        except aiohttp.ClientError as e:
            return [
                False,
                "Error: " + str(e)
                ]
        except asyncio.TimeoutError as e:
            return [
                False,
                "Timeout: " + str(e)
                ]
2023-11-02 06:17:04 +01:00
async def get_title(url, feed):
    """
    Get title of feed.

    Parameters
    ----------
    url : str
        URL.
    feed : dict
        Parsed feed document.

    Returns
    -------
    title : str
        Title, or URL hostname when the feed carries no usable title.
    """
    try:
        title = feed["feed"]["title"]
    except (KeyError, TypeError, IndexError):
        # Missing key, or feed is not a parsed feed at all
        title = None
    # An empty title is as useless as a missing one.
    return title or urlsplit(url).netloc
2023-11-13 14:45:10 +01:00
# NOTE Read the documentation
# https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urljoin
def complete_url(source, link):
    """
    Check if URL is pathname and complete it into URL.

    Resolution is delegated to urllib.parse.urljoin. It returns
    absolute links (including magnet links) unchanged, completes
    scheme-relative links with the scheme of the feed and resolves
    "/", "./" and "../" references against the feed URL.

    Parameters
    ----------
    source : str
        Feed URL.
    link : str
        Link URL or pathname.

    Returns
    -------
    str
        URL.
    """
    if link.startswith("www."):
        # A bare hostname would otherwise be taken for a relative path.
        return "http://" + link
    return urljoin(source, link)
"""
TODO
Feed https://www.ocaml.org/feed.xml
Link %20https://frama-c.com/fc-versions/cobalt.html%20

FIXME
Feed https://cyber.dabamos.de/blog/feed.rss
Link https://cyber.dabamos.de/blog/#article-2022-07-15
"""
async def join_url(source, link):
    """
    Join base URL with given pathname.

    Parameters
    ----------
    source : str
        Feed URL.
    link : str
        Link URL or pathname.

    Returns
    -------
    str
        URL.
    """
    if link.startswith("www."):
        return "http://" + link
    if link.startswith("%20") and link.endswith("%20"):
        # Some feeds wrap links in encoded spaces. Only the wrapping
        # is removed; an encoded space inside the link is part of it.
        while link.startswith("%20"):
            link = link[3:]
        while link.endswith("%20"):
            link = link[:-3]
        return link
    return urljoin(source, link)
async def trim_url(url):
    """
    Collapse repeated slashes in the pathname of a URL.

    Scheme, hostname, query and fragment are left as they are.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    url : str
        URL.
    """
    parts = urlsplit(url)
    path = parts.path
    # One pass may leave pairs behind (e.g. "///"), hence the loop.
    while "//" in path:
        path = path.replace("//", "/")
    return urlunsplit(parts._replace(path=path))
# TODO Improve scan by gradual decreasing of path
2023-11-26 06:48:09 +01:00
async def feed_mode_request(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

    Each pathname of confighandler.get_list() is requested at the root
    of the host. When the URL has a pathname, each one is also
    requested appended to the first path segment, along with the
    suffixes ".atom", ".feed", ".rdf" and ".rss".

    Parameters
    ----------
    url : str
        URL.
    tree : object
        Parsed page; not used by this mode.

    Returns
    -------
    msg : str or dict or None
        Message with URLs when several addresses responded, a dict of
        address to parsed feed when exactly one responded, else None.
    """
    feeds = {}
    parted_url = urlsplit(url)
    # Work on a copy so that the configured list is never altered.
    paths = list(confighandler.get_list())
    segments = parted_url.path.split("/")
    # Empty for a root URL, including one without a trailing slash.
    first_segment = segments[1] if len(segments) > 1 else ""
    if first_segment and ".rss" not in paths:
        paths.extend([".atom", ".feed", ".rdf", ".rss"])
    for path in paths:
        pathnames = [path]
        if first_segment:
            pathnames.append(first_segment + path)
        for pathname in pathnames:
            address = urlunsplit([
                parted_url.scheme,
                parted_url.netloc,
                pathname,
                "",
                ""
                ])
            res = await download_feed(address)
            if res[1] != 200:
                continue
            try:
                # Always keep the parsed feed, so that the listing
                # below can count the entries of every address.
                feeds[address] = feedparser.parse(res[0])
            except Exception:
                continue
    if len(feeds) > 1:
        listing = []
        for feed_addr, parsed in feeds.items():
            feed_amnt = len(parsed.entries)
            if not feed_amnt:
                # Responded, but holds no entries
                continue
            try:
                feed_name = parsed["feed"]["title"]
            except (KeyError, TypeError):
                feed_name = urlsplit(feed_addr).netloc
            listing.append(
                (
                    "Title: {}\n"
                    "Link : {}\n"
                    "Items: {}\n"
                    "\n"
                ).format(feed_name, feed_addr, feed_amnt)
                )
        if not listing:
            return "No feeds were found for {}.".format(url)
        # The reported number is the number of feeds actually listed.
        return (
            "RSS URL discovery has found {} feeds:\n```\n"
            "{}"
            "```\nThe above feeds were extracted from\n{}"
            ).format(len(listing), "".join(listing), url)
    if feeds:
        return feeds
    return None
2023-11-13 14:45:10 +01:00
2023-11-26 06:48:09 +01:00
async def feed_mode_scan(url, tree):
    """
    Scan page for potential feeds by pathname.

    Anchors whose href contains one of the pathnames of
    confighandler.get_list() are resolved against the page URL and
    downloaded.

    Parameters
    ----------
    url : str
        URL.
    tree : object
        Parsed page; must provide an xpath method.

    Returns
    -------
    msg : str or dict or None
        Message with URLs when several addresses responded, a dict of
        address to parsed feed when exactly one responded, else None.
    """
    feeds = {}
    # Addresses already requested; an anchor may match several paths.
    visited = set()
    # TODO Test
    paths = confighandler.get_list()
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        xpath_query = "//a[contains(@href, '{}')]".format(path)
        # NOTE Should number of addresses be limited or
        #      perhaps be N from the start and N from the end
        for anchor in tree.xpath(xpath_query):
            # urljoin resolves every kind of relative reference
            # ("feed.xml", "dir/feed.xml", "/feed", "//host/feed").
            address = urljoin(url, anchor.xpath('@href')[0])
            if address in visited:
                continue
            visited.add(address)
            res = await download_feed(address)
            if res[1] != 200:
                continue
            try:
                feeds[address] = feedparser.parse(res[0])
            except Exception:
                continue
    if len(feeds) > 1:
        listing = []
        for feed_addr, parsed in feeds.items():
            feed_amnt = len(parsed.entries)
            if not feed_amnt:
                # Responded, but holds no entries
                continue
            try:
                feed_name = parsed["feed"]["title"]
            except (KeyError, TypeError):
                feed_name = urlsplit(feed_addr).netloc
            listing.append(
                (
                    "Title: {}\n"
                    "Link: {}\n"
                    "Count: {}\n"
                    "\n"
                ).format(feed_name, feed_addr, feed_amnt)
                )
        if not listing:
            return "No feeds were found for {}.".format(url)
        # The reported number is the number of feeds actually listed.
        return (
            "RSS URL scan has found {} feeds:\n```\n"
            "{}"
            "```\nThe above feeds were extracted from\n{}"
            ).format(len(listing), "".join(listing), url)
    if feeds:
        return feeds
    return None
2023-11-13 14:45:10 +01:00
2023-11-26 06:48:09 +01:00
async def feed_mode_auto_discovery(url, tree):
    """
    Lookup for feeds using RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : object
        Parsed page; must provide an xpath method.

    Returns
    -------
    msg : str or list or None
        Message with URLs when several feeds are declared, a list
        holding the URL when exactly one is declared, else None.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    found = []
    for link in tree.xpath(xpath_query):
        hrefs = link.xpath('@href')
        if not hrefs:
            # A declaration without an address is of no use
            continue
        feed_addr = await join_url(url, hrefs[0])
        titles = link.xpath('@title')
        # The title attribute is optional; fall back to the hostname.
        feed_name = titles[0] if titles else urlsplit(feed_addr).netloc
        found.append((feed_name, feed_addr))
    if len(found) > 1:
        listing = "".join(
            "{}\n{}\n\n".format(feed_name, feed_addr)
            for feed_name, feed_addr in found
            )
        return (
            "RSS Auto-Discovery has found {} feeds:\n```\n"
            "{}"
            "```\nThe above feeds were extracted from\n{}"
            ).format(len(found), listing, url)
    if found:
        return [found[0][1]]
    return None
2023-11-22 12:47:34 +01:00
async def feed_to_http(url):
    """
    Replace scheme feed by http.

    Whatever scheme the URL carries is swapped for "http"; hostname,
    pathname, query and fragment are kept.

    Parameters
    ----------
    url : str
        URL.

    Returns
    -------
    new_url : str
        URL.
    """
    return urlsplit(url)._replace(scheme="http").geturl()
async def activitypub_to_http(namespace):
    """
    Replace ActivityPub namespace by http.

    The scheme of the given namespace is swapped for "http"; all
    other components are kept.

    Parameters
    ----------
    namespace : str
        Namespace.

    Returns
    -------
    new_url : str
        URL.
    """
    _, hostname, pathname, queries, fragment = urlsplit(namespace)
    return urlunsplit(("http", hostname, pathname, queries, fragment))