#!/usr/bin/env python3
# -*- coding: utf-8 -*-
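"""
Fetch news feeds over HTTP and store new entries in an SQLite database.

This module downloads RSS/Atom feeds with aiohttp, parses them with
feedparser, and records entries through sqlitehandler.  When a given URL is
not a feed by itself, add_feed falls back to feed auto-discovery, link
scanning, and probing of common feed paths.
"""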
import feedparser
import aiohttp
import asyncio
import os
import sqlitehandler
import confighandler
from http.client import IncompleteRead
from asyncio.exceptions import IncompleteReadError
from urllib import error
from bs4 import BeautifulSoup
# from xml.etree.ElementTree import ElementTree, ParseError
from urllib.parse import urlparse
from lxml import html


async def download_updates(db_file):
    """
    Check feeds for new entries.

    :param db_file: Database filename.
    """
    urls = await sqlitehandler.get_subscriptions(db_file)
    for url in urls:
        # print(os.path.basename(db_file), url[0])
        source = url[0]
        res = await download_feed(source)
        # TypeError: 'NoneType' object is not subscriptable
        if res is None:
            # Skip to next feed
            # urls.next()
            # next(urls)
            continue
        await sqlitehandler.update_source_status(db_file, res[1], source)
        if res[0]:
            try:
                feed = feedparser.parse(res[0])
                if feed.bozo:
                    # bozo = ("WARNING: Bozo detected for feed <{}>. "
                    #         "For more information, visit "
                    #         "https://pythonhosted.org/feedparser/bozo.html"
                    #         .format(source))
                    # print(bozo)
                    valid = 0
                else:
                    valid = 1
                await sqlitehandler.update_source_validity(db_file, source, valid)
            except (IncompleteReadError, IncompleteRead, error.URLError) as e:
                print(e)
                # NOTE I don't think there should be "return"
                # because then we might stop scanning next URLs
                # return
            # TODO Place these couple of lines back down
            # NOTE Need to correct the SQL statement to do so
            # NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
            if res[1] == 200:
                entries = feed.entries
                # length = len(entries)
                # await sqlitehandler.remove_entry(db_file, source, length)
                await sqlitehandler.remove_nonexistent_entries(db_file, feed, source)
                new_entry = 0
                for entry in entries:
                    if "title" in entry:
                        title = entry.title
                    else:
                        title = feed["feed"]["title"]
                    if "link" in entry:
                        link = entry.link
                    else:
                        link = source
                    exist = await sqlitehandler.check_entry_exist(db_file, title, link)
                    if not exist:
                        new_entry = new_entry + 1
                        # TODO Enhance summary
                        if "summary" in entry:
                            summary = entry.summary
                            # Remove HTML tags
                            summary = BeautifulSoup(summary, "lxml").text
                            # TODO Limit text length
                            summary = summary.replace("\n\n", "\n")[:300] + " ⃨"
                        else:
                            summary = '*** No summary ***'
                        entry = (title, summary, link, source, 0)
                        await sqlitehandler.add_entry_and_set_date(db_file, source, entry)


async def add_feed(db_file, url):
    """
    Check whether feed exists, otherwise process it.

    :param db_file: Database filename.
    :param url: URL.
    :return: Status message.
    """
    exist = await sqlitehandler.check_feed_exist(db_file, url)
    if not exist:
        res = await download_feed(url)
        if res[0]:
            feed = feedparser.parse(res[0])
            title = await get_title(url, feed)
            if feed.bozo:
                bozo = ("WARNING: Bozo detected. Failed to load <{}>.".format(url))
                print(bozo)
                try:
                    # tree = etree.fromstring(res[0]) # etree is for xml
                    tree = html.fromstring(res[0])
                except Exception:
                    return "Failed to parse URL <{}> as feed".format(url)
                print("RSS Auto-Discovery Engaged")
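                # Look for feed alternatives advertised in the page head, e.g.
                # <link rel="alternate" type="application/rss+xml" href="..."/>
                # (or the Atom/RDF equivalents), which is the standard feed
                # auto-discovery convention.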
                xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]"""
                # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
                # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
                feeds = tree.xpath(xpath_query)
                if len(feeds) > 1:
                    msg = "RSS Auto-Discovery has found {} feeds:\n\n".format(len(feeds))
                    for feed in feeds:
                        # # The following code works;
                        # # The following code will catch
                        # # only valid resources (i.e. not 404);
                        # # The following code requires more bandwidth.
                        # res = await download_feed(feed)
                        # if res[0]:
                        #     disco = feedparser.parse(res[0])
                        #     title = disco["feed"]["title"]
                        #     msg += "{} \n {} \n\n".format(title, feed)
                        feed_name = feed.xpath('@title')[0]
                        feed_addr = feed.xpath('@href')[0]
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                    return msg
                elif feeds:
                    url = feeds[0].xpath('@href')[0]
                    # Why wouldn't add_feed return a message
                    # upon success unless return is explicitly
                    # mentioned, yet upon failure it wouldn't?
                    return await add_feed(db_file, url)
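                # No alternative feed links were advertised, so scan anchors in
                # the page for hrefs that contain one of the known feed path
                # fragments listed in url_paths.txt.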
                print("RSS Scan Mode Engaged")
                feeds = {}
                paths = []
                # TODO Test
                cfg_dir = confighandler.get_default_confdir()
                if not os.path.isdir(cfg_dir):
                    os.mkdir(cfg_dir)
                cfg_file = os.path.join(cfg_dir, r"url_paths.txt")
                if not os.path.isfile(cfg_file):
                    # confighandler.generate_dictionary()
                    default_paths = confighandler.get_default_list()
                    with open(cfg_file, "w") as file:
                        file.writelines("\n".join(default_paths))
                with open(cfg_file, "r") as file:
                    lines = file.readlines()
                for line in lines:
                    paths.extend([line.strip()])
                for path in paths:
                    # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
                    xpath_query = "//a[contains(@href,'{}')]".format(path)
                    addresses = tree.xpath(xpath_query)
                    parted_url = urlparse(url)
                    # NOTE Should number of addresses be limited or
                    # perhaps be N from the start and N from the end
                    for address in addresses:
                        address = address.xpath('@href')[0]
                        if address.startswith('/'):
                            address = parted_url.scheme + '://' + parted_url.netloc + address
                        res = await download_feed(address)
                        if res[1] == 200:
                            try:
                                feeds[address] = feedparser.parse(res[0])["feed"]["title"]
                            except Exception:
                                continue
                if len(feeds) > 1:
                    msg = "RSS URL scan has found {} feeds:\n\n".format(len(feeds))
                    for feed in feeds:
                        # try:
                        #     res = await download_feed(feed)
                        # except:
                        #     continue
                        feed_name = feeds[feed]
                        feed_addr = feed
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                    return msg
                elif feeds:
                    url = list(feeds)[0]
                    return await add_feed(db_file, url)
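                # Still nothing; as a last resort, probe the known feed paths
                # directly against the host root and against the first path
                # segment of the given URL.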
                # (HTTP) Request(s) Paths
                print("RSS Arbitrary Mode Engaged")
                feeds = {}
                parted_url = urlparse(url)
                for path in paths:
                    address = parted_url.scheme + '://' + parted_url.netloc + path
                    res = await download_feed(address)
                    if res[1] == 200:
                        # print(feedparser.parse(res[0])["feed"]["title"])
                        # feeds[address] = feedparser.parse(res[0])["feed"]["title"]
                        try:
                            title = feedparser.parse(res[0])["feed"]["title"]
                        except Exception:
                            title = '*** No Title ***'
                        feeds[address] = title
                    # Check whether URL has path (i.e. not root)
                    if parted_url.path and parted_url.path.split('/')[1]:
                        if '.rss' not in paths:
                            paths.extend([".atom", ".feed", ".rdf", ".rss"])
                        # if paths.index('.rss'):
                        #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
                        address = parted_url.scheme + '://' + parted_url.netloc + '/' + parted_url.path.split('/')[1] + path
                        res = await download_feed(address)
                        if res[1] == 200:
                            try:
                                title = feedparser.parse(res[0])["feed"]["title"]
                            except Exception:
                                title = '*** No Title ***'
                            feeds[address] = title
                if len(feeds) > 1:
                    msg = "RSS URL discovery has found {} feeds:\n\n".format(len(feeds))
                    for feed in feeds:
                        feed_name = feeds[feed]
                        feed_addr = feed
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                elif feeds:
                    url = list(feeds)[0]
                    msg = await add_feed(db_file, url)
                else:
                    msg = "No news feeds were found for URL <{}>.".format(url)
            else:
                msg = await sqlitehandler.add_feed(db_file, title, url, res)
        else:
            msg = "Failed to get URL <{}>. Reason: {}".format(url, res[1])
    else:
        ix = exist[0]
        name = exist[1]
        msg = "> {}\nNews source \"{}\" is already listed in the subscription list at index {}".format(url, name, ix)
    return msg


async def download_feed(url):
    """
    Download content of given URL.

    :param url: URL.
    :return: List of the form [document, status code] on success, or
             [False, error message] on failure.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
    # async with aiohttp.ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if response.status == 200:
                    try:
                        doc = await response.text()
                        # print (response.content_type)
                        return [doc, status]
                    except Exception:
                        # return [False, "The content of this document doesn't appear to be textual."]
                        return [False, "Document is too large or is not textual."]
                else:
                    return [False, "HTTP Error: " + str(status)]
        except aiohttp.ClientError as e:
            print('Error', str(e))
            return [False, "Error: " + str(e)]
        except asyncio.TimeoutError as e:
            # print('Timeout:', str(e))
            return [False, "Timeout: " + str(e)]


async def get_title(url, feed):
    """
    Get title of feed.

    :param url: URL
    :param feed: Parsed feed
    :return: Title or URL hostname.
    """
    try:
        title = feed["feed"]["title"]
    except Exception:
        title = urlparse(url).netloc
    return title
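

# Example usage (a sketch; the database filename below is hypothetical, and
# sqlitehandler is assumed to create the SQLite file when it does not exist):
#
#     asyncio.run(add_feed("news.sqlite", "https://example.org"))
#     asyncio.run(download_updates("news.sqlite"))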