#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1 ) Function scan at " for entry in entries "
Suppress directly calling function " add_entry " ( accept db_file )
Pass a list of valid entries to a new function " add_entries "
( accept db_file ) which would call function " add_entry " ( accept cur ) .
* accelerate adding of large set of entries at once .
* prevent ( or mitigate halt of consequent actions ) .
* reduce I / O .
2 ) Call sqlite function from function statistics .
Returning a list of values doesn ' t ' seem to be a good practice .
3 ) Special statistics for operator :
* Size of database ( s ) ;
* Amount of JIDs subscribed ;
* Amount of feeds of all JIDs ;
* Amount of entries of all JIDs .
"""
import asyncio
from feedparser import parse
import os
from slixfeed.config import Config
import slixfeed.fetch as fetch
from slixfeed.log import Logger, Message
import slixfeed.sqlite as sqlite
from slixfeed.utilities import Database, DateAndTime, Html, MD, String, Url, Utilities
from slixmpp.xmlstream import ET
import sys
from urllib.parse import urlsplit
import xml.etree.ElementTree as ETR

logger = Logger(__name__)
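
# Hedged sketch for TODO item 1 above: insert a batch of entries over a single
# connection and commit once, to reduce I/O and speed up large batches. This is
# an assumption of the intended shape, not an existing slixfeed.sqlite API; the
# "add_entry" callable (accepting a cursor and one entry) is hypothetical and
# is therefore passed in explicitly.
import sqlite3


def add_entries_sketch(db_file, entries, add_entry):
    """Insert all entries through one cursor and commit once."""
    conn = sqlite3.connect(db_file)
    try:
        cur = conn.cursor()
        for entry in entries:
            # add_entry is expected to write a single entry using the cursor.
            add_entry(cur, entry)
        conn.commit()
    finally:
        conn.close()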


class Feed:

    # NOTE Consider removal of MD (and any other option HTML and XBEL)
    def export_feeds(dir_data, dir_cache, jid_bare, ext):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: jid_bare: {}: ext: {}'.format(function_name, jid_bare, ext))
        if not os.path.isdir(dir_cache):
            os.mkdir(dir_cache)
        if not os.path.isdir(dir_cache + '/' + ext):
            os.mkdir(dir_cache + '/' + ext)
        filename = os.path.join(
            dir_cache, ext, 'slixfeed_' + DateAndTime.timestamp() + '.' + ext)
        db_file = Database.instantiate(dir_data, jid_bare)
        results = sqlite.get_feeds(db_file)
        match ext:
            # case 'html':
            #     response = 'Not yet implemented.'
            case 'md':
                MD.export_to_markdown(jid_bare, filename, results)
            case 'opml':
                Opml.export_to_file(jid_bare, filename, results)
            # case 'xbel':
            #     response = 'Not yet implemented.'
        return filename

    def pack_entry_into_dict(db_file, entry):
        entry_id = entry[0]
        authors = sqlite.get_authors_by_entry_id(db_file, entry_id)
        entry_authors = []
        for author in authors:
            entry_author = {
                'name': author[2],
                'email': author[3],
                'url': author[4]}
            entry_authors.extend([entry_author])
        contributors = sqlite.get_contributors_by_entry_id(db_file, entry_id)
        entry_contributors = []
        for contributor in contributors:
            entry_contributor = {
                'name': contributor[2],
                'email': contributor[3],
                'url': contributor[4]}
            entry_contributors.extend([entry_contributor])
        links = sqlite.get_links_by_entry_id(db_file, entry_id)
        entry_links = []
        for link in links:
            entry_link = {
                'url': link[2],
                'type': link[3],
                'rel': link[4],
                'size': link[5]}
            entry_links.extend([entry_link])
        tags = sqlite.get_tags_by_entry_id(db_file, entry_id)
        entry_tags = []
        for tag in tags:
            entry_tag = {
                'term': tag[2],
                'scheme': tag[3],
                'label': tag[4]}
            entry_tags.extend([entry_tag])
        contents = sqlite.get_contents_by_entry_id(db_file, entry_id)
        entry_contents = []
        for content in contents:
            entry_content = {
                'text': content[2],
                'type': content[3],
                'base': content[4],
                'lang': content[5]}
            entry_contents.extend([entry_content])
        feed_entry = {
            'authors': entry_authors,
            'category': entry[10],
            'comments': entry[12],
            'contents': entry_contents,
            'contributors': entry_contributors,
            'summary_base': entry[9],
            'summary_lang': entry[7],
            'summary_text': entry[6],
            'summary_type': entry[8],
            'enclosures': entry[13],
            'href': entry[11],
            'link': entry[3],
            'links': entry_links,
            'published': entry[14],
            'rating': entry[13],
            'tags': entry_tags,
            'title': entry[4],
            'title_type': entry[3],
            'updated': entry[15]}
        return feed_entry

    def create_rfc4287_entry(feed_entry):
        node_entry = ET.Element('entry')
        node_entry.set('xmlns', 'http://www.w3.org/2005/Atom')
        # Title
        title = ET.SubElement(node_entry, 'title')
        if feed_entry['title']:
            if feed_entry['title_type']: title.set('type', feed_entry['title_type'])
            title.text = feed_entry['title']
        elif feed_entry['summary_text']:
            if feed_entry['summary_type']: title.set('type', feed_entry['summary_type'])
            title.text = feed_entry['summary_text']
            # if feed_entry['summary_base']: title.set('base', feed_entry['summary_base'])
            # if feed_entry['summary_lang']: title.set('lang', feed_entry['summary_lang'])
        else:
            title.text = feed_entry['published']
        # Some feeds have identical content for contents and summary
        # So if content is present, do not add summary
        if feed_entry['contents']:
            # Content
            for feed_entry_content in feed_entry['contents']:
                content = ET.SubElement(node_entry, 'content')
                # if feed_entry_content['base']: content.set('base', feed_entry_content['base'])
                if feed_entry_content['lang']: content.set('lang', feed_entry_content['lang'])
                if feed_entry_content['type']: content.set('type', feed_entry_content['type'])
                content.text = feed_entry_content['text']
        else:
            # Summary
            summary = ET.SubElement(node_entry, 'summary')  # TODO Try 'content'
            # if feed_entry['summary_base']: summary.set('base', feed_entry['summary_base'])
            # TODO Check realization of "lang"
            if feed_entry['summary_type']: summary.set('type', feed_entry['summary_type'])
            if feed_entry['summary_lang']: summary.set('lang', feed_entry['summary_lang'])
            summary.text = feed_entry['summary_text']
        # Authors
        for feed_entry_author in feed_entry['authors']:
            author = ET.SubElement(node_entry, 'author')
            name = ET.SubElement(author, 'name')
            name.text = feed_entry_author['name']
            if feed_entry_author['url']:
                uri = ET.SubElement(author, 'uri')
                uri.text = feed_entry_author['url']
            if feed_entry_author['email']:
                email = ET.SubElement(author, 'email')
                email.text = feed_entry_author['email']
        # Contributors
        for feed_entry_contributor in feed_entry['contributors']:
            contributor = ET.SubElement(node_entry, 'author')
            name = ET.SubElement(contributor, 'name')
            name.text = feed_entry_contributor['name']
            if feed_entry_contributor['url']:
                uri = ET.SubElement(contributor, 'uri')
                uri.text = feed_entry_contributor['url']
            if feed_entry_contributor['email']:
                email = ET.SubElement(contributor, 'email')
                email.text = feed_entry_contributor['email']
        # Category
        category = ET.SubElement(node_entry, "category")
        category.set('category', feed_entry['category'])
        # Tags
        for feed_entry_tag in feed_entry['tags']:
            tag = ET.SubElement(node_entry, 'category')
            tag.set('term', feed_entry_tag['term'])
        # Link
        link = ET.SubElement(node_entry, "link")
        link.set('href', feed_entry['link'])
        # Links
        for feed_entry_link in feed_entry['links']:
            link = ET.SubElement(node_entry, "link")
            link.set('href', feed_entry_link['url'])
            link.set('type', feed_entry_link['type'])
            link.set('rel', feed_entry_link['rel'])
        # Date updated
        if feed_entry['updated']:
            updated = ET.SubElement(node_entry, 'updated')
            updated.text = feed_entry['updated']
        # Date published
        if feed_entry['published']:
            published = ET.SubElement(node_entry, 'published')
            published.text = feed_entry['published']
        return node_entry

    # Look into the type ("atom", "rss2" etc.)
    def is_feed(url, feed):
        """
        Determine whether a document is a feed or not.

        Parameters
        ----------
        url : str
            URL.
        feed : dict
            Parsed feed.

        Returns
        -------
        val : boolean
            True or False.
        """
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}'.format(function_name))
        value = False
        # message = None
        if not feed.entries:
            if "version" in feed.keys():
                # feed["version"]
                if feed.version:
                    value = True
                    # message = (
                    #     "Empty feed for {}"
                    # ).format(url)
            elif "title" in feed["feed"].keys():
                value = True
                # message = (
                #     "Empty feed for {}"
                # ).format(url)
            else:
                value = False
                # message = (
                #     "No entries nor title for {}"
                # ).format(url)
        elif feed.bozo:
            # NOTE Consider valid even when it is not well-formed
            value = True
            logger.warning('Bozo detected for {}'.format(url))
        else:
            value = True
            # message = (
            #     "Good feed for {}"
            # ).format(url)
        return value

    async def add_feed(self, jid_bare, db_file, url, identifier):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: db_file: {} url: {}'
                     .format(function_name, db_file, url))
        while True:
            feed_id = sqlite.get_feed_id(db_file, url)
            if not feed_id:
                if not sqlite.check_identifier_exist(db_file, identifier):
                    result = await fetch.http(self.settings_network, url)
                    message = result['message']
                    status_code = result['status_code']
                    if not result['error']:
                        await sqlite.update_feed_status(db_file, feed_id, status_code)
                        document = result['content']
                        feed = parse(document)
                        # if document and status_code == 200:
                        if Feed.is_feed(url, feed):
                            if "title" in feed["feed"].keys():
                                title = feed["feed"]["title"]
                            else:
                                title = urlsplit(url).netloc
                            if "language" in feed["feed"].keys():
                                language = feed["feed"]["language"]
                            else:
                                language = ''
                            if "encoding" in feed.keys():
                                encoding = feed["encoding"]
                            else:
                                encoding = ''
                            if "updated_parsed" in feed["feed"].keys():
                                updated = feed["feed"]["updated_parsed"]
                                try:
                                    updated = DateAndTime.convert_struct_time_to_iso8601(updated)
                                except Exception as e:
                                    logger.error(str(e))
                                    updated = ''
                            else:
                                updated = ''
                            version = feed.version
                            entries_count = len(feed.entries)
                            await sqlite.insert_feed(db_file,
                                                     url,
                                                     title,
                                                     identifier,
                                                     entries=entries_count,
                                                     version=version,
                                                     encoding=encoding,
                                                     language=language,
                                                     status_code=status_code,
                                                     updated=updated)
                            feed_valid = 0 if feed.bozo else 1
                            await sqlite.update_feed_validity(
                                db_file, feed_id, feed_valid)
                            if feed.has_key('updated_parsed'):
                                feed_updated = feed.updated_parsed
                                try:
                                    feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated)
                                except Exception as e:
                                    logger.error(str(e))
                                    feed_updated = None
                            else:
                                feed_updated = None
                            feed_properties = Feed.get_properties_of_feed(
                                db_file, feed_id, feed)
                            await sqlite.update_feed_properties(
                                db_file, feed_id, feed_properties)
                            feed_id = sqlite.get_feed_id(db_file, url)
                            feed_id = feed_id[0]
                            new_entries = []
                            for entry in feed.entries:
                                if entry.has_key("link"):
                                    entry_link = Url.join_url(url, entry.link)
                                    entry_link = Url.trim_url(entry_link)
                                    entry_identifier = String.md5_hash(entry_link)
                                    if not sqlite.get_entry_id_by_identifier(
                                            db_file, entry_identifier):
                                        new_entry = Feed.get_properties_of_entry(
                                            url, entry_identifier, entry)
                                        new_entries.extend([new_entry])
                            if new_entries:
                                await sqlite.add_entries_and_update_feed_state(
                                    db_file, feed_id, new_entries)
                            old = self.settings[jid_bare]['old'] or self.defaults['default']['old']
                            if not old: await sqlite.mark_feed_as_read(db_file, feed_id)
                            result_final = {'link': url,
                                            'index': feed_id,
                                            'name': title,
                                            'code': status_code,
                                            'error': False,
                                            'message': message,
                                            'exist': False,
                                            'identifier': None}
                            break
                        else:
                            # NOTE Do not be tempted to return a compact dictionary,
                            #      that is, a dictionary within a dictionary.
                            #      Return multiple dictionaries in a list or tuple.
                            result = await FeedDiscovery.probe_page(
                                self.settings_network, self.pathnames, url, document)
                            if not result:
                                # Get out of the loop with a dict indicating an error.
                                result_final = {'link': url,
                                                'index': None,
                                                'name': None,
                                                'code': status_code,
                                                'error': True,
                                                'message': message,
                                                'exist': False,
                                                'identifier': None}
                                break
                            elif isinstance(result, list):
                                # Get out of the loop and deliver a list of dicts.
                                result_final = result
                                break
                            else:
                                # Go back up to the while loop and try again.
                                url = result['link']
                    else:
                        await sqlite.update_feed_status(db_file, feed_id, status_code)
                        result_final = {'link': url,
                                        'index': None,
                                        'name': None,
                                        'code': status_code,
                                        'error': True,
                                        'message': message,
                                        'exist': False,
                                        'identifier': None}
                        break
                else:
                    ix = sqlite.get_entry_id_by_identifier(db_file, identifier)
                    message = ('Identifier "{}" is already allocated.'
                               .format(identifier))
                    result_final = {'link': url,
                                    'index': ix,
                                    'name': None,
                                    'code': None,
                                    'error': False,
                                    'message': message,
                                    'exist': False,
                                    'identifier': identifier}
                    break
            else:
                feed_id = feed_id[0]
                title = sqlite.get_feed_title(db_file, feed_id)
                title = title[0]
                message = 'URL already exists.'
                result_final = {'link': url,
                                'index': feed_id,
                                'name': title,
                                'code': None,
                                'error': False,
                                'message': message,
                                'exist': True,
                                'identifier': None}
                break
        return result_final

    def view_feed(url, feed):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: url: {}'
                     .format(function_name, url))
        if "title" in feed["feed"].keys():
            title = feed["feed"]["title"]
        else:
            title = urlsplit(url).netloc
        entries = feed.entries
        response = "Preview of {}:\n\n```\n".format(title)
        counter = 0
        for entry in entries:
            counter += 1
            if entry.has_key("title"):
                title = entry.title
            else:
                title = "*** No title ***"
            if entry.has_key("link"):
                # link = complete_url(source, entry.link)
                link = Url.join_url(url, entry.link)
                link = Url.trim_url(link)
            else:
                link = "*** No link ***"
            if entry.has_key("published"):
                date = entry.published
                date = DateAndTime.rfc2822_to_iso8601(date)
            elif entry.has_key("updated"):
                date = entry.updated
                date = DateAndTime.rfc2822_to_iso8601(date)
            else:
                date = "*** No date ***"
            response += ("Title : {}\n"
                         "Date : {}\n"
                         "Link : {}\n"
                         "Count : {}\n"
                         "\n"
                         .format(title, date, link, counter))
            if counter > 4:
                break
        response += (
            "```\nSource: {}"
        ).format(url)
        return response

    def view_entry(url, feed, num):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: url: {} num: {}'
                     .format(function_name, url, num))
        if "title" in feed["feed"].keys():
            title = feed["feed"]["title"]
        else:
            title = urlsplit(url).netloc
        entries = feed.entries
        num = int(num) - 1
        entry = entries[num]
        response = "Preview of {}:\n\n```\n".format(title)
        if entry.has_key("title"):
            title = entry.title
        else:
            title = '*** No title ***'
        if entry.has_key("published"):
            date = entry.published
            date = DateAndTime.rfc2822_to_iso8601(date)
        elif entry.has_key("updated"):
            date = entry.updated
            date = DateAndTime.rfc2822_to_iso8601(date)
        else:
            date = '*** No date ***'
        if entry.has_key("summary"):
            summary = entry.summary
            # Remove HTML tags
            if summary:
                summary = Html.remove_html_tags(summary)
                # TODO Limit text length
                summary = summary.replace("\n\n\n", "\n\n")
            else:
                summary = '*** No summary ***'
        else:
            summary = '*** No summary ***'
        if entry.has_key("link"):
            # link = complete_url(source, entry.link)
            link = Url.join_url(url, entry.link)
            link = Url.trim_url(link)
        else:
            link = '*** No link ***'
        response = ("{}\n"
                    "\n"
                    # "> {}\n"
                    "{}\n"
                    "\n"
                    "{}\n"
                    "\n"
                    .format(title, summary, link))
        return response

    # NOTE This function is not being utilized
    async def download_feed(settings_network, db_file, feed_url):
        """
        Process feed content.

        Parameters
        ----------
        db_file : str
            Path to database file.
        feed_url : str
            URL of feed.
        """
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: db_file: {} url: {}'
                     .format(function_name, db_file, feed_url))
        if isinstance(feed_url, tuple): feed_url = feed_url[0]
        result = await fetch.http(settings_network, feed_url)
        feed_id = sqlite.get_feed_id(db_file, feed_url)
        feed_id = feed_id[0]
        status_code = result['status_code']
        await sqlite.update_feed_status(db_file, feed_id, status_code)

    def get_properties_of_feed(db_file, feed_id, feed):
        if feed.has_key('updated_parsed'):
            feed_updated = feed.updated_parsed
            try:
                feed_updated = DateAndTime.convert_struct_time_to_iso8601(feed_updated)
            except:
                feed_updated = ''
        else:
            feed_updated = ''
        entries_count = len(feed.entries)
        feed_version = feed.version if feed.has_key('version') else ''
        feed_encoding = feed.encoding if feed.has_key('encoding') else ''
        feed_language = feed.feed.language if feed.feed.has_key('language') else ''
        feed_icon = feed.feed.icon if feed.feed.has_key('icon') else ''
        # (Pdb) feed.feed.image
        # {}
        # (Pdb) feed.version
        # 'rss10'
        # (Pdb) feed.feed.image
        # {'links': [{'rel': 'alternate', 'type': 'text/html'}]}
        # (Pdb) feed.version
        # ''
        feed_image = feed.feed.image.href if feed.feed.has_key('image') and feed.feed.image.has_key('href') else ''
        feed_logo = feed.feed.logo if feed.feed.has_key('logo') else ''
        feed_ttl = feed.feed.ttl if feed.feed.has_key('ttl') else ''
        feed_properties = {
            "version": feed_version,
            "encoding": feed_encoding,
            "language": feed_language,
            "rating": '',
            "entries_count": entries_count,
            "icon": feed_icon,
            "image": feed_image,
            "logo": feed_logo,
            "ttl": feed_ttl,
            "updated": feed_updated,
        }
        return feed_properties

    # TODO get all active feeds of active accounts and scan the feed with the earliest scanned time
    # TODO Rename function name (idea: scan_and_populate)
    def get_properties_of_entry(feed_url, entry_identifier, entry):
        """
        Process entry content.

        Parameters
        ----------
        feed_url : str
            URL of feed.
        entry :
            Object of entry.
        """
        function_name = sys._getframe().f_code.co_name
        logger.debug('{} feed_url: {}'
                     .format(function_name, feed_url))
        read_status = 0
        if entry.has_key("published"):
            entry_published = entry.published
            entry_published = DateAndTime.rfc2822_to_iso8601(entry_published)
        else:
            entry_published = ''
        if entry.has_key("updated"):
            entry_updated = entry.updated
            entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated)
        else:
            entry_updated = DateAndTime.now()
        if entry.has_key("link"):
            # link = complete_url(source, entry.link)
            entry_link = Url.join_url(feed_url, entry.link)
            entry_link = Url.trim_url(entry_link)
        else:
            entry_link = feed_url
        # title = feed["feed"]["title"]
        # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
        entry_title = entry.title if entry.has_key("title") else entry_published
        # entry_id = entry.id if entry.has_key("id") else entry_link
        # # Filter
        # pathname = urlsplit(link).path
        # string = (
        #     "{} {} {}"
        # ).format(
        #     title, summary, pathname)
        # if self.settings['default']['filter']:
        #     print('Filter is now processing data.')
        #     allow_list = config.is_include_keyword(db_file,
        #                                            "allow", string)
        #     if not allow_list:
        #         reject_list = config.is_include_keyword(db_file,
        #                                                 "deny",
        #                                                 string)
        #         if reject_list:
        #             read_status = 1
        #             logger.debug('Rejected : {}'
        #                          '\n'
        #                          'Keyword : {}'
        #                          .format(link, reject_list))
        if isinstance(entry_published, int):
            logger.error('Variable "published" is int: {}'.format(entry_published))
        if isinstance(entry_updated, int):
            logger.error('Variable "updated" is int: {}'.format(entry_updated))
        # Authors
        entry_authors = []
        if entry.has_key('authors'):
            for author in entry.authors:
                author_properties = {
                    'name': author.name if author.has_key('name') else '',
                    'url': author.href if author.has_key('href') else '',
                    'email': author.email if author.has_key('email') else '',
                }
                entry_authors.extend([author_properties])
        elif entry.has_key('author_detail'):
            author_properties = {
                'name': entry.author_detail.name if entry.author_detail.has_key('name') else '',
                'url': entry.author_detail.href if entry.author_detail.has_key('href') else '',
                'email': entry.author_detail.email if entry.author_detail.has_key('email') else '',
            }
            entry_authors.extend([author_properties])
        elif entry.has_key('author'):
            author_properties = {
                'name': entry.author,
                'url': '',
                'email': '',
            }
            entry_authors.extend([author_properties])
        # Contributors
        entry_contributors = []
        if entry.has_key('contributors'):
            for contributor in entry.contributors:
                contributor_properties = {
                    'name': contributor.name if contributor.has_key('name') else '',
                    'url': contributor.href if contributor.has_key('href') else '',
                    'email': contributor.email if contributor.has_key('email') else '',
                }
                entry_contributors.extend([contributor_properties])
        # Tags
        entry_tags = []
        if entry.has_key('tags'):
            for tag in entry.tags:
                tag_properties = {
                    'term': tag.term if tag.has_key('term') else '',
                    'scheme': tag.scheme if tag.has_key('scheme') else '',
                    'label': tag.label if tag.has_key('label') else '',
                }
                entry_tags.extend([tag_properties])
        # Content
        entry_contents = []
        if entry.has_key('content'):
            for content in entry.content:
                text = content.value if content.has_key('value') else ''
                type = content.type if content.has_key('type') else ''
                lang = content.lang if content.has_key('lang') else ''
                base = content.base if content.has_key('base') else ''
                entry_content = {
                    'text': text,
                    'lang': lang,
                    'type': type,
                    'base': base,
                }
                entry_contents.extend([entry_content])
        # Links and Enclosures
        entry_links = []
        if entry.has_key('links'):
            for link in entry.links:
                link_properties = {
                    'url': link.href if link.has_key('href') else '',
                    'rel': link.rel if link.has_key('rel') else '',
                    'type': link.type if link.has_key('type') else '',
                    'length': '',
                }
                entry_links.extend([link_properties])
        # Element media:content is utilized by Mastodon
        if entry.has_key('media_content'):
            for link in entry.media_content:
                link_properties = {
                    'url': link['url'] if 'url' in link else '',
                    'rel': 'enclosure',
                    'type': link['type'] if 'type' in link else '',
                    # 'medium': link['medium'] if 'medium' in link else '',
                    'length': link['filesize'] if 'filesize' in link else '',
                }
                entry_links.extend([link_properties])
        if entry.has_key('media_thumbnail'):
            for link in entry.media_thumbnail:
                link_properties = {
                    'url': link['url'] if 'url' in link else '',
                    'rel': 'enclosure',
                    'type': '',
                    # 'medium': 'image',
                    'length': '',
                }
                entry_links.extend([link_properties])
        # Category
        entry_category = entry.category if entry.has_key('category') else ''
        # Comments
        entry_comments = entry.comments if entry.has_key('comments') else ''
        # href
        entry_href = entry.href if entry.has_key('href') else ''
        # Link: Same as entry.links[0].href in most if not all cases
        entry_link = entry.link if entry.has_key('link') else ''
        # Rating
        entry_rating = entry.rating if entry.has_key('rating') else ''
        # Summary
        entry_summary_text = entry.summary if entry.has_key('summary') else ''
        if entry.has_key('summary_detail'):
            entry_summary_type = entry.summary_detail.type if entry.summary_detail.has_key('type') else ''
            entry_summary_lang = entry.summary_detail.lang if entry.summary_detail.has_key('lang') else ''
            entry_summary_base = entry.summary_detail.base if entry.summary_detail.has_key('base') else ''
        else:
            entry_summary_type = ''
            entry_summary_lang = ''
            entry_summary_base = ''
        # Title
        entry_title = entry.title if entry.has_key('title') else ''
        if entry.has_key('title_detail'):
            entry_title_type = entry.title_detail.type if entry.title_detail.has_key('type') else ''
        else:
            entry_title_type = ''
        ###########################################################
        # media_type = e_link.type[:e_link.type.index("/")]
        # if (e_link.rel == "enclosure" and
        #     media_type in ("audio", "image", "video")):
        #     media_link = e_link.href
        #     media_link = Url.join_url(url, e_link.href)
        #     media_link = Url.trim_url(media_link)
        ###########################################################
        entry_properties = {
            "identifier": entry_identifier,
            "link": entry_link,
            "href": entry_href,
            "title": entry_title,
            "title_type": entry_title_type,
            'summary_text': entry_summary_text,
            'summary_lang': entry_summary_lang,
            'summary_type': entry_summary_type,
            'summary_base': entry_summary_base,
            'category': entry_category,
            "comments": entry_comments,
            "rating": entry_rating,
            "published": entry_published,
            "updated": entry_updated,
            "read_status": read_status}
        new_entry = {
            "entry_properties": entry_properties,
            "entry_authors": entry_authors,
            "entry_contributors": entry_contributors,
            "entry_contents": entry_contents,
            "entry_links": entry_links,
            "entry_tags": entry_tags}
        # await sqlite.add_entry(
        #     db_file, title, link, entry_id,
        #     url, date, read_status)
        # await sqlite.set_date(db_file, url)
        return new_entry
"""
FIXME
1 ) https : / / wiki . pine64 . org
File " /slixfeed/crawl.py " , line 178 , in feed_mode_guess
address = Url . join_url ( url , parted_url . path . split ( ' / ' ) [ 1 ] + path )
~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ^ ^ ^
IndexError : list index out of range
TODO
1.1 ) Attempt to scan more paths : / blog / , / news / etc . , including root /
Attempt to scan sub domains
https : / / esmailelbob . xyz / en /
https : / / blog . esmailelbob . xyz / feed /
1.2 ) Consider utilizing fetch . http_response
2 ) DeviantArt
https : / / www . deviantart . com / nedesem / gallery
https : / / backend . deviantart . com / rss . xml ? q = gallery : nedesem
https : / / backend . deviantart . com / rss . xml ? q = nedesem
https : / / www . deviantart . com / search ? q =
https : / / backend . deviantart . com / rss . xml ? q = search :
FEEDS CRAWLER PROJECT
3 ) Mark redirects for manual check
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / atom . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / feed . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / feeds / rss / news . xml . php
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / jekyll / feed . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / news . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / news . xml . php
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / rdf . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / rss . xml
Title : JSON Feed
Link : https : / / www . jsonfeed . org / feed . json / videos . xml
"""

from aiohttp import ClientError, ClientSession, ClientTimeout
from lxml import etree
from lxml import html
from lxml.etree import fromstring
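

# Hedged sketch for TODO 1.1 in the FIXME block above: build extra candidate
# locations to probe (site root, a few common sections, and likely
# feed-bearing subdomains). The candidate lists are assumptions for
# illustration only and are not part of Slixfeed.
from urllib.parse import urlsplit, urlunsplit


def guess_extra_locations_sketch(url):
    """Return plausible additional URLs to probe for feeds."""
    parts = urlsplit(url)
    candidates = []
    # Common sections on the same host, starting from the root.
    for section in ('/', '/blog/', '/news/', '/en/'):
        candidates.append(urlunsplit((parts.scheme, parts.netloc, section, '', '')))
    # Likely feed-bearing subdomains of the same host.
    for sub in ('blog', 'feeds', 'rss'):
        candidates.append(urlunsplit((parts.scheme, sub + '.' + parts.netloc, '/', '', '')))
    return candidates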


class FeedDiscovery:

    # TODO Use boolean as a flag to determine whether a single URL was found
    # async def probe_page(
    #     callback, url, document, num=None, db_file=None):
    #     result = None
    #     try:
    #         # tree = etree.fromstring(res[0]) # etree is for xml
    #         tree = html.fromstring(document)
    #     except:
    #         result = (
    #             "> {}\nFailed to parse URL as feed."
    #         ).format(url)
    #     if not result:
    #         print("RSS Auto-Discovery Engaged")
    #         result = await feed_mode_auto_discovery(url, tree)
    #     if not result:
    #         print("RSS Scan Mode Engaged")
    #         result = await feed_mode_scan(url, tree)
    #     if not result:
    #         print("RSS Arbitrary Mode Engaged")
    #         result = await feed_mode_request(url, tree)
    #     if not result:
    #         result = (
    #             "> {}\nNo news feeds were found for URL."
    #         ).format(url)
    #     # elif msg:
    #     else:
    #         if isinstance(result, str):
    #             return result
    #         elif isinstance(result, list):
    #             url = result[0]
    #             if db_file:
    #                 # print("if db_file", db_file)
    #                 return await callback(db_file, url)
    #             elif num:
    #                 return await callback(url, num)
    #             else:
    #                 return await callback(url)
    async def probe_page(settings_network, pathnames, url, document=None):
        """
        Parameters
        ----------
        url : str
            URL.
        document : TYPE
            DESCRIPTION.

        Returns
        -------
        result : list or str
            Single URL as list or selection of URLs as str.
        """
        if not document:
            response = await fetch.http(settings_network, url)
            if not response['error']:
                document = response['content']
        try:
            # tree = etree.fromstring(res[0]) # etree is for xml
            tree = html.fromstring(document)
            result = None
        except Exception as e:
            logger.error(str(e))
            try:
                # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
                # xml = html.fromstring(document.encode('utf-8'))
                # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
                # tree = fromstring(xml, parser=parser)
                # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
                # tree = html.fromstring(bytes(document, encoding='utf8'))
                # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
                # parser = etree.XMLParser(recover=True)
                # tree = etree.fromstring(document, parser)
                tree = html.fromstring(document.encode('utf-8'))
                result = None
            except Exception as e:
                logger.error(str(e))
                logger.warning(f"Failed to parse URL as feed for {url}.")
                result = {'link': None,
                          'index': None,
                          'name': None,
                          'code': None,
                          'error': True,
                          'exist': None}
        if not result:
            logger.debug(f"Feed auto-discovery engaged for {url}")
            result = FeedDiscovery.feed_mode_auto_discovery(url, tree)
        if not result:
            logger.debug(f"Feed link scan mode engaged for {url}")
            result = FeedDiscovery.feed_mode_scan(url, tree, pathnames)
        if not result:
            logger.debug(f"Feed arbitrary mode engaged for {url}")
            result = FeedDiscovery.feed_mode_guess(url, pathnames)
        if not result:
            logger.debug(f"No feeds were found for {url}")
            result = None
        result = await FeedDiscovery.process_feed_selection(settings_network, url, result)
        return result

    # TODO Improve scan by gradual decreasing of path
    def feed_mode_guess(url, pathnames):
        """
        Lookup for feeds by pathname using HTTP requests.

        Parameters
        ----------
        url : str
            URL.
        pathnames : list
            Pathnames.

        Returns
        -------
        urls : list
            List of candidate URLs.
        """
        urls = []
        parted_url = urlsplit(url)
        # Check whether URL has a path (i.e. not root)
        # Check parted_url.path to avoid an error in case root wasn't given
        # TODO Make more tests
        if parted_url.path and parted_url.path.split('/')[1]:
            pathnames.extend(
                [".atom", ".feed", ".rdf", ".rss"]
            ) if '.rss' not in pathnames else -1
            # if paths.index('.rss'):
            #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
        parted_url_path = parted_url.path if parted_url.path else '/'
        for path in pathnames:
            address = Url.join_url(url, parted_url_path.split('/')[1] + path)
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_guess")
        return urls

    def feed_mode_scan(url, tree, pathnames):
        """
        Scan page for potential feeds by pathname.

        Parameters
        ----------
        url : str
            URL.
        tree : TYPE
            DESCRIPTION.
        pathnames : list
            Pathnames.

        Returns
        -------
        urls : list
            List of candidate URLs.
        """
        urls = []
        for path in pathnames:
            # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
            # xpath_query = "//a[contains(@href,'{}')]".format(path)
            num = 5
            xpath_query = (
                "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
            addresses = tree.xpath(xpath_query)
            xpath_query = (
                "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
            addresses += tree.xpath(xpath_query)
            # NOTE Should the number of addresses be limited or
            #      perhaps be N from the start and N from the end?
            for address in addresses:
                address = Url.join_url(url, address.xpath('@href')[0])
                if address not in urls:
                    urls.extend([address])
        # breakpoint()
        # print("feed_mode_scan")
        return urls

    def feed_mode_auto_discovery(url, tree):
        """
        Lookup for feeds using the RSS autodiscovery technique.

        See: https://www.rssboard.org/rss-autodiscovery

        Parameters
        ----------
        url : str
            URL.
        tree : TYPE
            DESCRIPTION.

        Returns
        -------
        urls : list
            List of discovered feed URLs, or None.
        """
        xpath_query = (
            '//link[(@rel="alternate") and '
            '(@type="application/atom+xml" or '
            '@type="application/rdf+xml" or '
            '@type="application/rss+xml")]'
        )
        # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
        # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
        feeds = tree.xpath(xpath_query)
        if feeds:
            urls = []
            for feed in feeds:
                # # The following code works;
                # # The following code will catch
                # # only valid resources (i.e. not 404);
                # # The following code requires more bandwidth.
                # res = await fetch.http(feed)
                # if res[0]:
                #     disco = parse(res[0])
                #     title = disco["feed"]["title"]
                #     msg += "{} \n {} \n\n".format(title, feed)
                # feed_name = feed.xpath('@title')[0]
                # feed_addr = Url.join_url(url, feed.xpath('@href')[0])
                # if feed_addr.startswith("/"):
                #     feed_addr = url + feed_addr
                address = Url.join_url(url, feed.xpath('@href')[0])
                if address not in urls:
                    urls.extend([address])
            # breakpoint()
            # print("feed_mode_auto_discovery")
            return urls

    # TODO Segregate function into a function that returns
    #      URLs (string) and Feeds (dict) and a function that
    #      composes a text message (string).
    #      Maybe that's not necessary.
    async def process_feed_selection(settings_network, url, urls):
        feeds = {}
        for i in urls:
            result = await fetch.http(settings_network, i)
            if not result['error']:
                document = result['content']
                status_code = result['status_code']
                if status_code == 200:  # NOTE This line might be redundant
                    try:
                        feeds[i] = [parse(document)]
                    except:
                        continue
        message = (
            "Web feeds found for {}\n\n```\n"
        ).format(url)
        urls = []
        for feed_url in feeds:
            # try:
            #     res = await fetch.http(feed)
            # except:
            #     continue
            feed_name = None
            if "title" in feeds[feed_url][0]["feed"].keys():
                feed_name = feeds[feed_url][0].feed.title
            feed_name = feed_name if feed_name else "Untitled"
            # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
            # AttributeError: 'str' object has no attribute 'entries'
            if "entries" in feeds[feed_url][0].keys():
                feed_amnt = feeds[feed_url][0].entries
            else:
                continue
            if feed_amnt:
                # NOTE Because there could be many false positives,
                #      which are revealed in the second phase of the scan, we
                #      could end up with a single feed, which would be
                #      listed instead of fetched, so feed_url_mark is
                #      utilized in order to make the fetch possible.
                # NOTE feed_url_mark was a variable which stored a
                #      single URL (probably the first accepted as valid)
                #      in order to get an indication whether a single
                #      URL has been fetched, so that the receiving
                #      function will scan that single URL instead of
                #      listing it as a message.
                url = {'link': feed_url,
                       'index': None,
                       'name': feed_name,
                       'code': status_code,
                       'error': False,
                       'exist': None}
                urls.extend([url])
        count = len(urls)
        if count > 1:
            result = urls
        elif count:
            result = urls[0]
        else:
            result = None
        return result
# def get_discovered_feeds(url, urls):
# message = (
# "Found {} web feeds:\n\n```\n"
# ).format(len(urls))
# if len(urls) > 1:
# for urls in urls:
# message += (
# "Title : {}\n"
# "Link : {}\n"
# "\n"
# ).format(url, url.title)
# message += (
# "```\nThe above feeds were extracted from\n{}"
# ).format(url)
# elif len(urls) > 0:
# result = urls
# else:
# message = (
# "No feeds were found for {}"
# ).format(url)
# return result
# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection
# async def start(url):
# while True:
# result = await fetch.http(url)
# document = result[0]
# status = result[1]
# if document:
# feed = parse(document)
# if is_feed(feed):
# print(url)
# else:
# urls = await probe_page(
# url, document)
# if len(urls) > 1:
# await process_feed_selection(urls)
# elif urls:
# url = urls[0]
# else:
# response = (
# "> {}\nFailed to load URL. Reason: {}"
# ).format(url, status)
# break
# return response
# url = "https://www.smh.com.au/rssheadlines"
# start(url)


class FeedTask:

    # TODO Take this function out of
    #      <class 'slixmpp.clientxmpp.ClientXMPP'>
    async def check_updates(self, jid_bare):
        """
        Start calling for update check up.

        Parameters
        ----------
        jid : str
            Jabber ID.
        """
        # print('Scanning for updates for JID {}'.format(jid_bare))
        logger.info('Scanning for updates for JID {}'.format(jid_bare))
        while True:
            db_file = Database.instantiate(self.dir_data, jid_bare)
            urls = sqlite.get_active_feeds_url_sorted_by_last_scanned(db_file)
            for url in urls:
                # Message.printer('Scanning updates for URL {} ...'.format(url))
                url = url[0]
                # print('STA', url)
                # # Skip Reddit
                # if 'reddit.com' in str(url).lower():
                #     print('Reddit Atom Syndication feeds are not supported by Slixfeed.')
                #     print('Skipping URL:', url)
                #     continue
                result = await fetch.http(self.settings_network, url)
                status_code = result['status_code']
                feed_id = sqlite.get_feed_id(db_file, url)
                feed_id = feed_id[0]
                if not result['error']:
                    identifier = sqlite.get_feed_identifier(db_file, feed_id)
                    identifier = identifier[0]
                    if not identifier:
                        counter = 0
                        while True:
                            identifier = String.generate_identifier(url, counter)
                            if sqlite.check_identifier_exist(db_file, identifier):
                                counter += 1
                            else:
                                break
                        await sqlite.update_feed_identifier(db_file, feed_id, identifier)
                        # identifier = sqlite.get_feed_identifier(db_file, feed_id)
                        # identifier = identifier[0]
                    await sqlite.update_feed_status(db_file, feed_id, status_code)
                    document = result['content']
                    feed = parse(document)
                    feed_valid = 0 if feed.bozo else 1
                    await sqlite.update_feed_validity(db_file, feed_id, feed_valid)
                    feed_properties = Feed.get_properties_of_feed(
                        db_file, feed_id, feed)
                    await sqlite.update_feed_properties(
                        db_file, feed_id, feed_properties)
                    new_entries = []
                    for entry in feed.entries:
                        if entry.has_key("link"):
                            # link = complete_url(source, entry.link)
                            entry_link = Url.join_url(url, entry.link)
                            entry_link = Url.trim_url(entry_link)
                            entry_identifier = String.md5_hash(entry_link)
                            # if 'f-droid.org' in url:
                            #     breakpoint()
                            #     print(entry.link)
                            #     print(entry_identifier)
                            # Check if an entry identifier exists
                            if not sqlite.get_entry_id_by_identifier(
                                    db_file, entry_identifier):
                                new_entry = Feed.get_properties_of_entry(
                                    url, entry_identifier, entry)
                                # new_entries.append(new_entry)
                                new_entries.extend([new_entry])
                    if new_entries:
                        await sqlite.add_entries_and_update_feed_state(db_file, feed_id, new_entries)
                    limit = Config.get_setting_value(self, jid_bare, 'archive')
                    ixs = sqlite.get_entries_id_of_feed(db_file, feed_id)
                    ixs_invalid = {}
                    for ix in ixs:
                        ix = ix[0]
                        read_status = sqlite.is_entry_read(db_file, ix)
                        read_status = read_status[0]
                        entry_identifier_local = sqlite.get_entry_identifier(db_file, ix)
                        entry_identifier_local = entry_identifier_local[0]
                        valid = False
                        for entry in feed.entries:
                            if entry.has_key("link"):
                                entry_link = Url.join_url(url, entry.link)
                                entry_link = Url.trim_url(entry_link)
                                entry_identifier_external = Utilities.hash_url_to_md5(
                                    entry_link)
                                if entry_identifier_local == entry_identifier_external:
                                    valid = True
                                    continue
                        if not valid: ixs_invalid[ix] = read_status
                    if len(ixs_invalid):
                        await sqlite.process_invalid_entries(db_file, ixs_invalid)
                    # TODO Return the number of archived entries and add an if
                    #      statement to run the archive maintenance function.
                    await sqlite.maintain_archive(db_file, limit)
                    # await sqlite.process_invalid_entries(db_file, ixs)
                await asyncio.sleep(60 * 2)
            val = Config.get_setting_value(self, jid_bare, 'check')
            await asyncio.sleep(60 * float(val))
            # Schedule to call this function again in 90 minutes
            # loop.call_at(
            #     loop.time() + 60 * 90,
            #     loop.create_task,
            #     self.check_updates(jid)
            # )

    # Consider an endless loop. See XmppPubsubTask.loop_task
    # def restart_task(self, jid_bare):
    async def loop_task(self, jid_bare):
        await asyncio.sleep(60)
        while True:
            logger.info('Looping task "check" for JID {}'.format(jid_bare))
            print('Looping task "check" for JID {}'.format(jid_bare))
            await FeedTask.check_updates(self, jid_bare)
            await asyncio.sleep(60 * 60)

    def restart_task(self, jid_bare):
        if jid_bare not in self.task_manager:
            self.task_manager[jid_bare] = {}
            logger.info('Creating new task manager for JID {}'.format(jid_bare))
        logger.info('Stopping task "check" for JID {}'.format(jid_bare))
        try:
            self.task_manager[jid_bare]['check'].cancel()
        except:
            logger.info('No task "check" for JID {} (FeedTask.check_updates)'
                        .format(jid_bare))
        logger.info('Starting task "check" for JID {}'.format(jid_bare))
        self.task_manager[jid_bare]['check'] = asyncio.create_task(
            FeedTask.check_updates(self, jid_bare))


class Opml:

    # TODO Consider adding element jid as a pointer of import
    def export_to_file(jid, filename, results):
        # print(jid, filename, results)
        function_name = sys._getframe().f_code.co_name
        logger.debug('{} jid: {} filename: {}'
                     .format(function_name, jid, filename))
        root = ETR.Element("opml")
        root.set("version", "1.0")
        head = ETR.SubElement(root, "head")
        ETR.SubElement(head, "title").text = "{}".format(jid)
        ETR.SubElement(head, "description").text = (
            "Set of subscriptions exported by Slixfeed")
        ETR.SubElement(head, "generator").text = "Slixfeed"
        ETR.SubElement(head, "urlPublic").text = (
            "https://slixfeed.woodpeckersnest.space/")
        time_stamp = DateAndTime.current_time()
        ETR.SubElement(head, "dateCreated").text = time_stamp
        ETR.SubElement(head, "dateModified").text = time_stamp
        body = ETR.SubElement(root, "body")
        for result in results:
            outline = ETR.SubElement(body, "outline")
            outline.set("text", result[1])
            outline.set("xmlUrl", result[2])
            # outline.set("type", result[2])
        tree = ETR.ElementTree(root)
        tree.write(filename)

    async def import_from_file(db_file, result):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: db_file: {}'
                     .format(function_name, db_file))
        if not result['error']:
            document = result['content']
            root = ETR.fromstring(document)
            before = sqlite.get_number_of_items(db_file, 'feeds_properties')
            feeds = []
            for child in root.findall(".//outline"):
                url = child.get("xmlUrl")
                title = child.get("text")
                # feed = (url, title)
                # feeds.extend([feed])
                feed = {
                    'title': title,
                    'url': url,
                }
                feeds.extend([feed])
            await sqlite.import_feeds(db_file, feeds)
            await sqlite.add_metadata(db_file)
            after = sqlite.get_number_of_items(db_file, 'feeds_properties')
            difference = int(after) - int(before)
            return difference