Fix new entry selection (Thank you lorenzo and roughnecks);

Fix handling of RDF documents, which caused a halt during feed scan;
Ignore items without a link (Thank you Kris);
Set entry identifier to the MD5 hash of the entry link (Thank you TheCoffeMaker); see the sketch below.
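
For illustration, a minimal sketch of the identifier derivation described above, assuming a feedparser-style entry whose possibly relative link is resolved against the feed URL; the helper below uses only the standard library and is not the project's own Url/String utilities:

import hashlib
from urllib.parse import urljoin

def entry_identifier(feed_url, entry_link):
    # Resolve a possibly relative entry link against the feed URL, then hash
    # the absolute link so the same link always yields the same identifier.
    absolute_link = urljoin(feed_url, entry_link)
    return hashlib.md5(absolute_link.encode()).hexdigest()

# Entries without a link are skipped entirely, per the commit message.
print(entry_identifier('https://example.org/feed.xml', '/posts/1'))
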
This commit is contained in:
Schimon Jehudah, Adv. 2024-06-17 17:25:24 +03:00
parent 93ea8a9fab
commit 5babb02cf8
7 changed files with 484 additions and 297 deletions


@@ -2226,9 +2226,33 @@ async def add_entries_and_update_feed_state(db_file, feed_id, new_entries):
             """
             INSERT
             INTO entries_properties(
-                feed_id, identifier, link, title, title_type, summary_text, summary_lang, summary_type, summary_base, category, comments, published, updated)
+                feed_id,
+                identifier,
+                link,
+                title,
+                title_type,
+                summary_text,
+                summary_lang,
+                summary_type,
+                summary_base,
+                category,
+                comments,
+                published,
+                updated)
             VALUES(
-                :feed_id, :identifier, :link, :title, :title_type, :summary_text, :summary_lang, :summary_type, :summary_base, :category, :comments, :published, :updated)
+                :feed_id,
+                :identifier,
+                :link,
+                :title,
+                :title_type,
+                :summary_text,
+                :summary_lang,
+                :summary_type,
+                :summary_base,
+                :category,
+                :comments,
+                :published,
+                :updated)
             """
             )
         entry_properties = new_entry['entry_properties']
@@ -2847,6 +2871,35 @@ def get_entries_of_feed(db_file, feed_id):
     return items
 
 
+def get_entries_id_of_feed(db_file, feed_id):
+    """
+    Get entries of given feed.
+
+    Parameters
+    ----------
+    db_file : str
+        Path to database file.
+    feed_id : str
+        Feed Id.
+    """
+    function_name = sys._getframe().f_code.co_name
+    logger.debug('{} db_file: {} feed_id: {}'
+                 .format(function_name, db_file, feed_id))
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = (
+            """
+            SELECT id
+            FROM entries_properties
+            WHERE feed_id = ?
+            ORDER BY published DESC
+            """
+            )
+        par = (feed_id,)
+        items = cur.execute(sql, par).fetchall()
+        return items
+
+
 # TODO What is this function for? 2024-01-02
 # def get_feeds(db_file):
 #     """
@@ -3231,9 +3284,9 @@ def check_entry_exist(db_file, feed_id, identifier=None, title=None, link=None,
     function_name = sys._getframe().f_code.co_name
     logger.debug('{}: db_file: {} feed_id: {}'
                  .format(function_name, db_file, feed_id))
+    exist = False
     with create_connection(db_file) as conn:
         cur = conn.cursor()
-        exist = False
         if identifier:
             sql = (
                 """
@@ -3291,6 +3344,76 @@ def check_entry_exist(db_file, feed_id, identifier=None, title=None, link=None,
     return exist
 
 
+def get_entry_id_by_identifier(db_file, identifier):
+    """
+    Get entry ID by its identifier.
+
+    Parameters
+    ----------
+    db_file : str
+        Path to database file.
+    identifier : str
+        Entry identifier.
+
+    Returns
+    -------
+    result : tuple
+        Entry ID or None.
+    """
+    function_name = sys._getframe().f_code.co_name
+    logger.debug('{}: db_file: {} identifier: {}'
+                 .format(function_name, db_file, identifier))
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = (
+            """
+            SELECT id
+            FROM entries_properties
+            WHERE identifier = :identifier
+            """
+            )
+        par = {
+            "identifier": identifier
+            }
+        result = cur.execute(sql, par).fetchone()
+        return result
+
+
+def get_entry_identifier(db_file, ix):
+    """
+    Get identifier by its entry ID.
+
+    Parameters
+    ----------
+    db_file : str
+        Path to database file.
+    id : str
+        Entry ID.
+
+    Returns
+    -------
+    result : tuple
+        Entry ID or None.
+    """
+    function_name = sys._getframe().f_code.co_name
+    logger.debug('{}: db_file: {} ix: {}'
+                 .format(function_name, db_file, ix))
+    with create_connection(db_file) as conn:
+        cur = conn.cursor()
+        sql = (
+            """
+            SELECT identifier
+            FROM entries_properties
+            WHERE id = :ix
+            """
+            )
+        par = {
+            "ix": ix
+            }
+        result = cur.execute(sql, par).fetchone()
+        return result
+
+
 async def set_setting_value(db_file, key_value):
     """
     Set setting value.


@@ -32,8 +32,8 @@ from slixfeed.config import Config
 import slixfeed.fetch as fetch
 from slixfeed.log import Logger
 import slixfeed.sqlite as sqlite
-from slixfeed.utilities import DateAndTime, Url
-from slixfeed.utilities import Html, MD
+from slixfeed.utilities import DateAndTime, String, Url
+from slixfeed.utilities import Html, MD, String, Utilities
 from slixmpp.xmlstream import ET
 import sys
 from urllib.parse import urlsplit
@@ -274,8 +274,7 @@ class Feed:
         while True:
             feed_id = sqlite.get_feed_id(db_file, url)
             if not feed_id:
-                exist_identifier = sqlite.check_identifier_exist(db_file, identifier)
-                if not exist_identifier:
+                if not sqlite.check_identifier_exist(db_file, identifier):
                     result = await fetch.http(url)
                     message = result['message']
                     status_code = result['status_code']
@@ -336,8 +335,17 @@ class Feed:
                             db_file, feed_id, feed_properties)
                         feed_id = sqlite.get_feed_id(db_file, url)
                         feed_id = feed_id[0]
-                        new_entries = Feed.get_properties_of_entries(
-                            jid_bare, db_file, url, feed_id, feed)
+                        new_entries = []
+                        for entry in feed.entries:
+                            if entry.has_key("link"):
+                                entry_link = Url.join_url(url, entry.link)
+                                entry_link = Url.trim_url(entry_link)
+                                entry_identifier = String.md5_hash(entry_link)
+                                if not sqlite.get_entry_id_by_identifier(
+                                    db_file, entry_identifier):
+                                    new_entry = Feed.get_properties_of_entry(
+                                        url, entry_identifier, entry)
+                                    new_entries.extend([new_entry])
                         if new_entries:
                             await sqlite.add_entries_and_update_feed_state(
                                 db_file, feed_id, new_entries)
@@ -387,8 +395,7 @@ class Feed:
                                  'identifier' : None}
                 break
             else:
-                ix = exist_identifier[1]
-                identifier = exist_identifier[2]
+                ix = sqlite.get_entry_id_by_identifier(db_file, identifier)
                 message = ('Identifier "{}" is already allocated.'
                            .format(identifier))
                 result_final = {'link' : url,
@@ -517,14 +524,14 @@ class Feed:
     # NOTE This function is not being utilized
     async def download_feed(self, db_file, feed_url):
         """
-        Get feed content.
+        Process feed content.
 
         Parameters
         ----------
         db_file : str
             Path to database file.
-        url : str, optional
-            URL.
+        feed_url : str
+            URL of feed.
         """
         function_name = sys._getframe().f_code.co_name
         logger.debug('{}: db_file: {} url: {}'
@@ -554,7 +561,15 @@ class Feed:
         feed_encoding = feed.encoding if feed.has_key('encoding') else ''
         feed_language = feed.feed.language if feed.feed.has_key('language') else ''
         feed_icon = feed.feed.icon if feed.feed.has_key('icon') else ''
-        feed_image = feed.feed.image.href if feed.feed.has_key('image') else ''
+        # (Pdb) feed.feed.image
+        # {}
+        # (Pdb) feed.version
+        # 'rss10'
+        # (Pdb) feed.feed.image
+        # {'links': [{'rel': 'alternate', 'type': 'text/html'}]}
+        # (Pdb) feed.version
+        # ''
+        feed_image = feed.feed.image.href if feed.feed.has_key('image') and feed.feed.image.has_key('href') else ''
         feed_logo = feed.feed.logo if feed.feed.has_key('logo') else ''
         feed_ttl = feed.feed.ttl if feed.feed.has_key('ttl') else ''
@ -576,248 +591,236 @@ class Feed:
# TODO get all active feeds of active accounts and scan the feed with the earliest scanned time # TODO get all active feeds of active accounts and scan the feed with the earliest scanned time
# TODO Rename function name (idea: scan_and_populate) # TODO Rename function name (idea: scan_and_populate)
def get_properties_of_entries(jid_bare, db_file, feed_url, feed_id, feed): def get_properties_of_entry(feed_url, entry_identifier, entry):
""" """
Get new entries. Process entry content.
Parameters Parameters
---------- ----------
db_file : str feed_url : str
Path to database file. URL of feed.
url : str, optional entry :
URL. Object of entry.
""" """
# print('MID', feed_url, jid_bare, 'get_properties_of_entries')
function_name = sys._getframe().f_code.co_name function_name = sys._getframe().f_code.co_name
logger.debug('{}: feed_id: {} url: {}' logger.debug('{} feed_url: {}'
.format(function_name, feed_id, feed_url)) .format(function_name, feed_url))
new_entries = [] read_status = 0
for entry in feed.entries: if entry.has_key("published"):
logger.debug('{}: entry: {}'.format(function_name, entry.link)) entry_published = entry.published
if entry.has_key("published"): entry_published = DateAndTime.rfc2822_to_iso8601(entry_published)
entry_published = entry.published else:
entry_published = DateAndTime.rfc2822_to_iso8601(entry_published) entry_published = ''
else: if entry.has_key("updated"):
entry_published = '' entry_updated = entry.updated
if entry.has_key("updated"): entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated)
entry_updated = entry.updated else:
entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated) entry_updated = DateAndTime.now()
else: if entry.has_key("link"):
entry_updated = DateAndTime.now() # link = complete_url(source, entry.link)
if entry.has_key("link"): entry_link = Url.join_url(feed_url, entry.link)
# link = complete_url(source, entry.link) entry_link = Url.trim_url(entry_link)
entry_link = Url.join_url(feed_url, entry.link) else:
entry_link = Url.trim_url(entry_link) entry_link = feed_url
else: # title = feed["feed"]["title"]
entry_link = feed_url # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
# title = feed["feed"]["title"] entry_title = entry.title if entry.has_key("title") else entry_published
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title) # entry_id = entry.id if entry.has_key("id") else entry_link
entry_title = entry.title if entry.has_key("title") else entry_published # # Filter
entry_id = entry.id if entry.has_key("id") else entry_link # pathname = urlsplit(link).path
exist = sqlite.check_entry_exist(db_file, feed_id, # string = (
identifier=entry_id, # "{} {} {}"
title=entry_title, # ).format(
link=entry_link, # title, summary, pathname)
published=entry_published) # if self.settings['default']['filter']:
if not exist: # print('Filter is now processing data.')
read_status = 0 # allow_list = config.is_include_keyword(db_file,
# # Filter # "allow", string)
# pathname = urlsplit(link).path # if not allow_list:
# string = ( # reject_list = config.is_include_keyword(db_file,
# "{} {} {}" # "deny",
# ).format( # string)
# title, summary, pathname) # if reject_list:
# if self.settings['default']['filter']: # read_status = 1
# print('Filter is now processing data.') # logger.debug('Rejected : {}'
# allow_list = config.is_include_keyword(db_file, # '\n'
# "allow", string) # 'Keyword : {}'
# if not allow_list: # .format(link, reject_list))
# reject_list = config.is_include_keyword(db_file, if isinstance(entry_published, int):
# "deny", logger.error('Variable "published" is int: {}'.format(entry_published))
# string) if isinstance(entry_updated, int):
# if reject_list: logger.error('Variable "updated" is int: {}'.format(entry_updated))
# read_status = 1
# logger.debug('Rejected : {}' # Authors
# '\n' entry_authors =[]
# 'Keyword : {}' if entry.has_key('authors'):
# .format(link, reject_list)) for author in entry.authors:
if isinstance(entry_published, int): author_properties = {
logger.error('Variable "published" is int: {}'.format(entry_published)) 'name' : author.name if author.has_key('name') else '',
if isinstance(entry_updated, int): 'url' : author.href if author.has_key('href') else '',
logger.error('Variable "updated" is int: {}'.format(entry_updated)) 'email' : author.email if author.has_key('email') else '',
# Authors
entry_authors =[]
if entry.has_key('authors'):
for author in entry.authors:
author_properties = {
'name' : author.name if author.has_key('name') else '',
'url' : author.href if author.has_key('href') else '',
'email' : author.email if author.has_key('email') else '',
}
entry_authors.extend([author_properties])
elif entry.has_key('author_detail'):
author_properties = {
'name' : entry.author_detail.name if entry.author_detail.has_key('name') else '',
'url' : entry.author_detail.href if entry.author_detail.has_key('href') else '',
'email' : entry.author_detail.email if entry.author_detail.has_key('email') else '',
}
entry_authors.extend([author_properties])
elif entry.has_key('author'):
author_properties = {
'name' : entry.author,
'url' : '',
'email' : '',
}
entry_authors.extend([author_properties])
# Contributors
entry_contributors = []
if entry.has_key('contributors'):
for contributor in entry.contributors:
contributor_properties = {
'name' : contributor.name if contributor.has_key('name') else '',
'url' : contributor.href if contributor.has_key('href') else '',
'email' : contributor.email if contributor.has_key('email') else '',
}
entry_contributors.extend([contributor_properties])
# Tags
entry_tags = []
if entry.has_key('tags'):
for tag in entry.tags:
tag_properties = {
'term' : tag.term if tag.has_key('term') else '',
'scheme' : tag.scheme if tag.has_key('scheme') else '',
'label' : tag.label if tag.has_key('label') else '',
}
entry_tags.extend([tag_properties])
# Content
entry_contents = []
if entry.has_key('content'):
for content in entry.content:
text = content.value if content.has_key('value') else ''
type = content.type if content.has_key('type') else ''
lang = content.lang if content.has_key('lang') else ''
base = content.base if content.has_key('base') else ''
entry_content = {
'text' : text,
'lang' : lang,
'type' : type,
'base' : base,
}
entry_contents.extend([entry_content])
# Links and Enclosures
entry_links = []
if entry.has_key('links'):
for link in entry.links:
link_properties = {
'url' : link.href if link.has_key('href') else '',
'rel' : link.rel if link.has_key('rel') else '',
'type' : link.type if link.has_key('type') else '',
'length' : '',
}
entry_links.extend([link_properties])
# Element media:content is utilized by Mastodon
if entry.has_key('media_content'):
for link in entry.media_content:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : link['type'] if 'type' in link else '',
# 'medium' : link['medium'] if 'medium' in link else '',
'length' : link['filesize'] if 'filesize' in link else '',
}
entry_links.extend([link_properties])
if entry.has_key('media_thumbnail'):
for link in entry.media_thumbnail:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : '',
# 'medium' : 'image',
'length' : '',
}
entry_links.extend([link_properties])
# Category
entry_category = entry.category if entry.has_key('category') else ''
# Comments
entry_comments = entry.comments if entry.has_key('comments') else ''
# href
entry_href = entry.href if entry.has_key('href') else ''
# Link: Same as entry.links[0].href in most if not all cases
entry_link = entry.link if entry.has_key('link') else ''
# Rating
entry_rating = entry.rating if entry.has_key('rating') else ''
# Summary
entry_summary_text = entry.summary if entry.has_key('summary') else ''
if entry.has_key('summary_detail'):
entry_summary_type = entry.summary_detail.type if entry.summary_detail.has_key('type') else ''
entry_summary_lang = entry.summary_detail.lang if entry.summary_detail.has_key('lang') else ''
entry_summary_base = entry.summary_detail.base if entry.summary_detail.has_key('base') else ''
else:
entry_summary_type = ''
entry_summary_lang = ''
entry_summary_base = ''
# Title
entry_title = entry.title if entry.has_key('title') else ''
if entry.has_key('title_detail'):
entry_title_type = entry.title_detail.type if entry.title_detail.has_key('type') else ''
else:
entry_title_type = ''
###########################################################
# media_type = e_link.type[:e_link.type.index("/")]
# if (e_link.rel == "enclosure" and
# media_type in ("audio", "image", "video")):
# media_link = e_link.href
# media_link = Url.join_url(url, e_link.href)
# media_link = Url.trim_url(media_link)
###########################################################
entry_properties = {
"identifier": entry_id,
"link": entry_link,
"href": entry_href,
"title": entry_title,
"title_type": entry_title_type,
'summary_text' : entry_summary_text,
'summary_lang' : entry_summary_lang,
'summary_type' : entry_summary_type,
'summary_base' : entry_summary_base,
'category' : entry_category,
"comments": entry_comments,
"rating": entry_rating,
"published": entry_published,
"updated": entry_updated,
"read_status": read_status
} }
entry_authors.extend([author_properties])
new_entries.extend([{ elif entry.has_key('author_detail'):
"entry_properties" : entry_properties, author_properties = {
"entry_authors" : entry_authors, 'name' : entry.author_detail.name if entry.author_detail.has_key('name') else '',
"entry_contributors" : entry_contributors, 'url' : entry.author_detail.href if entry.author_detail.has_key('href') else '',
"entry_contents" : entry_contents, 'email' : entry.author_detail.email if entry.author_detail.has_key('email') else '',
"entry_links" : entry_links, }
"entry_tags" : entry_tags entry_authors.extend([author_properties])
}]) elif entry.has_key('author'):
# await sqlite.add_entry( author_properties = {
# db_file, title, link, entry_id, 'name' : entry.author,
# url, date, read_status) 'url' : '',
# await sqlite.set_date(db_file, url) 'email' : '',
return new_entries }
entry_authors.extend([author_properties])
# Contributors
entry_contributors = []
if entry.has_key('contributors'):
for contributor in entry.contributors:
contributor_properties = {
'name' : contributor.name if contributor.has_key('name') else '',
'url' : contributor.href if contributor.has_key('href') else '',
'email' : contributor.email if contributor.has_key('email') else '',
}
entry_contributors.extend([contributor_properties])
# Tags
entry_tags = []
if entry.has_key('tags'):
for tag in entry.tags:
tag_properties = {
'term' : tag.term if tag.has_key('term') else '',
'scheme' : tag.scheme if tag.has_key('scheme') else '',
'label' : tag.label if tag.has_key('label') else '',
}
entry_tags.extend([tag_properties])
# Content
entry_contents = []
if entry.has_key('content'):
for content in entry.content:
text = content.value if content.has_key('value') else ''
type = content.type if content.has_key('type') else ''
lang = content.lang if content.has_key('lang') else ''
base = content.base if content.has_key('base') else ''
entry_content = {
'text' : text,
'lang' : lang,
'type' : type,
'base' : base,
}
entry_contents.extend([entry_content])
# Links and Enclosures
entry_links = []
if entry.has_key('links'):
for link in entry.links:
link_properties = {
'url' : link.href if link.has_key('href') else '',
'rel' : link.rel if link.has_key('rel') else '',
'type' : link.type if link.has_key('type') else '',
'length' : '',
}
entry_links.extend([link_properties])
# Element media:content is utilized by Mastodon
if entry.has_key('media_content'):
for link in entry.media_content:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : link['type'] if 'type' in link else '',
# 'medium' : link['medium'] if 'medium' in link else '',
'length' : link['filesize'] if 'filesize' in link else '',
}
entry_links.extend([link_properties])
if entry.has_key('media_thumbnail'):
for link in entry.media_thumbnail:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : '',
# 'medium' : 'image',
'length' : '',
}
entry_links.extend([link_properties])
# Category
entry_category = entry.category if entry.has_key('category') else ''
# Comments
entry_comments = entry.comments if entry.has_key('comments') else ''
# href
entry_href = entry.href if entry.has_key('href') else ''
# Link: Same as entry.links[0].href in most if not all cases
entry_link = entry.link if entry.has_key('link') else ''
# Rating
entry_rating = entry.rating if entry.has_key('rating') else ''
# Summary
entry_summary_text = entry.summary if entry.has_key('summary') else ''
if entry.has_key('summary_detail'):
entry_summary_type = entry.summary_detail.type if entry.summary_detail.has_key('type') else ''
entry_summary_lang = entry.summary_detail.lang if entry.summary_detail.has_key('lang') else ''
entry_summary_base = entry.summary_detail.base if entry.summary_detail.has_key('base') else ''
else:
entry_summary_type = ''
entry_summary_lang = ''
entry_summary_base = ''
# Title
entry_title = entry.title if entry.has_key('title') else ''
if entry.has_key('title_detail'):
entry_title_type = entry.title_detail.type if entry.title_detail.has_key('type') else ''
else:
entry_title_type = ''
###########################################################
# media_type = e_link.type[:e_link.type.index("/")]
# if (e_link.rel == "enclosure" and
# media_type in ("audio", "image", "video")):
# media_link = e_link.href
# media_link = Url.join_url(url, e_link.href)
# media_link = Url.trim_url(media_link)
###########################################################
entry_properties = {
"identifier": entry_identifier,
"link": entry_link,
"href": entry_href,
"title": entry_title,
"title_type": entry_title_type,
'summary_text' : entry_summary_text,
'summary_lang' : entry_summary_lang,
'summary_type' : entry_summary_type,
'summary_base' : entry_summary_base,
'category' : entry_category,
"comments": entry_comments,
"rating": entry_rating,
"published": entry_published,
"updated": entry_updated,
"read_status": read_status}
new_entry = {
"entry_properties" : entry_properties,
"entry_authors" : entry_authors,
"entry_contributors" : entry_contributors,
"entry_contents" : entry_contents,
"entry_links" : entry_links,
"entry_tags" : entry_tags}
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
return new_entry
""" """
@@ -1277,7 +1280,6 @@ class FeedTask:
             urls = sqlite.get_active_feeds_url(db_file)
             for url in urls:
                 url = url[0]
-                print('start scan\nurl {}\ndatabase {}'.format(url, db_file))
                 # print('STA',url)
 
                 # # Skip Reddit
@@ -1291,6 +1293,19 @@ class FeedTask:
                 feed_id = sqlite.get_feed_id(db_file, url)
                 feed_id = feed_id[0]
                 if not result['error']:
+                    identifier = sqlite.get_feed_identifier(db_file, feed_id)
+                    identifier = identifier[0]
+                    if not identifier:
+                        counter = 0
+                        while True:
+                            identifier = String.generate_identifier(url, counter)
+                            if sqlite.check_identifier_exist(db_file, identifier):
+                                counter += 1
+                            else:
+                                break
+                        await sqlite.update_feed_identifier(db_file, feed_id, identifier)
+                    # identifier = sqlite.get_feed_identifier(db_file, feed_id)
+                    # identifier = identifier[0]
                     await sqlite.update_feed_status(db_file, feed_id, status_code)
                     document = result['content']
                     feed = parse(document)
@@ -1300,17 +1315,54 @@ class FeedTask:
                         db_file, feed_id, feed)
                     await sqlite.update_feed_properties(
                         db_file, feed_id, feed_properties)
-                    new_entries = Feed.get_properties_of_entries(
-                        jid_bare, db_file, url, feed_id, feed)
+                    new_entries = []
+                    for entry in feed.entries:
+                        if entry.has_key("link"):
+                            # link = complete_url(source, entry.link)
+                            entry_link = Url.join_url(url, entry.link)
+                            entry_link = Url.trim_url(entry_link)
+                            entry_identifier = String.md5_hash(entry_link)
+                            # if 'f-droid.org' in url:
+                            #     breakpoint()
+                            #     print(entry.link)
+                            #     print(entry_identifier)
+                            # Check if an entry identifier exists
+                            if not sqlite.get_entry_id_by_identifier(
+                                db_file, entry_identifier):
+                                new_entry = Feed.get_properties_of_entry(
+                                    url, entry_identifier, entry)
+                                # new_entries.append(new_entry)
+                                new_entries.extend([new_entry])
+                    print(url)
                     if new_entries:
                         await sqlite.add_entries_and_update_feed_state(db_file, feed_id, new_entries)
                     limit = Config.get_setting_value(self.settings, jid_bare, 'archive')
-                    ixs = sqlite.get_invalid_entries(db_file, url, feed)
-                    await sqlite.process_invalid_entries(db_file, ixs)
+                    ixs = sqlite.get_entries_id_of_feed(db_file, feed_id)
+                    ixs_invalid = {}
+                    for ix in ixs:
+                        ix = ix[0]
+                        read_status = sqlite.is_entry_read(db_file, ix)
+                        read_status = read_status[0]
+                        entry_identifier_local = sqlite.get_entry_identifier(db_file, ix)
+                        entry_identifier_local = entry_identifier_local[0]
+                        valid = False
+                        for entry in feed.entries:
+                            if entry.has_key("link"):
+                                entry_link = Url.join_url(url, entry.link)
+                                entry_link = Url.trim_url(entry_link)
+                                entry_identifier_external = Utilities.hash_url_to_md5(
+                                    entry_link)
+                                if entry_identifier_local == entry_identifier_external:
+                                    valid = True
+                                    continue
+                        if not valid: ixs_invalid[ix] = read_status
+                    if len(ixs_invalid):
+                        print('erasing {}/{}'.format(len(ixs_invalid), len(feed.entries)))
+                        await sqlite.process_invalid_entries(db_file, ixs_invalid)
+                    # TODO return number of archived entries and add if statement to run archive maintainence function
                     await sqlite.maintain_archive(db_file, limit)
                     # await sqlite.process_invalid_entries(db_file, ixs)
-                    print('end scan\nurl {}\ndatabase {}'.format(url, db_file))
-                    await asyncio.sleep(50)
+                    # await asyncio.sleep(50)
             val = Config.get_setting_value(self.settings, jid_bare, 'check')
             await asyncio.sleep(60 * float(val))
             # Schedule to call this function again in 90 minutes


@@ -40,14 +40,13 @@ TODO
 """
 from datetime import datetime
-from email.utils import parseaddr
 from dateutil.parser import parse
-from email.utils import parsedate, parsedate_to_datetime
+from email.utils import parseaddr, parsedate, parsedate_to_datetime
 import hashlib
-from lxml import etree, html
 import os
 import random
 import slixfeed.config as config
+from lxml import etree, html
 import slixfeed.dt as dt
 import slixfeed.fetch as fetch
 from slixfeed.log import Logger
@@ -681,9 +680,31 @@ class Url:
 
 
+class String:
+
+    def generate_identifier(url, counter):
+        hostname = Url.get_hostname(url)
+        hostname = hostname.replace('.','-')
+        identifier = hostname + ':' + str(counter)
+        return identifier
+
+    # string_to_md5_hash
+    # NOTE Warning: Entry might not have a link
+    # TODO Handle situation error
+    def md5_hash(url):
+        url_encoded = url.encode()
+        url_hashed = hashlib.md5(url_encoded)
+        url_digest = url_hashed.hexdigest()
+        return url_digest
+
+
 class Utilities:
 
-    # string_to_md5_hash
     # NOTE Warning: Entry might not have a link
     # TODO Handle situation error
     def hash_url_to_md5(url):


@@ -1,2 +1,2 @@
-__version__ = '0.1.82'
-__version_info__ = (0, 1, 82)
+__version__ = '0.1.83'
+__version_info__ = (0, 1, 83)


@@ -48,7 +48,7 @@ import slixfeed.fetch as fetch
 from slixfeed.log import Logger
 import slixfeed.sqlite as sqlite
 from slixfeed.syndication import Feed, FeedDiscovery, FeedTask, Opml
-from slixfeed.utilities import DateAndTime, Html, Task, Url, Utilities
+from slixfeed.utilities import DateAndTime, Html, String, Task, Url, Utilities
 from slixfeed.version import __version__
 from slixfeed.xmpp.bookmark import XmppBookmark
 from slixfeed.xmpp.chat import XmppChat, XmppChatTask
@@ -1776,12 +1776,10 @@ class XmppClient(slixmpp.ClientXMPP):
                     session['prev'] = None
                 # elif not identifier:
                 #     counter = 0
-                #     hostname = Url.get_hostname(url)
-                #     identifier = hostname + ':' + str(counter)
                 #     while True:
+                #         identifier = String.generate_identifier(url, counter)
                 #         if sqlite.check_identifier_exist(db_file, identifier):
                 #             counter += 1
-                #             identifier = hostname + ':' + str(counter)
                 #         else:
                 #             break
                 # Several URLs to subscribe
@@ -1793,12 +1791,10 @@ class XmppClient(slixmpp.ClientXMPP):
             exist_count = 0
             for url in urls:
                 counter = 0
-                hostname = Url.get_hostname(url)
-                identifier = hostname + ':' + str(counter)
                 while True:
+                    identifier = String.generate_identifier(url, counter)
                     if sqlite.check_identifier_exist(db_file, identifier):
                         counter += 1
-                        identifier = hostname + ':' + str(counter)
                     else:
                         break
                 result = await Feed.add_feed(self, jid_bare, db_file, url,
@@ -1826,12 +1822,10 @@ class XmppClient(slixmpp.ClientXMPP):
             if isinstance(url, list):
                 url = url[0]
             counter = 0
-            hostname = Url.get_hostname(url)
-            identifier = hostname + ':' + str(counter)
             while True:
+                identifier = String.generate_identifier(url, counter)
                 if sqlite.check_identifier_exist(db_file, identifier):
                     counter += 1
-                    identifier = hostname + ':' + str(counter)
                 else:
                     break
             result = await Feed.add_feed(self, jid_bare, db_file, url,


@@ -9,7 +9,7 @@ import slixfeed.fetch as fetch
 from slixfeed.log import Logger
 import slixfeed.sqlite as sqlite
 from slixfeed.syndication import Feed, FeedDiscovery, Opml
-from slixfeed.utilities import DateAndTime, Documentation, Url, Utilities
+from slixfeed.utilities import DateAndTime, Documentation, String, Url, Utilities
 from slixfeed.version import __version__
 from slixfeed.xmpp.bookmark import XmppBookmark
 from slixfeed.xmpp.muc import XmppMuc
@@ -119,18 +119,15 @@ class XmppCommands:
         if url.startswith('http'):
             if not title:
                 title = Url.get_hostname(url)
-            counter = 0
-            hostname = Url.get_hostname(url)
-            hostname = hostname.replace('.','-')
-            identifier = hostname + ':' + str(counter)
-            while True:
-                if sqlite.check_identifier_exist(db_file, identifier):
-                    counter += 1
-                    identifier = hostname + ':' + str(counter)
-                else:
-                    break
             exist = sqlite.get_feed_id_and_name(db_file, url)
             if not exist:
+                counter = 0
+                while True:
+                    identifier = String.generate_identifier(url, counter)
+                    if sqlite.check_identifier_exist(db_file, identifier):
+                        counter += 1
+                    else:
+                        break
                 await sqlite.insert_feed(db_file, url, title,
                                          identifier)
                 feed_id = sqlite.get_feed_id(db_file, url)
@@ -157,8 +154,17 @@ class XmppCommands:
                 feed_properties)
             feed_id = sqlite.get_feed_id(db_file, url)
             feed_id = feed_id[0]
-            new_entries = Feed.get_properties_of_entries(
-                jid_bare, db_file, url, feed_id, feed)
+            new_entries = []
+            for entry in feed.entries:
+                if entry.has_key("link"):
+                    entry_link = Url.join_url(url, entry.link)
+                    entry_link = Url.trim_url(entry_link)
+                    entry_identifier = String.md5_hash(entry_link)
+                    if not sqlite.get_entry_id_by_identifier(
+                        db_file, entry_identifier):
+                        new_entry = Feed.get_properties_of_entry(
+                            url, entry_identifier, entry)
+                        new_entries.extend([new_entry])
             if new_entries:
                 await sqlite.add_entries_and_update_feed_state(
                     db_file, feed_id, new_entries)
@@ -390,14 +396,11 @@ class XmppCommands:
                 identifier = info[2]
             else:
                 counter = 0
-                hostname = Url.get_hostname(url)
-                hostname = hostname.replace('.','-')
-                identifier = hostname + ':' + str(counter)
                 while True:
+                    identifier = String.generate_identifier(url, counter)
                     if sqlite.check_identifier_exist(
                         db_file, identifier):
                         counter += 1
-                        identifier = hostname + ':' + str(counter)
                     else:
                         break
         # task.clean_tasks_xmpp_chat(self, jid_bare, ['status'])
@@ -479,13 +482,10 @@ class XmppCommands:
             url = Url.feed_to_http(url)
             url = (await Url.replace_hostname(url, 'feed')) or url
             counter = 0
-            hostname = Url.get_hostname(url)
-            hostname = hostname.replace('.','-')
-            identifier = hostname + ':' + str(counter)
             while True:
+                identifier = String.generate_identifier(url, counter)
                 if sqlite.check_identifier_exist(db_file, identifier):
                     counter += 1
-                    identifier = hostname + ':' + str(counter)
                 else:
                     break
             # try:


@@ -16,7 +16,7 @@ from slixfeed.config import Config
 from slixfeed.log import Logger
 import slixfeed.sqlite as sqlite
 from slixfeed.syndication import Feed
-from slixfeed.utilities import Url, Utilities
+from slixfeed.utilities import String, Url, Utilities
 from slixfeed.xmpp.iq import XmppIQ
 import sys
@@ -336,13 +336,10 @@ class XmppPubsubAction:
             node_id = node_id[0]
             if not node_id:
                 counter = 0
-                hostname = Url.get_hostname(url)
-                hostname = hostname.replace('.','-')
-                identifier = hostname + ':' + str(counter)
                 while True:
+                    identifier = String.generate_identifier(url, counter)
                     if sqlite.check_identifier_exist(db_file, identifier):
                         counter += 1
-                        identifier = hostname + ':' + str(counter)
                     else:
                         break
                 await sqlite.update_feed_identifier(db_file, feed_id, identifier)