Fix new entry selection (Thank you lorenzo and roughnecks);

Fix handling of RDF documents which caused the feed scan to halt;
Ignore items without a link (Thank you Kris);
Set entry identifier as MD5 hash of link (Thank you TheCoffeMaker).
This commit is contained in:
Schimon Jehudah, Adv. 2024-06-17 17:25:24 +03:00
parent 93ea8a9fab
commit 5babb02cf8
7 changed files with 484 additions and 297 deletions
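As a quick illustration of the identifier scheme this commit introduces (a minimal sketch only, not the exact code paths of the diff below; Url.join_url, Url.trim_url, String.md5_hash and sqlite.get_entry_id_by_identifier are names from the diff, while entry_identifier and the use of urljoin here are hypothetical stand-ins): each entry link is resolved against the feed URL, trimmed, and hashed with MD5, and new-entry selection then reduces to a lookup of that hash among stored entries.

import hashlib
from urllib.parse import urljoin

def entry_identifier(feed_url, entry_link):
    # Resolve a possibly relative entry link against the feed URL, then hash it.
    # In Slixfeed this roughly corresponds to Url.join_url + Url.trim_url + String.md5_hash.
    absolute_link = urljoin(feed_url, entry_link)
    return hashlib.md5(absolute_link.encode()).hexdigest()

# Selection sketch: entries without a link are ignored, and only entries whose
# identifier is not yet stored are collected as new.
#
# for entry in feed.entries:
#     if not entry.get('link'):
#         continue
#     identifier = entry_identifier(url, entry['link'])
#     if not sqlite.get_entry_id_by_identifier(db_file, identifier):
#         new_entries.append(Feed.get_properties_of_entry(url, identifier, entry))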

View file

@ -2226,9 +2226,33 @@ async def add_entries_and_update_feed_state(db_file, feed_id, new_entries):
"""
INSERT
INTO entries_properties(
feed_id, identifier, link, title, title_type, summary_text, summary_lang, summary_type, summary_base, category, comments, published, updated)
feed_id,
identifier,
link,
title,
title_type,
summary_text,
summary_lang,
summary_type,
summary_base,
category,
comments,
published,
updated)
VALUES(
:feed_id, :identifier, :link, :title, :title_type, :summary_text, :summary_lang, :summary_type, :summary_base, :category, :comments, :published, :updated)
:feed_id,
:identifier,
:link,
:title,
:title_type,
:summary_text,
:summary_lang,
:summary_type,
:summary_base,
:category,
:comments,
:published,
:updated)
"""
)
entry_properties = new_entry['entry_properties']
@ -2847,6 +2871,35 @@ def get_entries_of_feed(db_file, feed_id):
return items
def get_entries_id_of_feed(db_file, feed_id):
"""
Get IDs of entries of given feed.
Parameters
----------
db_file : str
Path to database file.
feed_id : str
Feed Id.
"""
function_name = sys._getframe().f_code.co_name
logger.debug('{} db_file: {} feed_id: {}'
.format(function_name, db_file, feed_id))
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT id
FROM entries_properties
WHERE feed_id = ?
ORDER BY published DESC
"""
)
par = (feed_id,)
items = cur.execute(sql, par).fetchall()
return items
# TODO What is this function for? 2024-01-02
# def get_feeds(db_file):
# """
@ -3231,9 +3284,9 @@ def check_entry_exist(db_file, feed_id, identifier=None, title=None, link=None,
function_name = sys._getframe().f_code.co_name
logger.debug('{}: db_file: {} feed_id: {}'
.format(function_name, db_file, feed_id))
exist = False
with create_connection(db_file) as conn:
cur = conn.cursor()
exist = False
if identifier:
sql = (
"""
@ -3291,6 +3344,76 @@ def check_entry_exist(db_file, feed_id, identifier=None, title=None, link=None,
return exist
def get_entry_id_by_identifier(db_file, identifier):
"""
Get entry ID by its identifier.
Parameters
----------
db_file : str
Path to database file.
identifier : str
Entry identifier.
Returns
-------
result : tuple
Entry ID or None.
"""
function_name = sys._getframe().f_code.co_name
logger.debug('{}: db_file: {} identifier: {}'
.format(function_name, db_file, identifier))
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT id
FROM entries_properties
WHERE identifier = :identifier
"""
)
par = {
"identifier": identifier
}
result = cur.execute(sql, par).fetchone()
return result
def get_entry_identifier(db_file, ix):
"""
Get the identifier of an entry by its entry ID.
Parameters
----------
db_file : str
Path to database file.
ix : str
Entry ID.
Returns
-------
result : tuple
Entry identifier or None.
"""
function_name = sys._getframe().f_code.co_name
logger.debug('{}: db_file: {} ix: {}'
.format(function_name, db_file, ix))
with create_connection(db_file) as conn:
cur = conn.cursor()
sql = (
"""
SELECT identifier
FROM entries_properties
WHERE id = :ix
"""
)
par = {
"ix": ix
}
result = cur.execute(sql, par).fetchone()
return result
async def set_setting_value(db_file, key_value):
"""
Set setting value.

View file

@ -32,8 +32,8 @@ from slixfeed.config import Config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.utilities import DateAndTime, Url
from slixfeed.utilities import Html, MD
from slixfeed.utilities import DateAndTime, String, Url
from slixfeed.utilities import Html, MD, String, Utilities
from slixmpp.xmlstream import ET
import sys
from urllib.parse import urlsplit
@ -274,8 +274,7 @@ class Feed:
while True:
feed_id = sqlite.get_feed_id(db_file, url)
if not feed_id:
exist_identifier = sqlite.check_identifier_exist(db_file, identifier)
if not exist_identifier:
if not sqlite.check_identifier_exist(db_file, identifier):
result = await fetch.http(url)
message = result['message']
status_code = result['status_code']
@ -336,8 +335,17 @@ class Feed:
db_file, feed_id, feed_properties)
feed_id = sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
new_entries = Feed.get_properties_of_entries(
jid_bare, db_file, url, feed_id, feed)
new_entries = []
for entry in feed.entries:
if entry.has_key("link"):
entry_link = Url.join_url(url, entry.link)
entry_link = Url.trim_url(entry_link)
entry_identifier = String.md5_hash(entry_link)
if not sqlite.get_entry_id_by_identifier(
db_file, entry_identifier):
new_entry = Feed.get_properties_of_entry(
url, entry_identifier, entry)
new_entries.extend([new_entry])
if new_entries:
await sqlite.add_entries_and_update_feed_state(
db_file, feed_id, new_entries)
@ -387,8 +395,7 @@ class Feed:
'identifier' : None}
break
else:
ix = exist_identifier[1]
identifier = exist_identifier[2]
ix = sqlite.get_entry_id_by_identifier(db_file, identifier)
message = ('Identifier "{}" is already allocated.'
.format(identifier))
result_final = {'link' : url,
@ -517,14 +524,14 @@ class Feed:
# NOTE This function is not being utilized
async def download_feed(self, db_file, feed_url):
"""
Get feed content.
Process feed content.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL.
feed_url : str
URL of feed.
"""
function_name = sys._getframe().f_code.co_name
logger.debug('{}: db_file: {} url: {}'
@ -554,7 +561,15 @@ class Feed:
feed_encoding = feed.encoding if feed.has_key('encoding') else ''
feed_language = feed.feed.language if feed.feed.has_key('language') else ''
feed_icon = feed.feed.icon if feed.feed.has_key('icon') else ''
feed_image = feed.feed.image.href if feed.feed.has_key('image') else ''
# (Pdb) feed.feed.image
# {}
# (Pdb) feed.version
# 'rss10'
# (Pdb) feed.feed.image
# {'links': [{'rel': 'alternate', 'type': 'text/html'}]}
# (Pdb) feed.version
# ''
feed_image = feed.feed.image.href if feed.feed.has_key('image') and feed.feed.image.has_key('href') else ''
feed_logo = feed.feed.logo if feed.feed.has_key('logo') else ''
feed_ttl = feed.feed.ttl if feed.feed.has_key('ttl') else ''
@ -576,248 +591,236 @@ class Feed:
# TODO get all active feeds of active accounts and scan the feed with the earliest scanned time
# TODO Rename function name (idea: scan_and_populate)
def get_properties_of_entries(jid_bare, db_file, feed_url, feed_id, feed):
def get_properties_of_entry(feed_url, entry_identifier, entry):
"""
Get new entries.
Process entry content.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL.
feed_url : str
URL of feed.
entry_identifier : str
Entry identifier (MD5 hash of entry link).
entry : dict
Object of entry.
"""
# print('MID', feed_url, jid_bare, 'get_properties_of_entries')
function_name = sys._getframe().f_code.co_name
logger.debug('{}: feed_id: {} url: {}'
.format(function_name, feed_id, feed_url))
logger.debug('{} feed_url: {}'
.format(function_name, feed_url))
new_entries = []
for entry in feed.entries:
logger.debug('{}: entry: {}'.format(function_name, entry.link))
if entry.has_key("published"):
entry_published = entry.published
entry_published = DateAndTime.rfc2822_to_iso8601(entry_published)
else:
entry_published = ''
if entry.has_key("updated"):
entry_updated = entry.updated
entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated)
else:
entry_updated = DateAndTime.now()
if entry.has_key("link"):
# link = complete_url(source, entry.link)
entry_link = Url.join_url(feed_url, entry.link)
entry_link = Url.trim_url(entry_link)
else:
entry_link = feed_url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
entry_title = entry.title if entry.has_key("title") else entry_published
entry_id = entry.id if entry.has_key("id") else entry_link
exist = sqlite.check_entry_exist(db_file, feed_id,
identifier=entry_id,
title=entry_title,
link=entry_link,
published=entry_published)
if not exist:
read_status = 0
# # Filter
# pathname = urlsplit(link).path
# string = (
# "{} {} {}"
# ).format(
# title, summary, pathname)
# if self.settings['default']['filter']:
# print('Filter is now processing data.')
# allow_list = config.is_include_keyword(db_file,
# "allow", string)
# if not allow_list:
# reject_list = config.is_include_keyword(db_file,
# "deny",
# string)
# if reject_list:
# read_status = 1
# logger.debug('Rejected : {}'
# '\n'
# 'Keyword : {}'
# .format(link, reject_list))
if isinstance(entry_published, int):
logger.error('Variable "published" is int: {}'.format(entry_published))
if isinstance(entry_updated, int):
logger.error('Variable "updated" is int: {}'.format(entry_updated))
# Authors
entry_authors =[]
if entry.has_key('authors'):
for author in entry.authors:
author_properties = {
'name' : author.name if author.has_key('name') else '',
'url' : author.href if author.has_key('href') else '',
'email' : author.email if author.has_key('email') else '',
}
entry_authors.extend([author_properties])
elif entry.has_key('author_detail'):
author_properties = {
'name' : entry.author_detail.name if entry.author_detail.has_key('name') else '',
'url' : entry.author_detail.href if entry.author_detail.has_key('href') else '',
'email' : entry.author_detail.email if entry.author_detail.has_key('email') else '',
}
entry_authors.extend([author_properties])
elif entry.has_key('author'):
author_properties = {
'name' : entry.author,
'url' : '',
'email' : '',
}
entry_authors.extend([author_properties])
# Contributors
entry_contributors = []
if entry.has_key('contributors'):
for contributor in entry.contributors:
contributor_properties = {
'name' : contributor.name if contributor.has_key('name') else '',
'url' : contributor.href if contributor.has_key('href') else '',
'email' : contributor.email if contributor.has_key('email') else '',
}
entry_contributors.extend([contributor_properties])
# Tags
entry_tags = []
if entry.has_key('tags'):
for tag in entry.tags:
tag_properties = {
'term' : tag.term if tag.has_key('term') else '',
'scheme' : tag.scheme if tag.has_key('scheme') else '',
'label' : tag.label if tag.has_key('label') else '',
}
entry_tags.extend([tag_properties])
# Content
entry_contents = []
if entry.has_key('content'):
for content in entry.content:
text = content.value if content.has_key('value') else ''
type = content.type if content.has_key('type') else ''
lang = content.lang if content.has_key('lang') else ''
base = content.base if content.has_key('base') else ''
entry_content = {
'text' : text,
'lang' : lang,
'type' : type,
'base' : base,
}
entry_contents.extend([entry_content])
# Links and Enclosures
entry_links = []
if entry.has_key('links'):
for link in entry.links:
link_properties = {
'url' : link.href if link.has_key('href') else '',
'rel' : link.rel if link.has_key('rel') else '',
'type' : link.type if link.has_key('type') else '',
'length' : '',
}
entry_links.extend([link_properties])
# Element media:content is utilized by Mastodon
if entry.has_key('media_content'):
for link in entry.media_content:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : link['type'] if 'type' in link else '',
# 'medium' : link['medium'] if 'medium' in link else '',
'length' : link['filesize'] if 'filesize' in link else '',
}
entry_links.extend([link_properties])
if entry.has_key('media_thumbnail'):
for link in entry.media_thumbnail:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : '',
# 'medium' : 'image',
'length' : '',
}
entry_links.extend([link_properties])
# Category
entry_category = entry.category if entry.has_key('category') else ''
# Comments
entry_comments = entry.comments if entry.has_key('comments') else ''
# href
entry_href = entry.href if entry.has_key('href') else ''
# Link: Same as entry.links[0].href in most if not all cases
entry_link = entry.link if entry.has_key('link') else ''
# Rating
entry_rating = entry.rating if entry.has_key('rating') else ''
# Summary
entry_summary_text = entry.summary if entry.has_key('summary') else ''
if entry.has_key('summary_detail'):
entry_summary_type = entry.summary_detail.type if entry.summary_detail.has_key('type') else ''
entry_summary_lang = entry.summary_detail.lang if entry.summary_detail.has_key('lang') else ''
entry_summary_base = entry.summary_detail.base if entry.summary_detail.has_key('base') else ''
else:
entry_summary_type = ''
entry_summary_lang = ''
entry_summary_base = ''
# Title
entry_title = entry.title if entry.has_key('title') else ''
if entry.has_key('title_detail'):
entry_title_type = entry.title_detail.type if entry.title_detail.has_key('type') else ''
else:
entry_title_type = ''
###########################################################
# media_type = e_link.type[:e_link.type.index("/")]
# if (e_link.rel == "enclosure" and
# media_type in ("audio", "image", "video")):
# media_link = e_link.href
# media_link = Url.join_url(url, e_link.href)
# media_link = Url.trim_url(media_link)
###########################################################
entry_properties = {
"identifier": entry_id,
"link": entry_link,
"href": entry_href,
"title": entry_title,
"title_type": entry_title_type,
'summary_text' : entry_summary_text,
'summary_lang' : entry_summary_lang,
'summary_type' : entry_summary_type,
'summary_base' : entry_summary_base,
'category' : entry_category,
"comments": entry_comments,
"rating": entry_rating,
"published": entry_published,
"updated": entry_updated,
"read_status": read_status
read_status = 0
if entry.has_key("published"):
entry_published = entry.published
entry_published = DateAndTime.rfc2822_to_iso8601(entry_published)
else:
entry_published = ''
if entry.has_key("updated"):
entry_updated = entry.updated
entry_updated = DateAndTime.rfc2822_to_iso8601(entry_updated)
else:
entry_updated = DateAndTime.now()
if entry.has_key("link"):
# link = complete_url(source, entry.link)
entry_link = Url.join_url(feed_url, entry.link)
entry_link = Url.trim_url(entry_link)
else:
entry_link = feed_url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
entry_title = entry.title if entry.has_key("title") else entry_published
# entry_id = entry.id if entry.has_key("id") else entry_link
# # Filter
# pathname = urlsplit(link).path
# string = (
# "{} {} {}"
# ).format(
# title, summary, pathname)
# if self.settings['default']['filter']:
# print('Filter is now processing data.')
# allow_list = config.is_include_keyword(db_file,
# "allow", string)
# if not allow_list:
# reject_list = config.is_include_keyword(db_file,
# "deny",
# string)
# if reject_list:
# read_status = 1
# logger.debug('Rejected : {}'
# '\n'
# 'Keyword : {}'
# .format(link, reject_list))
if isinstance(entry_published, int):
logger.error('Variable "published" is int: {}'.format(entry_published))
if isinstance(entry_updated, int):
logger.error('Variable "updated" is int: {}'.format(entry_updated))
# Authors
entry_authors =[]
if entry.has_key('authors'):
for author in entry.authors:
author_properties = {
'name' : author.name if author.has_key('name') else '',
'url' : author.href if author.has_key('href') else '',
'email' : author.email if author.has_key('email') else '',
}
new_entries.extend([{
"entry_properties" : entry_properties,
"entry_authors" : entry_authors,
"entry_contributors" : entry_contributors,
"entry_contents" : entry_contents,
"entry_links" : entry_links,
"entry_tags" : entry_tags
}])
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
return new_entries
entry_authors.extend([author_properties])
elif entry.has_key('author_detail'):
author_properties = {
'name' : entry.author_detail.name if entry.author_detail.has_key('name') else '',
'url' : entry.author_detail.href if entry.author_detail.has_key('href') else '',
'email' : entry.author_detail.email if entry.author_detail.has_key('email') else '',
}
entry_authors.extend([author_properties])
elif entry.has_key('author'):
author_properties = {
'name' : entry.author,
'url' : '',
'email' : '',
}
entry_authors.extend([author_properties])
# Contributors
entry_contributors = []
if entry.has_key('contributors'):
for contributor in entry.contributors:
contributor_properties = {
'name' : contributor.name if contributor.has_key('name') else '',
'url' : contributor.href if contributor.has_key('href') else '',
'email' : contributor.email if contributor.has_key('email') else '',
}
entry_contributors.extend([contributor_properties])
# Tags
entry_tags = []
if entry.has_key('tags'):
for tag in entry.tags:
tag_properties = {
'term' : tag.term if tag.has_key('term') else '',
'scheme' : tag.scheme if tag.has_key('scheme') else '',
'label' : tag.label if tag.has_key('label') else '',
}
entry_tags.extend([tag_properties])
# Content
entry_contents = []
if entry.has_key('content'):
for content in entry.content:
text = content.value if content.has_key('value') else ''
type = content.type if content.has_key('type') else ''
lang = content.lang if content.has_key('lang') else ''
base = content.base if content.has_key('base') else ''
entry_content = {
'text' : text,
'lang' : lang,
'type' : type,
'base' : base,
}
entry_contents.extend([entry_content])
# Links and Enclosures
entry_links = []
if entry.has_key('links'):
for link in entry.links:
link_properties = {
'url' : link.href if link.has_key('href') else '',
'rel' : link.rel if link.has_key('rel') else '',
'type' : link.type if link.has_key('type') else '',
'length' : '',
}
entry_links.extend([link_properties])
# Element media:content is utilized by Mastodon
if entry.has_key('media_content'):
for link in entry.media_content:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : link['type'] if 'type' in link else '',
# 'medium' : link['medium'] if 'medium' in link else '',
'length' : link['filesize'] if 'filesize' in link else '',
}
entry_links.extend([link_properties])
if entry.has_key('media_thumbnail'):
for link in entry.media_thumbnail:
link_properties = {
'url' : link['url'] if 'url' in link else '',
'rel' : 'enclosure',
'type' : '',
# 'medium' : 'image',
'length' : '',
}
entry_links.extend([link_properties])
# Category
entry_category = entry.category if entry.has_key('category') else ''
# Comments
entry_comments = entry.comments if entry.has_key('comments') else ''
# href
entry_href = entry.href if entry.has_key('href') else ''
# Link: Same as entry.links[0].href in most if not all cases
entry_link = entry.link if entry.has_key('link') else ''
# Rating
entry_rating = entry.rating if entry.has_key('rating') else ''
# Summary
entry_summary_text = entry.summary if entry.has_key('summary') else ''
if entry.has_key('summary_detail'):
entry_summary_type = entry.summary_detail.type if entry.summary_detail.has_key('type') else ''
entry_summary_lang = entry.summary_detail.lang if entry.summary_detail.has_key('lang') else ''
entry_summary_base = entry.summary_detail.base if entry.summary_detail.has_key('base') else ''
else:
entry_summary_type = ''
entry_summary_lang = ''
entry_summary_base = ''
# Title
entry_title = entry.title if entry.has_key('title') else ''
if entry.has_key('title_detail'):
entry_title_type = entry.title_detail.type if entry.title_detail.has_key('type') else ''
else:
entry_title_type = ''
###########################################################
# media_type = e_link.type[:e_link.type.index("/")]
# if (e_link.rel == "enclosure" and
# media_type in ("audio", "image", "video")):
# media_link = e_link.href
# media_link = Url.join_url(url, e_link.href)
# media_link = Url.trim_url(media_link)
###########################################################
entry_properties = {
"identifier": entry_identifier,
"link": entry_link,
"href": entry_href,
"title": entry_title,
"title_type": entry_title_type,
'summary_text' : entry_summary_text,
'summary_lang' : entry_summary_lang,
'summary_type' : entry_summary_type,
'summary_base' : entry_summary_base,
'category' : entry_category,
"comments": entry_comments,
"rating": entry_rating,
"published": entry_published,
"updated": entry_updated,
"read_status": read_status}
new_entry = {
"entry_properties" : entry_properties,
"entry_authors" : entry_authors,
"entry_contributors" : entry_contributors,
"entry_contents" : entry_contents,
"entry_links" : entry_links,
"entry_tags" : entry_tags}
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
return new_entry
"""
@ -1277,7 +1280,6 @@ class FeedTask:
urls = sqlite.get_active_feeds_url(db_file)
for url in urls:
url = url[0]
print('start scan\nurl {}\ndatabase {}'.format(url, db_file))
# print('STA',url)
# # Skip Reddit
@ -1291,6 +1293,19 @@ class FeedTask:
feed_id = sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
if not result['error']:
identifier = sqlite.get_feed_identifier(db_file, feed_id)
identifier = identifier[0]
if not identifier:
counter = 0
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
else:
break
await sqlite.update_feed_identifier(db_file, feed_id, identifier)
# identifier = sqlite.get_feed_identifier(db_file, feed_id)
# identifier = identifier[0]
await sqlite.update_feed_status(db_file, feed_id, status_code)
document = result['content']
feed = parse(document)
@ -1300,17 +1315,54 @@ class FeedTask:
db_file, feed_id, feed)
await sqlite.update_feed_properties(
db_file, feed_id, feed_properties)
new_entries = Feed.get_properties_of_entries(
jid_bare, db_file, url, feed_id, feed)
new_entries = []
for entry in feed.entries:
if entry.has_key("link"):
# link = complete_url(source, entry.link)
entry_link = Url.join_url(url, entry.link)
entry_link = Url.trim_url(entry_link)
entry_identifier = String.md5_hash(entry_link)
# if 'f-droid.org' in url:
# breakpoint()
# print(entry.link)
# print(entry_identifier)
# Check if an entry identifier exists
if not sqlite.get_entry_id_by_identifier(
db_file, entry_identifier):
new_entry = Feed.get_properties_of_entry(
url, entry_identifier, entry)
# new_entries.append(new_entry)
new_entries.extend([new_entry])
print(url)
if new_entries:
await sqlite.add_entries_and_update_feed_state(db_file, feed_id, new_entries)
limit = Config.get_setting_value(self.settings, jid_bare, 'archive')
ixs = sqlite.get_invalid_entries(db_file, url, feed)
await sqlite.process_invalid_entries(db_file, ixs)
ixs = sqlite.get_entries_id_of_feed(db_file, feed_id)
ixs_invalid = {}
for ix in ixs:
ix = ix[0]
read_status = sqlite.is_entry_read(db_file, ix)
read_status = read_status[0]
entry_identifier_local = sqlite.get_entry_identifier(db_file, ix)
entry_identifier_local = entry_identifier_local[0]
valid = False
for entry in feed.entries:
if entry.has_key("link"):
entry_link = Url.join_url(url, entry.link)
entry_link = Url.trim_url(entry_link)
entry_identifier_external = Utilities.hash_url_to_md5(
entry_link)
if entry_identifier_local == entry_identifier_external:
valid = True
continue
if not valid: ixs_invalid[ix] = read_status
if len(ixs_invalid):
print('erasing {}/{}'.format(len(ixs_invalid), len(feed.entries)))
await sqlite.process_invalid_entries(db_file, ixs_invalid)
# TODO return number of archived entries and add if statement to run archive maintainence function
await sqlite.maintain_archive(db_file, limit)
# await sqlite.process_invalid_entries(db_file, ixs)
print('end scan\nurl {}\ndatabase {}'.format(url, db_file))
await asyncio.sleep(50)
# await asyncio.sleep(50)
val = Config.get_setting_value(self.settings, jid_bare, 'check')
await asyncio.sleep(60 * float(val))
# Schedule to call this function again in 90 minutes

View file

@ -40,14 +40,13 @@ TODO
"""
from datetime import datetime
from email.utils import parseaddr
from dateutil.parser import parse
from email.utils import parsedate, parsedate_to_datetime
from email.utils import parseaddr, parsedate, parsedate_to_datetime
import hashlib
from lxml import etree, html
import os
import random
import slixfeed.config as config
from lxml import etree, html
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
@ -681,9 +680,31 @@ class Url:
class String:
def generate_identifier(url, counter):
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
return identifier
# string_to_md5_hash
# NOTE Warning: Entry might not have a link
# TODO Handle situation error
def md5_hash(url):
url_encoded = url.encode()
url_hashed = hashlib.md5(url_encoded)
url_digest = url_hashed.hexdigest()
return url_digest
class Utilities:
# string_to_md5_hash
# NOTE Warning: Entry might not have a link
# TODO Handle situation error
def hash_url_to_md5(url):

View file

@ -1,2 +1,2 @@
__version__ = '0.1.82'
__version_info__ = (0, 1, 82)
__version__ = '0.1.83'
__version_info__ = (0, 1, 83)

View file

@ -48,7 +48,7 @@ import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed, FeedDiscovery, FeedTask, Opml
from slixfeed.utilities import DateAndTime, Html, Task, Url, Utilities
from slixfeed.utilities import DateAndTime, Html, String, Task, Url, Utilities
from slixfeed.version import __version__
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.chat import XmppChat, XmppChatTask
@ -1776,12 +1776,10 @@ class XmppClient(slixmpp.ClientXMPP):
session['prev'] = None
# elif not identifier:
# counter = 0
# hostname = Url.get_hostname(url)
# identifier = hostname + ':' + str(counter)
# while True:
# identifier = String.generate_identifier(url, counter)
# if sqlite.check_identifier_exist(db_file, identifier):
# counter += 1
# identifier = hostname + ':' + str(counter)
# else:
# break
# Several URLs to subscribe
@ -1793,12 +1791,10 @@ class XmppClient(slixmpp.ClientXMPP):
exist_count = 0
for url in urls:
counter = 0
hostname = Url.get_hostname(url)
identifier = hostname + ':' + str(counter)
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
result = await Feed.add_feed(self, jid_bare, db_file, url,
@ -1826,12 +1822,10 @@ class XmppClient(slixmpp.ClientXMPP):
if isinstance(url, list):
url = url[0]
counter = 0
hostname = Url.get_hostname(url)
identifier = hostname + ':' + str(counter)
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
result = await Feed.add_feed(self, jid_bare, db_file, url,

View file

@ -9,7 +9,7 @@ import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed, FeedDiscovery, Opml
from slixfeed.utilities import DateAndTime, Documentation, Url, Utilities
from slixfeed.utilities import DateAndTime, Documentation, String, Url, Utilities
from slixfeed.version import __version__
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.muc import XmppMuc
@ -119,18 +119,15 @@ class XmppCommands:
if url.startswith('http'):
if not title:
title = Url.get_hostname(url)
counter = 0
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
exist = sqlite.get_feed_id_and_name(db_file, url)
if not exist:
counter = 0
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
else:
break
await sqlite.insert_feed(db_file, url, title,
identifier)
feed_id = sqlite.get_feed_id(db_file, url)
@ -157,8 +154,17 @@ class XmppCommands:
feed_properties)
feed_id = sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
new_entries = Feed.get_properties_of_entries(
jid_bare, db_file, url, feed_id, feed)
new_entries = []
for entry in feed.entries:
if entry.has_key("link"):
entry_link = Url.join_url(url, entry.link)
entry_link = Url.trim_url(entry_link)
entry_identifier = String.md5_hash(entry_link)
if not sqlite.get_entry_id_by_identifier(
db_file, entry_identifier):
new_entry = Feed.get_properties_of_entry(
url, entry_identifier, entry)
new_entries.extend([new_entry])
if new_entries:
await sqlite.add_entries_and_update_feed_state(
db_file, feed_id, new_entries)
@ -390,14 +396,11 @@ class XmppCommands:
identifier = info[2]
else:
counter = 0
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(
db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
# task.clean_tasks_xmpp_chat(self, jid_bare, ['status'])
@ -479,13 +482,10 @@ class XmppCommands:
url = Url.feed_to_http(url)
url = (await Url.replace_hostname(url, 'feed')) or url
counter = 0
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
# try:

View file

@ -16,7 +16,7 @@ from slixfeed.config import Config
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.syndication import Feed
from slixfeed.utilities import Url, Utilities
from slixfeed.utilities import String, Url, Utilities
from slixfeed.xmpp.iq import XmppIQ
import sys
@ -336,13 +336,10 @@ class XmppPubsubAction:
node_id = node_id[0]
if not node_id:
counter = 0
hostname = Url.get_hostname(url)
hostname = hostname.replace('.','-')
identifier = hostname + ':' + str(counter)
while True:
identifier = String.generate_identifier(url, counter)
if sqlite.check_identifier_exist(db_file, identifier):
counter += 1
identifier = hostname + ':' + str(counter)
else:
break
await sqlite.update_feed_identifier(db_file, feed_id, identifier)