64727d207f
Add more classes and modules; Restore database maintenance; Remove JSON support; Remove Beautiful Soup.
347 lines
12 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
|
|
|
|
TODO
|
|
|
|
1) Function scan at "for entry in entries"
|
|
Suppress directly calling function "add_entry" (accept db_file)
|
|
Pass a list of valid entries to a new function "add_entries"
|
|
(accept db_file) which would call function "add_entry" (accept cur).
|
|
* accelerate adding of large set of entries at once.
|
|
* prevent (or mitigate halt of consequent actions).
|
|
* reduce I/O.
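
   A minimal sketch, assuming entries are committed in one
   transaction (function names follow this TODO; the connection
   handling is an assumption, not the actual sqlite module API):

       async def add_entries(db_file, entries):
           with sqlite3.connect(db_file) as conn:
               cur = conn.cursor()
               for entry in entries:
                   add_entry(cur, entry)
               # One commit for the whole batch instead of one per entry.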

2) Call an sqlite function from function statistics.

   Returning a list of values doesn't seem to be good practice.
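
   For example, a mapping with named keys could be returned instead
   (a sketch; the key names are illustrative):

       return {'feeds': feeds_count, 'entries': entries_count}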

3) Special statistics for the operator:

   * Size of database(s);
   * Number of JIDs subscribed;
   * Number of feeds of all JIDs;
   * Number of entries of all JIDs.

4) Consider appending a reminder to share presence:

   '✒️ Share online status to receive updates'

5) Request for subscription:

   if (await XmppUtilities.get_chat_type(self, jid_bare) == 'chat' and
           not self.client_roster[jid_bare]['to']):
       XmppPresence.subscription(self, jid_bare, 'subscribe')
       await XmppRoster.add(self, jid_bare)
       status_message = '✒️ Share online status to receive updates'
       XmppPresence.send(self, jid_bare, status_message)
       message_subject = 'RSS News Bot'
       message_body = 'Share online status to receive updates.'
       XmppMessage.send_headline(self, jid_bare, message_subject,
                                 message_body, 'chat')

"""

import hashlib
import random
import sys

from lxml import etree, html

import slixfeed.config as config
from slixfeed.config import Config
import slixfeed.dt as dt
import slixfeed.fetch as fetch
from slixfeed.log import Logger
import slixfeed.sqlite as sqlite
from slixfeed.url import join_url, complete_url

# tomllib is in the standard library since Python 3.11;
# fall back to the external tomli package on older versions.
try:
    import tomllib
except ImportError:
    import tomli as tomllib


logger = Logger(__name__)


class Documentation:

    def manual(filename, section=None, command=None):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: filename: {}'.format(function_name, filename))
        config_dir = config.get_default_config_directory()
        with open(config_dir + '/' + filename, mode="rb") as commands:
            cmds = tomllib.load(commands)
        if section == 'all':
            # Concatenate the description of every command of every section.
            cmd_list = ''
            for cmd in cmds:
                for i in cmds[cmd]:
                    cmd_list += cmds[cmd][i] + '\n'
        elif command and section:
            # The description of a single command of a given section.
            try:
                cmd_list = cmds[section][command]
            except KeyError as e:
                logger.error(e)
                cmd_list = None
        elif section:
            # All command names of a given section.
            try:
                cmd_list = []
                for cmd in cmds[section]:
                    cmd_list.append(cmd)
            except KeyError as e:
                logger.error('KeyError: ' + str(e))
                cmd_list = None
        else:
            # All section names.
            cmd_list = []
            for cmd in cmds:
                cmd_list.append(cmd)
        return cmd_list
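
    # Hedged usage sketch (the filename and section name are
    # illustrative, not a guaranteed part of the configuration):
    #
    #     help_text = Documentation.manual('commands.toml', section='all')
    #     sections = Documentation.manual('commands.toml')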


class Html:

    async def extract_image_from_html(url):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: url: {}'.format(function_name, url))
        result = await fetch.http(url)
        if not result['error']:
            data = result['content']
            tree = html.fromstring(data)
            # TODO Exclude banners, class="share" links etc.
            images = tree.xpath(
                '//img[not('
                'contains(@src, "avatar") or '
                'contains(@src, "cc-by-sa") or '
                'contains(@src, "emoji") or '
                'contains(@src, "icon") or '
                'contains(@src, "logo") or '
                'contains(@src, "letture") or '
                'contains(@src, "poweredby_mediawi") or '
                'contains(@src, "search") or '
                'contains(@src, "share") or '
                'contains(@src, "smiley")'
                ')]/@src')
            if images:
                image = str(images[0])
                image_url = complete_url(url, image)
                return image_url
        # No image was found, or the fetch failed.
        return None
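
    # Hedged usage sketch (URL is illustrative):
    #
    #     image = await Html.extract_image_from_html('https://example.org/post')
    #     # -> absolute URL of the first content image, or None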

    def remove_html_tags(data):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}'.format(function_name))
        parser = etree.HTMLParser()
        tree = etree.fromstring(data, parser)
        data = etree.tostring(tree, encoding='unicode', method='text')
        data = data.replace("\n\n", "\n")
        return data
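
    # For example (input is illustrative; the newline comes from the
    # whitespace between the tags):
    #
    #     Html.remove_html_tags('<p>Hello</p>\n\n<p>world</p>')
    #     # -> 'Hello\nworld'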

    # Based on an answer from Stack Overflow:
    # /questions/9662346/python-code-to-remove-html-tags-from-a-string
    def _remove_html_tags(text):
        import xml.etree.ElementTree
        return ''.join(xml.etree.ElementTree.fromstring(text).itertext())

    # Variant based on Beautiful Soup, kept for reference; the Beautiful
    # Soup dependency itself was removed from the project.
    def __remove_html_tags(data):
        from bs4 import BeautifulSoup
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}'.format(function_name))
        data = BeautifulSoup(data, "lxml").text
        data = data.replace("\n\n", "\n")
        return data


class MD:

    def export_to_markdown(jid, filename, results):
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: jid: {} filename: {}'
                     .format(function_name, jid, filename))
        with open(filename, 'w') as file:
            file.write('# Subscriptions for {}\n'.format(jid))
            file.write('## Set of feeds exported with Slixfeed\n')
            for result in results:
                file.write('- [{}]({})\n'.format(result[1], result[2]))
            file.write('\n\n* * *\n\nThis list was saved on {} from xmpp:{} using '
                       '[Slixfeed](https://slixfeed.woodpeckersnest.space/)\n'
                       .format(dt.current_date(), jid))
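
    # Hedged usage sketch: each row in results is assumed to be a feed
    # record shaped like (id, title, url, ...), matching the indexing above.
    #
    #     rows = [(1, 'Slixfeed', 'https://slixfeed.woodpeckersnest.space/')]
    #     MD.export_to_markdown('user@example.org', 'subscriptions.md', rows)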

    def log_to_markdown(timestamp, filename, jid, message):
        """
        Log message to a markdown file.

        Parameters
        ----------
        timestamp : str
            Time stamp.
        filename : str
            Jabber ID as name of file.
        jid : str
            Jabber ID.
        message : str
            Message content.

        Returns
        -------
        None.
        """
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: timestamp: {} filename: {} jid: {} message: {}'
                     .format(function_name, timestamp, filename, jid, message))
        with open(filename + '.md', 'a') as file:
            # entry = "{} {}:\n{}\n\n".format(timestamp, jid, message)
            entry = '## {}\n### {}\n\n{}\n\n'.format(jid, timestamp, message)
            file.write(entry)
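
    # For example (JID and timestamp are illustrative):
    #
    #     MD.log_to_markdown('2024-01-01 12:00', 'user@example.org',
    #                        'user@example.org', 'Hello')
    #     # appends a heading and the message to 'user@example.org.md'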


class SQLiteMaintain:

    # TODO
    # (1) Check for duplications;
    # (2) Append all duplications to a list;
    # (3) Send the list to a function in module sqlite.
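    #
    # A sketch of that plan (the helpers "is_duplicate" and
    # "delete_entries_by_ids" are hypothetical, not existing
    # functions of module sqlite):
    #
    #     duplicates = [ix for (ix, *_rest) in items if is_duplicate(ix)]
    #     await sqlite.delete_entries_by_ids(db_file, duplicates)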

    async def remove_nonexistent_entries(self, jid_bare, db_file, url, feed):
        """
        Remove entries that no longer exist in a given parsed feed.

        Check the entries returned from the feed and delete
        non-existing entries if read; otherwise, if unread,
        move them to table archive.

        Parameters
        ----------
        jid_bare : str
            Bare Jabber ID.
        db_file : str
            Path to database file.
        url : str
            Feed URL.
        feed : list
            Parsed feed document.
        """
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: db_file: {} url: {}'
                     .format(function_name, db_file, url))
        feed_id = sqlite.get_feed_id(db_file, url)
        feed_id = feed_id[0]
        items = sqlite.get_entries_of_feed(db_file, feed_id)
        entries = feed.entries
        limit = Config.get_setting_value(self.settings, jid_bare, 'archive')
        for item in items:
            ix, entry_title, entry_link, entry_id, timestamp = item
            read_status = sqlite.is_entry_read(db_file, ix)
            read_status = read_status[0]
            valid = False
            for entry in entries:
                title = None
                link = None
                time = None
                # TODO Better check, and do not repeat code.
                if entry.has_key("id") and entry_id:
                    # Match by entry identifier.
                    if entry.id == entry_id:
                        valid = True
                        break
                else:
                    # Match by title, link and, when available, date.
                    if entry.has_key("title"):
                        title = entry.title
                    else:
                        title = feed["feed"]["title"]
                    if entry.has_key("link"):
                        link = join_url(url, entry.link)
                    else:
                        link = url
                    if entry.has_key("published") and timestamp:
                        time = dt.rfc2822_to_iso8601(entry.published)
                        if (entry_title == title and
                                entry_link == link and
                                timestamp == time):
                            valid = True
                            break
                    else:
                        if (entry_title == title and
                                entry_link == link):
                            valid = True
                            break
            if not valid:
                # TODO Send to table archive.
                # TODO Also make a regular/routine check for sources that
                #      have been changed (though that can only happen when
                #      manually editing).
                if read_status == 1:
                    # The entry is marked as read; delete it.
                    await sqlite.delete_entry_by_id(db_file, ix)
                else:
                    # The entry is unread; move it to table archive.
                    await sqlite.archive_entry(db_file, ix)
        await sqlite.maintain_archive(db_file, limit)


class Task:

    def start(self, jid_bare, callback):
        callback(self, jid_bare)

    def stop(self, jid_bare, task):
        if (jid_bare in self.task_manager and
                task in self.task_manager[jid_bare]):
            self.task_manager[jid_bare][task].cancel()
        else:
            logger.debug('No task {} for JID {} (Task.stop)'
                         .format(task, jid_bare))
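
    # Hedged usage sketch: self.task_manager is assumed to be a dict of
    # dicts mapping a bare JID to named asyncio tasks, e.g.
    #
    #     self.task_manager = {'user@example.org': {'check': some_task}}
    #     Task.stop(self, 'user@example.org', 'check')  # cancels some_task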


class Utilities:

    # NOTE Warning: Entry might not have a link.
    # TODO Handle situation error.
    def hash_url_to_md5(url):
        url_encoded = url.encode()
        url_hashed = hashlib.md5(url_encoded)
        url_digest = url_hashed.hexdigest()
        return url_digest
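
    # For example (URL is illustrative):
    #
    #     Utilities.hash_url_to_md5('https://example.org/feed.xml')
    #     # -> 32-character hexadecimal digest as str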

    def pick_a_feed(lang=None):
        # NOTE The lang parameter is currently unused.
        function_name = sys._getframe().f_code.co_name
        logger.debug('{}: lang: {}'.format(function_name, lang))
        config_dir = config.get_default_config_directory()
        with open(config_dir + '/' + 'feeds.toml', mode="rb") as feeds:
            urls = tomllib.load(feeds)
        url = random.choice(urls['feeds'])
        return url