#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) Function scan at "for entry in entries"
   Suppress directly calling function "add_entry" (accept db_file).
   Pass a list of valid entries to a new function "add_entries"
   (accept db_file) which would call function "add_entry" (accept cur).
   * Accelerate adding of a large set of entries at once.
   * Prevent (or mitigate) halting of subsequent actions.
   * Reduce I/O.
2) Call a sqlite function from function statistics.
   Returning a list of values does not seem to be a good practice.
3) Special statistics for operator:
   * Size of database(s);
   * Amount of JIDs subscribed;
   * Amount of feeds of all JIDs;
   * Amount of entries of all JIDs.
"""
import asyncio
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from feedparser import parse
from http.client import IncompleteRead
import json
import logging
from lxml import html
import os
import slixfeed.config as config
import slixfeed.crawl as crawl
import slixfeed.dt as dt
import slixfeed.fetch as fetch
import slixfeed.sqlite as sqlite
import slixfeed.url as uri
from slixfeed.url import (
complete_url,
join_url,
remove_tracking_parameters,
replace_hostname,
trim_url
)
import slixfeed.task as task
from slixfeed.xmpp.bookmark import XmppBookmark
from slixfeed.xmpp.message import XmppMessage
from slixfeed.xmpp.presence import XmppPresence
from slixfeed.xmpp.upload import XmppUpload
from slixfeed.xmpp.utility import get_chat_type
import tomllib
from urllib import error
from urllib.parse import parse_qs, urlsplit
import xml.etree.ElementTree as ET
try:
    import xml2epub
except ImportError:
    logging.info(
        "Package xml2epub was not found.\n"
        "ePUB support is disabled.")
try:
    import html2text
except ImportError:
    logging.info(
        "Package html2text was not found.\n"
        "Markdown support is disabled.")
try:
    import pdfkit
except ImportError:
    logging.info(
        "Package pdfkit was not found.\n"
        "PDF support is disabled.")
try:
    from readability import Document
except ImportError:
    logging.info(
        "Package readability was not found.\n"
        "Arc90 Lab algorithm is disabled.")
async def export_feeds(self, jid, jid_file, ext):
cache_dir = config.get_default_cache_directory()
if not os.path.isdir(cache_dir):
os.mkdir(cache_dir)
if not os.path.isdir(cache_dir + '/' + ext):
os.mkdir(cache_dir + '/' + ext)
filename = os.path.join(
cache_dir, ext, 'slixfeed_' + dt.timestamp() + '.' + ext)
db_file = config.get_pathname_to_database(jid_file)
results = await sqlite.get_feeds(db_file)
match ext:
# case 'html':
# response = 'Not yet implemented.'
case 'md':
export_to_markdown(jid, filename, results)
case 'opml':
export_to_opml(jid, filename, results)
# case 'xbel':
# response = 'Not yet implemented.'
return filename
async def xmpp_send_status(self, jid):
"""
Send status message.
Parameters
----------
jid : str
Jabber ID.
"""
logging.info('Sending a status message to JID {}'.format(jid))
status_text = '📜️ Slixfeed RSS News Bot'
jid_file = jid.replace('/', '_')
db_file = config.get_pathname_to_database(jid_file)
enabled = config.get_setting_value(db_file, 'enabled')
if not enabled:
status_mode = 'xa'
status_text = '📪️ Send "Start" to receive updates'
else:
feeds = await sqlite.get_number_of_items(db_file, 'feeds')
# print(await current_time(), jid, "has", feeds, "feeds")
if not feeds:
status_mode = 'available'
status_text = '📪️ Send a URL from a blog or a news website'
else:
unread = await sqlite.get_number_of_entries_unread(db_file)
if unread:
status_mode = 'chat'
status_text = '📬️ There are {} news items'.format(str(unread))
# status_text = (
# "📰 News items: {}"
# ).format(str(unread))
# status_text = (
# "📰 You have {} news items"
# ).format(str(unread))
else:
status_mode = 'available'
status_text = '📭️ No news'
# breakpoint()
# print(await current_time(), status_text, "for", jid)
XmppPresence.send(self, jid, status_text, status_type=status_mode)
# await asyncio.sleep(60 * 20)
# await refresh_task(self, jid, send_status, 'status', '90')
# loop.call_at(
# loop.time() + 60 * 20,
# loop.create_task,
# send_status(jid)
# )
async def xmpp_send_update(self, jid, num=None):
"""
Send news items as messages.
Parameters
----------
jid : str
Jabber ID.
num : str, optional
        Number of news items to send. The default is None.
"""
jid_file = jid.replace('/', '_')
db_file = config.get_pathname_to_database(jid_file)
    enabled = config.get_setting_value(db_file, 'enabled')
    if enabled:
        show_media = config.get_setting_value(db_file, 'media')
        if not num:
            num = config.get_setting_value(db_file, 'quantum')
else:
num = int(num)
results = await sqlite.get_unread_entries(db_file, num)
news_digest = ''
media = None
chat_type = await get_chat_type(self, jid)
for result in results:
ix = result[0]
title_e = result[1]
url = result[2]
summary = result[3]
enclosure = result[4]
feed_id = result[5]
date = result[6]
title_f = sqlite.get_feed_title(db_file, feed_id)
title_f = title_f[0]
news_digest += list_unread_entries(result, title_f, jid_file)
# print(db_file)
# print(result[0])
# breakpoint()
await sqlite.mark_as_read(db_file, ix)
# Find media
# if url.startswith("magnet:"):
# media = action.get_magnet(url)
# elif enclosure.startswith("magnet:"):
# media = action.get_magnet(enclosure)
# elif enclosure:
if show_media:
if enclosure:
media = enclosure
else:
media = await extract_image_from_html(url)
if media and news_digest:
# Send textual message
XmppMessage.send(self, jid, news_digest, chat_type)
news_digest = ''
# Send media
XmppMessage.send_oob(self, jid, media, chat_type)
media = None
if news_digest:
XmppMessage.send(self, jid, news_digest, chat_type)
# TODO Add while loop to assure delivery.
# print(await current_time(), ">>> ACT send_message",jid)
# NOTE Do we need "if statement"? See NOTE at is_muc.
# if chat_type in ('chat', 'groupchat'):
# # TODO Provide a choice (with or without images)
# XmppMessage.send(self, jid, news_digest, chat_type)
# See XEP-0367
# if media:
# # message = xmpp.Slixfeed.make_message(
# # self, mto=jid, mbody=new, mtype=chat_type)
# message = xmpp.Slixfeed.make_message(
# self, mto=jid, mbody=media, mtype=chat_type)
# message['oob']['url'] = media
# message.send()
# TODO Do not refresh task before
# verifying that it was completed.
# await start_tasks_xmpp(self, jid, ['status'])
# await refresh_task(self, jid, send_update, 'interval')
# interval = await initdb(
# jid,
# sqlite.is_setting_key,
# "interval"
# )
# self.task_manager[jid]["interval"] = loop.call_at(
# loop.time() + 60 * interval,
# loop.create_task,
# send_update(jid)
# )
# print(await current_time(), "asyncio.get_event_loop().time()")
# print(await current_time(), asyncio.get_event_loop().time())
# await asyncio.sleep(60 * interval)
# loop.call_later(
# 60 * interval,
# loop.create_task,
# send_update(jid)
# )
# print
# await handle_event()
def manual(filename, section=None, command=None):
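    """
    Retrieve command documentation from a TOML file.

    Parameters
    ----------
    filename : str
        Name of TOML file within the configuration directory.
    section : str, optional
        Section name, or 'all' for all sections. The default is None.
    command : str, optional
        Command name within a section. The default is None.

    Returns
    -------
    cmd_list : str or list or None
        Matching documentation entries.
    """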
config_dir = config.get_default_config_directory()
with open(config_dir + '/' + filename, mode="rb") as commands:
cmds = tomllib.load(commands)
if section == 'all':
cmd_list = ''
for cmd in cmds:
for i in cmds[cmd]:
cmd_list += cmds[cmd][i] + '\n'
elif command and section:
try:
cmd_list = cmds[section][command]
except KeyError as e:
logging.error(str(e))
cmd_list = None
elif section:
try:
cmd_list = []
for cmd in cmds[section]:
cmd_list.extend([cmd])
except KeyError as e:
logging.error('KeyError:' + str(e))
cmd_list = None
else:
cmd_list = []
for cmd in cmds:
cmd_list.extend([cmd])
return cmd_list
def log_to_markdown(timestamp, filename, jid, message):
"""
Log message to file.
Parameters
----------
timestamp : str
Time stamp.
filename : str
Jabber ID as name of file.
jid : str
Jabber ID.
message : str
Message content.
Returns
-------
None.
"""
with open(filename + '.md', 'a') as file:
# entry = "{} {}:\n{}\n\n".format(timestamp, jid, message)
entry = (
"## {}\n"
"### {}\n\n"
"{}\n\n").format(jid, timestamp, message)
file.write(entry)
def is_feed_json(document):
"""
NOTE /kurtmckee/feedparser/issues/103
Determine whether document is json feed or not.
Parameters
----------
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
value = False
try:
feed = json.loads(document)
if not feed['items']:
if "version" in feed.keys():
if 'jsonfeed' in feed['version']:
value = True
else: # TODO Test
value = False
# elif 'title' in feed.keys():
# value = True
else:
value = False
else:
value = True
except:
pass
return value
def is_feed(feed):
"""
Determine whether document is feed or not.
Parameters
----------
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
value = False
# message = None
if not feed.entries:
if "version" in feed.keys():
# feed["version"]
if feed.version:
value = True
# message = (
# "Empty feed for {}"
# ).format(url)
elif "title" in feed["feed"].keys():
value = True
# message = (
# "Empty feed for {}"
# ).format(url)
else:
value = False
# message = (
# "No entries nor title for {}"
# ).format(url)
elif feed.bozo:
value = False
# message = (
# "Bozo detected for {}"
# ).format(url)
else:
value = True
# message = (
# "Good feed for {}"
# ).format(url)
return value
def list_unread_entries(result, feed_title, jid_file):
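    """
    Format an unread entry as a news item, according to the
    "formatting" template set for the given profile.
    """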
# TODO Add filtering
# TODO Do this when entry is added to list and mark it as read
# DONE!
# results = []
# if sqlite.is_setting_key(db_file, "deny"):
# while len(results) < num:
# result = cur.execute(sql).fetchone()
# blacklist = sqlite.get_setting_value(db_file, "deny").split(",")
# for i in blacklist:
# if i in result[1]:
# continue
# print("rejected:", result[1])
# print("accepted:", result[1])
# results.extend([result])
# news_list = "You've got {} news items:\n".format(num)
# NOTE Why doesn't this work without list?
# i.e. for result in results
# for result in results.fetchall():
ix = str(result[0])
title = str(result[1])
# # TODO Retrieve summary from feed
# # See fetch.view_entry
summary = result[3]
# Remove HTML tags
    try:
        summary = BeautifulSoup(summary, "lxml").text
    except Exception as e:
        logging.error('Failed to remove HTML tags from summary: {}'.format(e))
        logging.error(result[3])
# TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
summary = summary.replace('\n', ' ')
summary = summary.replace(' ', ' ')
summary = summary.replace(' ', ' ')
db_file = config.get_pathname_to_database(jid_file)
length = config.get_setting_value(db_file, "length")
length = int(length)
summary = summary[:length] + " […]"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
link = result[2]
link = remove_tracking_parameters(link)
link = (replace_hostname(link, "link")) or link
# news_item = ("\n{}\n{}\n{} [{}]\n").format(str(title), str(link),
# str(feed_title), str(ix))
formatting = config.get_setting_value(db_file, 'formatting')
news_item = formatting.format(feed_title=feed_title,
title=title,
summary=summary,
link=link,
ix=ix)
news_item = news_item.replace('\\n', '\n')
return news_item
def list_search_results(query, results):
message = ("Search results for '{}':\n\n```"
.format(query))
for result in results:
message += ("\n{}\n{}\n"
.format(str(result[0]), str(result[1])))
    if len(results):
        message += "```\nTotal of {} results".format(len(results))
    else:
        message = "No results were found for: {}".format(query)
return message
def list_feeds_by_query(db_file, query):
results = sqlite.search_feeds(db_file, query)
message = ('Feeds containing "{}":\n\n```'
.format(query))
for result in results:
message += ('\nName : {} [{}]'
'\nURL : {}'
'\n'
.format(str(result[0]), str(result[1]), str(result[2])))
    if len(results):
        message += "\n```\nTotal of {} feeds".format(len(results))
    else:
        message = "No feeds were found for: {}".format(query)
return message
async def list_statistics(db_file):
"""
Return table statistics.
Parameters
----------
db_file : str
Path to database file.
Returns
-------
msg : str
Statistics as message.
"""
entries_unread = await sqlite.get_number_of_entries_unread(db_file)
entries = await sqlite.get_number_of_items(db_file, 'entries')
archive = await sqlite.get_number_of_items(db_file, 'archive')
entries_all = entries + archive
feeds_active = await sqlite.get_number_of_feeds_active(db_file)
feeds_all = await sqlite.get_number_of_items(db_file, 'feeds')
key_archive = config.get_setting_value(db_file, 'archive')
key_interval = config.get_setting_value(db_file, 'interval')
key_quantum = config.get_setting_value(db_file, 'quantum')
key_enabled = config.get_setting_value(db_file, 'enabled')
# msg = """You have {} unread news items out of {} from {} news sources.
# """.format(unread_entries, entries, feeds)
# try:
# value = cur.execute(sql, par).fetchone()[0]
# except:
# print("Error for key:", key)
# value = "Default"
# values.extend([value])
message = ("```"
"\nSTATISTICS\n"
"News items : {}/{}\n"
"News sources : {}/{}\n"
"\nOPTIONS\n"
"Items to archive : {}\n"
"Update interval : {}\n"
"Items per update : {}\n"
"Operation status : {}\n"
"```").format(entries_unread,
entries_all,
feeds_active,
feeds_all,
key_archive,
key_interval,
key_quantum,
key_enabled)
return message
# FIXME Replace counter by len
def list_last_entries(results, num):
    message = "Recent {} titles:\n\n```".format(num)
    for result in results:
        message += ("\n{}\n{}\n"
                    .format(str(result[0]), str(result[1])))
    if len(results):
        message += "```\n"
    else:
        message = "There is no news at the moment."
return message
def pick_a_feed(lang=None):
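    """
    Pick a random feed entry from file feeds.toml of the configuration directory.
    """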
config_dir = config.get_default_config_directory()
with open(config_dir + '/' + 'feeds.toml', mode="rb") as feeds:
urls = tomllib.load(feeds)
import random
url = random.choice(urls['feeds'])
return url
def list_feeds(results):
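    """
    Return a message listing the given subscriptions; when there are
    none, suggest a randomly featured feed instead.
    """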
message = "\nList of subscriptions:\n\n```\n"
for result in results:
message += ("Name : {}\n"
"URL : {}\n"
# "Updated : {}\n"
# "Status : {}\n"
"ID : {}\n"
"\n"
.format(str(result[0]), str(result[1]), str(result[2])))
    if len(results):
        message += ('```\nTotal of {} subscriptions.\n'
                    .format(len(results)))
    else:
        url = pick_a_feed()
        message = ('List of subscriptions is empty. To add a feed, send a URL.\n'
                   'Featured feed:\n*{}*\n{}'
                   .format(url['name'],
                           url['link']))
return message
async def list_bookmarks(self):
    conferences = await XmppBookmark.get(self)
    message = '\nList of groupchats:\n\n```\n'
    for conference in conferences:
        message += ('Name: {}\n'
                    'Room: {}\n'
                    '\n'
                    .format(conference['name'], conference['jid']))
    message += ('```\nTotal of {} groupchats.\n'
                .format(len(conferences)))
return message
def export_to_markdown(jid, filename, results):
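    """
    Write subscriptions to a file as a Markdown list of links.
    """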
with open(filename, 'w') as file:
file.write('# Subscriptions for {}\n'.format(jid))
file.write('## Set of feeds exported with Slixfeed\n')
for result in results:
file.write('- [{}]({})\n'.format(result[0], result[1]))
file.write('\n\n* * *\n\nThis list was saved on {} from xmpp:{} using '
'[Slixfeed](https://gitgud.io/sjehuda/slixfeed)\n'
.format(dt.current_date(), jid))
# TODO Consider adding element jid as a pointer of import
def export_to_opml(jid, filename, results):
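    """
    Write subscriptions to a file as an OPML document.
    """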
root = ET.Element("opml")
root.set("version", "1.0")
head = ET.SubElement(root, "head")
ET.SubElement(head, "title").text = "{}".format(jid)
ET.SubElement(head, "description").text = (
"Set of subscriptions exported by Slixfeed")
ET.SubElement(head, "generator").text = "Slixfeed"
ET.SubElement(head, "urlPublic").text = (
"https://gitgud.io/sjehuda/slixfeed")
time_stamp = dt.current_time()
ET.SubElement(head, "dateCreated").text = time_stamp
ET.SubElement(head, "dateModified").text = time_stamp
body = ET.SubElement(root, "body")
for result in results:
outline = ET.SubElement(body, "outline")
outline.set("text", result[0])
outline.set("xmlUrl", result[1])
# outline.set("type", result[2])
tree = ET.ElementTree(root)
tree.write(filename)
async def import_opml(db_file, url):
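    """
    Import feeds from an OPML document at the given URL and return
    the number of feeds that were added.
    """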
result = await fetch.http(url)
if not result['error']:
document = result['content']
root = ET.fromstring(document)
before = await sqlite.get_number_of_items(
db_file, 'feeds')
feeds = []
for child in root.findall(".//outline"):
url = child.get("xmlUrl")
title = child.get("text")
# feed = (url, title)
# feeds.extend([feed])
feeds.extend([(url, title)])
await sqlite.import_feeds(db_file, feeds)
await sqlite.add_metadata(db_file)
after = await sqlite.get_number_of_items(
db_file, 'feeds')
difference = int(after) - int(before)
return difference
async def add_feed(db_file, url):
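    """
    Add a feed at the given URL to the subscription list.

    Probe the URL until a feed is found, insert it into the database,
    scan it for entries, and return a result dictionary (or a list of
    dictionaries when several feeds are discovered).
    """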
while True:
exist = await sqlite.get_feed_id_and_name(db_file, url)
if not exist:
status_code = None
result = await fetch.http(url)
if not result['error']:
document = result['content']
status_code = result['status_code']
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
if "language" in feed["feed"].keys():
language = feed["feed"]["language"]
else:
language = ''
if "encoding" in feed.keys():
encoding = feed["encoding"]
else:
encoding = ''
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
try:
updated = dt.convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
version = feed["version"]
entries = len(feed["entries"])
await sqlite.insert_feed(db_file, url,
title=title,
entries=entries,
version=version,
encoding=encoding,
language=language,
status_code=status_code,
updated=updated)
await scan(db_file, url)
old = config.get_setting_value(db_file, "old")
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
if not old:
await sqlite.mark_feed_as_read(db_file, feed_id)
result_final = {'link' : url,
'index' : feed_id,
'name' : title,
'code' : status_code,
'error' : False,
'exist' : False}
response = ('> {}\nNews source "{}" has been '
'added to subscription list.'
.format(url, title))
break
                # NOTE This elif statement will be unnecessary
                # once feedparser supports JSON Feed.
elif is_feed_json(document):
feed = json.loads(document)
if "title" in feed.keys():
title = feed["title"]
else:
title = urlsplit(url).netloc
if "language" in feed.keys():
language = feed["language"]
else:
language = ''
if "encoding" in feed.keys():
encoding = feed["encoding"]
else:
encoding = ''
if "date_published" in feed.keys():
updated = feed["date_published"]
try:
updated = dt.convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
version = 'json' + feed["version"].split('/').pop()
entries = len(feed["items"])
await sqlite.insert_feed(db_file, url,
title=title,
entries=entries,
version=version,
encoding=encoding,
language=language,
status_code=status_code,
updated=updated)
await scan_json(db_file, url)
old = config.get_setting_value(db_file, "old")
if not old:
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.mark_feed_as_read(db_file, feed_id)
result_final = {'link' : url,
'index' : feed_id,
'name' : title,
'code' : status_code,
'error' : False,
'exist' : False}
response = ('> {}\nNews source "{}" has been '
'added to subscription list.'
.format(url, title))
break
else:
# NOTE Do not be tempted to return a compact dictionary.
# That is, dictionary within dictionary
# Return multiple dictionaries in a list or tuple.
result = await crawl.probe_page(url, document)
if not result:
# Get out of the loop with dict indicating error.
result_final = {'link' : url,
'index' : None,
'name' : None,
'code' : status_code,
'error' : True,
'exist' : False}
break
elif isinstance(result, list):
# Get out of the loop and deliver a list of dicts.
result_final = result
break
else:
# Go back up to the while loop and try again.
url = result['link']
else:
result_final = {'link' : url,
'index' : None,
'name' : None,
'code' : status_code,
'error' : True,
'exist' : False}
response = ('> {}\nFailed to load URL. Reason: {}'
.format(url, status_code))
break
else:
ix = exist[0]
name = exist[1]
result_final = {'link' : url,
'index' : ix,
'name' : name,
'code' : None,
'error' : False,
'exist' : True}
response = ('> {}\nNews source "{}" is already '
'listed in the subscription list at '
'index {}'.format(url, name, ix))
break
return result_final
async def scan_json(db_file, url):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
if isinstance(url, tuple): url = url[0]
result = await fetch.http(url)
if not result['error']:
document = result['content']
status = result['status_code']
new_entries = []
if document and status == 200:
feed = json.loads(document)
entries = feed["items"]
await remove_nonexistent_entries_json(
db_file, url, feed)
try:
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
# await sqlite.update_feed_validity(
# db_file, feed_id, valid)
if "date_published" in feed.keys():
updated = feed["date_published"]
try:
updated = dt.convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.update_feed_properties(
db_file, feed_id, len(feed["items"]), updated)
# await update_feed_status
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
logging.error(e)
return
# new_entry = 0
for entry in entries:
if "date_published" in entry.keys():
date = entry["date_published"]
date = dt.rfc2822_to_iso8601(date)
elif "date_modified" in entry.keys():
date = entry["date_modified"]
date = dt.rfc2822_to_iso8601(date)
else:
date = dt.now()
if "url" in entry.keys():
# link = complete_url(source, entry.link)
link = join_url(url, entry["url"])
link = trim_url(link)
else:
link = url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
title = entry["title"] if "title" in entry.keys() else date
entry_id = entry["id"] if "id" in entry.keys() else link
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
exist = await sqlite.check_entry_exist(
db_file, feed_id, entry_id=entry_id,
title=title, link=link, date=date)
if not exist:
summary = entry["summary"] if "summary" in entry.keys() else ''
if not summary:
summary = (entry["content_html"]
if "content_html" in entry.keys()
else '')
if not summary:
summary = (entry["content_text"]
if "content_text" in entry.keys()
else '')
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title, summary, pathname)
allow_list = config.is_include_keyword(db_file, "allow",
string)
if not allow_list:
reject_list = config.is_include_keyword(db_file, "deny",
string)
if reject_list:
read_status = 1
logging.debug('Rejected : {}'
'\n'
'Keyword : {}'
.format(link, reject_list))
if isinstance(date, int):
logging.error('Variable "date" is int: {}'.format(date))
media_link = ''
if "attachments" in entry.keys():
for e_link in entry["attachments"]:
try:
# if (link.rel == "enclosure" and
# (link.type.startswith("audio/") or
# link.type.startswith("image/") or
# link.type.startswith("video/"))
# ):
media_type = e_link["mime_type"][:e_link["mime_type"].index("/")]
if media_type in ("audio", "image", "video"):
media_link = e_link["url"]
media_link = join_url(url, e_link["url"])
media_link = trim_url(media_link)
break
except:
logging.info('KeyError: "url"\n'
'Missing "url" attribute for {}'
.format(url))
logging.info('Continue scanning for next '
'potential enclosure of {}'
.format(link))
entry = {
"title": title,
"link": link,
"enclosure": media_link,
"entry_id": entry_id,
"date": date,
"read_status": read_status
}
new_entries.extend([entry])
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
if len(new_entries):
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.add_entries_and_update_timestamp(db_file, feed_id,
new_entries)
async def view_feed(url):
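    """
    Return a preview of up to the first five entries of the feed at the given URL.
    """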
while True:
result = await fetch.http(url)
if not result['error']:
document = result['content']
status = result['status_code']
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
entries = feed.entries
response = "Preview of {}:\n\n```\n".format(title)
counter = 0
for entry in entries:
counter += 1
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = dt.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = dt.rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
response += ("Title : {}\n"
"Date : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
.format(title, date, link, counter))
if counter > 4:
break
response += (
"```\nSource: {}"
).format(url)
break
else:
result = await crawl.probe_page(url, document)
if isinstance(result, str):
response = result
break
else:
url = result[0]
else:
            # The status code may be absent when the request itself failed
            response = ('> {}\nFailed to load URL. Reason: {}'
                        .format(url, result.get('status_code', 'unknown')))
break
return response
async def view_entry(url, num):
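    """
    Return a preview (title, summary and link) of entry number num
    (starting at 1) of the feed at the given URL.
    """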
while True:
result = await fetch.http(url)
if not result['error']:
document = result['content']
status = result['status_code']
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
entries = feed.entries
num = int(num) - 1
entry = entries[num]
response = "Preview of {}:\n\n```\n".format(title)
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("published"):
date = entry.published
date = dt.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = dt.rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
else:
summary = "*** No summary ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
response = ("{}\n"
"\n"
# "> {}\n"
"{}\n"
"\n"
"{}\n"
"\n"
.format(title, summary, link))
break
else:
result = await crawl.probe_page(url, document)
if isinstance(result, str):
response = result
break
else:
url = result[0]
else:
            # The status code may be absent when the request itself failed
            response = ('> {}\nFailed to load URL. Reason: {}'
                        .format(url, result.get('status_code', 'unknown')))
break
return response
# TODO Rename function name (idea: scan_and_populate)
async def scan(db_file, url):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
if isinstance(url, tuple): url = url[0]
result = await fetch.http(url)
if not result['error']:
document = result['content']
status = result['status_code']
new_entries = []
if document and status == 200:
feed = parse(document)
entries = feed.entries
# length = len(entries)
await remove_nonexistent_entries(db_file, url, feed)
try:
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
# print(bozo)
valid = 0
else:
valid = 1
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.update_feed_validity(
db_file, feed_id, valid)
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
try:
updated = dt.convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.update_feed_properties(db_file, feed_id,
len(feed["entries"]), updated)
# await update_feed_status
except (IncompleteReadError, IncompleteRead, error.URLError) as e:
logging.error(e)
return
# new_entry = 0
for entry in entries:
if entry.has_key("published"):
date = entry.published
date = dt.rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = dt.rfc2822_to_iso8601(date)
else:
date = dt.now()
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
title = entry.title if entry.has_key("title") else date
entry_id = entry.id if entry.has_key("id") else link
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
exist = await sqlite.check_entry_exist(db_file, feed_id,
entry_id=entry_id,
title=title, link=link,
date=date)
if not exist:
summary = entry.summary if entry.has_key("summary") else ''
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title, summary, pathname)
allow_list = config.is_include_keyword(db_file, "allow",
string)
if not allow_list:
reject_list = config.is_include_keyword(db_file, "deny",
string)
if reject_list:
read_status = 1
logging.debug('Rejected : {}'
'\n'
'Keyword : {}'.format(link,
reject_list))
if isinstance(date, int):
logging.error('Variable "date" is int: {}'
.format(date))
media_link = ''
if entry.has_key("links"):
for e_link in entry.links:
try:
# if (link.rel == "enclosure" and
# (link.type.startswith("audio/") or
# link.type.startswith("image/") or
# link.type.startswith("video/"))
# ):
media_type = e_link.type[:e_link.type.index("/")]
if e_link.has_key("rel"):
if (e_link.rel == "enclosure" and
media_type in ("audio", "image", "video")):
media_link = e_link.href
media_link = join_url(url, e_link.href)
media_link = trim_url(media_link)
break
except:
logging.info('KeyError: "href"\n'
'Missing "href" attribute for {}'
.format(url))
logging.info('Continue scanning for next '
'potential enclosure of {}'
.format(link))
entry = {
"title": title,
"link": link,
"summary": summary,
"enclosure": media_link,
"entry_id": entry_id,
"date": date,
"read_status": read_status
}
new_entries.extend([entry])
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
if len(new_entries):
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
await sqlite.add_entries_and_update_timestamp(db_file, feed_id,
new_entries)
def get_document_title(data):
try:
document = Document(data)
title = document.short_title()
except:
document = BeautifulSoup(data, 'html.parser')
title = document.title.string
return title
def get_document_content(data):
try:
document = Document(data)
content = document.summary()
except:
document = BeautifulSoup(data, 'html.parser')
content = data
return content
def get_document_content_as_text(data):
try:
document = Document(data)
content = document.summary()
except:
document = BeautifulSoup(data, 'html.parser')
content = data
text = remove_html_tags(content)
return text
def generate_document(data, url, ext, filename, readability=False):
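    """
    Generate a document file (epub, html, md, pdf or txt) from HTML data,
    optionally extracting the readable content first, and return an error
    message upon failure.
    """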
error = None
if readability:
try:
document = Document(data)
content = document.summary()
except:
content = data
logging.warning('Check that package readability is installed.')
else:
content = data
match ext:
case "epub":
error = generate_epub(content, filename)
if error:
logging.error(error)
# logging.error(
# "Check that packages xml2epub is installed, "
# "or try again.")
case "html":
generate_html(content, filename)
case "md":
try:
generate_markdown(content, filename)
except:
logging.warning('Check that package html2text '
'is installed, or try again.')
error = 'Package html2text was not found.'
case "pdf":
error = generate_pdf(content, filename)
if error:
logging.error(error)
# logging.warning(
# "Check that packages pdfkit and wkhtmltopdf "
# "are installed, or try again.")
# error = (
# "Package pdfkit or wkhtmltopdf was not found.")
case "txt":
generate_txt(content, filename)
if error:
return error
# TODO Either adapt it to filename
# or change it to something else
#filename = document.title()
# with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
async def extract_image_from_feed(db_file, feed_id, url):
feed_url = sqlite.get_feed_url(db_file, feed_id)
feed_url = feed_url[0]
result = await fetch.http(feed_url)
if not result['error']:
document = result['content']
feed = parse(document)
for entry in feed.entries:
try:
if entry.link == url:
for link in entry.links:
if (link.rel == "enclosure" and
link.type.startswith("image/")):
image_url = link.href
return image_url
except:
logging.error(url)
logging.error('AttributeError: object has no attribute "link"')
async def extract_image_from_html(url):
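    """
    Return the address of the first suitable image found in the
    content of the given URL, if any.
    """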
result = await fetch.http(url)
if not result['error']:
data = result['content']
try:
document = Document(data)
content = document.summary()
except:
content = data
logging.warning('Check that package readability is installed.')
tree = html.fromstring(content)
# TODO Exclude banners, class="share" links etc.
images = tree.xpath(
'//img[not('
'contains(@src, "avatar") or '
'contains(@src, "emoji") or '
'contains(@src, "icon") or '
'contains(@src, "logo") or '
'contains(@src, "search") or '
'contains(@src, "share") or '
'contains(@src, "smiley")'
')]/@src')
if len(images):
image = images[0]
image = str(image)
image_url = complete_url(url, image)
return image_url
def generate_epub(text, pathname):
## create an empty eBook
pathname_list = pathname.split("/")
filename = pathname_list.pop()
directory = "/".join(pathname_list)
book = xml2epub.Epub(filename)
## create chapters by url
# chapter0 = xml2epub.create_chapter_from_string(text, title=filename, strict=False)
chapter0 = xml2epub.create_chapter_from_string(text, strict=False)
#### create chapter objects
# chapter1 = xml2epub.create_chapter_from_url("https://dev.to/devteam/top-7-featured-dev-posts-from-the-past-week-h6h")
# chapter2 = xml2epub.create_chapter_from_url("https://dev.to/ks1912/getting-started-with-docker-34g6")
## add chapters to your eBook
try:
book.add_chapter(chapter0)
# book.add_chapter(chapter1)
# book.add_chapter(chapter2)
## generate epub file
filename_tmp = "slixfeedepub"
book.create_epub(directory, epub_name=filename_tmp)
pathname_tmp = os.path.join(directory, filename_tmp) + ".epub"
os.rename(pathname_tmp, pathname)
except ValueError as error:
return error
def generate_html(text, filename):
with open(filename, 'w') as file:
file.write(text)
def generate_markdown(text, filename):
h2m = html2text.HTML2Text()
# Convert HTML to Markdown
markdown = h2m.handle(text)
with open(filename, 'w') as file:
file.write(markdown)
def generate_pdf(text, filename):
try:
pdfkit.from_string(text, filename)
except IOError as error:
return error
except OSError as error:
return error
def generate_txt(text, filename):
text = remove_html_tags(text)
with open(filename, 'w') as file:
file.write(text)
def remove_html_tags(data):
data = BeautifulSoup(data, "lxml").text
data = data.replace("\n\n", "\n")
return data
# TODO Add support for eDonkey, Gnutella, Soulseek
async def get_magnet(link):
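    """
    Retrieve a torrent file for the given magnet link, falling back to
    HTTP caching services when the magnet transfer yields nothing.
    """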
parted_link = urlsplit(link)
queries = parse_qs(parted_link.query)
query_xt = queries["xt"][0]
if query_xt.startswith("urn:btih:"):
filename = queries["dn"][0]
checksum = query_xt[len("urn:btih:"):]
torrent = await fetch.magnet(link)
logging.debug('Attempting to retrieve {} ({})'
.format(filename, checksum))
if not torrent:
logging.debug(
"Attempting to retrieve {} from HTTP caching service".format(
filename))
urls = [
'https://watercache.libertycorp.org/get/{}/{}',
'https://itorrents.org/torrent/{}.torrent?title={}',
'https://firecache.libertycorp.org/get/{}/{}',
'http://fcache63sakpihd44kxdduy6kgpdhgejgp323wci435zwy6kiylcnfad.onion/get/{}/{}'
]
for url in urls:
                torrent = await fetch.http(url.format(checksum, filename))
if torrent:
break
return torrent
async def remove_nonexistent_entries(db_file, url, feed):
"""
Remove entries that don't exist in a given parsed feed.
Check the entries returned from feed and delete read non
existing entries, otherwise move to table archive, if unread.
Parameters
----------
db_file : str
Path to database file.
url : str
Feed URL.
feed : list
Parsed feed document.
"""
    feed_id = await sqlite.get_feed_id(db_file, url)
    feed_id = feed_id[0]
    items = await sqlite.get_entries_of_feed(db_file, feed_id)
    entries = feed.entries
for item in items:
ix = item[0]
entry_title = item[1]
entry_link = item[2]
entry_id = item[3]
timestamp = item[4]
read_status = item[5]
valid = False
for entry in entries:
title = None
link = None
time = None
# valid = False
# TODO better check and don't repeat code
if entry.has_key("id") and entry_id:
if entry.id == entry_id:
                    # print("compare1:", entry.id)
                    # print("compare2:", entry_id)
                    # print("============")
valid = True
break
else:
if entry.has_key("title"):
title = entry.title
else:
title = feed["feed"]["title"]
if entry.has_key("link"):
link = join_url(url, entry.link)
else:
link = url
if entry.has_key("published") and timestamp:
                    # print("compare11:", title, link, time)
                    # print("compare22:", entry_title, entry_link, timestamp)
                    # print("============")
time = dt.rfc2822_to_iso8601(entry.published)
if (entry_title == title and
entry_link == link and
timestamp == time):
valid = True
break
else:
if (entry_title == title and
entry_link == link):
# print("compare111:", title, link)
2024-01-14 22:43:23 +01:00
# print("compare222:", entry_title, entry_link)
2024-01-04 13:38:22 +01:00
# print("============")
valid = True
break
# TODO better check and don't repeat code
if not valid:
            # print("id: ", ix)
            # if title:
            # print("title: ", title)
            # print("entry_title: ", entry_title)
            # if link:
            # print("link: ", link)
            # print("entry_link: ", entry_link)
            # if entry.id:
            # print("last_entry:", entry.id)
            # print("entry_id: ", entry_id)
            # if time:
            # print("time: ", time)
            # print("timestamp: ", timestamp)
            # print("read: ", read_status)
            # breakpoint()
            # TODO Send to table archive
            # TODO Also make a regular/routine check for sources that
            # have been changed (though that can only happen when
            # manually editing)
            # ix = item[0]
            # print(">>> SOURCE: ", source)
            # print(">>> INVALID:", entry_title)
            # print("title:", entry_title)
            # print("link :", entry_link)
            # print("id :", entry_id)
if read_status == 1:
await sqlite.delete_entry_by_id(db_file, ix)
# print(">>> DELETING:", entry_title)
else:
# print(">>> ARCHIVING:", entry_title)
await sqlite.archive_entry(db_file, ix)
limit = config.get_setting_value(db_file, "archive")
await sqlite.maintain_archive(db_file, limit)
async def remove_nonexistent_entries_json(db_file, url, feed):
"""
Remove entries that don't exist in a given parsed feed.
Check the entries returned from feed and delete read non
existing entries, otherwise move to table archive, if unread.
Parameters
----------
db_file : str
Path to database file.
url : str
Feed URL.
feed : list
Parsed feed document.
"""
feed_id = await sqlite.get_feed_id(db_file, url)
feed_id = feed_id[0]
items = await sqlite.get_entries_of_feed(db_file, feed_id)
entries = feed["items"]
for item in items:
ix = item[0]
entry_title = item[1]
entry_link = item[2]
entry_id = item[3]
timestamp = item[4]
read_status = item[5]
valid = False
for entry in entries:
title = None
link = None
time = None
# valid = False
# TODO better check and don't repeat code
if entry.has_key("id") and entry_id:
if entry["id"] == entry_id:
# print("compare1:", entry.id)
# print("compare2:", entry_id)
# print("============")
valid = True
break
else:
if entry.has_key("title"):
title = entry["title"]
else:
title = feed["title"]
if entry.has_key("link"):
link = join_url(url, entry["link"])
else:
link = url
# "date_published" "date_modified"
if entry.has_key("date_published") and timestamp:
time = dt.rfc2822_to_iso8601(entry["date_published"])
if (entry_title == title and
entry_link == link and
timestamp == time):
valid = True
break
else:
if (entry_title == title and
entry_link == link):
valid = True
break
if not valid:
print("CHECK ENTRY OF JSON FEED IN ARCHIVE")
if read_status == 1:
await sqlite.delete_entry_by_id(db_file, ix)
else:
await sqlite.archive_entry(db_file, ix)
limit = config.get_setting_value(db_file, "archive")
await sqlite.maintain_archive(db_file, limit)