Slixfeed/slixfeed/action.py

951 lines
32 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) Call sqlite function from function statistics.
Returning a list of values doesn't' seem to be a good practice.
"""
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
2024-01-09 16:53:19 +01:00
import html2text
from http.client import IncompleteRead
from feedparser import parse
import logging
2024-01-09 16:53:19 +01:00
import pdfkit
from readability import Document
import slixfeed.config as config
import slixfeed.crawl as crawl
from slixfeed.datetime import (
current_date, current_time, now,
convert_struct_time_to_iso8601,
rfc2822_to_iso8601
)
import slixfeed.fetch as fetch
import slixfeed.sqlite as sqlite
from slixfeed.url import (
# complete_url,
join_url,
remove_tracking_parameters,
replace_hostname,
trim_url
)
import slixfeed.xmpp.bookmark as bookmark
from urllib import error
from urllib.parse import urlsplit
import xml.etree.ElementTree as ET
def log_to_markdown(timestamp, filename, jid, message):
"""
Log message to file.
Parameters
----------
timestamp : str
Time stamp.
filename : str
Jabber ID as name of file.
jid : str
Jabber ID.
message : str
Message content.
Returns
-------
None.
"""
with open(filename + '.md', 'a') as file:
# entry = "{} {}:\n{}\n\n".format(timestamp, jid, message)
entry = (
"## {}\n"
"### {}\n\n"
"{}\n\n").format(jid, timestamp, message)
file.write(entry)
def is_feed(feed):
"""
Determine whether document is feed or not.
Parameters
----------
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
value = False
2024-01-09 13:34:10 +01:00
# message = None
if not feed.entries:
if "version" in feed.keys():
feed["version"]
if feed.version:
value = True
# message = (
# "Empty feed for {}"
# ).format(url)
elif "title" in feed["feed"].keys():
value = True
# message = (
# "Empty feed for {}"
# ).format(url)
else:
value = False
# message = (
# "No entries nor title for {}"
# ).format(url)
elif feed.bozo:
value = False
# message = (
# "Bozo detected for {}"
# ).format(url)
else:
value = True
# message = (
# "Good feed for {}"
# ).format(url)
return value
def list_unread_entries(result, feed_title):
# TODO Add filtering
# TODO Do this when entry is added to list and mark it as read
# DONE!
# results = []
# if get_settings_value(db_file, "filter-deny"):
# while len(results) < num:
# result = cur.execute(sql).fetchone()
# blacklist = await get_settings_value(db_file, "filter-deny").split(",")
# for i in blacklist:
# if i in result[1]:
# continue
# print("rejected:", result[1])
# print("accepted:", result[1])
# results.extend([result])
# news_list = "You've got {} news items:\n".format(num)
# NOTE Why doesn't this work without list?
# i.e. for result in results
# for result in results.fetchall():
ix = result[0]
title = result[1]
# # TODO Retrieve summary from feed
# # See fetch.view_entry
# summary = result[2]
# # Remove HTML tags
# try:
# summary = BeautifulSoup(summary, "lxml").text
# except:
# print(result[2])
# breakpoint()
# # TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
# length = await get_settings_value(db_file, "length")
# summary = summary[:length] + " […]"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
link = result[2]
link = remove_tracking_parameters(link)
link = (replace_hostname(link, "link")) or link
news_item = (
"\n{}\n{}\n{}\n"
).format(str(title), str(link), str(feed_title))
return news_item
def list_search_results(query, results):
2024-01-07 10:57:54 +01:00
message = (
"Search results for '{}':\n\n```"
).format(query)
for result in results:
2024-01-07 10:57:54 +01:00
message += (
"\n{}\n{}\n"
).format(str(result[0]), str(result[1]))
if len(results):
2024-01-07 10:57:54 +01:00
message += "```\nTotal of {} results".format(len(results))
else:
2024-01-07 10:57:54 +01:00
message = "No results were found for: {}".format(query)
return message
def list_feeds_by_query(query, results):
2024-01-07 10:57:54 +01:00
message = (
"Feeds containing '{}':\n\n```"
).format(query)
for result in results:
2024-01-07 10:57:54 +01:00
message += (
"\nName : {} [{}]"
"\nURL : {}"
"\n"
).format(
str(result[0]), str(result[1]), str(result[2]))
if len(results):
2024-01-07 10:57:54 +01:00
message += "\n```\nTotal of {} feeds".format(len(results))
else:
2024-01-07 10:57:54 +01:00
message = "No feeds were found for: {}".format(query)
return message
def list_statistics(values):
"""
Return table statistics.
Parameters
----------
db_file : str
Path to database file.
Returns
-------
msg : str
Statistics as message.
"""
2024-01-07 10:57:54 +01:00
message = (
"```"
"\nSTATISTICS\n"
"News items : {}/{}\n"
"News sources : {}/{}\n"
"\nOPTIONS\n"
"Items to archive : {}\n"
"Update interval : {}\n"
"Items per update : {}\n"
"Operation status : {}\n"
"```"
).format(values[0], values[1], values[2], values[3],
values[4], values[5], values[6], values[7])
2024-01-07 10:57:54 +01:00
return message
# FIXME Replace counter by len
def list_last_entries(results, num):
2024-01-07 10:57:54 +01:00
message = "Recent {} titles:\n\n```".format(num)
for result in results:
2024-01-07 10:57:54 +01:00
message += (
"\n{}\n{}\n"
).format(
str(result[0]), str(result[1]))
if len(results):
2024-01-07 10:57:54 +01:00
message += "```\n"
else:
2024-01-07 10:57:54 +01:00
message = "There are no news at the moment."
return message
def list_feeds(results):
2024-01-07 10:57:54 +01:00
message = "\nList of subscriptions:\n\n```\n"
for result in results:
2024-01-07 10:57:54 +01:00
message += (
"Name : {}\n"
"URL : {}\n"
# "Updated : {}\n"
# "Status : {}\n"
"ID : {}\n"
"\n"
).format(
str(result[0]), str(result[1]), str(result[2]))
if len(results):
2024-01-07 10:57:54 +01:00
message += (
"```\nTotal of {} subscriptions.\n"
).format(len(results))
else:
2024-01-07 10:57:54 +01:00
message = (
"List of subscriptions is empty.\n"
"To add feed, send a URL\n"
"Try these:\n"
# TODO Pick random from featured/recommended
"https://reclaimthenet.org/feed/"
)
2024-01-07 10:57:54 +01:00
return message
async def list_bookmarks(self):
conferences = await bookmark.get(self)
2024-01-07 10:57:54 +01:00
message = "\nList of groupchats:\n\n```\n"
for conference in conferences:
2024-01-07 10:57:54 +01:00
message += (
"{}\n"
"\n"
).format(
conference["jid"]
)
2024-01-07 10:57:54 +01:00
message += (
"```\nTotal of {} groupchats.\n"
2024-01-07 10:57:54 +01:00
).format(len(conferences))
return message
def export_to_markdown(jid, filename, results):
with open(filename, 'w') as file:
file.write(
'# Subscriptions for {}\n'.format(jid))
file.write(
'## Set of feeds exported with Slixfeed\n')
for result in results:
file.write(
'- [{}]({})\n'.format(result[0], result[1]))
file.write(
'\n\n* * *\n\nThis list was saved on {} from xmpp:{} using '
'[Slixfeed](https://gitgud.io/sjehuda/slixfeed)\n'.format(
current_date(), jid))
def export_to_opml(jid, filename, results):
root = ET.Element("opml")
root.set("version", "1.0")
head = ET.SubElement(root, "head")
ET.SubElement(head, "title").text = "Subscriptions for {}".format(jid)
ET.SubElement(head, "description").text = (
"Set of feeds exported with Slixfeed")
ET.SubElement(head, "generator").text = "Slixfeed"
ET.SubElement(head, "urlPublic").text = (
"https://gitgud.io/sjehuda/slixfeed")
time_stamp = current_time()
ET.SubElement(head, "dateCreated").text = time_stamp
ET.SubElement(head, "dateModified").text = time_stamp
body = ET.SubElement(root, "body")
for result in results:
outline = ET.SubElement(body, "outline")
outline.set("text", result[0])
outline.set("xmlUrl", result[1])
# outline.set("type", result[2])
tree = ET.ElementTree(root)
tree.write(filename)
async def import_opml(db_file, url):
result = await fetch.download_feed(url)
document = result[0]
if document:
root = ET.fromstring(document)
before = await sqlite.get_number_of_items(
db_file, 'feeds')
feeds = []
for child in root.findall(".//outline"):
url = child.get("xmlUrl")
title = child.get("text")
# feed = (url, title)
# feeds.extend([feed])
feeds.extend([(url, title)])
await sqlite.import_feeds(
db_file, feeds)
after = await sqlite.get_number_of_items(
db_file, 'feeds')
difference = int(after) - int(before)
return difference
async def add_feed(db_file, url):
while True:
exist = await sqlite.get_feed_id_and_name(db_file, url)
if not exist:
2024-01-04 14:58:06 +01:00
result = await fetch.download_feed(url)
document = result[0]
status_code = result[1]
if document:
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
if "language" in feed["feed"].keys():
language = feed["feed"]["language"]
else:
language = ''
if "encoding" in feed.keys():
encoding = feed["encoding"]
else:
encoding = ''
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
updated = convert_struct_time_to_iso8601(updated)
else:
updated = ''
version = feed["version"]
entries = len(feed["entries"])
await sqlite.insert_feed(
db_file, url,
title=title,
entries=entries,
version=version,
encoding=encoding,
language=language,
status_code=status_code,
updated=updated
)
await scan(
db_file, url)
2024-01-04 13:38:22 +01:00
old = (
await sqlite.get_settings_value(
db_file, "old")
) or (
config.get_value_default(
"settings", "Settings", "old")
)
if not old:
await sqlite.mark_feed_as_read(
db_file, url)
response = (
"> {}\nNews source {} has been "
"added to subscription list."
).format(url, title)
break
else:
result = await crawl.probe_page(
url, document)
2024-01-09 13:34:10 +01:00
if isinstance(result, str):
response = result
break
2024-01-09 13:34:10 +01:00
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status_code)
break
else:
ix = exist[0]
name = exist[1]
response = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
break
return response
async def view_feed(url):
while True:
2024-01-04 14:58:06 +01:00
result = await fetch.download_feed(url)
document = result[0]
status = result[1]
if document:
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
entries = feed.entries
response = "Preview of {}:\n\n```\n".format(title)
counter = 0
for entry in entries:
counter += 1
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
response += (
"Title : {}\n"
"Date : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(title, date, link, counter)
if counter > 4:
break
response += (
"```\nSource: {}"
).format(url)
break
else:
result = await crawl.probe_page(
url, document)
2024-01-09 13:34:10 +01:00
if isinstance(result, str):
response = result
break
2024-01-09 13:34:10 +01:00
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
break
return response
async def view_entry(url, num):
while True:
2024-01-04 14:58:06 +01:00
result = await fetch.download_feed(url)
document = result[0]
status = result[1]
if document:
feed = parse(document)
# if is_feed(url, feed):
if is_feed(feed):
if "title" in feed["feed"].keys():
title = feed["feed"]["title"]
else:
title = urlsplit(url).netloc
entries = feed.entries
num = int(num) - 1
entry = entries[num]
response = "Preview of {}:\n\n```\n".format(title)
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
else:
summary = "*** No summary ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
response = (
"{}\n"
"\n"
# "> {}\n"
"{}\n"
"\n"
"{}\n"
"\n"
).format(title, summary, link)
break
else:
result = await crawl.probe_page(
url, document)
2024-01-09 13:34:10 +01:00
if isinstance(result, str):
response = result
break
2024-01-09 13:34:10 +01:00
else:
url = result[0]
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
break
return response
async def scan(db_file, url):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
if isinstance(url, tuple): url = url[0]
2024-01-07 10:57:54 +01:00
result = await fetch.download_feed(url)
try:
document = result[0]
status = result[1]
except:
return
if document and status == 200:
feed = parse(document)
entries = feed.entries
# length = len(entries)
await remove_nonexistent_entries(
db_file, feed, url)
try:
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
# print(bozo)
valid = 0
else:
valid = 1
await sqlite.update_feed_validity(
db_file, url, valid)
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
updated = convert_struct_time_to_iso8601(updated)
else:
updated = ''
await sqlite.update_feed_properties(
db_file, url, len(feed["entries"]), updated)
# await update_feed_status
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
2024-01-07 10:57:54 +01:00
logging.error(e)
return
# new_entry = 0
for entry in entries:
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = now()
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
title = entry.title if entry.has_key("title") else date
entry_id = entry.id if entry.has_key("id") else link
exist = await sqlite.check_entry_exist(
db_file, url, entry_id=entry_id,
title=title, link=link, date=date)
if not exist:
2024-01-07 10:57:54 +01:00
summary = entry.summary if entry.has_key("summary") else ''
read_status = 0
pathname = urlsplit(link).path
string = ("{} {} {}"
).format(
title, summary, pathname
)
allow_list = await config.is_include_keyword(
db_file, "filter-allow", string)
if not allow_list:
reject_list = await config.is_include_keyword(
db_file, "filter-deny", string)
if reject_list:
read_status = 1
2024-01-09 13:34:10 +01:00
logging.debug(
"Rejected due to keyword {}".format(reject_list))
if isinstance(date, int):
2024-01-09 13:34:10 +01:00
logging.error(
"Variable 'date' is int: {}".format(date))
await sqlite.add_entry(
db_file, title, link, entry_id,
url, date, read_status)
await sqlite.set_date(db_file, url)
2024-01-09 16:53:19 +01:00
async def get_content(db_file, ix):
url = sqlite.get_entry_url(db_file, ix)
result = await fetch.download_feed(url)
if result[0]:
document = Document(result[0])
return document.summary()
# TODO Either adapt it to filename
# or change it to something else
#filename = document.title()
# with open(filename, 'w') as file:
# html_doc = document.summary()
# file.write(html_doc)
def generate_html(text, filename):
with open(filename, 'w') as file:
file.write(text)
def generate_pdf(text, filename):
pdfkit.from_string(text, filename)
def generate_markdown(text, filename):
h2m = html2text.HTML2Text()
# Convert HTML to Markdown
markdown = h2m.handle(text)
with open(filename, 'w') as file:
file.write(markdown)
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
for url in urls:
# print(os.path.basename(db_file), url[0])
url = url[0]
res = await fetch.download_feed(url)
# TypeError: 'NoneType' object is not subscriptable
if res is None:
# Skip to next feed
# urls.next()
# next(urls)
continue
status = res[1]
await sqlite.update_feed_status(
db_file, url, status)
if res[0]:
try:
feed = parse(res[0])
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
# print(bozo)
valid = 0
else:
valid = 1
await sqlite.update_feed_validity(
db_file, url, valid)
if "updated_parsed" in feed["feed"].keys():
updated = feed["feed"]["updated_parsed"]
updated = convert_struct_time_to_iso8601(updated)
else:
updated = ''
entries = len(feed["entries"])
await sqlite.update_feed_properties(
db_file, url, entries, updated)
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
2024-01-09 13:34:10 +01:00
logging.error(e)
# TODO Print error to log
# None
# NOTE I don't think there should be "return"
# because then we might stop scanning next URLs
# return
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
if status == 200:
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
entries = feed.entries
# length = len(entries)
# await remove_entry(db_file, source, length)
2024-01-04 13:38:22 +01:00
await remove_nonexistent_entries(
db_file, feed, url)
# new_entry = 0
for entry in entries:
# TODO Pass date too for comparion check
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
# TODO Just set date = "*** No date ***"
# date = await datetime.now().isoformat()
date = now()
# NOTE Would seconds result in better database performance
# date = datetime.datetime(date)
# date = (date-datetime.datetime(1970,1,1)).total_seconds()
if entry.has_key("title"):
title = entry.title
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
else:
title = date
# title = feed["feed"]["title"]
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = url
if entry.has_key("id"):
eid = entry.id
else:
eid = link
exist = await sqlite.check_entry_exist(
db_file, url, eid=eid,
title=title, link=link, date=date)
if not exist:
# new_entry = new_entry + 1
# TODO Enhance summary
if entry.has_key("summary"):
summary = entry.summary
# # Remove HTML tags
# summary = BeautifulSoup(summary, "lxml").text
# # TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
# summary = summary[:300] + " […]‍⃨"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
else:
summary = "> *** No summary ***"
read_status = 0
pathname = urlsplit(link).path
2024-01-04 13:38:22 +01:00
string = ("{} {} {}"
).format(
title, summary, pathname
)
allow_list = await config.is_include_keyword(
db_file, "filter-allow", string)
if not allow_list:
2024-01-04 13:38:22 +01:00
reject_list = await config.is_include_keyword(
db_file, "filter-deny", string)
if reject_list:
# print(">>> REJECTED", title)
summary = (
"REJECTED {}".format(
reject_list.upper()
)
)
# summary = ""
read_status = 1
entry = (
title, link, eid, url, date, read_status)
if isinstance(date, int):
print("PROBLEM: date is int")
print(date)
# breakpoint()
# print(source)
# print(date)
await sqlite.add_entry_and_set_date(
db_file, url, entry)
# print(current_time(), entry, title)
# else:
# print(current_time(), exist, title)
async def remove_nonexistent_entries(db_file, feed, url):
2024-01-04 13:38:22 +01:00
"""
Remove entries that don't exist in a given parsed feed.
Check the entries returned from feed and delete read non
existing entries, otherwise move to table archive, if unread.
Parameters
----------
db_file : str
Path to database file.
feed : list
Parsed feed document.
url : str
2024-01-04 13:38:22 +01:00
Feed URL. URL of associated feed.
"""
items = await sqlite.get_entries_of_feed(db_file, feed, url)
2024-01-04 13:38:22 +01:00
entries = feed.entries
# breakpoint()
for item in items:
valid = False
for entry in entries:
title = None
link = None
time = None
# valid = False
# TODO better check and don't repeat code
if entry.has_key("id") and item[3]:
if entry.id == item[3]:
# print("compare1:", entry.id)
# print("compare2:", item[3])
# print("============")
valid = True
break
else:
if entry.has_key("title"):
title = entry.title
else:
title = feed["feed"]["title"]
if entry.has_key("link"):
link = join_url(url, entry.link)
2024-01-04 13:38:22 +01:00
else:
link = url
2024-01-04 13:38:22 +01:00
if entry.has_key("published") and item[4]:
# print("compare11:", title, link, time)
# print("compare22:", item[1], item[2], item[4])
# print("============")
time = rfc2822_to_iso8601(entry.published)
if (item[1] == title and
item[2] == link and
item[4] == time):
valid = True
break
else:
if (item[1] == title and
item[2] == link):
# print("compare111:", title, link)
# print("compare222:", item[1], item[2])
# print("============")
valid = True
break
# TODO better check and don't repeat code
if not valid:
# print("id: ", item[0])
# if title:
# print("title: ", title)
# print("item[1]: ", item[1])
# if link:
# print("link: ", link)
# print("item[2]: ", item[2])
# if entry.id:
# print("last_entry:", entry.id)
# print("item[3]: ", item[3])
# if time:
# print("time: ", time)
# print("item[4]: ", item[4])
# print("read: ", item[5])
# breakpoint()
# TODO Send to table archive
# TODO Also make a regular/routine check for sources that
# have been changed (though that can only happen when
# manually editing)
ix = item[0]
# print(">>> SOURCE: ", source)
# print(">>> INVALID:", item[1])
# print("title:", item[1])
# print("link :", item[2])
# print("id :", item[3])
if item[5] == 1:
await sqlite.delete_entry_by_id(db_file, ix)
2024-01-04 13:38:22 +01:00
# print(">>> DELETING:", item[1])
else:
# print(">>> ARCHIVING:", item[1])
await sqlite.archive_entry(db_file, ix)
2024-01-04 13:38:22 +01:00
limit = (
await sqlite.get_settings_value(db_file, "archive")
) or (
config.get_value_default("settings", "Settings", "archive")
)
await sqlite.maintain_archive(db_file, limit)