Improve SQLite performance.

Handle missing packages errors.
This commit is contained in:
Schimon Jehudah 2024-01-10 20:06:56 +00:00
parent 46a0819229
commit 0ca37dfdee
4 changed files with 327 additions and 122 deletions

View file

@ -5,19 +5,25 @@
TODO TODO
1) Call sqlite function from function statistics. 1) Function scan at "for entry in entries"
Suppress directly calling function "add_entry" (accept db_file)
Pass a list of valid entries to a new function "add_entries"
(accept db_file) which would call function "add_entry" (accept cur).
* accelerate adding of large set of entries at once.
* prevent (or mitigate) halt of subsequent actions.
* reduce I/O.
2) Call sqlite function from function statistics.
Returning a list of values doesn't seem to be a good practice. Returning a list of values doesn't seem to be a good practice.
""" """
from asyncio.exceptions import IncompleteReadError from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import html2text
from http.client import IncompleteRead from http.client import IncompleteRead
from feedparser import parse from feedparser import parse
import logging import logging
from lxml import html from lxml import html
import pdfkit
from readability import Document from readability import Document
import slixfeed.config as config import slixfeed.config as config
import slixfeed.crawl as crawl import slixfeed.crawl as crawl
@ -40,6 +46,20 @@ from urllib import error
from urllib.parse import urlsplit from urllib.parse import urlsplit
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
# Optional dependencies: each one enables a single export format.
# Only a missing package is tolerated (ImportError); any other failure
# while importing is left to propagate. On failure the name is bound to
# None so that later code can test for availability instead of hitting
# a NameError.
try:
    import html2text
except ImportError:
    html2text = None
    logging.info(
        "Package html2text was not found.\n"
        "Markdown support is disabled.")

try:
    import pdfkit
except ImportError:
    pdfkit = None
    logging.info(
        "Package pdfkit was not found.\n"
        "PDF support is disabled.")
def log_to_markdown(timestamp, filename, jid, message): def log_to_markdown(timestamp, filename, jid, message):
""" """
@ -302,13 +322,14 @@ def export_to_markdown(jid, filename, results):
current_date(), jid)) current_date(), jid))
# TODO Consider adding element jid as a pointer of import
def export_to_opml(jid, filename, results): def export_to_opml(jid, filename, results):
root = ET.Element("opml") root = ET.Element("opml")
root.set("version", "1.0") root.set("version", "1.0")
head = ET.SubElement(root, "head") head = ET.SubElement(root, "head")
ET.SubElement(head, "title").text = "Subscriptions for {}".format(jid) ET.SubElement(head, "title").text = "{}".format(jid)
ET.SubElement(head, "description").text = ( ET.SubElement(head, "description").text = (
"Set of feeds exported with Slixfeed") "Set of subscriptions exported by Slixfeed")
ET.SubElement(head, "generator").text = "Slixfeed" ET.SubElement(head, "generator").text = "Slixfeed"
ET.SubElement(head, "urlPublic").text = ( ET.SubElement(head, "urlPublic").text = (
"https://gitgud.io/sjehuda/slixfeed") "https://gitgud.io/sjehuda/slixfeed")
@ -339,8 +360,8 @@ async def import_opml(db_file, url):
# feed = (url, title) # feed = (url, title)
# feeds.extend([feed]) # feeds.extend([feed])
feeds.extend([(url, title)]) feeds.extend([(url, title)])
await sqlite.import_feeds( await sqlite.import_feeds(db_file, feeds)
db_file, feeds) await sqlite.add_metadata(db_file)
after = await sqlite.get_number_of_items( after = await sqlite.get_number_of_items(
db_file, 'feeds') db_file, 'feeds')
difference = int(after) - int(before) difference = int(after) - int(before)
@ -581,6 +602,7 @@ async def scan(db_file, url):
status = result[1] status = result[1]
except: except:
return return
new_entries = []
if document and status == 200: if document and status == 200:
feed = parse(document) feed = parse(document)
entries = feed.entries entries = feed.entries
@ -642,10 +664,10 @@ async def scan(db_file, url):
summary = entry.summary if entry.has_key("summary") else '' summary = entry.summary if entry.has_key("summary") else ''
read_status = 0 read_status = 0
pathname = urlsplit(link).path pathname = urlsplit(link).path
string = ("{} {} {}" string = (
).format( "{} {} {}"
title, summary, pathname ).format(
) title, summary, pathname)
allow_list = await config.is_include_keyword( allow_list = await config.is_include_keyword(
db_file, "filter-allow", string) db_file, "filter-allow", string)
if not allow_list: if not allow_list:
@ -654,24 +676,42 @@ async def scan(db_file, url):
if reject_list: if reject_list:
read_status = 1 read_status = 1
logging.debug( logging.debug(
"Rejected due to keyword {}".format(reject_list)) "Rejected : {}\n"
"Keyword : {}".format(
link, reject_list))
if isinstance(date, int): if isinstance(date, int):
logging.error( logging.error(
"Variable 'date' is int: {}".format(date)) "Variable 'date' is int: {}".format(date))
await sqlite.add_entry( entry = {
db_file, title, link, entry_id, "title": title,
url, date, read_status) "link": link,
await sqlite.set_date(db_file, url) "entry_id": entry_id,
"url": url,
"date": date,
"read_status": read_status
}
new_entries.extend([entry])
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
if len(new_entries):
await sqlite.add_entries_and_update_timestamp(
db_file, new_entries)
async def get_content(url): async def get_content(url):
result = await fetch.download_feed(url) result = await fetch.download_feed(url)
if result[0]: data = result[0]
code = result[1]
if data:
document = Document(result[0]) document = Document(result[0])
content = document.summary() content = document.summary()
info = [code, content]
else: else:
content = None info = [code, None]
return content return info
# TODO Either adapt it to filename # TODO Either adapt it to filename
# or change it to something else # or change it to something else
#filename = document.title() #filename = document.title()
@ -691,21 +731,22 @@ def extract_first_image(url, content):
image_url = None image_url = None
return image_url return image_url
def generate_html(text, filename):
    """
    Write HTML text to a file.

    Parameters
    ----------
    text : str
        HTML text to write.
    filename : str
        Path of the file to create or overwrite.
    """
    # Explicit UTF-8 so that non-ASCII content is written the same way
    # regardless of the locale of the host.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
def generate_pdf(text, filename):
    """
    Hand text to pdfkit.from_string with filename as the output path.

    Parameters
    ----------
    text : str
        Text to convert (presumably HTML; verify against callers).
    filename : str
        Path of the output file.
    """
    pdfkit.from_string(text, filename)
def generate_markdown(text, filename):
    """
    Convert HTML text to Markdown and write the result to a file.

    Parameters
    ----------
    text : str
        HTML text to convert.
    filename : str
        Path of the file to create or overwrite.
    """
    h2m = html2text.HTML2Text()
    # Convert HTML to Markdown
    markdown = h2m.handle(text)
    # Explicit UTF-8 so that non-ASCII content is written the same way
    # regardless of the locale of the host.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown)
# NOTE Why (if res[0]) and (if res[1] == 200)? # NOTE Why (if res[0]) and (if res[1] == 200)?

View file

@ -5,14 +5,11 @@
TODO TODO
1) Table feeds: 1) Function to open connection (receive db_file).
category Function to close connection.
type (atom, rdf, rss0.9. rss2 etc.) All other functions to receive cursor.
2) Function mark_all_read for entries of given feed
3) Statistics
2) Merge function add_metadata into function import_feeds.
""" """
from asyncio import Lock from asyncio import Lock
@ -89,7 +86,7 @@ def create_tables(db_file):
""" """
CREATE TABLE IF NOT EXISTS properties ( CREATE TABLE IF NOT EXISTS properties (
id INTEGER NOT NULL, id INTEGER NOT NULL,
feed_id INTEGER NOT NULL, feed_id INTEGER NOT NULL UNIQUE,
type TEXT, type TEXT,
encoding TEXT, encoding TEXT,
language TEXT, language TEXT,
@ -105,7 +102,7 @@ def create_tables(db_file):
""" """
CREATE TABLE IF NOT EXISTS status ( CREATE TABLE IF NOT EXISTS status (
id INTEGER NOT NULL, id INTEGER NOT NULL,
feed_id INTEGER NOT NULL, feed_id INTEGER NOT NULL UNIQUE,
enabled INTEGER NOT NULL DEFAULT 1, enabled INTEGER NOT NULL DEFAULT 1,
updated TEXT, updated TEXT,
scanned TEXT, scanned TEXT,
@ -113,6 +110,7 @@ def create_tables(db_file):
status_code INTEGER, status_code INTEGER,
valid INTEGER, valid INTEGER,
filter INTEGER NOT NULL DEFAULT 1, filter INTEGER NOT NULL DEFAULT 1,
priority INTEGER,
FOREIGN KEY ("feed_id") REFERENCES "feeds" ("id") FOREIGN KEY ("feed_id") REFERENCES "feeds" ("id")
ON UPDATE CASCADE ON UPDATE CASCADE
ON DELETE CASCADE, ON DELETE CASCADE,
@ -260,6 +258,84 @@ async def import_feeds(db_file, feeds):
logging.error(e) logging.error(e)
async def add_metadata(db_file):
    """
    Create status and properties rows for the feeds of table feeds.

    The ID of every feed is passed to insert_feed_status and to
    insert_feed_properties.

    Parameters
    ----------
    db_file : str
        Path to database file.
    """
    async with DBLOCK:
        with create_connection(db_file) as conn:
            cur = conn.cursor()
            sql = (
                """
                SELECT id
                FROM feeds
                ORDER BY id ASC
                """
                )
            # Fetch all IDs first: the helpers execute statements on the
            # same cursor, which would discard a result set that is
            # still being iterated.
            ixs = cur.execute(sql).fetchall()
            for ix in ixs:
                feed_id = ix[0]
                insert_feed_status(cur, feed_id)
                insert_feed_properties(cur, feed_id)
def insert_feed_status(cur, feed_id):
    """
    Insert a row for the given feed into table status.

    An IntegrityError raised by the insertion (e.g. the feed already
    has a row) is logged as a warning and the feed is skipped.

    Parameters
    ----------
    cur : object
        Cursor object.
    feed_id : int
        ID of the feed.
    """
    sql = (
        """
        INSERT
        INTO status(
            feed_id)
        VALUES(
            ?)
        """
        )
    try:
        cur.execute(sql, (feed_id,))
    except IntegrityError as e:
        # add_metadata calls this for every feed, so a collision is a
        # normal event: report it once, as a warning, not as an error.
        logging.warning(
            "Skipping feed_id %s for table status: %s", feed_id, e)
def insert_feed_properties(cur, feed_id):
    """
    Insert a row for the given feed into table properties.

    An IntegrityError raised by the insertion (e.g. the feed already
    has a row) is logged as a warning and the feed is skipped.

    Parameters
    ----------
    cur : object
        Cursor object.
    feed_id : int
        ID of the feed.
    """
    sql = (
        """
        INSERT
        INTO properties(
            feed_id)
        VALUES(
            ?)
        """
        )
    try:
        cur.execute(sql, (feed_id,))
    except IntegrityError as e:
        # add_metadata calls this for every feed, so a collision is a
        # normal event: report it once, as a warning, not as an error.
        logging.warning(
            "Skipping feed_id %s for table properties: %s", feed_id, e)
async def insert_feed( async def insert_feed(
db_file, url, title=None, entries=None, version=None, db_file, url, title=None, entries=None, version=None,
encoding=None, language=None, status_code=None, updated=None): encoding=None, language=None, status_code=None, updated=None):
@ -339,6 +415,61 @@ async def insert_feed(
cur.execute(sql, properties) cur.execute(sql, properties)
async def insert_feed_(
        db_file, url, title=None, entries=None, version=None,
        encoding=None, language=None, status_code=None, updated=None):
    """
    Insert a new feed into table feeds and create its properties and
    status rows.

    Only title and url are stored. The remaining optional parameters
    are accepted but are not written by this function.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.
    title : str, optional
        Feed title. The default is None.
    entries : int, optional
        Number of entries. The default is None.
    version : str, optional
        Type of feed. The default is None.
    encoding : str, optional
        Encoding of feed. The default is None.
    language : str, optional
        Language code of feed. The default is None.
    status_code : str, optional
        HTTP status code. The default is None.
    updated : ???, optional
        Date feed was last updated. The default is None.
    """
    async with DBLOCK:
        with create_connection(db_file) as conn:
            cur = conn.cursor()
            feed = (
                title, url
                )
            sql = (
                """
                INSERT
                INTO feeds(
                    name, url)
                VALUES(
                    ?, ?)
                """
                )
            cur.execute(sql, feed)
            feed_id = get_feed_id(cur, url)
            # Both helpers take exactly (cur, feed_id); passing extra
            # keyword arguments to them raises TypeError.
            insert_feed_properties(cur, feed_id)
            insert_feed_status(cur, feed_id)
async def remove_feed_by_url(db_file, url): async def remove_feed_by_url(db_file, url):
""" """
Delete a feed by feed URL. Delete a feed by feed URL.
@ -560,7 +691,7 @@ async def get_unread_entries(db_file, num):
return results return results
async def get_feed_id(cur, url): def get_feed_id(cur, url):
""" """
Get index of given feed. Get index of given feed.
@ -896,9 +1027,9 @@ async def set_enabled_status(db_file, ix, status):
cur = conn.cursor() cur = conn.cursor()
sql = ( sql = (
""" """
UPDATE feeds UPDATE status
SET enabled = :status SET enabled = :status
WHERE id = :id WHERE feed_id = :id
""" """
) )
cur.execute(sql, { cur.execute(sql, {
@ -943,14 +1074,7 @@ async def add_entry(
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
feed_id = cur.execute(sql, (url,)).fetchone()[0]
sql = ( sql = (
""" """
INSERT INSERT
@ -985,6 +1109,59 @@ async def add_entry(
# # breakpoint() # # breakpoint()
async def add_entries_and_update_timestamp(db_file, new_entries):
    """
    Add new entries and set the renewed date of their feeds to today.

    All statements are executed on a single connection while DBLOCK is
    held.

    Parameters
    ----------
    db_file : str
        Path to database file.
    new_entries : list
        Entries as dicts with keys "title", "link", "entry_id", "url",
        "date" and "read_status".
    """
    sql_insert = (
        """
        INSERT
        INTO entries(
            title, link, entry_id, feed_id, timestamp, read)
        VALUES(
            :title, :link, :entry_id, :feed_id, :timestamp, :read)
        """
        )
    sql_update = (
        """
        UPDATE status
        SET renewed = :today
        WHERE feed_id = :feed_id
        """
        )
    async with DBLOCK:
        with create_connection(db_file) as conn:
            cur = conn.cursor()
            # Map of feed URL to feed ID, so that each feed is looked up
            # once instead of once per entry and again per feed.
            feed_ids = {}
            for entry in new_entries:
                url = entry["url"]
                if url not in feed_ids:
                    feed_ids[url] = get_feed_id(cur, url)
                cur.execute(sql_insert, {
                    "title": entry["title"],
                    "link": entry["link"],
                    "entry_id": entry["entry_id"],
                    "feed_id": feed_ids[url],
                    "timestamp": entry["date"],
                    "read": entry["read_status"]
                    })
            today = date.today()
            for feed_id in feed_ids.values():
                cur.execute(sql_update, {
                    "today": today,
                    "feed_id": feed_id
                    })
async def set_date(db_file, url): async def set_date(db_file, url):
""" """
Set renewed date of given feed. Set renewed date of given feed.
@ -999,14 +1176,7 @@ async def set_date(db_file, url):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
feed_id = cur.execute(sql, (url,)).fetchone()[0]
sql = ( sql = (
""" """
UPDATE status UPDATE status
@ -1037,17 +1207,7 @@ async def update_feed_status(db_file, url, status_code):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
# try:
feed_id = cur.execute(sql, (url,)).fetchone()[0]
# except:
# breakpoint()
sql = ( sql = (
""" """
UPDATE status UPDATE status
@ -1078,14 +1238,7 @@ async def update_feed_validity(db_file, url, valid):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
feed_id = cur.execute(sql, (url,)).fetchone()[0]
sql = ( sql = (
""" """
UPDATE status UPDATE status
@ -1117,14 +1270,7 @@ async def update_feed_properties(db_file, url, entries, updated):
async with DBLOCK: async with DBLOCK:
with create_connection(db_file) as conn: with create_connection(db_file) as conn:
cur = conn.cursor() cur = conn.cursor()
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
feed_id = cur.execute(sql, (url,)).fetchone()[0]
sql = ( sql = (
""" """
UPDATE properties UPDATE properties
@ -1455,14 +1601,7 @@ async def check_entry_exist(
cur = get_cursor(db_file) cur = get_cursor(db_file)
exist = False exist = False
if entry_id: if entry_id:
sql = ( feed_id = get_feed_id(cur, url)
"""
SELECT id
FROM feeds
WHERE url = :url
"""
)
feed_id = cur.execute(sql, (url,)).fetchone()[0]
sql = ( sql = (
""" """
SELECT id SELECT id

View file

@ -242,8 +242,11 @@ async def send_update(self, jid, num=None):
# breakpoint() # breakpoint()
await mark_as_read(db_file, result[0]) await mark_as_read(db_file, result[0])
if not image_url: if not image_url:
content = await action.get_content(url) info = await action.get_content(url)
image_url = action.extract_first_image(url, content) content = info[1]
status = info[0]
if status == 200:
image_url = action.extract_first_image(url, content)
new = " ".join(news_digest) new = " ".join(news_digest)
# breakpoint() # breakpoint()
if new: if new:

View file

@ -98,7 +98,7 @@ async def message(self, message):
count = await action.import_opml(db_file, url) count = await action.import_opml(db_file, url)
if count: if count:
response = ( response = (
"Successfully imported {} feeds" "Successfully imported {} feeds."
).format(count) ).format(count)
else: else:
response = ( response = (
@ -109,6 +109,7 @@ async def message(self, message):
await task.start_tasks_xmpp( await task.start_tasks_xmpp(
self, jid, ["status"]) self, jid, ["status"])
send_reply_message(self, message, response) send_reply_message(self, message, response)
return
if message["type"] == "groupchat": if message["type"] == "groupchat":
@ -399,19 +400,19 @@ async def message(self, message):
status_type = "dnd" status_type = "dnd"
status_message = ( status_message = (
"📤️ Procesing request to export feeds into {} ..." "📤️ Procesing request to export feeds into {} ..."
).format(key) ).format(ex)
send_status_message( send_status_message(
self, jid, status_type, status_message) self, jid, status_type, status_message)
data_dir = get_default_data_directory() data_dir = get_default_data_directory()
if not os.path.isdir(data_dir): if not os.path.isdir(data_dir):
os.mkdir(data_dir) os.mkdir(data_dir)
if not os.path.isdir(data_dir + '/' + key): if not os.path.isdir(data_dir + '/' + ex):
os.mkdir(data_dir + '/' + key) os.mkdir(data_dir + '/' + ex)
filename = os.path.join( filename = os.path.join(
data_dir, key, "slixfeed_" + timestamp() + "." + key) data_dir, ex, "slixfeed_" + timestamp() + "." + ex)
db_file = get_pathname_to_database(jid) db_file = get_pathname_to_database(jid)
results = await sqlite.get_feeds(db_file) results = await sqlite.get_feeds(db_file)
match key: match ex:
case "html": case "html":
response = "Not yet implemented." response = "Not yet implemented."
case "md": case "md":
@ -425,7 +426,7 @@ async def message(self, message):
url = await upload.start(self, jid, filename) url = await upload.start(self, jid, filename)
# response = ( # response = (
# "Feeds exported successfully to {}.\n{}" # "Feeds exported successfully to {}.\n{}"
# ).format(key, url) # ).format(ex, url)
# send_oob_reply_message(message, url, response) # send_oob_reply_message(message, url, response)
await send_oob_message( await send_oob_message(
self, jid, url) self, jid, url)
@ -468,23 +469,37 @@ async def message(self, message):
response = "No entry Id with {}".format(ix) response = "No entry Id with {}".format(ix)
except: except:
url = ix_url url = ix_url
content = await action.get_content(url) url = uri.remove_tracking_parameters(url)
url = (uri.replace_hostname(url, "link")) or url
info = await action.get_content(url)
content = info[1]
status = info[0]
if content: if content:
match ext: try:
case "html": match ext:
action.generate_html(content, filename) case "html":
case "md": action.generate_html(content, filename)
action.generate_markdown(content, filename) case "md":
case "pdf": action.generate_markdown(content, filename)
action.generate_pdf(content, filename) case "pdf":
url = await upload.start( action.generate_pdf(content, filename)
self, jid, filename) url = await upload.start(
await send_oob_message( self, jid, filename)
self, jid, url) await send_oob_message(
await task.start_tasks_xmpp( self, jid, url)
self, jid, ["status"]) await task.start_tasks_xmpp(
self, jid, ["status"])
except:
logging.warning(
"Check that packages html2text, pdfkit "
"and wkhtmltopdf are installed")
response = (
"Failed to export to {}"
).format(ext)
else: else:
response = "Failed to fetch resource." response = (
"Failed to fetch resource. Reason: {}"
).format(status)
else: else:
response = "Missing entry Id." response = "Missing entry Id."
else: else:
@ -506,7 +521,7 @@ async def message(self, message):
# count = await action.import_opml(db_file, url) # count = await action.import_opml(db_file, url)
# if count: # if count:
# response = ( # response = (
# "Successfully imported {} feeds" # "Successfully imported {} feeds."
# ).format(count) # ).format(count)
# else: # else:
# response = ( # response = (
@ -532,12 +547,19 @@ async def message(self, message):
url = uri.feed_to_http(url) url = uri.feed_to_http(url)
url = (uri.replace_hostname(url, "feed")) or url url = (uri.replace_hostname(url, "feed")) or url
db_file = get_pathname_to_database(jid) db_file = get_pathname_to_database(jid)
response = await action.add_feed( try:
db_file, url) response = await action.add_feed(
await task.clean_tasks_xmpp( db_file, url)
jid, ["status"]) await task.clean_tasks_xmpp(
await task.start_tasks_xmpp( jid, ["status"])
self, jid, ["status"]) await task.start_tasks_xmpp(
self, jid, ["status"])
except:
response = (
"> {}\nNews source is in the process "
"of being added to the subscription "
"list.".format(url)
)
send_reply_message(self, message, response) send_reply_message(self, message, response)
case _ if message_lowercase.startswith("feeds"): case _ if message_lowercase.startswith("feeds"):
query = message_text[6:] query = message_text[6:]
@ -872,7 +894,7 @@ async def message(self, message):
try: try:
await sqlite.set_enabled_status(db_file, ix, 1) await sqlite.set_enabled_status(db_file, ix, 1)
response = ( response = (
"Updates are now disabled for news source {}." "Updates are now enabled for news source {}."
).format(ix) ).format(ix)
except: except:
response = "No news source with ID {}.".format(ix) response = "No news source with ID {}.".format(ix)