Blasta/blasta/utilities/data.py

319 lines
14 KiB
Python
Raw Normal View History

#!/usr/bin/python
# -*- coding: utf-8 -*-
from blasta.database.sqlite import DatabaseSQLite
from blasta.utilities.cryptography import UtilitiesCryptography
from blasta.utilities.syndication import UtilitiesSyndication
from blasta.xmpp.pubsub import XmppPubsub
from datetime import datetime
from lxml import etree
import os
from slixmpp.stanza.iq import Iq
import time
import tomli_w
try:
import tomllib
except:
import tomli as tomllib
class UtilitiesData:
def cache_items_and_tags_search(directory_cache, entries, jid, query):
"""Create a cache file of node items and tags."""
item_ids = []
tags = {}
for entry in entries:
entry_tags = entry['tags']
entry_url_hash = entry['url_hash']
tags_to_include = []
if query in ' '.join([entry['title'], entry['link'], entry['summary'], ' '.join(entry_tags)]):
item_ids.append(entry_url_hash)
tags_to_include += entry_tags
for tag_to_include in tags_to_include:
tags[tag_to_include] = tags[tag_to_include]+1 if tag_to_include in tags else 1
if tags:
tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
tags = dict(list(tags.items())[:30])
if item_ids:
filename = os.path.join(directory_cache, 'data', jid + '_query.toml')
data = {
'item_ids' : item_ids,
'tags' : tags}
UtilitiesData.save_to_toml(filename, data)
def cache_items_and_tags_filter(directory_cache, entries, jid, tag):
"""Create a cache file of node items and tags."""
item_ids = []
tags = {}
for entry in entries:
entry_tags = entry['tags']
entry_url_hash = entry['url_hash']
tags_to_include = []
if tag in entry_tags:
item_ids.append(entry_url_hash)
tags_to_include += entry_tags
for tag_to_include in tags_to_include:
tags[tag_to_include] = tags[tag_to_include]+1 if tag_to_include in tags else 1
if tags:
tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
tags = dict(list(tags.items())[:30])
del tags[tag]
if item_ids:
directory = os.path.join(directory_cache, 'data', jid)
if not os.path.exists(directory):
os.mkdir(directory)
filename = os.path.join(directory, tag + '.toml')
# Add support for search query
#filename = 'data/{}/query:{}.toml'.format(jid, query)
#filename = 'data/{}/tag:{}.toml'.format(jid, tag)
data = {
'item_ids' : item_ids,
'tags' : tags}
UtilitiesData.save_to_toml(filename, data)
def cache_items_and_tags(directory_cache, entries, jid):
"""Create a cache file of node items and tags."""
item_ids = []
tags = {}
for entry in entries:
entry_tags = entry['tags']
entry_url_hash = entry['url_hash']
tags_to_include = []
item_ids.append(entry_url_hash)
tags_to_include += entry_tags
for tag_to_include in tags_to_include:
tags[tag_to_include] = tags[tag_to_include]+1 if tag_to_include in tags else 1
if tags:
tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
tags = dict(list(tags.items())[:30])
if item_ids:
filename = os.path.join(directory_cache, 'data', jid + '.toml')
data = {
'item_ids' : item_ids,
'tags' : tags}
UtilitiesData.save_to_toml(filename, data)
def extract_iq_items(iq, jabber_id):
iq_items = iq['pubsub']['items']
entries = []
name = jabber_id.split('@')[0]
for iq_item in iq_items:
item_payload = iq_item['payload']
entry = UtilitiesSyndication.extract_items(item_payload)
entries.append(entry)
# TODO Handle this with XEP-0059 (reverse: bool), instead of reversing it.
entries.reverse()
return entries
def extract_iq_items_extra(db_file, iq, jabber_id, limit=None):
iq_items = iq['pubsub']['items']
entries = []
name = jabber_id.split('@')[0]
for iq_item in iq_items:
item_payload = iq_item['payload']
entry = UtilitiesSyndication.extract_items(item_payload, limit)
url_hash = UtilitiesCryptography.hash_url_to_md5(entry['link'])
iq_item_id = iq_item['id']
if iq_item_id != url_hash:
logging.error(f'Item ID does not match MD5. id: {iq_item_id} hash: {url_hash}')
logging.warn(f'Item ID does not match MD5. id: {iq_item_id} hash: {url_hash}')
instances = DatabaseSQLite.get_entry_instances_by_url_hash(db_file, url_hash)
if entry:
entry['instances'] = instances or 0
entry['jid'] = jabber_id
entry['name'] = name
entry['url_hash'] = url_hash
entries.append(entry)
# TODO Handle this with XEP-0059 (reverse: bool), instead of reversing it.
entries.reverse()
result = entries
return result
2024-12-16 15:04:01 +01:00
def load_data_toml(data: dict) -> dict:
return tomllib.loads(data)
def load_data_netscape(html: str) -> dict:
bookmarks = []
current_summary = ""
parser = etree.XMLParser(recover=True)
lines = html.splitlines()
for line in lines:
line = line.strip()
if line:
# Parse given line
root = etree.fromstring(line, parser)
# Check for <DT> tag
if line.startswith("<DT>"):
# Look for <A> tag within <DT>
a_element = root.find('.//A')
if a_element is not None:
link = a_element.get('HREF')
add_date = a_element.get('ADD_DATE') or time.time()
last_modified = a_element.get('LAST_MODIFIED') or time.time()
tags = a_element.get('TAGS')
title = a_element.text or link
# Convert timestamps from seconds since epoch to ISO format
added_date = datetime.fromtimestamp(float(add_date)).isoformat()
modified_date = datetime.fromtimestamp(float(last_modified)).isoformat()
# Create bookmark dictionary
bookmark = {
'title': title,
'link': link,
'summary': current_summary,
'published': added_date,
'updated': modified_date,
'tags': [tag.strip() for tag in tags.split(',')] if tags else ['unclassified']
}
# Append bookmark to the list
bookmarks.append(bookmark)
# Reset summary for the next bookmark
current_summary = ""
# Check for <DD> tag
elif line.startswith("<DD>"):
# Extract summary from <DD>
bookmarks[len(bookmarks)-1]['summary'] = line[4:].strip()
#dd_element = root.find('.//DD')
#if dd_element:
# bookmarks[len(bookmarks)-1]['summary'] = dd_element.text.strip()
return {'entries': bookmarks}
def open_file_toml(filename: str) -> dict:
with open(filename, mode="rb") as fn:
data = tomllib.load(fn)
return data
def organize_tags(tags):
tags_organized = []
tags = tags.split(',')
#tags = sorted(set(tags))
for tag in tags:
if tag:
tag = tag.lower().strip()
if tag not in tags_organized:
tags_organized.append(tag)
return sorted(tags_organized)
def remove_item_from_cache(directory_cache, jabber_id, node, url_hash):
filename_items = os.path.join(directory_cache, 'items', jabber_id + '.toml')
if os.path.exists(filename_items):
#if os.path.exists(filename_items) and os.path.getsize(filename_items):
entries_cache = UtilitiesData.open_file_toml(filename_items)
if node in entries_cache:
entries_cache_node = entries_cache[node]
for entry_cache in entries_cache_node:
if entry_cache['url_hash'] == url_hash:
entry_cache_index = entries_cache_node.index(entry_cache)
del entries_cache_node[entry_cache_index]
break
data_items = entries_cache
UtilitiesData.save_to_toml(filename_items, data_items)
def save_to_json(filename: str, data) -> None:
with open(filename, 'w') as f:
json.dump(data, f)
def save_to_toml(filename: str, data: dict) -> None:
with open(filename, 'w') as fn:
data_as_string = tomli_w.dumps(data)
fn.write(data_as_string)
async def update_cache_and_database(
db_file, directory_cache, xmpp_instance, jabber_id: str, node_type: str, node_id: str):
# Download identifiers of node items.
iq = await XmppPubsub.get_node_item_ids(xmpp_instance, jabber_id, node_id)
if isinstance(iq, Iq):
iq_items_remote = iq['disco_items']
# Cache a list of identifiers of node items to a file.
iq_items_remote_name = []
for iq_item_remote in iq_items_remote:
iq_item_remote_name = iq_item_remote['name']
iq_items_remote_name.append(iq_item_remote_name)
#data_item_ids = {'iq_items' : iq_items_remote_name}
#filename_item_ids = 'item_ids/' + jabber_id + '.toml'
#Data.save_to_toml(filename_item_ids, data_item_ids)
filename_items = os.path.join(directory_cache, 'items', jabber_id + '.toml')
if not os.path.exists(filename_items) or os.path.getsize(filename_items) in (0, 13):
iq = await XmppPubsub.get_node_items(xmpp_instance, jabber_id, node_id)
if isinstance(iq, Iq):
entries_cache_node = UtilitiesData.extract_iq_items_extra(db_file, iq, jabber_id)
data_items = {node_type : entries_cache_node}
UtilitiesData.save_to_toml(filename_items, data_items)
return ['fine', iq] # TODO Remove this line
else:
return ['error', iq]
else:
entries_cache = UtilitiesData.open_file_toml(filename_items)
if not node_type in entries_cache:
return ['error', f'Directory "{node_type}" is empty']
entries_cache_node = entries_cache[node_type]
# Check whether items still exist on node
for entry in entries_cache_node:
iq_item_remote_exist = False
url_hash = None
for url_hash in iq_items_remote_name:
if url_hash == entry['url_hash']:
iq_item_remote_exist = True
break
if url_hash and not iq_item_remote_exist:
await DatabaseSQLite.delete_combination_row_by_jid_and_url_hash(
db_file, url_hash, jabber_id)
entry_index = entries_cache_node.index(entry)
del entries_cache_node[entry_index]
# Check for new items on node
entries_cache_node_new = []
for url_hash in iq_items_remote_name:
iq_item_local_exist = False
for entry in entries_cache_node:
if url_hash == entry['url_hash']:
iq_item_local_exist = True
break
if not iq_item_local_exist:
iq = await XmppPubsub.get_node_item(
xmpp_instance, jabber_id, node_id, url_hash)
if isinstance(iq, Iq):
entries_iq = UtilitiesData.extract_iq_items_extra(db_file, iq, jabber_id)
entries_cache_node_new += entries_iq
else:
# TODO
# Handle this concern in a different fashion,
# instead of stopping the whole operation.
return ['error', iq]
entries_cache_node += entries_cache_node_new
if node_type == 'public':
# Fast (low I/O)
if not DatabaseSQLite.get_jid_id_by_jid(db_file, jabber_id):
await DatabaseSQLite.set_jid(db_file, jabber_id)
#await DatabaseSQLite.add_new_entries(db_file, entries)
await DatabaseSQLite.add_tags(db_file, entries_cache_node)
# Slow (high I/O)
for entry in entries_cache_node:
url_hash = entry['url_hash']
if not DatabaseSQLite.get_entry_id_by_url_hash(db_file, url_hash):
await DatabaseSQLite.add_new_entries(db_file, entries_cache_node)
await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)
#elif not DatabaseSQLite.is_jid_associated_with_url_hash(db_file, jabber_id, url_hash):
# await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)
else:
await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)
data_items = entries_cache
UtilitiesData.save_to_toml(filename_items, data_items)
return ['fine', iq] # TODO Remove this line
else:
return ['error', iq]