#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import logging
import os

from blasta.database.sqlite import DatabaseSQLite
from blasta.utilities.cryptography import UtilitiesCryptography
from blasta.utilities.syndication import UtilitiesSyndication
from blasta.xmpp.pubsub import XmppPubsub
from slixmpp.stanza.iq import Iq
import tomli_w

try:
    import tomllib
except ImportError:
    import tomli as tomllib


class UtilitiesData:

    def cache_items_and_tags_search(directory_cache, entries, jid, query):
        """Create a cache file of node items and tags."""
        item_ids = []
        tags = {}
        for entry in entries:
            entry_tags = entry['tags']
            entry_url_hash = entry['url_hash']
            tags_to_include = []
            if query in ' '.join([entry['title'],
                                  entry['link'],
                                  entry['summary'],
                                  ' '.join(entry_tags)]):
                item_ids.append(entry_url_hash)
                tags_to_include += entry_tags
            for tag_to_include in tags_to_include:
                tags[tag_to_include] = tags[tag_to_include] + 1 if tag_to_include in tags else 1
        if tags:
            # Sort tags by frequency (descending), then alphabetically, and
            # keep the 30 most frequent ones.
            tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
            tags = dict(list(tags.items())[:30])
        if item_ids:
            filename = os.path.join(directory_cache, 'data', jid + '_query.toml')
            data = {
                'item_ids' : item_ids,
                'tags' : tags}
            UtilitiesData.save_to_toml(filename, data)

    def cache_items_and_tags_filter(directory_cache, entries, jid, tag):
        """Create a cache file of node items and tags."""
        item_ids = []
        tags = {}
        for entry in entries:
            entry_tags = entry['tags']
            entry_url_hash = entry['url_hash']
            tags_to_include = []
            if tag in entry_tags:
                item_ids.append(entry_url_hash)
                tags_to_include += entry_tags
            for tag_to_include in tags_to_include:
                tags[tag_to_include] = tags[tag_to_include] + 1 if tag_to_include in tags else 1
        if tags:
            tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
            tags = dict(list(tags.items())[:30])
            # The filtered tag occurs in every selected entry, so it always
            # ranks first; drop it from the tag cloud.
            del tags[tag]
        if item_ids:
            directory = os.path.join(directory_cache, 'data', jid)
            if not os.path.exists(directory):
                os.mkdir(directory)
            filename = os.path.join(directory, tag + '.toml')
            # TODO Add support for search query
            #filename = 'data/{}/query:{}.toml'.format(jid, query)
            #filename = 'data/{}/tag:{}.toml'.format(jid, tag)
            data = {
                'item_ids' : item_ids,
                'tags' : tags}
            UtilitiesData.save_to_toml(filename, data)

    def cache_items_and_tags(directory_cache, entries, jid):
        """Create a cache file of node items and tags."""
        item_ids = []
        tags = {}
        for entry in entries:
            entry_tags = entry['tags']
            entry_url_hash = entry['url_hash']
            tags_to_include = []
            item_ids.append(entry_url_hash)
            tags_to_include += entry_tags
            for tag_to_include in tags_to_include:
                tags[tag_to_include] = tags[tag_to_include] + 1 if tag_to_include in tags else 1
        if tags:
            tags = dict(sorted(tags.items(), key=lambda item: (-item[1], item[0])))
            tags = dict(list(tags.items())[:30])
        if item_ids:
            filename = os.path.join(directory_cache, 'data', jid + '.toml')
            data = {
                'item_ids' : item_ids,
                'tags' : tags}
            UtilitiesData.save_to_toml(filename, data)

    def extract_iq_items(iq, jabber_id):
        """Extract syndication entries from a pubsub items IQ stanza."""
        iq_items = iq['pubsub']['items']
        entries = []
        name = jabber_id.split('@')[0]
        for iq_item in iq_items:
            item_payload = iq_item['payload']
            entry = UtilitiesSyndication.extract_items(item_payload)
            entries.append(entry)
        # TODO Handle this with XEP-0059 (reverse: bool), instead of reversing it.
        entries.reverse()
        return entries
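    # Illustrative sketch (not part of the original module): the entry and
    # cache-directory values below are hypothetical.  A call such as
    #
    #   entries = [{'title': 'Example', 'link': 'https://example.org/',
    #               'summary': 'An example entry.',
    #               'tags': ['xmpp', 'pubsub'],
    #               'url_hash': 'd41d8cd98f00b204e9800998ecf8427e'}]
    #   UtilitiesData.cache_items_and_tags_filter(
    #       '.cache', entries, 'user@example.org', 'xmpp')
    #
    # would write .cache/data/user@example.org/xmpp.toml with the keys
    # 'item_ids' and 'tags' (the filtered tag 'xmpp' itself excluded).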
    def extract_iq_items_extra(db_file, iq, jabber_id, limit=None):
        """Extract syndication entries from a pubsub items IQ stanza and
        enrich them with instance counts from the database."""
        iq_items = iq['pubsub']['items']
        entries = []
        name = jabber_id.split('@')[0]
        for iq_item in iq_items:
            item_payload = iq_item['payload']
            entry = UtilitiesSyndication.extract_items(item_payload, limit)
            url_hash = UtilitiesCryptography.hash_url_to_md5(entry['link'])
            iq_item_id = iq_item['id']
            if iq_item_id != url_hash:
                logging.error('Item ID does not match MD5. id: {} hash: {}'
                              .format(iq_item_id, url_hash))
            instances = DatabaseSQLite.get_entry_instances_by_url_hash(db_file, url_hash)
            if entry:
                entry['instances'] = instances or 0
                entry['jid'] = jabber_id
                entry['name'] = name
                entry['url_hash'] = url_hash
                entries.append(entry)
        # TODO Handle this with XEP-0059 (reverse: bool), instead of reversing it.
        entries.reverse()
        return entries

    def open_file_toml(filename: str) -> dict:
        with open(filename, mode="rb") as fn:
            data = tomllib.load(fn)
        return data

    def organize_tags(tags):
        """Normalize a comma-separated tag string into a sorted list of
        unique, lowercased tags."""
        tags_organized = []
        for tag in tags.split(','):
            tag = tag.lower().strip()
            if tag and tag not in tags_organized:
                tags_organized.append(tag)
        return sorted(tags_organized)

    def remove_item_from_cache(directory_cache, jabber_id, node, url_hash):
        filename_items = os.path.join(directory_cache, 'items', jabber_id + '.toml')
        entries_cache = UtilitiesData.open_file_toml(filename_items)
        if node in entries_cache:
            entries_cache_node = entries_cache[node]
            for entry_cache in entries_cache_node:
                if entry_cache['url_hash'] == url_hash:
                    entry_cache_index = entries_cache_node.index(entry_cache)
                    del entries_cache_node[entry_cache_index]
                    break
            data_items = entries_cache
            UtilitiesData.save_to_toml(filename_items, data_items)

    def save_to_json(filename: str, data) -> None:
        with open(filename, 'w') as f:
            json.dump(data, f)

    def save_to_toml(filename: str, data: dict) -> None:
        with open(filename, 'w') as fn:
            data_as_string = tomli_w.dumps(data)
            fn.write(data_as_string)
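    # Illustrative sketch (not part of the original module): organize_tags
    # normalizes a comma-separated tag string, and save_to_toml together with
    # open_file_toml round-trip a dictionary through a TOML file.  The
    # filename is hypothetical:
    #
    #   UtilitiesData.organize_tags('XMPP, PubSub, xmpp,')
    #   # -> ['pubsub', 'xmpp']
    #   UtilitiesData.save_to_toml('demo.toml', {'tags': ['pubsub', 'xmpp']})
    #   UtilitiesData.open_file_toml('demo.toml')
    #   # -> {'tags': ['pubsub', 'xmpp']}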
    async def update_cache_and_database(
            db_file, directory_cache, xmpp_instance, jabber_id: str,
            node_type: str, node_id: str):
        # Download identifiers of node items.
        iq = await XmppPubsub.get_node_item_ids(xmpp_instance, jabber_id, node_id)
        if isinstance(iq, Iq):
            iq_items_remote = iq['disco_items']

            # Cache a list of identifiers of node items to a file.
            iq_items_remote_name = []
            for iq_item_remote in iq_items_remote:
                iq_item_remote_name = iq_item_remote['name']
                iq_items_remote_name.append(iq_item_remote_name)

            #data_item_ids = {'iq_items' : iq_items_remote_name}
            #filename_item_ids = 'item_ids/' + jabber_id + '.toml'
            #UtilitiesData.save_to_toml(filename_item_ids, data_item_ids)

            filename_items = os.path.join(directory_cache, 'items', jabber_id + '.toml')
            # A size of 13 bytes corresponds to a cache file that holds an
            # empty node table (i.e. no items).
            if not os.path.exists(filename_items) or os.path.getsize(filename_items) in (0, 13):
                iq = await XmppPubsub.get_node_items(xmpp_instance, jabber_id, node_id)
                if isinstance(iq, Iq):
                    entries_cache_node = UtilitiesData.extract_iq_items_extra(db_file, iq, jabber_id)
                    data_items = {node_type : entries_cache_node}
                    UtilitiesData.save_to_toml(filename_items, data_items)
                    return ['fine', iq] # TODO Remove this line
                else:
                    return ['error', iq]
            else:
                entries_cache = UtilitiesData.open_file_toml(filename_items)
                if node_type not in entries_cache:
                    return ['error', 'Directory "{}" is empty'.format(node_type)]
                entries_cache_node = entries_cache[node_type]

                # Check whether items still exist on the node.  Iterate over a
                # copy, because entries may be removed during the iteration.
                if iq_items_remote_name:
                    for entry in list(entries_cache_node):
                        url_hash = entry['url_hash']
                        if url_hash not in iq_items_remote_name:
                            await DatabaseSQLite.delete_combination_row_by_jid_and_url_hash(
                                db_file, url_hash, jabber_id)
                            entries_cache_node.remove(entry)

                # Check for new items on the node.
                entries_cache_node_new = []
                url_hashes_local = {entry['url_hash'] for entry in entries_cache_node}
                for url_hash in iq_items_remote_name:
                    if url_hash not in url_hashes_local:
                        iq = await XmppPubsub.get_node_item(
                            xmpp_instance, jabber_id, node_id, url_hash)
                        if isinstance(iq, Iq):
                            entries_iq = UtilitiesData.extract_iq_items_extra(db_file, iq, jabber_id)
                            entries_cache_node_new += entries_iq
                        else:
                            # TODO Handle this concern in a different fashion,
                            # instead of stopping the whole operation.
                            return ['error', iq]
                entries_cache_node += entries_cache_node_new

                if node_type == 'public':
                    # Fast (low I/O)
                    if not DatabaseSQLite.get_jid_id_by_jid(db_file, jabber_id):
                        await DatabaseSQLite.set_jid(db_file, jabber_id)
                    #await DatabaseSQLite.add_new_entries(db_file, entries)
                    await DatabaseSQLite.add_tags(db_file, entries_cache_node)
                    # Slow (high I/O)
                    for entry in entries_cache_node:
                        url_hash = entry['url_hash']
                        if not DatabaseSQLite.get_entry_id_by_url_hash(db_file, url_hash):
                            await DatabaseSQLite.add_new_entries(db_file, entries_cache_node)
                            await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)
                        #elif not DatabaseSQLite.is_jid_associated_with_url_hash(db_file, jabber_id, url_hash):
                        #    await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)
                        else:
                            await DatabaseSQLite.associate_entries_tags_jids(db_file, entry)

                data_items = entries_cache
                UtilitiesData.save_to_toml(filename_items, data_items)
                return ['fine', iq] # TODO Remove this line
        else:
            return ['error', iq]
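
# Minimal smoke test of the pure helpers above; a sketch, not part of the
# Blasta code base.  The temporary file and the tag string are assumptions
# for demonstration only.
if __name__ == '__main__':
    import tempfile

    tags = UtilitiesData.organize_tags('XMPP, PubSub, xmpp,')
    print(tags)  # ['pubsub', 'xmpp']
    with tempfile.NamedTemporaryFile(suffix='.toml', delete=False) as temp:
        filename = temp.name
    UtilitiesData.save_to_toml(filename, {'tags': tags})
    print(UtilitiesData.open_file_toml(filename))  # {'tags': ['pubsub', 'xmpp']}
    os.remove(filename)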