# JabberCard/jabbercard/utilities/xml.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET

class Syndication:
    """Stateless helpers that flatten vCard 4.0 and Atom XML into dicts.

    None of the methods take ``self``; they are static and are called on the
    class, e.g. ``Syndication.extract_atom_items(entry)``.
    """

    # Category terms of this length or longer are not collected as tags.
    _MAX_TAG_LENGTH = 50
    # Number of tags collected by extract_atom_items() when ``limit`` is set.
    _MAX_TAGS = 5

    @staticmethod
    def _child_texts(element):
        """Return the non-blank text of every direct child of *element*.

        Args:
            element (Element | None): The parent element, or ``None`` when a
                lookup did not find anything.

        Returns:
            list: The ``text`` of each direct child, in document order,
            skipping children whose text is missing or whitespace-only.
        """
        if element is None:
            return []
        return [child.text for child in element
                if child.text and child.text.strip()]

    @staticmethod
    def extract_vcard_items(xml_data):
        """Extract every item of a vCard XML tree into a flat dictionary.

        Every element reachable from *xml_data* is visited, except elements
        tagged ``{urn:ietf:params:xml:ns:vcard-4.0}vcard``.  The key is the
        tag name without its namespace; the value is the element's own text
        followed by the text of its direct children, joined with spaces and
        stripped.  Elements sharing a local name overwrite each other, so
        the last one in document order wins.

        Args:
            xml_data (Element | ElementTree): The vCard XML to read.

        Returns:
            dict: Item names mapped to their text, or to ``None`` when
            neither the element nor its direct children carry any text.
        """
        items = {}
        for item in xml_data.iter():
            tag = item.tag
            # Comments and processing instructions do not have a string tag.
            if not isinstance(tag, str):
                continue

            # Skip the root element (vcard)
            if tag == '{urn:ietf:params:xml:ns:vcard-4.0}vcard':
                continue

            # Drop the "{namespace}" prefix; unqualified tags are kept as is.
            item_name = tag.rpartition('}')[2]

            texts = [item.text] if item.text else []
            texts.extend(child.text for child in item if child.text)

            items[item_name] = ' '.join(texts).strip() if texts else None

        return items

    @staticmethod
    def extract_vcard4_items(xml_data):
        """Extract the commonly used properties of a vCard 4.0 element.

        For ``nickname``, ``email``, ``fn``, ``note``, ``org``, ``impp`` and
        ``url`` the value is the text of the first direct child of the
        property element that has non-blank text.  A property that is
        missing, has no children, or has only blank children yields ``''``.

        Every ``group`` element contributes one ``{'label', 'uri'}`` entry
        to ``extras``, filed under the text found at
        ``url/parameters/type`` or under ``'?'`` when the group has none.
        A label or URI that is absent is stored as ``''``.

        Args:
            xml_data (Element | ElementTree): The vCard element whose direct
                children are the properties.

        Returns:
            dict: The keys ``extras``, ``alias``, ``email``, ``fn``,
            ``note``, ``org``, ``impp`` and ``url``.
        """
        namespace = '{urn:ietf:params:xml:ns:vcard-4.0}'
        type_path = (namespace + 'url/' + namespace + 'parameters/'
                     + namespace + 'type')
        child_texts = Syndication._child_texts

        def first_text(name):
            # First non-blank child text of the named property, else ''.
            texts = child_texts(xml_data.find(namespace + name))
            return texts[0] if texts else ''

        def last_text(parent, path, default=''):
            # Groups take the text of the last child that has any, so that
            # <uri> is picked even when <parameters> precedes it.
            texts = child_texts(parent.find(path))
            return texts[-1] if texts else default

        extras = {}
        for group in xml_data.findall(namespace + 'group'):
            category = last_text(group, type_path, '?')
            extras.setdefault(category, []).append({
                'label': last_text(group, namespace + 'x-ablabel'),
                'uri': last_text(group, namespace + 'url'),
            })

        return {
            'extras': extras,
            'alias': first_text('nickname'),
            'email': first_text('email'),
            'fn': first_text('fn'),
            'note': first_text('note'),
            'org': first_text('org'),
            'impp': first_text('impp'),
            'url': first_text('url'),
        }

    @staticmethod
    def extract_atom_items(xml_data, limit=False):
        """Extract the displayable fields of an Atom entry.

        Args:
            xml_data (Element | ElementTree): The Atom entry whose direct
                children are ``title``, ``link``, ``content`` and so on.
            limit (bool): When true, stop collecting tags once five have
                been gathered.

        Returns:
            dict | None: ``None`` when the entry has neither a ``title`` nor
            a ``link`` element.  Otherwise a dict with the keys ``content``,
            ``href``, ``published``, ``summary``, ``tags``, ``title`` and
            ``updated``.  Text fields are ``''`` when absent or empty;
            ``updated`` falls back to ``published`` when the entry has no
            ``updated`` text of its own.
        """
        # NOTE
        # `.//` was not needed when node item payload was passed directly.
        # Now that item is saved as xml, it is required to use `.//`.
        # Perhaps navigating a level down (i.e. to "child"), or removing the
        # root from the file would solve this.
        #namespace = './/{http://www.w3.org/2005/Atom}'
        namespace = '{http://www.w3.org/2005/Atom}'

        def single_text(name):
            # Text of the first element of that name, '' if absent or empty.
            element = xml_data.find(namespace + name)
            return '' if element is None else element.text or ''

        def first_text(name):
            # First non-empty text among all elements of that name.
            for element in xml_data.findall(namespace + name):
                if element.text:
                    return element.text
            return ''

        links = xml_data.findall(namespace + 'link')
        if xml_data.find(namespace + 'title') is None and not links:
            return None

        # First link that actually carries a non-empty href.
        link_href = next(
            (link.get('href') for link in links if link.get('href')), '')

        tags = []
        for category in xml_data.findall(namespace + 'category'):
            term = category.get('term')
            if not term or len(term) >= Syndication._MAX_TAG_LENGTH:
                continue
            tags.append(term)
            if limit and len(tags) >= Syndication._MAX_TAGS:
                break

        published_text = single_text('published')

        # TODO Check the Blasta database for instances.

        return {'content': first_text('content'),
                'href': link_href,
                'published': published_text,
                'summary': first_text('summary'),
                'tags': tags,
                'title': single_text('title'),
                'updated': single_text('updated') or published_text}