JabberCard/jabbercard/utilities/xml.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET

class Syndication:
    """Turn vCard 4.0 and Atom XML payloads into plain dictionaries.

    None of the methods use instance state; they are meant to be called on
    the class itself, e.g. ``Syndication.extract_atom_items(element)``.
    """

    _VCARD_NS = '{urn:ietf:params:xml:ns:vcard-4.0}'
    _ATOM_NS = '{http://www.w3.org/2005/Atom}'

    # Category terms of this length or longer are dropped from the tag list.
    _MAX_TAG_LENGTH = 50
    # Number of tags collected before stopping when ``limit`` is requested.
    _MAX_TAGS = 5

    @staticmethod
    def _first_child_text(element, default=''):
        """Return the text of the first child of *element* with non-blank text.

        Args:
            element: An ``Element`` or ``None`` (as returned by ``find``).
            default: Value returned when *element* is ``None``, has no
                children, or none of its children carries non-blank text.

        Returns:
            The child's text exactly as stored (not stripped), or *default*.
        """
        if element is None:
            return default
        for child in element:
            text = child.text
            # Whitespace-only text is indentation (e.g. of a <parameters>
            # wrapper in pretty-printed XML), not a value.
            if text and text.strip():
                return text
        return default

    @staticmethod
    def _first_nonempty_text(elements):
        """Return the first non-empty ``.text`` among *elements*, else ``''``."""
        for element in elements:
            if element.text:
                return element.text
        return ''

    @staticmethod
    def extract_vcard_items(xml_data):
        """Extract every element of a vCard XML tree into a flat dictionary.

        Every element yielded by ``xml_data.iter()`` is visited, so value
        wrappers such as ``text`` or ``uri`` appear as keys as well, and when
        a name occurs more than once the last occurrence wins.

        Args:
            xml_data: The vCard XML as an ``Element`` or ``ElementTree``.

        Returns:
            dict: Maps each local tag name (namespace removed) to the element's
            own text joined by a space with the text of its direct children,
            stripped; ``None`` when neither the element nor its children hold
            any text. The ``vcard`` root element itself is skipped.
        """
        items = {}
        for item in xml_data.iter():
            tag = item.tag
            # Comments and processing instructions have a non-string tag.
            if not isinstance(tag, str):
                continue

            # Skip the root element (vcard)
            if tag == Syndication._VCARD_NS + 'vcard':
                continue

            # rpartition copes with tags that carry no '{namespace}' prefix,
            # where split('}')[1] would raise IndexError.
            item_name = tag.rpartition('}')[2]

            # The element's own text first, then that of its direct children.
            texts = [node.text for node in (item, *item) if node.text]
            items[item_name] = ' '.join(texts).strip() if texts else None

        return items

    @staticmethod
    def extract_vcard4_items(xml_data):
        """Extract the commonly displayed fields of a vCard 4.0 XML document.

        Each property value is read from the first child element that holds
        non-blank text (e.g. ``<fn><text>...</text></fn>``). A property that
        is absent, empty, or has no such child yields ``''``.

        Args:
            xml_data: The ``vcard`` element (or a tree rooted at it); only its
                direct children are searched.

        Returns:
            dict: Keys ``alias`` (from ``nickname``), ``email``, ``fn``,
            ``note``, ``org``, ``impp`` and ``url`` holding strings, plus
            ``extras``: a dict mapping the ``url/parameters/type`` value of
            each ``group`` element (``'?'`` when it has none) to a list of
            ``{'label': ..., 'uri': ...}`` dicts taken from the group's
            ``x-ablabel`` and ``url`` children. A group lacking either child
            still produces an entry, with ``''`` for the missing part.
        """
        ns = Syndication._VCARD_NS
        first_text = Syndication._first_child_text

        vcard = {'extras': {}}
        for group in xml_data.findall(ns + 'group'):
            # Every lookup may legitimately return None; first_text turns that
            # into a default instead of failing on iteration, and each value
            # is computed afresh so nothing leaks over from a previous group.
            label = first_text(group.find(ns + 'x-ablabel'))
            uri = first_text(group.find(ns + 'url'))
            category = first_text(
                group.find(ns + 'url/' + ns + 'parameters/' + ns + 'type'),
                default='?')
            vcard['extras'].setdefault(category, []).append(
                {'label' : label, 'uri' : uri})

        vcard['alias'] = first_text(xml_data.find(ns + 'nickname'))
        vcard['email'] = first_text(xml_data.find(ns + 'email'))
        vcard['fn'] = first_text(xml_data.find(ns + 'fn'))
        vcard['note'] = first_text(xml_data.find(ns + 'note'))
        vcard['org'] = first_text(xml_data.find(ns + 'org'))
        vcard['impp'] = first_text(xml_data.find(ns + 'impp'))
        vcard['url'] = first_text(xml_data.find(ns + 'url'))
        return vcard

    @staticmethod
    def extract_atom_items(xml_data, limit=False):
        """Extract the displayed fields of a single Atom entry.

        NOTE: all lookups address direct children only, so *xml_data* must be
        the Atom ``entry`` element itself (or a tree rooted at it). If the
        entry is wrapped in another root element, either pass the inner
        element or switch the lookups to ``.//``.

        Args:
            xml_data: The Atom entry as an ``Element`` or ``ElementTree``.
            limit: When true, stop collecting tags once five were gathered.

        Returns:
            ``None`` when the entry has neither a ``title`` nor a ``link``
            element; otherwise a dict with the keys ``content``, ``href``,
            ``published``, ``summary``, ``title`` and ``updated`` (strings,
            ``''`` when missing or empty) and ``tags`` (list of non-empty
            ``category`` terms shorter than 50 characters). ``href`` is the
            first non-empty ``href`` attribute among the ``link`` elements;
            ``updated`` falls back to ``published`` when the entry has no
            ``updated`` text.
        """
        ns = Syndication._ATOM_NS

        title = xml_data.find(ns + 'title')
        links = xml_data.findall(ns + 'link')
        if title is None and not links:
            return None

        def own_text(tag):
            # Text of the first direct child named *tag*; '' if absent/empty.
            element = xml_data.find(ns + tag)
            return '' if element is None else (element.text or '')

        title_text = '' if title is None else (title.text or '')
        link_href = next(
            (link.get('href') for link in links if link.get('href')), '')
        content_text = Syndication._first_nonempty_text(
            xml_data.findall(ns + 'content'))
        summary_text = Syndication._first_nonempty_text(
            xml_data.findall(ns + 'summary'))
        published_text = own_text('published')
        updated_text = own_text('updated') or published_text

        tags = []
        for category in xml_data.findall(ns + 'category'):
            term = category.get('term')
            if not term:
                continue
            if len(term) < Syndication._MAX_TAG_LENGTH:
                tags.append(term)
            if limit and len(tags) >= Syndication._MAX_TAGS:
                break

        # TODO Check the Blasta database for instances.

        entry = {'content' : content_text,
                 'href' : link_href,
                 'published' : published_text,
                 'summary' : summary_text,
                 'tags' : tags,
                 'title' : title_text,
                 'updated' : updated_text}
        return entry
Add file PyProject; Support display of a single pubsub node item; Update document README; Modularize code; 2024-11-17 16:30:38 +01:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`import xml.etree.ElementTree as ET`

			`class Syndication:`

			`# def extract_vcard_items(xml_data):`
			`# namespace = '{urn:ietf:params:xml:ns:vcard-4.0}'`
			`# title = xml_data.find(namespace + 'title')`
			`#`
			`# entry = {'fn' : content_text,`
			`# 'note' : link_href,`
			`# 'email' : published_text,`
			`# 'impp' : summary_text,`
			`# 'url' : tags}`
			`# return entry`

			`def extract_vcard_items(xml_data):`
			`"""Extracts all items from a vCard XML ElementTree.`

			`Args:`
			`xml_data (ElementTree): The vCard XML as an ElementTree object.`

			`Returns:`
			`dict: A dictionary where keys are item names and values are their text content.`
			`"""`

			`items = {}`
			`for item in xml_data.iter():`
			`# Skip the root element (vcard)`
			`if item.tag == '{urn:ietf:params:xml:ns:vcard-4.0}vcard':`
			`continue`

			`# Extract item name and text content`
			`item_name = item.tag.split('}')[1]`

			`# Check for any direct text content or child elements`
			`item_text = []`
			`if item.text:`
			`item_text.append(item.text)`
			`for child in item:`
			`if child.text:`
			`item_text.append(child.text)`

			`# Join text elements if multiple found`
			`if item_text:`
			`items[item_name] = ' '.join(item_text).strip() # Strip extra spaces`
			`else:`
			`items[item_name] = None`

			`return items`

			`def extract_vcard4_items(xml_data):`
			`namespace = '{urn:ietf:params:xml:ns:vcard-4.0}'`
			`vcard = {}`

			`element_em = xml_data.find(namespace + 'email')`
			`element_fn = xml_data.find(namespace + 'fn')`
			`element_nn = xml_data.find(namespace + 'nickname')`
			`element_nt = xml_data.find(namespace + 'note')`
			`element_og = xml_data.find(namespace + 'org')`
			`element_im = xml_data.find(namespace + 'impp')`
			`element_ul = xml_data.find(namespace + 'url')`

			`if isinstance(element_em, ET.Element):`
			`for i in element_em:`
			`text = i.text`
			`if text:`
			`email = text`
			`break`
			`else:`
			`email = ''`
			`else:`
			`email = ''`
			`if isinstance(element_fn, ET.Element):`
			`for i in element_fn:`
			`text = i.text`
			`if text:`
			`title = text`
			`break`
			`else:`
			`title = ''`
			`else:`
			`title = ''`
			`if isinstance(element_nn, ET.Element):`
			`for i in element_nn:`
			`text = i.text`
			`if text:`
			`alias = text`
			`break`
			`else:`
			`alias = ''`
			`else:`
			`alias = ''`
			`if isinstance(element_nt, ET.Element):`
			`for i in element_nt:`
			`text = i.text`
			`if text:`
			`note = text`
			`break`
			`else:`
			`note = ''`
			`else:`
			`note = ''`
			`if isinstance(element_og, ET.Element):`
			`for i in element_og:`
			`text = i.text`
			`if text:`
			`org = text`
			`break`
			`else:`
			`org = ''`
			`else:`
			`org = ''`
			`if isinstance(element_im, ET.Element):`
			`for i in element_im:`
			`text = i.text`
			`if text:`
			`impp = text`
			`break`
			`else:`
			`impp = ''`
			`else:`
			`impp = ''`
			`if isinstance(element_ul, ET.Element):`
			`for i in element_ul:`
			`text = i.text`
			`if text:`
			`url = text`
			`break`
			`else:`
			`url = ''`
			`else:`
			`url = ''`

			`vcard['extras'] = {}`
			`for element in xml_data.findall(namespace + "group"):`
			`category = '?'`
			`for i in element.find(namespace + 'x-ablabel'):`
			`txt = i.text`
			`for i in element.find(namespace + 'url'):`
			`uri = i.text`
			`for i in element.find(namespace + 'url/' + namespace + 'parameters/' + namespace + 'type'):`
			`category = i.text`
			`if not category in vcard['extras']: vcard['extras'][category] = []`
			`vcard['extras'][category].append({'label' : txt, 'uri' : uri})`

			`vcard['alias'] = alias`
			`vcard['email'] = email`
			`vcard['fn'] = title`
			`vcard['note'] = note`
			`vcard['org'] = org`
			`vcard['impp'] = impp`
			`vcard['url'] = url`
			`return vcard`


			`def extract_atom_items(xml_data, limit=False):`
			`# NOTE`
			# `.//` was not needded when node item payload was passed directly.
			# Now that item is saved as xml, it is required to use `.//`.
			`# Perhaps navigating a level down (i.e. to "child"), or removing the root from the file would solve this.`
			`#namespace = './/{http://www.w3.org/2005/Atom}'`
			`namespace = '{http://www.w3.org/2005/Atom}'`
			`title = xml_data.find(namespace + 'title')`
			`links = xml_data.find(namespace + 'link')`
			`if (not isinstance(title, ET.Element) and`
			`not isinstance(links, ET.Element)): return None`
			`title_text = '' if title == None else title.text`
			`link_href = ''`
			`if isinstance(links, ET.Element):`
			`for link in xml_data.findall(namespace + 'link'):`
			`link_href = link.attrib['href'] if 'href' in link.attrib else ''`
			`if link_href: break`
			`contents = xml_data.find(namespace + 'content')`
			`content_text = ''`
			`if isinstance(contents, ET.Element):`
			`for content in xml_data.findall(namespace + 'content'):`
			`content_text = content.text or ''`
			`if content_text: break`
			`summaries = xml_data.find(namespace + 'summary')`
			`summary_text = ''`
			`if isinstance(summaries, ET.Element):`
			`for summary in xml_data.findall(namespace + 'summary'):`
			`summary_text = summary.text or ''`
			`if summary_text: break`
			`published = xml_data.find(namespace + 'published')`
			`published_text = '' if published == None else published.text`
			`categories = xml_data.find(namespace + 'category')`
			`tags = []`
			`if isinstance(categories, ET.Element):`
			`for category in xml_data.findall(namespace + 'category'):`
			`if 'term' in category.attrib and category.attrib['term']:`
			`category_term = category.attrib['term']`
			`if len(category_term) < 20:`
			`tags.append(category_term)`
			`elif len(category_term) < 50:`
			`tags.append(category_term)`
			`if limit and len(tags) > 4: break`


			`identifier = xml_data.find(namespace + 'id')`
			`if identifier and identifier.attrib: print(identifier.attrib)`
			`identifier_text = '' if identifier == None else identifier.text`

			`instances = '' # TODO Check the Blasta database for instances.`

			`entry = {'content' : content_text,`
			`'href' : link_href,`
			`'published' : published_text,`
			`'summary' : summary_text,`
			`'tags' : tags,`
			`'title' : title_text,`
			`'updated' : published_text} # TODO "Updated" is missing`
			`return entry`