JabberCard/jabbercard/utilities/xml.py

221 lines
7.5 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
class Syndication:
    """Stateless helpers that turn vCard 4.0 and Atom XML into plain dicts.

    None of the extractors read instance state; they are declared as static
    methods and are meant to be called on the class, e.g.
    ``Syndication.extract_atom_items(entry)``.
    """

    _VCARD4_NS = '{urn:ietf:params:xml:ns:vcard-4.0}'
    _ATOM_NS = '{http://www.w3.org/2005/Atom}'
    # Category terms of this length or longer are not used as tags.
    _MAX_TAG_LENGTH = 50
    # Number of tags after which collection stops when ``limit`` is set.
    _MAX_TAGS = 5

    @staticmethod
    def _child_text(element, default='', last=False):
        """Return the text of a direct child of *element*.

        Args:
            element (Element | None): Parent element, typically the result of
                ``find()``; ``None`` means the element is absent.
            default (str): Value returned when *element* is ``None`` or none
                of its children carries non-blank text.
            last (bool): Select the last child with text instead of the first.

        Returns:
            str: The selected child's text (unmodified), or *default*.
        """
        if element is None:
            return default
        # Whitespace-only text (e.g. indentation of pretty-printed XML) is
        # not treated as a value.
        texts = [child.text for child in element
                 if child.text and child.text.strip()]
        if not texts:
            return default
        return texts[-1] if last else texts[0]

    @staticmethod
    def extract_vcard_items(xml_data):
        """Extracts all items from a vCard XML ElementTree.

        Every element reached by ``xml_data.iter()`` except the vCard 4.0
        ``vcard`` root contributes one entry; a later element with the same
        local name overwrites an earlier one.

        Args:
            xml_data (ElementTree): The vCard XML as an ElementTree object.

        Returns:
            dict: A dictionary where keys are item names (tags without their
                namespace) and values are the element's own text joined with
                the text of its direct children, or None when there is none.
        """
        items = {}
        for item in xml_data.iter():
            # Comments and processing instructions have a non-string tag.
            if not isinstance(item.tag, str):
                continue
            # Skip the root element (vcard)
            if item.tag == Syndication._VCARD4_NS + 'vcard':
                continue
            # rpartition also copes with tags that carry no namespace, where
            # split('}')[1] would raise IndexError.
            item_name = item.tag.rpartition('}')[2]
            # Collect the element's own text followed by its children's text
            parts = [item.text] if item.text else []
            parts.extend(child.text for child in item if child.text)
            items[item_name] = ' '.join(parts).strip() if parts else None
        return items

    @staticmethod
    def extract_vcard4_items(xml_data):
        """Extract selected properties of a vCard 4.0 XML element.

        For each of ``nickname``, ``email``, ``fn``, ``note``, ``org``,
        ``impp`` and ``url`` the text of the first child with non-blank text
        is used; a missing or empty property yields ``''``.

        Each ``group`` element is added to ``extras`` as a
        ``{'label': ..., 'uri': ...}`` entry, taken from the group's
        ``x-ablabel`` and ``url`` children and filed under the text found in
        ``url/parameters/type``, or under ``'?'`` when no type is given.
        Missing label or url parts yield ``''`` instead of raising.

        Args:
            xml_data (Element): Element whose direct children are the vCard
                properties.

        Returns:
            dict: Keys ``extras``, ``alias``, ``email``, ``fn``, ``note``,
                ``org``, ``impp`` and ``url``.
        """
        namespace = Syndication._VCARD4_NS
        child_text = Syndication._child_text

        def property_text(name):
            return child_text(xml_data.find(namespace + name))

        type_path = (namespace + 'url/' + namespace + 'parameters/'
                     + namespace + 'type')
        extras = {}
        for group in xml_data.findall(namespace + 'group'):
            # Values are resolved afresh for every group so nothing leaks in
            # from the previous one.
            label = child_text(group.find(namespace + 'x-ablabel'), last=True)
            uri = child_text(group.find(namespace + 'url'), last=True)
            category = child_text(group.find(type_path), default='?',
                                  last=True)
            extras.setdefault(category, []).append(
                {'label' : label, 'uri' : uri})

        vcard = {'extras' : extras}
        vcard['alias'] = property_text('nickname')
        vcard['email'] = property_text('email')
        vcard['fn'] = property_text('fn')
        vcard['note'] = property_text('note')
        vcard['org'] = property_text('org')
        vcard['impp'] = property_text('impp')
        vcard['url'] = property_text('url')
        return vcard

    @staticmethod
    def extract_atom_items(xml_data, limit=False):
        """Extract the fields of an Atom entry element.

        Args:
            xml_data (Element): Element whose direct children are the Atom
                entry elements (``title``, ``link``, ``content``, ...).
            limit (bool): When true, stop collecting tags once five have
                been gathered.

        Returns:
            dict | None: ``None`` when the element has neither a ``title``
                nor a ``link`` child. Otherwise a dictionary with the keys
                ``content``, ``href``, ``published``, ``summary``, ``tags``,
                ``title`` and ``updated``; text fields that are missing or
                empty are ``''``.
        """
        # NOTE
        # `.//` was not needed when node item payload was passed directly.
        # Now that item is saved as xml, it is required to use `.//`.
        # Perhaps navigating a level down (i.e. to "child"), or removing the
        # root from the file would solve this.
        #namespace = './/{http://www.w3.org/2005/Atom}'
        namespace = Syndication._ATOM_NS

        def first_text(name):
            # Text of the first `name` child that has any, else ''.
            candidates = xml_data.findall(namespace + name)
            return next((node.text for node in candidates if node.text), '')

        title = xml_data.find(namespace + 'title')
        links = xml_data.findall(namespace + 'link')
        if title is None and not links:
            return None
        title_text = '' if title is None else title.text or ''
        # First non-empty href among the links.
        link_href = next(
            (link.get('href') for link in links if link.get('href')), '')
        content_text = first_text('content')
        summary_text = first_text('summary')
        published = xml_data.find(namespace + 'published')
        published_text = '' if published is None else published.text or ''
        tags = []
        for category in xml_data.findall(namespace + 'category'):
            term = category.get('term')
            if term and len(term) < Syndication._MAX_TAG_LENGTH:
                tags.append(term)
                if limit and len(tags) >= Syndication._MAX_TAGS:
                    break
        # TODO Check the Blasta database for instances.
        entry = {'content' : content_text,
                 'href' : link_href,
                 'published' : published_text,
                 'summary' : summary_text,
                 'tags' : tags,
                 'title' : title_text,
                 'updated' : published_text} # TODO "Updated" is missing
        return entry