221 lines
7.5 KiB
Python
221 lines
7.5 KiB
Python
|
#!/usr/bin/env python3
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
import xml.etree.ElementTree as ET
|
||
|
|
||
|
class Syndication:
    """Convert parsed vCard 4 and Atom XML elements into plain dictionaries.

    Every method is a static helper, so they can be called on the class
    itself, e.g. ``Syndication.extract_atom_items(entry)``.
    """

    _VCARD_NS = '{urn:ietf:params:xml:ns:vcard-4.0}'
    _ATOM_NS = '{http://www.w3.org/2005/Atom}'
    # Category terms of this length or longer are not used as tags.
    _MAX_TAG_LENGTH = 50
    # Number of tags kept when the caller asks for a limited tag list.
    _MAX_TAGS = 5

    @staticmethod
    def _first_text(elements):
        """Return the first non-empty ``text`` among ``elements``, or ``''``."""
        for element in elements:
            if element.text:
                return element.text
        return ''

    @staticmethod
    def _first_child_text(element):
        """Return the first non-empty text of ``element``'s direct children.

        ``''`` is returned when ``element`` is ``None`` (property absent),
        has no children, or none of its children carries text.
        """
        if element is None:
            return ''
        # Iterating an Element yields its direct children.
        return Syndication._first_text(element)

    @staticmethod
    def _last_child_text(element, default=''):
        """Return the text of ``element``'s last direct child.

        ``default`` is returned when ``element`` is ``None``, has no
        children, or its last child has no text.
        """
        if element is None or len(element) == 0:
            return default
        return element[-1].text or default

    @staticmethod
    def extract_vcard_items(xml_data):
        """Collect the text of every element found in a vCard XML document.

        Args:
            xml_data: Parsed vCard XML; only its ``iter()`` method is used.

        Returns:
            dict: Maps each element's local name (namespace removed) to the
            element's own text joined, by single spaces, with the text of
            its direct children, stripped of surrounding whitespace. The
            value is ``None`` when neither the element nor its direct
            children carry text. Elements sharing a name overwrite each
            other, so the last one in document order wins.
        """
        items = {}
        for item in xml_data.iter():
            tag = item.tag
            # Comments and processing instructions have a non-string tag.
            if not isinstance(tag, str):
                continue
            # Skip the vCard root element itself.
            if tag == Syndication._VCARD_NS + 'vcard':
                continue

            # Drop the "{namespace}" prefix; a tag without one is kept whole
            # instead of raising IndexError.
            item_name = tag.rpartition('}')[2]

            texts = [item.text] if item.text else []
            texts.extend(child.text for child in item if child.text)

            items[item_name] = ' '.join(texts).strip() if texts else None

        return items

    @staticmethod
    def extract_vcard4_items(xml_data):
        """Extract the commonly used properties of a vCard 4 XML element.

        Args:
            xml_data: Element whose direct children are the vCard
                properties (``fn``, ``email``, ``group`` and so on).

        Returns:
            dict: Holds the keys ``extras``, ``alias``, ``email``, ``fn``,
            ``note``, ``org``, ``impp`` and ``url``. Each property value is
            the first non-empty text among the children of the matching
            element (``alias`` is read from ``nickname``), or ``''`` when
            the property is absent or empty. ``extras`` maps a category to
            a list of ``{'label': ..., 'uri': ...}`` dicts, one per
            ``group`` element. The category is read from the ``type``
            parameter of the group's ``url`` and is ``'?'`` when that
            parameter is missing.
        """
        namespace = Syndication._VCARD_NS
        vcard = {'extras': {}}

        for group in xml_data.findall(namespace + 'group'):
            url = group.find(namespace + 'url')
            # Each value falls back to a default when the element is missing,
            # so an incomplete group neither raises nor inherits values from
            # the previous group.
            label = Syndication._last_child_text(
                group.find(namespace + 'x-ablabel'))
            uri = Syndication._last_child_text(url)
            category = Syndication._last_child_text(
                group.find(namespace + 'url/' +
                           namespace + 'parameters/' +
                           namespace + 'type'),
                default='?')
            vcard['extras'].setdefault(category, []).append(
                {'label' : label, 'uri' : uri})

        # (result key, vCard element name), in the order the keys are added.
        properties = (('alias', 'nickname'),
                      ('email', 'email'),
                      ('fn', 'fn'),
                      ('note', 'note'),
                      ('org', 'org'),
                      ('impp', 'impp'),
                      ('url', 'url'))
        for key, tag in properties:
            vcard[key] = Syndication._first_child_text(
                xml_data.find(namespace + tag))

        return vcard

    @staticmethod
    def extract_atom_items(xml_data, limit=False):
        """Extract the displayable fields of an Atom entry element.

        Args:
            xml_data: Element whose direct children are the Atom entry
                fields (``title``, ``link``, ``content`` and so on).
            limit: When true, stop collecting tags once five were found.

        Returns:
            dict or None: ``None`` when the element has neither a ``title``
            nor a ``link`` child. Otherwise a dict with the keys
            ``content``, ``href``, ``published``, ``summary``, ``tags``,
            ``title`` and ``updated``. Text fields are ``''`` when missing
            or empty; ``tags`` is a list of category terms shorter than 50
            characters; ``updated`` repeats ``published``.
        """
        # NOTE
        # `.//` was not needed when node item payload was passed directly.
        # Now that item is saved as xml, it is required to use `.//`.
        # Perhaps navigating a level down (i.e. to "child"), or removing the
        # root from the file would solve this.
        #namespace = './/{http://www.w3.org/2005/Atom}'
        namespace = Syndication._ATOM_NS

        title = xml_data.find(namespace + 'title')
        if title is None and xml_data.find(namespace + 'link') is None:
            return None
        title_text = '' if title is None else title.text or ''

        # First link that carries a non-empty "href" attribute.
        link_href = ''
        for link in xml_data.findall(namespace + 'link'):
            link_href = link.attrib.get('href', '')
            if link_href:
                break

        content_text = Syndication._first_text(
            xml_data.findall(namespace + 'content'))
        summary_text = Syndication._first_text(
            xml_data.findall(namespace + 'summary'))

        published = xml_data.find(namespace + 'published')
        published_text = '' if published is None else published.text or ''

        tags = []
        for category in xml_data.findall(namespace + 'category'):
            term = category.attrib.get('term')
            if term and len(term) < Syndication._MAX_TAG_LENGTH:
                tags.append(term)
            if limit and len(tags) >= Syndication._MAX_TAGS:
                break

        # TODO Check the Blasta database for instances.

        entry = {'content' : content_text,
                 'href' : link_href,
                 'published' : published_text,
                 'summary' : summary_text,
                 'tags' : tags,
                 'title' : title_text,
                 'updated' : published_text} # TODO "Updated" is missing
        return entry
|
||
|
|
||
|
|
||
|
|
||
|
|