#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET

# Clark-notation namespace prefixes used when looking up elements.
_VCARD_NS = '{urn:ietf:params:xml:ns:vcard-4.0}'
_ATOM_NS = '{http://www.w3.org/2005/Atom}'


def _first_child_text(element):
    """Return the text of the first child of *element* that has text.

    Returns '' when *element* is None or no child carries non-empty text.
    """
    if element is None:
        return ''
    for child in element:
        if child.text:
            return child.text
    return ''


def _last_child_text(element, default=''):
    """Return the text of the last child of *element*.

    The returned value may be None when that child has no text.  *default*
    is returned when *element* is None or has no children at all.
    """
    if element is None:
        return default
    value = default
    for child in element:
        value = child.text
    return value


def _first_nonempty_text(elements):
    """Return the first non-empty ``.text`` among *elements*, else ''."""
    for element in elements:
        if element.text:
            return element.text
    return ''


class Syndication:
    """Helpers that turn vCard 4.0 XML and Atom entry XML into plain dicts.

    All functions are static: call them as ``Syndication.<name>(xml_data)``.
    """

    @staticmethod
    def extract_vcard_items(xml_data):
        """Extract every item of a vCard XML tree into a flat dictionary.

        Args:
            xml_data: An ElementTree or Element; everything yielded by its
                ``iter()`` is visited.

        Returns:
            dict: Maps each local tag name (namespace stripped) to the
            element's own text joined by a single space with the text of its
            direct children, stripped; or None when no text was found.
            A later element with the same local name overwrites an earlier
            one.  The namespaced root ``vcard`` element is skipped.
        """
        items = {}
        for item in xml_data.iter():
            tag = item.tag
            # Comment / processing-instruction nodes have a non-string tag.
            if not isinstance(tag, str):
                continue
            # Skip the root element (vcard)
            if tag == _VCARD_NS + 'vcard':
                continue
            # Strip a '{namespace}' prefix if present; un-namespaced tags are
            # kept as they are instead of raising IndexError.
            item_name = tag.rpartition('}')[2]
            # Collect the element's own text and that of its direct children.
            texts = []
            if item.text:
                texts.append(item.text)
            texts.extend(child.text for child in item if child.text)
            items[item_name] = ' '.join(texts).strip() if texts else None
        return items

    @staticmethod
    def extract_vcard4_items(xml_data):
        """Extract selected fields and grouped links from a vCard 4.0 element.

        Args:
            xml_data: The element whose direct children are the vCard
                properties (``fn``, ``email``, ``group``, ...), all in the
                vCard 4.0 namespace.

        Returns:
            dict: Keys ``alias``, ``email``, ``fn``, ``note``, ``org``,
            ``impp`` and ``url`` hold the text of the first child with text
            of the respective property ('' when absent).  ``extras`` maps a
            category (text under ``url/parameters/type``, '?' when absent)
            to a list of ``{'label': ..., 'uri': ...}`` dicts, one per
            ``group`` element.  A group lacking ``x-ablabel`` or ``url``
            contributes '' for the missing part.
        """
        ns = _VCARD_NS

        def field(name):
            # First direct child property called *name*, reduced to its text.
            return _first_child_text(xml_data.find(ns + name))

        vcard = {'extras': {}}
        type_path = ns + 'url/' + ns + 'parameters/' + ns + 'type'
        for group in xml_data.findall(ns + 'group'):
            # Values are computed afresh per group so that nothing leaks over
            # from a previous group and a missing element cannot raise.
            label = _last_child_text(group.find(ns + 'x-ablabel'))
            uri = _last_child_text(group.find(ns + 'url'))
            category = _last_child_text(group.find(type_path), '?')
            vcard['extras'].setdefault(category, []).append(
                {'label': label, 'uri': uri})

        vcard['alias'] = field('nickname')
        vcard['email'] = field('email')
        vcard['fn'] = field('fn')
        vcard['note'] = field('note')
        vcard['org'] = field('org')
        vcard['impp'] = field('impp')
        vcard['url'] = field('url')
        return vcard

    @staticmethod
    def extract_atom_items(xml_data, limit=False):
        """Extract the main fields of an Atom entry element.

        Args:
            xml_data: The element whose direct children are the Atom entry
                elements (``title``, ``link``, ``content``, ...).
            limit: When truthy, stop collecting tags once five were found.

        Returns:
            dict or None: None when the element has neither a ``title`` nor
            a ``link`` child.  Otherwise a dict with the keys ``content``,
            ``href``, ``published``, ``summary``, ``tags``, ``title`` and
            ``updated``.  Text fields are '' when missing or empty; ``href``
            is the first non-empty ``href`` attribute among the links;
            ``tags`` lists the non-empty category terms shorter than 50
            characters; ``updated`` currently mirrors ``published``.
        """
        # NOTE
        # `.//` was not needed when node item payload was passed directly.
        # Now that item is saved as xml, it is required to use `.//`.
        # Perhaps navigating a level down (i.e. to "child"), or removing the
        # root from the file would solve this.
        #ns = './/{http://www.w3.org/2005/Atom}'
        ns = _ATOM_NS

        title = xml_data.find(ns + 'title')
        links = xml_data.findall(ns + 'link')
        if title is None and not links:
            return None

        title_text = '' if title is None else (title.text or '')
        link_href = next(
            (link.attrib['href'] for link in links if link.attrib.get('href')),
            '')
        content_text = _first_nonempty_text(xml_data.findall(ns + 'content'))
        summary_text = _first_nonempty_text(xml_data.findall(ns + 'summary'))

        published = xml_data.find(ns + 'published')
        published_text = '' if published is None else (published.text or '')

        tags = []
        for category in xml_data.findall(ns + 'category'):
            term = category.attrib.get('term')
            # Overlong terms are dropped rather than truncated.
            if term and len(term) < 50:
                tags.append(term)
            if limit and len(tags) > 4:
                break

        # TODO Check the Blasta database for instances.
        entry = {'content' : content_text,
                 'href' : link_href,
                 'published' : published_text,
                 'summary' : summary_text,
                 'tags' : tags,
                 'title' : title_text,
                 'updated' : published_text} # TODO "Updated" is missing
        return entry