#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio

import aiohttp
import feedparser

import feedhandler
import sqlitehandler

from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from http.client import IncompleteRead
from lxml import html
from urllib import error
from urllib.parse import urlparse


async def download_updates(db_file):
    """
    Check feeds for new entries.

    :param db_file: Database filename.
    """
    urls = await sqlitehandler.get_subscriptions(db_file)
    for url in urls:
        source = url[0]
        res = await download_feed(source)
        if res is None:
            # Skip to the next feed.
            continue
        await sqlitehandler.update_source_status(db_file, res[1], source)
        if res[0]:
            try:
                feed = feedparser.parse(res[0])
                # feed.bozo is set when feedparser encounters a problem.
                # See https://pythonhosted.org/feedparser/bozo.html
                valid = 0 if feed.bozo else 1
                await sqlitehandler.update_source_validity(
                    db_file, source, valid)
            except (IncompleteReadError, IncompleteRead, error.URLError) as e:
                print(e)
                # NOTE Do not "return" here; that would stop the scan
                # of the remaining URLs. Skip to the next feed instead.
                continue
            # TODO Correct the SQL statement so that the validity update
            # above can be moved below this status check.
            if res[1] == 200:
                entries = feed.entries
                await sqlitehandler.remove_nonexistent_entries(
                    db_file, feed, source)
                new_entry = 0
                for entry in entries:
                    if "title" in entry:
                        title = entry.title
                    else:
                        title = feed["feed"]["title"]
                    if "link" in entry:
                        link = entry.link
                    else:
                        link = source
                    exist = await sqlitehandler.check_entry_exist(
                        db_file, title, link)
                    if not exist:
                        new_entry = new_entry + 1
                        # TODO Enhance summary
                        if "summary" in entry:
                            summary = entry.summary
                            # Remove HTML tags
                            summary = BeautifulSoup(summary, "lxml").text
                            # TODO Limit text length
                            summary = summary.replace("\n\n", "\n")[:300] + " ‍⃨"
                        else:
                            summary = "*** No summary ***"
                        entry = (title, summary, link, source, 0)
                        await sqlitehandler.add_entry_and_set_date(
                            db_file, source, entry)
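

# A minimal, self-contained sketch of the summary clean-up performed inline
# in download_updates() above. The helper name `clean_summary` and the
# default 300-character cap are illustrative assumptions for testing the
# transformation in isolation; this module keeps the logic inline.
def clean_summary(raw_summary, limit=300):
    """Strip HTML tags, collapse blank lines, and truncate the text."""
    text = BeautifulSoup(raw_summary, "lxml").text
    return text.replace("\n\n", "\n")[:limit]

# Example: clean_summary("<p>Hello <b>world</b></p>") returns "Hello world".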


async def add_feed(db_file, url):
    """
    Check whether a feed exists; otherwise, process it.

    :param db_file: Database filename.
    :param url: URL.
    :return: Status message.
    """
    exist = await sqlitehandler.check_feed_exist(db_file, url)
    if not exist:
        res = await download_feed(url)
        if res[0]:
            feed = feedparser.parse(res[0])
            title = await feedhandler.get_title(url, feed)
            if feed.bozo:
                bozo = ("WARNING: Bozo detected. "
                        "Failed to load <{}>.".format(url))
                print(bozo)
                try:
                    # Use lxml.html here; etree is for XML documents.
                    tree = html.fromstring(res[0])
                except Exception:
                    return "Failed to parse URL <{}> as feed".format(url)
                print("RSS Auto-Discovery Engaged")
                xpath_query = (
                    '//link[(@rel="alternate") and '
                    '(@type="application/atom+xml" or '
                    '@type="application/rdf+xml" or '
                    '@type="application/rss+xml")]')
                feeds = tree.xpath(xpath_query)
                if len(feeds) > 1:
                    msg = ("RSS Auto-Discovery has found {} feeds:\n\n"
                           .format(len(feeds)))
                    for feed in feeds:
                        # NOTE Downloading each link would catch only valid
                        # resources (i.e. not 404), at the cost of extra
                        # bandwidth; the link attributes are used instead.
                        feed_name = feed.xpath('@title')[0]
                        feed_addr = feed.xpath('@href')[0]
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                    return msg
                elif feeds:
                    url = feeds[0].xpath('@href')[0]
                    # NOTE The recursive call must be returned explicitly;
                    # otherwise its success message would be discarded.
                    return await add_feed(db_file, url)
                # Search for feeds by file extension and path
                paths = [
                    ".atom",
                    ".rss",
                    ".xml",
                    "/?feed=atom",
                    "/?feed=rdf",
                    "/?feed=rss",
                    "/?feed=xml",  # wordpress
                    "/?format=atom",
                    "/?format=rdf",
                    "/?format=rss",
                    "/?format=xml",  # phpbb
                    "/app.php/feed",
                    "/atom",
                    "/atom.php",
                    "/atom.xml",
                    "/blog/feed/",
                    "/content-feeds/",
                    "/external.php?type=RSS2",
                    "/en/feed/",
                    "/feed",  # good practice
                    "/feed.atom",
                    # "/feed.json",
                    "/feed.php",
                    "/feed.rdf",
                    "/feed.rss",
                    "/feed.xml",
                    "/feed/atom/",
                    "/feeds/news_feed",
                    "/feeds/posts/default",
                    "/feeds/posts/default?alt=atom",
                    "/feeds/posts/default?alt=rss",
                    "/feeds/rss/news.xml.php",
                    "/forum_rss.php",
                    "/index.atom",
                    "/index.php/feed",
                    "/index.php?type=atom;action=.xml",  # smf
                    "/index.php?type=rss;action=.xml",  # smf
                    "/index.rss",
                    "/jekyll/feed.xml",
                    "/latest.rss",
                    "/news",
                    "/news.xml",
                    "/news.xml.php",
                    "/news/feed",
                    "/posts.rss",  # discourse
                    "/rdf",
                    "/rdf.php",
                    "/rdf.xml",
                    "/rss",
                    # "/rss.json",
                    "/rss.php",
                    "/rss.xml",
                    "/timeline.rss",
                    "/videos.atom",
                    # "/videos.json",
                    "/videos.xml",
                    "/xml/feed.rss",
                ]
                print("RSS Scan Mode Engaged")
                feeds = {}
                for path in paths:
                    xpath_query = "//a[contains(@href,'{}')]".format(path)
                    addresses = tree.xpath(xpath_query)
                    parted_url = urlparse(url)
                    # NOTE Should the number of addresses be limited, or
                    # perhaps be N from the start and N from the end?
                    for address in addresses:
                        address = address.xpath('@href')[0]
                        if address.startswith('/'):
                            address = (parted_url.scheme + '://' +
                                       parted_url.netloc + address)
                        res = await download_feed(address)
                        if res[1] == 200:
                            try:
                                feeds[address] = feedparser.parse(
                                    res[0])["feed"]["title"]
                            except Exception:
                                continue
                if len(feeds) > 1:
                    msg = ("RSS URL scan has found {} feeds:\n\n"
                           .format(len(feeds)))
                    for feed in feeds:
                        feed_name = feeds[feed]
                        feed_addr = feed
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                    return msg
                elif feeds:
                    url = list(feeds)[0]
                    return await add_feed(db_file, url)
                # Probe common feed paths directly against the host
                print("RSS Arbitrary Mode Engaged")
                feeds = {}
                parted_url = urlparse(url)
                for path in paths:
                    address = (parted_url.scheme + '://' +
                               parted_url.netloc + path)
                    res = await download_feed(address)
                    if res[1] == 200:
                        try:
                            title = feedparser.parse(res[0])["feed"]["title"]
                        except Exception:
                            title = '*** No Title ***'
                        feeds[address] = title
                    # Check whether the URL has a path (i.e. is not root);
                    # guard against an empty path, which has no second
                    # component.
                    path_parts = parted_url.path.split('/')
                    if len(path_parts) > 1 and path_parts[1]:
                        if '.rss' not in paths:
                            paths.extend([".atom", ".feed", ".rdf", ".rss"])
                        address = (parted_url.scheme + '://' +
                                   parted_url.netloc + '/' +
                                   path_parts[1] + path)
                        res = await download_feed(address)
                        if res[1] == 200:
                            print('ATTENTION')
                            print(address)
                            try:
                                title = feedparser.parse(
                                    res[0])["feed"]["title"]
                            except Exception:
                                title = '*** No Title ***'
                            feeds[address] = title
                if len(feeds) > 1:
                    msg = ("RSS URL discovery has found {} feeds:\n\n"
                           .format(len(feeds)))
                    for feed in feeds:
                        feed_name = feeds[feed]
                        feed_addr = feed
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                elif feeds:
                    url = list(feeds)[0]
                    msg = await add_feed(db_file, url)
                else:
                    msg = "No news feeds were found for URL <{}>.".format(url)
            else:
                msg = await sqlitehandler.add_feed(db_file, title, url, res)
        else:
            msg = "Failed to get URL <{}>. Reason: {}".format(url, res[1])
    else:
        ix = exist[0]
        name = exist[1]
        msg = ("> {}\nNews source \"{}\" is already listed "
               "in the subscription list at index {}".format(url, name, ix))
    return msg
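

# A standalone sketch of the RSS auto-discovery step used in add_feed()
# above: the same XPath query applied to a static document. The helper name
# `discover_feeds` is an illustrative assumption; add_feed() runs the query
# inline on the downloaded page.
def discover_feeds(page_html):
    """Return (title, href) pairs for feed links advertised in HTML."""
    tree = html.fromstring(page_html)
    links = tree.xpath(
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]')
    return [(link.get('title', ''), link.get('href', '')) for link in links]

# Example:
# discover_feeds('<html><head><link rel="alternate" '
#                'type="application/rss+xml" title="News" '
#                'href="/rss.xml"/></head></html>')
# returns [('News', '/rss.xml')].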


async def download_feed(url):
    """
    Download content of given URL.

    :param url: URL.
    :return: Document or error message.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    # NOTE Pass trust_env=True to honour proxy environment variables.
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if response.status == 200:
                    try:
                        doc = await response.text()
                        return [doc, status]
                    except Exception:
                        return [False, ("The content of this document "
                                        "doesn't appear to be textual")]
                else:
                    return [False, "HTTP Error: " + str(status)]
        except aiohttp.ClientError as e:
            print('Error', str(e))
            return [False, "Error: " + str(e)]
        except asyncio.TimeoutError as e:
            return [False, "Timeout: " + str(e)]
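

# A minimal usage sketch, assuming a SQLite database file that was already
# initialized by sqlitehandler; the path "slixfeed.db" and the URL below are
# illustrative assumptions, not values used elsewhere in this module.
if __name__ == "__main__":
    async def main():
        db_file = "slixfeed.db"
        print(await add_feed(db_file, "https://example.org"))
        await download_updates(db_file)

    asyncio.run(main())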