#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import feedparser
import aiohttp
import asyncio

import feedhandler
import sqlitehandler

from http.client import IncompleteRead
from asyncio.exceptions import IncompleteReadError
from urllib import error
from bs4 import BeautifulSoup
# from xml.etree.ElementTree import ElementTree, ParseError
from urllib.parse import urlparse
from lxml import html

async def download_updates(db_file):
    """
    Check feeds for new entries.

    :param db_file: Database filename.
    """
    urls = await sqlitehandler.get_subscriptions(db_file)

    for url in urls:
        source = url[0]
        res = await download_feed(source)
        if res is None:
            # download_feed() may return None; skip to the next feed
            # to avoid a TypeError on subscripting.
            continue

        await sqlitehandler.update_source_status(db_file, res[1], source)

        if res[0]:
            try:
                feed = feedparser.parse(res[0])
                if feed.bozo:
                    # Malformed feed; see
                    # https://pythonhosted.org/feedparser/bozo.html
                    valid = 0
                else:
                    valid = 1
                await sqlitehandler.update_source_validity(db_file, source, valid)
            except (IncompleteReadError, IncompleteRead, error.URLError) as e:
                # Do not return here, otherwise the remaining URLs
                # would not be scanned.
                print(e)

            # TODO Reorder these lines once the SQL statement is
            #      corrected accordingly.
            if res[1] == 200:
                entries = feed.entries
                # length = len(entries)
                # await sqlitehandler.remove_entry(db_file, source, length)
                await sqlitehandler.remove_nonexistent_entries(db_file, feed, source)

                new_entry = 0
                for entry in entries:

                    if "title" in entry:
                        title = entry.title
                    else:
                        title = feed["feed"]["title"]

                    if "link" in entry:
                        link = entry.link
                    else:
                        link = source

                    exist = await sqlitehandler.check_entry_exist(db_file, title, link)

                    if not exist:
                        new_entry = new_entry + 1
                        # TODO Enhance summary
                        if "summary" in entry:
                            summary = entry.summary
                            # Remove HTML tags
                            summary = BeautifulSoup(summary, "lxml").text
                            # TODO Limit text length
                            summary = summary.replace("\n\n", "\n")[:300] + " ⃨"
                        else:
                            summary = '*** No summary ***'
                        entry = (title, summary, link, source, 0)
                        await sqlitehandler.add_entry_and_set_date(db_file, source, entry)

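
# Illustrative sketch (not part of the original module): download_updates()
# could be driven by a periodic polling loop such as the one below.  The
# default database filename "slixfeed.db" and the 300-second interval are
# assumptions for demonstration purposes only.
async def poll_updates(db_file="slixfeed.db", interval=300):
    """
    Periodically scan all subscribed feeds for new entries.

    :param db_file: Database filename.
    :param interval: Seconds to wait between scans.
    """
    while True:
        await download_updates(db_file)
        await asyncio.sleep(interval)
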
async def add_feed(db_file, url):
    """
    Check whether the feed exists, otherwise process it.

    :param db_file: Database filename.
    :param url: URL.
    :return: Status message.
    """
    exist = await sqlitehandler.check_feed_exist(db_file, url)

    if not exist:
        res = await download_feed(url)
        if res[0]:
            feed = feedparser.parse(res[0])
            title = await feedhandler.get_title(url, feed)
            if feed.bozo:
                print("WARNING: Bozo detected. Failed to load <{}>.".format(url))
                try:
                    # tree = etree.fromstring(res[0])  # etree is for XML
                    tree = html.fromstring(res[0])
                except Exception:
                    return "Failed to parse URL <{}> as feed".format(url)

                print("RSS Auto-Discovery Engaged")
                # Match <link rel="alternate"> elements that advertise an
                # Atom, RDF or RSS feed.
                xpath_query = (
                    '//link[(@rel="alternate") and '
                    '(@type="application/atom+xml" or '
                    '@type="application/rdf+xml" or '
                    '@type="application/rss+xml")]'
                )
                feeds = tree.xpath(xpath_query)
                if len(feeds) > 1:
                    msg = "RSS Auto-Discovery has found {} feeds:\n\n".format(len(feeds))
                    for feed in feeds:
                        # Fetching each candidate with download_feed() first
                        # would list only valid resources (i.e. not 404),
                        # but it costs considerably more bandwidth.
                        feed_name = feed.xpath('@title')
                        feed_name = feed_name[0] if feed_name else "Untitled"
                        feed_addr = feed.xpath('@href')[0]
                        msg += "{}\n{}\n\n".format(feed_name, feed_addr)
                    msg += "The above feeds were extracted from\n{}".format(url)
                    return msg
                elif feeds:
                    # A single feed was discovered; subscribe to it and
                    # return its status message explicitly.
                    url = feeds[0].xpath('@href')[0]
                    return await add_feed(db_file, url)

                # Search for feeds by file extension and path
                paths = [
                    ".atom",
                    ".rss",
                    ".xml",
                    "/?feed=atom",
                    "/?feed=rdf",
                    "/?feed=rss",
                    "/?feed=xml",  # WordPress
                    "/?format=atom",
                    "/?format=rdf",
                    "/?format=rss",
                    "/?format=xml",  # phpBB
                    "/app.php/feed",
                    "/atom",
                    "/atom.php",
                    "/atom.xml",
                    "/blog/feed/",
                    "/content-feeds/",
                    "/external.php?type=RSS2",
                    "/en/feed/",
                    "/feed",  # good practice
                    "/feed.atom",
                    # "/feed.json",
                    "/feed.php",
                    "/feed.rdf",
                    "/feed.rss",
                    "/feed.xml",
                    "/feed/atom/",
                    "/feeds/news_feed",
                    "/feeds/posts/default",
                    "/feeds/posts/default?alt=atom",
                    "/feeds/posts/default?alt=rss",
                    "/feeds/rss/news.xml.php",
                    "/forum_rss.php",
                    "/index.atom",
                    "/index.php/feed",
                    "/index.php?type=atom;action=.xml",  # SMF
                    "/index.php?type=rss;action=.xml",  # SMF
                    "/index.rss",
                    "/jekyll/feed.xml",
                    "/latest.rss",
                    "/news",
                    "/news.xml",
                    "/news.xml.php",
                    "/news/feed",
                    "/posts.rss",  # Discourse
                    "/rdf",
                    "/rdf.php",
                    "/rdf.xml",
                    "/rss",
                    # "/rss.json",
                    "/rss.php",
                    "/rss.xml",
                    "/timeline.rss",
                    "/videos.atom",
                    # "/videos.json",
                    "/videos.xml",
                    "/xml/feed.rss"
                ]
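
                # Example (illustrative comment, added for clarity): for
                # url = "https://example.org/blog", urlparse(url) yields
                # scheme "https" and netloc "example.org", so a candidate
                # path such as "/feed" is probed as
                # "https://example.org/feed".  The paths above are used both
                # to match <a href="..."> elements found in the page (scan
                # mode below) and to probe the site directly (arbitrary mode
                # further below).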
print("RSS Scan Mode Engaged")
|
||
|
feeds = {}
|
||
|
for path in paths:
|
||
|
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
|
||
|
xpath_query = "//a[contains(@href,'{}')]".format(path)
|
||
|
addresses = tree.xpath(xpath_query)
|
||
|
parted_url = urlparse(url)
|
||
|
# NOTE Should number of addresses be limited or
|
||
|
# perhaps be N from the start and N from the end
|
||
|
for address in addresses:
|
||
|
address = address.xpath('@href')[0]
|
||
|
if address.startswith('/'):
|
||
|
address = parted_url.scheme + '://' + parted_url.netloc + address
|
||
|
res = await download_feed(address)
|
||
|
if res[1] == 200:
|
||
|
try:
|
||
|
feeds[address] = feedparser.parse(res[0])["feed"]["title"]
|
||
|
except:
|
||
|
continue
|
||
|
if len(feeds) > 1:
|
||
|
msg = "RSS URL scan has found {} feeds:\n\n".format(len(feeds))
|
||
|
for feed in feeds:
|
||
|
# try:
|
||
|
# res = await download_feed(feed)
|
||
|
# except:
|
||
|
# continue
|
||
|
feed_name = feeds[feed]
|
||
|
feed_addr = feed
|
||
|
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
|
||
|
msg += "The above feeds were extracted from\n{}".format(url)
|
||
|
return msg
|
||
|
elif feeds:
|
||
|
url = list(feeds)[0]
|
||
|
return await add_feed(db_file, url)
|
||
|
|
||
|
# (HTTP) Request(s) Paths
|
||
|
print("RSS Arbitrary Mode Engaged")
|
||
|
feeds = {}
|
||
|
parted_url = urlparse(url)
|
||
|
for path in paths:
|
||
|
address = parted_url.scheme + '://' + parted_url.netloc + path
|
||
|
res = await download_feed(address)
|
||
|
if res[1] == 200:
|
||
|
# print(feedparser.parse(res[0])["feed"]["title"])
|
||
|
# feeds[address] = feedparser.parse(res[0])["feed"]["title"]
|
||
|
try:
|
||
|
title = feedparser.parse(res[0])["feed"]["title"]
|
||
|
except:
|
||
|
title = '*** No Title ***'
|
||
|
feeds[address] = title
|
||
|
|
||
|
# Check whether URL has path (i.e. not root)
|
||
|
if parted_url.path.split('/')[1]:
|
||
|
paths.extend([".atom", ".feed", ".rdf", ".rss"]) if '.rss' not in paths else -1
|
||
|
# if paths.index('.rss'):
|
||
|
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
|
||
|
address = parted_url.scheme + '://' + parted_url.netloc + '/' + parted_url.path.split('/')[1] + path
|
||
|
res = await download_feed(address)
|
||
|
if res[1] == 200:
|
||
|
print('ATTENTION')
|
||
|
print(address)
|
||
|
try:
|
||
|
title = feedparser.parse(res[0])["feed"]["title"]
|
||
|
except:
|
||
|
title = '*** No Title ***'
|
||
|
feeds[address] = title
|
||
|
if len(feeds) > 1:
|
||
|
msg = "RSS URL discovery has found {} feeds:\n\n".format(len(feeds))
|
||
|
for feed in feeds:
|
||
|
feed_name = feeds[feed]
|
||
|
feed_addr = feed
|
||
|
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
|
||
|
msg += "The above feeds were extracted from\n{}".format(url)
|
||
|
elif feeds:
|
||
|
url = list(feeds)[0]
|
||
|
msg = await add_feed(db_file, url)
|
||
|
else:
|
||
|
msg = "No news feeds were found for URL <{}>.".format(url)
|
||
|
else:
|
||
|
msg = await sqlitehandler.add_feed(db_file, title, url, res)
|
||
|
else:
|
||
|
msg = "Failed to get URL <{}>. Reason: {}".format(url, res[1])
|
||
|
else:
|
||
|
ix = exist[0]
|
||
|
name = exist[1]
|
||
|
msg = "> {}\nNews source \"{}\" is already listed in the subscription list at index {}".format(url, name, ix)
|
||
|
return msg
|
||
|
|
||
|
|
||
|
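# Illustrative sketch (not part of the original module): the auto-discovery
# XPath used in add_feed() applied to a minimal HTML document.  The markup
# below is invented for demonstration purposes only.
#
#     doc = html.fromstring(
#         '<html><head><link rel="alternate" '
#         'type="application/rss+xml" title="News" href="/rss.xml"/>'
#         '</head><body></body></html>')
#     links = doc.xpath('//link[(@rel="alternate") and '
#                       '(@type="application/atom+xml" or '
#                       '@type="application/rdf+xml" or '
#                       '@type="application/rss+xml")]')
#     [(link.get("title"), link.get("href")) for link in links]
#     # -> [('News', '/rss.xml')]
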
async def download_feed(url):
    """
    Download the content of the given URL.

    :param url: URL.
    :return: List of [document, status code] on success,
             or [False, error message] otherwise.
    """
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession() as session:
        # async with aiohttp.ClientSession(trust_env=True) as session:
        try:
            async with session.get(url, timeout=timeout) as response:
                status = response.status
                if response.status == 200:
                    try:
                        doc = await response.text()
                        # print(response.content_type)
                        return [doc, status]
                    except Exception:
                        return [False, "The content of this document doesn't appear to be textual"]
                else:
                    return [False, "HTTP Error: " + str(status)]
        except aiohttp.ClientError as e:
            print('Error', str(e))
            return [False, "Error: " + str(e)]
        except asyncio.TimeoutError as e:
            # print('Timeout:', str(e))
            return [False, "Timeout: " + str(e)]
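
# Illustrative sketch (not part of the original module): a minimal manual
# test for download_feed().  The feed URL below is a placeholder chosen for
# demonstration purposes only.
if __name__ == "__main__":
    async def _demo():
        doc, status = await download_feed("https://example.org/feed.atom")
        print("Status:", status)
        print("Document received" if doc else "No document")

    asyncio.run(_demo())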