Update datahandler.py
This commit is contained in: parent 6c5f17e11a · commit eb0bec12fd
1 changed file with 38 additions and 69 deletions
@@ -4,9 +4,9 @@
 import feedparser
 import aiohttp
 import asyncio
+import os
-import feedhandler
 import sqlitehandler
+import confighandler
 
 from http.client import IncompleteRead
 from asyncio.exceptions import IncompleteReadError
@@ -112,7 +112,7 @@ async def add_feed(db_file, url):
     res = await download_feed(url)
     if res[0]:
         feed = feedparser.parse(res[0])
-        title = await feedhandler.get_title(url, feed)
+        title = await get_title(url, feed)
         if feed.bozo:
             bozo = ("WARNING: Bozo detected. Failed to load <{}>.".format(url))
             print(bozo)
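For reference, the bozo check kept in this hunk relies on feedparser's standard behaviour: parse() sets the bozo flag on the result whenever a feed is not well-formed and records the cause in bozo_exception. A minimal sketch of that check (the URL is only a placeholder, not one used by the project):

import feedparser

# feedparser marks malformed feeds with the `bozo` flag and stores the
# underlying parse error in `bozo_exception`.
result = feedparser.parse("https://example.org/feed.xml")  # placeholder URL
if result.bozo:
    print("WARNING: Bozo detected:", result.bozo_exception)
else:
    print("Feed title:", result.feed.get("title", "(no title)"))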
@@ -151,69 +151,24 @@ async def add_feed(db_file, url):
         # mentioned, yet upon failure it wouldn't?
         return await add_feed(db_file, url)
-
-    # Search for feeds by file extension and path
-    paths = [
-        ".atom",
-        ".rss",
-        ".xml",
-        "/?feed=atom",
-        "/?feed=rdf",
-        "/?feed=rss",
-        "/?feed=xml", # wordpress
-        "/?format=atom",
-        "/?format=rdf",
-        "/?format=rss",
-        "/?format=xml", # phpbb
-        "/app.php/feed",
-        "/atom",
-        "/atom.php",
-        "/atom.xml",
-        "/blog/feed/",
-        "/content-feeds/",
-        "/external.php?type=RSS2",
-        "/en/feed/",
-        "/feed", # good practice
-        "/feed.atom",
-        # "/feed.json",
-        "/feed.php",
-        "/feed.rdf",
-        "/feed.rss",
-        "/feed.xml",
-        "/feed/atom/",
-        "/feeds/news_feed",
-        "/feeds/posts/default",
-        "/feeds/posts/default?alt=atom",
-        "/feeds/posts/default?alt=rss",
-        "/feeds/rss/news.xml.php",
-        "/forum_rss.php",
-        "/index.atom",
-        "/index.php/feed",
-        "/index.php?type=atom;action=.xml", # smf
-        "/index.php?type=rss;action=.xml", # smf
-        "/index.rss",
-        "/jekyll/feed.xml",
-        "/latest.rss",
-        "/news",
-        "/news.xml",
-        "/news.xml.php",
-        "/news/feed",
-        "/posts.rss", # discourse
-        "/rdf",
-        "/rdf.php",
-        "/rdf.xml",
-        "/rss",
-        # "/rss.json",
-        "/rss.php",
-        "/rss.xml",
-        "/timeline.rss",
-        "/videos.atom",
-        # "/videos.json",
-        "/videos.xml",
-        "/xml/feed.rss"
-        ]
-
     print("RSS Scan Mode Engaged")
     feeds = {}
+    paths = []
+    # TODO Test
+    cfg_dir = confighandler.get_default_confdir()
+    if not os.path.isdir(cfg_dir):
+        os.mkdir(cfg_dir)
+    cfg_file = os.path.join(cfg_dir, r"url_paths.txt")
+    if not os.path.isfile(cfg_file):
+        # confighandler.generate_dictionary()
+        list = confighandler.get_default_list()
+        file = open(cfg_file, "w")
+        file.writelines("\n".join(list))
+        file.close()
+    file = open(cfg_file, "r")
+    lines = file.readlines()
+    for line in lines:
+        paths.extend([line.strip()])
     for path in paths:
         # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
         xpath_query = "//a[contains(@href,'{}')]".format(path)
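The hunk above replaces the hard-coded path list with a per-user url_paths.txt file, created on first run from confighandler.get_default_list() and read back on every scan. A minimal sketch of the same flow using context managers; it assumes confighandler.get_default_confdir() returns a writable directory path and get_default_list() returns the default path strings, neither of which is shown in this commit:

import os

import confighandler  # project module; helper implementations assumed, not shown here


def load_url_paths():
    # Ensure the configuration directory exists.
    cfg_dir = confighandler.get_default_confdir()
    if not os.path.isdir(cfg_dir):
        os.mkdir(cfg_dir)
    cfg_file = os.path.join(cfg_dir, "url_paths.txt")
    # Seed the file once with the built-in defaults.
    if not os.path.isfile(cfg_file):
        with open(cfg_file, "w") as f:
            f.write("\n".join(confighandler.get_default_list()))
    # Read one path suffix per line, skipping blanks.
    with open(cfg_file, "r") as f:
        return [line.strip() for line in f if line.strip()]

Using with blocks would also close the read handle, which the open(cfg_file, "r") call in the diff leaves open.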
@@ -271,8 +226,6 @@ async def add_feed(db_file, url):
             address = parted_url.scheme + '://' + parted_url.netloc + '/' + parted_url.path.split('/')[1] + path
             res = await download_feed(address)
             if res[1] == 200:
-                print('ATTENTION')
-                print(address)
                 try:
                     title = feedparser.parse(res[0])["feed"]["title"]
                 except:
@@ -320,7 +273,8 @@ async def download_feed(url):
                     # print (response.content_type)
                     return [doc, status]
                 except:
-                    return [False, "The content of this document doesn't appear to be textual"]
+                    # return [False, "The content of this document doesn't appear to be textual."]
+                    return [False, "Document is too large or is not textual."]
             else:
                 return [False, "HTTP Error: " + str(status)]
     except aiohttp.ClientError as e:
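The new message keeps download_feed's convention of returning a [payload, status_or_error] pair. A rough sketch of the surrounding error-handling shape, assuming the body is read with aiohttp's response.text() and that any failure there is treated as an oversized or non-textual document (the real function in datahandler.py differs in detail):

import asyncio

import aiohttp


async def download_feed_sketch(url):
    # Sketch of the return-value convention only, not the project's exact code.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.get(url) as response:
                status = response.status
                if status == 200:
                    try:
                        doc = await response.text()
                        return [doc, status]
                    except Exception:
                        return [False, "Document is too large or is not textual."]
                return [False, "HTTP Error: " + str(status)]
    except aiohttp.ClientError as e:
        return [False, "Error: " + str(e)]
    except asyncio.TimeoutError as e:
        return [False, "Timeout: " + str(e)]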
@@ -329,3 +283,18 @@ async def download_feed(url):
     except asyncio.TimeoutError as e:
         # print('Timeout:', str(e))
         return [False, "Timeout: " + str(e)]
+
+
+async def get_title(url, feed):
+    """
+    Get title of feed.
+
+    :param url: URL
+    :param feed: Parsed feed
+    :return: Title or URL hostname.
+    """
+    try:
+        title = feed["feed"]["title"]
+    except:
+        title = urlparse(url).netloc
+    return title
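A short usage sketch of the new helper. get_title is declared async, so it has to be awaited, and the urlparse fallback assumes from urllib.parse import urlparse is available elsewhere in datahandler.py, since that import is not part of this hunk. The helper is reproduced so the demo runs on its own:

import asyncio

import feedparser
from urllib.parse import urlparse  # assumed to be imported in datahandler.py


async def get_title(url, feed):
    # Same logic as the hunk above (which uses a bare except).
    try:
        title = feed["feed"]["title"]
    except KeyError:
        title = urlparse(url).netloc
    return title


async def demo():
    url = "https://example.org/feed.xml"  # placeholder URL
    feed = feedparser.parse(url)
    # Falls back to the hostname when the feed has no title.
    print(await get_title(url, feed))


if __name__ == "__main__":
    asyncio.run(demo())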