Update datahandler.py

Schimon Jehudah 2023-11-02 05:17:04 +00:00
parent 6c5f17e11a
commit eb0bec12fd


@@ -4,9 +4,9 @@
 import feedparser
 import aiohttp
 import asyncio
-import feedhandler
+import os
 import sqlitehandler
+import confighandler
 from http.client import IncompleteRead
 from asyncio.exceptions import IncompleteReadError
@@ -112,7 +112,7 @@ async def add_feed(db_file, url):
         res = await download_feed(url)
         if res[0]:
             feed = feedparser.parse(res[0])
-            title = await feedhandler.get_title(url, feed)
+            title = await get_title(url, feed)
             if feed.bozo:
                 bozo = ("WARNING: Bozo detected. Failed to load <{}>.".format(url))
                 print(bozo)
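
Note: feedparser's bozo bit, checked above, is set whenever the fetched document is not well-formed XML, even though parsing still returns a usable result. A minimal illustration (the sample string is hypothetical, not from this repository):

    import feedparser

    # Unterminated tags: feedparser still returns a result,
    # but flags the input as malformed via the bozo bit.
    feed = feedparser.parse("<rss version='2.0'><channel><title>broken")
    print(feed.bozo)            # 1 for ill-formed input
    print(feed.bozo_exception)  # the underlying parse error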
@@ -151,69 +151,24 @@ async def add_feed(db_file, url):
             # mentioned, yet upon failure it wouldn't?
             return await add_feed(db_file, url)
-        # Search for feeds by file extension and path
-        paths = [
-            ".atom",
-            ".rss",
-            ".xml",
-            "/?feed=atom",
-            "/?feed=rdf",
-            "/?feed=rss",
-            "/?feed=xml", # wordpress
-            "/?format=atom",
-            "/?format=rdf",
-            "/?format=rss",
-            "/?format=xml", # phpbb
-            "/app.php/feed",
-            "/atom",
-            "/atom.php",
-            "/atom.xml",
-            "/blog/feed/",
-            "/content-feeds/",
-            "/external.php?type=RSS2",
-            "/en/feed/",
-            "/feed", # good practice
-            "/feed.atom",
-            # "/feed.json",
-            "/feed.php",
-            "/feed.rdf",
-            "/feed.rss",
-            "/feed.xml",
-            "/feed/atom/",
-            "/feeds/news_feed",
-            "/feeds/posts/default",
-            "/feeds/posts/default?alt=atom",
-            "/feeds/posts/default?alt=rss",
-            "/feeds/rss/news.xml.php",
-            "/forum_rss.php",
-            "/index.atom",
-            "/index.php/feed",
-            "/index.php?type=atom;action=.xml", #smf
-            "/index.php?type=rss;action=.xml", #smf
-            "/index.rss",
-            "/jekyll/feed.xml",
-            "/latest.rss",
-            "/news",
-            "/news.xml",
-            "/news.xml.php",
-            "/news/feed",
-            "/posts.rss", # discourse
-            "/rdf",
-            "/rdf.php",
-            "/rdf.xml",
-            "/rss",
-            # "/rss.json",
-            "/rss.php",
-            "/rss.xml",
-            "/timeline.rss",
-            "/videos.atom",
-            # "/videos.json",
-            "/videos.xml",
-            "/xml/feed.rss"
-            ]
         print("RSS Scan Mode Engaged")
         feeds = {}
+        paths = []
+        # TODO Test
+        cfg_dir = confighandler.get_default_confdir()
+        if not os.path.isdir(cfg_dir):
+            os.mkdir(cfg_dir)
+        cfg_file = os.path.join(cfg_dir, r"url_paths.txt")
+        if not os.path.isfile(cfg_file):
+            # confighandler.generate_dictionary()
+            list = confighandler.get_default_list()
+            file = open(cfg_file, "w")
+            file.writelines("\n".join(list))
+            file.close()
+        file = open(cfg_file, "r")
+        lines = file.readlines()
+        for line in lines:
+            paths.extend([line.strip()])
         for path in paths:
             # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
             xpath_query = "//a[contains(@href,'{}')]".format(path)
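
The confighandler helpers called in the new block are not part of this diff. A minimal sketch of what they could look like, assuming get_default_confdir() resolves a per-user configuration directory (the XDG fallback and the application directory name are assumptions) and get_default_list() returns the path list that was previously hardcoded:

    import os

    def get_default_confdir():
        # Assumption: honour XDG_CONFIG_HOME, fall back to ~/.config;
        # the application directory name here is illustrative.
        config_home = os.environ.get(
            "XDG_CONFIG_HOME", os.path.join(os.path.expanduser("~"), ".config"))
        return os.path.join(config_home, "slixfeed")

    def get_default_list():
        # Assumption: the path list deleted from add_feed above moves here
        # (abbreviated; the full list has some 60 entries).
        return [".atom", ".rss", ".xml", "/feed", "/feed.xml", "/rss.xml"]

With helpers along these lines, the first run writes url_paths.txt with the defaults, and users can then edit that file to add or drop scan paths without touching code.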
@@ -271,8 +226,6 @@ async def add_feed(db_file, url):
             address = parted_url.scheme + '://' + parted_url.netloc + '/' + parted_url.path.split('/')[1] + path
             res = await download_feed(address)
             if res[1] == 200:
-                print('ATTENTION')
-                print(address)
                 try:
                     title = feedparser.parse(res[0])["feed"]["title"]
                 except:
@@ -320,7 +273,8 @@ async def download_feed(url):
                     # print (response.content_type)
                     return [doc, status]
                 except:
-                    return [False, "The content of this document doesn't appear to be textual"]
+                    # return [False, "The content of this document doesn't appear to be textual."]
+                    return [False, "Document is too large or is not textual."]
         else:
             return [False, "HTTP Error: " + str(status)]
     except aiohttp.ClientError as e:
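
The reworded message also covers oversized documents, but the guard itself sits outside this hunk. A hypothetical helper consistent with the new wording, assuming the enclosing code holds an aiohttp response object (the helper name and the 1 MiB cap are assumptions, not part of this commit):

    async def read_response(response, max_bytes=1024 * 1024):
        # Hypothetical guard: reject large bodies up front, using the
        # declared Content-Length when the server sends one.
        length = response.headers.get("Content-Length")
        if length and int(length) > max_bytes:
            return [False, "Document is too large or is not textual."]
        doc = await response.text()  # may still raise on non-textual bodies
        return [doc, response.status]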
@@ -329,3 +283,18 @@ async def download_feed(url):
     except asyncio.TimeoutError as e:
         # print('Timeout:', str(e))
         return [False, "Timeout: " + str(e)]
+
+
+async def get_title(url, feed):
+    """
+    Get title of feed.
+
+    :param url: URL
+    :param feed: Parsed feed
+    :return: Title or URL hostname.
+    """
+    try:
+        title = feed["feed"]["title"]
+    except:
+        title = urlparse(url).netloc
+    return title
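
A quick usage sketch of the new helper: the bare except means any missing or malformed title falls back to the URL's hostname. Note that get_title relies on urlparse, which this diff does not import, so urllib.parse.urlparse is presumably already imported elsewhere in datahandler.py. The sample URL below is illustrative:

    import asyncio
    import feedparser

    # A feed with no <title>: the KeyError is swallowed and the
    # hostname of the given URL is returned instead.
    feed = feedparser.parse("<rss version='2.0'><channel></channel></rss>")
    title = asyncio.run(get_title("https://example.org/feed", feed))
    print(title)  # example.org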