forked from sch/Slixfeed

Add support for JSON Feed

parent 4c84f40e0e
commit 0512e14738

5 changed files with 346 additions and 9 deletions

README.md (24 changed lines)
@@ -14,21 +14,33 @@ Slixfeed is primarily designed for XMPP (aka Jabber), yet it is built to be exte
 ### Features
 
-#### Simultaneous
-
-Slixfeed is designed to handle multiple contacts, including groupchats, simultaneously.
-
 #### Ease
 
 Slixfeed automatically scans (i.e. crawls) for web feeds of a given URL.
 
+#### Export
+
+Download articles as ePUB, HTML, Markdown and PDF.
+
 #### Filtering
 
-Slixfeed provides positive and negative ways to filter by allow and deny lists.
+Filter using allow and deny lists.
+
+#### Multimedia
+
+Display audio, pictures and videos inline.
+
+#### Portable
+
+Export and import feeds with a standard OPML file.
 
 #### Proxy
 
-Redirect to alternative online back-ends, such as Invidious, Librarian, Nitter, for increased privacy and productivity and security.
+Redirect to alternative back-ends, such as Invidious, Librarian and Nitter, for increased privacy, productivity and security.
+
+#### Simultaneous
+
+Slixfeed is designed to handle multiple contacts, including groupchats, simultaneously.
 
 ## Getting Started

@@ -0,0 +1 @@
+__version__ = "1.0.0"

@@ -26,8 +26,9 @@ TODO
 from asyncio.exceptions import IncompleteReadError
 from bs4 import BeautifulSoup
-from http.client import IncompleteRead
 from feedparser import parse
+from http.client import IncompleteRead
+import json
 import logging
 from lxml import html
 import slixfeed.config as config

@@ -102,6 +103,38 @@ def log_to_markdown(timestamp, filename, jid, message):
         file.write(entry)
 
 
+def is_feed_json(document):
+    """
+    NOTE /kurtmckee/feedparser/issues/103
+
+    Determine whether document is JSON feed or not.
+
+    Parameters
+    ----------
+    document : str
+        Feed document.
+
+    Returns
+    -------
+    val : boolean
+        True or False.
+    """
+    value = False
+    feed = json.loads(document)
+    if not feed['items']:
+        if "version" in feed.keys():
+            if 'jsonfeed' in feed['version']:
+                value = True
+        # elif 'title' in feed.keys():
+        #     value = True
+        else:
+            value = False
+    else:
+        value = True
+    return value
+
+
 def is_feed(feed):
     """
     Determine whether document is feed or not.
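
For reference, a JSON Feed document advertises its format through a top-level version URL (e.g. https://jsonfeed.org/version/1.1) next to an items array, which is what the new is_feed_json() check keys on. Below is a small standalone sketch of that detection logic; the sample document and the local helper name are illustrative only and not part of the commit.

import json

# Hypothetical JSON Feed document (version 1.1); values are invented.
SAMPLE = '''
{
    "version": "https://jsonfeed.org/version/1.1",
    "title": "Example Feed",
    "items": [
        {"id": "1", "url": "https://example.org/first", "title": "First post"}
    ]
}
'''

def looks_like_json_feed(document):
    # Same idea as is_feed_json() above: a document counts as a JSON feed
    # when it carries items, or when its "version" value mentions "jsonfeed".
    try:
        feed = json.loads(document)
    except ValueError:
        return False
    if feed.get("items"):
        return True
    return "jsonfeed" in str(feed.get("version", ""))

print(looks_like_json_feed(SAMPLE))  # True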

@@ -120,7 +153,7 @@ def is_feed(feed):
     # message = None
     if not feed.entries:
         if "version" in feed.keys():
-            feed["version"]
+            # feed["version"]
             if feed.version:
                 value = True
                 # message = (

@@ -471,6 +504,53 @@ async def add_feed(db_file, url):
                        "added to subscription list."
                        ).format(url, title)
                    break
+                # NOTE This elif statement will be unnecessary
+                # once feedparser supports JSON feed.
+                elif is_feed_json(document):
+                    feed = json.loads(document)
+                    if "title" in feed.keys():
+                        title = feed["title"]
+                    else:
+                        title = urlsplit(url).netloc
+                    if "language" in feed.keys():
+                        language = feed["language"]
+                    else:
+                        language = ''
+                    if "encoding" in feed.keys():
+                        encoding = feed["encoding"]
+                    else:
+                        encoding = ''
+                    if "date_published" in feed.keys():
+                        updated = feed["date_published"]
+                        try:
+                            updated = convert_struct_time_to_iso8601(updated)
+                        except:
+                            updated = ''
+                    else:
+                        updated = ''
+                    version = 'json' + feed["version"].split('/').pop()
+                    entries = len(feed["items"])
+                    await sqlite.insert_feed(
+                        db_file, url,
+                        title=title,
+                        entries=entries,
+                        version=version,
+                        encoding=encoding,
+                        language=language,
+                        status_code=status_code,
+                        updated=updated
+                        )
+                    await scan_json(
+                        db_file, url)
+                    old = await get_setting_value(db_file, "old")
+                    if not old:
+                        await sqlite.mark_feed_as_read(
+                            db_file, url)
+                    response = (
+                        "> {}\nNews source \"{}\" has been "
+                        "added to subscription list."
+                        ).format(url, title)
+                    break
                else:
                    result = await crawl.probe_page(
                        url, document)
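
A note on the stored version string: the branch above derives it from the feed's version URL by taking the last path segment, so a feed declaring JSON Feed 1.1 is recorded as 'json1.1'. A quick illustration (the URL follows the JSON Feed specification; the variable names exist only for this example):

# Illustration of the version string computed above.
version_url = "https://jsonfeed.org/version/1.1"
version = 'json' + version_url.split('/').pop()
print(version)  # json1.1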

@@ -496,6 +576,144 @@ async def add_feed(db_file, url):
     return response
 
 
+async def scan_json(db_file, url):
+    """
+    Check feeds for new entries.
+
+    Parameters
+    ----------
+    db_file : str
+        Path to database file.
+    url : str, optional
+        URL. The default is None.
+    """
+    if isinstance(url, tuple): url = url[0]
+    result = await fetch.http(url)
+    try:
+        document = result[0]
+        status = result[1]
+    except:
+        return
+    new_entries = []
+    if document and status == 200:
+        feed = json.loads(document)
+        entries = feed["items"]
+        await remove_nonexistent_entries_json(
+            db_file, url, feed)
+        try:
+            feed_id = await sqlite.get_feed_id(db_file, url)
+            # await sqlite.update_feed_validity(
+            #     db_file, feed_id, valid)
+            if "date_published" in feed.keys():
+                updated = feed["date_published"]
+                try:
+                    updated = convert_struct_time_to_iso8601(updated)
+                except:
+                    updated = ''
+            else:
+                updated = ''
+            feed_id = await sqlite.get_feed_id(db_file, url)
+            await sqlite.update_feed_properties(
+                db_file, feed_id, len(feed["items"]), updated)
+            # await update_feed_status
+        except (
+            IncompleteReadError,
+            IncompleteRead,
+            error.URLError
+            ) as e:
+            logging.error(e)
+            return
+        # new_entry = 0
+        for entry in entries:
+            if "date_published" in entry.keys():
+                date = entry["date_published"]
+                date = rfc2822_to_iso8601(date)
+            elif "date_modified" in entry.keys():
+                date = entry["date_modified"]
+                date = rfc2822_to_iso8601(date)
+            else:
+                date = now()
+            if "url" in entry.keys():
+                # link = complete_url(source, entry.link)
+                link = join_url(url, entry["url"])
+                link = trim_url(link)
+            else:
+                link = url
+            # title = feed["feed"]["title"]
+            # title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
+            title = entry["title"] if "title" in entry.keys() else date
+            entry_id = entry["id"] if "id" in entry.keys() else link
+            feed_id = await sqlite.get_feed_id(db_file, url)
+            exist = await sqlite.check_entry_exist(
+                db_file, feed_id, entry_id=entry_id,
+                title=title, link=link, date=date)
+            if not exist:
+                summary = entry["summary"] if "summary" in entry.keys() else ''
+                if not summary:
+                    summary = entry["content_html"] if "content_html" in entry.keys() else ''
+                if not summary:
+                    summary = entry["content_text"] if "content_text" in entry.keys() else ''
+                read_status = 0
+                pathname = urlsplit(link).path
+                string = (
+                    "{} {} {}"
+                    ).format(
+                        title, summary, pathname)
+                allow_list = await config.is_include_keyword(
+                    db_file, "filter-allow", string)
+                if not allow_list:
+                    reject_list = await config.is_include_keyword(
+                        db_file, "filter-deny", string)
+                    if reject_list:
+                        read_status = 1
+                        logging.debug(
+                            "Rejected : {}\n"
+                            "Keyword : {}".format(
+                                link, reject_list))
+                if isinstance(date, int):
+                    logging.error(
+                        "Variable 'date' is int: {}".format(date))
+                media_link = ''
+                if "attachments" in entry.keys():
+                    for e_link in entry["attachments"]:
+                        try:
+                            # if (link.rel == "enclosure" and
+                            #     (link.type.startswith("audio/") or
+                            #      link.type.startswith("image/") or
+                            #      link.type.startswith("video/"))
+                            #     ):
+                            media_type = e_link["mime_type"][:e_link["mime_type"].index("/")]
+                            if media_type in ("audio", "image", "video"):
+                                media_link = e_link["url"]
+                                media_link = join_url(url, e_link["url"])
+                                media_link = trim_url(media_link)
+                                break
+                        except:
+                            logging.error(
+                                "KeyError: 'url'\n"
+                                "Missing 'url' attribute for {}".format(url))
+                            logging.info(
+                                "Continue scanning for next potential "
+                                "enclosure of {}".format(link))
+                entry = {
+                    "title": title,
+                    "link": link,
+                    "enclosure": media_link,
+                    "entry_id": entry_id,
+                    "date": date,
+                    "read_status": read_status
+                    }
+                new_entries.extend([entry])
+                # await sqlite.add_entry(
+                #     db_file, title, link, entry_id,
+                #     url, date, read_status)
+                # await sqlite.set_date(db_file, url)
+    if len(new_entries):
+        feed_id = await sqlite.get_feed_id(db_file, url)
+        await sqlite.add_entries_and_update_timestamp(
+            db_file, feed_id, new_entries)
 
 
 async def view_feed(url):
     while True:
         result = await fetch.http(url)
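
The entries scan_json() iterates over are the dictionaries in the feed's items array; the keys it reads are id, url, title, date_published, date_modified, summary, content_html, content_text and attachments (each attachment with url and mime_type). Below is a hypothetical item and the record shape it would be turned into, using the same keys as the dictionary built above; the values are invented for illustration.

# Hypothetical JSON Feed item, shaped like the ones scan_json() walks over.
item = {
    "id": "https://example.org/second",
    "url": "https://example.org/second",
    "title": "Second post",
    "date_published": "Mon, 05 Feb 2024 10:00:00 +0000",
    "content_text": "Short body of the post.",
    "attachments": [
        {"url": "https://example.org/second/audio.ogg", "mime_type": "audio/ogg"}
    ]
}

# Record handed to sqlite.add_entries_and_update_timestamp() in the code above;
# in the real code the date goes through rfc2822_to_iso8601() and the link and
# enclosure go through join_url() and trim_url().
entry = {
    "title": item["title"],
    "link": item["url"],
    "enclosure": item["attachments"][0]["url"],
    "entry_id": item["id"],
    "date": item["date_published"],
    "read_status": 0
}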

@@ -845,7 +1063,6 @@ async def extract_image_from_feed(db_file, feed_id, url):
         logging.error(url)
         logging.error(
             "AttributeError: object has no attribute 'link'")
-        breakpoint()
 
 
 async def extract_image_from_html(url):

@@ -1024,3 +1241,75 @@ async def remove_nonexistent_entries(db_file, url, feed):
                 await sqlite.archive_entry(db_file, ix)
     limit = await get_setting_value(db_file, "archive")
     await sqlite.maintain_archive(db_file, limit)
+
+
+async def remove_nonexistent_entries_json(db_file, url, feed):
+    """
+    Remove entries that don't exist in a given parsed feed.
+    Check the entries returned from feed and delete read non
+    existing entries, otherwise move to table archive, if unread.
+
+    Parameters
+    ----------
+    db_file : str
+        Path to database file.
+    url : str
+        Feed URL.
+    feed : list
+        Parsed feed document.
+    """
+    feed_id = await sqlite.get_feed_id(db_file, url)
+    items = await sqlite.get_entries_of_feed(db_file, feed_id)
+    entries = feed["items"]
+    for item in items:
+        ix = item[0]
+        entry_title = item[1]
+        entry_link = item[2]
+        entry_id = item[3]
+        timestamp = item[4]
+        read_status = item[5]
+        valid = False
+        for entry in entries:
+            title = None
+            link = None
+            time = None
+            # valid = False
+            # TODO better check and don't repeat code
+            if "id" in entry and entry_id:
+                if entry["id"] == entry_id:
+                    # print("compare1:", entry.id)
+                    # print("compare2:", entry_id)
+                    # print("============")
+                    valid = True
+                    break
+            else:
+                if "title" in entry:
+                    title = entry["title"]
+                else:
+                    title = feed["title"]
+                if "link" in entry:
+                    link = join_url(url, entry["link"])
+                else:
+                    link = url
+                # "date_published" "date_modified"
+                if "date_published" in entry and timestamp:
+                    time = rfc2822_to_iso8601(entry["date_published"])
+                    if (entry_title == title and
+                        entry_link == link and
+                        timestamp == time):
+                        valid = True
+                        break
+                else:
+                    if (entry_title == title and
+                        entry_link == link):
+                        valid = True
+                        break
+        if not valid:
+            print("CHECK ENTRY OF JSON FEED IN ARCHIVE")
+            if read_status == 1:
+                await sqlite.delete_entry_by_id(db_file, ix)
+            else:
+                await sqlite.archive_entry(db_file, ix)
+    limit = await get_setting_value(db_file, "archive")
+    await sqlite.maintain_archive(db_file, limit)

@@ -15,6 +15,39 @@ TODO
 
 2) Consider merging with module fetch.py
 
+3) Mark redirects for manual check
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/atom.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/feed.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/news.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/news.xml.php
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/rdf.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/rss.xml
+
+Title : JSON Feed
+Link : https://www.jsonfeed.org/feed.json/videos.xml
+
 """
 
 from aiohttp import ClientError, ClientSession, ClientTimeout

slixfeed/version.py (new file, 2 lines)

@@ -0,0 +1,2 @@
+__version__ = '1.0.0'
+__version_info__ = (1, 0, 0)