Add support for JSON Feed

Schimon Jehudah 2024-01-20 17:28:31 +00:00
parent 4c84f40e0e
commit 0512e14738
5 changed files with 346 additions and 9 deletions

View file

@@ -14,21 +14,33 @@ Slixfeed is primarily designed for XMPP (aka Jabber), yet it is built to be exte
### Features
#### Simultaneous
Slixfeed is designed to handle multiple contacts, including groupchats, simultaneously.
#### Ease
Slixfeed automatically scans (i.e. crawls) for web feeds of a given URL.
#### Export
Download articles as ePUB, HTML, Markdown and PDF.
#### Filtering
Slixfeed provides positive and negative ways to filter by allow and deny lists.
Filter using allow and deny lists.
#### Multimedia
Display audio, pictures and videos inline.
#### Portable
Export and import feeds with a standard OPML file.
#### Proxy
Redirect to alternative online back-ends, such as Invidious, Librarian, Nitter, for increased privacy and productivity and security.
Redirect to alternative back-ends, such as Invidious, Librarian, Nitter, for increased privacy, productivity and security.
#### Simultaneous
Slixfeed is designed to handle multiple contacts, including groupchats, simultaneously.
## Getting Started

View file

@@ -0,0 +1 @@
__version__ = "1.0.0"

View file

@@ -26,8 +26,9 @@ TODO
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from http.client import IncompleteRead
from feedparser import parse
from http.client import IncompleteRead
import json
import logging
from lxml import html
import slixfeed.config as config
@@ -102,6 +103,38 @@ def log_to_markdown(timestamp, filename, jid, message):
file.write(entry)
def is_feed_json(document):
"""
NOTE /kurtmckee/feedparser/issues/103
Determine whether document is json feed or not.
Parameters
----------
document : str
    Document to be checked.
Returns
-------
value : boolean
True or False.
"""
value = False
try:
feed = json.loads(document)
except ValueError:
return value
if not isinstance(feed, dict):
return value
if feed.get('items'):
value = True
elif 'version' in feed and 'jsonfeed' in feed['version']:
value = True
# elif 'title' in feed:
# value = True
return value
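# A minimal usage sketch (illustrative only; the JSON document below is a
# made-up JSON Feed 1.1 payload, not taken from the codebase or test suite):
#
# >>> is_feed_json('{"version": "https://jsonfeed.org/version/1.1", "title": "Demo", "items": []}')
# True
# >>> is_feed_json('<html><body>Not a feed</body></html>')
# False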
def is_feed(feed):
"""
Determine whether document is feed or not.
@@ -120,7 +153,7 @@ def is_feed(feed):
# message = None
if not feed.entries:
if "version" in feed.keys():
feed["version"]
# feed["version"]
if feed.version:
value = True
# message = (
@@ -471,6 +504,53 @@ async def add_feed(db_file, url):
"added to subscription list."
).format(url, title)
break
# NOTE This elif statement will be unnecessary
# once feedparser supports JSON Feed.
elif is_feed_json(document):
feed = json.loads(document)
if "title" in feed.keys():
title = feed["title"]
else:
title = urlsplit(url).netloc
if "language" in feed.keys():
language = feed["language"]
else:
language = ''
if "encoding" in feed.keys():
encoding = feed["encoding"]
else:
encoding = ''
if "date_published" in feed.keys():
updated = feed["date_published"]
try:
updated = convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
version = 'json' + feed["version"].split('/').pop()
entries = len(feed["items"])
await sqlite.insert_feed(
db_file, url,
title=title,
entries=entries,
version=version,
encoding=encoding,
language=language,
status_code=status_code,
updated=updated
)
await scan_json(
db_file, url)
old = await get_setting_value(db_file, "old")
if not old:
await sqlite.mark_feed_as_read(
db_file, url)
response = (
"> {}\nNews source \"{}\" has been "
"added to subscription list."
).format(url, title)
break
else:
result = await crawl.probe_page(
url, document)
@@ -496,6 +576,144 @@ async def add_feed(db_file, url):
return response
async def scan_json(db_file, url):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str
    URL.
"""
if isinstance(url, tuple): url = url[0]
result = await fetch.http(url)
try:
document = result[0]
status = result[1]
except:
return
new_entries = []
if document and status == 200:
feed = json.loads(document)
entries = feed["items"]
await remove_nonexistent_entries_json(
db_file, url, feed)
try:
feed_id = await sqlite.get_feed_id(db_file, url)
# await sqlite.update_feed_validity(
# db_file, feed_id, valid)
if "date_published" in feed.keys():
updated = feed["date_published"]
try:
updated = convert_struct_time_to_iso8601(updated)
except:
updated = ''
else:
updated = ''
feed_id = await sqlite.get_feed_id(db_file, url)
await sqlite.update_feed_properties(
db_file, feed_id, len(feed["items"]), updated)
# await update_feed_status
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
logging.error(e)
return
# new_entry = 0
for entry in entries:
if "date_published" in entry.keys():
date = entry["date_published"]
date = rfc2822_to_iso8601(date)
elif "date_modified" in entry.keys():
date = entry["date_modified"]
date = rfc2822_to_iso8601(date)
else:
date = now()
if "url" in entry.keys():
# link = complete_url(source, entry.link)
link = join_url(url, entry["url"])
link = trim_url(link)
else:
link = url
# title = feed["feed"]["title"]
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
title = entry["title"] if "title" in entry.keys() else date
entry_id = entry["id"] if "id" in entry.keys() else link
feed_id = await sqlite.get_feed_id(db_file, url)
exist = await sqlite.check_entry_exist(
db_file, feed_id, entry_id=entry_id,
title=title, link=link, date=date)
if not exist:
summary = entry["summary"] if "summary" in entry.keys() else ''
if not summary:
summary = entry["content_html"] if "content_html" in entry.keys() else ''
if not summary:
summary = entry["content_text"] if "content_text" in entry.keys() else ''
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title, summary, pathname)
allow_list = await config.is_include_keyword(
db_file, "filter-allow", string)
if not allow_list:
reject_list = await config.is_include_keyword(
db_file, "filter-deny", string)
if reject_list:
read_status = 1
logging.debug(
"Rejected : {}\n"
"Keyword : {}".format(
link, reject_list))
if isinstance(date, int):
logging.error(
"Variable 'date' is int: {}".format(date))
media_link = ''
if "attachments" in entry.keys():
for e_link in entry["attachments"]:
try:
# if (link.rel == "enclosure" and
# (link.type.startswith("audio/") or
# link.type.startswith("image/") or
# link.type.startswith("video/"))
# ):
media_type = e_link["mime_type"][:e_link["mime_type"].index("/")]
if media_type in ("audio", "image", "video"):
media_link = e_link["url"]
media_link = join_url(url, e_link["url"])
media_link = trim_url(media_link)
break
except:
logging.error(
"KeyError: 'url'\n"
"Missing 'url' attribute for {}".format(url))
logging.info(
"Continue scanning for next potential "
"enclosure of {}".format(link))
entry = {
"title": title,
"link": link,
"enclosure": media_link,
"entry_id": entry_id,
"date": date,
"read_status": read_status
}
new_entries.append(entry)
# await sqlite.add_entry(
# db_file, title, link, entry_id,
# url, date, read_status)
# await sqlite.set_date(db_file, url)
if len(new_entries):
feed_id = await sqlite.get_feed_id(db_file, url)
await sqlite.add_entries_and_update_timestamp(
db_file, feed_id, new_entries)
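# For reference, a sketch of the JSON Feed item shape that scan_json() reads
# (field names follow the JSON Feed specification; values are made-up examples):
#
# {
#     "id": "https://example.com/posts/1",
#     "url": "https://example.com/posts/1",
#     "title": "Example post",
#     "content_html": "<p>Body</p>",
#     "date_published": "2024-01-20T17:28:31+00:00",
#     "attachments": [
#         {"url": "https://example.com/a.mp3", "mime_type": "audio/mpeg"}
#     ]
# }
#
# When "title", "id" or "url" are missing, scan_json() falls back to the entry
# date, the link and the feed URL respectively.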
async def view_feed(url):
while True:
result = await fetch.http(url)
@@ -845,7 +1063,6 @@ async def extract_image_from_feed(db_file, feed_id, url):
logging.error(url)
logging.error(
"AttributeError: object has no attribute 'link'")
breakpoint()
async def extract_image_from_html(url):
@@ -1024,3 +1241,75 @@ async def remove_nonexistent_entries(db_file, url, feed):
await sqlite.archive_entry(db_file, ix)
limit = await get_setting_value(db_file, "archive")
await sqlite.maintain_archive(db_file, limit)
async def remove_nonexistent_entries_json(db_file, url, feed):
"""
Remove entries that no longer exist in a given parsed feed.
Check the entries returned from the feed and delete entries that are
read and no longer present; move unread ones to the archive table.
Parameters
----------
db_file : str
Path to database file.
url : str
Feed URL.
feed : dict
Parsed JSON feed document.
"""
feed_id = await sqlite.get_feed_id(db_file, url)
items = await sqlite.get_entries_of_feed(db_file, feed_id)
entries = feed["items"]
for item in items:
ix = item[0]
entry_title = item[1]
entry_link = item[2]
entry_id = item[3]
timestamp = item[4]
read_status = item[5]
valid = False
for entry in entries:
title = None
link = None
time = None
# valid = False
# TODO better check and don't repeat code
if entry.has_key("id") and entry_id:
if entry["id"] == entry_id:
# print("compare1:", entry.id)
# print("compare2:", entry_id)
# print("============")
valid = True
break
else:
if entry.has_key("title"):
title = entry["title"]
else:
title = feed["title"]
if entry.has_key("link"):
link = join_url(url, entry["link"])
else:
link = url
# "date_published" "date_modified"
if entry.has_key("date_published") and timestamp:
time = rfc2822_to_iso8601(entry["date_published"])
if (entry_title == title and
entry_link == link and
timestamp == time):
valid = True
break
else:
if (entry_title == title and
entry_link == link):
valid = True
break
if not valid:
print("CHECK ENTRY OF JSON FEED IN ARCHIVE")
if read_status == 1:
await sqlite.delete_entry_by_id(db_file, ix)
else:
await sqlite.archive_entry(db_file, ix)
limit = await get_setting_value(db_file, "archive")
await sqlite.maintain_archive(db_file, limit)

View file

@@ -15,6 +15,39 @@ TODO
2) Consider merging with module fetch.py
3) Mark redirects for manual check
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml
"""
from aiohttp import ClientError, ClientSession, ClientTimeout

slixfeed/version.py Normal file
View file

@@ -0,0 +1,2 @@
__version__ = '1.0.0'
__version_info__ = (1, 0, 0)