Segregate code into more particular functions

Schimon Jehudah, 2024-01-04 01:16:24 +00:00
parent 96f3369539 · commit 7135994888
13 changed files with 995 additions and 937 deletions

slixfeed/action.py (new file, 369 lines)

@ -0,0 +1,369 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from asyncio.exceptions import IncompleteReadError
from bs4 import BeautifulSoup
from http.client import IncompleteRead
from feedparser import parse
import slixfeed.config as config
import slixfeed.crawl as crawl
from slixfeed.datetime import now, rfc2822_to_iso8601
import slixfeed.fetch as fetch
import slixfeed.sqlite as sqlite
import slixfeed.read as read
import slixfeed.task as task
from slixfeed.url import complete_url, join_url, trim_url
from urllib import error
from urllib.parse import urlsplit
async def add_feed(db_file, url):
while True:
exist = await sqlite.is_feed_exist(db_file, url)
if not exist:
result = await fetch.download_feed([url])
document = result[0]
status = result[1]
if document:
feed = parse(document)
# if read.is_feed(url, feed):
if read.is_feed(feed):
try:
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
await sqlite.insert_feed(
db_file, url, title, status)
await organize_items(
db_file, [url])
old = await sqlite.get_settings_value(
db_file, "old")
if not old:
await sqlite.mark_source_as_read(
db_file, url)
response = (
"> {}\nNews source {} has been "
"added to subscription list."
).format(url, title)
break
else:
result = await crawl.probe_page(
url, document)
                        # TODO Check the length and write a
                        # unified message for a set of feeds.
                        # Use logging if you choose to
                        # distinguish the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
response = result
break
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
break
else:
ix = exist[0]
name = exist[1]
response = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
break
return response
async def view_feed(url):
while True:
result = await fetch.download_feed([url])
document = result[0]
status = result[1]
if document:
feed = parse(document)
# if read.is_feed(url, feed):
if read.is_feed(feed):
try:
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
entries = feed.entries
response = "Preview of {}:\n\n```\n".format(title)
counter = 0
for entry in entries:
counter += 1
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
response += (
"Title : {}\n"
"Date : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(title, date, link, counter)
if counter > 4:
break
response += (
"```\nSource: {}"
).format(url)
break
else:
result = await crawl.probe_page(
url, document)
                # TODO Check the length and write a
                # unified message for a set of feeds.
                # Use logging if you choose to
                # distinguish the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
response = result
break
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
break
return response
async def view_entry(url, num):
while True:
result = await fetch.download_feed([url])
document = result[0]
status = result[1]
if document:
feed = parse(document)
# if read.is_feed(url, feed):
if read.is_feed(feed):
try:
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
entries = feed.entries
num = int(num) - 1
entry = entries[num]
response = "Preview of {}:\n\n```\n".format(title)
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
else:
summary = "*** No summary ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
response = (
"{}\n"
"\n"
# "> {}\n"
"{}\n"
"\n"
"{}\n"
"\n"
).format(title, summary, link)
break
else:
result = await crawl.probe_page(
url, document)
                # TODO Check the length and write a
                # unified message for a set of feeds.
                # Use logging if you choose to
                # distinguish the methods
if isinstance(result, list):
url = result[0]
elif isinstance(result, str):
response = result
break
else:
response = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
break
return response
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def organize_items(db_file, urls):
"""
Check feeds for new entries.
Parameters
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
for url in urls:
# print(os.path.basename(db_file), url[0])
source = url[0]
res = await fetch.download_feed(source)
# TypeError: 'NoneType' object is not subscriptable
if res is None:
# Skip to next feed
# urls.next()
# next(urls)
continue
await sqlite.update_source_status(
db_file, res[1], source)
if res[0]:
try:
feed = parse(res[0])
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(source)
# print(bozo)
valid = 0
else:
valid = 1
await sqlite.update_source_validity(
db_file, source, valid)
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
None
# NOTE I don't think there should be "return"
# because then we might stop scanning next URLs
# return
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
if res[1] == 200:
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
entries = feed.entries
# length = len(entries)
# await remove_entry(db_file, source, length)
await sqlite.remove_nonexistent_entries(
db_file, feed, source)
# new_entry = 0
for entry in entries:
                    # TODO Pass date too for comparison check
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
# TODO Just set date = "*** No date ***"
# date = await datetime.now().isoformat()
date = now()
# NOTE Would seconds result in better database performance
# date = datetime.datetime(date)
# date = (date-datetime.datetime(1970,1,1)).total_seconds()
if entry.has_key("title"):
title = entry.title
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
else:
title = date
# title = feed["feed"]["title"]
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(source, entry.link)
link = trim_url(link)
else:
link = source
if entry.has_key("id"):
eid = entry.id
else:
eid = link
exist = await sqlite.check_entry_exist(
db_file, source, eid=eid,
title=title, link=link, date=date)
if not exist:
# new_entry = new_entry + 1
# TODO Enhance summary
if entry.has_key("summary"):
summary = entry.summary
# # Remove HTML tags
# summary = BeautifulSoup(summary, "lxml").text
# # TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
# summary = summary[:300] + " […]‍⃨"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
else:
summary = "> *** No summary ***"
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title,
summary,
pathname
)
allow_list = await config.is_listed(
db_file, "filter-allow", string)
if not allow_list:
reject_list = await config.is_listed(
db_file, "filter-deny", string)
if reject_list:
# print(">>> REJECTED", title)
summary = (
"REJECTED {}".format(
reject_list.upper()
)
)
# summary = ""
read_status = 1
entry = (
title, link, eid, source, date, read_status)
if isinstance(date, int):
print("PROBLEM: date is int")
print(date)
# breakpoint()
# print(source)
# print(date)
await sqlite.add_entry_and_set_date(
db_file, source, entry)
# print(current_time(), entry, title)
# else:
# print(current_time(), exist, title)
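
A minimal sketch of how this new action layer might be driven (the database path is hypothetical and error handling is omitted; it mirrors the calls made from task.check_updates and the XMPP "add" handler elsewhere in this commit):

    import asyncio
    import slixfeed.action as action
    import slixfeed.sqlite as sqlite

    async def scan_once(db_file):
        # Rows returned by get_feeds_url carry the address in field 0,
        # which is what organize_items expects.
        urls = await sqlite.get_feeds_url(db_file)
        await action.organize_items(db_file, urls)

    # asyncio.run(scan_once("user@example.org.db"))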

View file

@ -59,8 +59,9 @@ def get_value(filename, section, keys):
         for key in keys:
             try:
                 value = section_res[key]
-                logging.debug("Found value {} for key {}".format(
-                    value, key))
+                logging.debug(
+                    "Found value {} for key {}".format(value, key)
+                )
             except:
                 value = ''
                 logging.error("Missing key:", key)
@ -70,7 +71,8 @@ def get_value(filename, section, keys):
         try:
             result = section_res[key]
             logging.debug(
-                "Found value {} for key {}".format(result, key))
+                "Found value {} for key {}".format(result, key)
+            )
         except:
             result = ''
             # logging.error("Missing key:", key)
@ -78,7 +80,8 @@ def get_value(filename, section, keys):
         logging.error(
             "Check configuration file {}.ini for "
             "missing key(s) \"{}\" under section [{}].".format(
-                filename, keys, section))
+                filename, keys, section)
+        )
     else:
         return result
@ -171,7 +174,9 @@ def get_default_dbdir():
         else:
             return os.path.abspath('.')
     else:
-        data_home = os.path.join(os.environ.get('HOME'), '.local', 'share')
+        data_home = os.path.join(
+            os.environ.get('HOME'), '.local', 'share'
+        )
     return os.path.join(data_home, 'slixfeed')
@ -200,7 +205,9 @@ def get_default_confdir():
         else:
             return os.path.abspath('.')
     else:
-        config_home = os.path.join(os.environ.get('HOME'), '.config')
+        config_home = os.path.join(
+            os.environ.get('HOME'), '.config'
+        )
     return os.path.join(config_home, 'slixfeed')
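
For context, get_value is read throughout the new modules roughly like this (the section and key names below are taken from calls visible elsewhere in this diff):

    import slixfeed.config as config

    # Returns the configured proxy, or '' when the key is absent.
    proxy = config.get_value("settings", "Network", "http_proxy") or ''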

slixfeed/crawl.py (new file, 382 lines)

@ -0,0 +1,382 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1.1) Do not compose messages.
1.2) Return URLs, nothing else (i.e. no processed messages).
1.3) Correction of URLs is acceptable.
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
from lxml import html
import slixfeed.config as config
from slixfeed.fetch import download_feed
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit
# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
# callback, url, document, num=None, db_file=None):
# result = None
# try:
# # tree = etree.fromstring(res[0]) # etree is for xml
# tree = html.fromstring(document)
# except:
# result = (
# "> {}\nFailed to parse URL as feed."
# ).format(url)
# if not result:
# print("RSS Auto-Discovery Engaged")
# result = await feed_mode_auto_discovery(url, tree)
# if not result:
# print("RSS Scan Mode Engaged")
# result = await feed_mode_scan(url, tree)
# if not result:
# print("RSS Arbitrary Mode Engaged")
# result = await feed_mode_request(url, tree)
# if not result:
# result = (
# "> {}\nNo news feeds were found for URL."
# ).format(url)
# # elif msg:
# else:
# if isinstance(result, str):
# return result
# elif isinstance(result, list):
# url = result[0]
# if db_file:
# # print("if db_file", db_file)
# return await callback(db_file, url)
# elif num:
# return await callback(url, num)
# else:
# return await callback(url)
async def probe_page(url, document):
"""
Parameters
----------
url : str
URL.
document : TYPE
DESCRIPTION.
Returns
-------
result : list or str
Single URL as list or selection of URLs as str.
"""
result = None
try:
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(document)
except:
result = (
"> {}\nFailed to parse URL as feed."
).format(url)
if not result:
print("RSS Auto-Discovery Engaged")
result = await feed_mode_auto_discovery(url, tree)
if not result:
print("RSS Scan Mode Engaged")
result = await feed_mode_scan(url, tree)
if not result:
print("RSS Arbitrary Mode Engaged")
result = await feed_mode_request(url, tree)
if not result:
result = (
"> {}\nNo news feeds were found for URL."
).format(url)
return result
# TODO Improve scan by gradual decreasing of path
async def feed_mode_request(url, tree):
"""
    Look up feeds by pathname using HTTP requests.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
parted_url = urlsplit(url)
paths = config.get_list("lists.yaml")
paths = paths["pathnames"]
for path in paths:
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
# print(parse(res[0])["feed"]["title"])
# feeds[address] = parse(res[0])["feed"]["title"]
try:
title = parse(res[0])["feed"]["title"]
except:
title = '*** No Title ***'
feeds[address] = title
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
paths.extend(
[".atom", ".feed", ".rdf", ".rss"]
) if '.rss' not in paths else -1
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
parted_url.path.split('/')[1] + path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds)
except:
continue
if len(feeds) > 1:
counter = 0
msg = (
"RSS URL discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
# AttributeError: 'str' object has no attribute 'entries'
try:
feed_amnt = len(feeds[feed].entries)
except:
continue
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title: {}\n"
"Link : {}\n"
"Items: {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
async def feed_mode_scan(url, tree):
"""
Scan page for potential feeds by pathname.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
# paths = []
# TODO Test
paths = config.get_list("lists.yaml")
paths = paths["pathnames"]
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
addresses += tree.xpath(xpath_query)
parted_url = urlsplit(url)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
# print(address.xpath('@href')[0])
# print(addresses)
address = address.xpath('@href')[0]
if "/" not in address:
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
if address.startswith('/'):
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds[address])
# breakpoint()
# print(feeds)
except:
continue
if len(feeds) > 1:
# print(feeds)
# breakpoint()
counter = 0
msg = (
"RSS URL scan has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
# try:
# res = await download_feed(feed)
# except:
# continue
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
feed_amnt = len(feeds[feed].entries)
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
async def feed_mode_auto_discovery(url, tree):
"""
    Look up feeds using the RSS auto-discovery technique.
See: https://www.rssboard.org/rss-autodiscovery
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
xpath_query = (
'//link[(@rel="alternate") and '
'(@type="application/atom+xml" or '
'@type="application/rdf+xml" or '
'@type="application/rss+xml")]'
)
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
if len(feeds) > 1:
msg = (
"RSS Auto-Discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
for feed in feeds:
# # The following code works;
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await download_feed(feed)
# if res[0]:
# disco = parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
feed_name = feed.xpath('@title')[0]
feed_addr = join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
return msg
elif feeds:
feed_addr = join_url(url, feeds[0].xpath('@href')[0])
return [feed_addr]
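
As a rough illustration of what feed_mode_auto_discovery matches, the same XPath applied to a small hand-written page (the HTML below is invented for the example):

    from lxml import html

    sample = (
        '<html><head>'
        '<link rel="alternate" type="application/rss+xml" '
        'title="Example news" href="/feed.rss"/>'
        '</head><body></body></html>'
    )
    tree = html.fromstring(sample)
    links = tree.xpath(
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
    )
    for link in links:
        # Prints: Example news /feed.rss
        print(link.xpath('@title')[0], link.xpath('@href')[0])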

View file

@ -33,454 +33,24 @@ from http.client import IncompleteRead
 from lxml import html
 import slixfeed.config as config
 from slixfeed.datetime import now, rfc2822_to_iso8601
-import slixfeed.utility as utility
 import slixfeed.sqlite as sqlite
 from slixfeed.url import complete_url, join_url, trim_url
 from urllib import error
 # from xml.etree.ElementTree import ElementTree, ParseError
 from urllib.parse import urlsplit, urlunsplit
# NOTE Why (if res[0]) and (if res[1] == 200)?
async def download_updates(db_file, url=None):
"""
Check feeds for new entries.
Parameters
# async def dat():
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
"""
if url:
urls = [url] # Valid [url] and [url,] and (url,)
else:
urls = await sqlite.get_feeds_url(db_file)
for url in urls:
# print(os.path.basename(db_file), url[0])
source = url[0]
res = await download_feed(source)
# TypeError: 'NoneType' object is not subscriptable
if res is None:
# Skip to next feed
# urls.next()
# next(urls)
continue
await sqlite.update_source_status(
db_file, res[1], source)
if res[0]:
try:
feed = parse(res[0])
if feed.bozo:
# bozo = (
# "WARNING: Bozo detected for feed: {}\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(source)
# print(bozo)
valid = 0
else:
valid = 1
await sqlite.update_source_validity(
db_file, source, valid)
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
None
# NOTE I don't think there should be "return"
# because then we might stop scanning next URLs
# return
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
if res[1] == 200:
# NOT SURE WHETHER I MEANT THE LINES ABOVE OR BELOW
# TODO Place these couple of lines back down
# NOTE Need to correct the SQL statement to do so
entries = feed.entries
# length = len(entries)
# await remove_entry(db_file, source, length)
await sqlite.remove_nonexistent_entries(
db_file, feed, source)
# new_entry = 0
for entry in entries:
# TODO Pass date too for comparion check
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
# TODO Just set date = "*** No date ***"
# date = await datetime.now().isoformat()
date = now()
# NOTE Would seconds result in better database performance
# date = datetime.datetime(date)
# date = (date-datetime.datetime(1970,1,1)).total_seconds()
if entry.has_key("title"):
title = entry.title
# title = "{}: *{}*".format(feed["feed"]["title"], entry.title)
else:
title = date
# title = feed["feed"]["title"]
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(source, entry.link)
link = trim_url(link)
else:
link = source
if entry.has_key("id"):
eid = entry.id
else:
eid = link
exist = await sqlite.check_entry_exist(
db_file, source, eid=eid,
title=title, link=link, date=date)
if not exist:
# new_entry = new_entry + 1
# TODO Enhance summary
if entry.has_key("summary"):
summary = entry.summary
# # Remove HTML tags
# summary = BeautifulSoup(summary, "lxml").text
# # TODO Limit text length
# summary = summary.replace("\n\n\n", "\n\n")
# summary = summary[:300] + " […]‍⃨"
# summary = summary.strip().split('\n')
# summary = ["> " + line for line in summary]
# summary = "\n".join(summary)
else:
summary = "> *** No summary ***"
read_status = 0
pathname = urlsplit(link).path
string = (
"{} {} {}"
).format(
title,
summary,
pathname
)
allow_list = await config.is_listed(
db_file, "filter-allow", string)
if not allow_list:
reject_list = await config.is_listed(
db_file, "filter-deny", string)
if reject_list:
# print(">>> REJECTED", title)
summary = (
"REJECTED {}".format(
reject_list.upper()
)
)
# summary = ""
read_status = 1
entry = (
title, link, eid, source, date, read_status)
if isinstance(date, int):
print("PROBLEM: date is int")
print(date)
# breakpoint()
# print(source)
# print(date)
await sqlite.add_entry_and_set_date(
db_file, source, entry)
# print(current_time(), entry, title)
# else:
# print(current_time(), exist, title)
# async def ftp():
# NOTE Why (if result[0]) and (if result[1] == 200)?
# async def gemini():
async def view_feed(url):
"""
Check feeds for new entries.
Parameters
# async def gopher():
----------
db_file : str
Path to database file.
url : str, optional
URL. The default is None.
Returns
# async def http():
-------
msg : str
Feed content or error message.
"""
result = await download_feed(url)
if result[0]:
try:
feed = parse(result[0])
if feed.bozo:
# msg = (
# ">{}\n"
# "WARNING: Bozo detected!\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
msg = await probe_page(view_feed, url, result[0])
return msg
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
msg = (
"> {}\n"
"Error: {}"
).format(url, e)
# breakpoint()
if result[1] == 200:
feed = parse(result[0])
title = utility.get_title(url, feed)
entries = feed.entries
msg = "Preview of {}:\n\n```\n".format(title)
counter = 0
for entry in entries:
counter += 1
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
msg += (
"Title : {}\n"
"Date : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(title, date, link, counter)
if counter > 4:
break
msg += (
"```\nSource: {}"
).format(url)
else:
msg = (
">{}\nFailed to load URL. Reason: {}"
).format(url, result[1])
return msg
# NOTE Why (if result[0]) and (if result[1] == 200)?
async def view_entry(url, num):
result = await download_feed(url)
if result[0]:
try:
feed = parse(result[0])
if feed.bozo:
# msg = (
# ">{}\n"
# "WARNING: Bozo detected!\n"
# "For more information, visit "
# "https://pythonhosted.org/feedparser/bozo.html"
# ).format(url)
msg = await probe_page(view_entry, url, result[0], num=num)
return msg
except (
IncompleteReadError,
IncompleteRead,
error.URLError
) as e:
# print(e)
# TODO Print error to log
msg = (
"> {}\n"
"Error: {}"
).format(url, e)
# breakpoint()
if result[1] == 200:
feed = parse(result[0])
title = utility.get_title(url, result[0])
entries = feed.entries
num = int(num) - 1
entry = entries[num]
if entry.has_key("title"):
title = entry.title
else:
title = "*** No title ***"
if entry.has_key("published"):
date = entry.published
date = rfc2822_to_iso8601(date)
elif entry.has_key("updated"):
date = entry.updated
date = rfc2822_to_iso8601(date)
else:
date = "*** No date ***"
if entry.has_key("summary"):
summary = entry.summary
# Remove HTML tags
summary = BeautifulSoup(summary, "lxml").text
# TODO Limit text length
summary = summary.replace("\n\n\n", "\n\n")
else:
summary = "*** No summary ***"
if entry.has_key("link"):
# link = complete_url(source, entry.link)
link = join_url(url, entry.link)
link = trim_url(link)
else:
link = "*** No link ***"
msg = (
"{}\n"
"\n"
"> {}\n"
"\n"
"{}\n"
"\n"
).format(title, summary, link)
else:
msg = (
">{}\n"
"Failed to load URL. Reason: {}\n"
"Try again momentarily."
).format(url, result[1])
return msg
async def add_feed_no_check(db_file, data):
"""
Add given feed without validity check.
Parameters
----------
db_file : str
Path to database file.
data : str
URL or URL and Title.
Returns
-------
msg : str
Status message.
"""
url = data[0]
title = data[1]
url = trim_url(url)
exist = await sqlite.is_feed_exist(db_file, url)
if not exist:
msg = await sqlite.insert_feed(db_file, url, title)
await download_updates(db_file, [url])
else:
ix = exist[0]
name = exist[1]
msg = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
return msg
async def add_feed(db_file, url):
"""
Check whether feed exist, otherwise process it.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
Returns
-------
msg : str
Status message.
"""
msg = None
url = trim_url(url)
exist = await sqlite.is_feed_exist(db_file, url)
if not exist:
res = await download_feed(url)
if res[0]:
feed = parse(res[0])
title = utility.get_title(url, feed)
if utility.is_feed(url, feed):
status = res[1]
await sqlite.insert_feed(
db_file, url, title, status)
await download_updates(db_file, [url])
title = title if title else url
msg = (
"> {}\nNews source \"{}\" has been added "
"to subscription list."
).format(url, title)
else:
msg = await probe_page(
add_feed, url, res[0], db_file=db_file)
else:
status = res[1]
msg = (
"> {}\nFailed to load URL. Reason: {}"
).format(url, status)
else:
ix = exist[0]
name = exist[1]
msg = (
"> {}\nNews source \"{}\" is already "
"listed in the subscription list at "
"index {}".format(url, name, ix)
)
return msg
# TODO callback for use with add_feed and view_feed
async def probe_page(callback, url, doc, num=None, db_file=None):
msg = None
try:
# tree = etree.fromstring(res[0]) # etree is for xml
tree = html.fromstring(doc)
except:
msg = (
"> {}\nFailed to parse URL as feed."
).format(url)
if not msg:
print("RSS Auto-Discovery Engaged")
msg = await feed_mode_auto_discovery(url, tree)
if not msg:
print("RSS Scan Mode Engaged")
msg = await feed_mode_scan(url, tree)
if not msg:
print("RSS Arbitrary Mode Engaged")
msg = await feed_mode_request(url, tree)
if not msg:
msg = (
"> {}\nNo news feeds were found for URL."
).format(url)
# elif msg:
else:
if isinstance(msg, str):
return msg
elif isinstance(msg, list):
url = msg[0]
if db_file:
# print("if db_file", db_file)
return await callback(db_file, url)
elif num:
return await callback(url, num)
else:
return await callback(url)
# async def ipfs():
 async def download_feed(url):
     """
@ -488,7 +58,7 @@ async def download_feed(url):
     Parameters
     ----------
-    url : str
+    url : list
         URL.
     Returns
@ -502,27 +72,23 @@ async def download_feed(url):
             user_agent = "Slixfeed/0.1"
     if not len(user_agent):
         user_agent = "Slixfeed/0.1"
+    headers = {'User-Agent': user_agent}
+    url = url[0]
     proxy = (config.get_value("settings", "Network", "http_proxy")) or ''
     timeout = ClientTimeout(total=10)
-    headers = {'User-Agent': user_agent}
     async with ClientSession(headers=headers) as session:
     # async with ClientSession(trust_env=True) as session:
         try:
-            async with session.get(
-                url,
-                proxy=proxy,
-                # proxy_auth=(proxy_username, proxy_password)
-                timeout=timeout
-                ) as response:
+            async with session.get(url, proxy=proxy,
+                                   # proxy_auth=(proxy_username, proxy_password),
+                                   timeout=timeout
+                                   ) as response:
                 status = response.status
                 if response.status == 200:
                     try:
                         doc = await response.text()
                         # print (response.content_type)
-                        msg = [
-                            doc,
-                            status
-                            ]
+                        msg = [doc, status]
                     except:
                         # msg = [
                         #     False,
@ -531,307 +97,20 @@ async def download_feed(url):
                         #     )
                         # ]
                         msg = [
-                            False,
-                            "Document is too large or is not textual."
+                            False, "Document is too large or is not textual."
                         ]
                 else:
                     msg = [
-                        False,
-                        "HTTP Error: " + str(status)
+                        False, "HTTP Error: " + str(status)
                     ]
         except ClientError as e:
             # print('Error', str(e))
             msg = [
-                False,
-                "Error: " + str(e)
+                False, "Error: " + str(e)
             ]
         except TimeoutError as e:
             # print('Timeout:', str(e))
             msg = [
-                False,
-                "Timeout: " + str(e)
+                False, "Timeout: " + str(e)
             ]
     return msg
# TODO Improve scan by gradual decreasing of path
async def feed_mode_request(url, tree):
"""
Lookup for feeds by pathname using HTTP Requests.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
parted_url = urlsplit(url)
paths = config.get_list("lists.yaml")
paths = paths["pathnames"]
for path in paths:
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
# print(parse(res[0])["feed"]["title"])
# feeds[address] = parse(res[0])["feed"]["title"]
try:
title = parse(res[0])["feed"]["title"]
except:
title = '*** No Title ***'
feeds[address] = title
# Check whether URL has path (i.e. not root)
# Check parted_url.path to avoid error in case root wasn't given
# TODO Make more tests
if parted_url.path and parted_url.path.split('/')[1]:
paths.extend(
[".atom", ".feed", ".rdf", ".rss"]
) if '.rss' not in paths else -1
# if paths.index('.rss'):
# paths.extend([".atom", ".feed", ".rdf", ".rss"])
address = urlunsplit([
parted_url.scheme,
parted_url.netloc,
parted_url.path.split('/')[1] + path,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds)
except:
continue
if len(feeds) > 1:
counter = 0
msg = (
"RSS URL discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
# AttributeError: 'str' object has no attribute 'entries'
try:
feed_amnt = len(feeds[feed].entries)
except:
continue
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title: {}\n"
"Link : {}\n"
"Items: {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
async def feed_mode_scan(url, tree):
"""
Scan page for potential feeds by pathname.
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
feeds = {}
# paths = []
# TODO Test
paths = config.get_list("lists.yaml")
paths = paths["pathnames"]
for path in paths:
# xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
# xpath_query = "//a[contains(@href,'{}')]".format(path)
num = 5
xpath_query = "(//a[contains(@href,'{}')])[position()<={}]".format(path, num)
addresses = tree.xpath(xpath_query)
xpath_query = "(//a[contains(@href,'{}')])[position()>last()-{}]".format(path, num)
addresses += tree.xpath(xpath_query)
parted_url = urlsplit(url)
# NOTE Should number of addresses be limited or
# perhaps be N from the start and N from the end
for address in addresses:
# print(address.xpath('@href')[0])
# print(addresses)
address = address.xpath('@href')[0]
if "/" not in address:
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
if address.startswith('/'):
protocol = parted_url.scheme
hostname = parted_url.netloc
pathname = address
address = urlunsplit([
protocol,
hostname,
pathname,
None,
None
])
res = await download_feed(address)
if res[1] == 200:
try:
feeds[address] = parse(res[0])
# print(feeds[address])
# breakpoint()
# print(feeds)
except:
continue
if len(feeds) > 1:
# print(feeds)
# breakpoint()
counter = 0
msg = (
"RSS URL scan has found {} feeds:\n\n```\n"
).format(len(feeds))
feed_mark = 0
for feed in feeds:
# try:
# res = await download_feed(feed)
# except:
# continue
try:
feed_name = feeds[feed]["feed"]["title"]
except:
feed_name = urlsplit(feed).netloc
feed_addr = feed
feed_amnt = len(feeds[feed].entries)
if feed_amnt:
# NOTE Because there could be many false positives
# which are revealed in second phase of scan, we
# could end with a single feed, which would be
# listed instead of fetched, so feed_mark is
# utilized in order to make fetch possible.
feed_mark = [feed_addr]
counter += 1
msg += (
"Title : {}\n"
"Link : {}\n"
"Count : {}\n"
"\n"
).format(feed_name, feed_addr, feed_amnt)
if counter > 1:
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
elif feed_mark:
return feed_mark
else:
msg = (
"No feeds were found for {}"
).format(url)
return msg
elif feeds:
return feeds
async def feed_mode_auto_discovery(url, tree):
"""
Lookup for feeds using RSS autodiscovery technique.
See: https://www.rssboard.org/rss-autodiscovery
Parameters
----------
db_file : str
Path to database file.
url : str
URL.
tree : TYPE
DESCRIPTION.
Returns
-------
msg : str
Message with URLs.
"""
xpath_query = (
'//link[(@rel="alternate") and '
'(@type="application/atom+xml" or '
'@type="application/rdf+xml" or '
'@type="application/rss+xml")]'
)
# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
feeds = tree.xpath(xpath_query)
if len(feeds) > 1:
msg = (
"RSS Auto-Discovery has found {} feeds:\n\n```\n"
).format(len(feeds))
for feed in feeds:
# # The following code works;
# # The following code will catch
# # only valid resources (i.e. not 404);
# # The following code requires more bandwidth.
# res = await download_feed(feed)
# if res[0]:
# disco = parse(res[0])
# title = disco["feed"]["title"]
# msg += "{} \n {} \n\n".format(title, feed)
feed_name = feed.xpath('@title')[0]
feed_addr = join_url(url, feed.xpath('@href')[0])
# if feed_addr.startswith("/"):
# feed_addr = url + feed_addr
msg += "{}\n{}\n\n".format(feed_name, feed_addr)
msg += (
"```\nThe above feeds were extracted from\n{}"
).format(url)
return msg
elif feeds:
feed_addr = join_url(url, feeds[0].xpath('@href')[0])
return [feed_addr]
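
The reworked download_feed now takes a list (only the first URL is used) and returns a [document, status-or-error] pair; a hedged sketch of how a caller might consume it, with an illustrative URL:

    import asyncio
    import slixfeed.fetch as fetch

    async def fetch_one(url):
        result = await fetch.download_feed([url])
        document, status = result[0], result[1]
        if document:
            print("Fetched {} characters".format(len(document)))
        else:
            # status holds the HTTP code or an error/timeout string
            print("Failed:", status)

    # asyncio.run(fetch_one("https://example.org/feed.rss"))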

slixfeed/log.py (new file, 33 lines)

@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
def markdown(timestamp, filename, jid, message):
"""
Log message to file.
Parameters
----------
timestamp : str
Time stamp.
filename : str
Jabber ID as name of file.
jid : str
Jabber ID.
message : str
Message content.
Returns
-------
None.
"""
with open(filename + '.md', 'a') as file:
# entry = "{} {}:\n{}\n\n".format(timestamp, jid, message)
entry = (
"## {}\n"
"### {}\n\n"
"{}\n\n").format(jid, timestamp, message)
file.write(entry)
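
A small usage sketch for the new log module (the path and JID are made up; the function appends to "<filename>.md"):

    import slixfeed.log as log
    from slixfeed.datetime import current_time

    log.markdown(
        current_time(), "logs/user@example.org",
        "user@example.org", "Test message")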

slixfeed/read.py (new file, 74 lines)

@ -0,0 +1,74 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) is_feed: Look into the type ("atom", "rss2" etc.)
"""
def title(feed):
"""
Get title of feed.
Parameters
----------
url : str
URL.
feed : dict
Parsed feed document.
Returns
-------
title : str
Title or None.
"""
try:
title = feed["feed"]["title"]
except:
title = None
return title
def is_feed(feed):
"""
Determine whether document is feed or not.
Parameters
----------
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
msg = None
if not feed.entries:
try:
feed["feed"]["title"]
val = True
# msg = (
# "Empty feed for {}"
# ).format(url)
except:
val = False
# msg = (
# "No entries nor title for {}"
# ).format(url)
elif feed.bozo:
val = False
# msg = (
# "Bozo detected for {}"
# ).format(url)
else:
val = True
# msg = (
# "Good feed for {}"
# ).format(url)
print(msg)
return val
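
How the new read helpers might be combined with feedparser (the document below is a stub):

    from feedparser import parse
    import slixfeed.read as read

    document = (
        '<rss version="2.0"><channel>'
        '<title>Example</title>'
        '</channel></rss>'
    )
    feed = parse(document)
    if read.is_feed(feed):
        # title() falls back to None when the feed carries no title.
        print(read.title(feed) or "(untitled)")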

View file

@ -222,9 +222,6 @@ async def remove_feed(db_file, ix):
             "FROM feeds "
             "WHERE id = ?"
         )
-        # cur
-        # for i in url:
-        #     url = i[0]
         url = cur.execute(sql, (ix,)).fetchone()[0]
         # NOTE Should we move DBLOCK to this line? 2022-12-23
         sql = (
@ -246,8 +243,10 @@ async def remove_feed(db_file, ix):
         cur.execute(sql, (ix,))
+# TODO Rename function name
 async def is_feed_exist(db_file, url):
     """
+    Get Id and Name of feed.
     Check whether a feed exists.
     Query for feeds by given url.
@ -270,8 +269,7 @@ async def is_feed_exist(db_file, url):
             "WHERE address = ?"
         )
         result = cur.execute(sql, (url,)).fetchone()
-        if result:
-            return True
+        return result
 async def get_number_of_items(db_file, table):
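
With this change is_feed_exist returns the matching row (id, name) rather than True, so callers such as action.add_feed can report the index and title directly; a sketch of the calling pattern, assuming that row layout:

    import slixfeed.sqlite as sqlite

    async def describe_existing(db_file, url):
        # None when the URL is not subscribed yet.
        exist = await sqlite.is_feed_exist(db_file, url)
        if exist:
            ix, name = exist[0], exist[1]
            return "Already subscribed at index {} as \"{}\"".format(ix, name)
        return None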

View file

@ -49,13 +49,14 @@ from slixfeed.config import (
     get_default_dbdir,
     get_value_default)
 from slixfeed.datetime import current_time
-from slixfeed.fetch import download_updates
+from slixfeed.action import organize_items
 from slixfeed.sqlite import (
-    get_unread_entries,
     get_feed_title,
-    get_settings_value,
+    get_feeds_url,
     get_number_of_items,
     get_number_of_entries_unread,
+    get_settings_value,
+    get_unread_entries,
     mark_as_read,
     mark_entry_as_read,
     delete_archived_entry
@ -329,7 +330,9 @@ async def refresh_task(self, jid, callback, key, val=None):
     val : str, optional
         Value. The default is None.
     """
-    logging.debug("Refreshing task {} for JID {}".format(callback, jid))
+    logging.debug(
+        "Refreshing task {} for JID {}".format(callback, jid)
+    )
     if not val:
         db_file = get_pathname_to_database(jid)
         val = await get_settings_value(db_file, key)
@ -340,7 +343,8 @@ async def refresh_task(self, jid, callback, key, val=None):
     except:
         logging.debug(
             "No task of type {} to cancel for "
-            "JID {} (clean_tasks)".format(key, jid))
+            "JID {} (clean_tasks)".format(key, jid)
+        )
     # task_manager[jid][key] = loop.call_at(
     #     loop.time() + 60 * float(val),
     #     loop.create_task,
@ -378,10 +382,13 @@ async def check_updates(jid):
     jid : str
         Jabber ID.
     """
-    logging.debug("Scanning for updates for JID {}".format(jid))
+    logging.debug(
+        "Scanning for updates for JID {}".format(jid)
+    )
     while True:
         db_file = get_pathname_to_database(jid)
-        await download_updates(db_file)
+        urls = await get_feeds_url(db_file)
+        await organize_items(db_file, urls)
         val = get_value_default("settings", "Settings", "check")
         await asyncio.sleep(60 * float(val))
         # Schedule to call this function again in 90 minutes
@ -394,12 +401,16 @@ async def check_updates(jid):
 async def start_tasks(self, presence):
     jid = presence["from"].bare
-    logging.debug("Beginning tasks for JID {}".format(jid))
+    logging.debug(
+        "Beginning tasks for JID {}".format(jid)
+    )
     if jid not in self.boundjid.bare:
         await clean_tasks_xmpp(
-            jid, ["interval", "status", "check"])
+            jid, ["interval", "status", "check"]
+        )
         await start_tasks_xmpp(
-            self, jid, ["interval", "status", "check"])
+            self, jid, ["interval", "status", "check"]
+        )
         # await task_jid(self, jid)
         # main_task.extend([asyncio.create_task(task_jid(jid))])
         # print(main_task)
@ -408,9 +419,12 @@ async def start_tasks(self, presence):
 async def stop_tasks(self, presence):
     if not self.boundjid.bare:
         jid = presence["from"].bare
-        logging.debug("Stopping tasks for JID {}".format(jid))
+        logging.debug(
+            "Stopping tasks for JID {}".format(jid)
+        )
         await clean_tasks_xmpp(
-            jid, ["interval", "status", "check"])
+            jid, ["interval", "status", "check"]
+        )
 async def check_readiness(self, presence):
@ -434,7 +448,9 @@ async def check_readiness(self, presence):
     jid = presence["from"].bare
     if presence["show"] in ("away", "dnd", "xa"):
-        logging.debug("Stopping updates for JID {}".format(jid))
+        logging.debug(
+            "Stopping updates for JID {}".format(jid)
+        )
         await clean_tasks_xmpp(
             jid, ["interval"])
         await start_tasks_xmpp(
@ -477,7 +493,9 @@ async def select_file(self):
                 if (file.endswith(".db") and
                     not file.endswith(".db-jour.db")):
                     jid = file[:-3]
-                    main_task.extend([tg.create_task(self.task_jid(jid))])
+                    main_task.extend(
+                        [tg.create_task(self.task_jid(jid))]
+                    )
                     # main_task = [tg.create_task(self.task_jid(jid))]
                     # task_manager.update({jid: tg})

View file

@ -21,7 +21,7 @@ from urllib.parse import (
     parse_qs,
     urlencode,
     urljoin,
-    urlparse,
+    # urlparse,
     urlsplit,
     urlunsplit
 )

View file

@ -1,109 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TODO
1) is_feed: Look into the type ("atom", "rss2" etc.)
"""
from urllib.parse import urlsplit
def log_as_markdown(timestamp, filename, jid, message):
"""
Log message to file.
Parameters
----------
timestamp : str
Time stamp.
filename : str
Jabber ID as name of file.
jid : str
Jabber ID.
message : str
Message content.
Returns
-------
None.
"""
with open(filename + '.md', 'a') as file:
# entry = "{} {}:\n{}\n\n".format(timestamp, jid, message)
entry = (
"## {}\n"
"### {}\n\n"
"{}\n\n").format(jid, timestamp, message)
file.write(entry)
def get_title(url, feed):
"""
Get title of feed.
Parameters
----------
url : str
URL.
feed : dict
Parsed feed document.
Returns
-------
title : str
Title or URL hostname.
"""
try:
title = feed["feed"]["title"]
except:
title = urlsplit(url).netloc
if not title:
title = urlsplit(url).netloc
return title
def is_feed(url, feed):
"""
Determine whether document is feed or not.
Parameters
----------
url : str
URL.
feed : dict
Parsed feed.
Returns
-------
val : boolean
True or False.
"""
msg = None
if not feed.entries:
try:
feed["feed"]["title"]
val = True
msg = (
"Empty feed for {}"
).format(url)
except:
val = False
msg = (
"No entries nor title for {}"
).format(url)
elif feed.bozo:
val = False
msg = (
"Bozo detected for {}"
).format(url)
else:
val = True
msg = (
"Good feed for {}"
).format(url)
print(msg)
return val

View file

@ -48,26 +48,20 @@ NOTE
 """
 import asyncio
-from slixfeed.config import add_to_list, get_list, remove_from_list
-import slixfeed.fetch as fetcher
-from slixfeed.datetime import current_time
 import logging
 # import os
 from random import randrange
 import slixmpp
-from slixmpp.exceptions import IqError, IqTimeout
-import slixfeed.sqlite as sqlite
 import slixfeed.task as task
-import slixfeed.url as urlfixer
 from time import sleep
 from slixmpp.plugins.xep_0363.http_upload import FileTooBig, HTTPError, UploadServiceNotFound
 # from slixmpp.plugins.xep_0402 import BookmarkStorage, Conference
 from slixmpp.plugins.xep_0048.stanza import Bookmarks
-import xmltodict
-import xml.etree.ElementTree as ET
-from lxml import etree
+# import xmltodict
+# import xml.etree.ElementTree as ET
+# from lxml import etree
 import slixfeed.xmpp.connect as connect
 import slixfeed.xmpp.process as process

View file

@ -17,7 +17,8 @@ async def recover_connection(self, event, message):
     # print(current_time(),"Maximum connection attempts exceeded.")
     # logging.error("Maximum connection attempts exceeded.")
     print(current_time(), "Attempt number", self.connection_attempts)
-    seconds = (get_value("accounts", "XMPP Connect", "reconnect_timeout")) or 30
+    seconds = (get_value(
+        "accounts", "XMPP Connect", "reconnect_timeout")) or 30
     seconds = int(seconds)
     print(current_time(), "Next attempt within", seconds, "seconds")
     # NOTE asyncio.sleep doesn't interval as expected

View file

@ -19,19 +19,22 @@ TODO
 """
 import os
+import slixfeed.action as action
 from slixfeed.config import (
     add_to_list,
     get_default_dbdir,
     get_value,
     get_pathname_to_database,
     remove_from_list)
+import slixfeed.crawl as crawl
 from slixfeed.datetime import current_time, timestamp
 import slixfeed.export as export
-import slixfeed.fetch as fetcher
+import slixfeed.fetch as fetch
 import slixfeed.opml as opml
 import slixfeed.sqlite as sqlite
 import slixfeed.task as task
-import slixfeed.utility as utility
+import slixfeed.log as log
+import slixfeed.read as read
 import slixfeed.url as uri
 import slixfeed.xmpp.bookmark as bookmark
 import slixfeed.xmpp.compose as compose
@ -40,6 +43,7 @@ import slixfeed.xmpp.status as status
 import slixfeed.xmpp.text as text
 import slixfeed.xmpp.upload as upload
 from slixfeed.xmpp.utility import jid_type
+from urllib.parse import urlsplit, urlunsplit
 async def event(self, event):
@ -210,20 +214,35 @@ async def message(self, message):
             # else:
             #     response = "This command is valid for groupchat only."
             case _ if message_lowercase.startswith("add"):
+                # Add given feed without validity check.
                 message_text = message_text[4:]
                 url = message_text.split(" ")[0]
                 title = " ".join(message_text.split(" ")[1:])
                 if url.startswith("http"):
                     db_file = get_pathname_to_database(jid)
-                    response = await fetcher.add_feed_no_check(db_file, [url, title])
-                    old = await sqlite.get_settings_value(db_file, "old")
-                    if old:
-                        await task.clean_tasks_xmpp(jid, ["status"])
-                        # await send_status(jid)
-                        await task.start_tasks_xmpp(self, jid, ["status"])
+                    exist = await sqlite.is_feed_exist(db_file, url)
+                    if not exist:
+                        await sqlite.insert_feed(db_file, url, title)
+                        await action.organize_items(db_file, [url])
+                        old = await sqlite.get_settings_value(db_file, "old")
+                        if old:
+                            await task.clean_tasks_xmpp(jid, ["status"])
+                            # await send_status(jid)
+                            await task.start_tasks_xmpp(self, jid, ["status"])
+                        else:
+                            await sqlite.mark_source_as_read(db_file, url)
+                        response = (
+                            "> {}\nNews source has been "
+                            "added to subscription list."
+                        ).format(url)
                     else:
-                        db_file = get_pathname_to_database(jid)
-                        await sqlite.mark_source_as_read(db_file, url)
+                        ix = exist[0]
+                        name = exist[1]
+                        response = (
+                            "> {}\nNews source \"{}\" is already "
+                            "listed in the subscription list at "
+                            "index {}".format(url, name, ix)
+                        )
                 else:
                     response = "Missing URL."
                 send_reply_message(self, message, response)
@ -388,31 +407,13 @@ async def message(self, message):
                     send_status_message(self, jid, status_type, status_message)
                     if url.startswith("feed:"):
                         url = uri.feed_to_http(url)
-                    # url_alt = await uri.replace_hostname(url, "feed")
-                    # if url_alt:
-                    #     url = url_alt
                     url = (uri.replace_hostname(url, "feed")) or url
                     db_file = get_pathname_to_database(jid)
-                    response = await fetcher.add_feed(db_file, url)
-                    await task.start_tasks_xmpp(self, jid, ["status"])
-                    # response = "> " + message + "\n" + response
-                    # FIXME Make the taskhandler to update status message
-                    # await refresh_task(
-                    #     self,
-                    #     jid,
-                    #     send_status,
-                    #     "status",
-                    #     20
-                    # )
-                    # NOTE This would show the number of new unread entries
-                    old = await sqlite.get_settings_value(db_file, "old")
-                    if old:
-                        await task.clean_tasks_xmpp(jid, ["status"])
-                        # await send_status(jid)
-                        await task.start_tasks_xmpp(self, jid, ["status"])
-                    else:
-                        db_file = get_pathname_to_database(jid)
-                        await sqlite.mark_source_as_read(db_file, url)
+                    response = await action.add_feed(db_file, url)
+                    await task.clean_tasks_xmpp(
+                        jid, ["status"])
+                    await task.start_tasks_xmpp(
+                        self, jid, ["status"])
                     send_reply_message(self, message, response)
                 case _ if message_lowercase.startswith("feeds"):
                     query = message_text[6:]
@ -521,7 +522,7 @@ async def message(self, message):
                     send_reply_message(self, message, response)
                 case "new":
                     db_file = get_pathname_to_database(jid)
-                    sqlite.set_settings_value(db_file, ["old", 0])
+                    await sqlite.set_settings_value(db_file, ["old", 0])
                     response = (
                         "Only new items of newly added feeds will be sent."
                     )
@ -581,7 +582,8 @@ async def message(self, message):
                     data = message_text[5:]
                     data = data.split()
                     url = data[0]
-                    await task.clean_tasks_xmpp(jid, ["status"])
+                    await task.clean_tasks_xmpp(
+                        jid, ["status"])
                     status_type = "dnd"
                     status_message = (
                         "📫️ Processing request to fetch data from {}"
@ -593,13 +595,13 @@ async def message(self, message):
                     match len(data):
                         case 1:
                             if url.startswith("http"):
-                                response = await fetcher.view_feed(url)
+                                response = await action.view_feed(url)
                             else:
                                 response = "Missing URL."
                         case 2:
                             num = data[1]
                             if url.startswith("http"):
-                                response = await fetcher.view_entry(url, num)
+                                response = await action.view_entry(url, num)
                             else:
                                 response = "Missing URL."
                         case _:
@ -627,15 +629,15 @@ async def message(self, message):
                         response = "Missing value."
                     send_reply_message(self, message, response)
                 # NOTE Should people be asked for numeric value?
-                case _ if message_lowercase.startswith("remove"):
+                case _ if message_lowercase.startswith("remove "):
                     ix = message_text[7:]
                     if ix:
                         db_file = get_pathname_to_database(jid)
                         try:
                             await sqlite.remove_feed(db_file, ix)
                             response = (
-                                "> {}\nNews source has been removed "
-                                "from subscription list.").format(url)
+                                "News source {} has been removed "
+                                "from subscription list.").format(ix)
                             # await refresh_task(
                             #     self,
                             #     jid,
@ -643,10 +645,13 @@ async def message(self, message):
                             #     "status",
                             #     20
                             # )
-                            await task.clean_tasks_xmpp(jid, ["status"])
-                            await task.start_tasks_xmpp(self, jid, ["status"])
+                            await task.clean_tasks_xmpp(
+                                jid, ["status"])
+                            await task.start_tasks_xmpp(
+                                self, jid, ["status"])
                         except:
-                            response = "No news source with ID {}.".format(ix)
+                            response = (
+                                "No news source with ID {}.".format(ix))
                     else:
                         response = "Missing feed ID."
                     send_reply_message(self, message, response)
@ -655,7 +660,8 @@ async def message(self, message):
                     await task.clean_tasks_xmpp(jid, ["status"])
                     status_type = "dnd"
                     status_message = "📫️ Marking entries as read..."
-                    send_status_message(self, jid, status_type, status_message)
+                    send_status_message(
+                        self, jid, status_type, status_message)
                     if source:
                         db_file = get_pathname_to_database(jid)
                         await sqlite.mark_source_as_read(db_file, source)
@ -688,9 +694,11 @@ async def message(self, message):
                             key = "enabled"
                             val = 1
                             db_file = get_pathname_to_database(jid)
-                            await sqlite.set_settings_value(db_file, [key, val])
+                            await sqlite.set_settings_value(
+                                db_file, [key, val])
                             # asyncio.create_task(task_jid(self, jid))
-                            await task.start_tasks_xmpp(self, jid, ["interval", "status", "check"])
+                            await task.start_tasks_xmpp(
+                                self, jid, ["interval", "status", "check"])
                             response = "Updates are enabled."
                             # print(current_time(), "task_manager[jid]")
                             # print(task_manager[jid])
@ -747,13 +755,17 @@ async def message(self, message):
                             key = "enabled"
                             val = 0
                             db_file = get_pathname_to_database(jid)
-                            await sqlite.set_settings_value(db_file, [key, val])
-                            await task.clean_tasks_xmpp(jid, ["interval", "status"])
+                            await sqlite.set_settings_value(
+                                db_file, [key, val])
+                            await task.clean_tasks_xmpp(
+                                jid, ["interval", "status"])
                             response = "Updates are disabled."
                             send_reply_message(self, message, response)
                             status_type = "xa"
-                            status_message = "💡️ Send \"Start\" to receive Jabber updates"
-                            send_status_message(self, jid, status_type, status_message)
+                            status_message = (
+                                "💡️ Send \"Start\" to receive Jabber updates")
+                            send_status_message(
+                                self, jid, status_type, status_message)
                 case "support":
                     # TODO Send an invitation.
                     response = (
@ -789,10 +801,10 @@ async def message(self, message):
             os.mkdir(data_dir)
         if not os.path.isdir(data_dir + '/logs/'):
             os.mkdir(data_dir + '/logs/')
-        utility.log_as_markdown(
+        log.markdown(
             current_time(), os.path.join(data_dir, "logs", jid),
             jid, message_text)
-        utility.log_as_markdown(
+        log.markdown(
             current_time(), os.path.join(data_dir, "logs", jid),
             self.boundjid.bare, response)