#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FIXME
1) https://wiki.pine64.org
File "/slixfeed/crawl.py", line 178, in feed_mode_guess
address = join_url(url, parted_url.path.split('/')[1] + path)
~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range
TODO
1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /
Attempt to scan sub domains
https://esmailelbob.xyz/en/
https://blog.esmailelbob.xyz/feed/
1.2) Consider utilizing fetch.http_response
2) Consider merging with module fetch.py
3) Mark redirects for manual check
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/atom.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/news.xml.php
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rdf.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/rss.xml
Title : JSON Feed
Link : https://www.jsonfeed.org/feed.json/videos.xml
"""
from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import etree
from lxml import html
from lxml.etree import fromstring
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.log import Logger
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit


# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
#         callback, url, document, num=None, db_file=None):
#     result = None
#     try:
#         # tree = etree.fromstring(res[0]) # etree is for xml
#         tree = html.fromstring(document)
#     except:
#         result = (
#             "> {}\nFailed to parse URL as feed."
#             ).format(url)
#     if not result:
#         print("RSS Auto-Discovery Engaged")
#         result = await feed_mode_auto_discovery(url, tree)
#     if not result:
#         print("RSS Scan Mode Engaged")
#         result = await feed_mode_scan(url, tree)
#     if not result:
#         print("RSS Arbitrary Mode Engaged")
#         result = await feed_mode_request(url, tree)
#     if not result:
#         result = (
#             "> {}\nNo news feeds were found for URL."
#             ).format(url)
#     # elif msg:
#     else:
#         if isinstance(result, str):
#             return result
#         elif isinstance(result, list):
#             url = result[0]
#             if db_file:
#                 # print("if db_file", db_file)
#                 return await callback(db_file, url)
#             elif num:
#                 return await callback(url, num)
#             else:
#                 return await callback(url)


logger = Logger(__name__)


async def probe_page(url, document=None):
    """
    Probe a page for web feeds.

    Parameters
    ----------
    url : str
        URL.
    document : str, optional
        Markup document. When not given, it is fetched from url.

    Returns
    -------
    result : dict or list or None
        A single feed as a dict, several candidate feeds as a list of
        dicts, or None when no feed was found.
    """
    if not document:
        response = await fetch.http(url)
        if not response['error']:
            document = response['content']
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(document)
        result = None
    except Exception as e:
        logger.error(str(e))
        try:
            # /questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
            # xml = html.fromstring(document.encode('utf-8'))
            # parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
            # tree = fromstring(xml, parser=parser)
            # /questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
            # tree = html.fromstring(bytes(document, encoding='utf8'))
            # https://twigstechtips.blogspot.com/2013/06/python-lxml-strings-with-encoding.html
            # parser = etree.XMLParser(recover=True)
            # tree = etree.fromstring(document, parser)
            tree = html.fromstring(document.encode('utf-8'))
            result = None
        except Exception as e:
            logger.error(str(e))
            logger.warning("Failed to parse URL as feed for {}.".format(url))
            result = {'link': None,
                      'index': None,
                      'name': None,
                      'code': None,
                      'error': True,
                      'exist': None}
    if not result:
        logger.debug("Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
        logger.debug("Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
        logger.debug("Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logger.debug("No feeds were found for {}".format(url))
        result = None
    return result
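

# NOTE A usage sketch (an assumption for illustration, not part of the module):
# probe_page returns either a dict describing a single feed, a list of such
# dicts when several candidates validate, or None.
#
# import asyncio
#
# async def demo():
#     result = await probe_page('https://www.jsonfeed.org')
#     if isinstance(result, list):
#         for feed in result:
#             print(feed['name'], feed['link'])
#     elif isinstance(result, dict) and not result['error']:
#         print('Single feed:', result['name'], result['link'])
#
# asyncio.run(demo())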


# TODO Improve scan by gradual decreasing of path
async def feed_mode_guess(url, tree):
    """
    Look up feeds by pathname using HTTP requests.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree (not used by this mode).

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection.
    """
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Check whether URL has a path (i.e. not root)
    # Check parted_url.path to avoid error in case root wasn't given
    # TODO Make more tests
    if parted_url.path and parted_url.path.split('/')[1]:
        if '.rss' not in paths:
            paths.extend([".atom", ".feed", ".rdf", ".rss"])
    # Fall back to root when no path was given
    parted_url_path = parted_url.path if parted_url.path else '/'
    for path in paths:
        address = join_url(url, parted_url_path.split('/')[1] + path)
        if address not in urls:
            urls.extend([address])
    # breakpoint()
    # print("feed_mode_guess")
    urls = await process_feed_selection(url, urls)
    return urls
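

# NOTE A minimal sketch (hypothetical values) of how the candidate names probed
# above are derived from the first path segment; join_url then resolves them
# against the page URL.
#
# from urllib.parse import urlsplit
#
# url = 'https://example.org/blog/post'       # hypothetical page
# paths = ['/feed/', '/rss.xml']              # hypothetical pathnames list
# segment = urlsplit(url).path.split('/')[1]  # -> 'blog'
# print([segment + path for path in paths])   # ['blog/feed/', 'blog/rss.xml']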


async def feed_mode_scan(url, tree):
    """
    Scan a page for potential feeds by pathname.

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection.
    """
    urls = []
    paths = config.open_config_file("lists.toml")["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should the number of addresses be limited, or
        # perhaps taken as N from the start and N from the end?
        for address in addresses:
            address = join_url(url, address.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
    # breakpoint()
    # print("feed_mode_scan")
    urls = await process_feed_selection(url, urls)
    return urls
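

# NOTE A small sketch (hypothetical markup) of the XPath pattern used above:
# the first query keeps at most the first five matching anchors and the second
# keeps at most the last five.
#
# from lxml import html as lxml_html
#
# anchors = ''.join('<a href="/feed/{}">feed {}</a>'.format(i, i)
#                   for i in range(12))
# page = lxml_html.fromstring('<html><body>' + anchors + '</body></html>')
# first = page.xpath("(//a[contains(@href,'/feed/')])[position()<=5]")
# last = page.xpath("(//a[contains(@href,'/feed/')])[position()>last()-5]")
# print(len(first), len(last))  # 5 5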


async def feed_mode_auto_discovery(url, tree):
    """
    Look up feeds using the RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    url : str
        URL.
    tree : lxml.html.HtmlElement
        Parsed HTML tree of the page.

    Returns
    -------
    urls : dict or list or None
        Discovered feeds, as returned by process_feed_selection, or
        None when the page advertises no feeds.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if feeds:
        urls = []
        for feed in feeds:
            # # The following code works;
            # # it catches only valid resources (i.e. not 404),
            # # but it requires more bandwidth.
            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)
            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])
            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            address = join_url(url, feed.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_auto_discovery")
        urls = await process_feed_selection(url, urls)
        return urls
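

# NOTE The <link> elements matched by the query above typically look like the
# following (illustrative markup per the RSS autodiscovery convention):
#
#     <link rel="alternate" type="application/rss+xml"
#           title="Example Feed" href="/feed.xml"/>
#     <link rel="alternate" type="application/atom+xml"
#           title="Example Feed (Atom)" href="/atom.xml"/>
#
# Relative href values are resolved against the page URL by join_url.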


# TODO Segregate this function into a function that returns
# URLs (string) and feeds (dict), and a function that
# composes a text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
    """
    Fetch and validate candidate feed URLs, and return the valid ones.
    """
    feeds = {}
    for i in urls:
        result = await fetch.http(i)
        if not result['error']:
            document = result['content']
            status_code = result['status_code']
            if status_code == 200: # NOTE This line might be redundant
                try:
                    # Store the parsed document together with the status
                    # code of its own response, reported per feed below.
                    feeds[i] = [parse(document), status_code]
                except Exception:
                    continue
    # NOTE message is currently unused; see the TODO above about
    # composing a text message.
    message = (
        "Web feeds found for {}\n\n```\n"
        ).format(url)
    urls = []
    for feed_url in feeds:
        # try:
        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
        if "title" in feeds[feed_url][0]["feed"].keys():
            feed_name = feeds[feed_url][0].feed.title
        feed_name = feed_name if feed_name else "Untitled"
        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
        # AttributeError: 'str' object has no attribute 'entries'
        if "entries" in feeds[feed_url][0].keys():
            feed_amnt = feeds[feed_url][0].entries
        else:
            continue
        if feed_amnt:
            # NOTE Because there could be many false positives
            # which are revealed in the second phase of the scan, we
            # could end up with a single feed, which would be
            # listed instead of fetched, so feed_url_mark was
            # utilized in order to make fetching possible.
            # NOTE feed_url_mark was a variable which stored a
            # single URL (probably the first accepted as valid)
            # in order to get an indication whether a single
            # URL has been fetched, so that the receiving
            # function would scan that single URL instead of
            # listing it as a message.
            url = {'link': feed_url,
                   'index': None,
                   'name': feed_name,
                   'code': feeds[feed_url][1],
                   'error': False,
                   'exist': None}
            urls.extend([url])
    count = len(urls)
    if count > 1:
        result = urls
    elif count:
        result = urls[0]
    else:
        result = None
    return result
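

# NOTE Shape of the value returned by process_feed_selection (illustrative
# values): a single dict when exactly one candidate validates, a list of such
# dicts when several do, and None when none does.
#
# {'link': 'https://example.org/feed.xml',
#  'index': None,
#  'name': 'Example Feed',
#  'code': 200,
#  'error': False,
#  'exist': None}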


# def get_discovered_feeds(url, urls):
#     message = (
#         "Found {} web feeds:\n\n```\n"
#         ).format(len(urls))
#     if len(urls) > 1:
#         for urls in urls:
#             message += (
#                 "Title : {}\n"
#                 "Link : {}\n"
#                 "\n"
#                 ).format(url, url.title)
#         message += (
#             "```\nThe above feeds were extracted from\n{}"
#             ).format(url)
#     elif len(urls) > 0:
#         result = urls
#     else:
#         message = (
#             "No feeds were found for {}"
#             ).format(url)
#     return result


# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'

# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection

# async def start(url):
#     while True:
#         result = await fetch.http(url)
#         document = result[0]
#         status = result[1]
#         if document:
#             feed = parse(document)
#             if is_feed(feed):
#                 print(url)
#             else:
#                 urls = await probe_page(
#                     url, document)
#                 if len(urls) > 1:
#                     await process_feed_selection(urls)
#                 elif urls:
#                     url = urls[0]
#         else:
#             response = (
#                 "> {}\nFailed to load URL. Reason: {}"
#                 ).format(url, status)
#             break
#     return response

# url = "https://www.smh.com.au/rssheadlines"
# start(url)