Slixfeed/slixfeed/crawl.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""

FIXME

1) https://wiki.pine64.org
     File "/slixfeed/crawl.py", line 178, in feed_mode_guess
       address = join_url(url, parted_url.path.split('/')[1] + path)
                               ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
   IndexError: list index out of range

TODO

1.1) Attempt to scan more paths: /blog/, /news/ etc., including root / 
   Attempt to scan sub domains
   https://esmailelbob.xyz/en/
   https://blog.esmailelbob.xyz/feed/

1.2) Consider utilizing fetch.http_response

2) Consider merging with module fetch.py

FEEDS CRAWLER PROJECT

3) Mark redirects for manual check

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/atom.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/feed.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/jekyll/feed.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/news.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/news.xml.php

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/rdf.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/rss.xml

Title : JSON Feed
Link  : https://www.jsonfeed.org/feed.json/videos.xml


"""

from aiohttp import ClientError, ClientSession, ClientTimeout
from feedparser import parse
import logging
from lxml import html
import slixfeed.config as config
import slixfeed.fetch as fetch
from slixfeed.url import complete_url, join_url, trim_url
from urllib.parse import urlsplit, urlunsplit


# TODO Use boolean as a flag to determine whether a single URL was found
# async def probe_page(
#     callback, url, document, num=None, db_file=None):
#     result = None
#     try:
#         # tree = etree.fromstring(res[0]) # etree is for xml
#         tree = html.fromstring(document)
#     except:
#         result = (
#             "> {}\nFailed to parse URL as feed."
#             ).format(url)
#     if not result:
#         print("RSS Auto-Discovery Engaged")
#         result = await feed_mode_auto_discovery(url, tree)
#     if not result:
#         print("RSS Scan Mode Engaged")
#         result = await feed_mode_scan(url, tree)
#     if not result:
#         print("RSS Arbitrary Mode Engaged")
#         result = await feed_mode_request(url, tree)
#     if not result:
#         result = (
#             "> {}\nNo news feeds were found for URL."
#             ).format(url)
#     # elif msg:
#     else:
#         if isinstance(result, str):
#             return result
#         elif isinstance(result, list):
#             url = result[0]
#             if db_file:
#                 # print("if db_file", db_file)
#                 return await callback(db_file, url)
#             elif num:
#                 return await callback(url, num)
#             else:
#                 return await callback(url)


async def probe_page(url, document=None):
    """
    Parameters
    ----------
    url : str
        URL.
    document : TYPE
        DESCRIPTION.

    Returns
    -------
    result : list or str
        Single URL as list or selection of URLs as str.
    """
    if not document:
        response = await fetch.http(url)
        if not response['error']:
            document = response['content']
    try:
        # tree = etree.fromstring(res[0]) # etree is for xml
        tree = html.fromstring(document)
        result = None
    except:
        logging.debug("Failed to parse URL as feed for {}.".format(url))
        result = {'link' : None,
                  'index' : None,
                  'name' : None,
                  'code' : None,
                  'error' : True,
                  'exist' : None}
    if not result:
        logging.debug("Feed auto-discovery engaged for {}".format(url))
        result = await feed_mode_auto_discovery(url, tree)
    if not result:
        logging.debug("Feed link scan mode engaged for {}".format(url))
        result = await feed_mode_scan(url, tree)
    if not result:
        logging.debug("Feed arbitrary mode engaged for {}".format(url))
        result = await feed_mode_guess(url, tree)
    if not result:
        logging.debug("No feeds were found for {}".format(url))
        result = None
    return result


# TODO Improve scan by gradual decreasing of path
async def feed_mode_guess(url, tree):
    """
    Lookup for feeds by pathname using HTTP Requests.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.
    tree : TYPE
        DESCRIPTION.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    urls = []
    parted_url = urlsplit(url)
    paths = config.open_config_file("lists.toml")["pathnames"]
    # Check whether URL has path (i.e. not root)
    # Check parted_url.path to avoid error in case root wasn't given
    # TODO Make more tests
    if parted_url.path and parted_url.path.split('/')[1]:
        paths.extend(
            [".atom", ".feed", ".rdf", ".rss"]
            ) if '.rss' not in paths else -1
        # if paths.index('.rss'):
        #     paths.extend([".atom", ".feed", ".rdf", ".rss"])
    parted_url_path = parted_url.path if parted_url.path else '/'
    for path in paths:
        address = join_url(url, parted_url_path.split('/')[1] + path)
        if address not in urls:
            urls.extend([address])
    # breakpoint()
    # print("feed_mode_guess")
    urls = await process_feed_selection(url, urls)
    return urls


async def feed_mode_scan(url, tree):
    """
    Scan page for potential feeds by pathname.

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.
    tree : TYPE
        DESCRIPTION.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    urls = []
    paths = config.open_config_file("lists.toml")["pathnames"]
    for path in paths:
        # xpath_query = "//*[@*[contains(.,'{}')]]".format(path)
        # xpath_query = "//a[contains(@href,'{}')]".format(path)
        num = 5
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()<={}]"
            ).format(path, num)
        addresses = tree.xpath(xpath_query)
        xpath_query = (
            "(//a[contains(@href,'{}')])[position()>last()-{}]"
            ).format(path, num)
        addresses += tree.xpath(xpath_query)
        # NOTE Should number of addresses be limited or
        # perhaps be N from the start and N from the end
        for address in addresses:
            address = join_url(url, address.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
    # breakpoint()
    # print("feed_mode_scan")
    urls = await process_feed_selection(url, urls)
    return urls


async def feed_mode_auto_discovery(url, tree):
    """
    Lookup for feeds using RSS autodiscovery technique.

    See: https://www.rssboard.org/rss-autodiscovery

    Parameters
    ----------
    db_file : str
        Path to database file.
    url : str
        URL.
    tree : TYPE
        DESCRIPTION.

    Returns
    -------
    msg : str
        Message with URLs.
    """
    xpath_query = (
        '//link[(@rel="alternate") and '
        '(@type="application/atom+xml" or '
        '@type="application/rdf+xml" or '
        '@type="application/rss+xml")]'
        )
    # xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""
    # xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"
    feeds = tree.xpath(xpath_query)
    if feeds:
        urls = []
        for feed in feeds:
            # # The following code works;
            # # The following code will catch
            # # only valid resources (i.e. not 404);
            # # The following code requires more bandwidth.
            # res = await fetch.http(feed)
            # if res[0]:
            #     disco = parse(res[0])
            #     title = disco["feed"]["title"]
            #     msg += "{} \n {} \n\n".format(title, feed)

            # feed_name = feed.xpath('@title')[0]
            # feed_addr = join_url(url, feed.xpath('@href')[0])

            # if feed_addr.startswith("/"):
            #     feed_addr = url + feed_addr
            address = join_url(url, feed.xpath('@href')[0])
            if address not in urls:
                urls.extend([address])
        # breakpoint()
        # print("feed_mode_auto_discovery")
        urls = await process_feed_selection(url, urls)
        return urls


# TODO Segregate function into function that returns
# URLs (string) and Feeds (dict) and function that
# composes text message (string).
# Maybe that's not necessary.
async def process_feed_selection(url, urls):
    feeds = {}
    for i in urls:
        result = await fetch.http(i)
        if not result['error']:
            document = result['content']
            status_code = result['status_code']
            if status_code == 200: # NOTE This line might be redundant
                try:
                    feeds[i] = [parse(document)]
                except:
                    continue
    message = (
        "Web feeds found for {}\n\n```\n"
        ).format(url)
    urls = []
    for feed_url in feeds:
        # try:
        #     res = await fetch.http(feed)
        # except:
        #     continue
        feed_name = None
        if "title" in feeds[feed_url][0]["feed"].keys():
            feed_name = feeds[feed_url][0].feed.title
        feed_name = feed_name if feed_name else "Untitled"
        # feed_name = feed_name if feed_name else urlsplit(feed_url).netloc
        # AttributeError: 'str' object has no attribute 'entries'
        if "entries" in feeds[feed_url][0].keys():
            feed_amnt = feeds[feed_url][0].entries
        else:
            continue
        if feed_amnt:
            # NOTE Because there could be many false positives
            #      which are revealed in second phase of scan, we
            #      could end with a single feed, which would be
            #      listed instead of fetched, so feed_url_mark is
            #      utilized in order to make fetch possible.
            # NOTE feed_url_mark was a variable which stored
            #      single URL (probably first accepted as valid)
            #      in order to get an indication whether a single
            #      URL has been fetched, so that the receiving
            #      function will scan that single URL instead of
            #      listing it as a message.
            url = {'link' : feed_url,
                   'index' : None,
                   'name' : feed_name,
                   'code' : status_code,
                   'error' : False,
                   'exist' : None}
            urls.extend([url])
    count = len(urls)
    if count > 1:
        result = urls
    elif count:
        result = urls[0]
    else:
        result = None
    return result


# def get_discovered_feeds(url, urls):
#     message = (
#         "Found {} web feeds:\n\n```\n"
#         ).format(len(urls))
#     if len(urls) > 1:
#         for urls in urls:
#                 message += (
#                     "Title : {}\n"
#                     "Link  : {}\n"
#                     "\n"
#                     ).format(url, url.title)
#         message += (
#             "```\nThe above feeds were extracted from\n{}"
#             ).format(url)
#     elif len(urls) > 0:
#         result = urls
#     else:
#         message = (
#             "No feeds were found for {}"
#             ).format(url)
#     return result


# Test module
# TODO ModuleNotFoundError: No module named 'slixfeed'
# import slixfeed.fetch as fetch
# from slixfeed.action import is_feed, process_feed_selection

# async def start(url):
#     while True:
#         result = await fetch.http(url)
#         document = result[0]
#         status = result[1]
#         if document:
#             feed = parse(document)
#             if is_feed(feed):
#                 print(url)
#             else:
#                 urls = await probe_page(
#                     url, document)
#                 if len(urls) > 1:
#                     await process_feed_selection(urls)
#                 elif urls:
#                     url = urls[0]
#         else:
#             response = (
#                 "> {}\nFailed to load URL.  Reason: {}"
#                 ).format(url, status)
#             break
#     return response

# url = "https://www.smh.com.au/rssheadlines"
# start(url)
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`"""`

WIP: Closer to fix double message. See task.py 2024-02-10 18:53:53 +01:00			`FIXME`

			`1) https://wiki.pine64.org`
			`File "/slixfeed/crawl.py", line 178, in feed_mode_guess`
			`address = join_url(url, parted_url.path.split('/')[1] + path)`
			`~~~~~~~~~~~~~~~~~~~~~~~~~~^^^`
			`IndexError: list index out of range`

Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`TODO`

Fix many issues amidst change of table structure 2024-02-04 18:08:12 +01:00			`1.1) Attempt to scan more paths: /blog/, /news/ etc., including root /`
			`Attempt to scan sub domains`
			`https://esmailelbob.xyz/en/`
			`https://blog.esmailelbob.xyz/feed/`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00
Fix many issues amidst change of table structure 2024-02-04 18:08:12 +01:00			`1.2) Consider utilizing fetch.http_response`
Restructure modules and database. Add OPML import functionality. Minor improvements. 2024-01-06 23:03:08 +01:00
			`2) Consider merging with module fetch.py`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00
Fix many issues amidst change of table structure 2024-02-04 18:08:12 +01:00			`FEEDS CRAWLER PROJECT`

Add support for JSON Feed 2024-01-20 18:28:31 +01:00			`3) Mark redirects for manual check`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/atom.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/feed.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/feeds/rss/news.xml.php`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/jekyll/feed.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/news.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/news.xml.php`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/rdf.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/rss.xml`

			`Title : JSON Feed`
			`Link : https://www.jsonfeed.org/feed.json/videos.xml`


Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`"""`

			`from aiohttp import ClientError, ClientSession, ClientTimeout`
			`from feedparser import parse`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`import logging`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`from lxml import html`
			`import slixfeed.config as config`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`import slixfeed.fetch as fetch`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`from slixfeed.url import complete_url, join_url, trim_url`
			`from urllib.parse import urlsplit, urlunsplit`


			`# TODO Use boolean as a flag to determine whether a single URL was found`
			`# async def probe_page(`
			`# callback, url, document, num=None, db_file=None):`
			`# result = None`
			`# try:`
			`# # tree = etree.fromstring(res[0]) # etree is for xml`
			`# tree = html.fromstring(document)`
			`# except:`
			`# result = (`
			`# "> {}\nFailed to parse URL as feed."`
			`# ).format(url)`
			`# if not result:`
			`# print("RSS Auto-Discovery Engaged")`
			`# result = await feed_mode_auto_discovery(url, tree)`
			`# if not result:`
			`# print("RSS Scan Mode Engaged")`
			`# result = await feed_mode_scan(url, tree)`
			`# if not result:`
			`# print("RSS Arbitrary Mode Engaged")`
			`# result = await feed_mode_request(url, tree)`
			`# if not result:`
			`# result = (`
			`# "> {}\nNo news feeds were found for URL."`
			`# ).format(url)`
			`# # elif msg:`
			`# else:`
			`# if isinstance(result, str):`
			`# return result`
			`# elif isinstance(result, list):`
			`# url = result[0]`
			`# if db_file:`
			`# # print("if db_file", db_file)`
			`# return await callback(db_file, url)`
			`# elif num:`
			`# return await callback(url, num)`
			`# else:`
			`# return await callback(url)`


Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`async def probe_page(url, document=None):`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`"""`
			`Parameters`
			`----------`
			`url : str`
			`URL.`
			`document : TYPE`
			`DESCRIPTION.`

			`Returns`
			`-------`
			`result : list or str`
			`Single URL as list or selection of URLs as str.`
			`"""`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`if not document:`
			`response = await fetch.http(url)`
			`if not response['error']:`
			`document = response['content']`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`try:`
			`# tree = etree.fromstring(res[0]) # etree is for xml`
			`tree = html.fromstring(document)`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`result = None`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`except:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`logging.debug("Failed to parse URL as feed for {}.".format(url))`
			`result = {'link' : None,`
			`'index' : None,`
			`'name' : None,`
			`'code' : None,`
			`'error' : True,`
			`'exist' : None}`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`if not result:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`logging.debug("Feed auto-discovery engaged for {}".format(url))`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`result = await feed_mode_auto_discovery(url, tree)`
			`if not result:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`logging.debug("Feed link scan mode engaged for {}".format(url))`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`result = await feed_mode_scan(url, tree)`
			`if not result:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`logging.debug("Feed arbitrary mode engaged for {}".format(url))`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`result = await feed_mode_guess(url, tree)`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`if not result:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`logging.debug("No feeds were found for {}".format(url))`
			`result = None`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`return result`


			`# TODO Improve scan by gradual decreasing of path`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`async def feed_mode_guess(url, tree):`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`"""`
			`Lookup for feeds by pathname using HTTP Requests.`

			`Parameters`
			`----------`
			`db_file : str`
			`Path to database file.`
			`url : str`
			`URL.`
			`tree : TYPE`
			`DESCRIPTION.`

			`Returns`
			`-------`
			`msg : str`
			`Message with URLs.`
			`"""`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`urls = []`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`parted_url = urlsplit(url)`
Fix many issues amidst change of table structure 2024-02-04 18:08:12 +01:00			`paths = config.open_config_file("lists.toml")["pathnames"]`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`# Check whether URL has path (i.e. not root)`
			`# Check parted_url.path to avoid error in case root wasn't given`
			`# TODO Make more tests`
			`if parted_url.path and parted_url.path.split('/')[1]:`
			`paths.extend(`
			`[".atom", ".feed", ".rdf", ".rss"]`
			`) if '.rss' not in paths else -1`
			`# if paths.index('.rss'):`
			`# paths.extend([".atom", ".feed", ".rdf", ".rss"])`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`parted_url_path = parted_url.path if parted_url.path else '/'`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`for path in paths:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`address = join_url(url, parted_url_path.split('/')[1] + path)`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`if address not in urls:`
			`urls.extend([address])`
			`# breakpoint()`
			`# print("feed_mode_guess")`
			`urls = await process_feed_selection(url, urls)`
			`return urls`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00

			`async def feed_mode_scan(url, tree):`
			`"""`
			`Scan page for potential feeds by pathname.`

			`Parameters`
			`----------`
			`db_file : str`
			`Path to database file.`
			`url : str`
			`URL.`
			`tree : TYPE`
			`DESCRIPTION.`

			`Returns`
			`-------`
			`msg : str`
			`Message with URLs.`
			`"""`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`urls = []`
Fix many issues amidst change of table structure 2024-02-04 18:08:12 +01:00			`paths = config.open_config_file("lists.toml")["pathnames"]`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`for path in paths:`
			`# xpath_query = "//[@[contains(.,'{}')]]".format(path)`
			`# xpath_query = "//a[contains(@href,'{}')]".format(path)`
			`num = 5`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`xpath_query = (`
			`"(//a[contains(@href,'{}')])[position()<={}]"`
			`).format(path, num)`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`addresses = tree.xpath(xpath_query)`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`xpath_query = (`
			`"(//a[contains(@href,'{}')])[position()>last()-{}]"`
			`).format(path, num)`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`addresses += tree.xpath(xpath_query)`
			`# NOTE Should number of addresses be limited or`
			`# perhaps be N from the start and N from the end`
			`for address in addresses:`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`address = join_url(url, address.xpath('@href')[0])`
			`if address not in urls:`
			`urls.extend([address])`
			`# breakpoint()`
			`# print("feed_mode_scan")`
			`urls = await process_feed_selection(url, urls)`
			`return urls`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00

			`async def feed_mode_auto_discovery(url, tree):`
			`"""`
			`Lookup for feeds using RSS autodiscovery technique.`

			`See: https://www.rssboard.org/rss-autodiscovery`

			`Parameters`
			`----------`
			`db_file : str`
			`Path to database file.`
			`url : str`
			`URL.`
			`tree : TYPE`
			`DESCRIPTION.`

			`Returns`
			`-------`
			`msg : str`
			`Message with URLs.`
			`"""`
			`xpath_query = (`
			`'//link[(@rel="alternate") and '`
			`'(@type="application/atom+xml" or '`
			`'@type="application/rdf+xml" or '`
			`'@type="application/rss+xml")]'`
			`)`
			`# xpath_query = """//link[(@rel="alternate") and (@type="application/atom+xml" or @type="application/rdf+xml" or @type="application/rss+xml")]/@href"""`
			`# xpath_query = "//link[@rel='alternate' and @type='application/atom+xml' or @rel='alternate' and @type='application/rss+xml' or @rel='alternate' and @type='application/rdf+xml']/@href"`
			`feeds = tree.xpath(xpath_query)`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`if feeds:`
			`urls = []`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`for feed in feeds:`
			`# # The following code works;`
			`# # The following code will catch`
			`# # only valid resources (i.e. not 404);`
			`# # The following code requires more bandwidth.`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`# res = await fetch.http(feed)`
Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`# if res[0]:`
			`# disco = parse(res[0])`
			`# title = disco["feed"]["title"]`
			`# msg += "{} \n {} \n\n".format(title, feed)`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00
			`# feed_name = feed.xpath('@title')[0]`
			`# feed_addr = join_url(url, feed.xpath('@href')[0])`

Segregate code into more particular functions 2024-01-04 02:16:24 +01:00			`# if feed_addr.startswith("/"):`
			`# feed_addr = url + feed_addr`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`address = join_url(url, feed.xpath('@href')[0])`
			`if address not in urls:`
			`urls.extend([address])`
			`# breakpoint()`
			`# print("feed_mode_auto_discovery")`
			`urls = await process_feed_selection(url, urls)`
			`return urls`


			`# TODO Segregate function into function that returns`
			`# URLs (string) and Feeds (dict) and function that`
			`# composes text message (string).`
			`# Maybe that's not necessary.`
			`async def process_feed_selection(url, urls):`
			`feeds = {}`
			`for i in urls:`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`result = await fetch.http(i)`
			`if not result['error']:`
			`document = result['content']`
			`status_code = result['status_code']`
			`if status_code == 200: # NOTE This line might be redundant`
			`try:`
			`feeds[i] = [parse(document)]`
			`except:`
			`continue`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`message = (`
			"Web feeds found for {}\n\n```\n"
			`).format(url)`
Add for Add Subscription. Segregating more code as one task per function, particularly adding of new subscription. Fix allow/deny keys. 2024-02-16 02:46:04 +01:00			`urls = []`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`for feed_url in feeds:`
			`# try:`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`# res = await fetch.http(feed)`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`# except:`
			`# continue`
			`feed_name = None`
			`if "title" in feeds[feed_url][0]["feed"].keys():`
			`feed_name = feeds[feed_url][0].feed.title`
			`feed_name = feed_name if feed_name else "Untitled"`
			`# feed_name = feed_name if feed_name else urlsplit(feed_url).netloc`
			`# AttributeError: 'str' object has no attribute 'entries'`
			`if "entries" in feeds[feed_url][0].keys():`
			`feed_amnt = feeds[feed_url][0].entries`
			`else:`
			`continue`
			`if feed_amnt:`
			`# NOTE Because there could be many false positives`
Add for Add Subscription. Segregating more code as one task per function, particularly adding of new subscription. Fix allow/deny keys. 2024-02-16 02:46:04 +01:00			`# which are revealed in second phase of scan, we`
			`# could end with a single feed, which would be`
			`# listed instead of fetched, so feed_url_mark is`
			`# utilized in order to make fetch possible.`
			`# NOTE feed_url_mark was a variable which stored`
			`# single URL (probably first accepted as valid)`
			`# in order to get an indication whether a single`
			`# URL has been fetched, so that the receiving`
			`# function will scan that single URL instead of`
			`# listing it as a message.`
Fix keywords extracted from sqlite. Improve modiles fetch and crawl. Add form featured feeds. Add form roster manager. Add form subscibers manager. WIP 2024-02-18 00:21:44 +01:00			`url = {'link' : feed_url,`
Add for Add Subscription. Segregating more code as one task per function, particularly adding of new subscription. Fix allow/deny keys. 2024-02-16 02:46:04 +01:00			`'index' : None,`
			`'name' : feed_name,`
			`'code' : status_code,`
			`'error' : False,`
			`'exist' : None}`
			`urls.extend([url])`
			`count = len(urls)`
			`if count > 1:`
			`result = urls`
			`elif count:`
			`result = urls[0]`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`else:`
			`result = None`
			`return result`


			`# def get_discovered_feeds(url, urls):`
			`# message = (`
			# "Found {} web feeds:\n\n```\n"
			`# ).format(len(urls))`
			`# if len(urls) > 1:`
			`# for urls in urls:`
			`# message += (`
			`# "Title : {}\n"`
			`# "Link : {}\n"`
			`# "\n"`
			`# ).format(url, url.title)`
			`# message += (`
			# "```\nThe above feeds were extracted from\n{}"
			`# ).format(url)`
			`# elif len(urls) > 0:`
			`# result = urls`
			`# else:`
			`# message = (`
			`# "No feeds were found for {}"`
			`# ).format(url)`
			`# return result`


			`# Test module`
			`# TODO ModuleNotFoundError: No module named 'slixfeed'`
			`# import slixfeed.fetch as fetch`
			`# from slixfeed.action import is_feed, process_feed_selection`

			`# async def start(url):`
			`# while True:`
Detect image from xml enclosure in addition to html img 2024-01-11 11:55:42 +01:00			`# result = await fetch.http(url)`
Improve code of module crawl.py 2024-01-09 13:34:10 +01:00			`# document = result[0]`
			`# status = result[1]`
			`# if document:`
			`# feed = parse(document)`
			`# if is_feed(feed):`
			`# print(url)`
			`# else:`
			`# urls = await probe_page(`
			`# url, document)`
			`# if len(urls) > 1:`
			`# await process_feed_selection(urls)`
			`# elif urls:`
			`# url = urls[0]`
			`# else:`
			`# response = (`
			`# "> {}\nFailed to load URL. Reason: {}"`
			`# ).format(url, status)`
			`# break`
			`# return response`

			`# url = "https://www.smh.com.au/rssheadlines"`
			`# start(url)`