Rivista/rivista/html/gmi.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

class HtmlGmi:
    """Convert HTML markup to plain text with ``=>`` link lines ("gmi")."""

    # Characters that are percent-encoded in link targets before emission.
    _URL_ESCAPES = str.maketrans({' ': '%20', '(': '%28', ')': '%29'})

    @staticmethod
    def convert_to_gmi(html_content):
        """Return *html_content* rendered as text followed by link references.

        Processing steps, in order:

        * Every ``<a>`` whose ``href`` does not start with ``#`` or ``/`` is
          replaced by ``"<link text>[<n>]"`` and a matching
          ``"=> <escaped url> [<n>]: <url>"`` line is collected.
        * ``<code>`` elements holding a single string are wrapped in
          backticks.
        * ``<pre>`` elements are surrounded by triple-backtick fence lines.
        * ``<li>`` elements (inside ``<ul>``) holding a single string are
          prefixed with ``"- "``.
        * All remaining markup is dropped and every ``#`` in the body text
          is replaced with ``⋕``.

        Args:
            html_content: HTML markup, passed as-is to ``BeautifulSoup``.

        Returns:
            The converted text. When at least one link was collected, a
            ``### References`` section listing the links is appended;
            otherwise only the body text is returned.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        link_lines = []

        # Turn external links into numbered footnote markers.
        for anchor in soup.find_all('a'):
            url = anchor.get('href')
            if url is None:
                # Anchors without an href (e.g. named anchors) have no
                # target to reference; leave them so only their text is
                # kept by get_text() below.
                continue
            if url.startswith(('#', '/')):
                # Fragment and site-relative links are not listed.
                continue
            number = len(link_lines) + 1
            escaped_url = url.translate(HtmlGmi._URL_ESCAPES)
            link_lines.append(f"=> {escaped_url} [{number}]: {url}")
            anchor.replace_with(f"{anchor.get_text()}[{number}]")

        # Wrap inline code in backticks. Only elements whose content is a
        # single string (``.string`` is set) are handled.
        for code in soup.find_all('code'):
            if code.string:
                code.string.replace_with('`' + code.get_text() + '`')

        # Fence preformatted blocks.
        for pre in soup.find_all('pre'):
            pre.insert_before('\n```\n')
            pre.insert_after('\n```\n')

        # Prefix list items with a bullet. As with <code>, only items whose
        # content is a single string are handled.
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                if li.string:
                    li.string.replace_with('- ' + li.get_text())

        # Strip the remaining markup; '#' is swapped for a look-alike
        # character in the body text (the references header added below is
        # not affected).
        text = soup.get_text().strip().replace('#', '⋕')

        if not link_lines:
            # No links were collected: do not emit an empty references
            # section.
            return text
        return f"{text}\n\n" + "\n".join(['### References\n', *link_lines])