Rivista/rivista/html/gmi.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup

class HtmlGmi:
    """Convert HTML markup into Gemini-style plain text."""

    @staticmethod
    def convert_to_gmi(html_content):
        """Return *html_content* rendered as text with footnoted links.

        Every ``<a>`` whose ``href`` is neither missing/empty nor starts with
        ``#`` or ``/`` is replaced in the text by ``<link text>[n]`` and
        listed under a trailing ``### References`` section as a ``=>`` link
        line. ``<code>`` contents are wrapped in backticks, ``<pre>`` elements
        are fenced with triple backticks, and ``<li>`` items inside ``<ul>``
        are prefixed with ``- ``.

        Args:
            html_content: HTML markup accepted by ``BeautifulSoup``.

        Returns:
            The converted text. The references section is appended only when
            at least one link was footnoted.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        footnotes = []

        # Replace external links by numbered footnote markers.
        for a in soup.find_all('a'):
            # .get() instead of ['href']: anchors such as <a name="..."> carry
            # no href and must not abort the whole conversion.
            url = a.get('href')
            # Skip missing/empty targets together with fragment ('#') and
            # root-relative ('/') links; those stay in the text unchanged.
            if not url or url.startswith(('#', '/')):
                continue
            footnote_number = len(footnotes) + 1
            # Percent-encode spaces and parentheses in the link target only;
            # the label after the marker keeps the URL as written.
            url_modified = url.replace(' ', '%20')
            url_modified = url_modified.replace('(', '%28')
            url_modified = url_modified.replace(')', '%29')
            footnotes.append(f"=> {url_modified} [{footnote_number}]: {url}")
            a.replace_with(f"{a.get_text()}[{footnote_number}]")

        # Wrap inline code in backticks (only when the tag has a single
        # string child, i.e. ``code.string`` is set).
        for code in soup.find_all('code'):
            if code.string:
                code.string.replace_with('`' + code.get_text() + '`')

        # Fence preformatted blocks.
        for pre in soup.find_all('pre'):
            pre.insert_before('\n```\n')
            pre.insert_after('\n```\n')

        # Turn simple list items into bullet points.
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                if li.string:
                    li.string.replace_with('- ' + li.get_text())

        # Strip the remaining tags. '#' in the body is swapped for a
        # look-alike character (presumably so that body text is not read as a
        # heading line -- confirm with the author).
        text = soup.get_text().strip().replace('#', '⋕')

        # Without footnoted links there is nothing to reference, so do not
        # emit an empty "References" section.
        if not footnotes:
            return text
        return f"{text}\n\n" + "\n".join(['### References\n'] + footnotes)
Modularize code; Add packaging instructions; Add modules to handle Gemini file type (no Gemini server yet); Improve handling of configuration. 2024-11-12 14:25:05 +01:00

			`#!/usr/bin/python`
			`# -*- coding: utf-8 -*-`

			`from bs4 import BeautifulSoup`

			`class HtmlGmi:`

			`def convert_to_gmi(html_content):`
			`soup = BeautifulSoup(html_content, 'html.parser')`
			`footnotes = ['### References\n']`
			`footnote_counter = 1`

			`# Extract text and links`
			`for a in soup.find_all('a'):`
			`link_text = a.get_text()`
			`url = a['href']`
			`if not url.startswith('#') and not url.startswith('/'):`
			`url_modified = url.replace(' ', '%20')`
			`url_modified = url_modified.replace('(', '%28')`
			`url_modified = url_modified.replace(')', '%29')`
			`#footnotes.append(f"=> {url_modified} [{footnote_counter}]: {link_text}")`
			`footnotes.append(f"=> {url_modified} [{footnote_counter}]: {url}")`
			`footnote_marker = f"{link_text}[{footnote_counter}]"`
			`a.replace_with(footnote_marker)`
			`footnote_counter += 1`

			`# Handle <code> tags`
			`for code in soup.find_all('code'):`
			`if code.string:`
			`original_text = code.get_text()`
			modified_text = '`' + original_text + '`'
			`code.string.replace_with(modified_text)`

			`# Handle <pre> tags`
			`for pre in soup.find_all('pre'):`
			pre.insert_before('\n```\n') # Add Markdown code block start
			pre.insert_after('\n```\n') # Add Markdown code block end

			`# Convert <ul> and <li> to Markdown bullet points`
			`for ul in soup.find_all('ul'):`
			`for li in ul.find_all('li'):`
			`if li.string:`
			`original_text = li.get_text()`
			`modified_text = '- ' + original_text`
			`li.string.replace_with(modified_text)`

			`# Get the text without HTML tags`
			`text = soup.get_text().strip().replace('#', '⋕')`

			`# Combine text with footnotes`
			`return f"{text}\n\n" + "\n".join(footnotes) if footnotes else text`