Rivista/rivista/html/gmi.py

52 lines
2 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
class HtmlGmi:
    """Convert HTML markup into plain text with Gemini-style link lines."""

    # Characters that are percent-encoded in link targets.
    _URL_ESCAPES = str.maketrans({' ': '%20', '(': '%28', ')': '%29'})

    # Heading placed above the collected link lines.
    _REFERENCES_HEADING = '### References\n'

    @staticmethod
    def convert_to_gmi(html_content):
        """Return a text rendition of *html_content* followed by its links.

        Every ``<a>`` element whose ``href`` is neither a fragment (``#...``)
        nor a root-relative path (``/...``) is replaced in the text by
        ``<link text>[n]`` and listed under a "References" heading as
        ``=> <escaped url> [n]: <original url>``.  ``<code>`` contents are
        wrapped in backticks, ``<pre>`` blocks are fenced with triple
        backticks and ``<ul>`` items are prefixed with ``- ``.

        Args:
            html_content: HTML markup as a string.  ``None`` or an empty
                string is treated as an empty document.

        Returns:
            The converted text.  The references section is appended only
            when at least one link was collected.
        """
        soup = BeautifulSoup(html_content or '', 'html.parser')
        references = []

        # Replace external links with numbered markers and collect them.
        for anchor in soup.find_all('a'):
            url = anchor.get('href')
            # Anchors without a target (e.g. <a name="...">), in-page
            # fragments and root-relative paths are left untouched.
            if not url or url.startswith(('#', '/')):
                continue
            number = len(references) + 1
            target = url.translate(HtmlGmi._URL_ESCAPES)
            references.append(f"=> {target} [{number}]: {url}")
            anchor.replace_with(f"{anchor.get_text()}[{number}]")

        # Wrap inline code in backticks.  ``.string`` is only set when the
        # element holds a single text node, so mixed content is skipped.
        for code in soup.find_all('code'):
            if code.string:
                code.string.replace_with(f"`{code.get_text()}`")

        # Fence preformatted blocks.
        for pre in soup.find_all('pre'):
            pre.insert_before('\n```\n')
            pre.insert_after('\n```\n')

        # Prefix list items with a dash.  An item nested inside several
        # <ul> elements is visited once per enclosing list.
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                if li.string:
                    li.string.replace_with('- ' + li.get_text())

        # Drop the markup; every '#' is removed from the body text
        # (presumably so stray '#' is not rendered as a heading -- confirm).
        text = soup.get_text().strip().replace('#', '')

        if not references:
            return text
        return f"{text}\n\n" + "\n".join(
            [HtmlGmi._REFERENCES_HEADING, *references]
        )