52 lines
2 KiB
Python
52 lines
2 KiB
Python
|
#!/usr/bin/python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from bs4 import BeautifulSoup
|
||
|
|
||
|
class HtmlGmi:
    """Convert HTML markup into Gemini-style plain text with numbered links."""

    # Characters that are percent-encoded in the target of a "=>" link line.
    # None of the replacement strings contains a character from the key set,
    # so a single translate() pass equals the three chained replace() calls
    # this table stands in for.
    _URL_ESCAPES = str.maketrans({' ': '%20', '(': '%28', ')': '%29'})

    @staticmethod
    def convert_to_gmi(html_content) -> str:
        """Return the text of *html_content* with links turned into footnotes.

        The markup is parsed with BeautifulSoup's ``html.parser`` and then
        rewritten in place before the tags are stripped:

        * every ``<a>`` whose ``href`` does not start with ``#`` or ``/`` is
          replaced by ``"<link text>[n]"`` and a ``=> <url> [n]: <url>`` line
          is collected for the references section; anchors with a missing or
          empty ``href`` are left as plain text instead of raising
          ``KeyError``;
        * ``<code>`` elements holding a single string are wrapped in
          backticks;
        * ``<pre>`` elements are surrounded by triple-backtick fence lines;
        * ``<li>`` elements inside a ``<ul>`` that hold a single string get a
          ``"- "`` prefix (each item exactly once, even in nested lists).

        Every ``#`` in the extracted text is replaced by ``⋕``.

        Args:
            html_content: HTML markup handed straight to ``BeautifulSoup``.

        Returns:
            The converted text. When at least one link was collected, a
            ``### References`` section listing the link lines is appended;
            otherwise the text is returned on its own.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        link_lines = []

        # find_all() returns a materialised result list, so replacing
        # anchors while looping over it is safe.
        for anchor in soup.find_all('a'):
            url = anchor.get('href')
            # Skip anchors without a target as well as in-page ("#...") and
            # site-relative ("/...") links; those keep only their text.
            if not url or url.startswith(('#', '/')):
                continue
            number = len(link_lines) + 1
            escaped_url = url.translate(HtmlGmi._URL_ESCAPES)
            link_lines.append(f"=> {escaped_url} [{number}]: {url}")
            anchor.replace_with(f"{anchor.get_text()}[{number}]")

        # Inline code: only elements that resolve to a single string are
        # wrapped, because .string is None for mixed content.
        for code in soup.find_all('code'):
            if code.string:
                code.string.replace_with('`' + code.get_text() + '`')

        # Preformatted blocks: fence them so their content stays verbatim.
        for pre in soup.find_all('pre'):
            pre.insert_before('\n```\n')
            pre.insert_after('\n```\n')

        # Bullet items: walk the <li> elements directly instead of looping
        # per <ul>; a per-<ul> recursive search would visit items of nested
        # lists once for every enclosing <ul> and prefix them repeatedly.
        for item in soup.find_all('li'):
            if item.find_parent('ul') is not None and item.string:
                item.string.replace_with('- ' + item.get_text())

        # '#' is swapped for a look-alike before the references header is
        # added, so the header's own '#' characters are left untouched.
        text = soup.get_text().strip().replace('#', '⋕')

        if not link_lines:
            return text
        return f"{text}\n\n" + "\n".join(['### References\n', *link_lines])
|