modseccfg: Artifact [b4e0fa5b6d]

Artifact b4e0fa5b6de7a373524732db8257d9821db6747fa02768d583ea90d0f32124f8:

Executable file html2mallard/html2mallard.py — part of check-in [12587995cd] at 2022-10-21 22:50:09 on branch trunk — flex decription tag detection (user: mario size: 11136)
#!/usr/bin/env python3
# api: cli
# encoding: utf-8
# type: transform
# title: HTML to mallard
# description: convert mkdocs´ html output to mallard/yelp xml
# category: documentation
# keywords: mkdocs mallard
# version: 0.3.0
# depends: python (>= 3.6), python:PyYAML (>= 5.0), python:markdown
# license: Public Domain
# url: https://fossil.include-once.org/modseccfg/wiki/html2mallard
# 
# Poor transformation approach, mostly salvaging some HTML structures
# and reshuffling document body into mallard <page> with allowed
# inline markup.
#
# XSLT might have been easier, but doesn't work on most HTML.
# BS/lxml is way overkill for this task (hence zero such tools).
# Noone's doing a markdown to ducktype/mallard converter either.
#
# Kinda only works because the mkdocs/markdown-generated HTML is
# fairly consistent. It's best combined with a `xmllint --recover`
# pipe anyhow.


import os, sys
import re, html
from textwrap import dedent, indent
from glob import glob
debug = True and re.search(" -+de?b?u?g?\\b", " ".join(sys.argv), re.I)

# output
template = dedent("""
    <page xmlns="http://projectmallard.org/1.0/"
     type="guide" id="{id}">

    <info>
        {links}
        <desc>{desc}</desc>
        <?http header="X-Generator: html2mallard" ?>
    </info>

    <title>{title}</title>

    {body}

    </page>
""").strip()

# regex all the way
extract = {
    # meta info
    "mkdocs_page_name = \"(.*?)\";": "title",
    "<title>(?:\w+:\s)?(.+?)</title>": "title",
    "<(?:h1|h2)>([^<]+)</(?:h1|h2)>": "title",
    '''<meta\s+name=["']?description["']?\s+content=["'](.+?)["'][^>]*>''': "desc",
    '<a class="reference internal" href="(\w+).html">.+?</a>': "links",
    '<a class="trail" href="(\w+).html(#.+?)?" title=".+?">': "links",
    # flags
    '(<.+>)': "is_html",
    '(mkdocs)': "is_mkdocs",
    'data-target="[#.]navbar-(collapse)"': "is_material",
    '(fossil|timeline)': "is_fossil",
    "(SphinxRtdTheme|readthedocs-doc-embed.js|aria-label=)": "is_sphinx",
    '(<div class="inner pagewide">)': "is_yelphtml",
     "(&\w+;)": "has_entities",
    '(<p>|<div|<table|<li>|<img|<strong|<em|<h\d|<span|<code)': "convert",
}
rewrite = {
    # trim and cleanup
    ("GENERAL HTML", "is_html"): {
        "<script.+?</script>": "",
        "<head>.+?</head>": "",
        "<!DOCTYPE[^>]+>|<html[^>]*>|</body>|</html>": "",
        "<span></span>": "",
    },
    ("MKDOCS", "is_mkdocs"): {
        "\\A.+?</nav>": "",   # might strip too much for any bottom-navigation templates
        "\\A.+?<div[^>]+role=\"main\">": "",   # mkdocs RTD template
        '<footer>.+\\Z': "",    # mkdocs footer
        'Next\s<span\sclass="icon\sicon-circle-arrow-right"></span>.+\\Z': "",   # mkdocs RTD theme
    },
    ("MATERIAL", "is_material"): {
        '\\A.+<div[^>]+role="main">': "",
        '<div\sclass="modal"\sid="mkdocs_search_modal".+\\Z': "",
    },
    ("FOSSIL WIKI", "is_fossil"): {
        '\\A.+<main[^>]*>': "", # wiki header
        '<div\sclass="submenu">.+?</div>': "", # page header
        '<footer\sid=fossil-footer>.+\\Z': "", # fossil footer
        '<h2>Attachments:</h2><ul>.+\\Z': "", # page footer
    },
    ("RTD.IO/SPHINX", "is_sphinx"): {
        "\\A.+?</nav>": "",   # might strip too much for any bottom-navigation templates
        '<footer>.+\\Z': "", 
        '<div\srole="navigation"\saria-label="breadcrumbs\snavigation">.+?</div>': "",  # RTD.io
    },
    ("YELPHTML", "is_yelphtml"): {
        "\\A.+?</header><article>": "",
    },
    ("ENTITIES", "has_entities"): {
        "&rarrq;": "→",
        "&nbsp;": "␣",
        "&mdash;": "–",
        "&(?!lt|gt|amp)\w+;": lambda m: html.unescape(m[0]),
    },

    # actual conversions
    ("CONVERSIONS", "convert"): {
        "<div\sclass=\"admonition\s(?:note|abstract|summary|tldr)\">(.+?)</div>": "<note style=\"tip\">\\1</note>",
        "<div\sclass=\"admonition\s(?:todo|seealso)\">(.+?)</div>": "<note style=\"advanced\">\\1</note>",
        "<div\sclass=\"admonition\s(?:danger|error|failure|fail|missing|bug)\">(.+?)</div>": "<note style=\"bug\">\\1</note>",
        "<div\sclass=\"admonition\s(?:info|todo)\">(.+?)</div>": "<note style=\"important\">\\1</note>",
        "<div\sclass=\"admonition\s(?:example|quote|cite)\">(.+?)</div>": "<note style=\"plain\">\\1</note>",
        "<div\sclass=\"admonition\s(?:question|help|faq)\">(.+?)</div>": "<note style=\"sidebar\">\\1</note>",
        "<div\sclass=\"admonition\s(?:notes|tip|hint|important)\">(.+?)</div>": "<note style=\"tip\">\\1</note>",
        "<div\sclass=\"admonition\s(?:warning|caution|attention)\">(.+?)</div>": "<note style=\"warning\">\\1</note>",
        "<div\sclass=\"admonition(?:\s\w+)?\">(.+?)</div>": "<note style=\"tip\">\\1</note>",
        "<p\sclass=\"admonition-title\">(.+?)</p>": "<subtitle>\\1</subtitle>",
        # headlines
        "(<h\d[^>]*>.+?(?<!\s))\s*(?=<h\d|<footer|</body|\Z)": "\n<section>\n\\1\n</section>\n",
        "<(?:h1|h2)[^>]*>(.+?)</(?:h1|h2)>": "<title>\\1</title>",
        "<(?:h3|h4)[^>]*>(.+?)</(?:h3|h4)>": "<subtitle>\\1</subtitle>",
        "<(?:h5|h6)[^>]*>(.+?)</(?:h5|h7)>": "<em>\\1</em>",
        "<strong>(.+?)</strong>": "<em style=\"strong\">\\1</em>",
        # lists
        "<ol[^>]*>(.+?)</ol>": "<steps>\\1</steps>",
        "<ul[^>]*>(.+?)</ul>": "<list>\\1</list>",
        "<li\\b[^>]*>(.+?)</li>": "<item><p>\\1</p></item>",
        "<dl[^>]*>(.+?)</dl>": "<terms>\\1</terms>",
        "<dt[^>]*>(.+?)</dt>": "<item><title>\\1</title>",
        "<dd[^>]*>(.+?)</dd>": "<p>\\1</p></item>",
        # fix nested list   \1         \2                 \3                      \4    
        "(<(?:item|steps|terms)>)<p> ([^<]+(?<!\s)) \s* <(list|steps|terms)> \s* (.+?) </\\3>":
            "\\1<p>\\2</p>\n <\\3>\n<item><p>\\4 </\\3>\n</item>",
        # links
        "<a\shref=\"([^\">]+)\.html\"[^>]*>(.+?)</a>": "<link type=\"seealso\" xref=\"\\1\">\\2</link>",
        "<a\shref=\"(\w+://[^\">]+)\"[^>]*>(.+?)</a>": "<link type=\"seealso\" href=\"\\1\">\\2</link>",
        "<a\shref=\"(\#[\w\-]+)\"[^>]*>(.+?)</a>": "<link xref=\"\\1\">\\2</link>",
        # media
        "<img[^>]+src=\"(.+?)\"[^>]*>": "<media type=\"image\" src=\"\\1\" mime=\"image/png\" />",
        # tables
        "</?tbody>": "",
        "<table[^>]*>": "<table shade=\"rows cols\" rules=\"rows cols\"><tbody>",
        "</table>": "</tbody></table>",
        "<tr[^>]*>": "<tr>",
        "<(td|th)\\b[^>]*>": "    <td><p>",
        "</(td|th)\\b[^>]*>": "</p></td>",

        # strip codehilite markup
        "<span\sclass=\"\w{1,2}\">(.+?)</span>": "<span>\\1</span>",
        "<span\sclass=\"([\w\-\s]+)\">(.+?)</span>": "<span style=\"\\1\">\\2</span>",
        "<code\sclass=\"([\w\-\s]+)\">(.+?)</code>": "<code><span style=\"\\1\">\\2</span></code>",
    },
     
    ("HTML BEGONE", "is_html"): { 
        # strip any remaining non-mallard tags, except: |div|revision|thead
        """</? 
           (?!(?:page|section|info|credit|link|link|title|desc|title|keywords|license|desc|
           years|email|name|links|code|media|p|screen|quote|comment|example|figure|listing|
           note|synopsis|list|item|steps|item|terms|item|tree|item|table|col|colgroup|tr|
           tbody|tfoot|td|th|title|subtitle|desc|cite|app|code|cmd|output|em|file|gui|guiseq|hi|
           link|media|keyseq|key|span|sys|input|var)\\b)
           \w+[^>]* >""": "",
    },

    ("PRETTIFY", "is_html"): {
        # prettify sections
        "(<section>)(.+?)(</section>)": lambda m: f"{m[1]}\n{indent(m[2].strip(), prefix=' ')}\n{m[3]}",
        # strip lone </section>, empty spans
        "(<section>.+?</section>)|</section>": "\\1",
        "(<span[^>]*></span>)": "",
        "(<p[^>]*><p[^>]*>)(.+?)(</p></p>)": "<p>\\2</p>",
    }
}

def convert(html, fn):
    """
        Convert HTML to mallard page document.
        
        Parameters
        ----------
        html : str
            HTML page source (`<html>...`)
        fn : str
            Original filename (`index.html`)
            
        Returns
        -------
        str
            Converted mallard xml .page source
    """

    # prepare snippets for .format kwargs
    kw = {
        "id": re.sub("\W+", "_", re.sub("^.+/|\.\w+$", "", fn)).lower(),
        "desc": "",
        "title": "",
        "body": "",
        "links": "",
    }
    for rx, name in extract.items():
        m = re.search(rx, html)
        if m and (not name in kw or not kw[name]):
            if name == "links":
                kw[name] = ["".join(row) for row in re.findall(rx, html)]
            else:
                kw[name] = re.sub("&\w+;|<.+?>", "", m.group(1))
    if kw["links"]:
        kw["links"] = indent("\n".join(f"<link type=\"guide\" xref=\"{id}\"/>" for id in kw["links"]), prefix="    ")
    if kw["id"] != "index":
        kw["links"] = """<link type="guide" xref="index#nav"/>\n""" + kw["links"]
    if not kw["title"]:
        kw["title"] = re.sub("^.+/|\.\w+$", "", fn).title()
        
    # simplify/convert html
    for (group, flag), patterns in rewrite.items():
        if not flag in kw: # possibly skip rule group
            continue
        elif debug:
            sys.stderr.write(f"group: {group}\n")
        for rx, repl in patterns.items():
            l = len(html)
            html = re.sub(rx, repl, html, 0, re.X|re.M|re.S|re.I)
            if debug and l != len(html):
                sys.stderr.write(f"rewrite: {len(html) - l} bytes, pattern: ~{rx}~\n")
    kw["body"] = html
    
    # return converted
    if kw["id"] == "index":
        kw["body"] = """<section id="nav">\n  <!--<title>Topics</title>-->\n</section>\n""" + kw["body"]
    return template.format(**kw)

# single file
def convert_file(fn):
    html = ""
    if re.match("https?://.+", fn):     # → html2mallard http://page.html
        import requests
        html = requests.get(fn).text
        fn = re.sub(".+/", "", fn)
    else:                               # → html2mallard "site/index.html"
        with open(fn, "r", encoding="utf-8") as f:
            html = f.read()
    if re.search("\.md$", fn):          # → html2mallard page.md
        import markdown
        html = markdown.markdown(html)
    return convert(html, fn)

# process directory
def mkdocs():
    import yaml
    src = open("mkdocs.yml", "r")   # → ought to be in current directory
    cfg = yaml.load(src, Loader=yaml.Loader)
    srcdir = cfg["site_dir"]
    target = cfg["mallard_dir"]    # → required param in mkdocs.yml
    if not os.path.exists(target):
        os.makedirs(target)
    for fn in glob(f"{srcdir}/*.html"):
        if debug:
            sys.stderr.write(f"--\nFILE: '{fn}' to {target}/*.page\n")
        page = convert_file(fn)
        fn = re.sub(".+/", "", fn)
        fn = re.sub("\.html", ".page", fn)
        with open(f"{target}/{fn}", "w", encoding="utf-8") as f:
            f.write(page)

# entry_points
def main():
    if len(sys.argv) >= 2:
        print(convert_file(sys.argv[1])) # first argument as input file
    else:
        mkdocs() # iterate through site/*html

if __name__ == "__main__":
    main()