syntaxcolor.py with syntax coloring

This HTML file was generated with Kalle's syntaxcolor.py


   1"""
   2Reads a Python/OpenSCAD/QuickBASIC program and generates an HTML file with
   3syntax coloring. The parser isn't very smart.
   4By Kalle (http://qalle.net)
   5"""
   6
import itertools
import os.path
import string
import sys
  10
  11# copied from Python documentation; incomplete
  12PYTHON_KEYWORD_FILE = "syntaxcolor-keywords-py.txt"
  13
  14# copied from cheatsheet
  15OPENSCAD_KEYWORD_FILE = "syntaxcolor-keywords-scad.txt"
  16
  17# copied from QuickBASIC 4.5 help
  18QUICKBASIC_KEYWORD_FILE = "syntaxcolor-keywords-qb.txt"
  19
  20HELP_TEXT = """\
  21Reads a Python/OpenSCAD/QuickBASIC program and generates an HTML file with
  22syntax coloring. The parser isn't very smart.
  23
  24Args: SourceFile TargetFile
  25    SourceFile
  26        Name of file to read. Language and character encoding are autodetected
  27        based on file extension:
  28            ".py"   = Python 3, UTF-8
  29            ".scad" = OpenSCAD, UTF-8
  30            ".bas"  = QuickBASIC 4.5, code page 437
  31    TargetFile
  32        Name of HTML file to (over)write.\
  33"""
  34
  35# initial HTML
  36HTML_START = """\
  37<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">
  38<html>
  39<head>
  40<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  41<title>Colored syntax</title>
  42<style type="text/css">
  43
  44* { background:#000000; color:#ffffff; }
  45h1, h2, p { font-family:sans-serif; }
  46pre { font-size:large; }
  47a[href] { color:#ff40ff; }
  48span.kw { font-weight:bold; color:#00ff00; }
  49span.com { color:#00ffff; }
  50span.triquo { color:#ffff40; }
  51span.lnnum
  52{ background:#202020; color:#a0a0a0; padding:0 .25em; margin:0 .5em 0 0; }
  53
  54</style>
  55</head>
  56<body>
  57"""
  58
  59# final HTML
  60HTML_END = """
  61</body>
  62</html>\
  63"""
  64
  65def read_keyword_file(file, charsAllowed):
  66    keywords = set()
  67
  68    try:
  69        with open(file, "rt", encoding = "ascii") as hnd:
  70            for line in hnd:
  71                keyword = line.rstrip()
  72                if not all(char in charsAllowed for char in keyword):
  73                    exit("Invalid character in keyword file.")
  74                # search for duplicates
  75                #if keyword in keywords:
  76                #    exit('Keyword appears twice: "{:s}"'.format(keyword))
  77                keywords.add(keyword)
  78    except UnicodeError:
  79        exit("Keyword file is not a valid ASCII file.")
  80    except OSError:
  81        exit("Error reading keyword file.")
  82
  83    return keywords
  84
  85def HTML_special_chars(text):
  86    return (
  87        text
  88        .replace("&", "&amp;")
  89        .replace("<", "&lt;")
  90        .replace(">", "&gt;")
  91    )
  92
  93def split_line_to_words(line, keywordChars):
  94    """
  95    Split line to words. Each of them is either all keywordChars or all
  96    non-keywordChars
  97    Format: [(isRealWord, word), (isRealWord, word), ...]
  98    """
  99
 100    words = []
 101
 102    for (pos, char) in enumerate(line):
 103        isRealWordChar = (char in keywordChars)
 104
 105        if pos == 0:
 106            wordStart = 0
 107            isRealWord = isRealWordChar
 108        elif isRealWordChar != isRealWord:
 109            words.append((isRealWord, line[wordStart:pos]))
 110            wordStart = pos
 111            isRealWord = isRealWordChar
 112
 113    words.append((isRealWord, line[wordStart:]))
 114
 115    return words
 116
 117def color_ordinary_line(line, keywordChars, keywords, comments):
 118    if line == "":
 119        # empty line
 120        return ""
 121
 122    if any(line.startswith(comment) for comment in comments):
 123        # comment line
 124        return '<span class="com">{:s}</span>'.format(line)
 125
 126    words = split_line_to_words(line, keywordChars)
 127    printableWords = []
 128
 129    for (isRealWord, word) in words:
 130        if isRealWord and word in keywords:
 131            printableWords.append('<span class="kw">{:s}</span>'.format(word))
 132        elif isRealWord:
 133            printableWords.append(word)
 134        else:
 135            printableWords.append(HTML_special_chars(word))
 136
 137    return "".join(printableWords)
 138
 139def format_line(line, inTripleQuotes, keywordChars, keywords, comments):
 140    """Return printable line and new state of inTripleQuotes in a tuple."""
 141
 142    if line.startswith('"""') and line.endswith('"""') and line != '"""':
 143        # a single-line triple quotes block
 144        if inTripleQuotes:
 145            exit(
 146                "This program doesn't support two instances of triple "
 147                "quotes per line when already inside triple quotes."
 148            )
 149        return (
 150            '<span class="triquo">{:s}</span>'.format(
 151                HTML_special_chars(line)
 152            ),
 153            False
 154        )
 155
 156    if line.startswith('"""') or line.endswith('"""') or \
 157    line.endswith('"""\\'):
 158        # multi-line triple quotes block starts or ends
 159        return (
 160            '<span class="triquo">{:s}</span>'.format(
 161                HTML_special_chars(line)
 162            ),
 163            not inTripleQuotes
 164        )
 165
 166    if inTripleQuotes:
 167        # no triple quotes on this line, but we're inside a triple quotes block
 168        return (
 169            '<span class="triquo">{:s}</span>'.format(
 170                HTML_special_chars(line)
 171            ),
 172            True
 173        )
 174
 175    # ordinary line
 176    return (
 177        color_ordinary_line(line, keywordChars, keywords, comments),
 178        False
 179    )
 180
 181def print_file(sourceHnd, targetHnd, keywordChars, keywords, comments):
 182    # are we currently inside triple quotes
 183    inTripleQuotes = False
 184
 185    for (lineNum, line) in enumerate(sourceHnd):
 186        # remove trailing whitespace
 187        line = line.rstrip()
 188
 189        # remove leading whitespace, remember indent length
 190        indent = len(line) - len(line.lstrip())
 191        line = line.lstrip()
 192
 193        (printableLine, inTripleQuotes) = \
 194        format_line(line, inTripleQuotes, keywordChars, keywords, comments)
 195
 196        print('<span class="lnnum">{:4d}</span>{:s}{:s}'.format(
 197            lineNum + 1, indent * " ", printableLine
 198        ), file = targetHnd)
 199
 200    if inTripleQuotes:
 201        exit("Error: inside triple quotes at end of file.")
 202
 203def main():
 204    if len(sys.argv) != 3:
 205        exit(HELP_TEXT)
 206
 207    # read args
 208    (source, target) = sys.argv[1:]
 209
 210    # source and target files must not be the same
 211    try:
 212        if os.path.samefile(source, target):
 213            exit("Source and target files must not be the same.")
 214    except OSError:
 215        pass
 216
 217    # language-specific settings (comments = strings that a comment line can
 218    # start with)
 219    if source.lower().endswith(".py"):
 220        # Python 3
 221        keywordChars = string.ascii_letters + string.digits + "_"
 222        keywordFile = PYTHON_KEYWORD_FILE
 223        comments = ("#", "//")
 224        sourceEncoding = "utf8"
 225    elif source.lower().endswith(".scad"):
 226        # OpenSCAD
 227        keywordChars = string.ascii_lowercase + string.digits + "_"
 228        keywordFile = OPENSCAD_KEYWORD_FILE
 229        comments = ("//",)
 230        sourceEncoding = "utf8"
 231    elif source.lower().endswith(".bas"):
 232        # QuickBASIC 4.5
 233        keywordChars = string.ascii_uppercase + "$"
 234        keywordFile = QUICKBASIC_KEYWORD_FILE
 235        comments = ("'", "REM ")
 236        sourceEncoding = "cp437"
 237    else:
 238        exit('Source file extension must be ".py" or ".bas".')
 239
 240    # read language-specific keywords from file
 241    keywords = read_keyword_file(keywordFile, keywordChars)
 242
 243    # read source file and write target file
 244    try:
 245        with open(source, "rt", encoding = sourceEncoding) as sourceHnd, \
 246        open(target, "wt", encoding = "utf8", newline = "\n") as targetHnd:
 247            sourceHnd.seek(0)
 248            targetHnd.seek(0)
 249
 250            print(HTML_START, file = targetHnd)
 251            print("<h1><tt>{:s}</tt> with syntax coloring</h1>".format(
 252                HTML_special_chars(os.path.basename(source))
 253            ), file = targetHnd)
 254            print(
 255                "<p>This HTML file was generated with "
 256                "<a href=\"http://qalle.net\">Kalle</a>'s "
 257                "<tt>syntaxcolor.py</tt></p>",
 258                file = targetHnd
 259            )
 260            print("<hr>", file = targetHnd)
 261            print("<pre>", file = targetHnd)
 262
 263            print_file(sourceHnd, targetHnd, keywordChars, keywords, comments)
 264
 265            print("</pre>", file = targetHnd)
 266            print(HTML_END, file = targetHnd)
 267    except UnicodeError:
 268        exit("Source file is not a valid {:s} file.".format(sourceEncoding))
 269    except OSError:
 270        exit("Error reading source file or writing target file.")
 271
 272if __name__ == "__main__":
 273    main()