charfreq.py with syntax coloring

This HTML file was generated with Kalle's syntaxcolor.py


   1"""
   2Prints statistics of characters in a text file.
   3By Kalle (http://qalle.net)
   4"""
   5
   6import sys
   7import os.path
   8import unicodedata
   9
# Encoding assumed for the source file when the SourceEncoding argument is
# not given on the command line.
DEFAULT_SOURCE_ENCODING = "utf_8"
# Sort order used when the Order argument is not given ("C" = codepoint).
DEFAULT_ORDER = "C"
# Name printed for characters that unicodedata has no name for.
UNKNOWN_CHAR_NAME = "(unknown)"
# NOTE(review): the two TARGET_* constants are not referenced by the code
# visible in this file -- presumably output settings; confirm before relying
# on or removing them.
TARGET_ENCODING = "ascii"
TARGET_NEWLINES = "\n"

# Usage message shown when the number of command line arguments is wrong.
# The backslashes right after the opening quotes and at the end of the last
# line keep the string from starting or ending with a newline.
HELP_TEXT = """\
Prints statistics of characters in a text file.

Arguments: SourceFile [SourceEncoding [Order]]
    SourceFile
        The name of the file to read.
    SourceEncoding
        The character encoding of SourceFile. Optional. The default value is
        utf_8
        Other possible values: e.g. utf_16, cp1252, cp437
        More encodings:
        http://docs.python.org/3/library/codecs.html#standard-encodings
    Order
        The order in which results are printed. Optional. If specified,
        SourceEncoding must also be specified.
        "c" or "C" = codepoint (ascending; default)
        "n" or "N" = name (ascending)
        "g" or "G" = General Category (ascending)
        "f" or "F" = frequency (descending)\
"""
  36
  37def count_chars(hnd):
  38    """
  39    Calculate the frequencies of codepoints in a file. Also count lines.
  40
  41    Args:
  42        hnd: file handle
  43
  44    Return: (
  45        {CP: freq, CP: freq, ...},
  46        lineCount
  47    )
  48    """
  49
  50    hnd.seek(0)
  51    freqs = {}
  52    lineCount = 0
  53
  54    for line in hnd:
  55        for char in line:
  56            codepoint = ord(char)
  57            freqs[codepoint] = freqs.get(codepoint, 0) + 1
  58
  59        lineCount += 1
  60
  61    return (freqs, lineCount)
  62
  63def sort_CPs(CPFreqs, order):
  64    """
  65    Sort codepoints.
  66
  67    Args:
  68        CPFreqs: {CP: freq, CP: freq, ...}
  69        order: how to sort
  70
  71    Return: (CP1, CP2, ...)
  72    """
  73
  74    # secondary sort (or the only one if we order by codepoint)
  75    orderedCPs = sorted(CPFreqs)
  76
  77    # primary sort (if we don't order by codepoint)
  78    if order == "N":
  79        orderedCPs.sort(key = lambda CP: unicodedata.name(chr(CP), ""))
  80    elif order == "G":
  81        orderedCPs.sort(key = lambda CP: unicodedata.category(chr(CP)))
  82    elif order == "F":
  83        orderedCPs.sort(key = lambda CP: CPFreqs[CP], reverse = True)
  84
  85    return tuple(orderedCPs)
  86
  87def create_line_format(CPFreqs):
  88    """
  89    Create a string for printing every line using .format()
  90
  91    Args:
  92        CPFreqs: {CP: freq, CP: freq, ...}
  93
  94    Return: format string
  95    """
  96
  97    maxCodepoint = max(CPFreqs)
  98    maxFreq = max(CPFreqs.values())
  99    totalCharCount = sum(CPFreqs.values())
 100
 101    # max decimal codepoint length
 102    maxDecCodepointLen = len(str(maxCodepoint))
 103
 104    # max hexadecimal codepoint length
 105    maxHexCodepointLen = len(format(maxCodepoint, "x"))
 106
 107    # max character name length
 108    maxNameLen = max(len(unicodedata.name(chr(CP), "")) for CP in CPFreqs)
 109    maxNameLen = max(maxNameLen, len(UNKNOWN_CHAR_NAME))
 110
 111    # max frequency length
 112    maxFreqLen = len(str(maxFreq))
 113
 114    # max percentage length
 115    maxPercentageLen = len(format(maxFreq / totalCharCount * 100, ".2f"))
 116
 117    return " ".join((
 118        "{CP:" + str(maxDecCodepointLen) + "d}",
 119        "{CP:" + str(maxHexCodepointLen) + "X}",
 120        "{cat:s}",
 121        "{name:" + str(maxNameLen) + "s}",
 122        "{freq:" + str(maxFreqLen) + "d}",
 123        "{percentage:" + str(maxPercentageLen) + ".2f}%",
 124    ))
 125
 126def print_chars(
 127    CPFreqs, orderedCPs, fileName, sourceEncoding, fileSize, lineCount
 128):
 129    """
 130    Print list of characters.
 131
 132    Args:
 133        CPFreqs: {CP: freq, CP:freq, ...}
 134        orderedCPs: (CP1, CP2, ...)
 135        fileName: input file name
 136        sourceEncoding: input file encoding
 137        fileSize: input file size
 138        lineCount: input file line count
 139    """
 140
 141    fileName = (
 142        fileName.encode("ascii", errors = "backslashreplace").decode("ascii")
 143    )
 144    lineFormat = create_line_format(CPFreqs)
 145    totalCharCount = sum(CPFreqs.values())
 146    uniqueCharCount = len(CPFreqs)
 147
 148    print('{:18s}: "{:s}"'.format("file", fileName))
 149    print("{:18s}: {:s}".format("encoding", sourceEncoding))
 150    print("{:18s}: {:d}".format("bytes", fileSize))
 151    print("{:18s}: {:d}".format("characters", totalCharCount))
 152    print("{:18s}: {:d}".format("unique characters", uniqueCharCount))
 153    print("{:18s}: {:d}".format("lines", lineCount))
 154    print()
 155
 156    print(
 157        "columns: codepoint (base 10&16), General Category, name, frequency, "
 158        "percentage"
 159    )
 160    print()
 161
 162    for CP in orderedCPs:
 163        char = chr(CP)
 164        category = unicodedata.category(char)
 165        freq = CPFreqs[CP]
 166
 167        name = unicodedata.name(char, UNKNOWN_CHAR_NAME)
 168
 169        percentage = freq / totalCharCount * 100
 170
 171        print(lineFormat.format(
 172            CP = CP,
 173            cat = category,
 174            name = name,
 175            freq = freq,
 176            percentage = percentage
 177        ))
 178
 179def main():
 180    if len(sys.argv) == 2:
 181        source = sys.argv[1]
 182        sourceEncoding = DEFAULT_SOURCE_ENCODING
 183        order = DEFAULT_ORDER
 184    elif len(sys.argv) == 3:
 185        (source, sourceEncoding) = sys.argv[1:]
 186        order = DEFAULT_ORDER
 187    elif len(sys.argv) == 4:
 188        (source, sourceEncoding, order) = sys.argv[1:]
 189        order = order.upper()
 190    else:
 191        exit(HELP_TEXT)
 192
 193    if order not in ("C", "N", "G", "F"):
 194        exit("Error: invalid order argument.")
 195
 196    # read source file
 197    try:
 198        with open(source, "rt", encoding = sourceEncoding, newline = "") as \
 199        hnd:
 200            fileSize = hnd.seek(0, 2)
 201            if fileSize == 0:
 202                exit("Error: file is empty.")
 203
 204            (CPFreqs, lineCount) = count_chars(hnd)
 205    except LookupError:
 206        exit("Error: no such encoding.")
 207    except UnicodeError:
 208        exit("Error: source file isn't valid in specified encoding.")
 209    except FileNotFoundError:
 210        exit("Source file not found.")
 211    except PermissionError:
 212        exit("Source file permission denied.")
 213    except OSError:
 214        exit("Error reading source file.")
 215
 216    # sort codepoints
 217    orderedCPs = sort_CPs(CPFreqs, order)
 218
 219    # print results
 220    baseName = os.path.basename(source)
 221    print_chars(
 222        CPFreqs, orderedCPs, baseName, sourceEncoding, fileSize, lineCount
 223    )
 224
 225if __name__ == "__main__":
 226    main()