uniquewords.py with syntax coloring

This HTML file was generated with Kalle's syntaxcolor.py


   1"""
   2Reads a text file, writes list of unique words to another file.
   3"""
   4
   5import sys
   6import os.path
   7import unicodedata
   8
   9HELP_TEXT = """\
  10Reads a text file, writes list of unique words to another file.
  11
  12Definition of "word": sequence of characters that have a Unicode General
  13Category value starting with "L", separated by any other character (including
  14newline).
  15
  16Arguments: SourceFile Mode TargetFile
  17    SourceFile:
  18        File to read. Must be UTF-8 encoded.
  19    Mode (case insensitive):
  20        A
  21            Primary order: alphabetical, ascending, case insensitive.
  22            Output: original letter case.
  23        AI
  24            First, convert all words to lowercase.
  25            Primary order: alphabetical, ascending, case insensitive.
  26            Output: in lowercase.
  27        F
  28            Primary order: frequency, descending.
  29            Secondary order: alphabetical, ascending, case insensitive.
  30            Output: in original letter case, preceded by frequency.
  31        FI
  32            First, convert all words to lowercase.
  33            Primary order: frequency, descending.
  34            Secondary order: alphabetical, ascending.
  35            Output: in lowercase, preceded by frequency.
  36    TargetFile:
  37        File to write, or overwrite if it already exists. Must be different
  38        from Source. Will be UTF-8 encoded and have Unix newlines.\
  39"""
  40
  41def read_word_from_file(hnd, caseInsensitive):
  42    """
  43    Yield words in a text file one by one.
  44    """
  45
  46    hnd.seek(0)
  47
  48    # start position of current word (None = not inside word)
  49    wordStartPos = None
  50
  51    for line in hnd:
  52        if caseInsensitive:
  53            line = line.lower()
  54
  55        for (pos, char) in enumerate(line):
  56            if unicodedata.category(char).startswith("L"):
  57                # character is a letter; start new word if not inside one
  58                if wordStartPos is None:
  59                    wordStartPos = pos
  60            elif wordStartPos is not None:
  61                # character isn't a letter and we're inside a word; end it
  62                yield line[wordStartPos:pos]
  63                wordStartPos = None
  64
  65        # end current word if line ended with a letter
  66        if wordStartPos is not None:
  67            yield line[wordStartPos:]
  68            wordStartPos = None
  69
  70def main():
  71    # exit if wrong number of args
  72    if len(sys.argv) != 4:
  73        exit(HELP_TEXT)
  74
  75    # read args; exit if invalid mode
  76    (source, mode, target) = sys.argv[1:]
  77    mode = mode.upper()
  78    if mode not in ("A", "AI", "F", "FI"):
  79        exit("Invalid mode. Run without args to see help.")
  80
  81    # source and target files must not be the same (ignore errors for now)
  82    try:
  83        if os.path.samefile(source, target):
  84            exit("Target file must be different from source file.")
  85    except OSError:
  86        pass
  87
  88    alphabeticalOrder = mode in ("A", "AI")
  89    caseInsensitive = mode in ("AI", "FI")
  90
  91    print("Reading source file...")
  92    try:
  93        with open(source, "rt", encoding = "utf8") as hnd:
  94            if alphabeticalOrder:
  95                words = set()
  96                for word in read_word_from_file(hnd, caseInsensitive):
  97                    words.add(word)
  98            else:
  99                wordCounts = dict()
 100                for word in read_word_from_file(hnd, caseInsensitive):
 101                    wordCounts[word] = wordCounts.get(word, 0) + 1
 102    except UnicodeError:
 103        exit("Error: source file isn't a valid UTF-8 file.")
 104    except FileNotFoundError:
 105        exit("Error: source file not found.")
 106    except PermissionError:
 107        exit("Error: source file permission denied.")
 108    except OSError:
 109        exit("Error: couldn't read source file.")
 110
 111    print("Sorting words...")
 112    if alphabeticalOrder:
 113        words = sorted(words)
 114        if not caseInsensitive:
 115            words.sort(key = lambda word: word.lower())
 116    else:
 117        words = sorted(wordCounts)
 118        if not caseInsensitive:
 119            words.sort(key = lambda word: word.lower())
 120        words.sort(key = lambda word: wordCounts[word], reverse = True)
 121
 122    print("Writing target file...")
 123    try:
 124        with open(target, "wt", encoding = "utf8", newline = "\n") as hnd:
 125            hnd.seek(0)
 126
 127            if alphabeticalOrder:
 128                for word in words:
 129                    print(word, file = hnd)
 130            else:
 131                for word in words:
 132                    print(wordCounts[word], word, file = hnd)
 133    except FileNotFoundError:
 134        exit("Error: target path not found.")
 135    except PermissionError:
 136        exit("Error: target file/path permission denied.")
 137    except OSError:
 138        exit("Error: couldn't write target file.")
 139
 140    # print summary
 141    if alphabeticalOrder:
 142        print("Unique words:", len(words))
 143    else:
 144        print("Unique words:", len(wordCounts))
 145        print("Total words:", sum(wordCounts.values()))
 146
 147if __name__ == "__main__":
 148    main()