#! /usr/bin/env python3

# This file is under GNU General Public License 3.0
# see LICENSE.txt

"""Create a dictionary CSV out of hunspell data.

Reads ``<hunspell dir>/<lang>.dic``, upper-cases every entry, drops the
unsuitable ones, and prints one ``lang,index,word,0`` row per remaining
word on standard output.  Lists shorter than WORDLIST_SIZE are padded by
repeating words; with ``--cut`` longer lists are reduced to exactly
WORDLIST_SIZE words.
"""

from argparse import ArgumentParser
from fileinput import FileInput, hook_encoded
import itertools
import re
import sys

try:
    from math import log2
except ImportError:
    # math.log2 only exists from Python 3.3 on; derive it from log().
    from math import log

    def log2(x):
        """Return the base-2 logarithm of x."""
        return log(x) / log(2)


# Size of a full word list (65536 words = 16 bit per word).
WORDLIST_SIZE = 65536

# Group 1 is the entry text up to the first "/" or whitespace.
word = re.compile(r"(\S*?)(/|\s.*|$)")
# Used with .match(): entries starting with a digit or containing "'".
unwanted = re.compile(r"(^\d|[^']*')")
# Lines that begin with whitespace are skipped entirely.
space = re.compile(r'^\s')


def make_upper(lang):
    """Return a function that upper-cases a string for language *lang*.

    PyICU's locale-aware conversion is used when the package can be
    imported.  Otherwise a warning is written to stderr and the plain
    ``str.upper`` is returned.
    """
    try:
        from icu import UnicodeString, Locale
    except ImportError:
        print("warning: PyICU not installed, using fallback", file=sys.stderr)
        return str.upper

    locale = Locale(lang)

    def upper(x):
        return str(UnicodeString(x).toUpper(locale))

    return upper


def extract_words(lines, upper=str.upper):
    """Return the sorted, de-duplicated usable words found in *lines*.

    *lines* is an iterable of dictionary lines and *upper* the case
    conversion to apply.  Lines starting with whitespace are ignored; a
    word is kept when it is longer than two characters after conversion
    and does not match ``unwanted``.
    """
    candidates = (
        upper(word.match(line).group(1))
        for line in lines
        if not space.match(line)
    )
    return sorted(
        set(w for w in candidates if len(w) > 2 and not unwanted.match(w))
    )


def reduce_words(words, size=WORDLIST_SIZE):
    """Return at most *size* words taken from *words*.

    The list is thinned by keeping every other word while it holds more
    than twice *size* entries, so the survivors are spread over the whole
    sorted list; whatever still exceeds *size* is cut off at the end.
    """
    while len(words) > size * 2:
        words = words[::2]
    return words[:size]


def pad_words(words, size=WORDLIST_SIZE):
    """Return *words* repeated cyclically until it holds *size* entries.

    A list that already has *size* or more entries is returned as an
    unchanged copy.

    Raises:
        ValueError: if *words* is empty, because there is nothing to
            repeat.
    """
    if not words:
        raise ValueError("cannot pad an empty word list")
    if len(words) >= size:
        return list(words)
    # A single extend() with a slice can at most double the list, which
    # leaves small dictionaries short; cycling always reaches size.
    return list(itertools.islice(itertools.cycle(words), size))


def main(argv=None):
    """Run the command-line tool.

    *argv* is the list of command-line arguments; ``None`` makes argparse
    read ``sys.argv``.  The CSV rows go to stdout, warnings to stderr.
    """
    p = ArgumentParser(description="create dictionary csv out of hunspell data")
    p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
                   help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
    p.add_argument('--lang', '-l', type=str, default="en_US",
                   help='use dictionary for language LANG (default: en_US)')
    p.add_argument('--encoding', '-e', type=str, default="utf-8",
                   help='file encoding (default: utf-8)')
    p.add_argument('--cut', '-c', action='store_true',
                   help="cut list - reduce to 65536 words")
    args = p.parse_args(argv)

    upper = make_upper(args.lang)

    # The context manager closes the dictionary file once it is read.
    with FileInput(
        args.hunspell + "/" + args.lang + ".dic",
        openhook=hook_encoded(args.encoding)
    ) as lines:
        words = extract_words(lines, upper)

    if not words:
        # Without this check log2(0) below fails with an opaque
        # "math domain error".
        sys.exit("error for {}: no usable words in dictionary".format(args.lang))

    if args.cut:
        words = reduce_words(words)

    if len(words) < WORDLIST_SIZE:
        sys.stderr.write(
            "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
                args.lang,
                log2(len(words)),
                log2(len(words))*5
            )
        )
        words = pad_words(words)

    if args.cut:
        # Internal invariant: reducing and padding must leave a full list.
        assert len(words) == WORDLIST_SIZE, "lenght is {}".format(len(words))

    for i, w in enumerate(words):
        print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))


if __name__ == "__main__":
    main()