p≡p engine
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

78 lines
2.3 KiB

  1. #! /usr/bin/env python3
  2. # This file is under GNU General Public License 3.0
  3. # see LICENSE.txt
  4. from argparse import ArgumentParser
  5. from fileinput import FileInput, hook_encoded
  6. import re, itertools, sys
  7. try:
  8. from math import log2
  9. except:
  10. from math import log
  11. def log2(x): return log(x) / log(2)
  12. word = re.compile(r"(\S*?)(/|\s.*|$)")
  13. unwanted = re.compile(r"(^\d|[^']*')")
  14. space = re.compile(r'^\s')
  15. p = ArgumentParser(description="create dictionary csv out of hunspell data")
  16. p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
  17. help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
  18. p.add_argument('--lang', '-l', type=str, default="en_US",
  19. help='use dictionary for language LANG (default: en_US)')
  20. p.add_argument('--encoding', '-e', type=str, default="utf-8",
  21. help='file encoding (default: utf-8)')
  22. p.add_argument('--full', '-f', action='store_true',
  23. help="full list - don't reduce to 65536 words")
  24. args = p.parse_args()
  25. try:
  26. from icu import UnicodeString, Locale
  27. except ImportError:
  28. print("warning: PyICU not installed, using fallback", file=sys.stderr)
  29. def upper(x):
  30. return x.upper();
  31. else:
  32. locale = Locale(args.lang)
  33. def upper(x):
  34. u = UnicodeString(x)
  35. return str(u.toUpper(locale))
  36. _all = (
  37. upper(word.match(line).group(1))
  38. for line in FileInput(
  39. args.hunspell + "/" + args.lang + ".dic",
  40. openhook=hook_encoded(args.encoding)
  41. )
  42. if not space.match(line)
  43. )
  44. _words = [w for w in _all if len(w) > 2 and not unwanted.match(w)]
  45. _words.sort()
  46. _words = [w for w, g in itertools.groupby(_words)]
  47. if not args.full:
  48. while len(_words) > 65536 * 2:
  49. _words = _words[::2]
  50. if len(_words) > 65536:
  51. if not args.full:
  52. _words = _words[:65536]
  53. elif len(_words) < 65536:
  54. sys.stderr.write(
  55. "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
  56. args.lang,
  57. log2(len(_words)),
  58. log2(len(_words))*5
  59. )
  60. )
  61. _words.extend(_words[:65536-len(_words)])
  62. if not args.full:
  63. assert len(_words) == 65536, "lenght is {}".format(len(_words))
  64. for i, w in enumerate(_words):
  65. print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))