From f83b25d29ff2ee17addec24aaebd15113475c360 Mon Sep 17 00:00:00 2001 From: Jan Holesovsky Date: Thu, 25 Feb 2016 14:35:03 +0100 Subject: dictionary-to-thesaurus.py: Move blacklist to a separate file. Change-Id: Ie05e0c0ce8b4f9541a5a143ddf9ccf960940a3b7 --- cs_CZ/thesaurus/blacklist.txt | 9 +++++++ cs_CZ/thesaurus/dictionary-to-thesaurus.py | 42 ++++++++++++++++++------------ 2 files changed, 35 insertions(+), 16 deletions(-) create mode 100644 cs_CZ/thesaurus/blacklist.txt (limited to 'cs_CZ') diff --git a/cs_CZ/thesaurus/blacklist.txt b/cs_CZ/thesaurus/blacklist.txt new file mode 100644 index 0000000..ab62ae5 --- /dev/null +++ b/cs_CZ/thesaurus/blacklist.txt @@ -0,0 +1,9 @@ +# Terms that are in the dictionary, but should be left out from thesaurus creation +# +# The words here are English Czech pairs, delimited by a TAB. When one of +# them is missing (is empty), it means "any". Empty lines are ignored + + ? + (by the way) + (po)štvat + 14. písmeno hebrejské abecedy diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py index d4974ed..8ee022c 100755 --- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py +++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py @@ -18,20 +18,13 @@ import os import re import sys -# add here the Czech words we want to leave out from the thesaurus generation -# (misbehaving, mistranslated, etc.) -ignore_words = [ - '?', - '(by the way)', - '(po)štvat', - '14. 
písmeno hebrejské abecedy', -] - def usage(): - message = """Usage: {program} slovnik_data_utf8.txt + message = """Usage: {program} slovnik_data_utf8.txt blacklist.txt - slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php""" - print(message.format(program = os.path.basename(sys.argv[0]))) + slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php + blacklist.txt: List of words that should be ignored when generating +""" + sys.stderr.write(message.format(program = os.path.basename(sys.argv[0]))) def classify(typ): if typ == '': @@ -47,7 +40,18 @@ def classify(typ): return '' -def parse(filename): +def parse(filename, blacklistname): + blacklist = {} + + with open(blacklistname, "r") as fp: + for line in fp: + if (line == ''): + continue + elif (line[0] == '#'): + continue + else: + blacklist[line.strip(' \n')] = 1 + synonyms = {} meanings = {} @@ -73,7 +77,13 @@ def parse(filename): if (word != '' and word[0] == '"' and word[len(word)-1] == '"'): word = word.strip('" ') - if (word == '' or word in ignore_words): + if (word == ''): + continue + + if (index + '\t' + word in blacklist or + index in blacklist or + index + '\t' in blacklist or + '\t' + word in blacklist): continue typ = '' @@ -143,11 +153,11 @@ def buildThesaurus(synonyms, meanings): print line def main(args): - if (len(args) != 2): + if (len(args) != 3): usage() sys.exit(1) - (synonyms, meanings) = parse(args[1]) + (synonyms, meanings) = parse(args[1], args[2]) print "UTF-8" buildThesaurus(synonyms, meanings) -- cgit