summary | refs | log | tree | commit | diff
path: root/cs_CZ
diff options
context:
space:
mode:
author Jan Holesovsky <kendy@collabora.com> 2016-02-25 14:35:03 +0100
committer Jan Holesovsky <kendy@collabora.com> 2016-02-25 14:35:03 +0100
commit f83b25d29ff2ee17addec24aaebd15113475c360 (patch)
tree c94d80f3fcae9af13e1540699e28325234be3244 /cs_CZ
parent bd5a09adea33acfe9164f26a3061ae1152933438 (diff)
dictionary-to-thesaurus.py: Move blacklist to a separate file.
Change-Id: Ie05e0c0ce8b4f9541a5a143ddf9ccf960940a3b7
Diffstat (limited to 'cs_CZ')
-rw-r--r-- cs_CZ/thesaurus/blacklist.txt 9
-rwxr-xr-x cs_CZ/thesaurus/dictionary-to-thesaurus.py 42
2 files changed, 35 insertions, 16 deletions
diff --git a/cs_CZ/thesaurus/blacklist.txt b/cs_CZ/thesaurus/blacklist.txt
new file mode 100644
index 0000000..ab62ae5
--- /dev/null
+++ b/cs_CZ/thesaurus/blacklist.txt
@@ -0,0 +1,9 @@
+# Terms that are in the dictionary, but should be left out from thesaurus creation
+#
+# The words here are English Czech pairs, delimited by a TAB. When one of
+# them is missing (is empty), it means "any". Empty lines are ignored
+
+ ?
+ (by the way)
+ (po)štvat
+ 14. písmeno hebrejské abecedy
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index d4974ed..8ee022c 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -18,20 +18,13 @@ import os
import re
import sys
-# add here the Czech words we want to leave out from the thesaurus generation
-# (misbehaving, mistranslated, etc.)
-ignore_words = [
- '?',
- '(by the way)',
- '(po)štvat',
- '14. písmeno hebrejské abecedy',
-]
-
def usage():
- message = """Usage: {program} slovnik_data_utf8.txt
+ message = """Usage: {program} slovnik_data_utf8.txt backlist.txt
- slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
- print(message.format(program = os.path.basename(sys.argv[0])))
+ slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
+ blacklist.txt: List of words that should be ignored when generating
+"""
+ sys.stderr.write(message.format(program = os.path.basename(sys.argv[0])))
def classify(typ):
if typ == '':
@@ -47,7 +40,18 @@ def classify(typ):
return ''
-def parse(filename):
+def parse(filename, blacklistname):
+ blacklist = {}
+
+ with open(blacklistname, "r") as fp:
+ for line in fp:
+ if (line == ''):
+ continue
+ elif (line[0] == '#'):
+ continue
+ else:
+ blacklist[line.strip(' \n')] = 1
+
synonyms = {}
meanings = {}
@@ -73,7 +77,13 @@ def parse(filename):
if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
word = word.strip('" ')
- if (word == '' or word in ignore_words):
+ if (word == ''):
+ continue
+
+ if (index + '\t' + word in blacklist or
+ index in blacklist or
+ index + '\t' in blacklist or
+ '\t' + word in blacklist):
continue
typ = ''
@@ -143,11 +153,11 @@ def buildThesaurus(synonyms, meanings):
print line
def main(args):
- if (len(args) != 2):
+ if (len(args) != 3):
usage()
sys.exit(1)
- (synonyms, meanings) = parse(args[1])
+ (synonyms, meanings) = parse(args[1], args[2])
print "UTF-8"
buildThesaurus(synonyms, meanings)