summary refs log tree commit diff
diff options
context:
space:
mode:
author Jan Holesovsky <kendy@collabora.com> 2016-02-25 10:57:34 +0100
committer Jan Holesovsky <kendy@collabora.com> 2016-02-25 11:18:30 +0100
commit 46febeac1b9d8e4edea5c9cbac327aa0e2c846ae (patch)
tree 59053c0e37dd3535af97d3447ff1f6534115fb5e
parent 74e081219ce46b85768838e01b32eb3837e8e2bc (diff)
dictionary-to-thesaurus.py: Various cleanups.
* word classification (when available) * word blacklist * ignore some non-translations (e.g. irregular verbs) * ignore vulgarisms (when marked), they only add confusion
-rwxr-xr-x cs_CZ/thesaurus/dictionary-to-thesaurus.py 78
1 files changed, 66 insertions, 12 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index ec20c25..8a0ae8a 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+# coding=utf-8
#
# This file is part of the LibreOffice project.
#
@@ -14,18 +15,45 @@
# thanks! :-)
import os
+import re
import sys
+# add here the Czech words we want to leave out from the thesaurus generation
+# (misbehaving, mistranslated, etc.)
+ignore_words = [
+ '?',
+ '(by the way)',
+ '(po)štvat',
+ '14. písmeno hebrejské abecedy',
+]
+
def usage():
message = """Usage: {program} slovnik_data_utf8.txt
slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
print(message.format(program = os.path.basename(sys.argv[0])))
+def classify(typ):
+ if typ == '':
+ return ''
+ elif typ == 'adj':
+ return '(adj)'
+ elif typ == 'adv':
+ return '(adv)'
+ elif typ == 'n':
+ return '(noun)'
+ elif typ == 'v':
+ return '(verb)'
+
+ return ''
+
def parse(filename):
synonyms = {}
meanings = {}
+ match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])')
+ match_cleanup = re.compile('(\[.*\]|\*|:.*)')
+
with open(filename, "r") as fp:
for line in fp:
if (line == ''):
@@ -42,14 +70,29 @@ def parse(filename):
continue
word = terms[1].strip()
- if (word == ''):
+ if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
+ word = word.strip('" ')
+
+ if (word == '' or word in ignore_words):
continue
- #type = terms[2] TODO for now the type (n:, adj:, ...) is ignored
+
+ typ = ''
+ if (len(terms) >= 2):
+ typ = terms[2]
+
+ # ignore non-translations
+ if match_ignore.search(typ) != None:
+ continue
+
+ typ = match_cleanup.sub('', typ)
+ typ = typ.strip()
+
+ typ = classify(typ)
if index in synonyms:
- synonyms[index].append(word)
+ synonyms[index].append( (word, typ) )
else:
- synonyms[index] = [ word ]
+ synonyms[index] = [ (word, typ) ]
if word in meanings:
meanings[word].append(index)
@@ -73,15 +116,26 @@ def buildThesaurus(synonyms, meanings):
for index in indexes:
syns = synonyms[index]
- line = ''
- for syn in syns:
- if not syn in used_this_round:
- line += '|' + syn
- used_this_round.append(syn)
+ # collect types first
+ types = []
+ for (w, t) in syns:
+ if not t in types:
+ types.append(t)
- if line != '':
- # TODO prepend the line with '(adj)' or '(noun)' or so; see 'type' above
- output_lines.append(line)
+ line = {}
+ for syn in syns:
+ (w, t) = syn
+ if not w in used_this_round:
+ if t in line:
+ line[t] += '|' + w
+ else:
+ line[t] = '|' + w
+ used_this_round.append(w)
+
+ if len(line) != 0:
+ for t in types:
+ if t in line:
+ output_lines.append(t + line[t])
if len(output_lines) > 0:
print word + '|' + str(len(output_lines))