dictionary-to-thesaurus.py: Various cleanups.

* word classification (when available)
* word blacklist
* ignore some non-translations (e.g. irregular verbs)
* ignore vulgarisms (when marked), they only add confusion
author: Jan Holesovsky <kendy@collabora.com> 2016-02-25 10:57:34 +0100
committer: Jan Holesovsky <kendy@collabora.com> 2016-02-25 11:18:30 +0100
commit: 46febeac1b9d8e4edea5c9cbac327aa0e2c846ae (patch)
tree: 59053c0e37dd3535af97d3447ff1f6534115fb5e
parent: 74e081219ce46b85768838e01b32eb3837e8e2bc (diff)
1 files changed, 66 insertions, 12 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index ec20c25..8a0ae8a 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 #
 # This file is part of the LibreOffice project.
 #
@@ -14,18 +15,45 @@
 # thanks! :-)
 
 import os
+import re
 import sys
 
+# add here the Czech words we want to leave out from the thesaurus generation
+# (misbehaving, mistranslated, etc.)
+ignore_words = [
+    '?',
+    '(by the way)',
+    '(po)štvat',
+    '14. písmeno hebrejské abecedy',
+]
+
 def usage():
     message = """Usage: {program} slovnik_data_utf8.txt
 
   slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
     print(message.format(program = os.path.basename(sys.argv[0])))
 
+def classify(typ):
+    if typ == '':
+        return ''
+    elif typ == 'adj':
+        return '(adj)'
+    elif typ == 'adv':
+        return '(adv)'
+    elif typ == 'n':
+        return '(noun)'
+    elif typ == 'v':
+        return '(verb)'
+
+    return ''
+
 def parse(filename):
     synonyms = {}
     meanings = {}
 
+    match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])')
+    match_cleanup = re.compile('(\[.*\]|\*|:.*)')
+
     with open(filename, "r") as fp:
         for line in fp:
             if (line == ''):
@@ -42,14 +70,29 @@ def parse(filename):
                     continue
 
                 word = terms[1].strip()
-                if (word == ''):
+                if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
+                    word = word.strip('" ')
+
+                if (word == '' or word in ignore_words):
                     continue
-                #type = terms[2] TODO for now the type (n:, adj:, ...) is ignored
+
+                typ = ''
+                if (len(terms) >= 2):
+                    typ = terms[2]
+
+                    # ignore non-translations
+                    if match_ignore.search(typ) != None:
+                        continue
+
+                    typ = match_cleanup.sub('', typ)
+                    typ = typ.strip()
+
+                typ = classify(typ)
 
                 if index in synonyms:
-                    synonyms[index].append(word)
+                    synonyms[index].append( (word, typ) )
                 else:
-                    synonyms[index] = [ word ]
+                    synonyms[index] = [ (word, typ) ]
 
                 if word in meanings:
                     meanings[word].append(index)
@@ -73,15 +116,26 @@ def buildThesaurus(synonyms, meanings):
         for index in indexes:
             syns = synonyms[index]
 
-            line = ''
-            for syn in syns:
-                if not syn in used_this_round:
-                    line += '|' + syn
-                    used_this_round.append(syn)
+            # collect types first
+            types = []
+            for (w, t) in syns:
+                if not t in types:
+                    types.append(t)
 
-            if line != '':
-                # TODO prepend the line with '(adj)' or '(noun)' or so; see 'type' above
-                output_lines.append(line)
+            line = {}
+            for syn in syns:
+                (w, t) = syn
+                if not w in used_this_round:
+                    if t in line:
+                        line[t] += '|' + w
+                    else:
+                        line[t] = '|' + w
+                    used_this_round.append(w)
+
+            if len(line) != 0:
+                for t in types:
+                    if t in line:
+                        output_lines.append(t + line[t])
 
         if len(output_lines) > 0:
             print word + '|' + str(len(output_lines))
author	Jan Holesovsky <kendy@collabora.com>	2016-02-25 10:57:34 +0100
committer	Jan Holesovsky <kendy@collabora.com>	2016-02-25 11:18:30 +0100
commit	46febeac1b9d8e4edea5c9cbac327aa0e2c846ae (patch)
tree	59053c0e37dd3535af97d3447ff1f6534115fb5e
parent	74e081219ce46b85768838e01b32eb3837e8e2bc (diff)