dictionary-to-thesaurus.py: Only output the same class of word.

When the class of the word is unambiguous, limit the output to that class only - this gives more precise & expected results. [It is interesting to see the other possibilities too, but I guess fewer but more focused choices are preferred.] Change-Id: I2876fbb4fa02c00fc7e65189812365f77b9a5ed6
author: Jan Holesovsky <kendy@collabora.com> 2016-02-26 08:38:58 +0100
committer: Jan Holesovsky <kendy@collabora.com> 2016-02-26 08:46:32 +0100
commit: c32de9bba681bcc7becc64f0ea3b605ff2b38266 (patch)
tree: 1afe71589e1e970bc4c5f15ec27de7a5db32c6b5 /cs_CZ
parent: 8442e91f9d62d25d478f891d940a56551b2fd484 (diff)
1 files changed, 27 insertions, 5 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index 8ee022c..63f906a 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -54,6 +54,7 @@ def parse(filename, blacklistname):
 
     synonyms = {}
     meanings = {}
+    classification = {}
 
     match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])')
     match_cleanup = re.compile('(\[.*\]|\*|:.*)')
@@ -109,9 +110,16 @@ def parse(filename, blacklistname):
                 else:
                     meanings[word] = [ index ]
 
-    return (synonyms, meanings)
+                if typ != '':
+                    if word in classification:
+                        if not typ in classification[word]:
+                            classification[word].append(typ)
+                    else:
+                        classification[word] = [ typ ]
+
+    return (synonyms, meanings, classification)
 
-def buildThesaurus(synonyms, meanings):
+def buildThesaurus(synonyms, meanings, classification):
     # for every word:
     #   find all the indexes, and then again map the indexes to words - these are the synonyms
     for word in sorted(meanings.keys()):
@@ -119,6 +127,11 @@ def buildThesaurus(synonyms, meanings):
         # meanings; not generally true, but...
         indexes = meanings[word]
 
+        # only limit the words if the type is unambiguous
+        typ = ''
+        if word in classification and len(classification[word]) == 1:
+            typ = classification[word][0]
+
         # we want to output each word just once
         used_this_round = [ word ]
 
@@ -132,9 +145,14 @@ def buildThesaurus(synonyms, meanings):
                 if not t in types:
                     types.append(t)
 
+            # build the various thesaurus lines
             line = {}
             for syn in syns:
                 (w, t) = syn
+
+                if typ != '' and t != '' and typ != t:
+                    continue
+
                 if not w in used_this_round:
                     if t in line:
                         line[t] += '|' + w
@@ -145,7 +163,11 @@ def buildThesaurus(synonyms, meanings):
             if len(line) != 0:
                 for t in types:
                     if t in line:
-                        output_lines.append(t + line[t])
+                        if typ == '':
+                            # classification is ambiguous, output the type too
+                            output_lines.append(t + line[t])
+                        else:
+                            output_lines.append(line[t])
 
         if len(output_lines) > 0:
             print word + '|' + str(len(output_lines))
@@ -157,10 +179,10 @@ def main(args):
         usage()
         sys.exit(1)
 
-    (synonyms, meanings) = parse(args[1], args[2])
+    (synonyms, meanings, classification) = parse(args[1], args[2])
 
     print "UTF-8"
-    buildThesaurus(synonyms, meanings)
+    buildThesaurus(synonyms, meanings, classification)
 
 if __name__ == "__main__":
     main(sys.argv)
author	Jan Holesovsky <kendy@collabora.com>	2016-02-26 08:38:58 +0100
committer	Jan Holesovsky <kendy@collabora.com>	2016-02-26 08:46:32 +0100
commit	c32de9bba681bcc7becc64f0ea3b605ff2b38266 (patch)
tree	1afe71589e1e970bc4c5f15ec27de7a5db32c6b5 /cs_CZ
parent	8442e91f9d62d25d478f891d940a56551b2fd484 (diff)