summaryrefslogtreecommitdiff
path: root/cs_CZ/thesaurus/dictionary-to-thesaurus.py
diff options
context:
space:
mode:
Diffstat (limited to 'cs_CZ/thesaurus/dictionary-to-thesaurus.py')
-rwxr-xr-xcs_CZ/thesaurus/dictionary-to-thesaurus.py104
1 files changed, 104 insertions, 0 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
new file mode 100755
index 0000000..ec20c25
--- /dev/null
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+#
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#
+
+# This utility translates a normal dictionary (in this case English/Czech)
+# into a thesaurus for one of the languages (in this case Czech).
+#
+# Based on idea of Zdenek Zabokrtsky <zabokrtsky@ufal.mff.cuni.cz>, big
+# thanks! :-)
+
+import os
+import sys
+
+def usage():
+ message = """Usage: {program} slovnik_data_utf8.txt
+
+ slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
+ print(message.format(program = os.path.basename(sys.argv[0])))
+
+def parse(filename):
+ synonyms = {}
+ meanings = {}
+
+ with open(filename, "r") as fp:
+ for line in fp:
+ if (line == ''):
+ continue
+ elif (line[0] == '#'):
+ continue
+ else:
+ terms = line.split('\t')
+ if (terms[0] == '' or len(terms) < 2):
+ continue
+
+ index = terms[0].strip()
+ if (index == ''):
+ continue
+
+ word = terms[1].strip()
+ if (word == ''):
+ continue
+ #type = terms[2] TODO for now the type (n:, adj:, ...) is ignored
+
+ if index in synonyms:
+ synonyms[index].append(word)
+ else:
+ synonyms[index] = [ word ]
+
+ if word in meanings:
+ meanings[word].append(index)
+ else:
+ meanings[word] = [ index ]
+
+ return (synonyms, meanings)
+
+def buildThesaurus(synonyms, meanings):
+ # for every word:
+ # find all the indexes, and then again map the indexes to words - these are the synonyms
+ for word in sorted(meanings.keys()):
+ # we assume that various indexes (english words here) are various
+ # meanings; not generally true, but...
+ indexes = meanings[word]
+
+ # we want to output each word just once
+ used_this_round = [ word ]
+
+ output_lines = []
+ for index in indexes:
+ syns = synonyms[index]
+
+ line = ''
+ for syn in syns:
+ if not syn in used_this_round:
+ line += '|' + syn
+ used_this_round.append(syn)
+
+ if line != '':
+ # TODO prepend the line with '(adj)' or '(noun)' or so; see 'type' above
+ output_lines.append(line)
+
+ if len(output_lines) > 0:
+ print word + '|' + str(len(output_lines))
+ for line in output_lines:
+ print line
+
+def main(args):
+ if (len(args) != 2):
+ usage()
+ sys.exit(1)
+
+ (synonyms, meanings) = parse(args[1])
+
+ print "UTF-8"
+ buildThesaurus(synonyms, meanings)
+
+if __name__ == "__main__":
+ main(sys.argv)
+
+# vim:set shiftwidth=4 softtabstop=4 expandtab: