#!/usr/bin/env python
# coding=utf-8
#
# cs_CZ/thesaurus/dictionary-to-thesaurus.py
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This utility translates a normal dictionary (in this case English/Czech)
# into a thesaurus for one of the languages (in this case Czech).
#
# Based on an idea by Zdenek Zabokrtsky <zabokrtsky@ufal.mff.cuni.cz>, big
# thanks! :-)

import os
import re
import sys

def usage():
    """Write a short command-line usage summary to standard error.

    The program name shown in the summary is the basename of
    ``sys.argv[0]``.
    """
    message = """Usage: {program} slovnik_data_utf8.txt blacklist.txt

  slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
  blacklist.txt:         List of words that should be ignored when generating
                         the thesaurus
"""
    sys.stderr.write(message.format(program=os.path.basename(sys.argv[0])))

def classify(typ):
    """Translate a word-class abbreviation into its Czech thesaurus label.

    'adj', 'adv', 'n' and 'v' each map to a parenthesised Czech label.
    Any other value, including the empty string, yields ''.
    """
    labels = (
        ('adj', '(příd. jm.)'),
        ('adv', '(přísl.)'),
        ('n', '(podst. jm.)'),
        ('v', '(slov.)'),
    )

    for abbreviation, label in labels:
        if typ == abbreviation:
            return label

    # missing or unrecognised word class: no label
    return ''

def _read_blacklist(blacklistname):
    """Return the set of entries found in the blacklist file.

    Lines starting with '#' are comments.  Only spaces and the newline are
    stripped from each entry: tabs are kept because parse() matches entries
    of the forms 'index', 'index<TAB>word', 'index<TAB>' and '<TAB>word'.
    """
    entries = set()

    with open(blacklistname, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue

            entry = line.strip(' \n')
            if entry:
                entries.add(entry)

    return entries

def parse(filename, blacklistname):
    """Read the tab-separated dictionary and build the lookup tables.

    Each data line holds an index (first column; English words here), a
    word (second column) and optionally a third column with notes that
    include the word class.  Lines starting with '#', lines with fewer
    than two columns, lines with an empty index or word, blacklisted
    lines, and lines whose third column is marked '[neprav.]' or
    '[vulg.]' are skipped.

    filename:      path of the dictionary data file
    blacklistname: path of the blacklist file (see _read_blacklist)

    Returns a tuple (synonyms, meanings):
      synonyms: dict mapping index -> list of (word, label) tuples, where
                label is the result of classify() for the word class
      meanings: dict mapping word -> list of indexes it appeared under
    """
    blacklist = _read_blacklist(blacklistname)

    synonyms = {}
    meanings = {}

    match_ignore = re.compile(r'(\[neprav\.\]|\[vulg\.\])')
    match_cleanup = re.compile(r'(\[.*\]|\*|:.*)')

    with open(filename, "r") as fp:
        for line in fp:
            if line.startswith('#'):
                continue

            terms = line.split('\t')
            if len(terms) < 2:
                continue

            index = terms[0].strip()
            if not index:
                continue

            word = terms[1].strip()
            if word.startswith('"') and word.endswith('"'):
                word = word.strip('" ')

            if not word:
                continue

            if (index + '\t' + word in blacklist or
                    index in blacklist or
                    index + '\t' in blacklist or
                    '\t' + word in blacklist):
                continue

            typ = ''
            # the third column is optional; a two-column line has no
            # word class at all
            if len(terms) >= 3:
                typ = terms[2]

                # ignore non-translations
                if match_ignore.search(typ) is not None:
                    continue

                # drop bracketed notes, asterisks and anything after ':'
                typ = match_cleanup.sub('', typ).strip()

            typ = classify(typ)

            synonyms.setdefault(index, []).append((word, typ))
            meanings.setdefault(word, []).append(index)

    return (synonyms, meanings)

def buildThesaurus(synonyms, meanings):
    """Print the thesaurus entries for every word to standard output.

    synonyms: dict mapping index -> list of (word, label) tuples
    meanings: dict mapping word -> list of indexes

    Words are processed in sorted order.  For every word, each of its
    indexes is looked up in synonyms; the words found there are its
    synonyms.  A word with at least one output line is printed as
    'word|<number of lines>' followed by the lines, each of the form
    'label|syn1|syn2...'.  Words without any synonym print nothing.
    """
    for word in sorted(meanings):
        # we assume that various indexes (english words here) are various
        # meanings; not generally true, but...

        # we want to output each word just once (and never the word itself)
        seen = set()
        seen.add(word)

        output_lines = []
        for index in meanings[word]:
            # Labels are emitted in the order they first occur among ALL
            # synonyms of this index, including ones already used above.
            label_order = []
            grouped = {}
            for (candidate, label) in synonyms[index]:
                if label not in grouped:
                    grouped[label] = []
                    label_order.append(label)

                if candidate not in seen:
                    grouped[label].append(candidate)
                    seen.add(candidate)

            for label in label_order:
                if grouped[label]:
                    output_lines.append(label + '|' + '|'.join(grouped[label]))

        if output_lines:
            # single-argument print(...) behaves the same on Python 2 and 3
            print(word + '|' + str(len(output_lines)))
            for line in output_lines:
                print(line)

def main(args):
    """Convert the dictionary named on the command line into a thesaurus.

    args: the full argument vector (program name, dictionary file,
          blacklist file).  With any other number of arguments the usage
          summary is written and the process exits with status 1.

    The result is printed to standard output: a 'UTF-8' header line
    followed by the entries produced by buildThesaurus().
    """
    if len(args) != 3:
        usage()
        sys.exit(1)

    synonyms, meanings = parse(args[1], args[2])

    # single-argument print(...) behaves the same on Python 2 and 3
    print("UTF-8")
    buildThesaurus(synonyms, meanings)

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)

# vim:set shiftwidth=4 softtabstop=4 expandtab: