summaryrefslogtreecommitdiff
path: root/cs_CZ/thesaurus/dictionary-to-thesaurus.py
blob: ac4fe67dae2bee2776ac4b9a7e29fa9a522a0d4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python
# coding=utf-8
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This utility translates a normal dictionary (in this case English/Czech)
# into a thesaurus for one of the languages (in this case Czech).
#
# Based on idea of Zdenek Zabokrtsky <zabokrtsky@ufal.mff.cuni.cz>, big
# thanks! :-)

import os
import re
import sys


def usage():
    message = """Usage: {program} en-cs.txt blacklist.txt

  en-cs.txt:     Dictionary data from https://www.svobodneslovniky.cz/
  blacklist.txt: List of words that should be ignored when generating
"""
    print(
        message.format(program=os.path.basename(sys.argv[0])),
        file=sys.stderr)


def classify(typ):
    if typ == '':
        return ''
    elif typ == 'adj':
        return '(příd. jm.)'
    elif typ == 'adv':
        return '(přísl.)'
    elif typ == 'n':
        return '(podst. jm.)'
    elif typ == 'v':
        return '(slov.)'

    return ''


def parse(filename, blacklistname):
    blacklist = {}

    with open(blacklistname, "r") as fp:
        for line in fp:
            if (line == ''):
                continue
            elif (line[0] == '#'):
                continue
            else:
                blacklist[line.strip(' \n')] = 1

    synonyms = {}
    meanings = {}
    classification = {}

    match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])')
    match_cleanup = re.compile('(\[.*\]|\*|:.*)')

    with open(filename, "r") as fp:
        for line in fp:
            if (line == ''):
                continue
            elif (line[0] == '#'):
                continue
            else:
                terms = line.split('\t')
                if (terms[0] == '' or len(terms) < 2):
                    continue

                index = terms[0].strip()
                if (index == ''):
                    continue

                word = terms[1].strip()
                if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
                    word = word.strip('" ')

                if (word == ''):
                    continue

                if (index + '\t' + word in blacklist or
                        index in blacklist or
                        index + '\t' in blacklist or
                        '\t' + word in blacklist):
                    continue

                typ = ''
                if (len(terms) >= 2):
                    typ = terms[2]

                    # ignore non-translations
                    if match_ignore.search(typ) is not None:
                        continue

                    typ = match_cleanup.sub('', typ)
                    typ = typ.strip()

                typ = classify(typ)

                if index in synonyms:
                    synonyms[index].append((word, typ))
                else:
                    synonyms[index] = [(word, typ)]

                if word in meanings:
                    meanings[word].append(index)
                else:
                    meanings[word] = [index]

                if typ != '':
                    if word in classification:
                        if typ not in classification[word]:
                            classification[word].append(typ)
                    else:
                        classification[word] = [typ]

    return (synonyms, meanings, classification)


def buildThesaurus(synonyms, meanings, classification):
    # for every word:
    #   find all the indexes, and then again map the indexes to words - these are the synonyms
    for word in sorted(meanings.keys()):
        # we assume that various indexes (english words here) are various
        # meanings; not generally true, but...
        indexes = meanings[word]

        # only limit the words if the type is unambiguous
        typ = ''
        if word in classification and len(classification[word]) == 1:
            typ = classification[word][0]

        # we want to output each word just once
        used_this_round = [word]

        output_lines = []
        for index in indexes:
            syns = synonyms[index]

            # collect types first
            types = []
            for (w, t) in syns:
                if t not in types:
                    types.append(t)

            # build the various thesaurus lines
            line = {}
            for syn in syns:
                (w, t) = syn

                if typ != '' and t != '' and typ != t:
                    continue

                if w not in used_this_round:
                    if t in line:
                        line[t] += '|' + w
                    else:
                        line[t] = '|' + w
                    used_this_round.append(w)

            if len(line) != 0:
                for t in types:
                    if t in line:
                        output_lines.append((t, line[t]))

        if len(output_lines) > 0:
            print word + '|' + str(len(output_lines))

            # those with existing classification are probably a better fit,
            # put them to the front (even if we don't output the
            # classification in the end)
            for i in [0, 1]:
                for (t, line) in output_lines:
                    # first pass only non-empty, 2nd pass only empty
                    if (i == 0 and t != '') or (i == 1 and t == ''):
                        if typ == '':
                            print t + line
                        else:
                            print line


def main(args):
    if (len(args) != 3):
        usage()
        sys.exit(1)

    (synonyms, meanings, classification) = parse(args[1], args[2])

    print "UTF-8"
    buildThesaurus(synonyms, meanings, classification)


if __name__ == "__main__":
    main(sys.argv)

# vim:set shiftwidth=4 softtabstop=4 expandtab: