diff options
author | Jan Holesovsky <kendy@collabora.com> | 2016-02-25 10:57:34 +0100 |
---|---|---|
committer | Jan Holesovsky <kendy@collabora.com> | 2016-02-25 11:18:30 +0100 |
commit | 46febeac1b9d8e4edea5c9cbac327aa0e2c846ae (patch) | |
tree | 59053c0e37dd3535af97d3447ff1f6534115fb5e | |
parent | 74e081219ce46b85768838e01b32eb3837e8e2bc (diff) |
dictionary-to-thesaurus.py: Various cleanups.
* word classification (when available)
* word blacklist
* ignore some non-translations (e.g. irregular verbs)
* ignore vulgarisms (when marked), they only add confusion
-rwxr-xr-x | cs_CZ/thesaurus/dictionary-to-thesaurus.py | 78 |
1 files changed, 66 insertions, 12 deletions
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py index ec20c25..8a0ae8a 100755 --- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py +++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding=utf-8 # # This file is part of the LibreOffice project. # @@ -14,18 +15,45 @@ # thanks! :-) import os +import re import sys +# add here the Czech words we want to leave out from the thesaurus generation +# (misbehaving, mistranslated, etc.) +ignore_words = [ + '?', + '(by the way)', + '(po)štvat', + '14. písmeno hebrejské abecedy', +] + def usage(): message = """Usage: {program} slovnik_data_utf8.txt slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php""" print(message.format(program = os.path.basename(sys.argv[0]))) +def classify(typ): + if typ == '': + return '' + elif typ == 'adj': + return '(adj)' + elif typ == 'adv': + return '(adv)' + elif typ == 'n': + return '(noun)' + elif typ == 'v': + return '(verb)' + + return '' + def parse(filename): synonyms = {} meanings = {} + match_ignore = re.compile('(\[neprav\.\]|\[vulg\.\])') + match_cleanup = re.compile('(\[.*\]|\*|:.*)') + with open(filename, "r") as fp: for line in fp: if (line == ''): @@ -42,14 +70,29 @@ def parse(filename): continue word = terms[1].strip() - if (word == ''): + if (word != '' and word[0] == '"' and word[len(word)-1] == '"'): + word = word.strip('" ') + + if (word == '' or word in ignore_words): continue - #type = terms[2] TODO for now the type (n:, adj:, ...) 
is ignored + + typ = '' + if (len(terms) >= 2): + typ = terms[2] + + # ignore non-translations + if match_ignore.search(typ) != None: + continue + + typ = match_cleanup.sub('', typ) + typ = typ.strip() + + typ = classify(typ) if index in synonyms: - synonyms[index].append(word) + synonyms[index].append( (word, typ) ) else: - synonyms[index] = [ word ] + synonyms[index] = [ (word, typ) ] if word in meanings: meanings[word].append(index) @@ -73,15 +116,26 @@ def buildThesaurus(synonyms, meanings): for index in indexes: syns = synonyms[index] - line = '' - for syn in syns: - if not syn in used_this_round: - line += '|' + syn - used_this_round.append(syn) + # collect types first + types = [] + for (w, t) in syns: + if not t in types: + types.append(t) - if line != '': - # TODO prepend the line with '(adj)' or '(noun)' or so; see 'type' above - output_lines.append(line) + line = {} + for syn in syns: + (w, t) = syn + if not w in used_this_round: + if t in line: + line[t] += '|' + w + else: + line[t] = '|' + w + used_this_round.append(w) + + if len(line) != 0: + for t in types: + if t in line: + output_lines.append(t + line[t]) if len(output_lines) > 0: print word + '|' + str(len(output_lines)) |