summaryrefslogtreecommitdiff
path: root/util/th_check.pl
blob: 04acc3c4e9f01523f5838a983053eb4c3d63245e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
:
# Shell bootstrap: if this file is handed to a shell rather than to perl, the
# shell runs the exec below and re-launches the script under perl (-w turns
# warnings on, -S looks the script up via PATH).  Perl itself never executes
# the eval because of the trailing "if 0".
eval 'exec perl -wS $0 ${1+"$@"}'
    if 0;

# Version: MPL 1.1 / GPLv3+ / LGPLv3+
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License or as specified alternatively below. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Initial Developer of the Original Code is
#       Steven Butler <sebutler@gmail.com>
# Portions created by the Initial Developer are Copyright (C) 2011 the
# Initial Developer. All Rights Reserved.
#
# For minor contributions see the git repository.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 3 or later (the "GPLv3+"), or
# the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
# in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
# instead of those above.

use strict;

# processFile($input)
#
# Scan one MyThes-format thesaurus .dat file for structural problems and
# print every finding to the currently selected output handle, one per line,
# each prefixed with "<file>:<line>:".
#
# Checks performed:
#   * every "word|count" header is followed by exactly <count> entry lines,
#     including the last word in the file;
#   * no word is declared more than once;
#   * every other line looks like an entry (starts with "(", "-" or "|").
#
# Parameters:
#   $input - path of the thesaurus file to check.
#
# Returns 0 when the file is clean, 1 when it cannot be opened or when at
# least one problem was reported.  The caller's $_ is left untouched.
sub processFile($) {
    my ($input) = @_;

    # Three-argument open on a lexical handle: the name comes straight from
    # the command line and must never be interpreted as an open mode or pipe.
    my $fh;
    if (!open($fh, '<', $input)) {
        print "FAIL: $input (no input found)\n";
        return 1;
    }

    # top line of thesaurus provides encoding (we ignore it)
    my $encodingLine = <$fh>;
    my $line = 1;

    my $word;                # headword whose entries are being counted
    my $wordLine;            # line on which that headword was declared
    my $expectedEntries;     # count announced by the "word|count" header
    my $actualEntries = 0;   # entry lines seen since that header
    my %words = ();          # headword => line of its first declaration
    my @errors = ();

    while (my $text = <$fh>) {
        $line++;
        # Drops the line terminator (LF or CRLF) and any trailing blanks.
        $text =~ s/\s+$//;

        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
            my ($tword, $texpectedEntries) = ($1, $2);

            # A new header closes the previous word: verify its entry count.
            if (defined $expectedEntries && $actualEntries != $expectedEntries) {
                push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (next word ($tword) found on line $line)\n";
            }

            $word = $tword;
            $wordLine = $line;
            $expectedEntries = $texpectedEntries;
            if (defined $words{$word}) {
                push @errors, "$line: $word previously defined on $words{$word}\n";
            }
            else {
                $words{$word} = $line;
            }
            $actualEntries = 0;
        }
        elsif ($text =~ m/^[\(\-\|]/) {
            $actualEntries++;
        }
        else {
            push @errors, "$line: Unrecognised line format: $text\n";
            # Entries that merely lack the parentheses around the part of
            # speech are reported above but still counted, so one mistake
            # does not also produce a bogus count mismatch.
            if ($text =~ m/^(interj|prep|conj)\|/) {
                $actualEntries++;
            }
        }
    }
    close($fh);

    # The last word has no following header to trigger the check, so verify
    # its entry count here.
    if (defined $expectedEntries && $actualEntries != $expectedEntries) {
        push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (end of file reached after line $line)\n";
    }

    if (scalar(@errors)) {
        print $input, ':', join($input.':', @errors);
        return 1;
    }
    else {
        return 0;
    }
}

# Command-line driver: every argument is a thesaurus file to check.  With no
# arguments, print a short usage message and exit with status 1.
if (scalar(@ARGV) == 0) {
    print "Usage: $0 <thesaurus .dat file>+\n";
    print "\tscans for some common issues found in mythes format thesaurus files\n";
    exit(1);
}

# processFile returns 1 per failing file, so this ends up as the number of
# files that had problems.
my $errors = 0;
foreach my $file (@ARGV) {
    $errors += processFile($file);
}

# The exit status is the failure count, capped at 255: the operating system
# keeps only the low 8 bits, so an uncapped count of 256 would be reported
# to the caller as success.
exit($errors > 255 ? 255 : $errors);
s
+SFX D s e [^ghk]s
+SFX D s em s
+SFX D s ov s
+SFX D s s
+SFX D s m s
+SFX D s ech [^ighk]s
+SFX D s y [^i]s
+SFX D s ch is
+SFX D s ch ks
+SFX D s i is
SFX D os a os
SFX D os u os
SFX D os ovi os
@@ -292,19 +305,19 @@ SFX D us i [ei]us
SFX D 0 [^eou]s
SFX D 0 [^asy]
SFX D a a
+SFX D ky ek [aeiouy][lr]ky
+SFX D y 0 [^aeiouy][lr]ky
SFX D ky ek [dntv]ky
-SFX D y [^dntv]ky
-SFX D ny n any
-SFX D y [^kn]y
-SFX D y m [dntv]ky
-SFX D y m [^dntv]ky
-SFX D y m [^k]y
+SFX D y 0 [^dntvrl]ky
+SFX D ly el [dkpz]ly
+SFX D y 0 [^dkpz]ly
+SFX D y 0 [^kl]y
+SFX D y m y
SFX D y ch [gh]y
-SFX D y ch [dntv]ky
-SFX D ky cch [^dntv]ky
-SFX D y ech [^ghk]y
+SFX D y ch ky
+SFX D y ch ry
+SFX D y ech [^ghkr]y
SFX D y ch [sz]y
-SFX D y ami [dntv]ky
SFX H Y 24
SFX H 0 u [^ey]
@@ -389,14 +402,15 @@ SFX L 0 [^e]n
SFX L 0 m [^e]n
SFX L 0 ech [^e]n
-SFX S Y 60
-SFX S 0 e [^ecn]
-SFX S 0 i [^ecn]
-SFX S 0 em [^ecn]
-SFX S 0 [^ecn]
-SFX S 0 m [^ecn]
+SFX S Y 45
+SFX S 0 e [^ec]
+SFX S 0 i [^ec]
+SFX S 0 em [^ec]
+SFX S 0 [^ec]
+SFX S 0 m [^ec]
SFX S 0 ch [^ecnlsz]
-SFX S 0 ech [sz]
+SFX S 0 y n
+SFX S 0 ech [nsz]
SFX S 0 ech l
SFX S 0 ch [^]l
SFX S ec ce ec
@@ -427,22 +441,6 @@ SFX S 0 [^e]
SFX S 0 m [^e]
SFX S nm [^e]
SFX S nch [^e]
-SFX S en ne [^mn]en
-SFX S en nu [^mn]en
-SFX S en ni [^mn]en
-SFX S en nem [^mn]en
-SFX S en ny [^mn]en
-SFX S en n [^mn]en
-SFX S en nm [^mn]en
-SFX S en nech [^mn]en
-SFX S 0 e [mn]en
-SFX S 0 u [mn]en
-SFX S 0 i [mn]en
-SFX S 0 em [mn]en
-SFX S 0 y [mn]en
-SFX S 0 [mn]en
-SFX S 0 m [mn]en
-SFX S 0 ech [mn]en
SFX S 0 m
SFX S 0 ch
SFX S 0 mi
@@ -774,7 +772,7 @@ SFX Z dm
SFX Z dch
SFX Z dmi
-SFX C Y 106
+SFX C Y 109
SFX C ce c ice
SFX C e [^i]ce
SFX C e [ijl]e
@@ -788,6 +786,9 @@ SFX C m
SFX C ch
SFX C 0 mi
SFX C 0 m
+SFX C e e e
+SFX C e i e
+SFX C e e
SFX C o a o
SFX C o u o
SFX C o em o
@@ -936,7 +937,7 @@ SFX M m
SFX M m
SFX M ch
-SFX K Y 84
+SFX K Y 85
SFX K 0 te [e]
SFX K 0 ti [e]
SFX K 0 tem [e]
@@ -974,11 +975,11 @@ SFX K 0 tech a
SFX K 0 ty a
SFX K 0 i [^aes]
SFX K 0 [^aes]
-SFX K 0 em [^aesmcp]
-SFX K 0 ech [^aesmcp]
-SFX K 0 mi [^aesmc]
-SFX K 0 ch [p]
-SFX K 0 m [p]
+SFX K 0 em [^aesmcpv]
+SFX K 0 ech [^aesmcpv]
+SFX K 0 mi [^aesmcv]
+SFX K 0 ch [pv]
+SFX K 0 m [pv]
SFX K 0 ch oc
SFX K 0 m oc
SFX K 0 ech moc
@@ -999,6 +1000,7 @@ SFX K tm
SFX K tch
SFX K tmi [^]
SFX K 0 mi
+SFX K 0 mi v
SFX K es si es
SFX K es s es
SFX K es sm es
@@ -1378,7 +1380,7 @@ SFX J out ulo out
SFX J out uli out
SFX J out uly out
-SFX A Y 763
+SFX A Y 767
SFX A st tu st
SFX A st te st
SFX A st te st
@@ -1933,6 +1935,9 @@ SFX A nit ate nit
SFX A t 0 tit
SFX A it me tit
SFX A it te tit
+SFX A t 0 htit
+SFX A it me htit
+SFX A it te htit
SFX A tit [^i]stit
SFX A tit me [^i]stit
SFX A tit te [^i]stit
@@ -2116,6 +2121,7 @@ SFX A nout la [aeiouy]rnout
SFX A nout lo [aeiouy]rnout
SFX A nout li [aeiouy]rnout
SFX A nout ly [aeiouy]rnout
+SFX A nout l [aeiouyr][^aeiouyrl][^aeiouyrl]nout
SFX A nout l [aeiouyr][^aeiouyrl]nout
SFX A out ul [^aeiouy]l[^aeiouyrl]nout
SFX A nout l l[^aeiouyrl]nout
@@ -2259,7 +2265,7 @@ SFX T t yti t
SFX T t yty t
SFX T t ytu t
-SFX B Y 257
+SFX B Y 250
SFX B e e e
SFX B e u [^cj]e
SFX B e i ce
@@ -2328,10 +2334,10 @@ SFX B u n u
SFX B u nme u
SFX B u nte u
SFX B u i lu
-SFX B u i [^aeiouy][^aeiouy]u
-SFX B u me [^aeiouy][bdfmnptvw]u
+SFX B u i [^aeiouy][^aeiouyt]u
+SFX B u me [^aeiouy][bdfmnpvw]u
SFX B u eme [^aeiouy][cghjklrsz]u
-SFX B u te [^aeiouy][bdfmnptvw]u
+SFX B u te [^aeiouy][bdfmnpvw]u
SFX B u ete [^aeiouy][cghjklrsz]u
SFX B du [aeiouy]du
SFX B du me [aeiouy]du
@@ -2387,24 +2393,17 @@ SFX B t ila t
SFX B t ilo t
SFX B t ili t
SFX B t ily t
-SFX B t ji t
-SFX B t je t
-SFX B t je t
-SFX B t jeme t
-SFX B t jete t
-SFX B t j t
-SFX B t j t
-SFX B t jme t
-SFX B t jte t
-SFX B t ji et
-SFX B t je et
-SFX B t je et
-SFX B t jeme et
-SFX B t jete et
-SFX B t j et
-SFX B t j et
-SFX B t jme et
-SFX B t jte et
+SFX B t m t
+SFX B t t
+SFX B t t
+SFX B t me t
+SFX B t te t
+SFX B t 0 t
+SFX B t me t
+SFX B t te t
+SFX B t 0 t
+SFX B t c t
+SFX B t ce t
SFX B t yji t
SFX B t yje t
SFX B t yje t
diff --git a/cs_CZ/cs_CZ.dic b/cs_CZ/cs_CZ.dic
index 77c2431..bbb4e4a 100644
--- a/cs_CZ/cs_CZ.dic
+++ b/cs_CZ/cs_CZ.dic