1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
:
eval 'exec perl -wS $0 ${1+"$@"}'
if 0;
# Version: MPL 1.1 / GPLv3+ / LGPLv3+
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License or as specified alternatively below. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Initial Developer of the Original Code is
# Steven Butler <sebutler@gmail.com>
# Portions created by the Initial Developer are Copyright (C) 2011 the
# Initial Developer. All Rights Reserved.
#
# For minor contributions see the git repository.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 3 or later (the "GPLv3+"), or
# the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
# in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
# instead of those above.
use strict;
# Scan one MyThes-format thesaurus file for common structural problems.
#
# After the first line (the encoding, which is skipped) the file is expected
# to consist of header lines of the form "word|N", each followed by N entry
# lines that start with "(", "-" or "|".  The following problems are reported:
#   * a word whose number of entry lines differs from its declared N
#     (including the very last word in the file);
#   * a word that is defined more than once;
#   * any line that is neither a header nor an entry line.
#
# Parameters:
#   $input - path of the thesaurus .dat file to check.
#
# Returns 0 when no problem was found, 1 when the file could not be opened or
# at least one problem was reported.  All diagnostics are printed to STDOUT,
# one per line, prefixed with "$input:".
sub processFile($) {
    my ($input) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and the handle is closed on every
    # return path.
    my $fh;
    if (!open($fh, '<', $input)) {
        print "FAIL: $input (no input found)\n";
        return 1;
    }

    # top line of thesaurus provides encoding (we ignore it)
    my $encoding_line = <$fh>;
    my $line = 1;

    my $expectedEntries;        # entry count declared by the current word
    my $actualEntries = 0;      # entry lines seen so far for the current word
    my $word;                   # word whose entries are being counted
    my %words  = ();            # word => line number of its first definition
    my @errors = ();

    # Read into a lexical rather than $_ so that the caller's $_ (which may
    # be aliased to an element of the list it iterates over) is not clobbered.
    while (defined(my $text = <$fh>)) {
        $line++;
        $text =~ s/\n$//;
        $text =~ s/\r$//;
        $text =~ s/\s+$//;

        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
            # Header line: "word|count".
            my $tword = $1;
            my $texpectedEntries = $2;

            if (defined $expectedEntries) {
                # Check if the last word's actual entries matched the expected
                if ($actualEntries != $expectedEntries) {
                    push @errors, "$words{$word}: $word defined to have $expectedEntries but seems to have $actualEntries (next word ($tword) found on line $line\n";
                }
            }

            $word = $tword;
            $expectedEntries = $texpectedEntries;

            if (defined $words{$word}) {
                push @errors, "$line: $word previously defined on $words{$word}\n";
            }
            else {
                $words{$word} = $line;
            }
            $actualEntries = 0;
        }
        elsif ($text =~ m/^[\(\-\|]/) {
            # Entry line belonging to the current word.
            $actualEntries++;
        }
        else {
            push @errors, "$line: Unrecognised line format: $text\n";
            # Lines starting with a bare part-of-speech tag are malformed but
            # still count as entries, so the count check that follows does not
            # produce a second, misleading error for the same word.
            if ($text =~ m/^(interj|prep|conj)\|/) {
                $actualEntries++;
            }
        }
    }
    close($fh);

    # The final word has no following header line to trigger the count
    # check inside the loop, so verify it here.
    if (defined $expectedEntries && $actualEntries != $expectedEntries) {
        push @errors, "$words{$word}: $word defined to have $expectedEntries but seems to have $actualEntries (end of file reached on line $line)\n";
    }

    if (scalar(@errors)) {
        print $input, ':', join($input . ':', @errors);
        return 1;
    }
    return 0;
}
# Command-line entry point: every argument is the path of a thesaurus file
# to check.  With no arguments, print a usage message and exit with status 1.
if (scalar(@ARGV) == 0) {
    print "Usage: $0 <thesaurus .dat file>+\n";
    print "\tscans for some common issues found in mythes format thesaurus files\n";
    exit(1);
}

# processFile returns 1 for each file that failed, so this is the number of
# files with problems.
my $errors = 0;
foreach my $file (@ARGV) {
    $errors += processFile($file);
}

# On POSIX systems only the low 8 bits of the exit status reach the parent
# process, so a raw count of 256 failing files would be reported as success.
# Clamp the status so that any failure always yields a non-zero exit code.
exit($errors > 255 ? 255 : $errors);
|