diff options
author | Norbert Thiebaud <nthiebaud@gmail.com> | 2012-09-01 09:50:26 -0500 |
---|---|---|
committer | Norbert Thiebaud <nthiebaud@gmail.com> | 2012-10-16 11:09:27 -0500 |
commit | a4473e06b56bfe35187e302754f6baaa8d75e54f (patch) | |
tree | fd17c2dc5dbf56469de2eaa851eda4087f385313 /util/th_check.pl | |
parent | 0493c1b142b0c498931e8ff5d6460ef852026d20 (diff) |
move dictionaries structure one directory up
Change-Id: I70388bf6b95d8692cc6f25fc5a9c7baf3a675710
Diffstat (limited to 'util/th_check.pl')
-rwxr-xr-x | util/th_check.pl | 105 |
1 files changed, 105 insertions, 0 deletions
diff --git a/util/th_check.pl b/util/th_check.pl new file mode 100755 index 0000000..04acc3c --- /dev/null +++ b/util/th_check.pl @@ -0,0 +1,105 @@ +: +eval 'exec perl -wS $0 ${1+"$@"}' + if 0; + +# Version: MPL 1.1 / GPLv3+ / LGPLv3+ +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License or as specified alternatively below. You may obtain a copy of +# the License at http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Initial Developer of the Original Code is +# Steven Butler <sebutler@gmail.com> +# Portions created by the Initial Developer are Copyright (C) 2011 the +# Initial Developer. All Rights Reserved. +# +# For minor contributions see the git repository. +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 3 or later (the "GPLv3+"), or +# the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"), +# in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable +# instead of those above. 
use strict;
use warnings;

# processFile($input)
#
# Scans one mythes-format thesaurus file for common structural problems.
#
# Layout expected by the checks below:
#   line 1       encoding name (read and ignored)
#   word|N       a word header announcing N entry lines
#   (pos)|a|b    an entry line; must start with '(', '-' or '|'
#
# Reported problems:
#   * a word whose number of entry lines differs from the announced count
#     (including the last word in the file),
#   * a word that is defined more than once,
#   * any line that is neither a word header nor an entry line.
#
# Every problem is printed to the selected output handle as
# "<file>:<line>: <message>".
#
# Returns 0 when the file is clean, 1 when it could not be opened or at
# least one problem was reported.
sub processFile {
    my ($input) = @_;

    # Three-argument open on a lexical handle: the file name comes straight
    # from the command line and must never be interpreted as a mode or pipe.
    my $fh;
    if (!open($fh, '<', $input)) {
        print "FAIL: $input (no input found)\n";
        return 1;
    }

    # top line of thesaurus provides encoding (we ignore it)
    my $encodingLine = <$fh>;
    my $line = 1;

    my $expectedEntries;        # count announced by the current word header
    my $actualEntries = 0;      # entry lines seen since that header
    my $word;                   # current word
    my $wordLine;               # line number of the current word header
    my %words  = ();            # word => line of its first definition
    my @errors = ();

    # A lexical line variable is used instead of $_ so that a caller which
    # iterates with an aliased $_ does not get its data overwritten.
    while (my $text = <$fh>) {
        $line++;
        $text =~ s/\s+\z//;     # drops "\n", "\r\n" and trailing blanks

        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
            my ($tword, $texpectedEntries) = ($1, $2);

            # Check if the last word's actual entries matched the expected
            if (defined $expectedEntries
                && $actualEntries != $expectedEntries) {
                push @errors,
                    "$wordLine: $word defined to have $expectedEntries"
                  . " but seems to have $actualEntries"
                  . " (next word ($tword) found on line $line)\n";
            }

            $word            = $tword;
            $wordLine        = $line;
            $expectedEntries = $texpectedEntries;
            $actualEntries   = 0;

            if (defined $words{$word}) {
                push @errors,
                    "$line: $word previously defined on $words{$word}\n";
            }
            else {
                $words{$word} = $line;
            }
        }
        elsif ($text =~ m/^[\(\-\|]/) {
            $actualEntries++;
        }
        else {
            push @errors, "$line: Unrecognised line format: $text\n";

            # Entries that merely lack the parentheses around the part of
            # speech are still counted, so one mistake is reported once.
            if ($text =~ m/^(?:interj|prep|conj)\|/) {
                $actualEntries++;
            }
        }
    }
    close($fh);

    # The final word has no following header to trigger the comparison, so
    # it is checked here.
    if (defined $expectedEntries && $actualEntries != $expectedEntries) {
        push @errors,
            "$wordLine: $word defined to have $expectedEntries"
          . " but seems to have $actualEntries"
          . " (end of file reached on line $line)\n";
    }

    if (@errors) {
        print $input, ':', join($input . ':', @errors);
        return 1;
    }
    return 0;
}

# main(@files)
#
# Command-line driver: checks every file in @files with processFile() and
# returns the process exit status, i.e. the number of failing files capped
# at 255 so that a large count cannot wrap around to 0 ("success").
# Prints a usage message and returns 1 when no file is given.
sub main {
    my (@files) = @_;

    if (!@files) {
        print "Usage: $0 <thesaurus .dat file>+\n";
        print "\tscans for some common issues found in mythes format thesaurus files\n";
        return 1;
    }

    my $errors = 0;
    for my $file (@files) {
        $errors += processFile($file);
    }
    return $errors > 255 ? 255 : $errors;
}

# Run only when executed as a script; loading the file with require/do
# leaves processFile() available without touching @ARGV or exiting.
exit(main(@ARGV)) unless caller;

1;