From cda4ee0c50d2a4d3b471e63fe115b1a90b285f89 Mon Sep 17 00:00:00 2001 From: Jeroen Nijhof Date: Thu, 1 May 2014 19:32:17 +0200 Subject: Speed up find-german-comments: speed up text_cat -s follow-up commit to https://gerrit.libreoffice.org/#/c/9226/ . text_cat -s was documented as "Not very efficient yet, because language models are re-loaded after each line." So if we want to use text_cat -s for thousands of lines, better read the language models only once and store them. When tested on svl/source, the speed-up was a further factor 1.5 (reduced from 6 s to 4 s). Change-Id: I654a250b0e369e01c5eac5970b64df1390f0ef35 Reviewed-on: https://gerrit.libreoffice.org/9227 Reviewed-by: Michael Meeks Tested-by: Michael Meeks --- bin/text_cat/text_cat | 81 +++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/bin/text_cat/text_cat b/bin/text_cat/text_cat index c907f50b96bb..74dae861d802 100755 --- a/bin/text_cat/text_cat +++ b/bin/text_cat/text_cat @@ -8,6 +8,8 @@ use Getopt::Std; use Benchmark; my $non_word_characters='0-9\s'; +my @languages; # languages (sorted by name) +my %ngram_for; # map language x ngram => rang # OPTIONS getopts('a:d:f:hi:lnst:u:v'); @@ -94,55 +96,64 @@ if ($opt_n) { classify(input()); } -# CLASSIFICATION -sub classify { - my ($input)=@_; - my %results=(); - my $maxp = $opt_t; +sub read_model { + my ($file) = @_; + open(LM,"$file") or die "cannot open $file: $!\n"; + my %ngram; + my $rang = 1; + while () { + chomp; + # only use lines starting with appropriate character. Others are + # ignored. 
+ if (/^[^$non_word_characters]+/o) { + $ngram{$&} = $rang++; + } + } + return \%ngram; +} + +sub read_models { # open directory to find which languages are supported opendir DIR, "$opt_d" or die "directory $opt_d: $!\n"; - my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR)); + @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR)); closedir DIR; @languages or die "sorry, can't read any language models from $opt_d\n" . "language models must reside in files with .lm ending\n"; + foreach my $language (@languages) { + $ngram_for{$language} = read_model("$opt_d/$language.lm"); + } +} + +# CLASSIFICATION +sub classify { + my ($input)=@_; + my %results=(); + my $maxp = $opt_t; + read_models() if !@languages; # create ngrams for input. Note that hash %unknown is not used; # it contains the actual counts which are only used under -n: creating # new language model (and even then they are not really required). my @unknown=create_lm($input); - # load model and count for each language. - my $language; + my $t1 = new Benchmark; - foreach $language (@languages) { - # loads the language model into hash %$language. - my %ngram=(); - my $rang=1; - open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n"; - while () { - chomp; - # only use lines starting with appropriate character. Others are - # ignored. 
- if (/^[^$non_word_characters]+/o) { - $ngram{$&} = $rang++; - } - } - close(LM); - #print STDERR "loaded language model $language\n" if $opt_v; - - # compares the language model with input ngrams list - my ($i,$p)=(0,0); - while ($i < @unknown) { - if ($ngram{$unknown[$i]}) { - $p=$p+abs($ngram{$unknown[$i]}-$i); - } else { - $p=$p+$maxp; + foreach my $language (@languages) { + # compares the language model with input ngrams list + my $ngram = $ngram_for{$language} or die "no ngrams for $language"; + + my ($i,$p)=(0,0); + while ($i < @unknown) { + if ($ngram->{$unknown[$i]}) { + $p=$p+abs($ngram->{$unknown[$i]}-$i); + } else { + $p=$p+$maxp; + } + ++$i; } - ++$i; - } - #print STDERR "$language: $p\n" if $opt_v; + #print STDERR "$language: $p\n" if $opt_v; - $results{$language} = $p; + $results{$language} = $p; } print STDERR "read language models done (" . timestr(timediff(new Benchmark, $t1)) . -- cgit