From cda4ee0c50d2a4d3b471e63fe115b1a90b285f89 Mon Sep 17 00:00:00 2001
From: Jeroen Nijhof <jeroen@nijhof.co.uk>
Date: Thu, 1 May 2014 19:32:17 +0200
Subject: Speed up find-german-comments: speed up text_cat -s

follow-up commit to https://gerrit.libreoffice.org/#/c/9226/ .

text_cat -s was documented as "Not very efficient yet, because
language models are re-loaded after each line." So if we want
to use text_cat -s for thousands of lines, it is better to
read the language models only once and store them.

When tested on svl/source, the speed-up was a further factor 1.5
(reduced from 6 s to 4 s).

Change-Id: I654a250b0e369e01c5eac5970b64df1390f0ef35
Reviewed-on: https://gerrit.libreoffice.org/9227
Reviewed-by: Michael Meeks <michael.meeks@collabora.com>
Tested-by: Michael Meeks <michael.meeks@collabora.com>
---
 bin/text_cat/text_cat | 81 +++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 35 deletions(-)
diff --git a/bin/text_cat/text_cat b/bin/text_cat/text_cat
index c907f50b96bb..74dae861d802 100755
--- a/bin/text_cat/text_cat
+++ b/bin/text_cat/text_cat
@@ -8,6 +8,8 @@ use Getopt::Std;
 use Benchmark;
 
 my $non_word_characters='0-9\s';
+my @languages; # languages (sorted by name)
+my %ngram_for; # map language x ngram => rang
 
 # OPTIONS
 getopts('a:d:f:hi:lnst:u:v');
@@ -94,55 +96,64 @@ if ($opt_n) {
     classify(input()); 
 }
 
-# CLASSIFICATION
-sub classify {
-  my ($input)=@_;
-  my %results=();
-  my $maxp = $opt_t;
+sub read_model {
+    my ($file) = @_;
+    open(LM,"$file") or  die "cannot open $file: $!\n";
+    my %ngram;
+    my $rang = 1;
+    while (<LM>) {
+	chomp;
+	# only use lines starting with appropriate character. Others are
+	# ignored.
+	if (/^[^$non_word_characters]+/o) {
+	    $ngram{$&} = $rang++;
+	}
+    }
+    return \%ngram;
+}
+
+sub read_models {
   # open directory to find which languages are supported
   opendir DIR, "$opt_d" or die "directory $opt_d: $!\n";
-  my @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
+  @languages = sort(grep { s/\.lm// && -r "$opt_d/$_.lm" } readdir(DIR));
   closedir DIR;
   @languages or die "sorry, can't read any language models from $opt_d\n" .
     "language models must reside in files with .lm ending\n";
 
+  foreach my $language (@languages) {
+      $ngram_for{$language} = read_model("$opt_d/$language.lm");
+  }
+}
+
+# CLASSIFICATION
+sub classify {
+  my ($input)=@_;
+  my %results=();
+  my $maxp = $opt_t;
+  read_models() if  !@languages;
 
   # create ngrams for input. Note that hash %unknown is not used;
   # it contains the actual counts which are only used under -n: creating
   # new language model (and even then they are not really required).
   my @unknown=create_lm($input);
-  # load model and count for each language.
-  my $language;
+
   my $t1 = new Benchmark;
-  foreach $language (@languages) {
-    # loads the language model into hash %$language.
-    my %ngram=();
-    my $rang=1;
-    open(LM,"$opt_d/$language.lm") || die "cannot open $language.lm: $!\n";
-    while (<LM>) {
-      chomp;
-      # only use lines starting with appropriate character. Others are
-      # ignored.
-      if (/^[^$non_word_characters]+/o) {
-	$ngram{$&} = $rang++;
-      } 
-    }
-    close(LM);
-    #print STDERR "loaded language model $language\n" if $opt_v;
-    
-    # compares the language model with input ngrams list
-    my ($i,$p)=(0,0);
-    while ($i < @unknown) {
-      if ($ngram{$unknown[$i]}) {
-	$p=$p+abs($ngram{$unknown[$i]}-$i);
-      } else { 
-	$p=$p+$maxp; 
+  foreach my $language (@languages) {
+      # compares the language model with input ngrams list
+      my $ngram = $ngram_for{$language} or die "no ngrams for $language";
+
+      my ($i,$p)=(0,0);
+      while ($i < @unknown) {
+	  if ($ngram->{$unknown[$i]}) {
+	      $p=$p+abs($ngram->{$unknown[$i]}-$i);
+	  } else {
+	      $p=$p+$maxp;
+	  }
+	  ++$i;
       }
-      ++$i;
-    }
-    #print STDERR "$language: $p\n" if $opt_v;
+      #print STDERR "$language: $p\n" if $opt_v;
     
-    $results{$language} = $p;
+      $results{$language} = $p;
   }
   print STDERR "read language models done (" . 
     timestr(timediff(new Benchmark, $t1)) . 
-- 
cgit