summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKurt Zenker <kz@openoffice.org>2007-06-19 15:01:46 +0000
committerKurt Zenker <kz@openoffice.org>2007-06-19 15:01:46 +0000
commitcab1870e677f33a2501f15916110da46d3336d24 (patch)
tree127f79cbde6826d93d732989efb31b3b31616f49
parent4020743a66e0d439bc4bfcc9d870a1bbbc6719f5 (diff)
INTEGRATION: CWS languageguessing (1.1.2); FILE ADDED
2007/01/12 11:07:09 tl 1.1.2.1: #i73173# integrate Google SoC language-guessing
-rw-r--r--lingucomponent/source/languageguessing/simpleguesser.cxx246
1 files changed, 246 insertions, 0 deletions
diff --git a/lingucomponent/source/languageguessing/simpleguesser.cxx b/lingucomponent/source/languageguessing/simpleguesser.cxx
new file mode 100644
index 000000000000..f32f517d920a
--- /dev/null
+++ b/lingucomponent/source/languageguessing/simpleguesser.cxx
@@ -0,0 +1,246 @@
+/***************************************************************************
+ * Copyright (C) 2006 by Jocelyn Merand *
+ * joc.mer@gmail.com *
+ * *
+ *
+ * OpenOffice.org - a multi-platform office productivity suite
+ *
+ * $RCSfile: simpleguesser.cxx,v $
+ *
+ * $Revision: 1.2 $
+ *
+ * last change: $Author: kz $ $Date: 2007-06-19 16:01:46 $
+ *
+ * The Contents of this file are made available subject to
+ * the terms of GNU Lesser General Public License Version 2.1.
+ *
+ *
+ * GNU Lesser General Public License Version 2.1
+ * =============================================
+ * Copyright 2005 by Sun Microsystems, Inc.
+ * 901 San Antonio Road, Palo Alto, CA 94303, USA
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ *
+ ************************************************************************/
+
+ /**
+ *
+ *
+ *
+ *
+ * TODO
+ * - Add exception throwing when h == NULL
+ * - Not init h when implicit constructor is launched
+ */
+#include <string.h>
+#include <sstream>
+#include <iostream>
+
+#include <libtextcat/textcat.h>
+#include <libtextcat/common.h>
+#include <libtextcat/constants.h>
+#include <libtextcat/fingerprint.h>
+#include <libtextcat/utf8misc.h>
+
+#include <sal/types.h>
+
+#include "altstrfunc.hxx"
+#include "simpleguesser.hxx"
+
+#ifndef _UTF8_
+#define _UTF8_
+#endif
+
+
+using namespace std;
+
+
+/**
+ * The following 3 structures are taken from fingerprint.c and textcat.c
+ */
+
+/* One n-gram entry of a fingerprint (fp_t::fprint points to entries of this type).
+ * NOTE(review): this is a local copy of a structure from the textcat sources;
+ * presumably the layout has to stay identical to the library's own definition -
+ * confirm before touching any field. Not referenced by the code in this file
+ * other than through fp_t. */
+typedef struct ngram_t {
+
+ sint2 rank; /* presumably the n-gram's rank inside its fingerprint - TODO confirm */
+ char str[MAXNGRAMSIZE+1]; /* n-gram text; one byte more than MAXNGRAMSIZE, presumably for the terminating NUL */
+
+} ngram_t;
+
+/* A named table of n-grams (a "fingerprint").
+ * NOTE(review): local copy of a structure from the textcat sources; presumably
+ * the layout has to stay identical to the library's own definition - confirm
+ * before touching any field. */
+typedef struct fp_t {
+
+ const char *name; /* presumably the name reported by fp_Name() - TODO confirm */
+ ngram_t *fprint; /* pointer to the n-gram entries */
+ uint4 size; /* presumably the number of entries behind fprint - TODO confirm */
+
+} fp_t;
+
+/* Layout of the object behind the opaque textcat handle (SimpleGuesser::h is
+ * cast to this type in GetManagedLanguages() and XableLanguage()).
+ * NOTE(review): local copy of a structure from the textcat sources; presumably
+ * the layout has to stay identical to the library's own definition - confirm
+ * before touching any field. */
+typedef struct textcat_t{
+
+ void **fprint; /* fingerprint handles; element i is passed to fp_Name() */
+ char *fprint_disable; /* per-fingerprint flag byte, parallel to fprint; this file stores 0xF0 (enabled) or 0x0F (disabled) */
+ uint4 size; /* number of entries this file iterates over in fprint / fprint_disable */
+ uint4 maxsize; /* presumably the allocated capacity of the two arrays - TODO confirm */
+
+ char output[MAXOUTPUTSIZE]; /* presumably the buffer holding the classification result string - TODO confirm */
+
+} textcat_t;
+/** end of the 3 structs */
+
+/**
+ * Default constructor: starts without a textcat handle (h is NULL).
+ * A handle can be loaded later with SetDBPath().
+ */
+SimpleGuesser::SimpleGuesser()
+    : h(NULL)
+{
+}
+
+/**
+ * Creates a guesser whose textcat handle is obtained from
+ * special_textcat_Init(confFile, prefix). Whatever that call returns is
+ * stored in h as is.
+ */
+SimpleGuesser::SimpleGuesser(const char* confFile, const char* prefix)
+    : h(special_textcat_Init(confFile, prefix))
+{
+}
+
+/**
+ * Assignment: releases the textcat handle currently held by this object
+ * (if any) and takes over the handle stored in sg.
+ *
+ * Nothing is done when sg is this very object or already holds the same
+ * handle; otherwise the handle would be released by textcat_Done() and then
+ * kept as a dangling pointer.
+ *
+ * NOTE(review): the handle is copied, not duplicated, so after the assignment
+ * both objects refer to the same handle and each destructor will call
+ * textcat_Done() on it. The signature is fixed by the class declaration, so
+ * that is left unchanged here - verify how callers use the assigned-from object.
+ */
+void SimpleGuesser::operator=(SimpleGuesser& sg){
+    if(this == &sg || h == sg.h){return;}
+    if(h){textcat_Done(h);}
+    h = sg.h;
+}
+
+/**
+ * Destructor: hands the textcat handle back to textcat_Done() when one is held.
+ */
+SimpleGuesser::~SimpleGuesser()
+{
+    if(h != NULL)
+    {
+        textcat_Done(h);
+    }
+}
+
+
+/*!
+    \fn SimpleGuesser::GuessLanguage(char* text)
+
+    Classifies text with textcat_Classify() and returns one Guess per
+    GUESS_SEPARATOR_OPEN found in the result string, in the order in which
+    they appear (the first one gets index 0).
+
+    \param text NUL-terminated text to analyse; at most
+                MAX_STRING_LENGTH_TO_ANALYSE bytes of it are classified.
+    \return the guesses; empty when no handle is loaded, when text is NULL,
+            when the classifier reports the input as too short, or when the
+            result contains no GUESS_SEPARATOR_OPEN.
+ */
+vector<Guess> SimpleGuesser::GuessLanguage(char* text)
+{
+    vector<Guess> guesses;
+
+    if(!h || !text){return guesses;}
+
+    // textcat_Classify() takes the size of the buffer in bytes, so measure
+    // bytes here: a count of Unicode characters is smaller than the byte
+    // count for multi-byte UTF-8 text and would cut the input short.
+    size_t len = strlen(text);
+    const size_t maxLen = static_cast<size_t>(MAX_STRING_LENGTH_TO_ANALYSE);
+    if( len > maxLen ){len = maxLen;}
+
+    char *guess_list = textcat_Classify(h, text, len);
+
+    if(!guess_list || strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){
+        return guesses;
+    }
+
+    // Each guess starts at a GUESS_SEPARATOR_OPEN; the Guess constructor is
+    // handed a pointer to that separator together with the guess's position.
+    int rank = 0;
+    for(char *entry = strchr(guess_list, GUESS_SEPARATOR_OPEN);
+        entry != NULL;
+        entry = strchr(entry + 1, GUESS_SEPARATOR_OPEN))
+    {
+        Guess g(entry, rank);
+        guesses.push_back(g);
+        rank++;
+    }
+
+    return guesses;
+}
+
+/*!
+    \fn SimpleGuesser::GuessPrimaryLanguage(char* text)
+
+    Returns the first guess produced by GuessLanguage() for text, or a
+    default-constructed Guess when GuessLanguage() yields nothing.
+
+    \param text text to analyse; passed unchanged to GuessLanguage().
+ */
+Guess SimpleGuesser::GuessPrimaryLanguage(char* text)
+{
+    vector<Guess> ret = GuessLanguage(text);
+    if(ret.empty()){
+        return Guess();
+    }
+    // Use the result already computed instead of classifying the text again.
+    return ret[0];
+}
+/**
+ * Lists the languages of the loaded fingerprints whose flag byte has at
+ * least one bit in common with mask:
+ *   mask = 0xF0  -> only the available (enabled) languages
+ *   mask = 0x0F  -> only the unavailable (disabled) languages
+ *   mask = 0xFF  -> both available and unavailable languages
+ *
+ * Each returned Guess is built from "[" followed by the fingerprint name
+ * reported by fp_Name(), together with the fingerprint's index.
+ * Returns an empty vector when no handle is loaded.
+ */
+vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask)
+{
+    vector<Guess> lang;
+    if(h == NULL)
+    {
+        return lang;
+    }
+
+    textcat_t *tables = (textcat_t*)h;
+    for(size_t idx = 0; idx < tables->size; idx++)
+    {
+        // skip fingerprints whose flag does not match the requested mask
+        if((tables->fprint_disable[idx] & mask) == 0)
+        {
+            continue;
+        }
+
+        string langStr("[");
+        langStr += (char*)fp_Name(tables->fprint[idx]);
+
+        Guess entry( (char *)langStr.c_str() , idx);
+        lang.push_back(entry);
+    }
+
+    return lang;
+}
+
+/** Lists the languages selected by mask 0xF0, the value EnableLanguage() stores. */
+vector<Guess> SimpleGuesser::GetAvailableLanguages(){
+    const char availableMask = sal::static_int_cast< char >( 0xF0 );
+    return GetManagedLanguages( availableMask );
+}
+
+/** Lists the languages selected by mask 0x0F, the value DisableLanguage() stores. */
+vector<Guess> SimpleGuesser::GetUnavailableLanguages(){
+    const char unavailableMask = sal::static_int_cast< char >( 0x0F );
+    return GetManagedLanguages( unavailableMask );
+}
+
+/** Lists the languages selected by mask 0xFF, i.e. enabled and disabled ones alike. */
+vector<Guess> SimpleGuesser::GetAllManagedLanguages(){
+    const char anyMask = sal::static_int_cast< char >( 0xFF );
+    return GetManagedLanguages( anyMask );
+}
+
+/**
+ * Stores mask as the flag byte of every loaded fingerprint whose name
+ * matches lang. Shared worker of EnableLanguage() and DisableLanguage().
+ *
+ * A fingerprint matches when start(name, lang) returns 0; start() is not
+ * defined in this file (presumably a prefix comparison from altstrfunc.hxx -
+ * TODO confirm). Does nothing when no handle is loaded.
+ */
+void SimpleGuesser::XableLanguage(string lang, char mask){
+    if(h == NULL){return;}
+
+    textcat_t *tables = (textcat_t*)h;
+
+    for(size_t idx = 0; idx < tables->size; idx++){
+        string language(fp_Name(tables->fprint[idx]));
+        if(start(language,lang) != 0){
+            continue;
+        }
+        tables->fprint_disable[idx] = mask;
+    }
+}
+
+/** Marks the fingerprints matching lang with flag 0xF0 (see XableLanguage()). */
+void SimpleGuesser::EnableLanguage(string lang){
+    const char enabledFlag = sal::static_int_cast< char >( 0xF0 );
+    XableLanguage(lang, enabledFlag);
+}
+
+/** Marks the fingerprints matching lang with flag 0x0F (see XableLanguage()). */
+void SimpleGuesser::DisableLanguage(string lang){
+    const char disabledFlag = sal::static_int_cast< char >( 0x0F );
+    XableLanguage(lang, disabledFlag);
+}
+
+/**
+ * Replaces the textcat handle: a handle already held is released with
+ * textcat_Done(), then h is set to the result of
+ * special_textcat_Init(path, prefix).
+ */
+void SimpleGuesser::SetDBPath(const char* path, const char* prefix){
+    if(h != NULL)
+    {
+        textcat_Done(h);
+    }
+
+    h = special_textcat_Init(path, prefix);
+}