diff options
author | Kurt Zenker <kz@openoffice.org> | 2007-06-19 15:01:46 +0000 |
---|---|---|
committer | Kurt Zenker <kz@openoffice.org> | 2007-06-19 15:01:46 +0000 |
commit | cab1870e677f33a2501f15916110da46d3336d24 (patch) | |
tree | 127f79cbde6826d93d732989efb31b3b31616f49 | |
parent | 4020743a66e0d439bc4bfcc9d870a1bbbc6719f5 (diff) |
INTEGRATION: CWS languageguessing (1.1.2); FILE ADDED
2007/01/12 11:07:09 tl 1.1.2.1: #i73173# integrate Google SoC language-guessing
-rw-r--r-- | lingucomponent/source/languageguessing/simpleguesser.cxx | 246 |
1 files changed, 246 insertions, 0 deletions
diff --git a/lingucomponent/source/languageguessing/simpleguesser.cxx b/lingucomponent/source/languageguessing/simpleguesser.cxx new file mode 100644 index 000000000000..f32f517d920a --- /dev/null +++ b/lingucomponent/source/languageguessing/simpleguesser.cxx @@ -0,0 +1,246 @@ +/*************************************************************************** + * Copyright (C) 2006 by Jocelyn Merand * + * joc.mer@gmail.com * + * * + * + * OpenOffice.org - a multi-platform office productivity suite + * + * $RCSfile: simpleguesser.cxx,v $ + * + * $Revision: 1.2 $ + * + * last change: $Author: kz $ $Date: 2007-06-19 16:01:46 $ + * + * The Contents of this file are made available subject to + * the terms of GNU Lesser General Public License Version 2.1. + * + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2005 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ************************************************************************/ + + /** + * + * + * + * + * TODO + * - Add exception throwing when h == NULL + * - Not init h when implicit constructor is launched + */ +#include <string.h> +#include <sstream> +#include <iostream> + +#include <libtextcat/textcat.h> +#include <libtextcat/common.h> +#include <libtextcat/constants.h> +#include <libtextcat/fingerprint.h> +#include <libtextcat/utf8misc.h> + +#include <sal/types.h> + +#include "altstrfunc.hxx" +#include "simpleguesser.hxx" + +#ifndef _UTF8_ +#define _UTF8_ +#endif + + +using namespace std; + + +/** + * This 3 following structures are from fingerprint.c and textcat.c + */ + +typedef struct ngram_t { + + sint2 rank; + char str[MAXNGRAMSIZE+1]; + +} ngram_t; + +typedef struct fp_t { + + const char *name; + ngram_t *fprint; + uint4 size; + +} fp_t; + +typedef struct textcat_t{ + + void **fprint; + char *fprint_disable; + uint4 size; + uint4 maxsize; + + char output[MAXOUTPUTSIZE]; + +} textcat_t; +/** end of the 3 structs */ + +SimpleGuesser::SimpleGuesser() +{ + h = NULL; +} + +SimpleGuesser::SimpleGuesser(const char* confFile, const char* prefix) +{ + h = special_textcat_Init(confFile, prefix); +} + +void SimpleGuesser::operator=(SimpleGuesser& sg){ + if(h){textcat_Done(h);} + h = sg.h; +} + +SimpleGuesser::~SimpleGuesser() +{ + if(h){textcat_Done(h);} +} + + +/*! + \fn SimpleGuesser::GuessLanguage(char* text) + */ +vector<Guess> SimpleGuesser::GuessLanguage(char* text) +{ + vector<Guess> guesses; + + if(!h){return guesses;} + + //calculate le number of unicode charcters (symbols) + int len = utfstrlen(text); + + if( len > MAX_STRING_LENGTH_TO_ANALYSE ){len = MAX_STRING_LENGTH_TO_ANALYSE ;} + + char *guess_list = textcat_Classify(h, text, len); + + if(strcmp(guess_list, _TEXTCAT_RESULT_SHORT) == 0){ + return guesses; + } + + int current_pointer = 0; + + for(int i = 0; guess_list[current_pointer] != '\0'; i++) + { + while(guess_list[current_pointer] != GUESS_SEPARATOR_OPEN && guess_list[current_pointer] != '\0'){ + current_pointer++; + } + if(guess_list[current_pointer] != '\0') + { + Guess g((char*)(guess_list + current_pointer),i); + + guesses.push_back(g); + + current_pointer++; + } + } + + return guesses; +} + +/*! + \fn SimpleGuesser::GuessPrimaryLanguage(char* text) + */ +Guess SimpleGuesser::GuessPrimaryLanguage(char* text) +{ + vector<Guess> ret = GuessLanguage(text); + if(ret.size() > 0){ + return GuessLanguage(text)[0]; + } + else{ + return Guess(); + } +} +/** + * Is used to know wich language is available, unavailable or both + * when mask = 0xF0, return only Available + * when mask = 0x0F, return only Unavailable + * when mask = 0xFF, return both Available and Unavailable + */ +vector<Guess> SimpleGuesser::GetManagedLanguages(const char mask) +{ + size_t i; + textcat_t *tables = (textcat_t*)h; + + vector<Guess> lang; + if(!h){return lang;} + + for (i=0; i<tables->size; i++) { + if(tables->fprint_disable[i] & mask){ + string langStr = "["; + langStr += (char*)fp_Name(tables->fprint[i]); + Guess g( (char *)langStr.c_str() , i); + lang.push_back(g); + } + } + + return lang; +} + +vector<Guess> SimpleGuesser::GetAvailableLanguages(){ + return GetManagedLanguages( sal::static_int_cast< char >( 0xF0 ) ); +} + +vector<Guess> SimpleGuesser::GetUnavailableLanguages(){ + return GetManagedLanguages( sal::static_int_cast< char >( 0x0F )); +} + +vector<Guess> SimpleGuesser::GetAllManagedLanguages(){ + return GetManagedLanguages( sal::static_int_cast< char >( 0xFF )); +} + +void SimpleGuesser::XableLanguage(string lang, char mask){ + size_t i; + textcat_t *tables = (textcat_t*)h; + + if(!h){return;} + + for (i=0; i<tables->size; i++) { + string language(fp_Name(tables->fprint[i])); + if(start(language,lang) == 0){ + //cout << language << endl; + tables->fprint_disable[i] = mask; + //continue; + } + } +} + +void SimpleGuesser::EnableLanguage(string lang){ + XableLanguage(lang, sal::static_int_cast< char >( 0xF0 )); +} + +void SimpleGuesser::DisableLanguage(string lang){ + XableLanguage(lang, sal::static_int_cast< char >( 0x0F )); +} + +/** +* +*/ +void SimpleGuesser::SetDBPath(const char* path, const char* prefix){ + if(h){ + textcat_Done(h); + } + h = special_textcat_Init(path, prefix); +} |