diff options
author | Thomas Lange <tl@openoffice.org> | 2007-01-12 11:40:54 +0000 |
---|---|---|
committer | Thomas Lange <tl@openoffice.org> | 2007-01-12 11:40:54 +0000 |
commit | 23147b5b1f280e1c7758c4ce27b99dc92135b354 (patch) | |
tree | cdee4b730e97cad5db3fd941f5513dc826530fd8 /libtextcat/libtextcat-2.2.patch | |
parent | 2bb6503c63165d28d1f9a0224b675565b6acaa96 (diff) |
#i73173# integrate Google SoC language-guessing
Diffstat (limited to 'libtextcat/libtextcat-2.2.patch')
-rw-r--r-- | libtextcat/libtextcat-2.2.patch | 2137 |
1 files changed, 2137 insertions, 0 deletions
diff --git a/libtextcat/libtextcat-2.2.patch b/libtextcat/libtextcat-2.2.patch new file mode 100644 index 000000000000..81babb0eb0aa --- /dev/null +++ b/libtextcat/libtextcat-2.2.patch @@ -0,0 +1,2137 @@ +*** misc/libtextcat-2.2/src/common.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/common.c 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 3,25 **** + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 3,25 ---- + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! 
* + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 114,124 **** + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +! return( result ); + } + +! extern void* wg_realloc( void *ptr, size_t size ) +! { + void *result; + + if (!size) { +--- 114,124 ---- + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +! return( result ); + } + +! extern void* wg_realloc( void *ptr, size_t size ) +! { + void *result; + + if (!size) { +*************** +*** 131,137 **** + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +! return( result ); + } + + extern void wg_free( void *mem ) +--- 131,137 ---- + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +! return( result ); + } + + extern void wg_free( void *mem ) +*************** +*** 148,159 **** + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +! + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +! } +! + return line; + } + +--- 148,159 ---- + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +! + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +! } +! + return line; + } + +*************** +*** 164,202 **** + * + * ARGUMENTS: + * - result: +! * + * After the split, this array contains pointers to the start of each + * detected segment. 
Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +! * +! * - dest: +! * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +! * +! * - src: +! * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +! * + * Example: +! * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +! * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +! * +! * - maxsegments: +! * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +! * + * RETURN VALUE: + * The number of segments found. + */ +--- 164,202 ---- + * + * ARGUMENTS: + * - result: +! * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +! * +! * - dest: +! * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +! * +! * - src: +! * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +! * + * Example: +! * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +! 
* + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +! * +! * - maxsegments: +! * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +! * + * RETURN VALUE: + * The number of segments found. + */ +*************** +*** 223,229 **** + } + state = 1; + +! case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +--- 223,229 ---- + } + state = 1; + +! case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +*************** +*** 237,243 **** + p++; + state = 0; + break; +! } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +--- 237,243 ---- + p++; + state = 0; + break; +! } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +*************** +*** 292,308 **** + } + + + extern void wg_timerstart(wgtimer_t *t) + { +- #ifdef HAVE_GETTIMEOFDAY + gettimeofday( &(t->start), NULL ); +- #endif + } + + + extern uint4 wg_timerstop(wgtimer_t *t) + { +- #ifdef HAVE_GETTIMEOFDAY + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +--- 292,308 ---- + } + + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t) + { + gettimeofday( &(t->start), NULL ); + } ++ #endif /* TL : no struct timeval under Win32 */ + + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern uint4 wg_timerstop(wgtimer_t *t) + { + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +*************** +*** 312,336 **** + t->start.tv_usec = t->stop.tv_usec; + + return result; +- #else +- return 0; +- #endif + } + + + /** + * wg_strgmov -- a guarded strcpy() variation +! * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +! 
* finished, the function returns NULL after restoring the first +! * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +! + if ( !dest || dest >= destlimit ) { + return NULL; + } +--- 312,334 ---- + t->start.tv_usec = t->stop.tv_usec; + + return result; + } ++ #endif /* TL : no struct timeval under Win32 */ + + + /** + * wg_strgmov -- a guarded strcpy() variation +! * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +! * finished, the function returns NULL after restoring the first +! * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +! + if ( !dest || dest >= destlimit ) { + return NULL; + } +*************** +*** 355,361 **** + } + + /* +! * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +--- 353,359 ---- + } + + /* +! * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +*************** +*** 373,379 **** + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +! + while ( isspace((int)*p) ) { + p++; + } +--- 371,377 ---- + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +! + while ( isspace((int)*p) ) { + p++; + } +*** misc/libtextcat-2.2/src/common.h 2003-05-22 15:02:29.000000000 +0200 +--- misc/build/libtextcat-2.2/src/common.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 1,28 **** + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +! * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! 
* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 1,28 ---- + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +! * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! 
* + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 86,95 **** +--- 86,97 ---- + typedef char boole; + #endif + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; + } wgtimer_t; ++ #endif /* TL : no struct timeval under Win32 */ + + + extern void *wg_malloc( size_t size ); +*************** +*** 101,113 **** + + extern char *wg_getline( char *line, int size, FILE *fp ); + + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +! + #endif + +--- 103,117 ---- + + extern char *wg_getline( char *line, int size, FILE *fp ); + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); ++ #endif /* TL : no struct timeval under Win32 */ + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +! 
+ #endif + +*** misc/libtextcat-2.2/src/constants.h 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/constants.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 39,44 **** +--- 39,46 ---- + */ + #include <limits.h> + ++ #define _UTF8_ ++ + #define DESCRIPTION "out of place" + + /* Reported matches are those fingerprints with a score less than best +*************** +*** 59,72 **** + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +! /* Maximum size of an n-gram? */ +! #define MAXNGRAMSIZE 5 + + /* Which characters are not acceptable in n-grams? */ + #define INVALID(c) (isspace((int)c) || isdigit((int)c)) + + /* Minimum size (in characters) for accepting a document */ +! #define MINDOCSIZE 25 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +--- 61,81 ---- + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +! /* Maximum number of character of an n-gram? */ +! #define MAXNGRAMSYMBOL 5 +! +! /* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ +! #ifdef _UTF8_ +! #define MAXNGRAMSIZE 20 +! #else +! #define MAXNGRAMSIZE MAXNGRAMSYMBOL +! #endif + + /* Which characters are not acceptable in n-grams? */ + #define INVALID(c) (isspace((int)c) || isdigit((int)c)) + + /* Minimum size (in characters) for accepting a document */ +! #define MINDOCSIZE 6 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +*************** +*** 76,79 **** +--- 85,91 ---- + + #define MAXSCORE INT_MAX + ++ /* where the fingerprints files are stored */ ++ #define DEFAULT_FINGERPRINTS_PATH "" ++ + #endif +*** misc/libtextcat-2.2/src/fingerprint.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/fingerprint.c 2007-01-12 12:51:59.000000000 +0100 +*************** +*** 6,28 **** + * All rights reserved. + * + * THE BSD LICENSE +! 
* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 6,28 ---- + * All rights reserved. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! 
* + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 51,57 **** + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +! * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +--- 51,57 ---- + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +! * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +*************** +*** 63,68 **** +--- 63,72 ---- + * - put table/heap datastructure in a separate file. + */ + ++ #ifndef _UTF8_ ++ #define _UTF8_ ++ #endif ++ + #include "config.h" + #include <stdio.h> + #ifdef HAVE_STDLIB_H +*************** +*** 80,89 **** +--- 84,95 ---- + #include "wg_mempool.h" + #include "constants.h" + ++ #include "utf8misc.h" + + #define TABLESIZE (1<<TABLEPOW) + #define TABLEMASK ((TABLESIZE)-1) + ++ + typedef struct { + + sint2 rank; +*************** +*** 96,102 **** + const char *name; + ngram_t *fprint; + uint4 size; +! + } fp_t; + + typedef struct entry_s { +--- 102,108 ---- + const char *name; + ngram_t *fprint; + uint4 size; +! + } fp_t; + + typedef struct entry_s { +*************** +*** 105,117 **** + struct entry_s *next; + } entry_t; + +! typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +! 
+ uint4 heapsize; + uint4 size; + } table_t; +--- 111,123 ---- + struct entry_s *next; + } entry_t; + +! typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +! + uint4 heapsize; + uint4 size; + } table_t; +*************** +*** 122,128 **** + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +! * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +--- 128,134 ---- + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +! * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +*************** +*** 134,162 **** + } + + +- /* checks if n-gram lex is a prefix of key and of length len */ +- inline int issame( char *lex, char *key, int len ) +- { +- int i; +- for (i=0; i<len; i++) { +- if ( key[i] != lex[i] ) { +- return 0; +- } +- } +- if ( lex[i] != 0 ) { +- return 0; +- } +- return 1; +- } +- + + /* increases frequency of ngram(p,len) */ +! static inline int increasefreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +--- 140,153 ---- + } + + + + /* increases frequency of ngram(p,len) */ +! static int increasefreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +*************** +*** 168,174 **** + } + + /*** Not found, so create ***/ +! entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); + strcpy( entry->str, p ); + entry->cnt = 1; + +--- 159,165 ---- + } + + /*** Not found, so create ***/ +! 
entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); + strcpy( entry->str, p ); + entry->cnt = 1; + +*************** +*** 181,192 **** + #if 0 + + /* looks up ngram(p,len) */ +! static entry_t *findfreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +--- 172,183 ---- + #if 0 + + /* looks up ngram(p,len) */ +! static entry_t *findfreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +*************** +*** 219,225 **** + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +! inline static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +--- 210,216 ---- + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +! static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +*************** +*** 241,247 **** + } + + +! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +--- 232,238 ---- + } + + +! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +*************** +*** 273,279 **** + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +! t->size++; + return 0; + } + +--- 264,270 ---- + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +! t->size++; + return 0; + } + +*************** +*** 316,333 **** + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +! 
entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +! } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +! { + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +--- 307,324 ---- + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +! entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +! } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +! { + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +*************** +*** 347,360 **** + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +! wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +! + if ( name ) { + h->name = wg_strdup(name); + } +--- 338,351 ---- + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +! wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +! + if ( name ) { + h->name = wg_strdup(name); + } +*************** +*** 458,478 **** + return dest; + } + +! + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +! for (;;p++) { + +! const char *q = p; + char *m = n; + + /*** First char may be an underscore ***/ +! *m++ = *q++; + *m = '\0'; + + increasefreq( t, n, 1 ); +--- 449,475 ---- + return dest; + } + +! /** +! * this function extract all n-gram from past buffer and put them into the table "t" +! * [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice +! 
*/ + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; ++ int pointer = 0; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +! while(1) { + +! const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ + char *m = n; + + /*** First char may be an underscore ***/ +! int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ +! q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ +! m += decay; /*[modified]*/ + *m = '\0'; + + increasefreq( t, n, 1 ); +*************** +*** 482,500 **** + } + + /*** Let the compiler unroll this ***/ +! for ( i=2; i<=MAXNGRAMSIZE; i++) { + +! *m++ = *q; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +! q++; + if ( *q == '\0' ) { + return; + } + } + } + return; + } +--- 479,500 ---- + } + + /*** Let the compiler unroll this ***/ +! for ( i=2; i<=MAXNGRAMSYMBOL; i++) { + +! decay = charcopy(q, m); /*[modified] like above*/ +! m += decay; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +! q += decay; + if ( *q == '\0' ) { + return; + } + } ++ ++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ + } + return; + } +*************** +*** 514,520 **** + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return mystrcmp( x->str, y->str ); + } + +--- 514,520 ---- + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return mystrcmp( x->str, y->str ); + } + +*************** +*** 522,533 **** + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return x->rank - y->rank; + } + + /** +! 
* Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +--- 522,533 ---- + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return x->rank - y->rank; + } + + /** +! * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +*************** +*** 544,563 **** + } + + /*** Throw out all invalid chars ***/ +! tmp = prepbuffer( buffer, bufsize ); + if ( tmp == NULL ) { + return 0; + } +- + h = (fp_t*)handle; + t = inittable(maxngrams); + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +! + /*** Take the top N n-grams and add them to the profile ***/ +! table2heap(t); +! maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +--- 544,564 ---- + } + + /*** Throw out all invalid chars ***/ +! tmp = prepbuffer( buffer, bufsize ); +! /*printf("Cleaned buffer : %s\n",tmp);*/ + if ( tmp == NULL ) { + return 0; + } + h = (fp_t*)handle; + t = inittable(maxngrams); ++ /*printf("Table initialized\n");*/ + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +! /*printf("Table created\n");*/ + /*** Take the top N n-grams and add them to the profile ***/ +! table2heap(t); +! maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +*************** +*** 568,574 **** + entry_t tmp2; + + heapextract(t, &tmp2); +! + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +--- 569,575 ---- + entry_t tmp2; + + heapextract(t, &tmp2); +! 
+ /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +*************** +*** 578,584 **** + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +--- 579,585 ---- + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +*************** +*** 608,614 **** + #endif + return 0; + } +! + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +--- 609,615 ---- + #endif + return 0; + } +! + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +*************** +*** 635,641 **** + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +--- 636,642 ---- + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +*************** +*** 648,661 **** + { + uint4 i; + fp_t *h = (fp_t *)handle; +! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); +! + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +! fprintf( fp, "%s\n", tmp[i].str ); + } + wg_free( tmp ); + } +--- 649,663 ---- + { + uint4 i; + fp_t *h = (fp_t *)handle; +! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); +! + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ +! 
fprintf( fp, "%s\n", tmp[i].str); + } + wg_free( tmp ); + } +*************** +*** 669,675 **** + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +! + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +--- 671,677 ---- + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +! + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +*************** +*** 705,711 **** + } + + return sum; +! + } + + +--- 707,713 ---- + } + + return sum; +! + } + + +*** misc/libtextcat-2.2/src/fingerprint.h 2003-05-19 14:16:31.000000000 +0200 +--- misc/build/libtextcat-2.2/src/fingerprint.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 41,47 **** +--- 41,53 ---- + extern int fp_Read( void *handle, const char *fname, int maxngrams ); + extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); + extern void fp_Show( void *handle ); ++ #ifdef __cplusplus ++ extern "C" { ++ #endif + extern const char *fp_Name( void *handle ); ++ #ifdef __cplusplus ++ } ++ #endif + extern void fp_Print( void *handle, FILE *fp ); + + #endif +*** misc/libtextcat-2.2/src/Makefile.in 2003-05-22 13:39:52.000000000 +0200 +--- misc/build/libtextcat-2.2/src/Makefile.in 2007-01-12 12:48:19.181803000 +0100 +*************** +*** 124,143 **** + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +! WARNS = -W -Wall -Wshadow -Wpointer-arith +! IFLAGS = +! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +! common.h constants.h fingerprint.h textcat.h wg_mempool.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +! common.c fingerprint.c textcat.c wg_mempool.c + + + bin_PROGRAMS = createfp +--- 124,143 ---- + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +! #WARNS = -W -Wall -Wshadow -Wpointer-arith +! IFLAGS = +! 
#FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +! common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c + + + bin_PROGRAMS = createfp +*************** +*** 156,162 **** + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +! wg_mempool.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +--- 156,162 ---- + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +! wg_mempool.lo utf8misc.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +*************** +*** 177,183 **** + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +--- 177,184 ---- + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ +! 
@AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +*************** +*** 213,219 **** + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +--- 214,220 ---- + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +*************** +*** 247,253 **** + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) +--- 248,254 ---- + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) +*************** +*** 285,294 **** + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +--- 286,295 ---- + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +! 
createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +*************** +*** 304,309 **** +--- 305,311 ---- + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ ++ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ + + distclean-depend: + -rm -rf ./$(DEPDIR) +*** misc/libtextcat-2.2/src/makefile.mk 2007-01-12 12:55:41.709348000 +0100 +--- misc/build/libtextcat-2.2/src/makefile.mk 2007-01-12 12:48:19.214530000 +0100 +*************** +*** 1 **** +! dummy +--- 1,91 ---- +! #************************************************************************* +! # +! # $RCSfile: libtextcat-2.2.patch,v $ +! # +! # $Revision: 1.1 $ +! # +! # last change: $Author: tl $ $Date: 2007-01-12 12:34:52 $ +! # +! #* The Contents of this file are made available subject to +! #* the terms of GNU Lesser General Public License Version 2.1. +! #* +! #* +! #* GNU Lesser General Public License Version 2.1 +! #* ============================================= +! #* Copyright 2005 by Sun Microsystems, Inc. +! #* 901 San Antonio Road, Palo Alto, CA 94303, USA +! #* +! #* This library is free software; you can redistribute it and/or +! #* modify it under the terms of the GNU Lesser General Public +! #* License version 2.1, as published by the Free Software Foundation. +! #* +! #* This library is distributed in the hope that it will be useful, +! #* but WITHOUT ANY WARRANTY; without even the implied warranty of +! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! 
#* Lesser General Public License for more details. +! #* +! #* You should have received a copy of the GNU Lesser General Public +! #* License along with this library; if not, write to the Free Software +! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, +! #* MA 02111-1307 USA +! #* +! #************************************************************************* +! +! PRJ = ..$/..$/..$/..$/.. +! +! PRJNAME = libtextcat +! TARGET = libtextcat +! CFLAGSCALL=gsd +! +! USE_DEFFILE=TRUE +! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE +! +! .INCLUDE : settings.mk +! +! # --- Files -------------------------------------------------------- +! +! # !! not to be compiled because those belong to a stand alone programs: !! +! # $(SLO)$/createfp.obj\ +! # $(SLO)$/testtextcat.obj +! +! SLOFILES= \ +! $(SLO)$/common.obj\ +! $(SLO)$/fingerprint.obj\ +! $(SLO)$/textcat.obj\ +! $(SLO)$/wg_mempool.obj\ +! $(SLO)$/utf8misc.obj +! +! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) +! SHL1TARGET= $(TARGET) +! +! SHL1STDLIBS= +! +! # build DLL +! SHL1LIBS= $(SLB)$/$(TARGET).lib +! SHL1IMPLIB= i$(TARGET) +! SHL1DEPN= $(SHL1LIBS) +! SHL1DEF= $(MISC)$/$(SHL1TARGET).def +! +! # build DEF file +! DEF1NAME= $(SHL1TARGET) +! DEF1LIBNAME=$(TARGET) +! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt +! +! # --- Targets ------------------------------------------------------ +! +! .INCLUDE : target.mk +! +! # copy hand supplied configuration file for Win32 builds to the file +! # which is included in the source code +! $(SLOFILES) : config.h +! config.h : +! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h +! +! +! $(MISC)$/$(SHL1TARGET).flt: makefile.mk +! @echo ------------------------------ +! @echo Making: $@ +! @echo Imp>$@ +! @echo __CT>>$@ +! @echo _real>>$@ +! 
@echo unnamed>>$@ +*** misc/libtextcat-2.2/src/textcat.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/textcat.c 2007-01-12 12:52:41.000000000 +0100 +*************** +*** 4,26 **** + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 4,26 ---- + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. 
nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 74,79 **** +--- 74,80 ---- + typedef struct { + + void **fprint; ++ char *fprint_disable; + uint4 size; + uint4 maxsize; + +*************** +*** 112,122 **** + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); + wg_free( h ); + + } + +! extern void *textcat_Init( const char *conffile ) + { + textcat_t *h; + char line[1024]; +--- 113,133 ---- + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); ++ wg_free( h->fprint_disable ); + wg_free( h ); + + } + +! /** Replaces older function */ +! extern void *textcat_Init( const char *conffile ){ +! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); +! } +! +! /** +! * Originaly this function had only one parameter (conffile) it has been modified since OOo use +! * Basicaly prefix is the directory path where fingerprints are stored +! */ +! extern void *special_textcat_Init( const char *conffile, const char *prefix ) + { + textcat_t *h; + char line[1024]; +*************** +*** 134,144 **** + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +! int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +--- 145,157 ---- + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +! char finger_print_file_name[512]; +! 
int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +*************** +*** 156,172 **** + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +! } + h->size++; + } + +--- 169,191 ---- + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); +! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +! finger_print_file_name[0] = '\0'; +! strcat(finger_print_file_name, prefix); +! strcat(finger_print_file_name, segment[0]); +! +! if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +! } +! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ + h->size++; + } + +*************** +*** 203,213 **** + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +! + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +! int score = fp_Compare( h->fprint[i], unknown, threshold ); +! candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +--- 222,239 ---- + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +! + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +! int score; +! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ +! score = MAXSCORE; +! } +! else{ +! score = fp_Compare( h->fprint[i], unknown, threshold ); +! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ +! } +! 
candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +*************** +*** 218,224 **** + /*** Find the best performers ***/ + for (i=0; i<h->size; i++) { + if ( candidates[i].score < threshold ) { +- + if ( ++cnt == MAXCANDIDATES+1 ) { + break; + } +--- 244,249 ---- +*************** +*** 235,241 **** + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +! + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +--- 260,266 ---- + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +! + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +*************** +*** 247,253 **** + } + READY: + fp_Done(unknown); +! #ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +--- 272,278 ---- + } + READY: + fp_Done(unknown); +! #ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +*** misc/libtextcat-2.2/src/textcat.h 2003-05-19 14:16:31.000000000 +0200 +--- misc/build/libtextcat-2.2/src/textcat.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 40,45 **** +--- 40,48 ---- + #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" + #define _TEXTCAT_RESULT_SHORT "SHORT" + ++ #ifdef __cplusplus ++ extern "C" { ++ #endif + + /** + * textcat_Init() - Initialize the text classifier. The textfile +*************** +*** 51,60 **** +--- 54,72 ---- + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) 
++ * ++ * Replaces the older function (and has exactly the same behaviour) ++ * see below + */ + extern void *textcat_Init( const char *conffile ); + + /** ++ * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load alternative DBs ++ * Basically prefix is the directory path where fingerprints are stored ++ */ ++ extern void *special_textcat_Init( const char *conffile, const char *prefix ); ++ ++ /** + * textcat_Done() - Free up resources for handle + */ + extern void textcat_Done( void *handle ); +*************** +*** 77,80 **** +--- 89,96 ---- + * textcat_Version() - Returns a string describing the version of this classifier. + */ + extern char *textcat_Version(); ++ ++ #ifdef __cplusplus ++ } ++ #endif + #endif +*** misc/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:55:41.584585000 +0100 +--- misc/build/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:54:50.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,132 ---- +! /*************************************************************************** +! * Copyright (C) 2006 by Jocelyn Merand * +! * joc.mer@gmail.com * +! * * +! * THE BSD LICENSE +! * +! * Redistribution and use in source and binary forms, with or without +! * modification, are permitted provided that the following conditions +! * are met: +! * +! * - Redistributions of source code must retain the above copyright +! * notice, this list of conditions and the following disclaimer. +! * +! * - Redistributions in binary form must reproduce the above copyright +! * notice, this list of conditions and the following disclaimer in the +! * documentation and/or other materials provided with the +! * distribution. +! * +! * - Neither the name of the WiseGuys Internet B.V. nor the names of +! * its contributors may be used to endorse or promote products derived +! * from this software without specific prior written permission. +! * +! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +!
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +! ***************************************************************************/ +! +! #ifndef _UTF8_MISC_H_ +! #include "utf8misc.h" +! #endif +! +! +! int nextcharstart(const char *str, int position){ +! int pointer = position; +! +! if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ +! +! /*then str[pointer] is an escape character*/ +! +! char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/ +! +! while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/ +! ++pointer; +! } +! return pointer; +! } +! +! +! int charcopy(const char *str, char *dest){ +! +! int pointer = 0; +! if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ +! +! /*then str[pointer] is an escape character*/ +! +! char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/ +! +! 
while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/ +! dest[pointer] = str[pointer]; +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! if(str[pointer]){ +! dest[pointer] = str[pointer]; +! ++pointer; +! } +! +! return pointer; +! } +! +! +! int issame( char *lex, char *key, int len ) +! { +! /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ +! int char_counter = 0; +! int pointer = 0; +! while(char_counter < len) { +! +! if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ +! +! /*then key[pointer] is an escap character*/ +! +! char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/ +! +! while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! ++char_counter; /*and we are on a new utf8 character*/ +! if ( key[pointer] != lex[pointer] ) { +! return 0; +! /*printf(" NO\n", lex, key, len);*/ +! } +! ++pointer; +! } +! if ( lex[pointer] != '\0' ) { +! return 0; +! /*printf(" NO\n");*/ +! } +! +! /*printf(" YES\n");*/ +! +! return 1; +! } +! +! +! extern int utfstrlen(const char* str){ +! int char_counter = 0; +! int pointer = 0; +! while(str[pointer]) { +! pointer = nextcharstart(str, pointer); +! +! ++char_counter; /*and we are on a new utf8 character*/ +! } +! return char_counter; +! } +! +*** misc/libtextcat-2.2/src/utf8misc.h 2007-01-12 12:55:41.547021000 +0100 +--- misc/build/libtextcat-2.2/src/utf8misc.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,88 ---- +! /*************************************************************************** +! * Copyright (C) 2006 by Jocelyn Merand * +! * joc.mer@gmail.com * +! * * +! * THE BSD LICENSE +! * +! * Redistribution and use in source and binary forms, with or without +! 
* modification, are permitted provided that the following conditions +! * are met: +! * +! * - Redistributions of source code must retain the above copyright +! * notice, this list of conditions and the following disclaimer. +! * +! * - Redistributions in binary form must reproduce the above copyright +! * notice, this list of conditions and the following disclaimer in the +! * documentation and/or other materials provided with the +! * distribution. +! * +! * - Neither the name of the WiseGuys Internet B.V. nor the names of +! * its contributors may be used to endorse or promote products derived +! * from this software without specific prior written permission. +! * +! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +! ***************************************************************************/ +! +! #ifndef _UTF8_MISC_H_ +! #define _UTF8_MISC_H_ +! +! /** +! * These variables are used in character processing functions +! * These have been added to manage utf-8 symbols, particularly escape chars +! */ +! #ifdef _UTF8_ +! #define ESCAPE_MASK 0x80 +! #define WEIGHT_MASK 0xF0 +! #else +! #define ESCAPE_MASK 0xFF +! #define WEIGHT_MASK 0x00 +! #endif +! +! +! /* +! * Is used to jump to the next start of char +! 
* of course it's only useful when encoding is utf-8 +! * This function has been added by Jocelyn Merand to use libtextcat in OOo +! */ +! int nextcharstart(const char *str, int position); +! +! +! /*Copy the char in str to dest +! * of course it's only useful when encoding is utf8 and the symbol is encoded with more than 1 char +! * returns the number of chars copied +! * This function has been added by Jocelyn Merand to use libtextcat in OOo +! */ +! int charcopy(const char *str, char *dest); +! +! +! /* checks if n-gram lex is a prefix of key and of length len +! * if _UTF8_ is defined, it uses escape characters and len is not really the length of lex +! * in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 +! */ +! int issame( char *lex, char *key, int len ); +! +! +! /* Counts the number of characters +! * if _UTF8_ is defined, it uses escape characters and the result is not really the length of str +! * in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 +! */ +! #ifdef __cplusplus +! extern "C" { +! #endif +! extern int utfstrlen(const char* str); +! #ifdef __cplusplus +! } +! #endif +! +! #endif +! +*** misc/libtextcat-2.2/src/win32_config.h 2007-01-12 12:55:41.643465000 +0100 +--- misc/build/libtextcat-2.2/src/win32_config.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,136 ---- +! /* src/config.h. Generated by configure. */ +! /* src/config.h.in. Generated from configure.ac by autoheader. */ +! +! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP +! systems. This function is required for `alloca.c' support on those systems. +! */ +! /* #undef CRAY_STACKSEG_END */ +! +! /* Define to 1 if using `alloca.c'. */ +! /* #undef C_ALLOCA */ +! +! /* Define to 1 if you have `alloca', as a function or macro. */ +! /* #undef HAVE_ALLOCA */ +! +! /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). +! */ +!
/* #undef HAVE_ALLOCA_H */ +! +! /* Define to 1 if you have the <dlfcn.h> header file. */ +! #define HAVE_DLFCN_H 1 +! +! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +! /* #undef HAVE_DOPRNT */ +! +! /* Define to 1 if you have the `gettimeofday' function. */ +! /* #undef HAVE_GETTIMEOFDAY */ +! +! /* Define to 1 if you have the <inttypes.h> header file. */ +! /* #undef HAVE_INTTYPES_H */ +! +! /* Define to 1 if you have the <limits.h> header file. */ +! #define HAVE_LIMITS_H 1 +! +! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and +! to 0 otherwise. */ +! #define HAVE_MALLOC 1 +! +! /* Define to 1 if you have the <memory.h> header file. */ +! #define HAVE_MEMORY_H 1 +! +! /* Define to 1 if you have the `memset' function. */ +! #define HAVE_MEMSET 1 +! +! /* Define to 1 if your system has a GNU libc compatible `realloc' function, +! and to 0 otherwise. */ +! #define HAVE_REALLOC 1 +! +! /* Define to 1 if you have the <stdint.h> header file. */ +! /* #undef HAVE_STDINT_H */ +! +! /* Define to 1 if you have the <stdlib.h> header file. */ +! #define HAVE_STDLIB_H 1 +! +! /* Define to 1 if you have the `strchr' function. */ +! #define HAVE_STRCHR 1 +! +! /* Define to 1 if you have the `strdup' function. */ +! #define HAVE_STRDUP 1 +! +! /* Define to 1 if you have the <strings.h> header file. */ +! /* #undef HAVE_STRINGS_H */ +! +! /* Define to 1 if you have the <string.h> header file. */ +! #define HAVE_STRING_H 1 +! +! /* Define to 1 if you have the `strpbrk' function. */ +! #define HAVE_STRPBRK 1 +! +! /* Define to 1 if you have the <sys/stat.h> header file. */ +! #define HAVE_SYS_STAT_H 1 +! +! /* Define to 1 if you have the <sys/time.h> header file. */ +! /* #undef HAVE_SYS_TIME_H */ +! +! /* Define to 1 if you have the <sys/types.h> header file. */ +! #define HAVE_SYS_TYPES_H 1 +! +! /* Define to 1 if you have the <unistd.h> header file. */ +! #define HAVE_UNISTD_H 1 +! +! 
/* Define to 1 if you have the `vprintf' function. */ +! #define HAVE_VPRINTF 1 +! +! /* Name of package */ +! #define PACKAGE "libtextcat" +! +! /* Define to the address where bug reports for this package should be sent. */ +! #define PACKAGE_BUGREPORT "" +! +! /* Define to the full name of this package. */ +! #define PACKAGE_NAME "libtextcat" +! +! /* Define to the full name and version of this package. */ +! #define PACKAGE_STRING "libtextcat 2.2" +! +! /* Define to the one symbol short name of this package. */ +! #define PACKAGE_TARNAME "libtextcat" +! +! /* Define to the version of this package. */ +! #define PACKAGE_VERSION "2.2" +! +! /* If using the C implementation of alloca, define if you know the +! direction of stack growth for your system; otherwise it will be +! automatically deduced at run-time. +! STACK_DIRECTION > 0 => grows toward higher addresses +! STACK_DIRECTION < 0 => grows toward lower addresses +! STACK_DIRECTION = 0 => direction of growth unknown */ +! /* #undef STACK_DIRECTION */ +! +! /* Define to 1 if you have the ANSI C header files. */ +! #define STDC_HEADERS 1 +! +! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +! #define TIME_WITH_SYS_TIME 1 +! +! /* Define to 1 if your <sys/time.h> declares `struct tm'. */ +! /* #undef TM_IN_SYS_TIME */ +! +! /* Version number of package */ +! #define VERSION "2.2" +! +! /* Define to empty if `const' does not conform to ANSI C. */ +! /* #undef const */ +! +! /* Define as `__inline' if that's what the C compiler calls it, or to nothing +! if it is not supported. */ +! /* #undef inline */ +! +! /* Define to rpl_malloc if the replacement function should be used. */ +! /* #undef malloc */ +! +! /* Define to rpl_realloc if the replacement function should be used. */ +! /* #undef realloc */ +! +! /* Define to `unsigned' if <sys/types.h> does not define. */ +! /* #undef size_t */ |