summaryrefslogtreecommitdiff
path: root/libtextcat
diff options
context:
space:
mode:
authorThomas Lange <tl@openoffice.org>2007-01-12 11:40:54 +0000
committerThomas Lange <tl@openoffice.org>2007-01-12 11:40:54 +0000
commit23147b5b1f280e1c7758c4ce27b99dc92135b354 (patch)
treecdee4b730e97cad5db3fd941f5513dc826530fd8 /libtextcat
parent2bb6503c63165d28d1f9a0224b675565b6acaa96 (diff)
#i73173# integrate Google SoC language-guessing
Diffstat (limited to 'libtextcat')
-rw-r--r--libtextcat/data/new_fingerprints/fpdb.conf82
-rw-r--r--libtextcat/data/new_fingerprints/lm/afrikaans.lm400
-rw-r--r--libtextcat/data/new_fingerprints/lm/albanian.lm400
-rw-r--r--libtextcat/data/new_fingerprints/lm/amharic_utf.lm400
-rw-r--r--libtextcat/data/new_fingerprints/lm/arabic.lm400
-rw-r--r--libtextcat/data/new_fingerprints/lm/armenian.lm0
-rw-r--r--libtextcat/libtextcat-2.2.patch2137
-rw-r--r--libtextcat/makefile.mk92
-rw-r--r--libtextcat/prj/build.lst3
-rw-r--r--libtextcat/prj/d.lst8
10 files changed, 3922 insertions, 0 deletions
diff --git a/libtextcat/data/new_fingerprints/fpdb.conf b/libtextcat/data/new_fingerprints/fpdb.conf
new file mode 100644
index 000000000000..b72e103ddffb
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/fpdb.conf
@@ -0,0 +1,82 @@
+#
+# A sample config file for the language models
+# provided with Gertjan van Noord's language guesser
+# (http://odur.let.rug.nl/~vannoord/TextCat/)
+#
+# Notes:
+# - You may consider eliminating a couple of small languages from this
+# list because they cause false positives with big languages and are
+# bad for performance. (Do you really want to recognize Drents?)
+# - Putting the most probable languages at the top of the list
+# improves performance, because this will raise the threshold for
+# likely candidates more quickly.
+#
+
+# This file has been modified (for OOo, by Jocelyn MERAND joc.mer@gmail.com) to include country and encoding.
+# Guess strings are formed as follows: language-country-encoding
+
+afrikaans.lm af---utf8
+albanian.lm sq---utf8
+amharic_utf.lm am---utf8
+arabic.lm ar---utf8
+basque.lm eu---utf8
+belarus.lm be---utf8
+bosnian.lm bs---utf8
+breton.lm br---utf8
+catalan.lm ca---utf8
+chinese_simplified.lm zh-CN--utf8
+chinese_traditional.lm zh-TW--utf8
+croatian.lm hr---utf8
+czech.lm cs---utf8
+danish.lm da---utf8
+dutch.lm nl---utf8
+english.lm en---utf8
+esperanto.lm eo---utf8
+estonian.lm et---utf8
+finnish.lm fi---utf8
+french.lm fr---utf8
+frisian.lm fy---utf8
+georgian.lm ka---utf8
+german.lm de---utf8
+greek.lm el---utf8
+hebrew.lm he---utf8
+hindi.lm hi---utf8
+hungarian.lm hu---utf8
+icelandic.lm is---utf8
+indonesian.lm id---utf8
+irish_gaelic.lm ga---utf8
+italian.lm it---utf8
+japanese.lm ja---utf8
+korean.lm ko---utf8
+latin.lm la---utf8
+latvian.lm lv---utf8
+lithuanian.lm lt---utf8
+malay.lm ms---utf8
+manx_gaelic.lm gv---utf8
+marathi.lm mr---utf8
+nepali.lm ne---utf8
+norwegian.lm nb---utf8 # Norwegian (Bokmal)
+persian.lm fa---utf8 # Farsi
+polish.lm pl---utf8
+portuguese.lm pt-PT--utf8
+quechua.lm qu---utf8
+romanian.lm ro---utf8
+romansh.lm rm---utf8
+russian.lm ru---utf8
+sanskrit.lm sa---utf8
+scots.lm sco---utf8
+scots_gaelic.lm gd---utf8
+serbian_ascii.lm sh-YU--utf8
+slovak_ascii.lm sk-SK--utf8
+slovenian.lm sl---utf8
+spanish.lm es---utf8
+swahili.lm sw---utf8
+swedish.lm sv---utf8
+tagalog.lm tl---utf8
+tamil.lm ta---utf8
+thai.lm th---utf8
+turkish.lm tr---utf8
+ukrainian.lm uk---utf8
+vietnamese.lm vi---utf8
+welsh.lm cy---utf8
+yiddish_utf.lm yi---utf8
diff --git a/libtextcat/data/new_fingerprints/lm/afrikaans.lm b/libtextcat/data/new_fingerprints/lm/afrikaans.lm
new file mode 100644
index 000000000000..c110f154b664
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/lm/afrikaans.lm
@@ -0,0 +1,400 @@
+_ 23602
+e 8036
+a 4087
+n 3782
+i 3726
+o 3314
+r 2951
+s 2885
+t 2749
+d 2479
+e_ 2118
+l 1854
+k 1741
+ie 1670
+g 1601
+n_ 1447
+m 1440
+_d 1219
+t_ 1143
+er 1124
+h 1124
+u 1110
+ie_ 1079
+y 1048
+w 986
+s_ 982
+_s 969
+_h 956
+di 924
+an 922
+r_ 912
+aa 882
+v 876
+en 807
+_di 807
+. 790
+y_ 747
+_v 709
+et 706
+._ 694
+die 691
+die_ 667
+_n 666
+_die 651
+p 639
+_m 634
+_die_ 633
+_w 632
+ee 607
+ge 606
+_o 598
+b 586
+te 568
+, 560
+in 555
+k_ 550
+_e 550
+,_ 548
+oo 516
+et_ 511
+de 509
+el 489
+_g 486
+f 461
+ar 451
+ni 450
+nd 442
+an_ 440
+en_ 437
+_i 426
+he 423
+g_ 418
+_t 412
+oe 410
+at 406
+er_ 400
+om 381
+wa 378
+_a 378
+_b 377
+_k 371
+nie 371
+_he 370
+aar 355
+_ge 351
+es 351
+_ni 348
+da 346
+m_ 342
+ou 338
+it 335
+_nie 335
+d_ 332
+l_ 330
+_wa 329
+or 327
+le 326
+we 326
+ek 324
+het 321
+me 319
+_het 319
+is 318
+j 315
+at_ 311
+on 309
+se 308
+_en 298
+ma 294
+st 291
+as 280
+va 277
+_en_ 270
+re 270
+" 269
+' 265
+het_ 261
+_het_ 260
+om_ 254
+al 252
+ar_ 250
+li 248
+te_ 247
+aar_ 247
+_da 245
+u_ 242
+nde 241
+ou_ 237
+_l 231
+be 229
+_' 226
+rd 224
+_va 224
+ig 223
+ng 222
+ns 221
+ve 220
+it_ 218
+_j 216
+_me 216
+sy 215
+ke 213
+_sy 212
+aan 212
+van 212
+_in 210
+is_ 210
+in_ 208
+sy_ 206
+_sy_ 206
+'n 205
+ro 205
+ko 204
+_'n 203
+ra 203
+'n_ 203
+_'n_ 202
+so 202
+D 202
+ho 201
+rs 200
+eer 200
+ik 199
+la 198
+_te 196
+_van 196
+_ma 195
+as_ 194
+ui 194
+ver 192
+e. 192
+der 191
+to 188
+op 187
+van_ 184
+ag 184
+_ve 182
+and 180
+_van_ 178
+ha 178
+f_ 176
+ka 176
+ne 175
+_is 175
+sk 174
+e._ 174
+oor 174
+_ver 170
+ek_ 170
+_hy 170
+hy 170
+p_ 168
+_be 168
+ri 168
+ur 167
+nie_ 165
+_so 165
+_D 164
+si 164
+ll 164
+no 164
+_in_ 163
+_hy_ 162
+hy_ 162
+ed 161
+ers 160
+_r 156
+ak 156
+_ho 155
+_nie_ 153
+eg 153
+nt 152
+de_ 152
+_p 151
+_we 148
+_is_ 148
+ei 147
+es_ 142
+maa 142
+wee 142
+na 141
+nder 139
+a_ 138
+ing 138
+ew 138
+S 135
+lle 135
+_om 135
+_te_ 134
+eu 134
+ie. 134
+wo 132
+em 132
+wat 131
+_no 130
+_" 130
+vo 130
+E 129
+H 128
+_wat 127
+ti 126
+mo 126
+A 126
+e, 126
+_ha 125
+vi 125
+el_ 125
+ter 125
+e,_ 124
+dat 124
+eer_ 124
+wat_ 124
+le_ 124
+ta 124
+Di 123
+dat_ 123
+_wat_ 122
+ie._ 122
+was 121
+ste 121
+_H 121
+_se 121
+se_ 120
+ul 120
+al_ 120
+_was 120
+_om_ 119
+_st 119
+lik 118
+"_ 118
+_ko 118
+_maa 118
+lo 117
+_to 117
+ns_ 115
+aan_ 115
+nie. 114
+_vi 114
+met 114
+_nie. 111
+nk 110
+_Di 110
+- 110
+_op 109
+_oo 109
+_on 108
+ir 108
+ord 108
+uit 106
+ens 105
+_was_ 105
+was_ 105
+een 105
+_met 105
+os 105
+_S 104
+nie._ 104
+ig_ 103
+_sk 102
+op_ 101
+_ek 101
+_wee 101
+ir_ 101
+met_ 100
+_met_ 100
+rt 100
+ik_ 99
+end 99
+nd_ 99
+gt 99
+ond 98
+ot 98
+_aa 97
+og 97
+vir_ 95
+vir 95
+_ka 94
+hu 94
+_mo 94
+_vir_ 94
+_vir 94
+_dit 93
+kr 93
+am 93
+ol 93
+dit 93
+_ek_ 93
+ki 93
+sa 93
+_aan 92
+man 92
+jy 92
+ng_ 92
+aak 92
+lle_ 91
+_hu 91
+_na 91
+_vo 90
+ewe 90
+of 90
+jy_ 90
+_dit_ 90
+dit_ 90
+_jy 89
+der_ 89
+jo 89
+_f 88
+_u 88
+sie 87
+_dat 87
+_jy_ 87
+daa 87
+do 87
+vr 87
+wi 86
+ry 86
+_dat_ 86
+eur 86
+rs_ 85
+_jo 85
+_wo 84
+_ne 84
+jie 84
+ji 84
+pe 83
+moe 83
+my 82
+ull 82
+Die 81
+maar 81
+_hom 81
+ulle 81
+_maar 81
+hom 81
+_uit 80
+_ui 80
+ges 80
+raa 80
+or_ 80
+ies 80
+jou 79
+_la 79
+maar_ 79
+ulle_ 79
+_daa 79
+Die_ 79
+daar 78
+_daar 78
+ien 78
+_my 78
+_jou 78
+ok 78
+il 78
+lik_ 77
+sta 77
+_Die 77
+ur_ 77
+ga 77
+ag_ 77
+kan 77
diff --git a/libtextcat/data/new_fingerprints/lm/albanian.lm b/libtextcat/data/new_fingerprints/lm/albanian.lm
new file mode 100644
index 000000000000..0665a962d018
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/lm/albanian.lm
@@ -0,0 +1,400 @@
+_ 19480
+ë 4099
+e 4082
+t 3635
+i 3134
+a 2893
+r 2820
+n 2610
+s 2380
+h 2060
+ë_ 2055
+e_ 1825
+j 1677
+u 1489
+d 1381
+o 1370
+m 1318
+k 1264
+të 1091
+p 1072
+_t 1068
+sh 998
+l 936
+_n 876
+a_ 822
+, 816
+,_ 808
+të_ 795
+i_ 770
+_p 739
+_m 702
+_s 700
+te 653
+ër 620
+_d 613
+_e 607
+g 602
+_k 601
+_të 593
+. 575
+_të_ 574
+v 567
+_e_ 554
+r_ 525
+._ 523
+ht 503
+n_ 480
+he 473
+në 462
+sht 461
+te_ 457
+q 454
+nd 436
+ri 432
+is 414
+et 403
+b 402
+je 401
+me 395
+in 391
+it 381
+rë 374
+_a 374
+t_ 359
+ur 353
+_i 346
+ar 342
+ës 339
+er 338
+në_ 338
+ën 338
+dh 337
+en 336
+pë 334
+f 328
+_v 323
+jë 318
+nj 313
+ish 312
+për 294
+y 285
+z 282
+es 281
+at 274
+_me 273
+_q 273
+gj 269
+ra 261
+as 258
+_në 256
+ku 256
+j_ 250
+ta 249
+re 246
+një 245
+o_ 243
+ni 243
+_pë 240
+hte 240
+_nj 239
+on 239
+isht 236
+pa 234
+th 233
+shte 233
+_për 232
+se 228
+_g 223
+ve 221
+in_ 220
+s_ 219
+_në_ 219
+do 218
+hte_ 218
+më 216
+ti 215
+aj 212
+shte_ 212
+ej 212
+u_ 211
+që 211
+_sh 210
+nt 207
+jë_ 206
+_b 205
+_një 203
+di 202
+_pa 201
+_i_ 201
+ll 199
+_f 199
+kë 198
+me_ 197
+dhe 195
+ishte 195
+si 194
+hi 191
+he_ 188
+- 187
+ja 187
+_që 187
+ua 186
+il 184
+_dh 184
+ur_ 183
+ër_ 182
+or 180
+se_ 179
+që_ 178
+S 176
+ç 175
+_h 173
+an 172
+një_ 172
+ng 170
+nte 170
+_që_ 169
+_S 169
+rë_ 166
+dhe_ 165
+_me_ 164
+ka 162
+im 159
+hë 158
+mi 157
+to 156
+tu 156
+ën_ 155
+_një_ 154
+ha 153
+nte_ 150
+tr 148
+sa 148
+ët 148
+_gj 148
+un 147
+rr 147
+ë, 147
+_dhe 147
+ej_ 147
+ki 146
+ë,_ 146
+_ku 145
+_- 144
+_ng 142
+ik 141
+_nd 140
+end 138
+uk 137
+etë 135
+ko 135
+_dhe_ 135
+_ve 132
+va 131
+_l 131
+për_ 131
+shi 131
+erë 129
+ke 127
+kis 127
+së 126
+jo 125
+li 124
+ga 124
+kish 123
+_ki 122
+po 122
+_se 122
+' 121
+du 120
+mb 120
+_më 119
+Si 115
+më_ 115
+esh 115
+_si 114
+qe 114
+lë 114
+_kis 113
+oh 113
+_kish 113
+_Si 113
+pr 112
+_u 112
+uar 111
+de 111
+hu 111
+_th 111
+al 111
+ta_ 109
+ilv 108
+Sil 108
+Silv 108
+lv 108
+k_ 108
+e, 108
+ji 107
+e,_ 106
+_Sil 106
+_Silv 106
+_r 105
+os 104
+_se_ 104
+kisht 102
+_di 102
+st 101
+_për_ 101
+bë 101
+tj 100
+_nga 99
+nga 99
+_du 98
+ra_ 98
+vë 98
+gji 98
+_ish 96
+rt 96
+_is 96
+ro 95
+ir 94
+ga_ 94
+ësh 94
+ont 93
+c 93
+t, 93
+t,_ 93
+hin 92
+a, 92
+_at 92
+und 92
+jt 91
+_mb 91
+a,_ 91
+tje 90
+_nga_ 90
+_do 90
+_pr 90
+rit 90
+men 90
+nga_ 90
+ri_ 89
+N 89
+ma 89
+it_ 88
+_kë 88
+-_ 88
+m_ 87
+jo_ 87
+onte 87
+atë 87
+la 87
+ëri 87
+ilva 86
+shin 86
+ë. 86
+Silva 86
+lva 86
+së_ 85
+jer 85
+et_ 85
+_po 85
+ës_ 84
+kur 84
+ru 84
+nin 83
+ot 83
+hin_ 83
+_N 83
+her 83
+htë 82
+ap 82
+shin_ 82
+mo 81
+ash 81
+tha 81
+_ç 81
+ë._ 81
+ëm 81
+jit 80
+_ta 80
+ul 80
+le 80
+ho 80
+_z 79
+dr 78
+jet 78
+nin_ 78
+_më_ 78
+gjit 78
+A 78
+hk 78
+onte_ 78
+oni 77
+lo 77
+ba 77
+herë 77
+ndo 76
+shk 76
+mend 75
+_vë 75
+ha_ 75
+dë 75
+tur 74
+_A 74
+el 74
+bi 74
+_ko 74
+uk_ 73
+erë_ 73
+si_ 73
+_sa 73
+ar_ 72
+P 72
+rs 72
+pas 72
+ith 72
+uar_ 71
+_isht 71
+ai 70
+e. 70
+_vet 70
+vet 70
+_bë 70
+zi 70
+d_ 70
+jith 70
+da 70
+gjith 69
+duk 69
+na 69
+hej 69
+tër 68
+_men 68
+_ka 68
+am 68
+nd_ 68
+_c 67
+_pas 67
+_duk 67
+jes 67
+ak 67
+s, 67
+e._ 67
+s,_ 67
+K 67
+ësht 67
+mu 66
+kur_ 66
+yr 66
+em 65
+_së 65
+tha_ 65
+imi 65
+ie 65
+hej_ 64
+_së_ 64
+_u_ 64
+? 64
+fu 64
+_P 64
diff --git a/libtextcat/data/new_fingerprints/lm/amharic_utf.lm b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm
new file mode 100644
index 000000000000..0c5bc813e663
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm
@@ -0,0 +1,400 @@
+ 21403
+_ 10092
+ 7734
+ 6558
+_ 5003
+ 4717
+ 4401
+ 4274
+ 4176
+ 4054
+ 3868
+ 2728
+ 1656
+ 1591
+ 1579
+ 1425
+ 1402
+_ 1261
+_ 1231
+ 1217
+ 1187
+ 1183
+_ 1160
+ 1145
+ 1123
+ 1097
+ን 1043
+ 1043
+ 1041
+ 1004
+_ 991
+ 936
+ 880
+ 855
+ው 855
+ 849
+ 805
+ 783
+ት 783
+_ 763
+ 709
+ 704
+ 682
+በ 682
+ 679
+ 670
+ 667
+በ 666
+ 666
+ 658
+ 643
+የ 637
+ 637
+ 627
+የ 627
+ለ 614
+ 614
+ 611
+ር 611
+_ 588
+_ 583
+ት_ 583
+_የ 577
+_የ 574
+ለ 573
+ 573
+ን 570
+ 570
+መ 563
+ 563
+ 557
+መ 557
+ 554
+አ 554
+አ 553
+ተ 553
+ 553
+ 553
+ተ 547
+ 547
+ም 534
+ 534
+ 532
+- 531
+ስ 525
+ 525
+-- 521
+ል 515
+ 515
+--- 512
+---- 503
+_በ 499
+----- 494
+_በ 487
+ 479
+ 477
+_ 473
+ 469
+ው 469
+ን_ 468
+_ 468
+ 465
+ 464
+ያ 457
+ 457
+ 444
+_አ 424
+_አ 424
+ስ 423
+ 423
+_ 415
+ 402
+ 401
+ 390
+ 389
+ 382
+_ 378
+ው_ 378
+ 365
+ 364
+ያ 364
+ 363
+ል 357
+ 357
+ 356
+_ 351
+ 347
+ች 347
+ 341
+ነ 341
+ይ 337
+ 337
+። 337
+ 337
+_ 337
+።_ 337
+ 336
+ 334
+ 320
+እ 320
+ 320
+ 320
+ 318
+እ 318
+_ 314
+ር_ 314
+ 312
+ 311
+ 301
+ና 300
+ገ 300
+ 300
+ 300
+ 299
+ 297
+ር 294
+ 294
+ግ 294
+ 294
+ 293
+ 291
+ከ 291
+ 291
+ 291
+ 291
+ም 291
+ገ 291
+ነ 291
+ደ 288
+ 288
+_እ 285
+_እ 283
+ 279
+ 279
+ከ 279
+ 279
+በ 279
+ን 276
+ 276
+ 276
+_ 272
+ 270
+ብ 270
+_ 269
+ግ 264
+ 264
+ 262
+ 262
+መ 262
+ 262
+ይ 261
+ 261
+ 260
+ማ 260
+ደ 259
+ 259
+ራ 254
+ 254
+ባ 254
+ 254
+ 253
+ 249
+ 247
+ 245
+ 244
+ላ 242
+ 242
+የ 242
+ 242
+ማ 238
+ 238
+ 237
+ረ 237
+ 237
+ 236
+ተ 236
+ም_ 235
+_ 235
+ 234
+ 233
+ 233
+ 230
+ 230
+ባ 230
+ሚ 230
+ድ 228
+ 228
+_መ 227
+ 227
+_መ 226
+ 225
+ረ 225
+ 225
+። 222
+።_ 222
+ 216
+እ 214
+ሚ 214
+ 214
+ 214
+ 213
+ 212
+ 210
+ 209
+ላ 209
+ 208
+ 207
+ 206
+_ 206
+ 205
+ብ 205
+ 202
+ 200
+ታ 200
+ 200
+ሰ 200
+ 199
+ራ 199
+ሰ 198
+ 198
+ት 195
+ወ 195
+ 195
+ 195
+ 194
+ወ 194
+ 191
+ 191
+_ 189
+ች_ 189
+ 188
+ 186
+ 186
+_ለ 184
+_ለ 183
+ለ 183
+ 183
+ን 180
+ 179
+የ 179
+ 178
+ 177
+ን 177
+_ከ 175
+ 174
+ጥ 174
+ 172
+አ 172
+_ከ 170
+ 170
+_ 169
+ን 169
+ 166
+ 166
+ል 165
+_ 165
+ 165
+ና_ 163
+_ 163
+ 160
+ቸ 160
+ቸ 160
+ 160
+ 160
+ 159
+ 159
+ 158
+ቀ 158
+ 158
+ 156
+ቀ 155
+ 155
+ች 154
+ 154
+ል_ 154
+ 154
+ው 154
+ቸ 154
+_ 154
+ 152
+ 151
+ው 151
+ 150
+_ነ 150
+ 150
+_ነ 150
+ 150
+_ይ 150
+_ይ 150
+ 149
+ታ 149
+። 148
+ 147
+ደ 147
+ 147
+በ 147
+ 146
+_ተ 146
+_ተ 146
+ለ 145
+ 145
+ድ 144
+ 144
+ 144
+ 144
+ቅ 143
+ 143
+_ 143
+ግ 142
+ 142
+ 141
+ዳ 141
+ 139
+ህ 138
+ 138
+ 137
+ና 137
+ 137
+ን 136
+ 136
+አ 135
+ 135
+ 135
+ስ 134
+ 134
+ጠ 133
+ዳ 133
+ 133
+ሆ 133
+ 133
+ሆ 133
+ 133
+ 133
+ሆ 132
+ 132
+ተ 131
+ያ 131
+ተ 131
+ 131
+ 129
+ 128
+ 128
+ክ 128
+ፍ 128
+ 128
+ 127
+ 127
+ጠ 127
+ 126
+ካ 126
+ 124
+በ 123
+ 123
+ያ 123
+ 123
+ 123
+ 122
+ከ 122
+ 121
+ገ 121
diff --git a/libtextcat/data/new_fingerprints/lm/arabic.lm b/libtextcat/data/new_fingerprints/lm/arabic.lm
new file mode 100644
index 000000000000..85f701965e2e
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/lm/arabic.lm
@@ -0,0 +1,400 @@
+_
+ال
+_ا
+_ال
+.
+_و
+ن_
+ا_
+..
+_م
+._
+ة_
+ه_
+لا
+_أ
+ان
+_ف
+_ب
+لم
+د_
+ول
+ي_
+ى_
+...
+وج
+_ل
+_ع
+ل_
+وا
+جو
+ْ.
+الم
+_الم
+..._
+.._
+ود
+من
+وجو
+َ_
+في
+لا_
+جود
+ر_
+لى_
+لى
+ان_
+وجود
+لو
+م_
+_ت
+_من
+ْ...
+_وا
+لع
+الو
+عل
+ْ..._
+ْ..
+ين
+الع
+_في
+ات
+_ي
+_الع
+ُ_
+_ك
+_الو
+من_
+_ان
+مر
+في_
+يا
+ب_
+را
+،_
+ِ_
+_في_
+تب
+_من_
+لوج
+كا
+لي
+ت_
+لوجو
+ون
+الوج
+اء
+جود_
+أح
+_أح
+الوجو
+له
+ود_
+ها
+حا
+ذا
+_ر
+على_
+وجود_
+على
+رب
+لوجود
+عر
+_ان_
+او
+اول
+رت
+لت
+بْ
+أحا
+_الوج
+أحاو
+با
+وال
+_ول
+اد
+_وال
+حاول
+_أحاو
+_أحا
+أحاول
+_،_
+حاو
+_،
+ني
+بي
+_عل
+لن
+ته
+ما
+-_
+-
+مرتب
+نا
+_.
+ها_
+مرت
+_._
+_-
+_-_
+بة
+ول_
+_ح
+رتب
+دا
+له_
+ء_
+ك_
+قي
+تبة
+اول_
+مرتبة
+ية
+بل
+ور
+ده
+الت
+رتبة
+الا
+رتبة_
+ين_
+عرب
+ير
+بة_
+تبة_
+قد
+ربْ
+لعربْ
+لعر
+العر
+أن
+لك
+حد
+ون_
+لعرب
+_على_
+_العر
+تُ
+عن
+بْ.
+_لا
+حاول_
+ذات
+العرب
+_على
+ية_
+عربْ
+اب
+سا
+نو
+كو
+المر
+لل
+يت
+_ش
+لم_
+_المر
+اع
+مو
+لمر
+_الا
+ته_
+اج
+_ق
+س_
+ائ
+جب
+ام
+اجب_
+كون
+واجب_
+لَ
+_لا_
+اني
+سي
+واج
+سم
+لَ_
+يس
+ال_
+_ولا
+عي
+وص
+عا
+جب_
+اس
+ير_
+_مر
+واجب
+اجب
+_بل
+الن
+ولا
+_بال
+وأ
+أع
+اك
+وق
+بلاد
+نت
+نف
+ضا
+نه
+كون_
+بْ..
+ثل
+كل
+ولا_
+_ذا
+ذاته
+المرت
+دة
+ذاته_
+ور_
+بال
+بْ...
+_ولا_
+_الت
+يه
+_الل
+_س
+اء_
+ات_
+بلا
+_وأ
+_ذ
+صو
+ربْ.
+_بلاد
+لاد
+_بلا
+لمرتب
+_ه
+بن
+لمرت
+عربْ.
+_ن
+_ذات
+اته_
+لله
+ْ._
+_با
+اته
+_إ
+وم
+الل
+الوا
+موج
+_الله
+نْ
+لُ
+اف
+_يكو
+لر
+قا
+عين
+ست
+يكون
+موجو
+ليس
+ده_
+لُ_
+_وج
+_وص
+دي
+حم
+الواج
+بين
+_الر
+_يك
+مس
+مُ
+لله_
+ٍ_
+عد
+يل
+_الن
+عق
+اش
+يكو
+يق
+الر
+تُ_
+_كا
+شي
+_يكون
+لوا
+ار
+موجود
+يك
+هْ
+_ذاته
+ع_
+جا
+الله
+فو
+وب
+_عي
+رس
+دة_
+لواجب
+يكون_
+لواج
+رك
+ف_
+كان
+لص
+لش
+لث
+زا
+ياء
+ساء
+لعق
+انت
+علم
+العق
+ما_
+قد_
+لف
+الله_
diff --git a/libtextcat/data/new_fingerprints/lm/armenian.lm b/libtextcat/data/new_fingerprints/lm/armenian.lm
new file mode 100644
index 000000000000..e69de29bb2d1
--- /dev/null
+++ b/libtextcat/data/new_fingerprints/lm/armenian.lm
diff --git a/libtextcat/libtextcat-2.2.patch b/libtextcat/libtextcat-2.2.patch
new file mode 100644
index 000000000000..81babb0eb0aa
--- /dev/null
+++ b/libtextcat/libtextcat-2.2.patch
@@ -0,0 +1,2137 @@
+*** misc/libtextcat-2.2/src/common.c 2003-05-22 13:32:43.000000000 +0200
+--- misc/build/libtextcat-2.2/src/common.c 2007-01-11 13:19:40.000000000 +0100
+***************
+*** 3,25 ****
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+--- 3,25 ----
+ *
+ * Copyright (c) 2003, WiseGuys Internet B.V.
+ * All rights reserved.
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+***************
+*** 114,124 ****
+ wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
+ }
+
+! return( result );
+ }
+
+! extern void* wg_realloc( void *ptr, size_t size )
+! {
+ void *result;
+
+ if (!size) {
+--- 114,124 ----
+ wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
+ }
+
+! return( result );
+ }
+
+! extern void* wg_realloc( void *ptr, size_t size )
+! {
+ void *result;
+
+ if (!size) {
+***************
+*** 131,137 ****
+ wgmem_error( "Error while reallocing %u bytes.\n", size );
+ }
+
+! return( result );
+ }
+
+ extern void wg_free( void *mem )
+--- 131,137 ----
+ wgmem_error( "Error while reallocing %u bytes.\n", size );
+ }
+
+! return( result );
+ }
+
+ extern void wg_free( void *mem )
+***************
+*** 148,159 ****
+ if ( fgets(line, size, fp) == NULL ) {
+ return NULL;
+ }
+!
+ /** kill term null **/
+ if ( (p = strpbrk( line, "\n\r" )) ) {
+ *p = '\0';
+! }
+!
+ return line;
+ }
+
+--- 148,159 ----
+ if ( fgets(line, size, fp) == NULL ) {
+ return NULL;
+ }
+!
+ /** kill term null **/
+ if ( (p = strpbrk( line, "\n\r" )) ) {
+ *p = '\0';
+! }
+!
+ return line;
+ }
+
+***************
+*** 164,202 ****
+ *
+ * ARGUMENTS:
+ * - result:
+! *
+ * After the split, this array contains pointers to the start of each
+ * detected segment. Must be preallocated and at least as large as
+ * maxsegments. The pointers point into the dest buffer.
+! *
+! * - dest:
+! *
+ * String into which result points as an index. Must be preallocated, and
+ * at least as big as src. You can use src as dest, but in that case src
+ * is overwritten!
+! *
+! * - src:
+! *
+ * The string to split. Sequences of whitespace are treated as separators, unless
+ * escaped. There are two ways to escape: by using single quotes (anything
+ * between single quotes is treated as one segment), or by using a backslash
+ * to escape the next character. The backslash escape works inside quotation
+ * as well.
+! *
+ * Example:
+! *
+ * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
+! *
+ * "It's"
+ * "very easy"
+ * "to use WiseGuys' wg_split()"
+ * "function"
+! *
+! * - maxsegments:
+! *
+ * The maximum number of segments. If the splitter runs out of segments,
+ * the remainder of the string is stored in the last segment.
+! *
+ * RETURN VALUE:
+ * The number of segments found.
+ */
+--- 164,202 ----
+ *
+ * ARGUMENTS:
+ * - result:
+! *
+ * After the split, this array contains pointers to the start of each
+ * detected segment. Must be preallocated and at least as large as
+ * maxsegments. The pointers point into the dest buffer.
+! *
+! * - dest:
+! *
+ * String into which result points as an index. Must be preallocated, and
+ * at least as big as src. You can use src as dest, but in that case src
+ * is overwritten!
+! *
+! * - src:
+! *
+ * The string to split. Sequences of whitespace are treated as separators, unless
+ * escaped. There are two ways to escape: by using single quotes (anything
+ * between single quotes is treated as one segment), or by using a backslash
+ * to escape the next character. The backslash escape works inside quotation
+ * as well.
+! *
+ * Example:
+! *
+ * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
+! *
+ * "It's"
+ * "very easy"
+ * "to use WiseGuys' wg_split()"
+ * "function"
+! *
+! * - maxsegments:
+! *
+ * The maximum number of segments. If the splitter runs out of segments,
+ * the remainder of the string is stored in the last segment.
+! *
+ * RETURN VALUE:
+ * The number of segments found.
+ */
+***************
+*** 223,229 ****
+ }
+ state = 1;
+
+! case 1:
+ /*** Start segment ***/
+ result[cnt] = w;
+ cnt++;
+--- 223,229 ----
+ }
+ state = 1;
+
+! case 1:
+ /*** Start segment ***/
+ result[cnt] = w;
+ cnt++;
+***************
+*** 237,243 ****
+ p++;
+ state = 0;
+ break;
+! }
+ else if ( *p == '\'' ) {
+ /*** Start quotation ***/
+ p++;
+--- 237,243 ----
+ p++;
+ state = 0;
+ break;
+! }
+ else if ( *p == '\'' ) {
+ /*** Start quotation ***/
+ p++;
+***************
+*** 292,308 ****
+ }
+
+
+ extern void wg_timerstart(wgtimer_t *t)
+ {
+- #ifdef HAVE_GETTIMEOFDAY
+ gettimeofday( &(t->start), NULL );
+- #endif
+ }
+
+
+ extern uint4 wg_timerstop(wgtimer_t *t)
+ {
+- #ifdef HAVE_GETTIMEOFDAY
+ uint4 result;
+ gettimeofday( &(t->stop), NULL );
+ result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
+--- 292,308 ----
+ }
+
+
++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
+ extern void wg_timerstart(wgtimer_t *t)
+ {
+ gettimeofday( &(t->start), NULL );
+ }
++ #endif /* TL : no struct timeval under Win32 */
+
+
++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
+ extern uint4 wg_timerstop(wgtimer_t *t)
+ {
+ uint4 result;
+ gettimeofday( &(t->stop), NULL );
+ result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
+***************
+*** 312,336 ****
+ t->start.tv_usec = t->stop.tv_usec;
+
+ return result;
+- #else
+- return 0;
+- #endif
+ }
+
+
+ /**
+ * wg_strgmov -- a guarded strcpy() variation
+! *
+ * copies src to dest (including terminating zero), and returns
+ * pointer to position of terminating zero in dest. The function is
+ * guaranteed not to write past destlimit. If the copy couldn't be
+! * finished, the function returns NULL after restoring the first
+! * character in dest for your convenience (since this is usually a zero).
+ */
+ char *wg_strgmov( char *dest, const char *src, const char *destlimit )
+ {
+ char tmp, *w;
+!
+ if ( !dest || dest >= destlimit ) {
+ return NULL;
+ }
+--- 312,334 ----
+ t->start.tv_usec = t->stop.tv_usec;
+
+ return result;
+ }
++ #endif /* TL : no struct timeval under Win32 */
+
+
+ /**
+ * wg_strgmov -- a guarded strcpy() variation
+! *
+ * copies src to dest (including terminating zero), and returns
+ * pointer to position of terminating zero in dest. The function is
+ * guaranteed not to write past destlimit. If the copy couldn't be
+! * finished, the function returns NULL after restoring the first
+! * character in dest for your convenience (since this is usually a zero).
+ */
+ char *wg_strgmov( char *dest, const char *src, const char *destlimit )
+ {
+ char tmp, *w;
+!
+ if ( !dest || dest >= destlimit ) {
+ return NULL;
+ }
+***************
+*** 355,361 ****
+ }
+
+ /*
+! * wg_trim() -- remove whitespace surrounding a string.
+ *
+ * Example: " bla bla bla " becomes "bla bla bla" after trimming.
+ *
+--- 353,359 ----
+ }
+
+ /*
+! * wg_trim() -- remove whitespace surrounding a string.
+ *
+ * Example: " bla bla bla " becomes "bla bla bla" after trimming.
+ *
+***************
+*** 373,379 ****
+ char *lastnonspace = &dest[-1];
+ const char *p = src;
+ char *w = dest;
+!
+ while ( isspace((int)*p) ) {
+ p++;
+ }
+--- 371,377 ----
+ char *lastnonspace = &dest[-1];
+ const char *p = src;
+ char *w = dest;
+!
+ while ( isspace((int)*p) ) {
+ p++;
+ }
+*** misc/libtextcat-2.2/src/common.h 2003-05-22 15:02:29.000000000 +0200
+--- misc/build/libtextcat-2.2/src/common.h 2007-01-11 13:19:40.000000000 +0100
+***************
+*** 1,28 ****
+ #ifndef _COMMON_H_
+ #define _COMMON_H_
+ /**
+! * common.h -- a mixed bag of helper functions
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+--- 1,28 ----
+ #ifndef _COMMON_H_
+ #define _COMMON_H_
+ /**
+! * common.h -- a mixed bag of helper functions
+ *
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+***************
+*** 86,95 ****
+--- 86,97 ----
+ typedef char boole;
+ #endif
+
++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
+ typedef struct wgtimer_s {
+ struct timeval start;
+ struct timeval stop;
+ } wgtimer_t;
++ #endif /* TL : no struct timeval under Win32 */
+
+
+ extern void *wg_malloc( size_t size );
+***************
+*** 101,113 ****
+
+ extern char *wg_getline( char *line, int size, FILE *fp );
+
+ extern void wg_timerstart(wgtimer_t *t);
+ extern uint4 wg_timerstop(wgtimer_t *t);
+
+ extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
+ extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
+ extern char *wg_trim( char *dest, const char *src );
+
+!
+ #endif
+
+--- 103,117 ----
+
+ extern char *wg_getline( char *line, int size, FILE *fp );
+
++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
+ extern void wg_timerstart(wgtimer_t *t);
+ extern uint4 wg_timerstop(wgtimer_t *t);
++ #endif /* TL : no struct timeval under Win32 */
+
+ extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
+ extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
+ extern char *wg_trim( char *dest, const char *src );
+
+!
+ #endif
+
+*** misc/libtextcat-2.2/src/constants.h 2003-05-22 13:32:43.000000000 +0200
+--- misc/build/libtextcat-2.2/src/constants.h 2007-01-11 13:19:40.000000000 +0100
+***************
+*** 39,44 ****
+--- 39,46 ----
+ */
+ #include <limits.h>
+
++ #define _UTF8_
++
+ #define DESCRIPTION "out of place"
+
+ /* Reported matches are those fingerprints with a score less than best
+***************
+*** 59,72 ****
+ /* Maximum number of n-grams in a fingerprint */
+ #define MAXNGRAMS 400
+
+! /* Maximum size of an n-gram? */
+! #define MAXNGRAMSIZE 5
+
+ /* Which characters are not acceptable in n-grams? */
+ #define INVALID(c) (isspace((int)c) || isdigit((int)c))
+
+ /* Minimum size (in characters) for accepting a document */
+! #define MINDOCSIZE 25
+
+ /* Maximum penalty for missing an n-gram in fingerprint */
+ #define MAXOUTOFPLACE 400
+--- 61,81 ----
+ /* Maximum number of n-grams in a fingerprint */
+ #define MAXNGRAMS 400
+
+! /* Maximum number of characters (symbols) in an n-gram */
+! #define MAXNGRAMSYMBOL 5
+!
+! /* Maximum size of the string representing an n-gram (must be greater than the number of symbols) */
+! #ifdef _UTF8_
+! #define MAXNGRAMSIZE 20
+! #else
+! #define MAXNGRAMSIZE MAXNGRAMSYMBOL
+! #endif
+
+ /* Which characters are not acceptable in n-grams? */
+ #define INVALID(c) (isspace((int)c) || isdigit((int)c))
+
+ /* Minimum size (in characters) for accepting a document */
+! #define MINDOCSIZE 6
+
+ /* Maximum penalty for missing an n-gram in fingerprint */
+ #define MAXOUTOFPLACE 400
+***************
+*** 76,79 ****
+--- 85,91 ----
+
+ #define MAXSCORE INT_MAX
+
++ /* where the fingerprints files are stored */
++ #define DEFAULT_FINGERPRINTS_PATH ""
++
+ #endif
+*** misc/libtextcat-2.2/src/fingerprint.c 2003-05-22 13:32:43.000000000 +0200
+--- misc/build/libtextcat-2.2/src/fingerprint.c 2007-01-12 12:51:59.000000000 +0100
+***************
+*** 6,28 ****
+ * All rights reserved.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+--- 6,28 ----
+ * All rights reserved.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+***************
+*** 51,57 ****
+ * The reason why we go through the trouble of doing a partial
+ * (heap)sort is that a full quicksort behaves horribly on the data:
+ * most n-grams have a very low count, resulting in a data set in
+! * nearly-sorted order. This causes quicksort to behave very badly.
+ * Heapsort, on the other hand, behaves handsomely: worst case is
+ * Mlog(N) for M n-grams filtered through a N-sized heap.
+ *
+--- 51,57 ----
+ * The reason why we go through the trouble of doing a partial
+ * (heap)sort is that a full quicksort behaves horribly on the data:
+ * most n-grams have a very low count, resulting in a data set in
+! * nearly-sorted order. This causes quicksort to behave very badly.
+ * Heapsort, on the other hand, behaves handsomely: worst case is
+ * Mlog(N) for M n-grams filtered through a N-sized heap.
+ *
+***************
+*** 63,68 ****
+--- 63,72 ----
+ * - put table/heap datastructure in a separate file.
+ */
+
++ #ifndef _UTF8_
++ #define _UTF8_
++ #endif
++
+ #include "config.h"
+ #include <stdio.h>
+ #ifdef HAVE_STDLIB_H
+***************
+*** 80,89 ****
+--- 84,95 ----
+ #include "wg_mempool.h"
+ #include "constants.h"
+
++ #include "utf8misc.h"
+
+ #define TABLESIZE (1<<TABLEPOW)
+ #define TABLEMASK ((TABLESIZE)-1)
+
++
+ typedef struct {
+
+ sint2 rank;
+***************
+*** 96,102 ****
+ const char *name;
+ ngram_t *fprint;
+ uint4 size;
+!
+ } fp_t;
+
+ typedef struct entry_s {
+--- 102,108 ----
+ const char *name;
+ ngram_t *fprint;
+ uint4 size;
+!
+ } fp_t;
+
+ typedef struct entry_s {
+***************
+*** 105,117 ****
+ struct entry_s *next;
+ } entry_t;
+
+! typedef struct table_s {
+ void *pool;
+ entry_t **table;
+ entry_t *heap;
+
+ struct table_s *next;
+!
+ uint4 heapsize;
+ uint4 size;
+ } table_t;
+--- 111,123 ----
+ struct entry_s *next;
+ } entry_t;
+
+! typedef struct table_s {
+ void *pool;
+ entry_t **table;
+ entry_t *heap;
+
+ struct table_s *next;
+!
+ uint4 heapsize;
+ uint4 size;
+ } table_t;
+***************
+*** 122,128 ****
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+! * during n-gram construction)
+ */
+ static uint4 simplehash( const char *p, int len )
+ {
+--- 128,134 ----
+ * fast and furious little hash function
+ *
+ * (Note that we could use some kind of rolling checksum, and update it
+! * during n-gram construction)
+ */
+ static uint4 simplehash( const char *p, int len )
+ {
+***************
+*** 134,162 ****
+ }
+
+
+- /* checks if n-gram lex is a prefix of key and of length len */
+- inline int issame( char *lex, char *key, int len )
+- {
+- int i;
+- for (i=0; i<len; i++) {
+- if ( key[i] != lex[i] ) {
+- return 0;
+- }
+- }
+- if ( lex[i] != 0 ) {
+- return 0;
+- }
+- return 1;
+- }
+-
+
+ /* increases frequency of ngram(p,len) */
+! static inline int increasefreq( table_t *t, char *p, int len )
+! {
+! uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+!
+! while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+--- 140,153 ----
+ }
+
+
+
+ /* increases frequency of ngram(p,len) */
+! static int increasefreq( table_t *t, char *p, int len )
+! {
+! uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+!
+! while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ /*** Found it! ***/
+ entry->cnt++;
+***************
+*** 168,174 ****
+ }
+
+ /*** Not found, so create ***/
+! entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+--- 159,165 ----
+ }
+
+ /*** Not found, so create ***/
+! entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
+ strcpy( entry->str, p );
+ entry->cnt = 1;
+
+***************
+*** 181,192 ****
+ #if 0
+
+ /* looks up ngram(p,len) */
+! static entry_t *findfreq( table_t *t, char *p, int len )
+! {
+! uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+!
+! while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+--- 172,183 ----
+ #if 0
+
+ /* looks up ngram(p,len) */
+! static entry_t *findfreq( table_t *t, char *p, int len )
+! {
+! uint4 hash = simplehash( p, len ) & TABLEMASK;
+ entry_t *entry = t->table[ hash ];
+!
+! while ( entry ) {
+ if ( issame( entry->str, p, len ) ) {
+ return entry;
+ }
+***************
+*** 219,225 ****
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y) ((x).cnt < (y).cnt)
+
+! inline static void siftup( table_t *t, unsigned int child )
+ {
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+--- 210,216 ----
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y) ((x).cnt < (y).cnt)
+
+! static void siftup( table_t *t, unsigned int child )
+ {
+ entry_t *heap = t->heap;
+ unsigned int parent = (child-1) >> 1;
+***************
+*** 241,247 ****
+ }
+
+
+! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+--- 232,238 ----
+ }
+
+
+! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ entry_t *heap = t->heap;
+ unsigned int child = parent*2 + 1;
+***************
+*** 273,279 ****
+ if (t->size < t->heapsize) {
+ memcpy( &(heap[t->size]), item, sizeof(entry_t));
+ siftup( t, t->size );
+! t->size++;
+ return 0;
+ }
+
+--- 264,270 ----
+ if (t->size < t->heapsize) {
+ memcpy( &(heap[t->size]), item, sizeof(entry_t));
+ siftup( t, t->size );
+! t->size++;
+ return 0;
+ }
+
+***************
+*** 316,333 ****
+
+ /*** Fill result heap ***/
+ for (i=0; i<TABLESIZE; i++) {
+! entry_t *p = t->table[i];
+ while (p) {
+ heapinsert(t, p);
+ p = p->next;
+ }
+! }
+ return 1;
+ }
+
+
+ static table_t *inittable(uint4 maxngrams)
+! {
+ table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+ result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+ result->pool = wgmempool_Init( 10000, 10 );
+--- 307,324 ----
+
+ /*** Fill result heap ***/
+ for (i=0; i<TABLESIZE; i++) {
+! entry_t *p = t->table[i];
+ while (p) {
+ heapinsert(t, p);
+ p = p->next;
+ }
+! }
+ return 1;
+ }
+
+
+ static table_t *inittable(uint4 maxngrams)
+! {
+ table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
+ result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
+ result->pool = wgmempool_Init( 10000, 10 );
+***************
+*** 347,360 ****
+ wgmempool_Done(t->pool);
+ wg_free(t->table);
+ wg_free(t->heap);
+! wg_free(t);
+ }
+
+
+ extern void *fp_Init(const char *name)
+ {
+ fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
+!
+ if ( name ) {
+ h->name = wg_strdup(name);
+ }
+--- 338,351 ----
+ wgmempool_Done(t->pool);
+ wg_free(t->table);
+ wg_free(t->heap);
+! wg_free(t);
+ }
+
+
+ extern void *fp_Init(const char *name)
+ {
+ fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
+!
+ if ( name ) {
+ h->name = wg_strdup(name);
+ }
+***************
+*** 458,478 ****
+ return dest;
+ }
+
+!
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ char n[MAXNGRAMSIZE+1];
+ const char *p = buf;
+ int i;
+
+ /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+! for (;;p++) {
+
+! const char *q = p;
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+! *m++ = *q++;
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+--- 449,475 ----
+ return dest;
+ }
+
+! /**
+! * This function extracts all n-grams from the passed buffer and puts them into the table "t".
+! * [modified] by Jocelyn Merand to accept UTF-8 multi-byte (multi-char) symbols, for use in OpenOffice
+! */
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ char n[MAXNGRAMSIZE+1];
+ const char *p = buf;
+ int i;
++ int pointer = 0;
+
+ /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+! while(1) {
+
+! const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)); now it is pointer which is increased, so we have to take the new position in the buffer*/
+ char *m = n;
+
+ /*** First char may be an underscore ***/
+! int decay = charcopy(q, m); /*[modified] previously *m++ = *q++ */
+! q = &(p[pointer+decay]); /*[modified] the old copying method did not handle multi-character symbols*/
+! m += decay; /*[modified]*/
+ *m = '\0';
+
+ increasefreq( t, n, 1 );
+***************
+*** 482,500 ****
+ }
+
+ /*** Let the compiler unroll this ***/
+! for ( i=2; i<=MAXNGRAMSIZE; i++) {
+
+! *m++ = *q;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+! q++;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
+ }
+ return;
+ }
+--- 479,500 ----
+ }
+
+ /*** Let the compiler unroll this ***/
+! for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
+
+! decay = charcopy(q, m); /*[modified] like above*/
+! m += decay;
+ *m = '\0';
+
+ increasefreq( t, n, i );
+
+ if ( *q == '_' ) break;
+! q += decay;
+ if ( *q == '\0' ) {
+ return;
+ }
+ }
++
++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point to the start of the next symbol, but with UTF-8 the next start is not necessarily the next char*/
+ }
+ return;
+ }
+***************
+*** 514,520 ****
+ {
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+!
+ return mystrcmp( x->str, y->str );
+ }
+
+--- 514,520 ----
+ {
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+!
+ return mystrcmp( x->str, y->str );
+ }
+
+***************
+*** 522,533 ****
+ {
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+!
+ return x->rank - y->rank;
+ }
+
+ /**
+! * Create a fingerprint:
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+--- 522,533 ----
+ {
+ ngram_t *x = (ngram_t *)a;
+ ngram_t *y = (ngram_t *)b;
+!
+ return x->rank - y->rank;
+ }
+
+ /**
+! * Create a fingerprint:
+ * - record the frequency of each unique n-gram in a hash table
+ * - take the most frequent n-grams
+ * - sort them alphabetically, recording their relative rank
+***************
+*** 544,563 ****
+ }
+
+ /*** Throw out all invalid chars ***/
+! tmp = prepbuffer( buffer, bufsize );
+ if ( tmp == NULL ) {
+ return 0;
+ }
+-
+ h = (fp_t*)handle;
+ t = inittable(maxngrams);
+
+ /*** Create a hash table containing n-gram counts ***/
+ createngramtable(t, tmp);
+!
+ /*** Take the top N n-grams and add them to the profile ***/
+! table2heap(t);
+! maxngrams = WGMIN( maxngrams, t->size );
+
+ h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+ h->size = maxngrams;
+--- 544,564 ----
+ }
+
+ /*** Throw out all invalid chars ***/
+! tmp = prepbuffer( buffer, bufsize );
+! /*printf("Cleaned buffer : %s\n",tmp);*/
+ if ( tmp == NULL ) {
+ return 0;
+ }
+ h = (fp_t*)handle;
+ t = inittable(maxngrams);
++ /*printf("Table initialized\n");*/
+
+ /*** Create a hash table containing n-gram counts ***/
+ createngramtable(t, tmp);
+! /*printf("Table created\n");*/
+ /*** Take the top N n-grams and add them to the profile ***/
+! table2heap(t);
+! maxngrams = WGMIN( maxngrams, t->size );
+
+ h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
+ h->size = maxngrams;
+***************
+*** 568,574 ****
+ entry_t tmp2;
+
+ heapextract(t, &tmp2);
+!
+ /*** the string and its rank is all we need ***/
+ strcpy( h->fprint[i].str, tmp2.str );
+ h->fprint[i].rank = i;
+--- 569,575 ----
+ entry_t tmp2;
+
+ heapextract(t, &tmp2);
+!
+ /*** the string and its rank is all we need ***/
+ strcpy( h->fprint[i].str, tmp2.str );
+ h->fprint[i].rank = i;
+***************
+*** 578,584 ****
+ wg_free(tmp);
+
+ /*** Sort n-grams alphabetically, for easy comparison ***/
+! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+ return 1;
+ }
+
+--- 579,585 ----
+ wg_free(tmp);
+
+ /*** Sort n-grams alphabetically, for easy comparison ***/
+! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+ return 1;
+ }
+
+***************
+*** 608,614 ****
+ #endif
+ return 0;
+ }
+!
+ h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+ while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+--- 609,615 ----
+ #endif
+ return 0;
+ }
+!
+ h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
+
+ while (cnt < maxngrams && wg_getline(line,1024,fp)) {
+***************
+*** 635,641 ****
+ h->size = cnt;
+
+ /*** Sort n-grams, for easy comparison later on ***/
+! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+
+ fclose(fp);
+
+--- 636,642 ----
+ h->size = cnt;
+
+ /*** Sort n-grams, for easy comparison later on ***/
+! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
+
+ fclose(fp);
+
+***************
+*** 648,661 ****
+ {
+ uint4 i;
+ fp_t *h = (fp_t *)handle;
+! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size );
+!
+ /*** Make a temporary and sort it on rank ***/
+ memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
+! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
+
+ for (i=0; i<h->size; i++) {
+! fprintf( fp, "%s\n", tmp[i].str );
+ }
+ wg_free( tmp );
+ }
+--- 649,663 ----
+ {
+ uint4 i;
+ fp_t *h = (fp_t *)handle;
+! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size );
+!
+ /*** Make a temporary and sort it on rank ***/
+ memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
+! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
+
+ for (i=0; i<h->size; i++) {
+! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/
+! fprintf( fp, "%s\n", tmp[i].str);
+ }
+ wg_free( tmp );
+ }
+***************
+*** 669,675 ****
+ uint4 i = 0;
+ uint4 j = 0;
+ sint4 sum = 0;
+!
+ /*** Compare the profiles in mergesort fashion ***/
+ while ( i < c->size && j < u->size ) {
+
+--- 671,677 ----
+ uint4 i = 0;
+ uint4 j = 0;
+ sint4 sum = 0;
+!
+ /*** Compare the profiles in mergesort fashion ***/
+ while ( i < c->size && j < u->size ) {
+
+***************
+*** 705,711 ****
+ }
+
+ return sum;
+!
+ }
+
+
+--- 707,713 ----
+ }
+
+ return sum;
+!
+ }
+
+
+*** misc/libtextcat-2.2/src/fingerprint.h 2003-05-19 14:16:31.000000000 +0200
+--- misc/build/libtextcat-2.2/src/fingerprint.h 2007-01-11 13:19:40.000000000 +0100
+***************
+*** 41,47 ****
+--- 41,53 ----
+ extern int fp_Read( void *handle, const char *fname, int maxngrams );
+ extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
+ extern void fp_Show( void *handle );
++ #ifdef __cplusplus
++ extern "C" {
++ #endif
+ extern const char *fp_Name( void *handle );
++ #ifdef __cplusplus
++ }
++ #endif
+ extern void fp_Print( void *handle, FILE *fp );
+
+ #endif
+*** misc/libtextcat-2.2/src/Makefile.in 2003-05-22 13:39:52.000000000 +0200
+--- misc/build/libtextcat-2.2/src/Makefile.in 2007-01-12 12:48:19.181803000 +0100
+***************
+*** 124,143 ****
+ target_vendor = @target_vendor@
+ AUTOMAKE_OPTIONS = 1.4 foreign
+
+! WARNS = -W -Wall -Wshadow -Wpointer-arith
+! IFLAGS =
+! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
+
+ noinst_HEADERS = \
+! common.h constants.h fingerprint.h textcat.h wg_mempool.h
+
+
+ lib_LTLIBRARIES = libtextcat.la
+ libtextcat_la_SOURCES = \
+! common.c fingerprint.c textcat.c wg_mempool.c
+
+
+ bin_PROGRAMS = createfp
+--- 124,143 ----
+ target_vendor = @target_vendor@
+ AUTOMAKE_OPTIONS = 1.4 foreign
+
+! #WARNS = -W -Wall -Wshadow -Wpointer-arith
+! IFLAGS =
+! #FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
+ VERBOSE = -DVERBOSE
+ AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
+ AM_LDFLAGS = -g
+
+ noinst_HEADERS = \
+! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h
+
+
+ lib_LTLIBRARIES = libtextcat.la
+ libtextcat_la_SOURCES = \
+! common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
+
+
+ bin_PROGRAMS = createfp
+***************
+*** 156,162 ****
+ libtextcat_la_LDFLAGS =
+ libtextcat_la_LIBADD =
+ am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
+! wg_mempool.lo
+ libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
+ bin_PROGRAMS = createfp$(EXEEXT)
+ noinst_PROGRAMS = testtextcat$(EXEEXT)
+--- 156,162 ----
+ libtextcat_la_LDFLAGS =
+ libtextcat_la_LIBADD =
+ am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
+! wg_mempool.lo utf8misc.lo
+ libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
+ bin_PROGRAMS = createfp$(EXEEXT)
+ noinst_PROGRAMS = testtextcat$(EXEEXT)
+***************
+*** 177,183 ****
+ @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \
+ @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \
+ @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \
+! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo
+ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+ LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
+--- 177,184 ----
+ @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \
+ @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \
+ @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \
+! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \
+! @AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo
+ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+ LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
+***************
+*** 213,219 ****
+ @rm -f stamp-h1
+ cd $(top_builddir) && $(SHELL) ./config.status src/config.h
+
+! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOHEADER)
+ touch $(srcdir)/config.h.in
+
+--- 214,220 ----
+ @rm -f stamp-h1
+ cd $(top_builddir) && $(SHELL) ./config.status src/config.h
+
+! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
+ cd $(top_srcdir) && $(AUTOHEADER)
+ touch $(srcdir)/config.h.in
+
+***************
+*** 247,253 ****
+ echo "rm -f \"$${dir}/so_locations\""; \
+ rm -f "$${dir}/so_locations"; \
+ done
+! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
+ binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
+ install-binPROGRAMS: $(bin_PROGRAMS)
+--- 248,254 ----
+ echo "rm -f \"$${dir}/so_locations\""; \
+ rm -f "$${dir}/so_locations"; \
+ done
+! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
+ $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
+ binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
+ install-binPROGRAMS: $(bin_PROGRAMS)
+***************
+*** 285,294 ****
+ echo " rm -f $$p $$f"; \
+ rm -f $$p $$f ; \
+ done
+! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
+ @rm -f createfp$(EXEEXT)
+ $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS)
+! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
+ @rm -f testtextcat$(EXEEXT)
+ $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS)
+
+--- 286,295 ----
+ echo " rm -f $$p $$f"; \
+ rm -f $$p $$f ; \
+ done
+! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
+ @rm -f createfp$(EXEEXT)
+ $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS)
+! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
+ @rm -f testtextcat$(EXEEXT)
+ $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS)
+
+***************
+*** 304,309 ****
+--- 305,311 ----
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@
++ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@
+
+ distclean-depend:
+ -rm -rf ./$(DEPDIR)
+*** misc/libtextcat-2.2/src/makefile.mk 2007-01-12 12:55:41.709348000 +0100
+--- misc/build/libtextcat-2.2/src/makefile.mk 2007-01-12 12:48:19.214530000 +0100
+***************
+*** 1 ****
+! dummy
+--- 1,91 ----
+! #*************************************************************************
+! #
+! # $RCSfile: libtextcat-2.2.patch,v $
+! #
+! # $Revision: 1.1 $
+! #
+! # last change: $Author: tl $ $Date: 2007-01-12 12:34:52 $
+! #
+! #* The Contents of this file are made available subject to
+! #* the terms of GNU Lesser General Public License Version 2.1.
+! #*
+! #*
+! #* GNU Lesser General Public License Version 2.1
+! #* =============================================
+! #* Copyright 2005 by Sun Microsystems, Inc.
+! #* 901 San Antonio Road, Palo Alto, CA 94303, USA
+! #*
+! #* This library is free software; you can redistribute it and/or
+! #* modify it under the terms of the GNU Lesser General Public
+! #* License version 2.1, as published by the Free Software Foundation.
+! #*
+! #* This library is distributed in the hope that it will be useful,
+! #* but WITHOUT ANY WARRANTY; without even the implied warranty of
+! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+! #* Lesser General Public License for more details.
+! #*
+! #* You should have received a copy of the GNU Lesser General Public
+! #* License along with this library; if not, write to the Free Software
+! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+! #* MA 02111-1307 USA
+! #*
+! #*************************************************************************
+!
+! PRJ = ..$/..$/..$/..$/..
+!
+! PRJNAME = libtextcat
+! TARGET = libtextcat
+! CFLAGSCALL=gsd
+!
+! USE_DEFFILE=TRUE
+! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE
+!
+! .INCLUDE : settings.mk
+!
+! # --- Files --------------------------------------------------------
+!
+! # !! not to be compiled because these belong to stand-alone programs: !!
+! # $(SLO)$/createfp.obj\
+! # $(SLO)$/testtextcat.obj
+!
+! SLOFILES= \
+! $(SLO)$/common.obj\
+! $(SLO)$/fingerprint.obj\
+! $(SLO)$/textcat.obj\
+! $(SLO)$/wg_mempool.obj\
+! $(SLO)$/utf8misc.obj
+!
+! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX)
+! SHL1TARGET= $(TARGET)
+!
+! SHL1STDLIBS=
+!
+! # build DLL
+! SHL1LIBS= $(SLB)$/$(TARGET).lib
+! SHL1IMPLIB= i$(TARGET)
+! SHL1DEPN= $(SHL1LIBS)
+! SHL1DEF= $(MISC)$/$(SHL1TARGET).def
+!
+! # build DEF file
+! DEF1NAME= $(SHL1TARGET)
+! DEF1LIBNAME=$(TARGET)
+! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt
+!
+! # --- Targets ------------------------------------------------------
+!
+! .INCLUDE : target.mk
+!
+! # copy hand supplied configuration file for Win32 builds to the file
+! # which is included in the source code
+! $(SLOFILES) : config.h
+! config.h :
+! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h
+!
+!
+! $(MISC)$/$(SHL1TARGET).flt: makefile.mk
+! @echo ------------------------------
+! @echo Making: $@
+! @echo Imp>$@
+! @echo __CT>>$@
+! @echo _real>>$@
+! @echo unnamed>>$@
+*** misc/libtextcat-2.2/src/textcat.c 2003-05-22 13:32:43.000000000 +0200
+--- misc/build/libtextcat-2.2/src/textcat.c 2007-01-12 12:52:41.000000000 +0100
+***************
+*** 4,26 ****
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+--- 4,26 ----
+ * Copyright (C) 2003 WiseGuys Internet B.V.
+ *
+ * THE BSD LICENSE
+! *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+! *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+! *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the
+ * distribution.
+! *
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
+ * its contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+! *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+***************
+*** 74,79 ****
+--- 74,80 ----
+ typedef struct {
+
+ void **fprint;
++ char *fprint_disable;
+ uint4 size;
+ uint4 maxsize;
+
+***************
+*** 112,122 ****
+ fp_Done( h->fprint[i] );
+ }
+ wg_free( h->fprint );
+ wg_free( h );
+
+ }
+
+! extern void *textcat_Init( const char *conffile )
+ {
+ textcat_t *h;
+ char line[1024];
+--- 113,133 ----
+ fp_Done( h->fprint[i] );
+ }
+ wg_free( h->fprint );
++ wg_free( h->fprint_disable );
+ wg_free( h );
+
+ }
+
+! /** Keeps the original one-argument entry point: forwards to special_textcat_Init() with DEFAULT_FINGERPRINTS_PATH as the prefix. */
+! extern void *textcat_Init( const char *conffile ){
+! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
+! }
+!
+! /**
+! * Originally this function had only one parameter (conffile); it has been modified for use in OOo.
+! * Basically, prefix is the directory path where fingerprints are stored; it is prepended as-is to each fingerprint file name.
+! */
+! extern void *special_textcat_Init( const char *conffile, const char *prefix )
+ {
+ textcat_t *h;
+ char line[1024];
+***************
+*** 134,144 ****
+ h->size = 0;
+ h->maxsize = 16;
+ h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
+
+ while ( wg_getline( line, 1024, fp ) ) {
+ char *p;
+ char *segment[4];
+! int res;
+
+ /*** Skip comments ***/
+ #ifdef HAVE_STRCHR
+--- 145,157 ----
+ h->size = 0;
+ h->maxsize = 16;
+ h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
+
+ while ( wg_getline( line, 1024, fp ) ) {
+ char *p;
+ char *segment[4];
+! char finger_print_file_name[512];
+! int res;
+
+ /*** Skip comments ***/
+ #ifdef HAVE_STRCHR
+***************
+*** 156,172 ****
+ /*** Ensure enough space ***/
+ if ( h->size == h->maxsize ) {
+ h->maxsize *= 2;
+! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
+ }
+
+ /*** Load data ***/
+ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ goto ERROR;
+ }
+! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
+ textcat_Done(h);
+ goto ERROR;
+! }
+ h->size++;
+ }
+
+--- 169,191 ----
+ /*** Ensure enough space ***/
+ if ( h->size == h->maxsize ) {
+ h->maxsize *= 2;
+! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
+! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
+ }
+
+ /*** Load data ***/
+ if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ goto ERROR;
+ }
+! finger_print_file_name[0] = '\0';
+! strcat(finger_print_file_name, prefix);
+! strcat(finger_print_file_name, segment[0]);
+!
+! if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
+ textcat_Done(h);
+ goto ERROR;
+! }
+! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
+ h->size++;
+ }
+
+***************
+*** 203,213 ****
+ result = _TEXTCAT_RESULT_SHORT;
+ goto READY;
+ }
+!
+ /*** Calculate the score for each category. ***/
+ for (i=0; i<h->size; i++) {
+! int score = fp_Compare( h->fprint[i], unknown, threshold );
+! candidates[i].score = score;
+ candidates[i].name = fp_Name( h->fprint[i] );
+ if ( score < minscore ) {
+ minscore = score;
+--- 222,239 ----
+ result = _TEXTCAT_RESULT_SHORT;
+ goto READY;
+ }
+!
+ /*** Calculate the score for each category. ***/
+ for (i=0; i<h->size; i++) {
+! int score;
+! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
+! score = MAXSCORE;
+! }
+! else{
+! score = fp_Compare( h->fprint[i], unknown, threshold );
+! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
+! }
+! candidates[i].score = score;
+ candidates[i].name = fp_Name( h->fprint[i] );
+ if ( score < minscore ) {
+ minscore = score;
+***************
+*** 218,224 ****
+ /*** Find the best performers ***/
+ for (i=0; i<h->size; i++) {
+ if ( candidates[i].score < threshold ) {
+-
+ if ( ++cnt == MAXCANDIDATES+1 ) {
+ break;
+ }
+--- 244,249 ----
+***************
+*** 235,241 ****
+ else {
+ char *p = result;
+ char *plimit = result+MAXOUTPUTSIZE;
+!
+ qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
+
+ *p = '\0';
+--- 260,266 ----
+ else {
+ char *p = result;
+ char *plimit = result+MAXOUTPUTSIZE;
+!
+ qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
+
+ *p = '\0';
+***************
+*** 247,253 ****
+ }
+ READY:
+ fp_Done(unknown);
+! #ifdef SHOULD_FREE
+ free(candidates);
+ #undef SHOULD_FREE
+ #endif
+--- 272,278 ----
+ }
+ READY:
+ fp_Done(unknown);
+! #ifdef SHOULD_FREE
+ free(candidates);
+ #undef SHOULD_FREE
+ #endif
+*** misc/libtextcat-2.2/src/textcat.h 2003-05-19 14:16:31.000000000 +0200
+--- misc/build/libtextcat-2.2/src/textcat.h 2007-01-11 13:19:41.000000000 +0100
+***************
+*** 40,45 ****
+--- 40,48 ----
+ #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
+ #define _TEXTCAT_RESULT_SHORT "SHORT"
+
++ #ifdef __cplusplus
++ extern "C" {
++ #endif
+
+ /**
+ * textcat_Init() - Initialize the text classifier. The textfile
+***************
+*** 51,60 ****
+--- 54,72 ----
+ * Returns: handle on success, NULL on error. (At the moment, the
+ * only way errors can occur, is when the library cannot read the
+ * conffile, or one of the fingerprint files listed in it.)
++ *
++ * Replaces the older function (and has exactly the same behaviour);
++ * see below
+ */
+ extern void *textcat_Init( const char *conffile );
+
+ /**
++ * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB.
++ * Basically, prefix is the directory path where fingerprints are stored.
++ */
++ extern void *special_textcat_Init( const char *conffile, const char *prefix );
++
++ /**
+ * textcat_Done() - Free up resources for handle
+ */
+ extern void textcat_Done( void *handle );
+***************
+*** 77,80 ****
+--- 89,96 ----
+ * textcat_Version() - Returns a string describing the version of this classifier.
+ */
+ extern char *textcat_Version();
++
++ #ifdef __cplusplus
++ }
++ #endif
+ #endif
+*** misc/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:55:41.584585000 +0100
+--- misc/build/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:54:50.000000000 +0100
+***************
+*** 1 ****
+! dummy
+--- 1,132 ----
+! /***************************************************************************
+! * Copyright (C) 2006 by Jocelyn Merand *
+! * joc.mer@gmail.com *
+! * *
+! * THE BSD LICENSE
+! *
+! * Redistribution and use in source and binary forms, with or without
+! * modification, are permitted provided that the following conditions
+! * are met:
+! *
+! * - Redistributions of source code must retain the above copyright
+! * notice, this list of conditions and the following disclaimer.
+! *
+! * - Redistributions in binary form must reproduce the above copyright
+! * notice, this list of conditions and the following disclaimer in the
+! * documentation and/or other materials provided with the
+! * distribution.
+! *
+! * - Neither the name of the WiseGuys Internet B.V. nor the names of
+! * its contributors may be used to endorse or promote products derived
+! * from this software without specific prior written permission.
+! *
+! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+! ***************************************************************************/
+!
+! #ifndef _UTF8_MISC_H_
+! #include "utf8misc.h"
+! #endif
+!
+! /* Returns the byte index just past the character starting at str[position]; the index never moves beyond the terminating \0. */
+! int nextcharstart(const char *str, int position){
+! int pointer = position;
+!
+! if(str[pointer] & ESCAPE_MASK){ /*with _UTF8_: the top bit of the current byte is 1 (without _UTF8_ the mask is 0xFF, so any non-zero byte matches)*/
+!
+! /*then str[pointer] is treated as an escape (lead) byte*/
+!
+! char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*keep only the high nibble, shifted left once: each leading 1 bit left over stands for one more byte to skip (with WEIGHT_MASK 0x00 this is 0 and nothing is skipped)*/
+!
+! while(escape_char & ESCAPE_MASK && str[pointer]){/*each step shifts the byte one bit to the left; we are done when the top bit is 0 or \0 is reached*/
+! escape_char = escape_char <<1;
+! ++pointer;
+! }
+! }
+! if(str[pointer]){ /*finally, if we are not on the \0 character, we step over the last byte of the character*/
+! ++pointer;
+! }
+! return pointer;
+! }
+!
+! /* Copies the bytes of the single character at the start of str into dest and returns the number of bytes copied (0 for an empty str). */
+! int charcopy(const char *str, char *dest){
+! /* dest is not \0-terminated by this function */
+! int pointer = 0;
+! if(str[pointer] & ESCAPE_MASK){ /*with _UTF8_: the top bit of the current byte is 1*/
+!
+! /*then str[pointer] is treated as an escape (lead) byte*/
+!
+! char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*keep only the high nibble, shifted left once: each leading 1 bit left over stands for one more byte to copy*/
+!
+! while(escape_char & ESCAPE_MASK && str[pointer]){ /*each step shifts the byte one bit to the left; we are done when the top bit is 0 or \0 is reached*/
+! dest[pointer] = str[pointer];
+! escape_char = escape_char <<1;
+! ++pointer;
+! }
+! }
+! if(str[pointer]){
+! dest[pointer] = str[pointer];
+! ++pointer;
+! }
+!
+! return pointer;
+! }
+!
+! /* Returns 1 when lex is exactly the first len characters of key (lex must end right after them), 0 otherwise. */
+! int issame( char *lex, char *key, int len )
+! {
+! /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
+! int char_counter = 0;
+! int pointer = 0;
+! while(char_counter < len) {
+! /* NOTE(review): nothing in this loop stops at a \0 in key, so len is assumed not to exceed the number of characters in key - confirm against callers */
+! if(key[pointer] & ESCAPE_MASK){ /*with _UTF8_: the top bit of the current byte is 1*/
+!
+! /*then key[pointer] is treated as an escape (lead) byte*/
+!
+! char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*keep only the high nibble, shifted left once, to count the bytes of this character*/
+! /* walk over the bytes of this character for as long as key and lex agree */
+! while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
+! escape_char = escape_char <<1;
+! ++pointer;
+! }
+! }
+! ++char_counter; /*one more character of key has been consumed*/
+! if ( key[pointer] != lex[pointer] ) {
+! return 0;
+! /*printf(" NO\n", lex, key, len);*/
+! }
+! ++pointer;
+! }
+! if ( lex[pointer] != '\0' ) {
+! return 0;
+! /*printf(" NO\n");*/
+! }
+!
+! /*printf(" YES\n");*/
+!
+! return 1;
+! }
+!
+! /* Returns the number of characters in str, counting one per nextcharstart() step until the terminating \0 is reached. */
+! extern int utfstrlen(const char* str){
+! int char_counter = 0;
+! int pointer = 0;
+! while(str[pointer]) {
+! pointer = nextcharstart(str, pointer);
+!
+! ++char_counter; /*one more character has been stepped over*/
+! }
+! return char_counter;
+! }
+!
+*** misc/libtextcat-2.2/src/utf8misc.h 2007-01-12 12:55:41.547021000 +0100
+--- misc/build/libtextcat-2.2/src/utf8misc.h 2007-01-11 13:19:41.000000000 +0100
+***************
+*** 1 ****
+! dummy
+--- 1,88 ----
+! /***************************************************************************
+! * Copyright (C) 2006 by Jocelyn Merand *
+! * joc.mer@gmail.com *
+! * *
+! * THE BSD LICENSE
+! *
+! * Redistribution and use in source and binary forms, with or without
+! * modification, are permitted provided that the following conditions
+! * are met:
+! *
+! * - Redistributions of source code must retain the above copyright
+! * notice, this list of conditions and the following disclaimer.
+! *
+! * - Redistributions in binary form must reproduce the above copyright
+! * notice, this list of conditions and the following disclaimer in the
+! * documentation and/or other materials provided with the
+! * distribution.
+! *
+! * - Neither the name of the WiseGuys Internet B.V. nor the names of
+! * its contributors may be used to endorse or promote products derived
+! * from this software without specific prior written permission.
+! *
+! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+! ***************************************************************************/
+!
+! #ifndef _UTF8_MISC_H_
+! #define _UTF8_MISC_H_
+!
+! /**
+! * These masks are used by the character processing functions declared below.
+! * They were added to handle utf-8 escape (lead) bytes; without _UTF8_ the values make every character a single byte.
+! */
+! #ifdef _UTF8_
+! #define ESCAPE_MASK 0x80
+! #define WEIGHT_MASK 0xF0
+! #else
+! #define ESCAPE_MASK 0xFF
+! #define WEIGHT_MASK 0x00
+! #endif
+!
+!
+! /*
+! * Returns the index of the start of the character that follows the one at str[position].
+! * Of course this is only useful when the encoding is utf-8; otherwise it advances by one byte.
+! * This function has been added by Jocelyn Merand to use libtextcat in OOo
+! */
+! int nextcharstart(const char *str, int position);
+!
+!
+! /* Copies the character at the start of str to dest (dest is not \0-terminated).
+! * Of course this is only useful when the encoding is utf8 and the symbol is encoded with more than 1 byte.
+! * Returns the number of bytes copied.
+! * This function has been added by Jocelyn Merand to use libtextcat in OOo
+! */
+! int charcopy(const char *str, char *dest);
+!
+!
+! /* Checks whether n-gram lex equals the first len characters of key (returns 1 if so, 0 otherwise).
+! * If _UTF8_ is defined, escape characters are taken into account and len is not really the byte length of lex:
+! * in this case, len is the number of utf-8 characters, e.g. strlen("€") == 3 but len == 1
+! */
+! int issame( char *lex, char *key, int len );
+!
+!
+! /* Counts the number of characters in str.
+! * If _UTF8_ is defined, escape characters are taken into account and the result is not really the byte length of str:
+! * in this case, the result is the number of utf-8 characters, e.g. strlen("€") == 3 but utfstrlen("€") == 1
+! */
+! #ifdef __cplusplus
+! extern "C" {
+! #endif
+! extern int utfstrlen(const char* str);
+! #ifdef __cplusplus
+! }
+! #endif
+!
+! #endif
+!
+*** misc/libtextcat-2.2/src/win32_config.h 2007-01-12 12:55:41.643465000 +0100
+--- misc/build/libtextcat-2.2/src/win32_config.h 2007-01-11 13:19:41.000000000 +0100
+***************
+*** 1 ****
+! dummy
+--- 1,136 ----
+! /* src/config.h. Generated by configure. */
+! /* src/config.h.in. Generated from configure.ac by autoheader. */
+!
+! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+! systems. This function is required for `alloca.c' support on those systems.
+! */
+! /* #undef CRAY_STACKSEG_END */
+!
+! /* Define to 1 if using `alloca.c'. */
+! /* #undef C_ALLOCA */
+!
+! /* Define to 1 if you have `alloca', as a function or macro. */
+! /* #undef HAVE_ALLOCA */
+!
+! /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+! */
+! /* #undef HAVE_ALLOCA_H */
+!
+! /* Define to 1 if you have the <dlfcn.h> header file. NOTE(review): <dlfcn.h> is a POSIX header - confirm it should be set in this Win32 config. */
+! #define HAVE_DLFCN_H 1
+!
+! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */
+! /* #undef HAVE_DOPRNT */
+!
+! /* Define to 1 if you have the `gettimeofday' function. */
+! /* #undef HAVE_GETTIMEOFDAY */
+!
+! /* Define to 1 if you have the <inttypes.h> header file. */
+! /* #undef HAVE_INTTYPES_H */
+!
+! /* Define to 1 if you have the <limits.h> header file. */
+! #define HAVE_LIMITS_H 1
+!
+! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and
+! to 0 otherwise. */
+! #define HAVE_MALLOC 1
+!
+! /* Define to 1 if you have the <memory.h> header file. */
+! #define HAVE_MEMORY_H 1
+!
+! /* Define to 1 if you have the `memset' function. */
+! #define HAVE_MEMSET 1
+!
+! /* Define to 1 if your system has a GNU libc compatible `realloc' function,
+! and to 0 otherwise. */
+! #define HAVE_REALLOC 1
+!
+! /* Define to 1 if you have the <stdint.h> header file. */
+! /* #undef HAVE_STDINT_H */
+!
+! /* Define to 1 if you have the <stdlib.h> header file. */
+! #define HAVE_STDLIB_H 1
+!
+! /* Define to 1 if you have the `strchr' function. */
+! #define HAVE_STRCHR 1
+!
+! /* Define to 1 if you have the `strdup' function. */
+! #define HAVE_STRDUP 1
+!
+! /* Define to 1 if you have the <strings.h> header file. */
+! /* #undef HAVE_STRINGS_H */
+!
+! /* Define to 1 if you have the <string.h> header file. */
+! #define HAVE_STRING_H 1
+!
+! /* Define to 1 if you have the `strpbrk' function. */
+! #define HAVE_STRPBRK 1
+!
+! /* Define to 1 if you have the <sys/stat.h> header file. */
+! #define HAVE_SYS_STAT_H 1
+!
+! /* Define to 1 if you have the <sys/time.h> header file. */
+! /* #undef HAVE_SYS_TIME_H */
+!
+! /* Define to 1 if you have the <sys/types.h> header file. */
+! #define HAVE_SYS_TYPES_H 1
+!
+! /* Define to 1 if you have the <unistd.h> header file. NOTE(review): <unistd.h> is a POSIX header - confirm the Win32 toolchain provides it. */
+! #define HAVE_UNISTD_H 1
+!
+! /* Define to 1 if you have the `vprintf' function. */
+! #define HAVE_VPRINTF 1
+!
+! /* Name of package */
+! #define PACKAGE "libtextcat"
+!
+! /* Define to the address where bug reports for this package should be sent. */
+! #define PACKAGE_BUGREPORT ""
+!
+! /* Define to the full name of this package. */
+! #define PACKAGE_NAME "libtextcat"
+!
+! /* Define to the full name and version of this package. */
+! #define PACKAGE_STRING "libtextcat 2.2"
+!
+! /* Define to the one symbol short name of this package. */
+! #define PACKAGE_TARNAME "libtextcat"
+!
+! /* Define to the version of this package. */
+! #define PACKAGE_VERSION "2.2"
+!
+! /* If using the C implementation of alloca, define if you know the
+! direction of stack growth for your system; otherwise it will be
+! automatically deduced at run-time.
+! STACK_DIRECTION > 0 => grows toward higher addresses
+! STACK_DIRECTION < 0 => grows toward lower addresses
+! STACK_DIRECTION = 0 => direction of growth unknown */
+! /* #undef STACK_DIRECTION */
+!
+! /* Define to 1 if you have the ANSI C header files. */
+! #define STDC_HEADERS 1
+!
+! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. NOTE(review): HAVE_SYS_TIME_H is left undefined in this file - confirm this is intended. */
+! #define TIME_WITH_SYS_TIME 1
+!
+! /* Define to 1 if your <sys/time.h> declares `struct tm'. */
+! /* #undef TM_IN_SYS_TIME */
+!
+! /* Version number of package */
+! #define VERSION "2.2"
+!
+! /* Define to empty if `const' does not conform to ANSI C. */
+! /* #undef const */
+!
+! /* Define as `__inline' if that's what the C compiler calls it, or to nothing
+! if it is not supported. */
+! /* #undef inline */
+!
+! /* Define to rpl_malloc if the replacement function should be used. */
+! /* #undef malloc */
+!
+! /* Define to rpl_realloc if the replacement function should be used. */
+! /* #undef realloc */
+!
+! /* Define to `unsigned' if <sys/types.h> does not define. */
+! /* #undef size_t */
diff --git a/libtextcat/makefile.mk b/libtextcat/makefile.mk
new file mode 100644
index 000000000000..c1f8795c7cb0
--- /dev/null
+++ b/libtextcat/makefile.mk
@@ -0,0 +1,92 @@
+#*************************************************************************
+#
+# OpenOffice.org - a multi-platform office productivity suite
+#
+# $RCSfile: makefile.mk,v $
+#
+# $Revision: 1.1 $
+#
+# last change: $Author: tl $ $Date: 2007-01-12 12:34:28 $
+#
+# The Contents of this file are made available subject to
+# the terms of GNU Lesser General Public License Version 2.1.
+#
+#
+# GNU Lesser General Public License Version 2.1
+# =============================================
+# Copyright 2005 by Sun Microsystems, Inc.
+# 901 San Antonio Road, Palo Alto, CA 94303, USA
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License version 2.1, as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307 USA
+#
+#*************************************************************************
+
+PRJ=.
+
+PRJNAME=libtextcat
+TARGET=libtextcat
+
+# --- Settings -----------------------------------------------------
+
+.INCLUDE : settings.mk
+
+# --- Files --------------------------------------------------------
+
+TARFILE_NAME=libtextcat-2.2
+TARFILE_ROOTDIR=libtextcat-2.2
+# Patch for the unpacked 2.2 sources (presumably applied by tg_ext.mk -- confirm).
+PATCH_FILE_NAME=libtextcat-2.2.patch
+# Files filled in by libtextcat-2.2.patch: its hunks for utf8misc.c, utf8misc.h and win32_config.h replace a
+# one-line "dummy" file, so these are presumably created as placeholders before patching -- confirm in tg_ext.mk.
+ADDITIONAL_FILES= \
+ src$/utf8misc.h \
+ src$/utf8misc.c \
+ src$/win32_config.h \
+ src$/makefile.mk
+
+.IF "$(GUI)"=="UNX"
+#CONFIGURE_DIR=$(BUILD_DIR)
+# Unix: use the package's own configure script and build with make.
+#relative to CONFIGURE_DIR
+CONFIGURE_ACTION=configure
+CONFIGURE_FLAGS=
+# NOTE(review): plain "make" is presumably expected to be a make that can run the configure-generated Makefile -- confirm.
+BUILD_ACTION=make
+# Shared libraries are collected from the src/.libs directory of the build tree.
+OUT2LIB=$(BUILD_DIR)$/src$/.libs$/libtextcat*.so
+
+.ENDIF # "$(GUI)"=="UNX"
+
+# Windows: no configure settings; build inside src with dmake (presumably driven by src$/makefile.mk from ADDITIONAL_FILES).
+.IF "$(GUI)"=="WNT"
+BUILD_ACTION=cd src && dmake
+
+.ENDIF # "$(GUI)"=="WNT"
+
+# Headers exported from the build tree (presumably copied to the module's inc output by tg_ext.mk -- confirm).
+OUT2INC= \
+ $(BUILD_DIR)$/src$/config.h \
+ $(BUILD_DIR)$/src$/common.h \
+ $(BUILD_DIR)$/src$/fingerprint.h \
+ $(BUILD_DIR)$/src$/textcat.h \
+ $(BUILD_DIR)$/src$/wg_mempool.h
+
+
+# --- Targets ------------------------------------------------------
+
+.INCLUDE : set_ext.mk
+.INCLUDE : target.mk
+.INCLUDE : tg_ext.mk
+
diff --git a/libtextcat/prj/build.lst b/libtextcat/prj/build.lst
new file mode 100644
index 000000000000..8f514f151426
--- /dev/null
+++ b/libtextcat/prj/build.lst
@@ -0,0 +1,3 @@
+ltc libtextcat : solenv NULL
+ltc libtextcat usr1 - all ltc_mkout NULL
+ltc libtextcat nmake - all ltc_libtextcat NULL
diff --git a/libtextcat/prj/d.lst b/libtextcat/prj/d.lst
new file mode 100644
index 000000000000..44ce02b77c44
--- /dev/null
+++ b/libtextcat/prj/d.lst
@@ -0,0 +1,8 @@
+
+..\%__SRC%\lib\lib*.* %_DEST%\lib%_EXT%\lib*.*
+..\%__SRC%\lib\ilib*.* %_DEST%\lib%_EXT%\ilib*.*
+..\%__SRC%\bin\l*.dll %_DEST%\bin%_EXT%\*.dll
+
+mkdir: %_DEST%\inc%_EXT%\libtextcat
+hedabu: ..\%__SRC%\misc\build\libtextcat-2.2\src\*.h %_DEST%\inc%_EXT%\libtextcat\*.h
+