diff options
author | Thomas Lange <tl@openoffice.org> | 2007-01-12 11:40:54 +0000 |
---|---|---|
committer | Thomas Lange <tl@openoffice.org> | 2007-01-12 11:40:54 +0000 |
commit | 23147b5b1f280e1c7758c4ce27b99dc92135b354 (patch) | |
tree | cdee4b730e97cad5db3fd941f5513dc826530fd8 /libtextcat | |
parent | 2bb6503c63165d28d1f9a0224b675565b6acaa96 (diff) |
#i73173# integrate Google SoC language-guessing
Diffstat (limited to 'libtextcat')
-rw-r--r-- | libtextcat/data/new_fingerprints/fpdb.conf | 82 | ||||
-rw-r--r-- | libtextcat/data/new_fingerprints/lm/afrikaans.lm | 400 | ||||
-rw-r--r-- | libtextcat/data/new_fingerprints/lm/albanian.lm | 400 | ||||
-rw-r--r-- | libtextcat/data/new_fingerprints/lm/amharic_utf.lm | 400 | ||||
-rw-r--r-- | libtextcat/data/new_fingerprints/lm/arabic.lm | 400 | ||||
-rw-r--r-- | libtextcat/data/new_fingerprints/lm/armenian.lm | 0 | ||||
-rw-r--r-- | libtextcat/libtextcat-2.2.patch | 2137 | ||||
-rw-r--r-- | libtextcat/makefile.mk | 92 | ||||
-rw-r--r-- | libtextcat/prj/build.lst | 3 | ||||
-rw-r--r-- | libtextcat/prj/d.lst | 8 |
10 files changed, 3922 insertions, 0 deletions
diff --git a/libtextcat/data/new_fingerprints/fpdb.conf b/libtextcat/data/new_fingerprints/fpdb.conf new file mode 100644 index 000000000000..b72e103ddffb --- /dev/null +++ b/libtextcat/data/new_fingerprints/fpdb.conf @@ -0,0 +1,82 @@ +# +# A sample config file for the language models +# provided with Gertjan van Noords language guesser +# (http://odur.let.rug.nl/~vannoord/TextCat/) +# +# Notes: +# - You may consider eliminating a couple of small languages from this +# list because they cause false positives with big languages and are +# bad for performance. (Do you really want to recognize Drents?) +# - Putting the most probable languages at the top of the list +# improves performance, because this will raise the threshold for +# likely candidates more quickly. +# + +# this file have been modified (to OOo by Jocelyn MERAND joc.mer@gmail.com) to include country and encoding +# guess strings are made as following : language-country-encoding + +afrikaans.lm af---utf8 +albanian.lm sq---utf8 +amharic_utf.lm am---utf8 +arabic.lm ar---utf8 +basque.lm eu---utf8 +belarus.lm be---utf8 +bosnian.lm bs---utf8 +breton.lm br---utf8 +catalan.lm ca---utf8 +chinese_simplified.lm zh-CN--utf8 +chinese_traditional.lm zh-TW--utf8 +croatian.lm hr---utf8 +czech.lm cs---utf8 +danish.lm da---utf8 +dutch.lm nl---utf8 +english.lm en---utf8 +esperanto.lm eo---utf8 +estonian.lm et---utf8 +finnish.lm fi---utf8 +french.lm fr---utf8 +frisian.lm fy---utf8 +georgian.lm ka---utf8 +german.lm de---utf8 +greek.lm el---utf8 +hebrew.lm he---utf8 +hindi.lm hi---utf8 +hungarian.lm hu---utf8 +icelandic.lm is---utf8 +indonesian.lm id---utf8 +irish_gaelic.lm ga---utf8 +italian.lm it---utf8 +japanese.lm ja---utf8 +korean.lm ko---utf8 +latin.lm la---utf8 +latvian.lm lv---utf8 +lithuanian.lm lt---utf8 +malay.lm ms---utf8 +manx_gaelic.lm gv---utf8 +marathi.lm mr---utf8 +nepali.lm ne---utf8 +norwegian.lm nb---utf8 # Norwegian (Bokmal) +persian.lm fa---utf8 # Farsi +polish.lm pl---utf8 +portuguese.lm pt-PT--utf8 
+quechua.lm qu---utf8 +romanian.lm ro---utf8 +romansh.lm rm---utf8 +russian.lm ru---utf8 +sanskrit.lm sa---utf8 +scots.lm sco---utf8 +scots_gaelic.lm gd---utf8 +serbian_ascii.lm sh-YU--utf8 +slovak_ascii.lm sk-SK--utf8 +slovenian.lm sl---utf8 +spanish.lm es---utf8 +swahili.lm sw---utf8 +swedish.lm sv---utf8 +tagalog.lm tl---utf8 +tamil.lm ta---utf8 +thai.lm th---utf8 +turkish.lm tr---utf8 +ukrainian.lm uk---utf8 +vietnamese.lm vi---utf8 +welsh.lm cy---utf8 +yiddish_utf.lm yi---utf8 diff --git a/libtextcat/data/new_fingerprints/lm/afrikaans.lm b/libtextcat/data/new_fingerprints/lm/afrikaans.lm new file mode 100644 index 000000000000..c110f154b664 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/afrikaans.lm @@ -0,0 +1,400 @@ +_ 23602 +e 8036 +a 4087 +n 3782 +i 3726 +o 3314 +r 2951 +s 2885 +t 2749 +d 2479 +e_ 2118 +l 1854 +k 1741 +ie 1670 +g 1601 +n_ 1447 +m 1440 +_d 1219 +t_ 1143 +er 1124 +h 1124 +u 1110 +ie_ 1079 +y 1048 +w 986 +s_ 982 +_s 969 +_h 956 +di 924 +an 922 +r_ 912 +aa 882 +v 876 +en 807 +_di 807 +. 
790 +y_ 747 +_v 709 +et 706 +._ 694 +die 691 +die_ 667 +_n 666 +_die 651 +p 639 +_m 634 +_die_ 633 +_w 632 +ee 607 +ge 606 +_o 598 +b 586 +te 568 +, 560 +in 555 +k_ 550 +_e 550 +,_ 548 +oo 516 +et_ 511 +de 509 +el 489 +_g 486 +f 461 +ar 451 +ni 450 +nd 442 +an_ 440 +en_ 437 +_i 426 +he 423 +g_ 418 +_t 412 +oe 410 +at 406 +er_ 400 +om 381 +wa 378 +_a 378 +_b 377 +_k 371 +nie 371 +_he 370 +aar 355 +_ge 351 +es 351 +_ni 348 +da 346 +m_ 342 +ou 338 +it 335 +_nie 335 +d_ 332 +l_ 330 +_wa 329 +or 327 +le 326 +we 326 +ek 324 +het 321 +me 319 +_het 319 +is 318 +j 315 +at_ 311 +on 309 +se 308 +_en 298 +ma 294 +st 291 +as 280 +va 277 +_en_ 270 +re 270 +" 269 +' 265 +het_ 261 +_het_ 260 +om_ 254 +al 252 +ar_ 250 +li 248 +te_ 247 +aar_ 247 +_da 245 +u_ 242 +nde 241 +ou_ 237 +_l 231 +be 229 +_' 226 +rd 224 +_va 224 +ig 223 +ng 222 +ns 221 +ve 220 +it_ 218 +_j 216 +_me 216 +sy 215 +ke 213 +_sy 212 +aan 212 +van 212 +_in 210 +is_ 210 +in_ 208 +sy_ 206 +_sy_ 206 +'n 205 +ro 205 +ko 204 +_'n 203 +ra 203 +'n_ 203 +_'n_ 202 +so 202 +D 202 +ho 201 +rs 200 +eer 200 +ik 199 +la 198 +_te 196 +_van 196 +_ma 195 +as_ 194 +ui 194 +ver 192 +e. 192 +der 191 +to 188 +op 187 +van_ 184 +ag 184 +_ve 182 +and 180 +_van_ 178 +ha 178 +f_ 176 +ka 176 +ne 175 +_is 175 +sk 174 +e._ 174 +oor 174 +_ver 170 +ek_ 170 +_hy 170 +hy 170 +p_ 168 +_be 168 +ri 168 +ur 167 +nie_ 165 +_so 165 +_D 164 +si 164 +ll 164 +no 164 +_in_ 163 +_hy_ 162 +hy_ 162 +ed 161 +ers 160 +_r 156 +ak 156 +_ho 155 +_nie_ 153 +eg 153 +nt 152 +de_ 152 +_p 151 +_we 148 +_is_ 148 +ei 147 +es_ 142 +maa 142 +wee 142 +na 141 +nder 139 +a_ 138 +ing 138 +ew 138 +S 135 +lle 135 +_om 135 +_te_ 134 +eu 134 +ie. 
134 +wo 132 +em 132 +wat 131 +_no 130 +_" 130 +vo 130 +E 129 +H 128 +_wat 127 +ti 126 +mo 126 +A 126 +e, 126 +_ha 125 +vi 125 +el_ 125 +ter 125 +e,_ 124 +dat 124 +eer_ 124 +wat_ 124 +le_ 124 +ta 124 +Di 123 +dat_ 123 +_wat_ 122 +ie._ 122 +was 121 +ste 121 +_H 121 +_se 121 +se_ 120 +ul 120 +al_ 120 +_was 120 +_om_ 119 +_st 119 +lik 118 +"_ 118 +_ko 118 +_maa 118 +lo 117 +_to 117 +ns_ 115 +aan_ 115 +nie. 114 +_vi 114 +met 114 +_nie. 111 +nk 110 +_Di 110 +- 110 +_op 109 +_oo 109 +_on 108 +ir 108 +ord 108 +uit 106 +ens 105 +_was_ 105 +was_ 105 +een 105 +_met 105 +os 105 +_S 104 +nie._ 104 +ig_ 103 +_sk 102 +op_ 101 +_ek 101 +_wee 101 +ir_ 101 +met_ 100 +_met_ 100 +rt 100 +ik_ 99 +end 99 +nd_ 99 +gt 99 +ond 98 +ot 98 +_aa 97 +og 97 +vir_ 95 +vir 95 +_ka 94 +hu 94 +_mo 94 +_vir_ 94 +_vir 94 +_dit 93 +kr 93 +am 93 +ol 93 +dit 93 +_ek_ 93 +ki 93 +sa 93 +_aan 92 +man 92 +jy 92 +ng_ 92 +aak 92 +lle_ 91 +_hu 91 +_na 91 +_vo 90 +ewe 90 +of 90 +jy_ 90 +_dit_ 90 +dit_ 90 +_jy 89 +der_ 89 +jo 89 +_f 88 +_u 88 +sie 87 +_dat 87 +_jy_ 87 +daa 87 +do 87 +vr 87 +wi 86 +ry 86 +_dat_ 86 +eur 86 +rs_ 85 +_jo 85 +_wo 84 +_ne 84 +jie 84 +ji 84 +pe 83 +moe 83 +my 82 +ull 82 +Die 81 +maar 81 +_hom 81 +ulle 81 +_maar 81 +hom 81 +_uit 80 +_ui 80 +ges 80 +raa 80 +or_ 80 +ies 80 +jou 79 +_la 79 +maar_ 79 +ulle_ 79 +_daa 79 +Die_ 79 +daar 78 +_daar 78 +ien 78 +_my 78 +_jou 78 +ok 78 +il 78 +lik_ 77 +sta 77 +_Die 77 +ur_ 77 +ga 77 +ag_ 77 +kan 77 diff --git a/libtextcat/data/new_fingerprints/lm/albanian.lm b/libtextcat/data/new_fingerprints/lm/albanian.lm new file mode 100644 index 000000000000..0665a962d018 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/albanian.lm @@ -0,0 +1,400 @@ +_ 19480 +ë 4099 +e 4082 +t 3635 +i 3134 +a 2893 +r 2820 +n 2610 +s 2380 +h 2060 +ë_ 2055 +e_ 1825 +j 1677 +u 1489 +d 1381 +o 1370 +m 1318 +k 1264 +të 1091 +p 1072 +_t 1068 +sh 998 +l 936 +_n 876 +a_ 822 +, 816 +,_ 808 +të_ 795 +i_ 770 +_p 739 +_m 702 +_s 700 +te 653 +ër 620 +_d 613 +_e 607 +g 602 +_k 601 
+_të 593 +. 575 +_të_ 574 +v 567 +_e_ 554 +r_ 525 +._ 523 +ht 503 +n_ 480 +he 473 +në 462 +sht 461 +te_ 457 +q 454 +nd 436 +ri 432 +is 414 +et 403 +b 402 +je 401 +me 395 +in 391 +it 381 +rë 374 +_a 374 +t_ 359 +ur 353 +_i 346 +ar 342 +ës 339 +er 338 +në_ 338 +ën 338 +dh 337 +en 336 +pë 334 +f 328 +_v 323 +jë 318 +nj 313 +ish 312 +për 294 +y 285 +z 282 +es 281 +at 274 +_me 273 +_q 273 +gj 269 +ra 261 +as 258 +_në 256 +ku 256 +j_ 250 +ta 249 +re 246 +një 245 +o_ 243 +ni 243 +_pë 240 +hte 240 +_nj 239 +on 239 +isht 236 +pa 234 +th 233 +shte 233 +_për 232 +se 228 +_g 223 +ve 221 +in_ 220 +s_ 219 +_në_ 219 +do 218 +hte_ 218 +më 216 +ti 215 +aj 212 +shte_ 212 +ej 212 +u_ 211 +që 211 +_sh 210 +nt 207 +jë_ 206 +_b 205 +_një 203 +di 202 +_pa 201 +_i_ 201 +ll 199 +_f 199 +kë 198 +me_ 197 +dhe 195 +ishte 195 +si 194 +hi 191 +he_ 188 +- 187 +ja 187 +_që 187 +ua 186 +il 184 +_dh 184 +ur_ 183 +ër_ 182 +or 180 +se_ 179 +që_ 178 +S 176 +ç 175 +_h 173 +an 172 +një_ 172 +ng 170 +nte 170 +_që_ 169 +_S 169 +rë_ 166 +dhe_ 165 +_me_ 164 +ka 162 +im 159 +hë 158 +mi 157 +to 156 +tu 156 +ën_ 155 +_një_ 154 +ha 153 +nte_ 150 +tr 148 +sa 148 +ët 148 +_gj 148 +un 147 +rr 147 +ë, 147 +_dhe 147 +ej_ 147 +ki 146 +ë,_ 146 +_ku 145 +_- 144 +_ng 142 +ik 141 +_nd 140 +end 138 +uk 137 +etë 135 +ko 135 +_dhe_ 135 +_ve 132 +va 131 +_l 131 +për_ 131 +shi 131 +erë 129 +ke 127 +kis 127 +së 126 +jo 125 +li 124 +ga 124 +kish 123 +_ki 122 +po 122 +_se 122 +' 121 +du 120 +mb 120 +_më 119 +Si 115 +më_ 115 +esh 115 +_si 114 +qe 114 +lë 114 +_kis 113 +oh 113 +_kish 113 +_Si 113 +pr 112 +_u 112 +uar 111 +de 111 +hu 111 +_th 111 +al 111 +ta_ 109 +ilv 108 +Sil 108 +Silv 108 +lv 108 +k_ 108 +e, 108 +ji 107 +e,_ 106 +_Sil 106 +_Silv 106 +_r 105 +os 104 +_se_ 104 +kisht 102 +_di 102 +st 101 +_për_ 101 +bë 101 +tj 100 +_nga 99 +nga 99 +_du 98 +ra_ 98 +vë 98 +gji 98 +_ish 96 +rt 96 +_is 96 +ro 95 +ir 94 +ga_ 94 +ësh 94 +ont 93 +c 93 +t, 93 +t,_ 93 +hin 92 +a, 92 +_at 92 +und 92 +jt 91 +_mb 91 +a,_ 91 +tje 90 +_nga_ 90 
+_do 90 +_pr 90 +rit 90 +men 90 +nga_ 90 +ri_ 89 +N 89 +ma 89 +it_ 88 +_kë 88 +-_ 88 +m_ 87 +jo_ 87 +onte 87 +atë 87 +la 87 +ëri 87 +ilva 86 +shin 86 +ë. 86 +Silva 86 +lva 86 +së_ 85 +jer 85 +et_ 85 +_po 85 +ës_ 84 +kur 84 +ru 84 +nin 83 +ot 83 +hin_ 83 +_N 83 +her 83 +htë 82 +ap 82 +shin_ 82 +mo 81 +ash 81 +tha 81 +_ç 81 +ë._ 81 +ëm 81 +jit 80 +_ta 80 +ul 80 +le 80 +ho 80 +_z 79 +dr 78 +jet 78 +nin_ 78 +_më_ 78 +gjit 78 +A 78 +hk 78 +onte_ 78 +oni 77 +lo 77 +ba 77 +herë 77 +ndo 76 +shk 76 +mend 75 +_vë 75 +ha_ 75 +dë 75 +tur 74 +_A 74 +el 74 +bi 74 +_ko 74 +uk_ 73 +erë_ 73 +si_ 73 +_sa 73 +ar_ 72 +P 72 +rs 72 +pas 72 +ith 72 +uar_ 71 +_isht 71 +ai 70 +e. 70 +_vet 70 +vet 70 +_bë 70 +zi 70 +d_ 70 +jith 70 +da 70 +gjith 69 +duk 69 +na 69 +hej 69 +tër 68 +_men 68 +_ka 68 +am 68 +nd_ 68 +_c 67 +_pas 67 +_duk 67 +jes 67 +ak 67 +s, 67 +e._ 67 +s,_ 67 +K 67 +ësht 67 +mu 66 +kur_ 66 +yr 66 +em 65 +_së 65 +tha_ 65 +imi 65 +ie 65 +hej_ 64 +_së_ 64 +_u_ 64 +? 64 +fu 64 +_P 64 diff --git a/libtextcat/data/new_fingerprints/lm/amharic_utf.lm b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm new file mode 100644 index 000000000000..0c5bc813e663 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/amharic_utf.lm @@ -0,0 +1,400 @@ + 21403 +_ 10092 + 7734 + 6558 +_ 5003 + 4717 + 4401 + 4274 + 4176 + 4054 + 3868 + 2728 + 1656 + 1591 + 1579 + 1425 + 1402 +_ 1261 +_ 1231 + 1217 + 1187 + 1183 +_ 1160 + 1145 + 1123 + 1097 +ን 1043 + 1043 + 1041 + 1004 +_ 991 + 936 + 880 + 855 +ው 855 + 849 + 805 + 783 +ት 783 +_ 763 + 709 + 704 + 682 +በ 682 + 679 + 670 + 667 +በ 666 + 666 + 658 + 643 +የ 637 + 637 + 627 +የ 627 +ለ 614 + 614 + 611 +ር 611 +_ 588 +_ 583 +ት_ 583 +_የ 577 +_የ 574 +ለ 573 + 573 +ን 570 + 570 +መ 563 + 563 + 557 +መ 557 + 554 +አ 554 +አ 553 +ተ 553 + 553 + 553 +ተ 547 + 547 +ም 534 + 534 + 532 +- 531 +ስ 525 + 525 +-- 521 +ል 515 + 515 +--- 512 +---- 503 +_በ 499 +----- 494 +_በ 487 + 479 + 477 +_ 473 + 469 +ው 469 +ን_ 468 +_ 468 + 465 + 464 +ያ 457 + 457 + 444 +_አ 424 +_አ 424 +ስ 423 + 423 
+_ 415 + 402 + 401 + 390 + 389 + 382 +_ 378 +ው_ 378 + 365 + 364 +ያ 364 + 363 +ል 357 + 357 + 356 +_ 351 + 347 +ች 347 + 341 +ነ 341 +ይ 337 + 337 +። 337 + 337 +_ 337 +።_ 337 + 336 + 334 + 320 +እ 320 + 320 + 320 + 318 +እ 318 +_ 314 +ር_ 314 + 312 + 311 + 301 +ና 300 +ገ 300 + 300 + 300 + 299 + 297 +ር 294 + 294 +ግ 294 + 294 + 293 + 291 +ከ 291 + 291 + 291 + 291 +ም 291 +ገ 291 +ነ 291 +ደ 288 + 288 +_እ 285 +_እ 283 + 279 + 279 +ከ 279 + 279 +በ 279 +ን 276 + 276 + 276 +_ 272 + 270 +ብ 270 +_ 269 +ግ 264 + 264 + 262 + 262 +መ 262 + 262 +ይ 261 + 261 + 260 +ማ 260 +ደ 259 + 259 +ራ 254 + 254 +ባ 254 + 254 + 253 + 249 + 247 + 245 + 244 +ላ 242 + 242 +የ 242 + 242 +ማ 238 + 238 + 237 +ረ 237 + 237 + 236 +ተ 236 +ም_ 235 +_ 235 + 234 + 233 + 233 + 230 + 230 +ባ 230 +ሚ 230 +ድ 228 + 228 +_መ 227 + 227 +_መ 226 + 225 +ረ 225 + 225 +። 222 +።_ 222 + 216 +እ 214 +ሚ 214 + 214 + 214 + 213 + 212 + 210 + 209 +ላ 209 + 208 + 207 + 206 +_ 206 + 205 +ብ 205 + 202 + 200 +ታ 200 + 200 +ሰ 200 + 199 +ራ 199 +ሰ 198 + 198 +ት 195 +ወ 195 + 195 + 195 + 194 +ወ 194 + 191 + 191 +_ 189 +ች_ 189 + 188 + 186 + 186 +_ለ 184 +_ለ 183 +ለ 183 + 183 +ን 180 + 179 +የ 179 + 178 + 177 +ን 177 +_ከ 175 + 174 +ጥ 174 + 172 +አ 172 +_ከ 170 + 170 +_ 169 +ን 169 + 166 + 166 +ል 165 +_ 165 + 165 +ና_ 163 +_ 163 + 160 +ቸ 160 +ቸ 160 + 160 + 160 + 159 + 159 + 158 +ቀ 158 + 158 + 156 +ቀ 155 + 155 +ች 154 + 154 +ል_ 154 + 154 +ው 154 +ቸ 154 +_ 154 + 152 + 151 +ው 151 + 150 +_ነ 150 + 150 +_ነ 150 + 150 +_ይ 150 +_ይ 150 + 149 +ታ 149 +። 148 + 147 +ደ 147 + 147 +በ 147 + 146 +_ተ 146 +_ተ 146 +ለ 145 + 145 +ድ 144 + 144 + 144 + 144 +ቅ 143 + 143 +_ 143 +ግ 142 + 142 + 141 +ዳ 141 + 139 +ህ 138 + 138 + 137 +ና 137 + 137 +ን 136 + 136 +አ 135 + 135 + 135 +ስ 134 + 134 +ጠ 133 +ዳ 133 + 133 +ሆ 133 + 133 +ሆ 133 + 133 + 133 +ሆ 132 + 132 +ተ 131 +ያ 131 +ተ 131 + 131 + 129 + 128 + 128 +ክ 128 +ፍ 128 + 128 + 127 + 127 +ጠ 127 + 126 +ካ 126 + 124 +በ 123 + 123 +ያ 123 + 123 + 123 + 122 +ከ 122 + 121 +ገ 121 diff --git a/libtextcat/data/new_fingerprints/lm/arabic.lm 
b/libtextcat/data/new_fingerprints/lm/arabic.lm new file mode 100644 index 000000000000..85f701965e2e --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/arabic.lm @@ -0,0 +1,400 @@ +_ +ا +ل +و +ال +_ا +ي +ن +م +_ال +ر +ب +. +ت +د +ع +ه +_و +ن_ +ف +ا_ +ك +ج +.. +ة +ح +أ +س +_م +._ +ق +ة_ +ه_ +لا +ْ +_أ +ان +_ف +ُ +_ب +َ +لم +د_ +ول +ي_ +ى +ى_ +... +وج +_ل +_ع +ل_ +وا +جو +ْ. +ص +الم +_الم +..._ +.._ +ث +ود +ذ +ش +من +وجو +َ_ +في +لا_ +جود +ر_ +لى_ +لى +ان_ +وجود +لو +م_ +_ت +ِ +_من +ْ... +_وا +لع +الو +عل +ْ..._ +ْ.. +ين +الع +_في +ز +ات +_ي +_الع +ُ_ +_ك +_الو +من_ +_ان +مر +ء +في_ +يا +ب_ +را +،_ +ِ_ +، +ض +_في_ +تب +_من_ +لوج +كا +لي +ت_ +لوجو +ّ +ون +الوج +اء +جود_ +أح +_أح +الوجو +له +ود_ +ها +حا +ذا +_ر +على_ +وجود_ +على +رب +لوجود +عر +_ان_ +او +اول +ط +رت +لت +بْ +أحا +_الوج +أحاو +با +وال +_ول +اد +_وال +حاول +_أحاو +_أحا +أحاول +_،_ +حاو +_، +ني +بي +_عل +لن +ته +ما +-_ +- +مرتب +نا +_. +ها_ +مرت +_._ +_- +_-_ +بة +ول_ +_ح +رتب +دا +له_ +ء_ +ك_ +قي +تبة +اول_ +مرتبة +ية +بل +ور +ده +الت +خ +رتبة +الا +رتبة_ +ين_ +عرب +ير +بة_ +تبة_ +قد +ربْ +لعربْ +لعر +العر +أن +لك +حد +ون_ +لعرب +_على_ +_العر +تُ +عن +بْ. +_لا +حاول_ +ذات +العرب +_على +ية_ +عربْ +إ +اب +ئ +سا +نو +كو +المر +لل +يت +_ش +لم_ +_المر +اع +مو +لمر +_الا +ته_ +اج +ٍ +_ق +س_ +ائ +جب +ام +اجب_ +كون +واجب_ +لَ +_لا_ +اني +سي +واج +سم +لَ_ +يس +ال_ +_ولا +عي +وص +عا +جب_ +اس +ير_ +_مر +واجب +اجب +_بل +الن +ولا +_بال +وأ +أع +اك +وق +بلاد +نت +نف +ضا +نه +كون_ +بْ.. +ثل +كل +ولا_ +_ذا +ذاته +المرت +دة +ذاته_ +ور_ +بال +بْ... +_ولا_ +_الت +يه +_الل +_س +اء_ +ات_ +بلا +_وأ +_ذ +صو +ربْ. +_بلاد +لاد +_بلا +غ +لمرتب +_ه +بن +لمرت +عربْ. 
+_ن +_ذات +اته_ +لله +ْ._ +_با +اته +_إ +وم +الل +الوا +موج +_الله +نْ +لُ +اف +_يكو +لر +قا +عين +ست +يكون +موجو +ليس +ده_ +لُ_ +_وج +_وص +دي +حم +الواج +بين +_الر +_يك +مس +مُ +لله_ +ٍ_ +عد +يل +_الن +عق +اش +يكو +يق +الر +تُ_ +_كا +شي +_يكون +لوا +ار +موجود +يك +هْ +_ذاته +ع_ +جا +الله +فو +وب +_عي +رس +دة_ +لواجب +يكون_ +لواج +رك +ف_ +كان +لص +لش +لث +زا +ياء +ساء +لعق +انت +علم +العق +ما_ +قد_ +لف +الله_ diff --git a/libtextcat/data/new_fingerprints/lm/armenian.lm b/libtextcat/data/new_fingerprints/lm/armenian.lm new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/libtextcat/data/new_fingerprints/lm/armenian.lm diff --git a/libtextcat/libtextcat-2.2.patch b/libtextcat/libtextcat-2.2.patch new file mode 100644 index 000000000000..81babb0eb0aa --- /dev/null +++ b/libtextcat/libtextcat-2.2.patch @@ -0,0 +1,2137 @@ +*** misc/libtextcat-2.2/src/common.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/common.c 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 3,25 **** + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! 
* + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 3,25 ---- + * + * Copyright (c) 2003, WiseGuys Internet B.V. + * All rights reserved. +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 114,124 **** + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +! return( result ); + } + +! extern void* wg_realloc( void *ptr, size_t size ) +! { + void *result; + + if (!size) { +--- 114,124 ---- + wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); + } + +! return( result ); + } + +! extern void* wg_realloc( void *ptr, size_t size ) +! { + void *result; + + if (!size) { +*************** +*** 131,137 **** + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +! return( result ); + } + + extern void wg_free( void *mem ) +--- 131,137 ---- + wgmem_error( "Error while reallocing %u bytes.\n", size ); + } + +! 
return( result ); + } + + extern void wg_free( void *mem ) +*************** +*** 148,159 **** + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +! + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +! } +! + return line; + } + +--- 148,159 ---- + if ( fgets(line, size, fp) == NULL ) { + return NULL; + } +! + /** kill term null **/ + if ( (p = strpbrk( line, "\n\r" )) ) { + *p = '\0'; +! } +! + return line; + } + +*************** +*** 164,202 **** + * + * ARGUMENTS: + * - result: +! * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +! * +! * - dest: +! * + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +! * +! * - src: +! * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +! * + * Example: +! * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +! * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +! * +! * - maxsegments: +! * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +! * + * RETURN VALUE: + * The number of segments found. + */ +--- 164,202 ---- + * + * ARGUMENTS: + * - result: +! * + * After the split, this array contains pointers to the start of each + * detected segment. Must be preallocated and at least as large as + * maxsegments. The pointers point into the dest buffer. +! * +! * - dest: +! 
* + * String into which result points as an index. Must be preallocated, and + * at least as big as src. You can use src as dest, but in that case src + * is overwritten! +! * +! * - src: +! * + * The string to split. Sequences of whitespace are treated as separators, unless + * escaped. There are two ways to escape: by using single quotes (anything + * between single quotes is treated as one segment), or by using a backslash + * to escape the next character. The backslash escape works inside quotation + * as well. +! * + * Example: +! * + * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: +! * + * "It's" + * "very easy" + * "to use WiseGuys' wg_split()" + * "function" +! * +! * - maxsegments: +! * + * The maximum number of segments. If the splitter runs out of segments, + * the remainder of the string is stored in the last segment. +! * + * RETURN VALUE: + * The number of segments found. + */ +*************** +*** 223,229 **** + } + state = 1; + +! case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +--- 223,229 ---- + } + state = 1; + +! case 1: + /*** Start segment ***/ + result[cnt] = w; + cnt++; +*************** +*** 237,243 **** + p++; + state = 0; + break; +! } + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +--- 237,243 ---- + p++; + state = 0; + break; +! 
} + else if ( *p == '\'' ) { + /*** Start quotation ***/ + p++; +*************** +*** 292,308 **** + } + + + extern void wg_timerstart(wgtimer_t *t) + { +- #ifdef HAVE_GETTIMEOFDAY + gettimeofday( &(t->start), NULL ); +- #endif + } + + + extern uint4 wg_timerstop(wgtimer_t *t) + { +- #ifdef HAVE_GETTIMEOFDAY + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +--- 292,308 ---- + } + + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t) + { + gettimeofday( &(t->start), NULL ); + } ++ #endif /* TL : no struct timeval under Win32 */ + + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern uint4 wg_timerstop(wgtimer_t *t) + { + uint4 result; + gettimeofday( &(t->stop), NULL ); + result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + +*************** +*** 312,336 **** + t->start.tv_usec = t->stop.tv_usec; + + return result; +- #else +- return 0; +- #endif + } + + + /** + * wg_strgmov -- a guarded strcpy() variation +! * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +! * finished, the function returns NULL after restoring the first +! * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +! + if ( !dest || dest >= destlimit ) { + return NULL; + } +--- 312,334 ---- + t->start.tv_usec = t->stop.tv_usec; + + return result; + } ++ #endif /* TL : no struct timeval under Win32 */ + + + /** + * wg_strgmov -- a guarded strcpy() variation +! * + * copies src to dest (including terminating zero), and returns + * pointer to position of terminating zero in dest. The function is + * guaranteed not to write past destlimit. If the copy couldn't be +! 
* finished, the function returns NULL after restoring the first +! * character in dest for your convenience (since this is usually a zero). + */ + char *wg_strgmov( char *dest, const char *src, const char *destlimit ) + { + char tmp, *w; +! + if ( !dest || dest >= destlimit ) { + return NULL; + } +*************** +*** 355,361 **** + } + + /* +! * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +--- 353,359 ---- + } + + /* +! * wg_trim() -- remove whitespace surrounding a string. + * + * Example: " bla bla bla " becomes "bla bla bla" after trimming. + * +*************** +*** 373,379 **** + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +! + while ( isspace((int)*p) ) { + p++; + } +--- 371,377 ---- + char *lastnonspace = &dest[-1]; + const char *p = src; + char *w = dest; +! + while ( isspace((int)*p) ) { + p++; + } +*** misc/libtextcat-2.2/src/common.h 2003-05-22 15:02:29.000000000 +0200 +--- misc/build/libtextcat-2.2/src/common.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 1,28 **** + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +! * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. 
nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 1,28 ---- + #ifndef _COMMON_H_ + #define _COMMON_H_ + /** +! * common.h -- a mixed bag of helper functions + * + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! 
* + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 86,95 **** +--- 86,97 ---- + typedef char boole; + #endif + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + typedef struct wgtimer_s { + struct timeval start; + struct timeval stop; + } wgtimer_t; ++ #endif /* TL : no struct timeval under Win32 */ + + + extern void *wg_malloc( size_t size ); +*************** +*** 101,113 **** + + extern char *wg_getline( char *line, int size, FILE *fp ); + + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +! + #endif + +--- 103,117 ---- + + extern char *wg_getline( char *line, int size, FILE *fp ); + ++ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ + extern void wg_timerstart(wgtimer_t *t); + extern uint4 wg_timerstop(wgtimer_t *t); ++ #endif /* TL : no struct timeval under Win32 */ + + extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); + extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); + extern char *wg_trim( char *dest, const char *src ); + +! 
+ #endif + +*** misc/libtextcat-2.2/src/constants.h 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/constants.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 39,44 **** +--- 39,46 ---- + */ + #include <limits.h> + ++ #define _UTF8_ ++ + #define DESCRIPTION "out of place" + + /* Reported matches are those fingerprints with a score less than best +*************** +*** 59,72 **** + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +! /* Maximum size of an n-gram? */ +! #define MAXNGRAMSIZE 5 + + /* Which characters are not acceptable in n-grams? */ + #define INVALID(c) (isspace((int)c) || isdigit((int)c)) + + /* Minimum size (in characters) for accepting a document */ +! #define MINDOCSIZE 25 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +--- 61,81 ---- + /* Maximum number of n-grams in a fingerprint */ + #define MAXNGRAMS 400 + +! /* Maximum number of character of an n-gram? */ +! #define MAXNGRAMSYMBOL 5 +! +! /* Maximum size of the string representing an n-gram (must be greater than number of symbol) */ +! #ifdef _UTF8_ +! #define MAXNGRAMSIZE 20 +! #else +! #define MAXNGRAMSIZE MAXNGRAMSYMBOL +! #endif + + /* Which characters are not acceptable in n-grams? */ + #define INVALID(c) (isspace((int)c) || isdigit((int)c)) + + /* Minimum size (in characters) for accepting a document */ +! #define MINDOCSIZE 6 + + /* Maximum penalty for missing an n-gram in fingerprint */ + #define MAXOUTOFPLACE 400 +*************** +*** 76,79 **** +--- 85,91 ---- + + #define MAXSCORE INT_MAX + ++ /* where the fingerprints files are stored */ ++ #define DEFAULT_FINGERPRINTS_PATH "" ++ + #endif +*** misc/libtextcat-2.2/src/fingerprint.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/fingerprint.c 2007-01-12 12:51:59.000000000 +0100 +*************** +*** 6,28 **** + * All rights reserved. + * + * THE BSD LICENSE +! 
* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 6,28 ---- + * All rights reserved. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! 
* + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 51,57 **** + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +! * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +--- 51,57 ---- + * The reason why we go through the trouble of doing a partial + * (heap)sort is that a full quicksort behaves horribly on the data: + * most n-grams have a very low count, resulting in a data set in +! * nearly-sorted order. This causes quicksort to behave very badly. + * Heapsort, on the other hand, behaves handsomely: worst case is + * Mlog(N) for M n-grams filtered through a N-sized heap. + * +*************** +*** 63,68 **** +--- 63,72 ---- + * - put table/heap datastructure in a separate file. + */ + ++ #ifndef _UTF8_ ++ #define _UTF8_ ++ #endif ++ + #include "config.h" + #include <stdio.h> + #ifdef HAVE_STDLIB_H +*************** +*** 80,89 **** +--- 84,95 ---- + #include "wg_mempool.h" + #include "constants.h" + ++ #include "utf8misc.h" + + #define TABLESIZE (1<<TABLEPOW) + #define TABLEMASK ((TABLESIZE)-1) + ++ + typedef struct { + + sint2 rank; +*************** +*** 96,102 **** + const char *name; + ngram_t *fprint; + uint4 size; +! + } fp_t; + + typedef struct entry_s { +--- 102,108 ---- + const char *name; + ngram_t *fprint; + uint4 size; +! + } fp_t; + + typedef struct entry_s { +*************** +*** 105,117 **** + struct entry_s *next; + } entry_t; + +! typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +! 
+ uint4 heapsize; + uint4 size; + } table_t; +--- 111,123 ---- + struct entry_s *next; + } entry_t; + +! typedef struct table_s { + void *pool; + entry_t **table; + entry_t *heap; + + struct table_s *next; +! + uint4 heapsize; + uint4 size; + } table_t; +*************** +*** 122,128 **** + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +! * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +--- 128,134 ---- + * fast and furious little hash function + * + * (Note that we could use some kind of rolling checksum, and update it +! * during n-gram construction) + */ + static uint4 simplehash( const char *p, int len ) + { +*************** +*** 134,162 **** + } + + +- /* checks if n-gram lex is a prefix of key and of length len */ +- inline int issame( char *lex, char *key, int len ) +- { +- int i; +- for (i=0; i<len; i++) { +- if ( key[i] != lex[i] ) { +- return 0; +- } +- } +- if ( lex[i] != 0 ) { +- return 0; +- } +- return 1; +- } +- + + /* increases frequency of ngram(p,len) */ +! static inline int increasefreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +--- 140,153 ---- + } + + + + /* increases frequency of ngram(p,len) */ +! static int increasefreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + /*** Found it! ***/ + entry->cnt++; +*************** +*** 168,174 **** + } + + /*** Not found, so create ***/ +! entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); + strcpy( entry->str, p ); + entry->cnt = 1; + +--- 159,165 ---- + } + + /*** Not found, so create ***/ +! 
entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); + strcpy( entry->str, p ); + entry->cnt = 1; + +*************** +*** 181,192 **** + #if 0 + + /* looks up ngram(p,len) */ +! static entry_t *findfreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +--- 172,183 ---- + #if 0 + + /* looks up ngram(p,len) */ +! static entry_t *findfreq( table_t *t, char *p, int len ) +! { +! uint4 hash = simplehash( p, len ) & TABLEMASK; + entry_t *entry = t->table[ hash ]; +! +! while ( entry ) { + if ( issame( entry->str, p, len ) ) { + return entry; + } +*************** +*** 219,225 **** + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +! inline static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +--- 210,216 ---- + #define GREATER(x,y) ((x).cnt > (y).cnt) + #define LESS(x,y) ((x).cnt < (y).cnt) + +! static void siftup( table_t *t, unsigned int child ) + { + entry_t *heap = t->heap; + unsigned int parent = (child-1) >> 1; +*************** +*** 241,247 **** + } + + +! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +--- 232,238 ---- + } + + +! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) + { + entry_t *heap = t->heap; + unsigned int child = parent*2 + 1; +*************** +*** 273,279 **** + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +! t->size++; + return 0; + } + +--- 264,270 ---- + if (t->size < t->heapsize) { + memcpy( &(heap[t->size]), item, sizeof(entry_t)); + siftup( t, t->size ); +! t->size++; + return 0; + } + +*************** +*** 316,333 **** + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +! 
entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +! } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +! { + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +--- 307,324 ---- + + /*** Fill result heap ***/ + for (i=0; i<TABLESIZE; i++) { +! entry_t *p = t->table[i]; + while (p) { + heapinsert(t, p); + p = p->next; + } +! } + return 1; + } + + + static table_t *inittable(uint4 maxngrams) +! { + table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); + result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); + result->pool = wgmempool_Init( 10000, 10 ); +*************** +*** 347,360 **** + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +! wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +! + if ( name ) { + h->name = wg_strdup(name); + } +--- 338,351 ---- + wgmempool_Done(t->pool); + wg_free(t->table); + wg_free(t->heap); +! wg_free(t); + } + + + extern void *fp_Init(const char *name) + { + fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); +! + if ( name ) { + h->name = wg_strdup(name); + } +*************** +*** 458,478 **** + return dest; + } + +! + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +! for (;;p++) { + +! const char *q = p; + char *m = n; + + /*** First char may be an underscore ***/ +! *m++ = *q++; + *m = '\0'; + + increasefreq( t, n, 1 ); +--- 449,475 ---- + return dest; + } + +! /** +! * this function extract all n-gram from past buffer and put them into the table "t" +! * [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice +! 
*/ + static void createngramtable( table_t *t, const char *buf ) + { + char n[MAXNGRAMSIZE+1]; + const char *p = buf; + int i; ++ int pointer = 0; + + /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ +! while(1) { + +! const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/ + char *m = n; + + /*** First char may be an underscore ***/ +! int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/ +! q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/ +! m += decay; /*[modified]*/ + *m = '\0'; + + increasefreq( t, n, 1 ); +*************** +*** 482,500 **** + } + + /*** Let the compiler unroll this ***/ +! for ( i=2; i<=MAXNGRAMSIZE; i++) { + +! *m++ = *q; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +! q++; + if ( *q == '\0' ) { + return; + } + } + } + return; + } +--- 479,500 ---- + } + + /*** Let the compiler unroll this ***/ +! for ( i=2; i<=MAXNGRAMSYMBOL; i++) { + +! decay = charcopy(q, m); /*[modified] like above*/ +! m += decay; + *m = '\0'; + + increasefreq( t, n, i ); + + if ( *q == '_' ) break; +! q += decay; + if ( *q == '\0' ) { + return; + } + } ++ ++ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/ + } + return; + } +*************** +*** 514,520 **** + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return mystrcmp( x->str, y->str ); + } + +--- 514,520 ---- + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return mystrcmp( x->str, y->str ); + } + +*************** +*** 522,533 **** + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return x->rank - y->rank; + } + + /** +! 
* Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +--- 522,533 ---- + { + ngram_t *x = (ngram_t *)a; + ngram_t *y = (ngram_t *)b; +! + return x->rank - y->rank; + } + + /** +! * Create a fingerprint: + * - record the frequency of each unique n-gram in a hash table + * - take the most frequent n-grams + * - sort them alphabetically, recording their relative rank +*************** +*** 544,563 **** + } + + /*** Throw out all invalid chars ***/ +! tmp = prepbuffer( buffer, bufsize ); + if ( tmp == NULL ) { + return 0; + } +- + h = (fp_t*)handle; + t = inittable(maxngrams); + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +! + /*** Take the top N n-grams and add them to the profile ***/ +! table2heap(t); +! maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +--- 544,564 ---- + } + + /*** Throw out all invalid chars ***/ +! tmp = prepbuffer( buffer, bufsize ); +! /*printf("Cleaned buffer : %s\n",tmp);*/ + if ( tmp == NULL ) { + return 0; + } + h = (fp_t*)handle; + t = inittable(maxngrams); ++ /*printf("Table initialized\n");*/ + + /*** Create a hash table containing n-gram counts ***/ + createngramtable(t, tmp); +! /*printf("Table created\n");*/ + /*** Take the top N n-grams and add them to the profile ***/ +! table2heap(t); +! maxngrams = WGMIN( maxngrams, t->size ); + + h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); + h->size = maxngrams; +*************** +*** 568,574 **** + entry_t tmp2; + + heapextract(t, &tmp2); +! + /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +--- 569,575 ---- + entry_t tmp2; + + heapextract(t, &tmp2); +! 
+ /*** the string and its rank is all we need ***/ + strcpy( h->fprint[i].str, tmp2.str ); + h->fprint[i].rank = i; +*************** +*** 578,584 **** + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +--- 579,585 ---- + wg_free(tmp); + + /*** Sort n-grams alphabetically, for easy comparison ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + return 1; + } + +*************** +*** 608,614 **** + #endif + return 0; + } +! + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +--- 609,615 ---- + #endif + return 0; + } +! + h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); + + while (cnt < maxngrams && wg_getline(line,1024,fp)) { +*************** +*** 635,641 **** + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +--- 636,642 ---- + h->size = cnt; + + /*** Sort n-grams, for easy comparison later on ***/ +! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); + + fclose(fp); + +*************** +*** 648,661 **** + { + uint4 i; + fp_t *h = (fp_t *)handle; +! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); +! + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +! fprintf( fp, "%s\n", tmp[i].str ); + } + wg_free( tmp ); + } +--- 649,663 ---- + { + uint4 i; + fp_t *h = (fp_t *)handle; +! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); +! + /*** Make a temporary and sort it on rank ***/ + memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); +! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); + + for (i=0; i<h->size; i++) { +! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ +! 
fprintf( fp, "%s\n", tmp[i].str); + } + wg_free( tmp ); + } +*************** +*** 669,675 **** + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +! + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +--- 671,677 ---- + uint4 i = 0; + uint4 j = 0; + sint4 sum = 0; +! + /*** Compare the profiles in mergesort fashion ***/ + while ( i < c->size && j < u->size ) { + +*************** +*** 705,711 **** + } + + return sum; +! + } + + +--- 707,713 ---- + } + + return sum; +! + } + + +*** misc/libtextcat-2.2/src/fingerprint.h 2003-05-19 14:16:31.000000000 +0200 +--- misc/build/libtextcat-2.2/src/fingerprint.h 2007-01-11 13:19:40.000000000 +0100 +*************** +*** 41,47 **** +--- 41,53 ---- + extern int fp_Read( void *handle, const char *fname, int maxngrams ); + extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); + extern void fp_Show( void *handle ); ++ #ifdef __cplusplus ++ extern "C" { ++ #endif + extern const char *fp_Name( void *handle ); ++ #ifdef __cplusplus ++ } ++ #endif + extern void fp_Print( void *handle, FILE *fp ); + + #endif +*** misc/libtextcat-2.2/src/Makefile.in 2003-05-22 13:39:52.000000000 +0200 +--- misc/build/libtextcat-2.2/src/Makefile.in 2007-01-12 12:48:19.181803000 +0100 +*************** +*** 124,143 **** + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +! WARNS = -W -Wall -Wshadow -Wpointer-arith +! IFLAGS = +! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +! common.h constants.h fingerprint.h textcat.h wg_mempool.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +! common.c fingerprint.c textcat.c wg_mempool.c + + + bin_PROGRAMS = createfp +--- 124,143 ---- + target_vendor = @target_vendor@ + AUTOMAKE_OPTIONS = 1.4 foreign + +! #WARNS = -W -Wall -Wshadow -Wpointer-arith +! IFLAGS = +! 
#FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE + VERBOSE = -DVERBOSE + AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) + AM_LDFLAGS = -g + + noinst_HEADERS = \ +! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h + + + lib_LTLIBRARIES = libtextcat.la + libtextcat_la_SOURCES = \ +! common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c + + + bin_PROGRAMS = createfp +*************** +*** 156,162 **** + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +! wg_mempool.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +--- 156,162 ---- + libtextcat_la_LDFLAGS = + libtextcat_la_LIBADD = + am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ +! wg_mempool.lo utf8misc.lo + libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) + bin_PROGRAMS = createfp$(EXEEXT) + noinst_PROGRAMS = testtextcat$(EXEEXT) +*************** +*** 177,183 **** + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +--- 177,184 ---- + @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ + @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ + @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ +! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ +! 
@AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo + COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ +*************** +*** 213,219 **** + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +--- 214,220 ---- + @rm -f stamp-h1 + cd $(top_builddir) && $(SHELL) ./config.status src/config.h + +! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOHEADER) + touch $(srcdir)/config.h.in + +*************** +*** 247,253 **** + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) +--- 248,254 ---- + echo "rm -f \"$${dir}/so_locations\""; \ + rm -f "$${dir}/so_locations"; \ + done +! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) + $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) + binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) + install-binPROGRAMS: $(bin_PROGRAMS) +*************** +*** 285,294 **** + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +--- 286,295 ---- + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +! 
createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) + @rm -f createfp$(EXEEXT) + $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) +! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) + @rm -f testtextcat$(EXEEXT) + $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) + +*************** +*** 304,309 **** +--- 305,311 ---- + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ ++ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ + + distclean-depend: + -rm -rf ./$(DEPDIR) +*** misc/libtextcat-2.2/src/makefile.mk 2007-01-12 12:55:41.709348000 +0100 +--- misc/build/libtextcat-2.2/src/makefile.mk 2007-01-12 12:48:19.214530000 +0100 +*************** +*** 1 **** +! dummy +--- 1,91 ---- +! #************************************************************************* +! # +! # $RCSfile: libtextcat-2.2.patch,v $ +! # +! # $Revision: 1.1 $ +! # +! # last change: $Author: tl $ $Date: 2007-01-12 12:34:52 $ +! # +! #* The Contents of this file are made available subject to +! #* the terms of GNU Lesser General Public License Version 2.1. +! #* +! #* +! #* GNU Lesser General Public License Version 2.1 +! #* ============================================= +! #* Copyright 2005 by Sun Microsystems, Inc. +! #* 901 San Antonio Road, Palo Alto, CA 94303, USA +! #* +! #* This library is free software; you can redistribute it and/or +! #* modify it under the terms of the GNU Lesser General Public +! #* License version 2.1, as published by the Free Software Foundation. +! #* +! #* This library is distributed in the hope that it will be useful, +! #* but WITHOUT ANY WARRANTY; without even the implied warranty of +! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +! 
#* Lesser General Public License for more details. +! #* +! #* You should have received a copy of the GNU Lesser General Public +! #* License along with this library; if not, write to the Free Software +! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, +! #* MA 02111-1307 USA +! #* +! #************************************************************************* +! +! PRJ = ..$/..$/..$/..$/.. +! +! PRJNAME = libtextcat +! TARGET = libtextcat +! CFLAGSCALL=gsd +! +! USE_DEFFILE=TRUE +! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE +! +! .INCLUDE : settings.mk +! +! # --- Files -------------------------------------------------------- +! +! # !! not to be compiled because those belong to a stand alone programs: !! +! # $(SLO)$/createfp.obj\ +! # $(SLO)$/testtextcat.obj +! +! SLOFILES= \ +! $(SLO)$/common.obj\ +! $(SLO)$/fingerprint.obj\ +! $(SLO)$/textcat.obj\ +! $(SLO)$/wg_mempool.obj\ +! $(SLO)$/utf8misc.obj +! +! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) +! SHL1TARGET= $(TARGET) +! +! SHL1STDLIBS= +! +! # build DLL +! SHL1LIBS= $(SLB)$/$(TARGET).lib +! SHL1IMPLIB= i$(TARGET) +! SHL1DEPN= $(SHL1LIBS) +! SHL1DEF= $(MISC)$/$(SHL1TARGET).def +! +! # build DEF file +! DEF1NAME= $(SHL1TARGET) +! DEF1LIBNAME=$(TARGET) +! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt +! +! # --- Targets ------------------------------------------------------ +! +! .INCLUDE : target.mk +! +! # copy hand supplied configuration file for Win32 builds to the file +! # which is included in the source code +! $(SLOFILES) : config.h +! config.h : +! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h +! +! +! $(MISC)$/$(SHL1TARGET).flt: makefile.mk +! @echo ------------------------------ +! @echo Making: $@ +! @echo Imp>$@ +! @echo __CT>>$@ +! @echo _real>>$@ +! 
@echo unnamed>>$@ +*** misc/libtextcat-2.2/src/textcat.c 2003-05-22 13:32:43.000000000 +0200 +--- misc/build/libtextcat-2.2/src/textcat.c 2007-01-12 12:52:41.000000000 +0100 +*************** +*** 4,26 **** + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +--- 4,26 ---- + * Copyright (C) 2003 WiseGuys Internet B.V. + * + * THE BSD LICENSE +! * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: +! * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. +! * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. +! * + * - Neither the name of the WiseGuys Internet B.V. 
nor the names of + * its contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. +! * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +*************** +*** 74,79 **** +--- 74,80 ---- + typedef struct { + + void **fprint; ++ char *fprint_disable; + uint4 size; + uint4 maxsize; + +*************** +*** 112,122 **** + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); + wg_free( h ); + + } + +! extern void *textcat_Init( const char *conffile ) + { + textcat_t *h; + char line[1024]; +--- 113,133 ---- + fp_Done( h->fprint[i] ); + } + wg_free( h->fprint ); ++ wg_free( h->fprint_disable ); + wg_free( h ); + + } + +! /** Replaces older function */ +! extern void *textcat_Init( const char *conffile ){ +! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); +! } +! +! /** +! * Originaly this function had only one parameter (conffile) it has been modified since OOo use +! * Basicaly prefix is the directory path where fingerprints are stored +! */ +! extern void *special_textcat_Init( const char *conffile, const char *prefix ) + { + textcat_t *h; + char line[1024]; +*************** +*** 134,144 **** + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +! int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +--- 145,157 ---- + h->size = 0; + h->maxsize = 16; + h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); ++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/ + + while ( wg_getline( line, 1024, fp ) ) { + char *p; + char *segment[4]; +! char finger_print_file_name[512]; +! 
int res; + + /*** Skip comments ***/ + #ifdef HAVE_STRCHR +*************** +*** 156,172 **** + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +! } + h->size++; + } + +--- 169,191 ---- + /*** Ensure enough space ***/ + if ( h->size == h->maxsize ) { + h->maxsize *= 2; +! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); +! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize ); + } + + /*** Load data ***/ + if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { + goto ERROR; + } +! finger_print_file_name[0] = '\0'; +! strcat(finger_print_file_name, prefix); +! strcat(finger_print_file_name, segment[0]); +! +! if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { + textcat_Done(h); + goto ERROR; +! } +! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ + h->size++; + } + +*************** +*** 203,213 **** + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +! + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +! int score = fp_Compare( h->fprint[i], unknown, threshold ); +! candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +--- 222,239 ---- + result = _TEXTCAT_RESULT_SHORT; + goto READY; + } +! + /*** Calculate the score for each category. ***/ + for (i=0; i<h->size; i++) { +! int score; +! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ +! score = MAXSCORE; +! } +! else{ +! score = fp_Compare( h->fprint[i], unknown, threshold ); +! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ +! } +! 
candidates[i].score = score; + candidates[i].name = fp_Name( h->fprint[i] ); + if ( score < minscore ) { + minscore = score; +*************** +*** 218,224 **** + /*** Find the best performers ***/ + for (i=0; i<h->size; i++) { + if ( candidates[i].score < threshold ) { +- + if ( ++cnt == MAXCANDIDATES+1 ) { + break; + } +--- 244,249 ---- +*************** +*** 235,241 **** + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +! + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +--- 260,266 ---- + else { + char *p = result; + char *plimit = result+MAXOUTPUTSIZE; +! + qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); + + *p = '\0'; +*************** +*** 247,253 **** + } + READY: + fp_Done(unknown); +! #ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +--- 272,278 ---- + } + READY: + fp_Done(unknown); +! #ifdef SHOULD_FREE + free(candidates); + #undef SHOULD_FREE + #endif +*** misc/libtextcat-2.2/src/textcat.h 2003-05-19 14:16:31.000000000 +0200 +--- misc/build/libtextcat-2.2/src/textcat.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 40,45 **** +--- 40,48 ---- + #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" + #define _TEXTCAT_RESULT_SHORT "SHORT" + ++ #ifdef __cplusplus ++ extern "C" { ++ #endif + + /** + * textcat_Init() - Initialize the text classifier. The textfile +*************** +*** 51,60 **** +--- 54,72 ---- + * Returns: handle on success, NULL on error. (At the moment, the + * only way errors can occur, is when the library cannot read the + * conffile, or one of the fingerprint files listed in it.) 
++ * ++ * Replace older function (and has exacly the same behaviour) ++ * see below + */ + extern void *textcat_Init( const char *conffile ); + + /** ++ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB ++ * Basicaly prefix is the directory path where fingerprints are stored ++ */ ++ extern void *special_textcat_Init( const char *conffile, const char *prefix ); ++ ++ /** + * textcat_Done() - Free up resources for handle + */ + extern void textcat_Done( void *handle ); +*************** +*** 77,80 **** +--- 89,96 ---- + * textcat_Version() - Returns a string describing the version of this classifier. + */ + extern char *textcat_Version(); ++ ++ #ifdef __cplusplus ++ } ++ #endif + #endif +*** misc/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:55:41.584585000 +0100 +--- misc/build/libtextcat-2.2/src/utf8misc.c 2007-01-12 12:54:50.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,132 ---- +! /*************************************************************************** +! * Copyright (C) 2006 by Jocelyn Merand * +! * joc.mer@gmail.com * +! * * +! * THE BSD LICENSE +! * +! * Redistribution and use in source and binary forms, with or without +! * modification, are permitted provided that the following conditions +! * are met: +! * +! * - Redistributions of source code must retain the above copyright +! * notice, this list of conditions and the following disclaimer. +! * +! * - Redistributions in binary form must reproduce the above copyright +! * notice, this list of conditions and the following disclaimer in the +! * documentation and/or other materials provided with the +! * distribution. +! * +! * - Neither the name of the WiseGuys Internet B.V. nor the names of +! * its contributors may be used to endorse or promote products derived +! * from this software without specific prior written permission. +! * +! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! 
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +! ***************************************************************************/ +! +! #ifndef _UTF8_MISC_H_ +! #include "utf8misc.h" +! #endif +! +! +! int nextcharstart(const char *str, int position){ /*returns the byte index just past the character that starts at str[position]; stays on the terminating \0*/ +! int pointer = position; +! +! if(str[pointer] & ESCAPE_MASK){ /*if the current byte has an ESCAPE_MASK bit set (the top bit when _UTF8_ is defined)*/ +! +! /*then str[pointer] is an escape character*/ +! +! unsigned char escape_char = (unsigned char)((str[pointer] & WEIGHT_MASK) << 1); /*high bits of that byte shifted once: each remaining leading 1 bit counts one following byte; unsigned so the shifts below are well defined*/ +! +! while(escape_char & ESCAPE_MASK && str[pointer]){/*every step shifts one bit left; finished when the top bit is 0 or the string ends*/ +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! if(str[pointer]){ /*finally, if we are not on the \0 character, we jump to the next character*/ +! ++pointer; +! } +! return pointer; +! } +! +! +! int charcopy(const char *str, char *dest){ /*copies the bytes of the first character of str to dest (no \0 is appended) and returns how many bytes were copied*/ +! +! int pointer = 0; +! if(str[pointer] & ESCAPE_MASK){ /*if the current byte has an ESCAPE_MASK bit set (the top bit when _UTF8_ is defined)*/ +! +! /*then str[pointer] is an escape character*/ +! +! unsigned char escape_char = (unsigned char)((str[pointer] & WEIGHT_MASK) << 1); /*same byte counter as in nextcharstart; unsigned so the shifts below are well defined*/ +! +! 
while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step shifts one bit left; finished when the top bit is 0 or the string ends*/ +! dest[pointer] = str[pointer]; +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! if(str[pointer]){ +! dest[pointer] = str[pointer]; +! ++pointer; +! } +! +! return pointer; +! } +! +! +! int issame( char *lex, char *key, int len ) /*returns 1 when lex is exactly the first len characters of key (lex must end right after them), 0 otherwise*/ +! { +! /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ +! int char_counter = 0; +! int pointer = 0; +! while(char_counter < len) { +! +! if(key[pointer] & ESCAPE_MASK){ /*if the current byte has an ESCAPE_MASK bit set (the top bit when _UTF8_ is defined)*/ +! +! /*then key[pointer] is an escape character*/ +! +! unsigned char escape_char = (unsigned char)((key[pointer] & WEIGHT_MASK) << 1); /*same byte counter as in nextcharstart; unsigned so the shifts below are well defined*/ +! +! while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ +! escape_char = escape_char <<1; +! ++pointer; +! } +! } +! ++char_counter; /*and we are on a new utf8 character*/ +! if ( key[pointer] != lex[pointer] ) { +! return 0; +! /*printf(" NO\n", lex, key, len);*/ +! } +! ++pointer; +! } +! if ( lex[pointer] != '\0' ) { +! return 0; +! /*printf(" NO\n");*/ +! } +! +! /*printf(" YES\n");*/ +! +! return 1; +! } +! +! +! extern int utfstrlen(const char* str){ /*number of characters (not bytes) in str, advancing one nextcharstart step per character*/ +! int char_counter = 0; +! int pointer = 0; +! while(str[pointer]) { +! pointer = nextcharstart(str, pointer); +! +! ++char_counter; /*and we are on a new utf8 character*/ +! } +! return char_counter; +! } +! +*** misc/libtextcat-2.2/src/utf8misc.h 2007-01-12 12:55:41.547021000 +0100 +--- misc/build/libtextcat-2.2/src/utf8misc.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,88 ---- +! /*************************************************************************** +! * Copyright (C) 2006 by Jocelyn Merand * +! * joc.mer@gmail.com * +! * * +! * THE BSD LICENSE +! * +! * Redistribution and use in source and binary forms, with or without +! 
* modification, are permitted provided that the following conditions +! * are met: +! * +! * - Redistributions of source code must retain the above copyright +! * notice, this list of conditions and the following disclaimer. +! * +! * - Redistributions in binary form must reproduce the above copyright +! * notice, this list of conditions and the following disclaimer in the +! * documentation and/or other materials provided with the +! * distribution. +! * +! * - Neither the name of the WiseGuys Internet B.V. nor the names of +! * its contributors may be used to endorse or promote products derived +! * from this software without specific prior written permission. +! * +! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +! ***************************************************************************/ +! +! #ifndef _UTF8_MISC_H_ +! #define _UTF8_MISC_H_ +! +! /** +! * These variables are used in character processing functions +! * These have been added to manage utf-8 symbols, particularly escape chars +! */ +! #ifdef _UTF8_ +! #define ESCAPE_MASK 0x80 +! #define WEIGHT_MASK 0xF0 +! #else +! #define ESCAPE_MASK 0xFF +! #define WEIGHT_MASK 0x00 +! #endif +! +! +! /* +! * Is used to jump to the next start of char +! 
* of course it's only usefull when encoding is utf-8 +! * This function have been added by Jocelyn Merand to use libtextcat in OOo +! */ +! int nextcharstart(const char *str, int position); +! +! +! /*Copy the char in str to dest +! * of course it's only usefull when encoding is utf8 and the symbol is encoded with more than 1 char +! * return the number of char jumped +! * This function have been added by Jocelyn Merand to use libtextcat in OOo +! */ +! int charcopy(const char *str, char *dest); +! +! +! /* checks if n-gram lex is a prefix of key and of length len +! * if _UTF8_ is defined, it uses escap characters and len is not realy the length of lex +! * in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 +! */ +! int issame( char *lex, char *key, int len ); +! +! +! /* Counts the number of characters +! * if _UTF8_ is defined, it uses escap characters and the result is not realy the length of str +! * in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 +! */ +! #ifdef __cplusplus +! extern "C" { +! #endif +! extern int utfstrlen(const char* str); +! #ifdef __cplusplus +! } +! #endif +! +! #endif +! +*** misc/libtextcat-2.2/src/win32_config.h 2007-01-12 12:55:41.643465000 +0100 +--- misc/build/libtextcat-2.2/src/win32_config.h 2007-01-11 13:19:41.000000000 +0100 +*************** +*** 1 **** +! dummy +--- 1,136 ---- +! /* src/config.h. Generated by configure. */ +! /* src/config.h.in. Generated from configure.ac by autoheader. */ +! +! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP +! systems. This function is required for `alloca.c' support on those systems. +! */ +! /* #undef CRAY_STACKSEG_END */ +! +! /* Define to 1 if using `alloca.c'. */ +! /* #undef C_ALLOCA */ +! +! /* Define to 1 if you have `alloca', as a function or macro. */ +! /* #undef HAVE_ALLOCA */ +! +! /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). +! */ +! 
/* #undef HAVE_ALLOCA_H */ +! +! /* Define to 1 if you have the <dlfcn.h> header file. */ +! #define HAVE_DLFCN_H 1 +! +! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ +! /* #undef HAVE_DOPRNT */ +! +! /* Define to 1 if you have the `gettimeofday' function. */ +! /* #undef HAVE_GETTIMEOFDAY */ +! +! /* Define to 1 if you have the <inttypes.h> header file. */ +! /* #undef HAVE_INTTYPES_H */ +! +! /* Define to 1 if you have the <limits.h> header file. */ +! #define HAVE_LIMITS_H 1 +! +! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and +! to 0 otherwise. */ +! #define HAVE_MALLOC 1 +! +! /* Define to 1 if you have the <memory.h> header file. */ +! #define HAVE_MEMORY_H 1 +! +! /* Define to 1 if you have the `memset' function. */ +! #define HAVE_MEMSET 1 +! +! /* Define to 1 if your system has a GNU libc compatible `realloc' function, +! and to 0 otherwise. */ +! #define HAVE_REALLOC 1 +! +! /* Define to 1 if you have the <stdint.h> header file. */ +! /* #undef HAVE_STDINT_H */ +! +! /* Define to 1 if you have the <stdlib.h> header file. */ +! #define HAVE_STDLIB_H 1 +! +! /* Define to 1 if you have the `strchr' function. */ +! #define HAVE_STRCHR 1 +! +! /* Define to 1 if you have the `strdup' function. */ +! #define HAVE_STRDUP 1 +! +! /* Define to 1 if you have the <strings.h> header file. */ +! /* #undef HAVE_STRINGS_H */ +! +! /* Define to 1 if you have the <string.h> header file. */ +! #define HAVE_STRING_H 1 +! +! /* Define to 1 if you have the `strpbrk' function. */ +! #define HAVE_STRPBRK 1 +! +! /* Define to 1 if you have the <sys/stat.h> header file. */ +! #define HAVE_SYS_STAT_H 1 +! +! /* Define to 1 if you have the <sys/time.h> header file. */ +! /* #undef HAVE_SYS_TIME_H */ +! +! /* Define to 1 if you have the <sys/types.h> header file. */ +! #define HAVE_SYS_TYPES_H 1 +! +! /* Define to 1 if you have the <unistd.h> header file. */ +! #define HAVE_UNISTD_H 1 +! +! 
/* Define to 1 if you have the `vprintf' function. */ +! #define HAVE_VPRINTF 1 +! +! /* Name of package */ +! #define PACKAGE "libtextcat" +! +! /* Define to the address where bug reports for this package should be sent. */ +! #define PACKAGE_BUGREPORT "" +! +! /* Define to the full name of this package. */ +! #define PACKAGE_NAME "libtextcat" +! +! /* Define to the full name and version of this package. */ +! #define PACKAGE_STRING "libtextcat 2.2" +! +! /* Define to the one symbol short name of this package. */ +! #define PACKAGE_TARNAME "libtextcat" +! +! /* Define to the version of this package. */ +! #define PACKAGE_VERSION "2.2" +! +! /* If using the C implementation of alloca, define if you know the +! direction of stack growth for your system; otherwise it will be +! automatically deduced at run-time. +! STACK_DIRECTION > 0 => grows toward higher addresses +! STACK_DIRECTION < 0 => grows toward lower addresses +! STACK_DIRECTION = 0 => direction of growth unknown */ +! /* #undef STACK_DIRECTION */ +! +! /* Define to 1 if you have the ANSI C header files. */ +! #define STDC_HEADERS 1 +! +! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ +! #define TIME_WITH_SYS_TIME 1 +! +! /* Define to 1 if your <sys/time.h> declares `struct tm'. */ +! /* #undef TM_IN_SYS_TIME */ +! +! /* Version number of package */ +! #define VERSION "2.2" +! +! /* Define to empty if `const' does not conform to ANSI C. */ +! /* #undef const */ +! +! /* Define as `__inline' if that's what the C compiler calls it, or to nothing +! if it is not supported. */ +! /* #undef inline */ +! +! /* Define to rpl_malloc if the replacement function should be used. */ +! /* #undef malloc */ +! +! /* Define to rpl_realloc if the replacement function should be used. */ +! /* #undef realloc */ +! +! /* Define to `unsigned' if <sys/types.h> does not define. */ +! 
/* #undef size_t */ diff --git a/libtextcat/makefile.mk b/libtextcat/makefile.mk new file mode 100644 index 000000000000..c1f8795c7cb0 --- /dev/null +++ b/libtextcat/makefile.mk @@ -0,0 +1,92 @@ +#************************************************************************* +# +# OpenOffice.org - a multi-platform office productivity suite +# +# $RCSfile: makefile.mk,v $ +# +# $Revision: 1.1 $ +# +# last change: $Author: tl $ $Date: 2007-01-12 12:34:28 $ +# +# The Contents of this file are made available subject to +# the terms of GNU Lesser General Public License Version 2.1. +# +# +# GNU Lesser General Public License Version 2.1 +# ============================================= +# Copyright 2005 by Sun Microsystems, Inc. +# 901 San Antonio Road, Palo Alto, CA 94303, USA +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1, as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# +#************************************************************************* + +PRJ=. 
+ +PRJNAME=libtextcat +TARGET=libtextcat + +# --- Settings ----------------------------------------------------- + +.INCLUDE : settings.mk + +# --- Files -------------------------------------------------------- + +TARFILE_NAME=libtextcat-2.2 +TARFILE_ROOTDIR=libtextcat-2.2 + +PATCH_FILE_NAME=libtextcat-2.2.patch + + +ADDITIONAL_FILES= \ + src$/utf8misc.h \ + src$/utf8misc.c \ + src$/win32_config.h \ + src$/makefile.mk + +.IF "$(GUI)"=="UNX" +#CONFIGURE_DIR=$(BUILD_DIR) + +#relative to CONFIGURE_DIR +CONFIGURE_ACTION=configure +CONFIGURE_FLAGS= + +BUILD_ACTION=make + +OUT2LIB=$(BUILD_DIR)$/src$/.libs$/libtextcat*.so + +.ENDIF # "$(GUI)"=="UNX" + + +.IF "$(GUI)"=="WNT" +BUILD_ACTION=cd src && dmake + +.ENDIF # "$(GUI)"=="WNT" + + +OUT2INC= \ + $(BUILD_DIR)$/src$/config.h \ + $(BUILD_DIR)$/src$/common.h \ + $(BUILD_DIR)$/src$/fingerprint.h \ + $(BUILD_DIR)$/src$/textcat.h \ + $(BUILD_DIR)$/src$/wg_mempool.h + + +# --- Targets ------------------------------------------------------ + +.INCLUDE : set_ext.mk +.INCLUDE : target.mk +.INCLUDE : tg_ext.mk + diff --git a/libtextcat/prj/build.lst b/libtextcat/prj/build.lst new file mode 100644 index 000000000000..8f514f151426 --- /dev/null +++ b/libtextcat/prj/build.lst @@ -0,0 +1,3 @@ +ltc libtextcat : solenv NULL +ltc libtextcat usr1 - all ltc_mkout NULL +ltc libtextcat nmake - all ltc_libtextcat NULL diff --git a/libtextcat/prj/d.lst b/libtextcat/prj/d.lst new file mode 100644 index 000000000000..44ce02b77c44 --- /dev/null +++ b/libtextcat/prj/d.lst @@ -0,0 +1,8 @@ + +..\%__SRC%\lib\lib*.* %_DEST%\lib%_EXT%\lib*.* +..\%__SRC%\lib\ilib*.* %_DEST%\lib%_EXT%\ilib*.* +..\%__SRC%\bin\l*.dll %_DEST%\bin%_EXT%\*.dll + +mkdir: %_DEST%\inc%_EXT%\libtextcat +hedabu: ..\%__SRC%\misc\build\libtextcat-2.2\src\*.h %_DEST%\inc%_EXT%\libtextcat\*.h + |