diff options
author | Andreas Bille <abi@openoffice.org> | 2001-05-08 11:02:45 +0000 |
---|---|---|
committer | Andreas Bille <abi@openoffice.org> | 2001-05-08 11:02:45 +0000 |
commit | b62e52a1289f4c4f14f517958fdc09151c4ac20c (patch) | |
tree | c81347382ce60c5fc9307439a0fd7b0a766ff2f8 /xmlhelp | |
parent | 45436f7242a4890c1849077544e2405f4b67945d (diff) |
Initial revision
XmlSearch query engine C++ version
Diffstat (limited to 'xmlhelp')
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/ConceptData.cxx | 105 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/ContextTables.cxx | 737 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/DocGenerator.cxx | 447 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/Query.cxx | 387 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/QueryProcessor.cxx | 227 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/Search.cxx | 625 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/XmlIndex.cxx | 327 | ||||
-rw-r--r-- | xmlhelp/source/cxxhelp/qe/makefile.mk | 94 |
8 files changed, 2949 insertions, 0 deletions
diff --git a/xmlhelp/source/cxxhelp/qe/ConceptData.cxx b/xmlhelp/source/cxxhelp/qe/ConceptData.cxx new file mode 100644 index 000000000000..0dfc3483efa7 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/ConceptData.cxx @@ -0,0 +1,105 @@ +/************************************************************************* + * + * $RCSfile: ConceptData.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. 
+ * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. + * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_CONCEPTDATA_HXX_ +#include <qe/ConceptData.hxx> +#endif +#ifndef _XMLSEARCH_QE_QUERY_HXX_ +#include <qe/Query.hxx> +#endif + +using namespace xmlsearch::qe; + + +const sal_Int32 ConceptData::ProxPerTerm = 10; + + +void ConceptData::runBy( std::vector< Query* >& queries ) +{ + ConceptData* cd = this; + do + { + queries[ cd->queryNo_ ]->updateEstimate( cd->role_,cd->penalty_ ); + } + while( cd = cd->next_ ); +} + + +#ifndef _XMLSEARCH_QE_DOCGENERATOR_HXX_ +#include <qe/DocGenerator.hxx> +#endif + + +void ConceptData::generateFillers( std::vector< RoleFiller* >& array, sal_Int32 pos ) +{ + if( array[ queryNo_ ] != RoleFiller::STOP() ) // not 'prohibited' + { +// cout << "Conc " << sal_Int32( nColumns_ ) << endl; + ( new RoleFiller( nColumns_, + this, + role_, + pos, + ctx_->wordContextLin( pos ), + pos + proximity_ ) )->use( array, queryNo_ ); + } + // !!! 
maybe eliminate tail recursion + if( next_ ) + next_->generateFillers( array,pos ); +} diff --git a/xmlhelp/source/cxxhelp/qe/ContextTables.cxx b/xmlhelp/source/cxxhelp/qe/ContextTables.cxx new file mode 100644 index 000000000000..f5ba2704df38 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/ContextTables.cxx @@ -0,0 +1,737 @@ +/************************************************************************* + * + * $RCSfile: ContextTables.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. 
You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. + * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_CONTEXTTABLES_HXX_ +#include <qe/ContextTables.hxx> +#endif +#ifndef _XMLSEARCH_UTIL_BYTEARRAYDECOMPRESSOR_HXX_ +#include <util/Decompressor.hxx> +#endif + +using namespace xmlsearch; +using namespace xmlsearch::qe; + + + +// Tables::Tables(int[] initialWords, +// int[] dests, +// int[] linkTypes, +// int[] seqNumbers) +// { +// _ +// _destsCached = dests; +// _linkTypesCached = linkTypes; +// _seqNumbersCached = seqNumbers; +// } + + +Tables::Tables( ContextTables* p ) + : initialWordsCached_( new sal_Int32[ initialWordsCachedL_ = p->initialWordsL_ ] ), + destsCached_( new sal_Int32[ destsCachedL_ = p->destsL_ ] ), + linkTypesCached_( new sal_Int32[ linkTypesCachedL_ = p->linkTypesL_ ] ), + seqNumbersCached_( new sal_Int32[ seqNumbersCachedL_ = p->seqNumbersL_ ] ) +{ +} + + + +Tables::~Tables() +{ + delete[] seqNumbersCached_; + delete[] linkTypesCached_; + delete[] destsCached_; + delete[] initialWordsCached_; +} + + + +void Tables::setTables( ContextTables* p ) +{ + delete[] p->initialWords_; + p->initialWordsL_ = initialWordsCachedL_; + p->initialWords_ = initialWordsCached_; + initialWordsCached_ = 0; + + delete[] p->dests_; + p->destsL_ = destsCachedL_; + 
p->dests_ = destsCached_; + destsCached_ = 0; + + delete[] p->linkTypes_; + p->linkTypesL_ = linkTypesCachedL_; + p->linkTypes_ = linkTypesCached_; + linkTypesCached_ = 0; + + delete[] p->seqNumbers_; + p->seqNumbersL_ = seqNumbersCachedL_; + p->seqNumbers_ = seqNumbersCached_; + seqNumbersCached_ = 0; + + p->nTextNodes_ = initialWordsCachedL_; +} + + + + +ContextTables::ContextTables( const std::vector< sal_Int32 >& offsets, + sal_Int32 contextDataL,sal_Int8 *contextData, + sal_Int32 linkNamesL,rtl::OUString *linkNames ) + : kTable_( 5 ), + auxArray_( 4096 ), + lastDocNo_( -1 ), + offsets_( offsets ), + contextDataL_( contextDataL ), + contextData_( contextData ), + linkNamesL_( linkNamesL ), + linkNames_( linkNames ), + + cache_( offsets.size() ), + initialWordsL_( 0 ), + initialWords_( 0 ), + destsL_( 0 ), + dests_( 0 ), + linkTypesL_( 0 ), + linkTypes_( 0 ), + seqNumbersL_( 0 ), + seqNumbers_( 0 ), + markersL_( 0 ), + markers_( 0 ) +{ + for( sal_uInt32 i = 0; i < offsets_.size(); ++i ) + cache_[i] = 0; +} + + + +ContextTables::~ContextTables() +{ + delete[] markers_; + delete[] seqNumbers_; + delete[] linkTypes_; + delete[] dests_; + delete[] initialWords_; + + for( sal_uInt32 i = 0; i < cache_.size(); ++i ) + delete cache_[i]; +} + + + +void ContextTables::setMicroindex( sal_Int32 docNo ) throw( excep::XmlSearchException ) +{ + if( docNo != lastDocNo_ ) + { // check if we need to do anything + if( cache_[ docNo ] ) + cache_[ docNo ]->setTables( this ); + else + { + sal_Int32 offset = offsets_[ docNo ]; + sal_Int32 k0 = contextData_[ offset ] & 0xFF; + util::ByteArrayDecompressor compr( contextDataL_,contextData_,offset + 1 ); + kTable_.clear(); + compr.decode( k0,kTable_ ); + // decompress initialWords into auxiliary array + auxArray_.clear(); +#ifdef ABIDEBUG +// cout << kTable_[0] << endl; + + +// exit(1); +#endif + compr.ascDecode( kTable_[0],auxArray_ ); // _initialWords + + initialWords_ = new sal_Int32[ initialWordsL_ = auxArray_.size() ]; + sal_Int32 k; 
+ for( k = 0; k < initialWordsL_; ++k ) //?opt + initialWords_[k] = auxArray_[k]; + + nTextNodes_ = initialWordsL_; + // decompress destinations into auxiliary array + auxArray_.clear(); + compr.decode( kTable_[1],auxArray_ ); // _dests + auxArray_.push_back( -1 ); // sentinel, root + + dests_ = new sal_Int32[ destsL_ = auxArray_.size() ]; + for( k = 0; k < destsL_; ++k ) //?opt + dests_[k] = auxArray_[k]; + +#ifdef ABIDEBUG +// cout << " linkTypesL_ = " << destsL_ - nTextNodes_ - 1 << endl; +#endif + + linkTypes_ = new sal_Int32[ linkTypesL_ = destsL_ - nTextNodes_ - 1 ]; + compr.decode( kTable_[2],linkTypes_ ); + + seqNumbers_ = new sal_Int32[ seqNumbersL_ = destsL_ - 1 ]; + compr.decode( kTable_[ 3 ],seqNumbers_ ); + + cache_[docNo] = new Tables( this ); + + /* + System.out.println("|_initialWords| = " + _nTextNodes); + System.out.println("|_dests| -1 = " + (_dests.length - 1)); + System.out.println("|_seqNumbers| = " + _seqNumbers.length); + System.out.println("|_linkTypes| = " + _linkTypes.length); + */ + } + + lastDocNo_ = docNo; + +#ifdef ABIDEBUG +// cout << "destL_ = " << destsL_ << endl; +#endif + + markers_ = new sal_Int32[ markersL_ = destsL_ ]; + } + initialWordsIndex_ = 0; +} + + + +sal_Int32 ContextTables::parentContext( sal_Int32 context) +{ + return dests_[ context ]; +} + + +rtl::OUString ContextTables::linkName( sal_Int32 context ) +{ + return linkNames_[ linkTypes_[context ] ]; +} + + +sal_Int32 ContextTables::linkCode( const rtl::OUString& linkName ) +{ + for( sal_Int32 i = 0; i < linkNamesL_; ++i ) + if( linkName == linkNames_[i] ) + return i; + return -1; // when not found +} + + +bool* ContextTables::getIgnoredElementsSet( sal_Int32& len, + const sal_Int32 ignoredElementsL, + const rtl::OUString* ignoredElements ) +{ + bool *result = 0; + if( ignoredElements && ignoredElementsL > 0 ) + { + for( sal_Int32 i = 0; i < ignoredElementsL; ++i ) + { + sal_Int32 code = linkCode( ignoredElements[i] ); + if( code > -1 ) + { + if( ! 
result ) + result = new bool[ len = linkNamesL_ ]; + + result[ code ] = true; + } + } + } + return result; +} + + + +bool ContextTables::notIgnored( sal_Int32 ctx, + sal_Int32 ignoredElementsL,bool* ignoredElements ) +{ + do + { + if( ignoredElements[ linkTypes_[ ctx ] ] ) + { + // cout << rtl::OUString::createFromAscii( "hit ignored" ) << endl; + return false; + } + } + while( ( ctx = dests_[ ctx ] ) > -1 ); // parentContext 'hand inlined' + return true; +} + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with the given linkCode + */ + +sal_Int32 ContextTables::firstParentWithCode( const sal_Int32 pos,const sal_Int32 linkCode ) +{ + sal_Int32 ctx = dests_[ wordContextLin(pos) ]; // first parent of text node + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + while( linkTypes_[ ctx - shift ] != linkCode ) + if( ( ctx = dests_[ ctx ] ) == limit ) + return -1; + return ctx; +} + + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with the given linkCode and given parent code + */ + +sal_Int32 ContextTables::firstParentWithCode2( sal_Int32 pos,const sal_Int32 linkCode,const sal_Int32 parentCode) +{ + sal_Int32 ctx = dests_[ wordContextLin( pos ) ]; // first parent of text node + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + for( sal_Int32 parent = dests_[ctx]; parent < limit; parent = dests_[ parent ] ) + if( linkTypes_[ parent - shift ] == parentCode && linkTypes_[ ctx - shift ] == linkCode ) + return ctx; + else + ctx = parent; + return -1; +} + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with the given linkCode and given ancestor code + */ + +sal_Int32 ContextTables::firstParentWithCode3( sal_Int32 pos,sal_Int32 linkCode,sal_Int32 ancestorCode ) +{ + sal_Int32 ctx = dests_[ wordContextLin( pos ) ]; + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + // 
find first instance of linkCode + while( ctx < limit && linkTypes_[ ctx - shift ] != linkCode ) + ctx = dests_[ ctx ]; + if( ctx < limit ) // found linkCode, check ancestry + for( sal_Int32 ancestor = dests_[ctx]; + ancestor < limit; + ancestor = dests_[ancestor]) + if (linkTypes_[ancestor - shift] == ancestorCode) // ancestor confirmed + return ctx; // match found, return successful ctx + return -1; // match NOT found +} + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with any of the given linkCode + */ + +sal_Int32 ContextTables::firstParentWithCode4(sal_Int32 pos, sal_Int32 linkCodesL,sal_Int32* linkCodes) +{ + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + for (sal_Int32 ctx = dests_[wordContextLin(pos)]; ctx < limit; ctx = dests_[ctx]) + { + const sal_Int32 code = linkTypes_[ctx - shift]; + for (sal_Int32 i = 0; i < linkCodesL; i++) + if (code == linkCodes[i]) + return ctx; + } + return -1; +} + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with the given path + */ + +sal_Int32 ContextTables::firstParentWithCode5(sal_Int32 pos,sal_Int32 pathCodesL,sal_Int32* pathCodes) +{ + const sal_Int32 lastCode = pathCodes[ pathCodesL - 1 ]; + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + sal_Int32 ctx = dests_[wordContextLin(pos)]; + + SEARCH: + for(sal_Int32 parent = dests_[ctx]; + parent < limit; + parent = dests_[parent]) + if( linkTypes_[ctx - shift] == lastCode ) + { // initial match + // try to match the entire path + for(sal_Int32 i = pathCodesL - 2, parent2 = parent; i >= 0; i--) + if (linkTypes_[parent2 - shift] != pathCodes[i]) // match failure + goto SEARCH; // try to match higher + else if ((parent2 = dests_[parent2]) == limit) + return -1; + return ctx; + } + else + ctx = parent; + return -1; +} + + +/* + * starting with ctx and going up the ancestry tree look for the first + * context with the given linkCode + 
*/ + +sal_Int32 ContextTables::firstParentWithCode7( const sal_Int32 pos,const sal_Int32 linkCode,const sal_Int32 seq) +{ + sal_Int32 ctx = dests_[ wordContextLin(pos) ]; // first parent of text node + const sal_Int32 shift = nTextNodes_; + const sal_Int32 limit = destsL_ - 1; + while (linkTypes_[ctx - shift] != linkCode || seqNumbers_[ctx] != seq) + if ((ctx = dests_[ctx]) == limit) + return -1; + return ctx; +} + + +bool ContextTables::isGoverning(sal_Int32 context) +{ + return linkName(context).equalsAsciiL( RTL_CONSTASCII_STRINGPARAM("TITLE") ) != 0; +} + + +void ContextTables::resetContextSearch() +{ + initialWordsIndex_ = 0; +} + + +// void ContextTables::appendSegment( sal_Int32 context,rtl::OUStringBuffer& result ) +// { +// result.append( context < nTextNodes_ ? +// rtl::OUString::createFromAscii( "text()" ) : +// linkNames_[ linkTypes_[ context - nTextNodes_ ] ] ); +// result.append(sal_Unicode( '[' ) ); +// result.append( seqNumbers_[ context ] ); +// result.append(sal_Unicode( "]/" ) ); +// } + + +// /* +// * XPath (forking) location of the hit +// */ + +// void ContextTables::hitLocation( sal_Int32 termsL,rtl::OUString* terms, +// sal_Int32 matchesL,sal_Int32* matches, +// StringBuffer& result ) +// { +// const sal_Int32 N = termsL; +// std::vector< sal_Int32 > stacks( N ); +// sal_Int32* wordNumbers = new sal_Int32[N]; +// std::vector< sal_Int32 > stack; +// sal_Int32 lastInitialWordIndex = -1; +// sal_Int32 pattern = 0,context = 0,nPopped = 0,pathStart = 0,pathEnd = 0; +// for( sal_Int32 i = 0,marker = 1; i < N; i++,marker <<= 1 ) +// if ( terms[i] ) +// { +// const sal_Int32 wordNumber = matches[i*2 + 1]; +// const sal_Int32 initialWordIndex = findIndexBin(wordNumber); +// wordNumbers[i] = wordNumber - initialWords_[initialWordIndex] + 1; +// if( initialWordIndex == lastInitialWordIndex ) // save work +// ; // do nothing, path will be reused +// else +// { +// pattern |= marker; +// stack = stacks[i] = new IntegerArray(); + +// context = 
initialWordIndex; +// do +// { +// const sal_Int32 parent = dests_[context]; +// if( parent != -1 ) +// { +// stack.add( context ); +// markers_[context] |= marker; +// context = parent; +// } +// else +// break; +// } +// while( true ); +// lastInitialWordIndex = initialWordIndex; +// } +// } + +// // find and output common path +// // process first non-missing match + +// sal_Int32 i = 0, marker = 1, nMissing = 0; + +// // find first non-missing matching term +// // there has to be at least one if the hit was built +// // count potential leading missing terms to output appropriate elements +// // before outputting elements for matches + +// for ( ; i < N; i++,marker <<= 1 ) +// if (terms[i] != null) +// { +// result.append( rtl::OUString::createFromAscii( "<Matches path=\"" ) ); +// stack = stacks[i]; +// while (stack.size() > 0) +// { +// context = stack.popLast(); +// if ( markers_[context] == pattern ) +// { +// markers_[context] = 0; +// appendSegment( context,result ); // belongs to common +// context = -1; // used +// ++nPopped; +// } +// else +// break; +// } +// // end of 'matches' && common path +// result.append("\">"); +// // output elements for any leading missingTerms +// while (--nMissing >= 0) +// result.append("<MissingTerm/>"); + +// result.append("<Match term=\""); +// result.append(terms[i]); +// result.append("\" path=\""); +// pathStart = result.getLength(); +// if (context != -1) +// { +// appendSegment(context, result); +// markers_[context] = 0; +// } +// while (stack.size() > 0 ) +// { +// context = stack.popLast(); +// appendSegment(context, result); +// markers_[context] = 0; +// } + +// pathEnd = result.length(); + +// result.append("\" tokenNumber=\""); +// result.append(wordNumbers[i]); +// result.append("]\"/>"); + +// break; // just the first non-zero +// } +// else +// ++nMissing; // only count leading missing terms + +// // process the remaining matches +// for (i++, marker <<= 1 ; i < N; i++, marker <<= 1) +// if (terms[i] != 
null) { +// result.append("<Match term=\""); +// result.append(terms[i]); +// result.append("\" path=\""); +// stack = stacks[i]; +// if (stack == null) // reuse previously generated path +// result.append(result.substring(pathStart, pathEnd)); +// else { +// stack.pop(nPopped); +// pathStart = result.length(); +// while (stack.cardinality() > 0) { +// context = stack.popLast(); +// appendSegment(context, result); +// _markers[context] = 0; +// } +// pathEnd = result.length(); +// } +// result.append("\" tokenNumber=\""); +// result.append(wordNumbers[i]); +// result.append("]\"/>"); +// } +// else +// result.append("<MissingTerm/>"); +// result.append("</Matches>"); +// } + + +// /* +// * QueryHitData is initialized in the caller +// * this function fills the commonPath for all matching terms +// * and relative paths for the individual terms +// */ + +// void ContextTables::hitLocation(String[] terms, sal_Int32[] matches, QueryHitData data) { +// StringBuffer buffer = new StringBuffer(512); +// const sal_Int32 N = terms.length; +// IntegerArray[] stacks = new IntegerArray[N]; +// sal_Int32[] wordNumbers = new sal_Int32[N]; +// IntegerArray stack; +// sal_Int32 lastInitialWordIndex = -1; +// sal_Int32 pattern = 0, nPopped = 0, pathStart = 0, pathEnd = 0; +// for (sal_Int32 i = 0, marker = 1; i < N; i++, marker <<= 1) +// if (terms[i] != null) { +// const sal_Int32 wordNumber = matches[i*2 + 1]; +// const sal_Int32 initialWordIndex = findIndexBin(wordNumber); +// wordNumbers[i] = wordNumber - _initialWords[initialWordIndex] + 1; +// if (initialWordIndex == lastInitialWordIndex) // save work +// ; // do nothing, path will be reused +// else { +// pattern |= marker; +// stack = stacks[i] = new IntegerArray(); +// for (sal_Int32 ctx = initialWordIndex;;) { +// const sal_Int32 parent = _dests[ctx]; +// if (parent != -1) { +// stack.add(ctx); +// _markers[ctx] |= marker; +// ctx = parent; +// } +// else +// break; +// } +// lastInitialWordIndex = initialWordIndex; +// } 
+// } +// // find and output common path +// // process first match +// StringBuffer path = new StringBuffer(256); +// String previousPath = null; // we may be copying subpaths from it +// sal_Int32 i = 0, marker = 1; +// for ( ; i < N; i++, marker <<= 1) +// if (terms[i] != null) { +// sal_Int32 context = 0; +// stack = stacks[i]; +// while (stack.cardinality() > 0) { +// context = stack.popLast(); +// if (_markers[context] == pattern) { +// _markers[context] = 0; +// appendSegment(context, path); // belongs to common +// context = -1; // used +// ++nPopped; +// } +// else +// break; +// } +// data.setCommonPath(path.toString()); +// // end of 'matches' && common path +// path.setLength(0); // will now be used for relative paths +// pathStart = 0; +// if (context != -1) { +// appendSegment(context, path); +// _markers[context] = 0; +// } +// while (stack.cardinality() > 0) { +// context = stack.popLast(); +// appendSegment(context, path); +// _markers[context] = 0; +// } +// pathEnd = path.length(); +// data.setMatchLocation(i, previousPath = path.toString(), wordNumbers[i]); +// break; // just the first non-zero +// } + +// // process the remaining matches +// for (i++, marker <<= 1 ; i < N; i++, marker <<= 1) +// if (terms[i] != null) { +// path.setLength(0); +// stack = stacks[i]; +// if (stack == null) // reuse previously generated path +// path.append(previousPath.substring(pathStart, pathEnd)); +// else { +// stack.pop(nPopped); +// pathStart = path.length(); +// while (stack.cardinality() > 0) { +// const sal_Int32 context = stack.popLast(); +// appendSegment(context, path); +// _markers[context] = 0; +// } +// pathEnd = path.length(); +// } +// data.setMatchLocation(i, previousPath = path.toString(), wordNumbers[i]); +// } +// } + +// private sal_Int32 ContextTables::findIndexBin(const sal_Int32 wordNumber) { +// sal_Int32 i = 0, j = _nTextNodes - 1; +// while (i <= j) { +// const sal_Int32 k = (i + j) >>> 1; +// if (_initialWords[k] < wordNumber) +// i = 
k + 1; +// else if (_initialWords[k] > wordNumber) +// j = k - 1; +// else +// return k; +// } +// return i - 1; +// } + + /* + public void addGoverningFiller(int query, RoleFiller rf, int parent) { + // !!! for now assume just one query + GoverningContext gc = null; + if (_governingContexts[parent] == null) { + // find parent governing context + for (int c = _dests[parent]; ; c = _dests[c]) + if (_governingContexts[c] != null || c == 0) { + // System.out.println("parent found at " + c); + gc = new GoverningContext(c, rf); + break; + } + } + else + gc = new GoverningContext(_governingContexts[parent], rf); + _governingContexts[parent] = gc; + } + */ + + +sal_Int32 ContextTables::wordContextLin(sal_Int32 wordNumber) +{ + for (sal_Int32 i = initialWordsIndex_; i < nTextNodes_; ++i ) + if (initialWords_[i] > wordNumber) + { // first such i + // - 1 if wordNumbers can be the same + initialWordsIndex_ = i; // cached to speed up next search + return i - 1; + } + return nTextNodes_ - 1; +} + diff --git a/xmlhelp/source/cxxhelp/qe/DocGenerator.cxx b/xmlhelp/source/cxxhelp/qe/DocGenerator.cxx new file mode 100644 index 000000000000..668d2db5ee9e --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/DocGenerator.cxx @@ -0,0 +1,447 @@ +/************************************************************************* + * + * $RCSfile: DocGenerator.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. 
+ * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_DOCGENERATOR_HXX_ +#include <qe/DocGenerator.hxx> +#endif +#ifndef _XMLSEARCH_QE_QUERY_HXX_ +#include <qe/Query.hxx> +#endif + + +using namespace xmlsearch; +using namespace xmlsearch::qe; + + +const sal_Int32 NonnegativeIntegerGenerator::END = -1; +const sal_Int32 ConceptGroupGenerator::NConceptsInGroup = 16; +const sal_Int32 ConceptGroupGenerator::BitsInLabel = 4; + + +RoleFiller RoleFiller::roleFiller_; + + +RoleFiller::RoleFiller( sal_Int32 nColumns, + ConceptData* first, + sal_Int32 role, + sal_Int32 pos, + sal_Int32 parentContext, + sal_Int32 limit ) + : next_( 0 ), + conceptData_( first ), + fixedRole_( sal_Int8( role) ) // primary/constitutive concept/role +{ + // cout << "RoleFiller constructed" << nColumns << ' ' << role << ' ' << pos << endl; + filled_ = sal_Int16( 1 << fixedRole_ ); + begin_ = pos; // offset in file + // _end = _begin + first.getConceptLength(); + end_ = begin_ + 1; + limit_ = limit; + parentContext_ = parentContext; + next_ = 0; + fillers_ = new RoleFiller*[ fillersL_ = nColumns ]; + for( int i = 0; i < fillersL_; ++i ) + fillers_[i] = 0; + fillers_[ role ] = this; +} + + +RoleFiller::~RoleFiller() +{ + for( int i = 0; i < fillersL_; ++i ) + delete fillers_[i]; + delete[] fillers_; +} + + +void RoleFiller::scoreList( Query* query,sal_Int32 document ) +{ + sal_Int32 nColumns = query->getNColumns(); + RoleFiller* candidateHit = this; // function called for the head of list + RoleFiller* next; // lookahead: if overlap, if so, is it better + + // 'candidateHit' always points at the current candidate to be converted to a QueryHit + // 'penalty' is its penalty + // 'next' is used to explore earlier overlapping fillers + // the decision to emit a QueryHit is made when either there's no next + // or next doesn't overlap the current candidate + // the loop's logic makes sure 
that at emit time there's no better/earlier filler + // to overlap with the candidate + + double penalty = candidateHit->penalty( query,nColumns ); + + for( next = candidateHit->next_; next; next = next->next_ ) + if( next->end_ < candidateHit->begin_ ) + { // no overlap + candidateHit->makeQueryHit( query,document,penalty ); + candidateHit = next; + penalty = candidateHit->penalty( query,nColumns ); + } + else + { // !!! can be computed in two steps + double penalty2 = next->penalty( query,nColumns ); + if( penalty2 <= penalty ) + { // prefer next, disregard candidateHit + penalty = penalty2; + candidateHit = next; + } + } + candidateHit->makeQueryHit(query,document,penalty); +} + + + + +void RoleFiller::makeQueryHit( Query* query,sal_Int32 doc,double penalty ) +{ + QueryHit* hit = query->maybeCreateQueryHit( penalty,doc, + begin_,end_,parentContext_ ); + if( hit ) + { + sal_Int32 N; + sal_Int32* matches = hit->getMatches( N ); + N /= 2; + + for( sal_Int32 i = 0,j = 0; i < N; ++i ) + if( filled_ & 1 << i ) + { + matches[ j++ ] = fillers_[ i ]->getConcept(); + matches[ j++ ] = fillers_[ i ]->begin_; + } + else + j += 2; + } +} + + + +sal_Int32 RoleFiller::getConcept() +{ + return conceptData_->getConcept(); +} + + + +void RoleFiller::use( std::vector< RoleFiller*>& place,sal_Int32 query ) +{ + RoleFiller* rf; + if( rf = place[ query ] ) + { + place[ query ] = this; // put at the head of list + next_ = rf; + while( rf->limit_ >= begin_ ) + { + // check if we can grow/improve a hit + // we don't ever replace filler's fixed role + if( fixedRole_ != rf->fixedRole_ && + // in same parent context eg. 
PARA + rf->parentContext_ == parentContext_ ) + { + if( ( rf->filled_ & ( 1 << fixedRole_ ) ) == 0 ) + { + // not filled yet + rf->filled_ |= 1 << fixedRole_; + rf->fillers_[ fixedRole_ ] = this; + rf->end_ = end_; + } + else + rf->considerReplacementWith( this ); + } + + if( rf->next_ ) + rf = rf->next_; + else + return; + } + } + else + place[query] = this; +} + + +void RoleFiller::considerReplacementWith( RoleFiller* replacement ) +{ + // !!! simplistic for now + // needs gap and out of order + sal_Int32 role = replacement->fixedRole_; + if( replacement->getScore() > fillers_[role]->getScore() ) + fillers_[ role ] = replacement; +} + + + +double RoleFiller::penalty( Query* query,sal_Int32 nColumns ) +{ + sal_Int32 length = end_ - begin_ + 1; + double penalty = query->lookupPenalty( filled_ ); + // !!! here is a chance to check against query + // if hit worth scoring further + // might not be if query already has lots of good hits + for( sal_Int32 i = 0; i < nColumns; ++i ) + if( filled_ & ( 1 << i ) ) + { + penalty += fillers_[i]->conceptData_->getPenalty(); + //length -= _fillers[i]._conceptData.getConceptLength() + 1; + length -= 2; // !!! ??? c.length is not used ? + if( filled_ >> (i + 1) ) + for( sal_Int32 j = i + 1; j < nColumns; ++j ) + if( ( filled_ & 1 << j ) && fillers_[j]->begin_ < begin_ ) + penalty += query->getOutOufOrderPenalty(); + } + double result = penalty + length * query->getGapPenalty(); + return result < 0.0 ? 0.0 : result; // !!! quick fix +} + + + + + +void NextDocGeneratorHeap::heapify( sal_Int32 i ) +{ + NextDocGenerator* temp; + for( sal_Int32 r,l,smallest; ; ) + { + r = ( i + 1 ) << 1; + l = r - 1; + smallest = ( l < heapSize_ && heap_[l]->smallerThan( heap_[i] ) ) ? 
l : i; + if( r < heapSize_ && heap_[r]->smallerThan( heap_[ smallest ] ) ) + smallest = r; + if( smallest != i ) + { + temp = heap_[ smallest ]; + heap_[ smallest ] = heap_[ i ]; + heap_[i] = temp; + i = smallest; + } + else + break; + } +} + + +void NextDocGeneratorHeap::step() throw( excep::XmlSearchException ) +{ + if( heap_[0]->next() != NonnegativeIntegerGenerator::END ) + heapify(0); + else if ( heapSize_ > 1 ) + { + heap_[0] = heap_[--heapSize_]; + heapify(0); + } + else + nonEmpty_ = false; +} + + +bool NextDocGeneratorHeap::atDocument( sal_Int32 document ) +{ + return nonEmpty_ && heap_[0]->getDocument() == document; +} + + +ConceptGroupGenerator::ConceptGroupGenerator( sal_Int32 dataL,sal_Int8* data,sal_Int32 index,sal_Int32 k ) + : last_( 0 ), + k1_( k ), + k2_( BitsInLabel ), + bits_( new util::ByteArrayDecompressor( dataL,data,index ) ), + table_( NConceptsInGroup ) +{ + for( sal_Int32 i = 0; i < NConceptsInGroup; ++i ) + table_[i] = 0; +} + + + +ConceptGroupGenerator::ConceptGroupGenerator() + : last_( 0 ), + k1_( 0 ), + k2_( BitsInLabel ), + bits_( 0 ), + table_( NConceptsInGroup ) +{ +} + + +ConceptGroupGenerator::~ConceptGroupGenerator() +{ + delete bits_; +} + + +void ConceptGroupGenerator::generateFillers( std::vector< RoleFiller* >& array ) +{ + cData_->generateFillers( array,last_ ); +} + + +bool ConceptGroupGenerator::next() throw( excep::XmlSearchException ) +{ + while( bits_->readNext( k1_,this ) ) + { + sal_Int32 bla = bits_->read( k2_ ); +// cout << bla << endl; + if( cData_ = table_[ bla ] ) + return true; + } + return false; +} + + +sal_Int32 ConceptGroupGenerator::decodeConcepts( sal_Int32 k, + sal_Int32 shift, + sal_Int32 *concepts ) + throw( excep::XmlSearchException ) +{ + return bits_->ascendingDecode( k,shift,concepts ); +} + + + +void ConceptGroupGenerator::init( sal_Int32 bytesL,sal_Int8* bytes,sal_Int32 index,sal_Int32 k ) +{ + k1_ = k; + delete bits_; + bits_ = new util::ByteArrayDecompressor( bytesL,bytes,index ); + last_ = 0; 
+ for( sal_Int32 i = 0;i < NConceptsInGroup; i++ ) + { + delete table_[i]; + table_[i] = 0; + } +} + + + +bool GeneratorHeap::start( std::vector< RoleFiller* >& array ) throw( xmlsearch::excep::XmlSearchException ) +{ + if( ( heapSize_ = heap_.size() ) > 0 ) + { + for( sal_Int32 i = 0; i < heapSize_; ++i ) + heap_[i]->next(); + + buildHeap(); + heap_[0]->generateFillers( array ); + return true; + } + else + return false; +} + + +bool GeneratorHeap::next( std::vector< RoleFiller* >& array ) throw( xmlsearch::excep::XmlSearchException ) +{ + if( heapSize_ > 0 ) + { + if( ! heap_[0]->next() ) // no more + if( heapSize_ > 1) + heap_[0] = heap_[--heapSize_]; + else + { + heapSize_ = 0; + return false; + } + heapify(0); + heap_[0]->generateFillers( array ); + return true; + } + else + return false; +} + + + +void GeneratorHeap::reset() +{ + for( sal_uInt32 i = 0; i < heap_.size(); ++i ) + delete heap_[i]; + heap_.clear(); + heapSize_ = 0; +} + + + +void GeneratorHeap::buildHeap() +{ + for( sal_Int32 i = heapSize_/2; i >= 0; i-- ) + heapify(i); +} + + +void GeneratorHeap::heapify( sal_Int32 root ) +{ + for( sal_Int32 smallest = 0; ; ) + { + sal_Int32 right = ( root + 1 ) << 1; + sal_Int32 left = right - 1; + smallest = ( left < heapSize_ && heap_[left]->position() < heap_[ root ]->position() ) ? 
left : root; + if( right< heapSize_ && heap_[right]->position() < heap_[smallest]->position() ) + smallest = right; + if( smallest != root ) + { + ConceptGroupGenerator* temp = heap_[smallest]; + heap_[smallest] = heap_[root]; + heap_[root] = temp; + root = smallest; + } + else + break; + } +} + diff --git a/xmlhelp/source/cxxhelp/qe/Query.cxx b/xmlhelp/source/cxxhelp/qe/Query.cxx new file mode 100644 index 000000000000..8c4dde5e5bf6 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/Query.cxx @@ -0,0 +1,387 @@ +/************************************************************************* + * + * $RCSfile: Query.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_QUERY_HXX_ +#include <qe/Query.hxx> +#endif +#ifndef _XMLSEARCH_QE_XMLINDEX_HXX_ +#include <qe/XmlIndex.hxx> +#endif +#ifndef _XMLSEARCH_QE_CONCEPTDATA_HXX_ +#include <qe/ConceptData.hxx> +#endif +#ifndef _XMLSEARCH_QE_QUERYPROCESSOR_HXX_ +#include <qe/QueryProcessor.hxx> +#endif +#ifndef _XMLSEARCH_QE_CONTEXTTABLES_HXX_ +#include <qe/ContextTables.hxx> +#endif + + +using namespace xmlsearch::qe; + + +/******************************************************************************/ +/* */ +/* HitStore */ +/* */ +/******************************************************************************/ + + +HitStore::HitStore( double initialStandard,sal_Int32 limit,sal_Int32 nColumns ) + : standard_( initialStandard ), + heap_( limit_ = limit ), + nColumns_( nColumns ), + free_( 0 ), + index_( 0 ) +{ + for( sal_uInt32 i = 0; i < heap_.size(); ++i ) + heap_[i] = 0; +} + + + +HitStore::~HitStore() +{ + for( sal_uInt32 i = 0; i < heap_.size(); ++i ) + delete heap_[i]; +} + + + +bool HitStore::goodEnough( double penalty, sal_Int32 begin, sal_Int32 end ) +{ + return free_ == limit_ ? 
heap_[0]->worseThan( penalty,begin,end ) : true; +} + + + +QueryHit* HitStore::createQueryHit( double penalty,sal_Int32 doc,sal_Int32 begin,sal_Int32 end ) +{ + QueryHit* hit = new QueryHit( nColumns_,penalty,doc,begin,end ); + if( free_ == limit_ ) + { // goodEnough'ness checked already + delete heap_[0]; + heap_[0] = hit; + heapify( 0 ); + standard_ = heap_[0]->getPenalty(); + } + else if( free_ < limit_ ) + { + heap_[ free_++ ] = hit; + if( free_ == limit_ ) + { // we have the needed number + for( sal_Int32 i = free_/2; i >= 0; --i ) // build heap + heapify( i ); + standard_ = heap_[0]->getPenalty(); + } + } + return hit; +} + + +#include <stl/algorithm> + + +QueryHit* HitStore::firstBestQueryHit() +{ + if( free_ > 0) + { + // std::sort( heap_.begin(),heap_.end() ); + // Arrays.sort( heap_,0,free_ ); + index_ = 0; + return nextBestQueryHit(); + } + else + return 0; +} + + +QueryHit* HitStore::nextBestQueryHit() +{ + return index_ < free_ ? heap_[ index_++ ] : 0; +} + + +void HitStore::heapify( sal_Int32 i ) +{ + for( sal_Int32 r,l,worst; ; ) + { + r = (i + 1) << 1; l = r - 1; + worst = l < free_ && heap_[i]->betterThan( heap_[l] ) ? l : i; + if( r < free_ && heap_[ worst ]->betterThan( heap_[r] ) ) + worst = r; + if (worst != i) + { + QueryHit* temp = heap_[ worst ]; + heap_[ worst ] = heap_[ i ]; + heap_[i] = temp; + i = worst; // continue + } + else + break; + } +} + + + + +/******************************************************************************/ +/* */ +/* Query */ +/* */ +/******************************************************************************/ + + +#define MissingTermPenalty 10.0 + + +Query::Query( XmlIndex* env, + sal_Int32 nColumns, + sal_Int32 nHits, + sal_Int32 missingPenaltiesL, + double* missingPenalties ) + : env_( env ), + ctx_( env ? 
env->getContextInfo() : 0 ), + nColumns_( nColumns ), + nHitsRequested_( nHits ), + missingPenalty_( new double[ missingPenaltyL_ = nColumns_ ] ) , + upperboundTemplate_( new double[ upperboundTemplateL_ = nColumns_ ] ), + penaltiesL_( missingPenaltiesL ), + penalties_( missingPenalties ), + currentStandard_( nColumns * MissingTermPenalty - 0.0001 ), + missingTermsPenalty_( 0.0 ), + store_( currentStandard_,nHits,nColumns ), + ignoredElementsL_( 0 ), + ignoredElements_( 0 ) +{ + // for the EmptyQuery case (awaits arch improvement pass) + + //cout << " upperboundTemplateL_ = " << upperboundTemplateL_ << endl; + + if( missingPenalties ) + for( sal_Int32 i = 0;i < nColumns_; ++i ) + missingPenalty_[i] = missingPenalties[i]; + else + for( sal_Int32 i = 0;i < nColumns_; ++i ) + missingPenalty_[i] = MissingTermPenalty; + + makePenaltiesTable(); + // _roleFillerList = RoleFiller.STOP; +} + + +Query::~Query() +{ + delete[] missingPenalty_; + delete[] upperboundTemplate_; + delete[] penalties_; + delete[] ignoredElements_; +} + + +void Query::setIgnoredElements( const rtl::OUString& element ) +{ +} + + + +void Query::setIgnoredElements( const sal_Int32 ignoredElementsL,const rtl::OUString* ignoredElements ) +{ + if( ctx_ ) + ignoredElements_ = ctx_->getIgnoredElementsSet( ignoredElementsL_, + ignoredElementsL,ignoredElements ); + + if( ! ctx_ ) ignoredElementsL_ = 0; +} + + + +void Query::missingTerms( sal_Int32 nMissingTerms ) +{ + missingTermsPenalty_ = MissingTermPenalty * nMissingTerms; +} + + + +ConceptData* Query::makeConceptData( sal_Int32 col,sal_Int32 concept,double penalty,sal_Int32 queryNo ) +{ + return new ConceptData( concept,col,penalty,queryNo,nColumns_,env_->getContextInfo() );; +} + + +void Query::getHits( std::vector< QueryHitData* >& data,sal_Int32 n ) +{ + if( n <= 0 ) + return; + + QueryHit* qh = store_.firstBestQueryHit(); + + while( qh ) + { + data.push_back( env_->hitToData( qh ) ); + qh = data.size() < sal_uInt32( n ) ? 
store_.nextBestQueryHit() : 0; + } +} + + +QueryHit* Query::maybeCreateQueryHit( double penalty, + sal_Int32 doc, sal_Int32 begin, sal_Int32 end, sal_Int32 parentContext) +{ + // hits are located using only terms actually present in text + // if B is not present, the query A B C reduces to A C and penalties + // are computed as if B did not occur in query + // to meaningfully merge results from different servers, some of which + // may have B, penalty has to be normalized to the common computing scheme + + return + ( store_.goodEnough( penalty += missingTermsPenalty_,begin,end ) + && ( ! ignoredElements_ || ctx_->notIgnored( parentContext,ignoredElementsL_,ignoredElements_ ) ) ) + ? + store_.createQueryHit( penalty,doc,begin,end ) + : + 0; +} + + +void Query::makePenaltiesTable() +{ + sal_Int32 nPatterns = 1 << nColumns_; + delete[] penalties_; + penalties_ = new double[ penaltiesL_ = nPatterns ]; + for (sal_Int32 i = 0; i < nPatterns; ++i ) + penalties_[i] = computePenalty(i); +} + + +double Query::computePenalty( sal_Int32 n ) +{ + double penalty = 0.0; + for( sal_Int32 i = 0; i < nColumns_; ++i ) + if( ( n & 1 << i ) == 0 ) + penalty += missingPenalty_[i]; + return penalty; +} + + +void Query::resetForNextDocument() +{ + currentStandard_ = store_.getCurrentStandard(); + // "everything's missing" + for( sal_Int32 i = 0; i < nColumns_; i++ ) + upperboundTemplate_[i] = missingPenalty_[i]; + vote_ = false; +} + + +bool Query::vote() +{ + double sum = 0.0; + for( sal_Int32 i = 0; i < nColumns_; i++ ) + sum += upperboundTemplate_[i]; + return vote_ = (sum <= currentStandard_ ); +} + + + +/******************************************************************************/ +/* */ +/* QueryHitIterator */ +/* */ +/******************************************************************************/ + + + +QueryHitIterator::QueryHitIterator( const QueryResults* result ) + : result_( result ), + index_( -1 ) +{ +} + + +QueryHitIterator::~QueryHitIterator() +{ + delete result_; +} + + 
+bool QueryHitIterator::next() +{ + return accessible_ = ( ++index_ < sal_Int32( result_->queryHits_.size() ) ); +} + + +QueryHitData* QueryHitIterator::getHit( const PrefixTranslator* ) const +{ + if( accessible_ ) + return result_->queryHits_[index_]; + else + return 0; +} + + +double QueryHitIterator::getPenalty() +{ + if( accessible_ ) + return result_->queryHits_[index_]->getPenalty(); + else + return 1.0E30; +} + diff --git a/xmlhelp/source/cxxhelp/qe/QueryProcessor.cxx b/xmlhelp/source/cxxhelp/qe/QueryProcessor.cxx new file mode 100644 index 000000000000..2f10d5740fa9 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/QueryProcessor.cxx @@ -0,0 +1,227 @@ +/************************************************************************* + * + * $RCSfile: QueryProcessor.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_QUERYPROCESSOR_HXX_ +#include <qe/QueryProcessor.hxx> +#endif +#ifndef _XMLSEARCH_DB_DBENV_HXX_ +#include <db/DBEnv.hxx> +#endif + + + +using namespace std; +using namespace xmlsearch; +using namespace xmlsearch::qe; + + +const double QueryProcessor::INFLpenalty = 0.0; + + +QueryProcessor::QueryProcessor( const rtl::OUString& installDir ) + : env_( new XmlIndex( installDir ) ) +{ +} + + +QueryProcessor::~QueryProcessor() +{ + delete env_; +} + + + +QueryResults* QueryProcessor::processQuery( const QueryStatement& ment ) +{ + Search search( env_ ); + Query* query = processQuery( search,ment ); + query->setIgnoredElements( rtl::OUString::createFromAscii("") ); + search.startSearch(); + return makeQueryResults( query,ment.getHitCount() ); +} + + +#ifdef ABIDEBUG +extern ostream& operator<<( ostream& out,const rtl::OUString& bla ); +#endif + + +Query* QueryProcessor::processQuery( Search& search,const QueryStatement& ment ) +{ + sal_Int32 nValidTerms = 0, nMissingTerms = 0, nContentTerms = 0; + double variantPenalty = 0.0; + + const sal_Int32 nHits = ment.getHitCount(); + const rtl::OUString scope = ment.getScope(); + const vector< rtl::OUString >& terms = ment.getTerms(); + const sal_Int32 nTerms = terms.size(); + + vector< sal_Int32 > primary( nTerms ); + vector< sal_Int32 > missingTerms( nTerms ); + vector< vector< sal_Int32 > > columns( nTerms ); + + for( int i = 0; i < nTerms; ++i ) + { + const sal_Int32 lgt = terms[i].getLength(); + const sal_Unicode* str = terms[i].getStr(); + + if( str[0] == sal_Unicode('+') ) + { + // poor man's escape for query control + // not implemented yet + } + else + { + ++nContentTerms; + rtl::OUString term = terms[i].toLowerCase(); + sal_Int32 id = 0; + std::vector< sal_Int32 > ids; + if( str[0] == sal_Unicode('\"') ) + { + id = env_->fetch( term.copy( 1 ) ); // goes 
to BtreeDict::fetch + } + else if( str[lgt-1] == sal_Unicode( '*' ) ) + { +// cout << term.copy( 0,lgt - 1 ) << endl; + ids = env_->withPrefix( term.copy( 0,lgt - 1 ) ); // goes to BtreeDict::withPrefix + variantPenalty = 0.0; +// for( int i = 0; i < ids.size(); ++i ) +// cout << ids[i] << endl; + } + else + { + sal_Int32 formID; + id = env_->fetch( term ); + + // cout << id << endl; + + // std::vector< rtl::OUString > variants( morph_->getVariants( term ) ); + std::vector< rtl::OUString > variants; + + for( sal_uInt32 i = 0; i < variants.size(); ++i ) + { + formID = env_->fetch( variants[i] ); + if( formID > 0 && formID != id ) + ids.push_back( formID ); + } + variantPenalty = INFLpenalty; + } + + if( ids.size() > 0 || id > 0 ) + { + columns[ nValidTerms ] = ids; + primary[ nValidTerms++ ] = id; + } + else + { + ++nMissingTerms; + // !!! not used now (intended to fill out appropriate missing terms in QueryHits + missingTerms.push_back( nContentTerms - 1 ); + } + + } + } + +// try +// { +// cout << env_->documentName( 1 ) << endl; +// } +// catch( const excep::XmlSearchException& e ) +// { +// cout << e.getMessage() << endl; +// } + +#ifdef ABIDEBUG +// cout << scope <<endl; +// cout << nValidTerms << endl; +// cout << nMissingTerms << endl; +// cout << nHits << endl; +// for( int a = 0; a < primary.size(); ++a ) +// { +// // cout << primary[a] << endl; +// for( int b = 0; b < columns[a].size(); ++b ) +// cout << columns[a][b] << endl; +// } +// exit( 1 ); +#endif + + + return search.addQuery( scope, + nValidTerms,nMissingTerms,nHits, + variantPenalty, + primary, + columns ); +} + + + +QueryResults::QueryResults( Query* query, sal_Int32 nHits ) +{ + if( query ) + query->getHits( queryHits_,nHits ); +} + + + +QueryResults* QueryProcessor::makeQueryResults( Query* query,sal_Int32 nHits ) +{ + return new QueryResults( query,nHits ); +} + diff --git a/xmlhelp/source/cxxhelp/qe/Search.cxx b/xmlhelp/source/cxxhelp/qe/Search.cxx new file mode 100644 index 
000000000000..bcc115fa6353 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/Search.cxx @@ -0,0 +1,625 @@ +/************************************************************************* + * + * $RCSfile: Search.cxx,v $ + * + * $Revision: 1.1 $ + * + * last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ + * + * The Contents of this file are made available subject to the terms of + * either of the following licenses + * + * - GNU Lesser General Public License Version 2.1 + * - Sun Industry Standards Source License Version 1.1 + * + * Sun Microsystems Inc., October, 2000 + * + * GNU Lesser General Public License Version 2.1 + * ============================================= + * Copyright 2000 by Sun Microsystems, Inc. + * 901 San Antonio Road, Palo Alto, CA 94303, USA + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. 
+ * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. + * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_SEARCH_HXX_ +#include <qe/Search.hxx> +#endif + + +using namespace std; +using namespace xmlsearch; +using namespace xmlsearch::qe; + + +class QueryFactoryImpl +{ +public: + + Query* makeQuery( XmlIndex* env,const rtl::OUString& context,sal_Int32 nColumns,sal_Int32 nHits); + +}; + + +Query* QueryFactoryImpl::makeQuery( XmlIndex* env, + const rtl::OUString& context, + sal_Int32 nColumns, + sal_Int32 nHits ) +{ + // ContextTables* contextTables = env->getContextInfo(); + + if( ! 
context.getLength() ) + { + // cout << "contextlength zero" << endl; + return new Query( env,nColumns,nHits,0,0 ); + } + else if( context.indexOf( sal_Unicode( '|' ) ) != -1 ) + { + return 0; // needs to be modified + } + else if( context.indexOf( rtl::OUString::createFromAscii( "//" ) ) != -1 ) + { + return 0; // needs to be modified + } + else if( context.indexOf( sal_Unicode( '/' ) ) != -1 ) + { + return 0; // needs to be modified + } + else if( context.indexOf( sal_Unicode( '@' ) ) != -1 ) + { + return 0; // needs to be modified + } + else if( context.indexOf( sal_Unicode( '[' ) ) != -1 ) + { + return 0; // needs to be modified + } + else + { + return 0; // needs to be modified + } +} + + + +Search::Search( XmlIndex* env ) + : env_( env ), + queryFactory_( 0 ), + nextDocGenHeap_(), + limit_( 0 ), + firstGenerator_(), + dataL_( 0 ), + data_( 0 ), + base_( 0 ), + concepts_( new sal_Int32[ ConceptGroupGenerator::NConceptsInGroup ] ) +{ +} + + + +Search::~Search() +{ + delete queryFactory_; + + sal_uInt32 i; + + for( i = 0; i < queries_.size(); ++i ) + delete queries_[i]; + + for( i = 0; i < conceptData_.size(); ++i ) + delete conceptData_[i]; + + delete[] concepts_; +} + + + + +Query* Search::addQuery( const rtl::OUString& context, + sal_Int32 nValidTerms,sal_Int32 nMissingTerms,sal_Int32 nHits, + double variantPenalty, + const std::vector< sal_Int32 >& primary, + const std::vector< std::vector< sal_Int32 > >& columns ) +{ + // by now, scope == context + if( ! 
queryFactory_ ) + queryFactory_ = new QueryFactoryImpl(); + + Query* query = queryFactory_->makeQuery( env_,context,nValidTerms,nHits ); + query->missingTerms( nMissingTerms ); + queries_.push_back( query ); + + for( sal_Int32 i = 0; i < nValidTerms; ++i ) + { + if( primary[i] > 0 ) + addTerm( i,primary[i],0.0 ); + + for( sal_uInt32 j = 0; j < columns[i].size(); ++j ) + addTerm( i,columns[i][j],variantPenalty ); + } + + // start stop + query->addControlConceptData( this,queries_.size()-1 ); + return query; +} + + +#ifdef ABIDEBUG +extern ostream& operator<<( ostream& out,const rtl::OUString& bla ); +#endif + + +void Search::startSearch() +{ +#ifdef ABIDEBUG +// for( int k = 0; k < conceptData_.size(); ++k ) +// { +// if( ! conceptData_[k] ) +// continue; + +// cout << conceptData_[k]->getConcept() << endl; +// cout << conceptData_[k]->getPenalty() << endl; +// cout << sal_Int32( conceptData_[k]->getRole() ) << endl; +// cout << sal_Int32( conceptData_[k]->getQuery() ) << endl; +// cout << conceptData_[k]->getScore() << endl; +// } +#endif + + sal_Int32 i,j; + // set up ConceptData lists + // order search terms + sal_Int32 _free2 = conceptData_.size(); + quicksort( 0, _free2 - 1); + + // remove duplicates + for (i = 0; i < _free2 - 1; i = j) + for (j = i + 1; j < _free2; j++) + if( conceptData_[i]->crqEquals( conceptData_[j] ) ) + conceptData_[j] = 0; + else + i = j; + + // create lists + for( i = 0; i < _free2 - 1; i = j ) + for (j = i + 1; j < _free2; j++ ) + if( conceptData_[j] ) + if( conceptData_[i]->cEquals( conceptData_[j] ) ) + { + conceptData_[i]->addLast( conceptData_[j] ); + conceptData_[j] = 0; + } + else + i = j; + + // densify + for (i = 0; i < _free2 - 1; i++) + if( ! conceptData_[i] ) + for( j = i + 1; j < _free2; j++) + if (conceptData_[j] ) + { + conceptData_[i] = conceptData_[j]; + conceptData_[j] = 0; + break; + } + +#ifdef ABIDEBUG + for( i = 0; i < conceptData_.size(); ++i ) + { + if( ! 
conceptData_[i] ) + continue; + +// cout << conceptData_[i]->getConcept() << endl; +// cout << conceptData_[i]->getPenalty() << endl; +// cout << sal_Int32( conceptData_[i]->getRole() ) << endl; +// cout << sal_Int32( conceptData_[i]->getQuery() ) << endl; +// cout << conceptData_[i]->getScore() << endl; + } +#endif + + // set up new document generators + nextDocGenHeap_.reset(); + for( i = 0; i < _free2 && conceptData_[i]; i++) + { +#ifdef ABIDEBUG +// if( conceptData_[i] ) +// cout << rtl::OUString::createFromAscii( "conceptData not zero" ) << endl; +#endif + + NextDocGenerator* gen = new NextDocGenerator( conceptData_[i],env_ ); + try + { + sal_Int32 doc; + gen->first(); + if( ( doc = gen->getDocument() ) != NonnegativeIntegerGenerator::END ) + { + /* !!! ??? is concept length used any more in any way + conceptData_[i]. + setConceptLength(_env. + getConceptLength(conceptData_[i].getConcept())); + */ +// cout << "doc = " << doc << endl; + nextDocGenHeap_.addGenerator( gen ); + } + } + catch( ... 
) + { +#ifdef ABIDEBUG + // cout << "Search::startSearch -> catched exception" << endl; +#endif + } + } + + nextDocGenHeap_.start(); + env_->reset(); + env_->resetContextSearch(); + searchDocument(); +} + + + + +void Search::addTerm( sal_Int32 col,sal_Int32 concept,double score ) +{ +#ifdef ABIDEBUG +// cout << concept << endl; +#endif + if( env_->occursInText( concept ) ) + { +#ifdef ABIDEBUG +// cout << "occurs" << endl; +#endif + conceptData_.push_back( queries_[queries_.size()-1]->makeConceptData( col,concept,score,queries_.size()-1 ) ); + } +#ifdef ABIDEBUG +// else +// cout << "does not occur" << endl; +#endif +} + + + + + +void Search::searchDocument() +{ + std::vector< RoleFiller* > start( queries_.size() ); + do + { + try + { + switch( nextDocument( start ) ) + { + case 0: // multi group + genHeap_.start( start ); + while( genHeap_.next( start ) ) + ; + break; + + case 1: // single group + while( firstGenerator_.next() ) + firstGenerator_.generateFillers( start ); + break; + + case 2: // reached the end + return; + } + } + catch( const excep::XmlSearchException& ) + { + continue; + } + + RoleFiller* next; + for( sal_uInt32 i = 0; i < queries_.size(); ++i ) + { + if( ( next = start[i] ) != 0 && next != RoleFiller::STOP() ) + next->scoreList( queries_[i],document_ ); + else if( queries_[i]->zoned() ) + { + RoleFiller* rfs = queries_[i]->getRoleFillers(); + if( rfs && rfs != RoleFiller::STOP() ) + rfs->scoreList( queries_[i],document_ ); + } + } + genHeap_.reset(); + } + while( nextDocGenHeap_.isNonEmpty() ); +} + + +sal_Int32 Search::nextDocument( std::vector< RoleFiller* >& start ) throw( xmlsearch::excep::XmlSearchException ) +{ + while( nextDocGenHeap_.isNonEmpty() ) + { // still something to do + sal_uInt32 i; + for( i = 0; i < queries_.size(); ++i ) + if( queries_[i] ) + queries_[i]->resetForNextDocument(); + + // gather all concepts this document has + // and store associated conceptData + sal_Int32 index = 0; + document_ = 
nextDocGenHeap_.getDocument();
    docConcepts_.clear();
    queryMasks_.clear();
    // collect, for the current document, every heap entry's concept,
    // query-mask and term data; runBy() notifies the interested queries
    do
    {
        docConcepts_.push_back( nextDocGenHeap_.getConcept() );
        queryMasks_.push_back( nextDocGenHeap_.getQueryMask() );
        ( conceptData_[ index++ ] = nextDocGenHeap_.getTerms() )->runBy( queries_ );
        nextDocGenHeap_.step();
    }
    while( nextDocGenHeap_.atDocument( document_) );

    // if there is no saturation model, some query will always vote YES
    // and so every document will be opened
    // even in this case, however, savings can be achieved by not generating fillers
    // for some queries (not scoring, etc)
    // and, with more care, creation of some GroupGenerators can be avoided
    // saturating queries with lots of good hits will lead to best results

    rtl::OUString docName = env_->documentName( document_);


#ifdef ABIDEBUG
//      if( document_ == 148 )
//      {
//          cout << "_document = " << document_ << endl;
//          cout << "docName = " << env_->documentName( document_) << endl;
//      }
//      else
//      {
//
//          cout << "_document = " << document_ << endl;
//          cout << "docName = " << docName << endl;
//      }
#endif

    // let every query vote on opening this document;
    // bit i of voteMask is set when queries_[i] voted YES
    sal_Int32 voteMask = 0;
    Query* query;
    for( i = 0; i < queries_.size(); ++i )
    {
        query = queries_[i];
        if( query )
        {
            query->saveRoleFillers( 0 );
            if( query->vote() )
            {
                // normal reset
                start[i] = query->zoned() ? RoleFiller::STOP() : 0;
                voteMask |= 1 << i;
            }
            else
                start[i] = RoleFiller::STOP();  // prohibit setting
        }
    }

    // we may eliminate some ConceptGroupGenerators
    // those which would be used only by Queries which voted NO
    if( voteMask != 0 )
    {   // need to open up document
        ConceptGroupGenerator* gen;
        // !!! don't gather Fillers for disinterested Queries
        if( openDocumentIndex( document_ ) )
        {   // multi group
            // set up all needed generators
            sal_Int32 i = 0;
            while( ( queryMasks_[i] & voteMask ) == 0 )
                ++i;
            // assert(i < index);
            sal_Int32 c = docConcepts_[i];
            sal_Int32 group = 0;
            // find first group
            while( c > maxConcepts_[ group ] && ++group < limit_ )
                ;
            gen = makeGenerator( group );
            gen->addTerms( indexOf(c), conceptData_[i] );

            // feed the remaining per-document terms into generators,
            // switching to the next group whenever the concept exceeds
            // the current group's maximum (max_)
            for( ++i; i < index; i++ )
                if( ( queryMasks_[i] & voteMask ) > 0 )
                {
                    c = docConcepts_[i];
                    if( c > max_ )
                    {   // need to find another group
                        // assert(group < _limit);
                        while( c > maxConcepts_[ group ] && ++group < limit_ ) ;
                        gen = makeGenerator( group );
                    }
                    gen->addTerms( indexOf(c), conceptData_[i] );
                }
            return 0;
        }
        else
        {   // single group
            for( sal_Int32 i = 0; i < index; i++ )
                if( queryMasks_[i] & voteMask )
// #ifdef ABIDEBUG
//                  {
//                      sal_Int32 bla = indexOf( docConcepts_[i] );
//                      cout << "indexOf = " << bla << endl;
//                      firstGenerator_.addTerms( bla,conceptData_[i] );
//                  }
// #elif
                    firstGenerator_.addTerms( indexOf( docConcepts_[i] ),conceptData_[i] );
// #endif
            return 1;
        }
    }
    return 2;
}




// Loads the micro-index of document docNo from the POSITIONS data.
// Returns true when the index is stored as multiple concept groups
// (their tables are decoded lazily via makeGenerator()), false when the
// single group was decoded directly into firstGenerator_.
// Index types carrying extents (1 and 3) are not implemented and throw.
bool Search::openDocumentIndex( sal_Int32 docNo ) throw( excep::XmlSearchException )
{
    data_ = env_->getPositions( dataL_,docNo );
    base_ = env_->getDocumentIndex( docNo );

    startingIndex_ = 0;
    sal_Int32 kk = data_[ base_ ] & 0xFF, k2;

    switch( kk >> 6 )
    {   // get type
        case 0:  // single group, no extents
            k2 = data_[base_ + 1];
            firstGenerator_.init( dataL_,data_,base_ += 2,k2 );
            // decode concept table
            nConcepts_ = firstGenerator_.decodeConcepts( kk & 0x3F,0,concepts_ );
//          cout << "false" << endl;
            return false;

        case 2:  // multi group, no extents
        {
            kTable_.clear();
            offsets_.clear();
            maxConcepts_.clear();
            util::ByteArrayDecompressor compr( dataL_,data_,base_ + 1 );
            compr.decode( kk & 0x3F,kTable_ );

            // the last two decoded values are the k parameters for the
            // group-offset table and the max-concept table, respectively
            sal_Int32 last = kTable_.back();
            kTable_.pop_back();
            compr.ascDecode( last,offsets_ );
            last = kTable_.back();
            kTable_.pop_back();
            compr.ascDecode( last,maxConcepts_ );

            base_ += 1 + compr.bytesRead();
            limit_ = maxConcepts_.size();
        }
//          cout << "true" << endl;
            return true;

        case 1:  // single group, extents
        case 3:  // multi group, extents
            throw excep::XmlSearchException( rtl::OUString::createFromAscii( "extents not yet implemented\n" ) );
    }

//  cout << "false1" << endl;
    return false;
}





// Creates and registers a ConceptGroupGenerator for one concept group
// of a multi-group micro-index, decoding its concept table into
// concepts_.  Updates max_ (largest concept covered by the group) and
// resets startingIndex_ for subsequent indexOf() lookups.
ConceptGroupGenerator* Search::makeGenerator( sal_Int32 group )
    throw( excep::XmlSearchException )
{
    sal_Int32 shift,index;
    if( group > 0 )
    {
        index = base_ + offsets_[ group - 1 ];
        shift = maxConcepts_[ group - 1 ];
    }
    else
    {
        index = base_;
        shift = 0;
    }

    // initialize generator
    ConceptGroupGenerator* gen =
        new ConceptGroupGenerator( dataL_,data_,index,kTable_[ 2*group + 1 ] );
    // decode concept table
    nConcepts_ = gen->decodeConcepts( kTable_[2*group],shift,concepts_ );

    if( group < limit_ )
        max_ = concepts_[ nConcepts_ ] = maxConcepts_[ group ];
    else
        max_ = concepts_[ nConcepts_ - 1 ];

    genHeap_.addGenerator( gen );
    startingIndex_ = 0;  // in _concepts; lower search index
    return gen;
}



// Binary search for `concept` in concepts_[startingIndex_ .. nConcepts_].
// On a hit the lower bound is advanced past it, so repeated lookups
// appear to be expected in ascending concept order (see callers above);
// a miss is treated as an internal error and throws.
sal_Int32 Search::indexOf(sal_Int32 concept) throw( excep::XmlSearchException )
{
    sal_Int32 i = startingIndex_,j = nConcepts_,k;
    while( i <= j )
        if( concepts_[ k = (i + j)/2 ] < concept )
            i = k + 1;
        else if( concept < concepts_[k] )
            j = k - 1;
        else
        {
            startingIndex_ = k + 1;
            return k;
        }
    throw excep::XmlSearchException( rtl::OUString::createFromAscii( "indexOf not found" ) );
}




// Hoare partition of conceptData_[p..r] around the middle element
// (ordering given by ConceptData::compareWith); returns the split index.
sal_Int32 Search::partition( sal_Int32 p,sal_Int32 r )
{
    ConceptData* x = conceptData_[ ((p + r) >> 1) & 0x7FFFFFFF ];
    sal_Int32 i = p - 1, j = r + 1;
    while( true )
    {
        while( x->compareWith( conceptData_[--j] ) )
            ;
        while( conceptData_[++i]->compareWith( x ) )
            ;
        if( i < j )
        {   // swap the out-of-order pair
            ConceptData* t = conceptData_[i];
            conceptData_[i] = conceptData_[j];
            conceptData_[j] = t;
        }
        else
            return j;
    }
}



// Quicksort of conceptData_[p..r]; the second recursive call is turned
// into iteration (tail-call elimination), bounding stack depth.
void Search::quicksort( sal_Int32 p,sal_Int32 r )
{
    while (p < r)
    {
        sal_Int32 q = partition( p,r );
        quicksort(p, q);
        p = q + 1;
    }
}

// ---------------------------------------------------------------------
// (next file in this change: xmlhelp/source/cxxhelp/qe/XmlIndex.cxx)
// ---------------------------------------------------------------------
/*************************************************************************
 *
 *  $RCSfile: XmlIndex.cxx,v $
 *
 *  $Revision: 1.1 $
 *
 *  last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $
 *
 *  The Contents of this file are made available subject to the terms of
 *  either of the following licenses
 *
 *         - GNU Lesser General Public License Version 2.1
 *         - Sun Industry Standards Source License Version 1.1
 *
 *  Sun Microsystems Inc., October, 2000
 *
 *  GNU Lesser General Public License Version 2.1
 *  =============================================
 *  Copyright 2000 by Sun Microsystems, Inc.
 *  901 San Antonio Road, Palo Alto, CA 94303, USA
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License version 2.1, as published by the Free Software Foundation.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * + * Sun Industry Standards Source License Version 1.1 + * ================================================= + * The contents of this file are subject to the Sun Industry Standards + * Source License Version 1.1 (the "License"); You may not use this file + * except in compliance with the License. You may obtain a copy of the + * License at http://www.openoffice.org/license.html. + * + * Software provided under this License is provided on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, + * MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. + * See the License for the specific provisions governing your rights and + * obligations concerning the Software. + * + * The Initial Developer of the Original Code is: Sun Microsystems, Inc. + * + * Copyright: 2000 by Sun Microsystems, Inc. + * + * All Rights Reserved. 
+ * + * Contributor(s): _______________________________________ + * + * + ************************************************************************/ +#ifndef _XMLSEARCH_QE_XMLINDEX_HXX_ +#include <qe/XmlIndex.hxx> +#endif +#ifndef _XMLSEARCH_QE_DOCGENERATOR_HXX_ +#include <qe/DocGenerator.hxx> +#endif +#ifndef _XMLSEARCH_UTIL_CONCEPTLIST_HXX_ +#include <util/ConceptList.hxx> +#endif +#ifndef _XMLSEARCH_UTIL_RANDOMACCESSSTREAM_HXX_ +#include <util/RandomAccessStream.hxx> +#endif +#ifndef _XMLSEARCH_UTIL_DECOMPRESSOR_HXX_ +#include <util/Decompressor.hxx> +#endif + + +using namespace xmlsearch; +using namespace xmlsearch::qe; + + +// extern sal_Int32 getInteger_( const sal_Int8* ); + + +XmlIndex::XmlIndex( const rtl::OUString& indexDir ) + : indexAccessor_( indexDir ), + dict_( 0 ), + documents_( 0 ), + concepts_( 0 ), + allLists_( 0 ), + allListsL_( 0 ), + positionsL_( 0 ), + positions_( 0 ), + contextsDataL_( 0 ), + contextsData_( 0 ), + contextTables_( 0 ) +{ + // reading DOCS + { + allListsL_ = indexAccessor_.readByteArray( allLists_, + rtl::OUString::createFromAscii("DOCS") ); // reading DOCS + } + + // reading CONTEXTS + { + contextsDataL_ = indexAccessor_.readByteArray( contextsData_, + rtl::OUString::createFromAscii("CONTEXTS") ); // reading CONTEXTS + } + + // reading POSITIONS + { + positionsFile_ = indexAccessor_.getStream( rtl::OUString::createFromAscii( "POSITIONS" ), + rtl::OUString::createFromAscii( "r" ) ); + + //!!! 
temporary: better than fixed large value, worse than 'intelligent' size mgt + if( allInCache_ = true ) // yes, intended + { + reset(); + positions_ = new sal_Int8[ positionsL_ = positionsFile_->length() ]; + positionsFile_->readBytes( positions_,positionsL_ ); + } + } + + + // reading DOCS.TAB + { + util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "DOCS.TAB" ), + rtl::OUString::createFromAscii( "r" ) ); + sal_Int8 a[4]; + a[0] = a[1] = a[2] = 0; + in->readBytes( &a[3],1 ); + sal_Int32 k1 = ::getInteger_( a ); + util::StreamDecompressor sddocs( in ); + sddocs.ascDecode( k1,concepts_ ); + in->readBytes( &a[3],1 ); + sal_Int32 k2 = ::getInteger_( a ); + offsets_.push_back( 0 ); + util::StreamDecompressor sdoffsets( in ); + sdoffsets.ascDecode( k2,offsets_ ); + delete in; + +// int a; +// for( a = 0; a < offsets_.size(); ++a ) +// cout << "concepts_[" << a << "] = " << concepts_[a] << endl; +// for( a = 0; a < offsets_.size(); ++a ) +// cout << "offsets_[" << a << "] = " << offsets_[a] << endl; + } + + // reading OFFSETS + { + util::RandomAccessStream* in = indexAccessor_.getStream( rtl::OUString::createFromAscii( "OFFSETS" ), + rtl::OUString::createFromAscii( "r" ) ); + sal_Int8 a[4]; + a[0] = a[1] = a[2] = 0; + in->readBytes( &a[3],1 ); + sal_Int32 k1 = ::getInteger_( a ); + util::StreamDecompressor sddocs( in ); + sddocs.decode( k1,documents_ ); + in->readBytes( &a[3],1 ); + sal_Int32 k2 = ::getInteger_( a ); + util::StreamDecompressor sdoffsets( in ); + sdoffsets.ascDecode( k2,microIndexOffsets_ ); + in->readBytes( &a[3],1 ); + sal_Int32 k3 = ::getInteger_( a ); + util::StreamDecompressor sdtitles( in ); + sdtitles.decode( k3,titles_ ); + + in->readBytes( &a[3],1 ); + sal_Int32 k4 = ::getInteger_( a ); + // contextsOffsets_ = new IntegerArray(_documents.cardinality() + 1); + util::StreamDecompressor co(in); + // _contextsOffsets.add(0); // first, trivial offset + co.ascDecode( k4,contextsOffsets_ ); + + delete in; + } + + // 
Hard coding linknames ( object serialization is hard to undo ) + { + linkNames_ = new rtl::OUString[ linkNamesL_ = 8 ]; + linkNames_[0] = rtl::OUString::createFromAscii( "help:link" ); + linkNames_[1] = rtl::OUString::createFromAscii( "help:help-text" ); + linkNames_[2] = rtl::OUString::createFromAscii( "text:p" ); + linkNames_[3] = rtl::OUString::createFromAscii( "text:span" ); + linkNames_[4] = rtl::OUString::createFromAscii( "headingheading" ); + linkNames_[5] = rtl::OUString::createFromAscii( "office:body" ); + linkNames_[6] = rtl::OUString::createFromAscii( "help:to-be-embedded" ); + linkNames_[7] = rtl::OUString::createFromAscii( "office:document" ); + } + + + { + contextTables_ = new ContextTables( contextsOffsets_, + contextsDataL_,contextsData_, + linkNamesL_,linkNames_ ); + } +} + + +XmlIndex::~XmlIndex() +{ + delete[] allLists_; + delete[] contextsData_; + delete[] linkNames_; + delete[] positions_; + delete positionsFile_; + delete contextTables_; +} + + + +void XmlIndex::reset() +{ + maxDocNumberInCache_ = ( allInCache_ ? 
( microIndexOffsets_.size() - 1 ) : sal_Int32( -1 ) ); +} + + +sal_Int32 binarySearch( const std::vector<sal_Int32>& arr,sal_Int32 value ) +{ + sal_Int32 i = 0, j = arr.size(), k; + while (i <= j) + if (arr[k = (i + j)/2] < value) + i = k + 1; + else if (value < arr[k]) + j = k - 1; + else + return k; + return -1; +} + + +NonnegativeIntegerGenerator* XmlIndex::getDocumentIterator( sal_Int32 concept ) +{ +// #ifdef ABIDEBUG +// cout << concept << endl; +// #endif + + sal_Int32 index = binarySearch( concepts_,concept ); + +#ifdef ABIDEBUG +// cout << index << " " << allListsL_ << " " << allLists_ << endl; + +// for( int i = 0; i < allListsL_; ++i ) +// cout << "_allList[" << i << "] = " << sal_Int32( allLists_[i] ) << endl; + +// for( int i = 0; i < offsets_.size(); ++i ) +// cout << "offsets[" << i << "] = " << offsets_[i] << endl; +#endif + + if( index >= 0 ) + return new util::ConceptList( allLists_,allListsL_,offsets_[index] ); + else + return 0; +} + + +bool XmlIndex::occursInText( sal_Int32 concept ) +{ + return binarySearch( concepts_,concept) >= 0; +} + + +sal_Int8* XmlIndex::getPositions( sal_Int32& len,sal_Int32 docNo ) throw( excep::XmlSearchException ) +{ + contextTables_->setMicroindex( docNo ); + if( docNo > maxDocNumberInCache_ ) + readMicroindexes( docNo ); + + len = positionsL_; + return positions_; +} + + +rtl::OUString XmlIndex::documentName( sal_Int32 docNumber ) throw( excep::XmlSearchException ) +{ + if( docNumber < 0 || documents_.size() <= sal_uInt32( docNumber ) ) + { + rtl::OUString message = rtl::OUString::createFromAscii( "XmlIndex::documentName -> " ); + throw excep::XmlSearchException( message ); + } + + return dict_.fetch( documents_[ docNumber ] ); +} + + + + +void XmlIndex::readMicroindexes( sal_Int32 docNo ) throw( xmlsearch::excep::IOException ) +{ + currentBatchOffset_ = microIndexOffsets_[docNo]; + sal_Int32 offsetLimit = currentBatchOffset_ + positionsL_; + sal_Int32 upTo = 0, nextDoc = docNo; + sal_Int32 lastOffset = 0; + + do + 
{ + if( ++nextDoc == sal_Int32( microIndexOffsets_.size() ) ) + lastOffset = sal_Int32( positionsFile_->length() ); + else if( microIndexOffsets_[ nextDoc ] > offsetLimit ) + lastOffset = microIndexOffsets_[ nextDoc ]; + } + while( lastOffset == 0 ); + + if( lastOffset > offsetLimit ) + { + upTo = microIndexOffsets_[ nextDoc - 1 ]; + maxDocNumberInCache_ = nextDoc - 2; + } + else + { + upTo = lastOffset; + maxDocNumberInCache_ = nextDoc - 1; + } + + if( maxDocNumberInCache_ < docNo ) + { // cache too small + // for current microindex + // System.out.println("expanding cache to " + _positionsCacheSize); + delete[] positions_; + positions_ = new sal_Int8[ positionsL_ = lastOffset - currentBatchOffset_ ]; + readMicroindexes( docNo ); + return; + } + + positionsFile_->seek( currentBatchOffset_ ); + positionsFile_->readBytes( positions_,upTo - currentBatchOffset_ ); +} diff --git a/xmlhelp/source/cxxhelp/qe/makefile.mk b/xmlhelp/source/cxxhelp/qe/makefile.mk new file mode 100644 index 000000000000..dfc51d4cae06 --- /dev/null +++ b/xmlhelp/source/cxxhelp/qe/makefile.mk @@ -0,0 +1,94 @@ +#************************************************************************* +# +# $RCSfile: makefile.mk,v $ +# +# $Revision: 1.1 $ +# +# last change: $Author: abi $ $Date: 2001-05-08 12:02:45 $ +# +# The Contents of this file are made available subject to the terms of +# either of the following licenses +# +# - GNU Lesser General Public License Version 2.1 +# - Sun Industry Standards Source License Version 1.1 +# +# Sun Microsystems Inc., October, 2000 +# +# GNU Lesser General Public License Version 2.1 +# ============================================= +# Copyright 2000 by Sun Microsystems, Inc. +# 901 San Antonio Road, Palo Alto, CA 94303, USA +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License version 2.1, as published by the Free Software Foundation. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307 USA +# +# +# Sun Industry Standards Source License Version 1.1 +# ================================================= +# The contents of this file are subject to the Sun Industry Standards +# Source License Version 1.1 (the "License"); You may not use this file +# except in compliance with the License. You may obtain a copy of the +# License at http://www.openoffice.org/license.html. +# +# Software provided under this License is provided on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, +# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS, +# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING. +# See the License for the specific provisions governing your rights and +# obligations concerning the Software. +# +# The Initial Developer of the Original Code is: Sun Microsystems, Inc. +# +# Copyright: 2000 by Sun Microsystems, Inc. +# +# All Rights Reserved. +# +# Contributor(s): _______________________________________ +# +# +# +#************************************************************************* + +PRJ=..$/..$/.. 

# Project/target names for the dmake build; AUTOSEG enables automatic
# code segmentation on platforms that support it.
PRJNAME= xmlhelp
TARGET= jaqe
AUTOSEG= TRUE

# C++ exceptions are required by the query engine (throw specifications).
ENABLE_EXCEPTIONS=TRUE

# --- Settings -----------------------------------------------------

.INCLUDE :  svpre.mk
.INCLUDE :  settings.mk
.INCLUDE :  sv.mk

# --- Files --------------------------------------------------------

# Skip the object list when only headers are being delivered.
.IF "$(header)" == ""

# Object files of the XmlSearch query engine.
SLOFILES=\
    $(SLO)$/ConceptData.obj   \
    $(SLO)$/ContextTables.obj \
    $(SLO)$/DocGenerator.obj  \
    $(SLO)$/Query.obj         \
    $(SLO)$/QueryProcessor.obj \
    $(SLO)$/Search.obj        \
    $(SLO)$/XmlIndex.obj
.ENDIF

# --- Targets ------------------------------------------------------

.INCLUDE :  target.mk
