/************************************************************************* * * OpenOffice.org - a multi-platform office productivity suite * * $RCSfile: Query.cxx,v $ * * $Revision: 1.11 $ * * last change: $Author: obo $ $Date: 2006-09-17 01:19:32 $ * * The Contents of this file are made available subject to * the terms of GNU Lesser General Public License Version 2.1. * * * GNU Lesser General Public License Version 2.1 * ============================================= * Copyright 2005 by Sun Microsystems, Inc. * 901 San Antonio Road, Palo Alto, CA 94303, USA * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software Foundation. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA * ************************************************************************/ // MARKER(update_precomp.py): autogen include statement, do not remove #include "precompiled_xmlhelp.hxx" #ifndef _XMLSEARCH_QE_QUERY_HXX_ #include #endif #ifndef _XMLSEARCH_QE_XMLINDEX_HXX_ #include #endif #ifndef _XMLSEARCH_QE_CONCEPTDATA_HXX_ #include #endif #ifndef _XMLSEARCH_QE_QUERYPROCESSOR_HXX_ #include #endif #ifndef _XMLSEARCH_QE_CONTEXTTABLES_HXX_ #include #endif using namespace xmlsearch::qe; sal_Int32* QueryHit::getMatches( sal_Int32& matchesL ) { matchesL = matchesL_; return matches_; } /******************************************************************************/ /* */ /* HitStore */ /* */ /******************************************************************************/ HitStore::HitStore( double initialStandard,sal_Int32 limit,sal_Int32 nColumns ) : limit_( limit ), nColumns_( nColumns ), index_( 0 ), free_( 0 ), standard_( initialStandard ), heap_( limit ) { for( sal_uInt32 i = 0; i < heap_.size(); ++i ) heap_[i] = 0; } HitStore::~HitStore() { for( sal_uInt32 i = 0; i < heap_.size(); ++i ) delete heap_[i]; } bool HitStore::goodEnough( double penalty, sal_Int32 begin, sal_Int32 end ) { return free_ == limit_ ? heap_[0]->worseThan( penalty,begin,end ) : true; } QueryHit* HitStore::createQueryHit( double penalty,sal_Int32 doc,sal_Int32 begin,sal_Int32 end ) { QueryHit* hit = new QueryHit( nColumns_,penalty,doc,begin,end ); if( free_ == limit_ ) { // goodEnough'ness checked already delete heap_[0]; heap_[0] = hit; heapify( 0 ); standard_ = heap_[0]->getPenalty(); } else if( free_ < limit_ ) { heap_[ free_++ ] = hit; if( free_ == limit_ ) { // we have the needed number for( sal_Int32 i = free_/2; i >= 0; --i ) // build heap heapify( i ); standard_ = heap_[0]->getPenalty(); } } return hit; } struct CompareQueryHit { bool operator()( const QueryHit* l,const QueryHit* r ) { return l->compareTo( r ); } }; #include QueryHit* HitStore::firstBestQueryHit() { if( free_ > 0) { CompareQueryHit bla; heap_.resize( free_ ); std::stable_sort( heap_.begin(),heap_.end(),bla ); index_ = 0; return nextBestQueryHit(); } else return 0; } QueryHit* HitStore::nextBestQueryHit() { return index_ < free_ ? heap_[ index_++ ] : 0; } void HitStore::heapify( sal_Int32 i ) { for( sal_Int32 r,l,worst; ; ) { r = (i + 1) << 1; l = r - 1; worst = l < free_ && heap_[i]->betterThan( heap_[l] ) ? l : i; if( r < free_ && heap_[ worst ]->betterThan( heap_[r] ) ) worst = r; if (worst != i) { QueryHit* temp = heap_[ worst ]; heap_[ worst ] = heap_[ i ]; heap_[i] = temp; i = worst; // continue } else break; } } // sal_Int32 HitStore::partition( sal_Int32 p,sal_Int32 r ) // { // QueryHit* x = heap_[ ((p + r) >> 1) & 0x7FFFFFFF ]; // sal_Int32 i = p - 1, j = r + 1; // while( true ) // { // while( x->compareTo( heap_[--j] ) ) // ; // while( heap_[++i]->compareTo( x ) ) // ; // if( i < j ) // { // QueryHit* t = heap_[i]; // heap_[i] = heap_[j]; // heap_[j] = t; // } // else // return j; // } // } // void HitStore::quicksort( sal_Int32 p,sal_Int32 r ) // { // while( p < r ) // { // sal_Int32 q = partition( p,r ); // quicksort(p, q); // p = q + 1; // } // } /******************************************************************************/ /* */ /* Query */ /* */ /******************************************************************************/ #define MissingTermPenalty 10.0 Query::Query( XmlIndex* env, sal_Int32 nColumns, sal_Int32 nHits, sal_Int32 missingPenaltiesL, double* missingPenalties ) : env_( env ), ctx_( env ? env->getContextInfo() : 0 ), store_( nColumns * MissingTermPenalty - 0.0001,nHits,nColumns ), nHitsRequested_( nHits ), nColumns_( nColumns ), currentStandard_( nColumns * MissingTermPenalty - 0.0001 ), missingPenaltyL_( nColumns ), upperboundTemplateL_( nColumns ), penaltiesL_( missingPenaltiesL ), missingPenalty_( new double[ nColumns ] ), upperboundTemplate_( new double[ nColumns ] ), penalties_( missingPenalties ), ignoredElementsL_( 0 ), ignoredElements_( 0 ), missingTermsPenalty_( 0.0 ) { // for the EmptyQuery case (awaits arch improvement pass) if( missingPenalties ) for( sal_Int32 i = 0;i < nColumns_; ++i ) missingPenalty_[i] = missingPenalties[i]; else for( sal_Int32 i = 0;i < nColumns_; ++i ) missingPenalty_[i] = MissingTermPenalty; makePenaltiesTable(); // _roleFillerList = RoleFiller.STOP; } Query::~Query() { delete[] missingPenalty_; delete[] upperboundTemplate_; delete[] penalties_; delete[] ignoredElements_; } void Query::setIgnoredElements( const sal_Int32 ignoredElementsL,const rtl::OUString* ignoredElements ) { if( ctx_ ) ignoredElements_ = ctx_->getIgnoredElementsSet( ignoredElementsL_, ignoredElementsL,ignoredElements ); if( ! ctx_ ) { ignoredElementsL_ = 0; ignoredElements_ = 0; } } void Query::missingTerms( sal_Int32 nMissingTerms ) { missingTermsPenalty_ = MissingTermPenalty * nMissingTerms; } ConceptData* Query::makeConceptData( sal_Int32 col,sal_Int32 concept,double penalty,sal_Int32 queryNo ) { return new ConceptData( concept,col,penalty,queryNo,nColumns_,env_->getContextInfo() );; } void Query::getHits( std::vector< QueryHitData* >& data,sal_Int32 n ) { if( n <= 0 ) return; QueryHit* qh = store_.firstBestQueryHit(); while( qh ) { data.push_back( env_->hitToData( qh ) ); qh = data.size() < sal_uInt32( n ) ? store_.nextBestQueryHit() : 0; } } QueryHit* Query::maybeCreateQueryHit( double penalty, sal_Int32 doc, sal_Int32 begin, sal_Int32 end, sal_Int32 parentContext ) { // hits are located using only terms actually present in text // if B is not present, the query A B C reduces to A C and penalties // are computed as if B did not occur in query // to meaningfully merge results from different servers, some of which // may have B, penalty has to be normalized to the common computing scheme QueryHit* res = ( store_.goodEnough( penalty += missingTermsPenalty_,begin,end ) && ( ! ignoredElements_ || ctx_->notIgnored( parentContext,ignoredElementsL_,ignoredElements_ ) ) ) ? store_.createQueryHit( penalty,doc,begin,end ) : 0; return res; } void Query::makePenaltiesTable() { sal_Int32 nPatterns = 1 << nColumns_; delete[] penalties_; penalties_ = new double[ penaltiesL_ = nPatterns ]; for (sal_Int32 i = 0; i < nPatterns; ++i ) penalties_[i] = computePenalty(i); } double Query::computePenalty( sal_Int32 n ) { double penalty = 0.0; for( sal_Int32 i = 0; i < nColumns_; ++i ) if( ( n & 1 << i ) == 0 ) penalty += missingPenalty_[i]; return penalty; } void Query::resetForNextDocument() { currentStandard_ = store_.getCurrentStandard(); // "everything's missing" for( sal_Int32 i = 0; i < nColumns_; i++ ) upperboundTemplate_[i] = missingPenalty_[i]; vote_ = false; } bool Query::vote() { double sum = 0.0; for( sal_Int32 i = 0; i < nColumns_; i++ ) sum += upperboundTemplate_[i]; return vote_ = (sum <= currentStandard_ ); } void Query::updateEstimate( sal_Int32 role,double penalty ) { if( penalty < upperboundTemplate_[ role ] ) upperboundTemplate_[ role ] = penalty; } /******************************************************************************/ /* */ /* QueryHitIterator */ /* */ /******************************************************************************/ QueryHitIterator::QueryHitIterator( const QueryResults* result ) : index_( -1 ), result_( result ) { } QueryHitIterator::~QueryHitIterator() { delete result_; } bool QueryHitIterator::next() { return accessible_ = ( ++index_ < sal_Int32( result_->queryHits_.size() ) ); } QueryHitData* QueryHitIterator::getHit( const PrefixTranslator* ) const { if( accessible_ ) return result_->queryHits_[index_]; else return 0; } double QueryHitIterator::getPenalty() { if( accessible_ ) return result_->queryHits_[index_]->getPenalty(); else return 1.0E30; }