author     Kurt Zenker <kz@openoffice.org>  2008-06-24 15:18:42 +0000
committer  Kurt Zenker <kz@openoffice.org>  2008-06-24 15:18:42 +0000
commit     e82de9330ef0f972a4544f1841c3fee1049aac15 (patch)
tree       dfd713b21ae78c1d93fb4f1c6b95a8af1546e96b /xmlhelp/source/com
parent     d9c33e48174f32f2010c40bbb42081a689c2d319 (diff)
INTEGRATION: CWS ab52 (1.13.6); FILE MERGED
2008/06/18 10:35:42 ab 1.13.6.1: #i83625# Migration to Lucene
Diffstat (limited to 'xmlhelp/source/com')
-rw-r--r--  xmlhelp/source/com/sun/star/help/HelpLinker.cxx  |  5038
1 file changed, 197 insertions(+), 4841 deletions(-)
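
The bulk of this commit removes the hand-rolled btree/dictionary index writer from HelpLinker.cxx and adds a much smaller IndexerPreProcessor that prepares input for the Lucene-based help indexer: each help document is run through a caption stylesheet and a content stylesheet, and the resulting plain text is written into per-module "caption" and "content" directories. The following is a minimal, self-contained sketch of that libxslt technique, not the committed code; the file names (text.xhp, idxcaption.xsl, caption.txt) are hypothetical stand-ins for the paths the tool computes at run time.

#include <cstdio>
#include <libxml/parser.h>
#include <libxslt/xslt.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>

int main()
{
    // Hypothetical input document and stylesheet; the commit passes the
    // idxcaption/idxcontent stylesheets into IndexerPreProcessor instead.
    xmlDocPtr doc = xmlParseFile("text.xhp");
    xsltStylesheetPtr sheet =
        xsltParseStylesheetFile((const xmlChar*)"idxcaption.xsl");
    if (!doc || !sheet)
        return 1;

    // Apply the stylesheet; the result document's first child carries the
    // plain text the indexer consumes.
    xmlDocPtr res = xsltApplyStylesheet(sheet, doc, NULL);
    xmlNodePtr node = res ? res->xmlChildrenNode : NULL;
    if (node && node->content)
    {
        // One pure-text file per help document, as processDocument() does.
        if (FILE* out = fopen("caption.txt", "w"))
        {
            fprintf(out, "%s\n", node->content);
            fclose(out);
        }
    }

    if (res)
        xmlFreeDoc(res);
    xsltFreeStylesheet(sheet);
    xmlFreeDoc(doc);
    return 0;
}

Building such a sketch would typically use the libxslt pkg-config flags, e.g. g++ sketch.cxx $(pkg-config --cflags --libs libxslt).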
diff --git a/xmlhelp/source/com/sun/star/help/HelpLinker.cxx b/xmlhelp/source/com/sun/star/help/HelpLinker.cxx
index ddd5efb4055f..d35ae39e2e89 100644
--- a/xmlhelp/source/com/sun/star/help/HelpLinker.cxx
+++ b/xmlhelp/source/com/sun/star/help/HelpLinker.cxx
@@ -7,7 +7,7 @@
* OpenOffice.org - a multi-platform office productivity suite
*
* $RCSfile: HelpLinker.cxx,v $
- * $Revision: 1.13 $
+ * $Revision: 1.14 $
*
* This file is part of OpenOffice.org.
*
@@ -35,9 +35,6 @@
#include <string.h>
#include <limits.h>
-#include <boost/shared_ptr.hpp>
-#include <boost/tokenizer.hpp>
-
#include <libxslt/xslt.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>
@@ -57,21 +54,101 @@
#include <expat/xmlparse.h>
#endif
-class JarOutputStream
+class IndexerPreProcessor
{
private:
- fs::path filename;
- std::ostringstream perlline;
+ std::string m_aModuleName;
+ fs::path m_fsIndexBaseDir;
+ fs::path m_fsCaptionFilesDirName;
+ fs::path m_fsContentFilesDirName;
+
+ xsltStylesheetPtr m_xsltStylesheetPtrCaption;
+ xsltStylesheetPtr m_xsltStylesheetPtrContent;
+
public:
- JarOutputStream();
- void setname(const fs::path &name) { filename = name; }
- const fs::path& getname() const { return filename; }
- void addFile(const std::string &name, const std::string &key);
- void addTree(const std::string &dir, const std::string &key);
- void dontCompress(const std::string &key);
- void commit();
+ IndexerPreProcessor( const std::string& aModuleName, const fs::path& fsIndexBaseDir,
+ const fs::path& idxCaptionStylesheet, const fs::path& idxContentStylesheet );
+ ~IndexerPreProcessor();
+
+ void processDocument( xmlDocPtr doc, const std::string& EncodedDocPath );
};
+IndexerPreProcessor::IndexerPreProcessor
+ ( const std::string& aModuleName, const fs::path& fsIndexBaseDir,
+ const fs::path& idxCaptionStylesheet, const fs::path& idxContentStylesheet )
+ : m_aModuleName( aModuleName )
+ , m_fsIndexBaseDir( fsIndexBaseDir )
+{
+ m_fsCaptionFilesDirName = fsIndexBaseDir / "caption";
+ fs::create_directory( m_fsCaptionFilesDirName );
+
+ m_fsContentFilesDirName = fsIndexBaseDir / "content";
+ fs::create_directory( m_fsContentFilesDirName );
+
+ m_xsltStylesheetPtrCaption = xsltParseStylesheetFile
+ ((const xmlChar *)idxCaptionStylesheet.native_file_string().c_str());
+ m_xsltStylesheetPtrContent = xsltParseStylesheetFile
+ ((const xmlChar *)idxContentStylesheet.native_file_string().c_str());
+}
+
+IndexerPreProcessor::~IndexerPreProcessor()
+{
+ if( m_xsltStylesheetPtrCaption )
+ xsltFreeStylesheet( m_xsltStylesheetPtrCaption );
+ if( m_xsltStylesheetPtrContent )
+ xsltFreeStylesheet( m_xsltStylesheetPtrContent );
+}
+
+
+std::string getEncodedPath( const std::string& Path )
+{
+ rtl::OString aOStr_Path( Path.c_str() );
+ rtl::OUString aOUStr_Path( rtl::OStringToOUString
+ ( aOStr_Path, fs::getThreadTextEncoding() ) );
+ rtl::OUString aPathURL;
+ osl::File::getFileURLFromSystemPath( aOUStr_Path, aPathURL );
+ rtl::OString aOStr_PathURL( rtl::OUStringToOString
+ ( aPathURL, fs::getThreadTextEncoding() ) );
+ std::string aStdStr_PathURL( aOStr_PathURL.getStr() );
+ return aStdStr_PathURL;
+}
+
+void IndexerPreProcessor::processDocument
+ ( xmlDocPtr doc, const std::string &EncodedDocPath )
+{
+ std::string aStdStr_EncodedDocPathURL = getEncodedPath( EncodedDocPath );
+
+ xmlDocPtr resCaption = xsltApplyStylesheet( m_xsltStylesheetPtrCaption, doc, NULL );
+ xmlNodePtr pResNodeCaption = resCaption->xmlChildrenNode;
+ if( pResNodeCaption )
+ {
+ fs::path fsCaptionPureTextFile_docURL = m_fsCaptionFilesDirName / aStdStr_EncodedDocPathURL;
+ std::string aCaptionPureTextFileStr_docURL = fsCaptionPureTextFile_docURL.native_file_string();
+ FILE* pFile_docURL = fopen( aCaptionPureTextFileStr_docURL.c_str(), "w" );
+ if( pFile_docURL )
+ {
+ fprintf( pFile_docURL, "%s\n", pResNodeCaption->content );
+ fclose( pFile_docURL );
+ }
+ }
+ xmlFreeDoc(resCaption);
+
+ xmlDocPtr resContent = xsltApplyStylesheet( m_xsltStylesheetPtrContent, doc, NULL );
+ xmlNodePtr pResNodeContent = resContent->xmlChildrenNode;
+ if( pResNodeContent )
+ {
+ fs::path fsContentPureTextFile_docURL = m_fsContentFilesDirName / aStdStr_EncodedDocPathURL;
+ std::string aContentPureTextFileStr_docURL = fsContentPureTextFile_docURL.native_file_string();
+ FILE* pFile_docURL = fopen( aContentPureTextFileStr_docURL.c_str(), "w" );
+ if( pFile_docURL )
+ {
+ fprintf( pFile_docURL, "%s\n", pResNodeContent->content );
+ fclose( pFile_docURL );
+ }
+ }
+ xmlFreeDoc(resContent);
+}
+
struct Data
{
std::vector<std::string> _idList;
@@ -128,4572 +205,40 @@ public:
}
};
-namespace PrefixTranslator
-{
- std::string translatePrefix(const std::string &input)
- {
- if (input.find("vnd.sun.star.help://") == 0)
- return std::string("#HLP#") + input.substr(strlen("vnd.sun.star.help://"));
- else
- return input;
- }
-}
-
-class IndexAccessor
-{
- fs::path _dirName;
-public:
- IndexAccessor(const fs::path &dirName) : _dirName(dirName) {}
- IndexAccessor(const IndexAccessor &another) { _dirName = another._dirName; }
- fs::path indexFile(const std::string &name) const { return _dirName / name; }
- std::ifstream* getLineInput(const std::string &name);
- std::fstream* getOutputStream(const std::string &name);
- std::vector<unsigned char> readByteArray(const std::string &fileName);
- void clear();
- std::fstream *getRAF(const std::string &name, bool update) throw( HelpProcessingException );
- void createIfNeeded() {}
-};
-
-std::ifstream* IndexAccessor::getLineInput(const std::string &name)
-{
- return new std::ifstream(indexFile(name).native_file_string().c_str());
-}
-
-std::fstream* IndexAccessor::getOutputStream(const std::string &name)
-{
- return new std::fstream(indexFile(name).native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary);
-}
-
-std::vector<unsigned char> IndexAccessor::readByteArray(const std::string &fileName)
-{
- std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
- std::vector<unsigned char> ret(1024*16);
- int i=0;
- while (in.good())
- {
- int len = in.readsome((char *)&ret[i], 1024*16);
- if (!len)
- break;
- i += len;
- ret.resize(i+1024*16);
- }
- ret.resize(i);
- return ret;
-}
-
-std::fstream* IndexAccessor::getRAF(const std::string &name, bool update)
- throw( HelpProcessingException )
-{
- std::fstream *_file = new std::fstream;
- fs::path fullname = indexFile(name);
- if (!update)
- {
- _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::binary);
- }
- else
- {
- _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary);
- if (!_file->is_open())
- {
- HCDBG(std::cerr << "didn't exist" << std::endl);
- _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
- }
- if (!_file->is_open())
- {
- std::stringstream aStrStream;
- aStrStream << "Cannot open " << name << std::endl;
- throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
- }
- }
- return _file;
-}
-
-void IndexAccessor::clear()
-{
-#if 0
- File thisDir = indexFile(".");
- File[] components = thisDir.listFiles();
- if (components != null)
- for (int i = 0; i < components.length; i++)
- components[i].delete();
-#endif
-}
-
-typedef std::vector< std::string > VectorLines;
-
-class Schema : public IndexAccessor
-{
-private:
- static std::string PartName;
- bool _update;
- VectorLines _lines;
-public:
- Schema(const IndexAccessor &index, bool update);
- std::ifstream* getSchemaLineInput() { return getLineInput(PartName); }
- void read();
- Stringtable parameters(const std::string &name) const;
- void update(const std::string &partName, const std::string &parameters);
- void save();
-};
-
-std::string Schema::PartName = "SCHEMA";
-
-
-class startsWith
-{
-public:
- startsWith(const std::string &in) : str(in) {}
- bool operator() ( const std::string &in ) const { return (in.find(str) == 0); }
-private:
- const std::string &str;
-};
-
-void Schema::update(const std::string &partName, const std::string &inparameters)
-{
- VectorLines::iterator aEnd = std::remove_if(_lines.begin(), _lines.end(), startsWith(partName));
- if (aEnd != _lines.end()) _lines.erase(aEnd, _lines.end());
- _lines.push_back(partName + " " + inparameters);
-}
-
-Stringtable Schema::parameters(const std::string &name) const
-{
- Stringtable result;
- VectorLines::const_iterator aEnd = _lines.end();
- for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter)
- {
- if (aIter->find(name) == 0)
- {
- boost::char_separator<char> sep(" =");
- boost::tokenizer< boost::char_separator<char> > tokens(name, sep);
- boost::tokenizer< boost::char_separator<char> >::const_iterator it = tokens.begin();
- ++it; // skip name
- while(it != tokens.end())
- {
- const std::string &part1 = *it;
- ++it;
- if (it == tokens.end())
- break;
- const std::string &part2 = *it;
- result[part1] = part2;
- ++it;
- }
- break;
- }
- }
- return result;
-}
-
-Schema::Schema(const IndexAccessor &index, bool inupdate) : IndexAccessor(index),
- _update(inupdate)
-{
- read();
-}
-
-#ifdef UNX
-#define MAX_LINE PATH_MAX
-#else
-#define MAX_LINE _MAX_PATH
-#endif
-
-void Schema::read()
-{
- std::ifstream* in = getSchemaLineInput();
- char line[MAX_LINE];
- // This needs to be replaced with our XML Parser
- while (in->getline(line, MAX_LINE))
- _lines.push_back(line);
- delete in;
-}
-
-void Schema::save()
-{
- if (_update)
- {
- std::fstream* out = getOutputStream(PartName);
- *out << "JavaSearch 1.0\n";
- VectorLines::const_iterator aEnd = _lines.end();
- for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter)
- *out << *aIter << '\n';
- delete out;
- }
-}
-
-class DBPartParameters
-{
- Schema &_schema;
- std::string _partName;
- Stringtable _parameters;
-protected:
- bool parametersKnown() const;
- void updateSchema(const std::string &parameters) { _schema.update(_partName, parameters); }
-public:
- DBPartParameters(Schema &schema, const std::string &partName);
- int integerParameter(const std::string &name);
-};
-
-DBPartParameters::DBPartParameters(Schema &schema, const std::string &partName)
- : _schema(schema), _partName(partName)
-{
- _parameters = schema.parameters(partName);
-}
-
-bool DBPartParameters::parametersKnown() const
-{
- return !_parameters.empty();
-}
-
-int DBPartParameters::integerParameter(const std::string &name)
-{
- std::istringstream converter(_parameters[name]);
- int ret;
- converter >> ret;
- return ret;
-}
-
-class BlockManagerParameters : public DBPartParameters
-{
-private:
- fs::path _file;
- int _blockSize;
-protected:
- int _root;
-public:
- BlockManagerParameters(Schema &schema, const std::string &partName);
- bool readState();
- const fs::path& getFile() const { return _file; }
- int getBlockSize() const { return _blockSize; }
- void setBlockSize(int size) { _blockSize = size; }
- int getRootPosition() const { return _root; }
- void setRoot(int root) { _root = root; }
- void updateSchema(const std::string &params);
-};
-
-void BlockManagerParameters::updateSchema(const std::string &params)
-{
- std::ostringstream tmp;
- tmp << "bs=" << _blockSize << " rt=" << _root << " fl=-1 " << params;
- DBPartParameters::updateSchema(tmp.str());
-}
-
-BlockManagerParameters::BlockManagerParameters(Schema &schema, const std::string &partName)
- : DBPartParameters(schema, partName), _root(0)
-{
- _file = schema.indexFile(partName);
- HCDBG(std::cerr << "file name set to " << _file.native_file_string());
- readState();
-}
-
-bool BlockManagerParameters::readState()
-{
- if (parametersKnown())
- {
- _blockSize = integerParameter("bs");
- _root = integerParameter("rt");
- return true;
- }
- else
- return false;
-}
-
-class BtreeDictParameters : public BlockManagerParameters
-{
-private:
- int _id1;
-public:
- BtreeDictParameters(Schema &schema, const std::string &partName);
- int getFreeID() const { return _id1; }
- void setFreeID(int id) { _id1 = id; }
- void updateSchema();
-};
-
-void BtreeDictParameters::updateSchema()
-{
- std::ostringstream tmp;
- tmp << "id1=" << _id1 << " id2=1";
- BlockManagerParameters::updateSchema(tmp.str());
-}
-
-BtreeDictParameters::BtreeDictParameters(Schema &schema, const std::string &partName)
- : BlockManagerParameters(schema, partName)
-{
-}
-
-int readInt(std::fstream &in)
-{
- HCDBG(std::cerr << "want to read at " << in.tellg() << std::endl);
- int ret = 0;
- for (int i = 3; i >= 0; --i)
- {
- unsigned char byte;
- in.read( (char*)&byte, 1 );
- ret |= (static_cast<unsigned int>(byte) << (i*8));
- HCDBG(fprintf(stderr, "inputting %x ret is now %x\n", byte, ret));
- }
- return ret;
-}
-
-void writeByte(std::fstream &out, unsigned char byte)
-{
- out.write( (const char *)&byte, 1 );
-}
-
-void writeShort(std::fstream &out, int item)
-{
- for (int i = 1; i >= 0; --i)
- {
- unsigned char byte = static_cast<unsigned char>((item >> (i*8)));
- out.write( (const char*)&byte, 1 );
- }
-}
-
-void writeInt(std::fstream &out, int item)
-{
- HCDBG(std::cerr << "want to write at " << out.tellp() << std::endl);
- for (int i = 3; i >= 0; --i)
- {
- unsigned char byte = static_cast<unsigned char>((item >> (i*8)));
- HCDBG(fprintf(stderr, "outputting %x in is %x\n", byte, item));
- out.write( (const char*)&byte, 1 );
- }
-}
-
-void readFully(std::fstream &in, std::vector<unsigned char> &_data)
-{
- in.read((char*)(&_data[0]), _data.size());
-}
-
-/**
-
- Base class for (typically btree) blocks to hold either
- byte vectors representing graph/tree edges,
- or pairs (key, id) for dictionaries
-
- Each block has a header and a data section
-
- */
-
-class Block
-{
-public:
- static int HEADERLEN;
- // length of Block ID in bytes
- static int IDLEN;
-
- // number of the block
- // used for both referring to the block
- // and addresssing the block in file
- unsigned int _number;
- bool _isLeaf;
- // first available byte in data section
- int _free;
- std::vector<unsigned char> _data;
-
- Block(int blocksize) : _number(0), _isLeaf(true), _free(0)
- {
- _data.resize(blocksize - HEADERLEN);
- }
-
- virtual ~Block() {}
-
- void setBlockNumber(int n) { _number = n; }
- virtual void setFree(int free) { _free = free; }
- // interpret 4 bytes at 'i' as an integer
- int integerAt(int i) const
- {
- int result = ((((((_data[i]&0xFF)<<8)
- |_data[i+1]&0xFF)<<8)
- |_data[i+2]&0xFF)<<8)
- |_data[i+3]&0xFF;
- return result;
- }
- void setIntegerAt(int i, int value)
- {
- /*
- for (int j = i + 3; j >= i; j--, value >>= 8)
- _data[j] = (unsigned char)(value & 0xFF);
- */
- _data[i++] = (unsigned char)((value >> 24) & 0xFF);
- _data[i++] = (unsigned char)((value >> 16) & 0xFF);
- _data[i++] = (unsigned char)((value >> 8) & 0xFF);
- _data[i] = (unsigned char)(value & 0xFF);
- }
- void readIn(std::fstream &in)
- {
- _number = readInt(in);
- int twoFields = readInt(in);
- _isLeaf = (twoFields & 0x80000000) != 0;
- HCDBG(std::cerr << "read leaf as " << _isLeaf << std::endl);
- _free = twoFields & 0x7FFFFFFF;
- readFully(in, _data);
- }
- void writeOut(std::fstream &out) const
- {
- writeInt(out, _number);
- writeInt(out, _free | (_isLeaf ? 0x80000000 : 0));
- out.write((const char*)(&_data[0]), _data.size());
- }
-};
-
-int Block::HEADERLEN = 8;
-// length of Block ID in bytes
-int Block::IDLEN = 4;
-
-class BtreeDict;
-class EntryProcessor;
-typedef std::vector<int> IntegerArray;
-
-class DictBlock : public Block
-{
-public:
- DictBlock();
- int free() const { return _free + firstEntry(); }
- int numberOfEntries() const { return integerAt(0); }
- int nthPointer(int n) const { return integerAt(4*(n + 1)); }
- int getChildIdx(int index) const;
- int entryKeyLength(int i) const { return _data[i] & 0xFF; }
- int entryCompression(int i) const { return _data[i + 1] & 0xFF; }
- int entryID(int i) const { return integerAt(i + 2); }
- int entryLength(int entry) const;
- int entryKey(int entry) const;
- int firstEntry() const { return 4; }
- int nextEntry(int entry) const { return entry + entryLength(entry); }
- void restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer);
- std::string restoreKey(int entry, std::vector<unsigned char> &buffer);
- std::string findID(int id) throw( HelpProcessingException );
- void setBlockNumbers(std::vector<int> &blocks) const;
- void listBlock();
- void doMap(BtreeDict &owner, const EntryProcessor &processor);
- void withPrefix(BtreeDict &owner, const std::string &prefix,
- size_t prefLen, IntegerArray &result);
-};
-
-class BlockFactory;
-
-class BlockProcessor;
-
-class BlockDescriptor
-{
-public:
- Block *_block;
- bool _modf;
- BlockDescriptor(Block *block) : _block(block), _modf(false) {}
-}; // end of BlockDescriptor
-
-class BlockManager
-{
-private:
- static int INCR;
- std::fstream _file;
- long _blockSize;
- bool _update;
- BlockFactory *_blockFactory;
- std::vector<BlockDescriptor> _blockTab;
-public:
- BlockManager(const BlockManagerParameters *params,
- bool update, BlockFactory *bfactory) throw( HelpProcessingException );
- ~BlockManager();
- Block& accessBlock(int blockNumber);
- void setModified(int blNum);
- void close();
- Block& getNewBlock();
- void processBlocks(BlockProcessor &processor);
- void mapBlock(Block* block);
- void addDescriptor(Block* block) throw( HelpProcessingException );
-private:
- void writeBlock(const Block &bl);
-};
-
-int BlockManager::INCR = 64; // size increment
-
-class EntryProcessor
-{
-public:
- virtual void processEntry(const std::string &string, int id) const = 0;
- virtual ~EntryProcessor() {};
-};
-
-class BtreeDict
-{
-public:
- static int ENTHEADERLEN;
- static int BLOCKSIZE;
- static int DATALEN;
- static int MaxKeyLength;
- static int lastPtrIndex;
-protected:
- BlockManager *blockManager;
- int root;
- std::vector<int> blocks;
-
- BtreeDict() {/*empty*/}
- ~BtreeDict() { delete blockManager; }
- BtreeDict(const BtreeDictParameters *params);
- void init(const BtreeDictParameters *params, bool update,
- BlockFactory *bfactory);
-public:
- int fetch(const std::string &key);
- void close();
-private:
- std::string fetch(int conceptID);
- IntegerArray withPrefix(const std::string &prefix);
-public:
- DictBlock& accessBlock(int index);
- DictBlock& child(const DictBlock &bl, int index) throw( HelpProcessingException );
-private:
- std::string findID(int blNum, int id);
- int find(const DictBlock &bl, std::vector<unsigned char> &key, int index);
- int find(const DictBlock &bl, std::vector<unsigned char> &key);
- void setBlocks(std::vector<int> &blocks);
- void map(const EntryProcessor &processor);
-public:
- void dumpnode(DictBlock &bl, int level);
-};
-
-class BlockFactory
-{
-public:
- virtual Block* makeBlock() const = 0;
- virtual ~BlockFactory() {}
-};
-
-static int dictcount;
-
-class DictBlockFactory : public BlockFactory
-{
-public:
- Block* makeBlock() const
- {
- dictcount++;
- return new DictBlock;
- }
-};
-
-BtreeDict::BtreeDict(const BtreeDictParameters *params)
-{
- init(params, false, new DictBlockFactory());
- blocks.resize(params->getFreeID());
- setBlocks(blocks);
-}
-
-void BtreeDict::dumpnode(DictBlock &bl, int level)
-{
- if (!bl._isLeaf)
- {
- fprintf(stderr, "\n");
- for (int i = 0; i < level; ++i)
- fprintf(stderr, "\t");
- fprintf(stderr, "there are %d entries\n", bl.numberOfEntries());
- for (int i = 0; i < level; ++i)
- fprintf(stderr, "\t");
- for (int i = 0; i < bl.numberOfEntries(); ++i)
- {
- int index = bl.getChildIdx(i);
- fprintf(stderr, " %d ", index);
- DictBlock &thischild = accessBlock(index);
- dumpnode(thischild, level + 1);
- }
- fprintf(stderr, "\n");
- }
-}
-
-int BtreeDict::fetch(const std::string &key)
-{
- HCDBG(std::cerr << "fetching " << key << " from root " << root << std::endl);
- DictBlock &bl = accessBlock(root);
-
- int length = key.size();
- std::vector<unsigned char> Key(length + 1);
- memcpy(&(Key[0]), key.c_str(), length);
- Key[length] = 0; // sentinel
-
- return find(bl, Key);
-}
-
-std::string BtreeDict::fetch(int conceptID)
-{
- return findID(blocks[conceptID], conceptID);
-}
-
-IntegerArray BtreeDict::withPrefix(const std::string &prefix)
-{
- IntegerArray result;
- accessBlock(root).withPrefix(*this, prefix, prefix.size(), result);
- return result;
-}
-
-void BtreeDict::close()
-{
- blockManager->close();
-}
-
-void BtreeDict::init(const BtreeDictParameters *params, bool update,
- BlockFactory *bfactory)
-{
- blockManager = new BlockManager(params, update, bfactory);
- root = params->getRootPosition();
-}
-
-DictBlock& BtreeDict::accessBlock(int index)
-{
- return (DictBlock&)blockManager->accessBlock(index);
-}
-
-DictBlock& BtreeDict::child(const DictBlock &bl, int index) throw( HelpProcessingException )
-{
- if (bl._isLeaf)
- {
- std::stringstream aStrStream;
- aStrStream << "leaf's can't have children, screwed!" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- return accessBlock(bl.getChildIdx(index));
-}
-
-std::string BtreeDict::findID(int blNum, int id)
-{
- return accessBlock(blNum).findID(id);
-}
-
-int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key, int index)
-{
- HCDBG(std::cerr << "find2: " << bl._isLeaf << " : " << index << " : " << std::endl);
-
- return bl._isLeaf ? 0 : find(child(bl, index), key);
-}
-
-int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key)
-{
- int inputKeyLen = key.size() - 1;
- int entryPtr = bl.firstEntry();
- int freeSpace = bl.free();
- int nCharsEqual = 0;
- int compression = 0;
-
- HCDBG(std::cerr << "find1: " << inputKeyLen << " : "
- << entryPtr << " : " << freeSpace << " : " << nCharsEqual << " "
- << compression << std::endl);
-
- for (int entryIdx = 0;;)
- {
- if (entryPtr == freeSpace)
- return find(bl, key, bl.numberOfEntries());
- else if (compression == nCharsEqual)
- {
- int keyLen = bl.entryKeyLength(entryPtr);
- int keyPtr = bl.entryKey(entryPtr), i;
- for (i = 0; i < keyLen && key[nCharsEqual] == bl._data[keyPtr + i]; i++)
- ++nCharsEqual;
- if (i == keyLen)
- {
- if (nCharsEqual == inputKeyLen)
- return bl.entryID(entryPtr);
- }
- else if ((key[nCharsEqual]&0xFF) < (bl._data[keyPtr + i]&0xFF))
- return find(bl, key, entryIdx);
- }
- else if (compression < nCharsEqual) // compression dropped
- return find(bl, key, entryPtr == freeSpace
- ? bl.numberOfEntries() : entryIdx);
- do
- {
- entryPtr = bl.nextEntry(entryPtr);
- ++entryIdx;
- }
- while (bl.entryCompression(entryPtr) > nCharsEqual);
- compression = bl.entryCompression(entryPtr);
- }
-}
-
-class BlockProcessor
-{
-protected:
- std::vector<int> &blocks;
-public:
- BlockProcessor(std::vector<int> &_blocks) : blocks(_blocks) {}
- virtual void process(const Block &block) = 0;
- virtual ~BlockProcessor() {}
-};
-
-
-class DictBlockProcessor : public BlockProcessor
-{
-public:
- DictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {}
- void process(const Block &block)
- {
- ((const DictBlock&)block).setBlockNumbers(blocks);
- }
-};
-
-BlockManager::BlockManager(const BlockManagerParameters *params,
- bool update, BlockFactory *bfactory) throw( HelpProcessingException )
- : _blockFactory(bfactory)
-{
- _update = update;
- // params.readState();
- _blockSize = params->getBlockSize();
- HCDBG(std::cerr << "opening " << params->getFile().native_file_string() << std::endl);
- if (!update)
- {
- _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::binary);
- }
- else
- {
- _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary);
- if (!_file.is_open())
- {
- HCDBG(std::cerr << "didn't exist" << std::endl);
- _file.open(params->getFile().native_file_string().c_str(),
- std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary);
- }
- if (!_file.is_open())
- {
- std::stringstream aStrStream;
- aStrStream << "Cannot open " << params->getFile().native_file_string() << std::endl;
- throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
- }
- }
-
- _file.seekg(0, std::ios::end);
- long length = _file.tellg();
- if (length < 0) length = 0;
- _file.seekg(0, std::ios::beg);
- _file.clear();
-
- HCDBG(std::cerr << "len is " << length << std::endl);
-
- if (length <= 0 && update)
- {
- Block* _dummy = bfactory->makeBlock();
- _dummy->setBlockNumber(0);
- writeBlock(*_dummy);
- delete _dummy;
- length = _blockSize;
- }
-
- _file.seekg(0, std::ios::beg);
-
- int _blockTableSize = (length/_blockSize);
- HCDBG(std::cerr << "len is now " << _blockTableSize << std::endl);
- for (int i = 0; i < _blockTableSize; ++i)
- mapBlock(bfactory->makeBlock());
-}
-
-Block& BlockManager::getNewBlock()
-{
- unsigned int number = _blockTab.size();
-
- Block *bl = _blockFactory->makeBlock();
- bl->setBlockNumber(number);
- writeBlock(*bl);
- addDescriptor(bl);
-
- return *(_blockTab[number]._block);
-}
-
-void BlockManager::setModified(int blNum)
-{
- _blockTab[blNum]._modf = true;
-}
-
-void BlockManager::close()
-{
- if (_update)
- {
- std::vector<BlockDescriptor>::const_iterator aEnd = _blockTab.end();
- for (std::vector<BlockDescriptor>::const_iterator aIter = _blockTab.begin();
- aIter != aEnd; ++aIter)
- {
- if (aIter->_modf)
- writeBlock(*(aIter->_block));
- }
- }
- _file.close();
-}
-
-void BlockManager::processBlocks(BlockProcessor &processor)
-{
- std::vector<BlockDescriptor>::const_iterator aEnd = _blockTab.end();
- for (std::vector<BlockDescriptor>::const_iterator aIter = _blockTab.begin();
- aIter != aEnd; ++aIter)
- {
- processor.process(*(aIter->_block));
- }
-}
-
-void BlockManager::mapBlock(Block* block)
-{
- block->readIn(_file);
- addDescriptor(block);
-}
-
-void BlockManager::addDescriptor(Block *block) throw( HelpProcessingException )
-{
- BlockDescriptor desc(block);
- _blockTab.push_back(desc);
- HCDBG(std::cerr << "numbers are " << block->_number << " " << (_blockTab.size()-1) << std::endl);
- if (block->_number != _blockTab.size() - 1)
- {
- std::stringstream aStrStream;
- aStrStream << "totally screwed" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- HCDBG(std::cerr << "addDescriptor blocks are now " << _blockTab.size() << std::endl);
-}
-
-void BlockManager::writeBlock(const Block &bl)
-{
- _file.seekp(_blockSize * bl._number);
- bl.writeOut(_file);
-}
-
-Block& BlockManager::accessBlock(int blockNumber)
-{
- return *(_blockTab[blockNumber]._block);
-}
-
-BlockManager::~BlockManager()
-{
- std::vector<BlockDescriptor>::iterator aEnd = _blockTab.end();
- for (std::vector<BlockDescriptor>::iterator aIter = _blockTab.begin();
- aIter != aEnd; ++aIter)
- {
- delete aIter->_block;
- }
- delete _blockFactory;
-}
-
-void BtreeDict::setBlocks(std::vector<int> &inblocks)
-{
- DictBlockProcessor foo(inblocks);
- blockManager->processBlocks(foo);
-}
-
-// can go to Full
-void BtreeDict::map(const EntryProcessor &processor)
-{
- accessBlock(root).doMap(*this, processor);
-}
-
-void DictBlock::restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer)
-{
- int howMany = entryKeyLength(entry);
- int where = entryCompression(entry);
- int from = entryKey(entry);
- while (howMany-- > 0)
- buffer[where++] = _data[from++];
-}
-
-std::string DictBlock::restoreKey(int entry, std::vector<unsigned char> &buffer)
-{
- int howMany = entryKeyLength(entry);
- int where = entryCompression(entry);
- int from = entryKey(entry);
- while (howMany-- > 0)
- buffer[where++] = _data[from++];
- return std::string((const char*)(&buffer[0]), 0, where);
-}
-
-std::string DictBlock::findID(int id) throw( HelpProcessingException )
-{
- std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
- int freeSpace = free();
- for (int ent = firstEntry(); ent < freeSpace; ent = nextEntry(ent))
- {
- if (entryID(ent) == id) // found
- return restoreKey(ent, buffer);
- else
- restoreKeyInBuffer(ent, buffer);
- }
- std::stringstream aStrStream;
- aStrStream << "ID not found in block" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
-}
-
-void DictBlock::setBlockNumbers(std::vector<int> &blocks) const
-{
- for (int e = firstEntry(); e < _free; e = nextEntry(e))
- blocks[entryID(e)] = _number;
-}
-
-void DictBlock::listBlock()
-{
- std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
- int freeSpace = free();
- int entryPtr = firstEntry();
- if (_isLeaf)
- {
- while (entryPtr < freeSpace)
- {
- std::cout << restoreKey(entryPtr, buffer) << " " <<
- entryID(entryPtr);
- entryPtr = nextEntry(entryPtr);
- }
- }
- else
- std::cout << "not leaf" << std::endl;
-}
-
-void DictBlock::doMap(BtreeDict &owner, const EntryProcessor &processor)
-{
- std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
- int freeSpace = free();
- int entryPtr = firstEntry();
- if (_isLeaf)
- {
- while (entryPtr < freeSpace)
- {
- processor.processEntry(restoreKey(entryPtr, buffer),
- entryID(entryPtr));
- entryPtr = nextEntry(entryPtr);
- }
- }
- else
- {
- int entryIdx = 0;
- while (entryPtr < freeSpace)
- {
- owner.accessBlock(getChildIdx(entryIdx)).doMap(owner,processor);
- processor.processEntry(restoreKey(entryPtr, buffer),
- entryID(entryPtr));
- entryPtr = nextEntry(entryPtr);
- ++entryIdx;
- }
- owner.accessBlock(getChildIdx(entryIdx)).doMap(owner, processor);
- }
-}
-
-void DictBlock::withPrefix(BtreeDict &owner, const std::string &prefix,
- size_t prefLen, IntegerArray &result)
-{
- std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
- int freeSpace = free();
- int entryPtr = firstEntry();
- if (_isLeaf)
- {
- while (entryPtr < freeSpace)
- {
- if (restoreKey(entryPtr, buffer).find(prefix) == 0)
- result.push_back(entryID(entryPtr));
- entryPtr = nextEntry(entryPtr);
- }
- }
- else
- {
- int entryIndex = 0;
- while (entryPtr < freeSpace)
- {
- std::string key = restoreKey(entryPtr, buffer);
- if (key.size() > prefLen)
- key = key.substr(0, prefLen);
- int cmp = key.compare(prefix);
- if (cmp < 0)
- {
- entryPtr = nextEntry(entryPtr);
- ++entryIndex;
- }
- else if (cmp == 0)
- {
- result.push_back(entryID(entryPtr));
- owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result);
- entryPtr = nextEntry(entryPtr);
- ++entryIndex;
- }
- else
- {
- owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result);
- return;
- }
- }
- owner.accessBlock(getChildIdx(numberOfEntries())).withPrefix(owner, prefix, prefLen, result);
- }
-}
-
-int BtreeDict::ENTHEADERLEN = 6;
-int BtreeDict::BLOCKSIZE = 2048;
-int BtreeDict::DATALEN = BtreeDict::BLOCKSIZE - Block::HEADERLEN;
-int BtreeDict::MaxKeyLength = 255;
- //!!! Careful with that number, Eugene
-int BtreeDict::lastPtrIndex = 508;
-
-DictBlock::DictBlock() : Block(BtreeDict::BLOCKSIZE)
-{
-}
-
-int DictBlock::getChildIdx(int index) const
-{
- return nthPointer(BtreeDict::lastPtrIndex - index);
-}
-
-int DictBlock::entryLength(int entry) const
-{
- return BtreeDict::ENTHEADERLEN + entryKeyLength(entry);
-}
-
-int DictBlock::entryKey(int entry) const
-{
- return entry + BtreeDict::ENTHEADERLEN;
-}
-
-void setBlockNumber2(std::vector<int> &blocks, size_t index, int number)
-{
- if (index >= blocks.size())
- blocks.resize(index + 1000);
- blocks[index] = number;
-}
-
-class Entry
-{
-public:
- std::vector<unsigned char> key;
- int id;
- int block;
-
- Entry(const std::vector<unsigned char> &keyin, int length, int idin) : key(length+1), id(idin), block(-1)
- {
- memcpy(&key[0], &keyin[0], length);
- }
-
- Entry(const std::string &keyin, int idin) : key(keyin.size()+1), id(idin), block(-1)
- {
- memcpy(&key[0], keyin.c_str(), keyin.size());
- }
-
- bool smallerThan(const Entry &other)
- {
- for (size_t i = 0; i < std::min(key.size(), other.key.size()); i++)
- if (key[i] != other.key[i])
- return (key[i]&0xFF) < (other.key[i]&0xFF);
- return false;
- }
-}; // end of internal class Entry
-
-class FullDictBlock;
-
-class FullBtreeDict : public BtreeDict
-{
-protected:
- BtreeDictParameters *_params;
- bool update;
-public:
- FullBtreeDict(BtreeDictParameters &params, bool update);
- void store(const std::string &bla, int id) throw( HelpProcessingException );
- boost::shared_ptr<Entry> insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent);
- boost::shared_ptr<Entry> insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
- throw( HelpProcessingException );
- FullDictBlock& getNewBlock();
- void setModified(Block &bl);
- void close(int freeID);
-};
-
-class FullDictBlock : public DictBlock
-{
-public:
- virtual void setFree(int free);
- void setNumberOfEntries(int n) { setIntegerAt(0, n); }
- void setChildIndex(int index, int value)
- {
- setIntegerAt(4*(BtreeDict::lastPtrIndex - index + 1), value);
- }
- void setEntryID(int i, int id) { setIntegerAt(i + 2, id); }
- void setBlockNumbers(std::vector<int> &blocks) const;
- bool insert(const Entry &entry);
- void makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr);
- bool insert(const Entry &ent, int entryPtr, int compr1, int compr2, int index);
- int insertInternal(const Entry &entry);
- boost::shared_ptr<Entry> split(FullDictBlock &newbl);
- void initInternal(int leftBlock, const Entry &entry);
- bool insert(boost::shared_ptr<Entry> entry);
- bool insert(boost::shared_ptr<Entry> ent, int entryPtr,
- int compr1, int compr2, int index);
-
-};
-
-void FullDictBlock::initInternal(int leftBlock, const Entry &entry)
-{
- _isLeaf = false;
- setNumberOfEntries(1);
- setChildIndex(0, leftBlock);
- setChildIndex(1, entry.block);
- int ent = firstEntry();
- makeEntry(ent, entry.key, entry.id, entry.key.size() - 1, 0);
- setFree(nextEntry(ent));
-}
-
-void FullDictBlock::setFree(int infree)
-{
- _free = infree - firstEntry();
- _data[infree] = _data[infree + 1] = 0; // sentinel
-}
-
-boost::shared_ptr<Entry> FullDictBlock::split(FullDictBlock& newbl)
-{
- std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength);
- int freeSpace = free();
- int half = freeSpace/2;
- int index = 0; // of middle entry
- newbl._isLeaf = _isLeaf;
- int ent;
- for (ent = firstEntry(); ent < half; ent = nextEntry(ent))
- {
- restoreKeyInBuffer(ent, buffer);
- ++index;
- }
- int entriesToMove = numberOfEntries() - index - 1;
- // middle entry
- restoreKeyInBuffer(ent, buffer);
- int len = entryKeyLength(ent) + entryCompression(ent);
- boost::shared_ptr<Entry> result(new Entry(buffer, len, entryID(ent)));
- result->block = newbl._number;
- int newFree = ent;
- // rest goes to the new block
- ent = nextEntry(ent);
- restoreKeyInBuffer(ent, buffer);
- len = entryKeyLength(ent) + entryCompression(ent);
- int nptr = firstEntry();
- newbl.makeEntry(nptr, buffer, entryID(ent), len, 0);
- ent = nextEntry(ent);
- memmove(&(newbl._data[newbl.nextEntry(nptr)]), &(_data[ent]), freeSpace - ent);
- newbl.setNumberOfEntries(entriesToMove);
- newbl.setFree(newbl.nextEntry(nptr) + freeSpace - ent);
- if (_isLeaf == false) // need to split pointers
- {
- int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1);
- int to = from + 4*(index + 1);
- memmove(&(newbl._data[to]), &(_data[from]), 4*(entriesToMove + 1));
- }
- // this entry will end here
- setFree(newFree);
- setNumberOfEntries(index);
- return result;
- //!!!remember updating ID -> string association
-}
-
-void FullDictBlock::setBlockNumbers(std::vector<int> &blocks) const
-{
- for (int e = firstEntry(); e < _free; e = nextEntry(e))
- setBlockNumber2(blocks, entryID(e), _number);
-}
-
-bool FullDictBlock::insert(boost::shared_ptr<Entry> ent, int entryPtr,
- int compr1, int compr2, int index)
-{
- const std::vector<unsigned char> &key = ent->key;
- int keyLen = key.size() - 1 - compr1;
- int freeSpace = free();
- // calculate how much space is needed to add the new entry
- // first, how many bytes are needed for just the new entry
- int demand = BtreeDict::ENTHEADERLEN + keyLen;
- // adding an entry can increase compression in the following entry
-
- int increase = 0;
- if (entryPtr < freeSpace)
- if (entryCompression(entryPtr) < compr2)
- increase = compr2 - entryCompression(entryPtr);
- /*
- std::cerr << "key " << key << std::endl;
- std::cerr << "entryPtr " << entryPtr << std::endl;
- std::cerr << "compr1 " << compr1) << std::endl;
- std::cerr << "compr2 " << compr2) << std::endl;
- std::cerr << "index " << index) << std::endl;
- std::cerr << "demand " << demand) << std::endl;
- std::cerr << "increase " << increase) << std::endl;
- */
- // check if enough space is available
- int limit = _isLeaf ? BtreeDict::DATALEN-2 : 4*(BtreeDict::lastPtrIndex-numberOfEntries()-1);
-
- if (freeSpace + demand - increase <= limit) // 2 for sentinel
- {
- if (entryPtr < freeSpace)
- {
- // need to shift extant entries forward
- int toMove = increase > 0 ? entryPtr + BtreeDict::ENTHEADERLEN + increase : entryPtr;
- // move entries
- memmove(&(_data[toMove + demand - increase]), &(_data[toMove]), freeSpace - toMove);
-
- if (increase > 0)
- {
- // update header
- unsigned char tmp = static_cast<unsigned char>(increase);
- _data[entryPtr] = _data[entryPtr] - tmp;
- _data[entryPtr + 1] = _data[entryPtr + 1] + tmp;
- // shift header
- memmove(&(_data[entryPtr + demand]), &(_data[entryPtr]), BtreeDict::ENTHEADERLEN);
- }
- }
- // now write the new entry in the space made above
- makeEntry(entryPtr, key, ent->id, keyLen, compr1);
-
- if (_isLeaf == false)
- {
- int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1);
- memmove(&(_data[from - 4]), &(_data[from]), 4*(numberOfEntries() - index));
- setChildIndex(index + 1, ent->block);
- }
- setFree(freeSpace + demand - increase);
- setNumberOfEntries(numberOfEntries() + 1);
-
- /*
- System.err.println("------------list--------------");
- byte[] buffer = new byte[MaxKeyLength];
- final int freeSpace2 = free();
- int entryPtr2 = firstEntry();
- while (entryPtr2 < freeSpace2)
- {
- System.err.println(entryPtr2);
- System.err.println(entryKeyLength(entryPtr2));
- System.err.println(entryCompression(entryPtr2));
- System.err.println(new String(_data,
- entryKey(entryPtr2),
- entryKeyLength(entryPtr2)));
- System.err.println(restoreKey(entryPtr2, buffer)+" "+
- entryID(entryPtr2));
- entryPtr2 = nextEntry(entryPtr2);
- }
- System.err.println("------------end--------------");
- */
- return true;
- }
- else
- return false;
-}
-
-// finds the place and context
-bool FullDictBlock::insert(boost::shared_ptr<Entry> entry)
-{
- const std::vector<unsigned char> &inkey = entry->key;
- int inputKeyLen = inkey.size() - 1;
- int freeSpace = free();
- int entryPtr = firstEntry();
- int nCharsEqual = 0;
- int prevNCEqual = 0;
- int compression = 0;
-
- for (int entryIndex = 0;;)
- {
- if (entryPtr == freeSpace)
- return insert(entry, entryPtr, nCharsEqual, 0, numberOfEntries());
- else if (compression == nCharsEqual)
- {
- int keyLen = entryKeyLength(entryPtr);
- int keyPtr = entryKey(entryPtr), i;
- prevNCEqual = nCharsEqual;
- for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
- ++nCharsEqual;
- if (i == keyLen)
- {
- if (nCharsEqual == inputKeyLen)
- {
- HCDBG(std::cerr << "setting to " << entry->id << std::endl);
- setEntryID(entryPtr, entry->id);
- return true;
- }
- }
- else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
- return insert(entry, entryPtr, prevNCEqual, nCharsEqual, entryIndex);
- }
- else if (compression < nCharsEqual) // compression dropped
- {
- int index = entryPtr == freeSpace ? numberOfEntries() : entryIndex;
- return insert(entry, entryPtr, nCharsEqual, compression, index);
- }
- do
- {
- entryPtr = nextEntry(entryPtr);
- ++entryIndex;
- }
- while (entryCompression(entryPtr) > nCharsEqual);
- compression = entryCompression(entryPtr);
- }
-}
-
-static int fulldictcount;
-
-class FullDictBlockFactory : public BlockFactory
-{
-public:
- Block* makeBlock() const
- {
- fulldictcount++;
- return new FullDictBlock;
- }
-};
-
-class FullDictBlockProcessor : public BlockProcessor
-{
-public:
- FullDictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {}
- void process(const Block &block)
- {
- ((const FullDictBlock&)block).setBlockNumbers(blocks);
- }
-};
-
-FullBtreeDict::FullBtreeDict(BtreeDictParameters &params, bool _update) :
- _params(&params), update(_update)
-{
- init(_params, update, new FullDictBlockFactory());
- HCDBG(std::cerr << "id is " << params.getFreeID() << std::endl);
- blocks.resize(params.getFreeID());
-
- FullDictBlockProcessor foo(blocks);
- blockManager->processBlocks(foo);
- /*
- if (logging)
- log = new FileWriter("/tmp/FullBtreeDict.log");
- */
-}
-
-void FullBtreeDict::setModified(Block &bl)
-{
- blockManager->setModified(bl._number);
-}
-
-FullDictBlock& FullBtreeDict::getNewBlock()
-{
- FullDictBlock &nbl = (FullDictBlock&)blockManager->getNewBlock();
- setModified(nbl);
- return nbl;
-}
-
-boost::shared_ptr<Entry> FullBtreeDict::insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
- throw( HelpProcessingException )
-{
- setModified(bl); // to be modified in any case
- if (bl.insert(ent))
- return boost::shared_ptr<Entry>();
- else
- {
- FullDictBlock &nbl = getNewBlock();
- boost::shared_ptr<Entry> middle = bl.split(nbl);
- nbl.setBlockNumbers(blocks);
- if ((middle->smallerThan(*ent) ? nbl : bl).insert(ent) == false)
- {
- std::stringstream aStrStream;
- aStrStream << "entry didn't fit into a freshly split block" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- return middle;
- }
-}
-
-void FullDictBlock::makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr)
-{
- _data[entry] = static_cast<unsigned char>(length);
- _data[entry + 1] = static_cast<unsigned char>(compr);
- setEntryID(entry, id);
- memmove(&(_data[entryKey(entry)]), &(key[compr]), length);
-}
-
-int FullDictBlock::insertInternal(const Entry &entry)
-{
- const std::vector<unsigned char> &inkey = entry.key;
- int inputKeyLen = inkey.size() - 1;
- int entryPtr = firstEntry();
- int freeSpace = free();
- int nCharsEqual = 0;
- int compression = 0;
-
- for (int entryIndex = 0;;)
- {
- if (entryPtr == freeSpace)
- return numberOfEntries();
- else if (compression == nCharsEqual)
- {
- int i;
- int keyLen = entryKeyLength(entryPtr);
- int keyPtr = entryKey(entryPtr);
- for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++)
- ++nCharsEqual;
- if (i == keyLen)
- {
- if (nCharsEqual == inputKeyLen)
- {
- setEntryID(entryPtr, entry.id);
- return -1;
- }
- }
- else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF))
- return entryIndex;
- }
- else if (compression < nCharsEqual) // compression dropped
- return entryPtr >= freeSpace ? numberOfEntries() : entryIndex;
-
- do
- {
- entryPtr = nextEntry(entryPtr);
- ++entryIndex;
- }
- while (entryCompression(entryPtr) > nCharsEqual);
- compression = entryCompression(entryPtr);
- }
-}
-
-/*
- delegation to powerful primitives at the FullDictBlock level lets us
- express the insertion algorithm very succintly here
-*/
-boost::shared_ptr<Entry> FullBtreeDict::insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent)
-{
- if (bl._isLeaf)
- ent = insertHere(bl, ent);
- else
- {
- int index = bl.insertInternal(*ent);
- if (index != -1)
- {
- ent = insert((FullDictBlock&)child(bl, index), ent);
- if (ent.get())
- ent = insertHere(bl, ent);
- }
- }
- return ent;
-}
-
-void FullBtreeDict::store(const std::string &key, int id) throw( HelpProcessingException )
-{
- HCDBG(std::cerr << "so storing " << key << " id " << id << std::endl);
-
- if (key.size() >= 250)
- {
- std::stringstream aStrStream;
- aStrStream << "token " << key << " too long" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- boost::shared_ptr<Entry> aTemp(new Entry(key, id));
- FullDictBlock &rBlock = (FullDictBlock&)accessBlock(root);
- boost::shared_ptr<Entry> entry = insert(rBlock, aTemp);
- if (entry.get())
- {
- // new root; writing to params needed
- FullDictBlock &nbl = getNewBlock();
- nbl.initInternal(root, *entry);
- setBlockNumber2(blocks, entry->id, root = nbl._number);
- _params->setRoot(root);
- }
-}
-
-void FullBtreeDict::close(int freeID)
-{
- _params->setFreeID(freeID);
- if (update)
- _params->updateSchema();
- BtreeDict::close();
- /*
- if (logging)
- log.close();
- */
-}
-
-class ConceptLocation
-{
-public:
- int _concept;
- int _begin;
- int _end;
-public:
- ConceptLocation(int conceptID, int begin, int end);
- static void sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2);
- static void sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2);
- int getConcept() const { return _concept; }
- void setConcept(int concept) { _concept = concept; }
- int getBegin() const { return _begin; }
- int getEnd() const { return _end; }
- int getLength() const { return _end - _begin; }
- bool equals(const ConceptLocation &other) const
- {
- return _concept==other._concept&&_begin==other._begin&&_end==other._end;
- }
-};
-
-class DocumentCompressor;
-
-class Index : public IndexAccessor
-{
-protected:
- typedef std::hash_map<std::string, int, pref_hash> IndexHashtable;
- bool _update;
- IndexHashtable _cache;
- Schema *_schema;
-private:
- BtreeDictParameters *_dictParams;
- FullBtreeDict *_dict;
- int _freeID;
- std::fstream *_positionsFile;
- std::fstream *_offsetsFile;
- DocumentCompressor *_documentCompressor;
- IntegerArray _concepts;
- IntegerArray _offsets;
- std::vector<unsigned char> _allLists; // POSITIONS
- void readDocumentsTable(const std::string &fileName);
- void readOffsetsTables(const std::string &fileName);
- void readPositions();
-protected:
- IntegerArray _microIndexOffsets;
- IntegerArray _documents;
- IntegerArray _titles;
- std::vector<unsigned char> _positions;
-private:
- int _positionsCacheSize;
- int _currentBatchOffset;
- bool _allInCache;
-protected:
- virtual void writeOutOffsets();
-public:
- Index(const fs::path &indexName, bool update);
- virtual ~Index();
- void init();
- int intern(const std::string &name);
- std::fstream& getPositionsFile();
- std::fstream& getOffsetsFile();
- DocumentCompressor& getDocumentCompressor();
- virtual void compress(int docID, int titleID,
- std::vector<ConceptLocation> &locations,
- std::vector<ConceptLocation> &extents);
- void close();
-};
-
-Index::Index(const fs::path &indexName, bool update) : IndexAccessor(indexName),
- _update(update), _cache(256), _schema(NULL), _dictParams(NULL), _dict(NULL), _positionsFile(0), _offsetsFile(0), _documentCompressor(0),
- _positionsCacheSize(0), _currentBatchOffset(0), _allInCache(false)
-{
-}
-
-class CompressorIterator;
-class Decompressor
-{
-private:
- static int BitsInByte;
- static int NBits;
-
- int _readByte;
- int _toRead;
- int _path;
-
-protected:
- virtual int getNextByte() = 0;
- virtual void initReading() { _toRead = 0; _path = 0; }
-
-private:
- int countZeroes();
- // reads 1 bit; returns non-0 for bit "1"
- int read();
-
-public:
- int read(int kBits);
- void beginIteration() { _path = 0; }
- bool readNext(int k, CompressorIterator &it);
- void decode(int k, IntegerArray &array);
- void ascDecode(int k, IntegerArray &array);
- int ascendingDecode(int k, int start, std::vector<int> &array);
- virtual ~Decompressor() {}
-};
-
-int Decompressor::BitsInByte = 8;
-int Decompressor::NBits = 32;
-
-class ByteArrayDecompressor : public Decompressor
-{
-private:
- const std::vector<unsigned char> *_array;
- int _index;
- int _index0;
-public:
- ByteArrayDecompressor(const std::vector<unsigned char> *array, int index) { initReading(array, index); }
- using Decompressor::initReading;
- virtual void initReading(const std::vector<unsigned char> *array, int index)
- {
- _array = array;
- _index = _index0 = index;
- Decompressor::initReading();
- }
- int bytesRead() { return _index - _index0; }
-protected:
- int getNextByte()
- {
- int ret = (*_array)[_index] & 0xFF;
- HCDBG(fprintf(stderr, "ByteArrayDecompressor::getNextByte of %d at index %d\n", ret, _index));
- _index++;
- return ret;
- }
-};
-
-bool isExtensionMode( void );
-
-class IndexInverter;
-
-class MicroIndex
-{
-public:
- static int RANGE;
- static int NConcepts;
-private:
- int _currentRange;
- int _documentNumber;
- std::vector<int> _concepts;
- short _group;
- short _ix;
- IntegerArray _kTable;
- IntegerArray _offsets;
- IntegerArray _maxConcepts;
- const std::vector<unsigned char> *_data;
- int _base;
- int _limit;
- int _nc;
- ByteArrayDecompressor _decmp;
-public:
- MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index);
- bool smallerThan(const MicroIndex &other)
- {
- return _currentRange < other._currentRange ||
- _currentRange == other._currentRange &&
- _documentNumber < other._documentNumber;
- }
-
-private:
- bool next()
- {
- if (_group <= _limit)
- {
- int shift, index;
- if (_group > 0)
- {
- index = _base + _offsets[_group - 1];
- shift = _maxConcepts[_group - 1];
- }
- else
- {
- index = _base;
- shift = 0;
- }
-
- _decmp.initReading(_data, index);
- _nc = _decmp.ascendingDecode(_kTable[_group*2], shift, _concepts);
- HCDBG(std::cerr << "nc b set to " << _nc << std::endl);
- if (_group < _limit)
- {
- HCDBG(fprintf(stderr, "microindex concept index %d set to %d\n", _nc, _maxConcepts[_group]));
- _concepts[_nc++] = _maxConcepts[_group];
- }
- _currentRange = _concepts[_ix = 0]/RANGE;
- _group++;
- return true;
- }
- else
- return false;
- }
-
- void openDocumentIndex()
- {
- unsigned int kk = (*_data)[_base] & 0xFF;
- HCDBG(std::cerr << "openDocumentIndex, kk is " << kk
- << " base is " << _base << std::endl);
- switch (kk >> 6) // get type
- {
- case 0: // single group, no extents
- _decmp.initReading(_data, _base += 2);
- _nc = _decmp.ascendingDecode(kk & 0x3F, 0, _concepts);
- HCDBG(std::cerr << "nc a set to " << _nc << std::endl);
- _currentRange = _concepts[_ix = 0]/RANGE;
- _limit = 0;
- _group = 1;
- break;
- case 2: // multi group, no extents
- {
- _decmp.initReading(_data, _base + 1);
- _decmp.decode(kk & 0x3F, _kTable);
- int last = _kTable.back();
- _kTable.pop_back();
- _decmp.ascDecode(last, _offsets);
- last = _kTable.back();
- _kTable.pop_back();
- _decmp.ascDecode(last, _maxConcepts);
- _base += 1 + _decmp.bytesRead();
- _limit = _maxConcepts.size();
- _group = 0;
- next();
- }
- break;
- case 1: // single group, extents
- case 3: // multi group, extents
- if( !isExtensionMode() )
- std::cerr << "extents not yet implemented" << std::endl;
- break;
- }
- }
-
-public:
- bool process(IndexInverter &lists);
-};
-
-int MicroIndex::RANGE = 1024;
-int MicroIndex::NConcepts = 16;
-
-class BitBuffer
-{
-private:
- static int InitSize;
- static int NBits;
- static int BitsInByte;
- static int BytesInInt;
-
- int _avail;
- unsigned int _word;
- int _free;
- int _size;
- std::vector<unsigned int> _array;
-
-public:
- BitBuffer() : _avail(NBits), _word(0), _free(0), _size(InitSize)
- {
- _array.resize(InitSize);
- }
-
- void close()
- {
- if (_avail < NBits)
- store(_word << _avail);
- else
- _avail = 0;
- }
-
- void write(std::fstream &out) const
- {
- for (int i = 0; i < _free - 1; i++)
- writeInt(out, _array[i]);
- unsigned int word = _array[_free - 1];
- int bytes = BytesInInt - _avail/BitsInByte;
- int shift = NBits;
- while (bytes-- > 0)
- writeByte(out, static_cast<unsigned char>((word >> (shift -= BitsInByte)) & 0xFF));
- }
-
- void clear()
- {
- _word = 0;
- _avail = NBits;
- _free = 0;
- }
-
- int byteCount() { return _free*BytesInInt - _avail/BitsInByte; }
- int bitCount() { return _free*NBits - _avail; }
-
- void setFrom(const BitBuffer &rhs)
- {
- _word = rhs._word;
- _avail = rhs._avail;
- if ((_free = rhs._free) > _size)
- _array.resize(_size = rhs._free);
- _array = rhs._array;
- }
-private:
- void growArray(int newSize)
- {
- _array.resize(newSize);
- _size = newSize;
- }
-
- void store(unsigned int value)
- {
- if (_free == _size)
- growArray(_size * 2);
- HCDBG(fprintf(stderr, "store of %x to %d\n", (int)value, _free));
- _array[_free++] = value;
- }
-
-public:
- void append(int bit)
- {
- _word = (_word << 1) | bit;
- if (--_avail == 0)
- {
- store(_word);
- _word = 0;
- _avail = NBits;
- }
- }
-
- void append(unsigned int source, int kBits)
- {
- if (kBits < _avail)
- {
- _word = (_word << kBits) | source;
- _avail -= kBits;
- }
- else if (kBits > _avail)
- {
- int leftover = kBits - _avail;
- store((_word << _avail) | (source >> leftover));
- _word = source;
- _avail = NBits - leftover;
- }
- else
- {
- store((_word << kBits) | source);
- _word = 0;
- _avail = NBits;
- }
- }
-
- void concatenate(const BitBuffer &bb)
- {
- if (_size - _free < bb._free)
- growArray(_free + bb._free + 1);
-
- if (_avail == 0)
- {
- memmove(&_array[_free], &bb._array[0], bb._free * sizeof(unsigned int));
- _avail = bb._avail;
- _free += bb._free;
- HCDBG(fprintf(stderr, "free bumped to %d\n", _free));
- }
- else
- {
- int tp = _free - 1; // target
- int sp = 0; // source
- do
- {
- _array[tp] |= bb._array[sp] >> (NBits - _avail);
- _array[++tp] = bb._array[sp++] << _avail;
- }
- while (sp < bb._free);
- _free += bb._free;
- if ((_avail += bb._avail) >= NBits)
- {
- _avail -= NBits;
- _free--;
- }
- HCDBG(fprintf(stderr, "other free bumped to %d\n", _free));
- }
- }
-};
-
-class Compressor
-{
-private:
- static int NBits;
- static int BeginK;
- BitBuffer _buffer;
-public:
- void write(std::fstream &out) const { _buffer.write(out); }
- int byteCount() { return _buffer.byteCount(); }
- void clear() { _buffer.clear(); }
- void concatenate(const Compressor &other) { _buffer.concatenate(other._buffer); }
- void encode(const IntegerArray &pos, int k);
- void encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2);
- // k: starting value for minimization
- int minimize(const IntegerArray &array, int startK);
- int compressAscending(const IntegerArray &array);
-};
-
-void toDifferences(const IntegerArray &in, IntegerArray &out)
-{
- if (out.size() < in.size())
- out.resize(in.size());
- if (in.empty())
- return;
- out[0] = in[0];
- for (size_t i = 1; i < in.size(); ++i)
- out[i] = in[i] - in[i - 1];
-}
-
-class IndexInverter
-{
-private:
- static int K;
- std::vector<IntegerArray> _arrays;
- int _minConcept;
- int _limit;
- IntegerArray _concepts;
- IntegerArray _offsets;
- Compressor _compr;
- IntegerArray _diffs;
- std::fstream *_mainFile;
- // heap
- int _heapSize;
- std::vector<MicroIndex*> _heap;
-
- Index &_index;
-
-public:
- IndexInverter(Index &index) : _arrays(MicroIndex::RANGE),
- _minConcept(0), _limit(MicroIndex::RANGE),
- _mainFile(0), _heapSize(0), _index(index) {}
- ~IndexInverter()
- {
- delete _mainFile;
- for (int i = 0; i < _heapSize; i++)
- {
- HCDBG(fprintf(stderr, "deleting number %d\n", i));
- delete _heap[i];
- }
- }
- void invertIndex(int nDocuments, const IntegerArray &microIndexOffsets)
- {
- _mainFile = _index.getOutputStream("DOCS");
- for (int i = 0; i < MicroIndex::RANGE; i++)
- _arrays[i] = IntegerArray();
-
- // read in the whole POSITIONS file
- std::vector<unsigned char> positions = _index.readByteArray("POSITIONS");
- // build heap
- _heap.clear();
- _heap.resize(_heapSize = nDocuments);
- for (int i = 0; i < nDocuments; i++)
- _heap[i] = new MicroIndex(i, &positions, microIndexOffsets[i]);
- for (int i = _heapSize/2; i >= 0; i--)
- heapify(i);
- // process till exhausted
- while (!_heap.empty())
- if (_heap[0]->process(*this))
- heapify(0);
- else if (_heapSize > 1)
- {
- delete _heap[0];
- _heap[0] = _heap[--_heapSize];
- heapify(0);
- }
- else
- break;
- // closing
- flush();
- _mainFile->close();
- // compress index file
- std::fstream *indexFile = _index.getOutputStream("DOCS.TAB");
- unsigned char byte = static_cast<unsigned char>(
- _compr.compressAscending(_concepts));
- indexFile->write( (const char*)&byte, 1 ); // write k
- _compr.write(*indexFile);
- _compr.clear();
- byte = static_cast<unsigned char>(_compr.minimize(_offsets, K));
- indexFile->write( (const char*)&byte, 1 ); // write k
- _compr.write(*indexFile);
- indexFile->close();
- delete indexFile;
- }
-
- short process(int documentNumber, std::vector<int> &concepts,
- int n, short start, bool firstTime)
- {
- if (firstTime && concepts[start] >= _limit)
- flush();
- concepts[n] = _limit; // sentinel
- while (concepts[start] < _limit)
- {
- _arrays[concepts[start++] - _minConcept].push_back(documentNumber);
- }
- return start;
- }
-
-private:
- void heapify(int i)
- {
- int r = (i + 1) << 1, l = r - 1;
- int smallest = l < _heapSize && _heap[l]->smallerThan(*_heap[i]) ? l : i;
- if (r < _heapSize && _heap[r]->smallerThan(*_heap[smallest]))
- smallest = r;
- if (smallest != i)
- {
- MicroIndex *temp = _heap[smallest];
- _heap[smallest] = _heap[i];
- _heap[i] = temp;
- heapify(smallest);
- }
- }
-
- void flush()
- {
- for (int i = 0; i < MicroIndex::RANGE; ++i)
- {
- if (!_arrays[i].empty())
- {
- toDifferences(_arrays[i], _diffs);
- unsigned char byte = static_cast<unsigned char>(
- _compr.minimize(_diffs, K));
- _mainFile->write( (const char*)&byte, 1 ); // write k
- _offsets.push_back(_compr.byteCount() + 1);
- _compr.write(*_mainFile);
- _concepts.push_back(_minConcept + i);
- _arrays[i].clear();
- _diffs.clear();
- _compr.clear();
- }
- }
- _limit += MicroIndex::RANGE;
- _minConcept += MicroIndex::RANGE;
- }
-};
-
-int IndexInverter::K = 3;
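invertIndex above is a k-way merge: every per-document MicroIndex acts as a cursor over ascending concept numbers, and heapify keeps the cursor with the smallest current value at the top. The same merge pattern in miniature, over plain sorted vectors and with std::priority_queue standing in for the hand-rolled heap (illustrative sketch only, not the MicroIndex machinery):

#include <queue>
#include <vector>

// One cursor per sorted input: current value, which input it came from, position in it.
struct Cursor
{
    int value;
    size_t source;
    size_t pos;
};

struct CursorGreater
{
    bool operator()(const Cursor &a, const Cursor &b) const { return a.value > b.value; }
};

// Merge several ascending vectors into one ascending vector.
std::vector<int> kWayMerge(const std::vector< std::vector<int> > &inputs)
{
    std::priority_queue<Cursor, std::vector<Cursor>, CursorGreater> heap;
    for (size_t i = 0; i < inputs.size(); ++i)
        if (!inputs[i].empty())
        {
            Cursor c; c.value = inputs[i][0]; c.source = i; c.pos = 0;
            heap.push(c);
        }
    std::vector<int> merged;
    while (!heap.empty())
    {
        Cursor c = heap.top();
        heap.pop();
        merged.push_back(c.value);
        if (++c.pos < inputs[c.source].size())      // advance this cursor and re-heap it
        {
            c.value = inputs[c.source][c.pos];
            heap.push(c);
        }
    }
    return merged;
}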
-
-MicroIndex::MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index)
- : _concepts(NConcepts + 1), _data(positions), _decmp(NULL, 0)
-{
- _documentNumber = documentNumber;
- _base = index;
- openDocumentIndex();
-}
-
-bool MicroIndex::process(IndexInverter &lists)
-{
- bool firstTime = true;
- while (true)
- {
- short stop = lists.process(_documentNumber, _concepts, _nc, _ix, firstTime);
- if (stop < _nc)
- {
- _currentRange = _concepts[_ix = stop]/RANGE;
- return true;
- }
- else if (next())
- firstTime = false;
- else
- return false;
- }
-}
-
-void Index::close()
-{
- /*
- BtreeDictCompactor source = new BtreeDictCompactor(_dictParams, false);
-
- URL url = new URL("file", "", _indexDir + "compacted");
- BtreeDictParameters params =
- new BtreeDictParameters(url, _dictParams.getBlockSize(), 0, _freeID);
- source.compact(params);
- URL tmapURL = new URL("file", "", _indexDir + "DICTIONARY");
- File tmap = new File(tmapURL.getFile());
- File compacted = new File(url.getFile());
- compacted.renameTo(tmap);
- _dictParams.setRoot(params.getRootPosition());
- _dictParams.updateSchema();
- */
- _dict->close(_freeID);
- if (_positionsFile)
- {
- delete _positionsFile;
- _positionsFile = NULL;
- }
-
- if (_update)
- {
- writeOutOffsets();
- _dictParams->setFreeID(_freeID);
- _dictParams->updateSchema();
- _schema->save();
- IndexInverter inverter(*this);
- inverter.invertIndex(_documents.size(), _microIndexOffsets);
- }
- if (_offsetsFile)
- {
- delete _offsetsFile;
- _offsetsFile = NULL;
- }
-}
-
-void Index::init()
-{
- bool indexExists = false;
- if (_update)
- {
- createIfNeeded();
- _cache.clear();
- }
- if (_schema) delete _schema;
- _schema = new Schema(*this, _update);
-
- if (_dictParams) delete _dictParams;
- _dictParams = new BtreeDictParameters(*_schema, "DICTIONARY");
-
- if (_dictParams->readState() == false)
- {
- _dictParams->setBlockSize(2048);
- _dictParams->setRoot(0);
- _dictParams->setFreeID(1);
- }
- else
- indexExists = true;
-
- if (_dict) delete _dict;
- _dict = new FullBtreeDict(*_dictParams, _update);
-
- _freeID = _dictParams->getFreeID();
-
- _documents.clear();
- if (indexExists)
- {
- // read in index parts
- _allLists = readByteArray("DOCS");
- readDocumentsTable("DOCS.TAB");
- readOffsetsTables("OFFSETS");
- readPositions();
- }
- else
- {
- _microIndexOffsets.clear();
- _titles.clear();
- }
-}
-
-namespace
-{
- std::string cliptoken(const std::string &name)
- {
- std::string key = name;
- int length = key.size();
- while(key.size() >= 250)
- key = name.substr(0, --length); // keep a prefix so the key stays under the 250-char dictionary limit
- return key;
- }
-}
-
-int Index::intern(const std::string &name)
-{
- std::string key = cliptoken(name);
- IndexHashtable::const_iterator aIter = _cache.find(key);
- if (aIter != _cache.end())
- return aIter->second;
- else
- {
- //Seeing as we always start off with an empty dictionary,
- //our entries will always be in the _cache, so don't ever
- //search the underlying dictionary
- int id = _freeID++;
- _dict->store(key, id);
- _cache.insert(IndexHashtable::value_type(key, id)).first->second = id;
- return id;
- }
-}
-
-std::fstream& Index::getPositionsFile()
-{
- if (!_positionsFile)
- _positionsFile = getRAF("POSITIONS", _update);
- return *_positionsFile;
-}
-
-std::fstream& Index::getOffsetsFile()
-{
- if (!_offsetsFile)
- _offsetsFile = getRAF("OFFSETS", _update);
- return *_offsetsFile;
-}
-
-class VectorBtreeParameters : public BlockManagerParameters
-{
-private:
- int _vectorLength;
-public:
- VectorBtreeParameters(Schema &schema, const std::string &partName) :
- BlockManagerParameters(schema, partName)
- {
- _vectorLength = integerParameter("vl");
- }
-
- void updateSchema()
- {
- std::ostringstream tmp;
- tmp << "vl=" << _vectorLength;
- BlockManagerParameters::updateSchema(tmp.str());
- }
-
- VectorBtreeParameters(Schema &schema, const std::string &partName, int vecLen)
- : BlockManagerParameters(schema, partName)
- {
- _vectorLength = vecLen;
- }
-
- int getVectorLength() { return _vectorLength; }
-};
-
-enum outerbreak { dobreak, docontinue, donothing };
-
-class VectorProcessor
-{
- std::vector<unsigned char> _vector;
-public:
- virtual bool processVector() = 0;
- std::vector<unsigned char>& getVectorBuffer() { return _vector; }
- virtual ~VectorProcessor() {}
-};
-
-class VectorBlock;
-
-class VectorBtree
-{
-protected:
- VectorBlock *_root;
- BlockManager *_blockManager;
- VectorBtreeParameters *_params;
- int _blockSize;
-public:
- int _maxEntries;
- int _leafDataLimit;
-protected:
- int _vectorsOffset;
- VectorBlock& accessBlock(int index);
- VectorBtree() {/*empty*/}
-public:
- int _vecLen;
- int vector(int index) const;
- static int memcmp(const std::vector<unsigned char> &v1,
- const std::vector<unsigned char> &v2, int i2, int n);
- VectorBtree(VectorBtreeParameters *params);
- ~VectorBtree() { delete _blockManager; }
-};
-
-class VectorBlockFactory : public BlockFactory
-{
-private:
- int _blockSize;
-public:
- VectorBlockFactory(int blockSize) : _blockSize(blockSize) {}
- Block* makeBlock() const;
-};
-
-VectorBtree::VectorBtree(VectorBtreeParameters *params)
-{
- _params = params;
- _vecLen = params->getVectorLength();
- _blockSize = params->getBlockSize();
- _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN);
- if ((_maxEntries & 1) == 0) // needs to be odd
- _maxEntries--;
-
- _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
-
- _vectorsOffset = (_maxEntries + 1)*Block::IDLEN;
- _blockManager = new BlockManager(_params, false, new VectorBlockFactory(_blockSize));
- _root = &(accessBlock(params->getRootPosition()));
-}
-
-VectorBlock& VectorBtree::accessBlock(int index)
-{
- return (VectorBlock&)_blockManager->accessBlock(index);
-}
-
-int VectorBtree::memcmp(const std::vector<unsigned char> &v1,
- const std::vector<unsigned char> &v2, int i2, int n)
-{
- for (int i = 0; i < n; i++, i2++)
- if (v1[i] != v2[i2])
- return (v1[i]&0xFF) - (v2[i2]&0xFF);
- return 0;
-}
-
-class VectorBlock : public Block
-{
-public:
- VectorBlock(int size) : Block(size) {}
-protected:
- int findIndex(const std::vector<unsigned char> &key, const VectorBtree &tree)
- {
- int i = 0, j = _free - 1;
- while (i <= j)
- {
- int k = (i + j)/2;
- int test = VectorBtree::memcmp(key, _data, tree.vector(k),tree._vecLen);
- // std::cerr << "k = " << k << ", test = " << test << std::endl;
- if (test > 0)
- i = k + 1;
- else if (test < 0)
- j = k - 1;
- else
- return -1 - k; // result always negative; "k" encoded
- }
- return i;
- }
-private:
- int FindVectorsInLeaf(const std::vector<unsigned char> &lo,
- const std::vector<unsigned char> &hi, int commLen, int prefLen,
- std::vector<unsigned char> &buffer, int size, const VectorBtree &tree)
- {
- int idx = 0, start;
- for (int nBytesEq = 0;;)
- {
- // std::cout << "idx = " << idx << std::endl;
- if (_data[idx] == nBytesEq) // at compression byte
- {
- int i;
- outerbreak hack(donothing);
- for (i = nBytesEq; i < tree._vecLen; i++)
- {
- if (lo[i] == _data[++idx])
- ++nBytesEq;
- else if ((lo[i]&0xFF) < (_data[idx]&0xFF))
- if (nBytesEq >= commLen && (i >= prefLen || (hi[i]&0xFF) >= (_data[idx]&0xFF)))
- {
- start = nBytesEq;
- hack = dobreak;
- break;
- }
- else
- return 0;
- else
- {
- idx += tree._vecLen - i; // skip
- hack = docontinue;
- break;
- }
- }
-
- if (hack == dobreak)
- break;
- else if (hack == docontinue)
- continue;
-
- if (i == tree._vecLen) // eq vec found
- if ((_data[++idx]&0xFF) >= prefLen)
- {
- start = _data[idx++]&0xFF;
- break;
- }
- else
- return 0;
- }
- else if (_data[idx] < nBytesEq) // drop
- {
- std::cout << idx << std::endl;
- nBytesEq = (_data[idx++]);
- std::cout << nBytesEq << std::endl;
- if (nBytesEq < commLen)
- return 0;
- else if (lo[nBytesEq] < (_data[idx]&0xFF))
- if (hi[nBytesEq] < (_data[idx]&0xFF))
- return 0;
- else
- {
- start = nBytesEq; // found
- break;
- }
- else
- idx += tree._vecLen - nBytesEq;
- }
- else if ((_data[idx]&0xFF) == 0xFF)
- return 0;
- else // compression is bigger
- idx += tree._vecLen + 1 - _data[idx];
- }
-
- int length = std::min(size - start, _free - idx);
- buffer[0] = static_cast<unsigned char>(start);
- memcpy(&(buffer[1]), &(_data[idx]), length);
- buffer[length + 1] = 0;
- return length + 1;
- }
-protected:
- bool searchLeafBlock(const std::vector<unsigned char> &key, const VectorBtree &tree)
- {
-#if 0
- processLeafBlock(_printer);
-#endif
- int nBytesEq = 0;
- for (int idx = 0;; idx += tree._vecLen + 1 - _data[idx])
- {
- if (_data[idx] == nBytesEq)
- {
- int i, j;
- outerbreak hack(donothing);
- for (i = _data[idx], j = idx + 1; i < tree._vecLen; i++, j++)
- {
- if (key[i] == _data[j])
- ++nBytesEq;
- else if ((key[i]&0xFF) < (_data[j]&0xFF))
- return false;
- else /* key[i] > _data[j] */
- {
- hack = dobreak;
- break;
- }
- }
-
- if (hack == dobreak)
- break;
-
- if (i == tree._vecLen) /* or nBytesEq == _vecLen */
- return true; /* equal vector found */
- }
- else if (_data[idx] < nBytesEq)
- return false;
- }
- return false;
- }
-public:
- bool processLeafBlock(VectorProcessor &processor, const VectorBtree &tree)
- {
- std::vector<unsigned char> &buffer = processor.getVectorBuffer();
- for (int ix = 0; ix < _free; ix += tree._vecLen - _data[ix] + 1)
- {
- // cmc: the below line was a comment in the original java, somewhere along
- // the line I suspect this was written in c++, then into java
- // and now I'm putting it back to c++ :-(
- // ::memcpy(&buffer[_data[ix]], &_data[ix + 1], _vecLen - _data[ix]);
- memcpy(&(buffer[_data[ix]]), &(_data[ix + 1]), tree._vecLen - _data[ix]);
- if (processor.processVector())
- return true;
- }
- return false;
- }
-}; // VectorBlock
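The leaf layout walked by FindVectorsInLeaf and searchLeafBlock is front coding: each stored vector starts with one byte giving the number of leading bytes it shares with its predecessor, followed by only the differing suffix. A minimal decoder for that idea on byte strings, using a simplified (prefix length, suffix) layout rather than the exact block format:

#include <string>
#include <utility>
#include <vector>

// Each entry is (sharedPrefixLength, suffix); rebuild the full strings in order.
std::vector<std::string> decodeFrontCoded(
    const std::vector< std::pair<int, std::string> > &entries)
{
    std::vector<std::string> result;
    std::string previous;
    for (size_t i = 0; i < entries.size(); ++i)
    {
        // keep the shared prefix of the previous string, append the new suffix
        std::string current = previous.substr(0, entries[i].first) + entries[i].second;
        result.push_back(current);
        previous = current;
    }
    return result;
}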
-
-Block* VectorBlockFactory::makeBlock() const
-{
- return new VectorBlock(_blockSize);
-}
-
-class FullVectorBlock : public VectorBlock
-{
-public:
- FullVectorBlock(int size) : VectorBlock(size) {}
- bool isFull(const VectorBtree &tree) const
- {
- //return pbl->_leaf ? pbl->_free > _leafDataLimit : pbl->_free == _maxEntries;
- return _isLeaf ? _free > tree._leafDataLimit : _free == tree._maxEntries;
- }
-};
-
-class FullVectorBtree : public VectorBtree
-{
-private:
- static int MaxVeclen;
- static double SplitRatio;
-public:
- FullVectorBtree(VectorBtreeParameters* params, bool update);
- bool insertVector(const std::vector<unsigned char> &key);
-private:
- bool treeInsertNonfull(const FullVectorBlock &bl, const std::vector<unsigned char> &key);
- bool treeInsertNonfullRoot(const std::vector<unsigned char> &key);
- FullVectorBlock& getNewBlock();
- void enableModif(const Block &bl);
- void declareModif(const Block &bl);
-public:
- void close() { _blockManager->close(); }
-};
-
-int FullVectorBtree::MaxVeclen = 128;
-double FullVectorBtree::SplitRatio = 0.5;
-
-class FullVectorBlockFactory : public BlockFactory
-{
-private:
- int _blockSize;
-public:
- FullVectorBlockFactory(int blockSize) : _blockSize(blockSize) {}
- Block* makeBlock() const
- {
- return new FullVectorBlock(_blockSize);
- }
-};
-
-FullVectorBtree::FullVectorBtree(VectorBtreeParameters *params, bool update)
-{
- _params = params;
- _vecLen = params->getVectorLength();
- _blockSize = params->getBlockSize();
- _blockManager = new BlockManager(params, update, new FullVectorBlockFactory(_blockSize));
- _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN);
- // System.out.println("_maxEntries = " + _maxEntries);
- if ((_maxEntries & 1) == 0) // needs to be odd
- _maxEntries--;
- _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN;
- _vectorsOffset = (_maxEntries + 1)*Block::IDLEN;
- _root = &(accessBlock(params->getRootPosition()));
-}
-
-class CompressorIterator
-{
-public:
- virtual void value(int value) = 0;
- virtual ~CompressorIterator() {}
-};
-
-int Decompressor::countZeroes()
-{
- for (int count = 0;; _readByte = getNextByte(), _toRead = BitsInByte)
- {
- HCDBG(fprintf(stderr, "count is %d\n", count));
- HCDBG(fprintf(stderr, "Decompressor::countZeroes is %x\n", _readByte));
- HCDBG(fprintf(stderr, "_toRead is %d\n", _toRead));
- HCDBG(fprintf(stderr, "_readByte is %x\n", _readByte));
- while (_toRead-- > 0)
- {
- if ((_readByte & (1 << _toRead)) != 0)
- {
- HCDBG(fprintf(stderr, "returning count of %d\n", count));
- return count;
- }
- else
- {
- ++count;
- HCDBG(fprintf(stderr, "int count to %d\n", count));
- }
- }
- }
- //return 0;
-}
-
-// reads 1 bit; returns non-0 for bit "1"
-int Decompressor::read()
-{
- if (_toRead-- > 0)
- return _readByte & (1 << _toRead);
- else
- { // get next word
- _toRead = BitsInByte - 1;
- return (_readByte = getNextByte()) & 0x80;
- }
-}
-
-int Decompressor::read(int kBits)
-{
- int shift = BitsInByte - _toRead;
- if (kBits <= _toRead)
- {
- HCDBG(fprintf(stderr, "leg 1\n"));
- return ((_readByte<<shift) & 0xFF) >> (shift + (_toRead-=kBits));
- }
- else
- {
- HCDBG(fprintf(stderr, "leg 2 _readByte is %d, shift %d\n", _readByte, shift));
- int result = _toRead > 0 ? ((_readByte << shift) & 0xFF) >> shift : 0;
- HCDBG(fprintf(stderr, "result is %d\n", result));
- for (kBits -= _toRead; kBits >= BitsInByte; kBits -= BitsInByte)
- {
- int foo = getNextByte();
- HCDBG(fprintf(stderr, "byte is %d\n", foo));
- result = (result << BitsInByte) | foo;
- HCDBG(fprintf(stderr, "and result is %d\n", result));
- }
- if (kBits > 0)
- {
- int foo = getNextByte();
- HCDBG(fprintf(stderr, "and byte is %d\n", foo));
- int thing = BitsInByte - kBits;
- HCDBG(fprintf(stderr, "thing is %d\n", thing));
- _toRead = thing;
- _readByte = foo;
- int right = (_readByte >> _toRead);
- HCDBG(fprintf(stderr, "right is %d\n", right));
- int left = result << kBits;
- HCDBG(fprintf(stderr, "kbits are %d\n", kBits));
- HCDBG(fprintf(stderr, "left is %d\n", left));
- int ret = left | right;
-// int ret = (result << kBits) | ((_readByte = foo) >> (_toRead = BitsInByte - kBits));
- HCDBG(fprintf(stderr, "and final is %d\n", ret));
- return ret;
- }
- else
- {
- _toRead = 0;
- HCDBG(fprintf(stderr, "and this result says %d\n", result));
- return result;
- }
- }
-}
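read(kBits) pulls an arbitrary number of bits, most significant first, out of a byte stream, stitching values together across byte boundaries. The same mechanism in isolation, as a hypothetical standalone reader rather than the Decompressor's internal state machine:

#include <vector>

// Read `count` bits (MSB first) starting at bit offset `bitPos` and advance the offset.
int readBits(const std::vector<unsigned char> &data, size_t &bitPos, int count)
{
    int result = 0;
    for (int i = 0; i < count; ++i, ++bitPos)
    {
        unsigned char byte = data[bitPos >> 3];          // which byte the bit lives in
        int bit = (byte >> (7 - (bitPos & 7))) & 1;      // which bit inside it, MSB first
        result = (result << 1) | bit;
    }
    return result;
}

For example, with data = {0xB4} (binary 1011 0100), reading 3 bits from offset 0 yields 5 (binary 101), and the offset advances to 3.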
-
-bool Decompressor::readNext(int k, CompressorIterator &it)
-{
- if (read() != 0)
- {
- it.value(_path | read(k));
- return true;
- }
- else
- {
- for (int count = 1;; _readByte = getNextByte(), _toRead = BitsInByte)
- {
- while (_toRead-- > 0)
- {
- if ((_readByte & (1 << _toRead)) != 0)
- {
- int saved = _path;
- _path = ((_path >> (k + count) << count) | read(count)) << k;
- if (_path != saved)
- {
- it.value(_path | read(k));
- return true;
- }
- else
- {
- return false;
- }
- }
- else
- {
- ++count;
- }
- }
- }
- }
-}
-
-void Decompressor::decode(int k, IntegerArray &array)
-{
- for (int path = 0;;)
- {
- if (read() != 0)
- {
- array.push_back(path | read(k));
- }
- else
- {
- int count = countZeroes() + 1;
- int saved = path;
- path = ((path >> (k + count) << count) | read(count)) << k;
- if (path != saved) // convention for end
- array.push_back(path | read(k));
- else
- break;
- }
- }
-}
-
-void Decompressor::ascDecode(int k, IntegerArray &array)
-{
- for (int path = 0, start = 0;;)
- {
- HCDBG(fprintf(stderr, "path is %d, start is %d\n", path, start));
- if (read() != 0)
- {
- int inread = read(k);
- start += path | inread;
- HCDBG(fprintf(stderr, "inread is %d\n", inread));
- int final = start;
- HCDBG(fprintf(stderr, "1:Decompressor::ascDecode to %d\n", final));
- array.push_back(final);
- }
- else
- {
- int count = countZeroes() + 1;
- HCDBG(fprintf(stderr, "count is %d\n", count));
- int saved = path;
- int inread = read(count);
- HCDBG(fprintf(stderr, "inread is %d, k is %d, path is %d\n", inread,
- k, path));
- path = ((path >> (k + count) << count) | inread) << k;
- if (path != saved) // convention for end
- {
- int anotherread = read(k);
- HCDBG(fprintf(stderr, "newinread is %d\n", anotherread));
- start += path | anotherread;
- int final = start;
- HCDBG(fprintf(stderr, "2:Decompressor::ascDecode to %d\n", final));
- array.push_back(final);
- }
- else
- {
- break;
- }
- }
- }
-}
-
-int Decompressor::ascendingDecode(int k, int start, std::vector<int> &array)
-{
- int path = 0, index = 0;
- while (true)
- {
- if (read() != 0)
- array[index++] = (start += path | read(k));
- else
- {
- outerbreak hack = donothing;
- for (int cnt = 0;; _readByte = getNextByte(), _toRead = BitsInByte)
- {
- while (_toRead-- > 0)
- {
- if ((_readByte & (1 << _toRead)) != 0)
- {
- ++cnt;
- int Path = ((path >> (k + cnt) << cnt) | read(cnt)) << k;
- if (Path != path)
- {
- array[index++] = (start += (path = Path) | read(k));
- hack = docontinue;
- break;
- }
- else
- return index;
- }
- else
- ++cnt;
- }
- if (hack == docontinue)
- break;
- }
- }
- }
-}
-
-class StreamDecompressor : public Decompressor
-{
-private:
- std::ifstream *_input;
-public:
- StreamDecompressor(std::ifstream &input) { initReading(input); }
- using Decompressor::initReading;
- virtual void initReading(std::ifstream &input) { _input = &input; Decompressor::initReading(); }
- int getNextByte()
- {
- unsigned char ret;
- _input->read( (char*)&ret, 1 );
- HCDBG(fprintf(stderr, "StreamDecompressor::getNextByte of %d\n", ret));
- return ret;
- }
-};
-
-void Index::readPositions()
-{
- getPositionsFile();
- //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt
- _positionsFile->seekg(0, std::ios::end);
- _positionsCacheSize = _positionsFile->tellg();
- if (_positionsCacheSize < 0) _positionsCacheSize = 0;
- _positionsFile->clear();
- _positionsFile->seekg(0, std::ios::beg);
-
- // as written this is always true: the cache was just sized to the whole POSITIONS file,
- // so everything ends up cached (presumably a fixed cache limit was intended originally)
- if (_positionsCacheSize <= _positionsCacheSize)
- {
- _allInCache = true;
- _positions.resize(_positionsCacheSize);
- _positionsFile->readsome((char*)(&_positions[0]), _positionsCacheSize);
- std::cout << "POS fits in cache" << std::endl;
- }
-}
-
-void Index::readOffsetsTables(const std::string &fileName)
-{
- std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
- unsigned char k1;
- in.read( (char*)&k1, 1 );
- StreamDecompressor sddocs(in);
- sddocs.decode(k1, _documents);
- unsigned char k2;
- in.read( (char*)&k2, 1 );
- _microIndexOffsets.clear();
- StreamDecompressor sdoffsets(in);
- sdoffsets.ascDecode(k2, _microIndexOffsets);
- // decompress titles' ids table
- unsigned char k3;
- in.read( (char*)&k3, 1 );
- _titles.clear();
- StreamDecompressor sdtitles(in);
- sdtitles.decode(k3, _titles);
-}
-
-void Index::readDocumentsTable(const std::string &fileName)
-{
- std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary);
- unsigned char k1;
- in.read( (char*)&k1, 1 );
- _concepts.clear();
- StreamDecompressor sddocs(in);
- sddocs.ascDecode(k1, _concepts);
- unsigned char k2;
- in.read( (char*)&k2, 1 );
- _offsets.clear();
- _offsets.push_back(0);
- StreamDecompressor sdoffsets(in);
- sdoffsets.ascDecode(k2, _offsets);
- in.close();
-}
-
-class ContextTables;
-
-class Tables
-{
-private:
- std::vector<int> _initialWordsCached;
- std::vector<int> _destsCached;
- std::vector<int> _linkTypesCached;
- std::vector<int> _seqNumbersCached;
-public:
- Tables(const std::vector<int> &initialWords,
- std::vector<int> &dests,
- std::vector<int> &linkTypes,
- std::vector<int> &seqNumbers)
- {
- _initialWordsCached = initialWords;
- _destsCached = dests;
- _linkTypesCached = linkTypes;
- _seqNumbersCached = seqNumbers;
- }
- void setTables(ContextTables &context);
-}; // end of Tables
-
-class ContextTables
-{
-public:
- std::vector<int> _initialWords;
- std::vector<int> _dests;
- std::vector<int> _linkTypes;
- std::vector<int> _seqNumbers;
- int _nTextNodes;
-private:
- std::vector<Tables*> _cache;
- // cached last position for linear search
- int _initialWordsIndex;
- // link names are shared between all microindexes in an index
- std::vector<std::string> _linkNames;
- // offsets to tables' storage in file (or memory)
- std::vector<int> _offsets;
- std::vector<unsigned char> _contextData; // !!! fully cached for now
- // auxiliary
- IntegerArray _kTable;
- // _auxArray will be used as an auxiliary to decode arrays
- IntegerArray _auxArray;
- int _lastDocNo;
-
- std::vector<int> _markers;
-
-public:
- ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData,
- const std::vector<std::string> &linkNames);
- ~ContextTables();
- void setMicroindex(int docNo);
- int parentContext(int context);
- const std::string& linkName(int context);
- int linkCode(const std::string &linkName);
- std::vector<bool> getIgnoredElementsSet(const std::vector<std::string> &ignoredElements);
- bool notIgnored(int ctx, const std::vector<bool> &ignoredElements);
- int firstParentWithCode(int pos, int linkCode);
- int firstParentWithCode2(int pos, int linkCode, int parentCode);
- int firstParentWithCode3(int pos, int linkCode, int ancestorCode);
- int firstParentWithCode4(int pos, const std::vector<int> &linkCodes);
- int firstParentWithCode5(int pos, const std::vector<int> &pathCodes);
- int firstParentWithCode7(int pos, int linkCode, int seq);
- bool isGoverning(int context) { return linkName(context) == "TITLE"; }
- void resetContextSearch() { _initialWordsIndex = 0; }
-private:
- void appendSegment(int context, std::string &result);
- int findIndexBin(int wordNumber);
-public:
- int wordContextLin(int wordNumber);
-};
-
-ContextTables::ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData,
- const std::vector<std::string> &linkNames) : _kTable(5), _auxArray(4096), _lastDocNo(-1)
-{
- _offsets = offsets;
- _contextData = contextData;
- _linkNames = linkNames;
- _cache.resize(_offsets.size());
-}
-
-ContextTables::~ContextTables()
-{
- for (size_t i = 0; i < _cache.size(); ++i)
- delete _cache[i];
-}
-
-void ContextTables::setMicroindex(int docNo)
-{
- if (docNo != _lastDocNo) // check if we need to do anything
- {
- if (_cache[docNo])
- _cache[docNo]->setTables(*this);
- else
- {
- int offset = _offsets[docNo];
- int k0 = _contextData[offset] & 0xFF;
- ByteArrayDecompressor compr(&_contextData, offset + 1);
- _kTable.clear();
- compr.decode(k0, _kTable);
- // decompress initialWords into auxiliary array
- _auxArray.clear();
- compr.ascDecode(_kTable[0], _auxArray); // _initialWords
- _initialWords = _auxArray;
- _nTextNodes = _initialWords.size();
- // decompress destinations into auxiliary array
- _auxArray.clear();
- compr.decode(_kTable[1], _auxArray); // _dests
- _auxArray.push_back(-1); // sentinel, root
- _dests = _auxArray;
- _linkTypes.clear();
- compr.decode(_kTable[2], _linkTypes);
- _seqNumbers.clear();
- compr.decode(_kTable[3], _seqNumbers);
-
- _cache[docNo] = new Tables(_initialWords, _dests, _linkTypes, _seqNumbers);
-
- /*
- System.out.println("|_initialWords| = " + _nTextNodes);
- System.out.println("|_dests| -1 = " + (_dests.length - 1));
- System.out.println("|_seqNumbers| = " + _seqNumbers.length);
- System.out.println("|_linkTypes| = " + _linkTypes.length);
- */
- }
- _lastDocNo = docNo;
- _markers.resize(_dests.size());
- }
- _initialWordsIndex = 0;
-}
-
-int ContextTables::parentContext(int context)
-{
- return _dests[context];
-}
-
-const std::string& ContextTables::linkName(int context)
-{
- return _linkNames[_linkTypes[context]];
-}
-
-int ContextTables::linkCode(const std::string &inlinkName)
-{
- for (size_t i = 0; i < _linkNames.size(); i++)
- if (inlinkName == _linkNames[i])
- return i;
- return -1; // when not found
-}
-
-std::vector<bool> ContextTables::getIgnoredElementsSet(const std::vector<std::string> &ignoredElements)
-{
- std::vector<bool> result;
- bool noValidIgnoredElements = true;
- if (!ignoredElements.empty())
- {
- result.resize(_linkNames.size());
- for (size_t i = 0; i < ignoredElements.size(); i++)
- {
- int code = linkCode(ignoredElements[i]);
- if (code > -1)
- {
- result[code] = true;
- noValidIgnoredElements = false;
- }
- }
- }
- return noValidIgnoredElements ? std::vector<bool>() : result;
-}
-
-bool ContextTables::notIgnored(int ctx, const std::vector<bool> &ignoredElements)
-{
- do
- {
- if (ignoredElements[_linkTypes[ctx]])
- {
- std::cout << "hit ignored" << std::endl;
- return false;
- }
- }
- while ((ctx = _dests[ctx]) > -1); // parentContext 'hand inlined'
- return true;
-}
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with the given linkCode */
-int ContextTables::firstParentWithCode(int pos, int inlinkCode)
-{
- int ctx = _dests[wordContextLin(pos)]; // first parent of text node
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- while (_linkTypes[ctx - shift] != inlinkCode)
- if ((ctx = _dests[ctx]) == limit)
- return -1;
- return ctx;
-}
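All the firstParentWithCode variants climb the same structure: _dests is a parent-pointer array indexed by context number, and following it repeatedly walks from a text node's parent towards the root sentinel. Stripped of the text-node shift and sequence-number checks, the walk reduces to this sketch (hypothetical names):

#include <vector>

// Climb parent pointers until a node with the wanted type is found,
// or the root sentinel (stored in the last slot) is reached.
int findAncestorWithType(const std::vector<int> &parents,
                         const std::vector<int> &types,
                         int node, int wantedType)
{
    int sentinel = (int)parents.size() - 1;
    while (node != sentinel)
    {
        if (types[node] == wantedType)
            return node;
        node = parents[node];
    }
    return -1; // no ancestor of that type
}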
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with the given linkCode and given parent code */
-int ContextTables::firstParentWithCode2(int pos, int inlinkCode, int parentCode)
-{
- int ctx = _dests[wordContextLin(pos)]; // first parent of text node
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent])
- if (_linkTypes[parent - shift] == parentCode && _linkTypes[ctx - shift] == inlinkCode)
- return ctx;
- else
- ctx = parent;
- return -1;
-}
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with the given linkCode and given ancestor code */
-int ContextTables::firstParentWithCode3(int pos, int inlinkCode, int ancestorCode)
-{
- int ctx = _dests[wordContextLin(pos)];
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- // find first instance of linkCode
- while (ctx < limit && _linkTypes[ctx - shift] != inlinkCode)
- ctx = _dests[ctx];
- if (ctx < limit) // found linkCode, check ancestry
- for (int ancestor = _dests[ctx];
- ancestor < limit;
- ancestor = _dests[ancestor])
- if (_linkTypes[ancestor - shift] == ancestorCode) // ancestor confirmed
- return ctx; // match found, return successful ctx
- return -1; // match NOT found
-}
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with any of the given linkCodes */
-int ContextTables::firstParentWithCode4(int pos, const std::vector<int> &linkCodes)
-{
- int nCodes = linkCodes.size();
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- for (int ctx = _dests[wordContextLin(pos)]; ctx < limit; ctx = _dests[ctx])
- {
- int code = _linkTypes[ctx - shift];
- for (int i = 0; i < nCodes; i++)
- if (code == linkCodes[i])
- return ctx;
- }
- return -1;
-}
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with the given path */
-int ContextTables::firstParentWithCode5(int pos, const std::vector<int> &pathCodes)
-{
- int nCodes = pathCodes.size();
- int lastCode = pathCodes[nCodes - 1];
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- int ctx = _dests[wordContextLin(pos)];
- for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent])
- {
- if (_linkTypes[ctx - shift] == lastCode)
- {
- // try to match the entire path
- outerbreak hack = donothing;
- for (int i = nCodes - 2, parent2 = parent; i >= 0; i--)
- if (_linkTypes[parent2 - shift] != pathCodes[i]) // match failure
- {
- hack = docontinue;
- break; // try to match higher
- }
- else if ((parent2 = _dests[parent2]) == limit)
- return -1;
- if (hack == docontinue)
- continue;
- return ctx;
- }
- else
- ctx = parent;
- }
- return -1;
-}
-
-/** starting with ctx and going up the ancestry tree look for the first
- context with the given linkCode and sequence number */
-int ContextTables::firstParentWithCode7(int pos, int inlinkCode, int seq)
-{
- int ctx = _dests[wordContextLin(pos)]; // first parent of text node
- int shift = _nTextNodes;
- int limit = _dests.size() - 1;
- while (_linkTypes[ctx - shift] != inlinkCode || _seqNumbers[ctx] != seq)
- if ((ctx = _dests[ctx]) == limit)
- return -1;
- return ctx;
-}
-
-void ContextTables::appendSegment(int context, std::string &result)
-{
- result.append(context < _nTextNodes ? "text()" : _linkNames[_linkTypes[context - _nTextNodes]]);
- result.push_back('[');
- std::ostringstream tmp;
- tmp << _seqNumbers[context];
- result.append(tmp.str());
- result.append("]/");
-}
-
-int ContextTables::findIndexBin(int wordNumber)
-{
- int i = 0, j = _nTextNodes - 1;
- while (i <= j)
- {
- int k = (i + j) >> 1;
- if (_initialWords[k] < wordNumber)
- i = k + 1;
- else if (_initialWords[k] > wordNumber)
- j = k - 1;
- else
- return k;
- }
- return i - 1;
-}
-
-int ContextTables::wordContextLin(int wordNumber)
-{
- for (int i = _initialWordsIndex; i < _nTextNodes; i++)
- if (_initialWords[i] > wordNumber) // first such i
- {
- // - 1 if wordNumbers can be the same
- _initialWordsIndex = i; // cached to speed up next search
- return i - 1;
- }
- return _nTextNodes - 1;
-}
-
-void Tables::setTables(ContextTables &context)
-{
- context._initialWords = _initialWordsCached;
- context._dests = _destsCached;
- context._linkTypes = _linkTypesCached;
- context._seqNumbers = _seqNumbersCached;
- context._nTextNodes = context._initialWords.size();
-}
-
-class Compressor;
-
-class XmlIndex : public Index
-{
-private:
- VectorBtreeParameters *_edgesParams;
- FullVectorBtree *_edges;
- ContextTables *_contextTables;
- std::fstream *_contextsFile;
- IntegerArray _contextsOffsets;
- std::vector<unsigned char> _contextsData;
- std::vector<std::string> _linkNames;
-protected:
- virtual void writeOutOffsets();
-public:
- XmlIndex(const fs::path &index, bool update)
- : Index(index, update), _edgesParams(0), _edges(0), _contextTables(0), _contextsFile(0) {}
- void init();
- void close();
- virtual ~XmlIndex() { delete _edgesParams; delete _edges; delete _contextTables; }
- std::fstream& getContextsFile();
- using Index::compress;
- virtual void compress(int docID, int titleID,
- std::vector<ConceptLocation> &locations,
- std::vector<ConceptLocation> &extents,
- int k, const Compressor &contextTables);
- const std::vector<std::string>& getLinkNames() { return _linkNames; }
-};
-
-void XmlIndex::init()
-{
- Index::init();
- if (_edgesParams) delete _edgesParams;
- _edgesParams = new VectorBtreeParameters(*_schema, "EDGE", 9);
- if (_edgesParams->readState() == false)
- _edgesParams->setBlockSize(1024);
- _edges = new FullVectorBtree(_edgesParams, _update);
- if (!_contextsOffsets.empty())
- {
- _contextsData = readByteArray("CONTEXTS");
-#if 0
- _linkNames = (String[])readObject("LINKNAMES");
-#endif
- _contextTables = new ContextTables(_contextsOffsets, _contextsData, _linkNames);
- }
-}
-
-void XmlIndex::writeOutOffsets()
-{
- Index::writeOutOffsets();
- if (!_contextsOffsets.empty())
- {
- std::fstream &out = getOffsetsFile();
- Compressor offsets2;
- char k = static_cast<char>(offsets2.compressAscending(_contextsOffsets));
- out.write( (const char*)&k, 1 );
- offsets2.write(out);
- }
-}
-
-std::fstream& XmlIndex::getContextsFile()
-{
- if (!_contextsFile)
- _contextsFile = getRAF("CONTEXTS", _update);
- return *_contextsFile;
-}
-
-void XmlIndex::close()
-{
- if (_contextsFile)
- {
- _contextsFile->close();
- delete _contextsFile;
- _contextsFile = 0;
- }
- _edges->close();
- if (_update)
- _edgesParams->updateSchema();
- Index::close();
-}
-
-class Tokenizer
-{
-private:
- UnicodeString s;
- BreakIterator *bi;
- int32_t start;
- UConverter *utf8;
- std::vector<char> utfbuffer;
-public:
- Tokenizer();
- ~Tokenizer();
- void setText(const xmlChar *text);
- std::string nextToken();
-};
-
-Tokenizer::Tokenizer() : start(BreakIterator::DONE), utfbuffer(64)
-{
- UErrorCode status = U_ZERO_ERROR;
- bi = BreakIterator::createWordInstance("en_US", status);
- utf8 = ucnv_open("utf-8", &status);
-}
-
-Tokenizer::~Tokenizer()
-{
-#if !defined(SOLARIS)
- delete bi;
- ucnv_close(utf8);
-#endif
-}
-
-void Tokenizer::setText(const xmlChar *text)
-{
- UErrorCode status = U_ZERO_ERROR;
- s = UnicodeString((const char*)text, -1, utf8, status);
- bi->setText(s);
- start = ubrk_first(bi);
-}
-
-std::string Tokenizer::nextToken()
-{
- std::string ret;
-
- int32_t end = ubrk_next(bi);
- while (end != BreakIterator::DONE)
- {
- if (ubrk_getRuleStatus(bi) != UBRK_WORD_NONE)
- break;
- start = end;
- end = ubrk_next(bi);
- }
-
- if (end != -1 && end != start)
- {
- UnicodeString token(s, start, end-start);
- token = token.toLower();
- size_t needed = 0;
-
- UErrorCode status = U_ZERO_ERROR;
- while ((needed = token.extract(&utfbuffer[0], utfbuffer.size(), utf8, status)) > utfbuffer.size())
- utfbuffer.resize(utfbuffer.size() * 2);
-
- ret = std::string(&utfbuffer[0], needed);
- start = end;
- }
-
- return ret;
-}
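The Tokenizer is a thin wrapper over ICU word breaking: iterate the break positions and keep only the ranges ICU classifies as words, lower-cased. A minimal sketch against ICU's C++ API, assuming a reasonably recent ICU where BreakIterator exposes getRuleStatus() and UnicodeString provides fromUTF8(); error handling is kept to the bare minimum:

#include <unicode/brkiter.h>
#include <unicode/ubrk.h>
#include <unicode/unistr.h>
#include <vector>

// Collect lower-cased word tokens from a UTF-8 string.
std::vector<UnicodeString> tokenizeWords(const char *utf8Text)
{
    std::vector<UnicodeString> tokens;
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *bi = BreakIterator::createWordInstance(Locale::getUS(), status);
    if (U_FAILURE(status))
        return tokens;
    UnicodeString text = UnicodeString::fromUTF8(utf8Text);
    bi->setText(text);
    int32_t start = bi->first();
    for (int32_t end = bi->next(); end != BreakIterator::DONE; start = end, end = bi->next())
    {
        if (bi->getRuleStatus() != UBRK_WORD_NONE)      // skip spaces and punctuation
        {
            UnicodeString token(text, start, end - start);
            tokens.push_back(token.toLower());
        }
    }
    delete bi;
    return tokens;
}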
-
-typedef std::vector<xmlNodePtr> Vector;
-
-ConceptLocation::ConceptLocation(int conceptID, int begin, int end) :
- _concept(conceptID), _begin(begin), _end(end)
-{
-}
-
-#ifdef EMULATEORIGINALSORT
-class ConceptLocationSorter
-{
-public:
- virtual bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) = 0;
-private:
- // part of quicksearch
- int partition(std::vector<ConceptLocation> &array, int p, int r)
- {
- ConceptLocation x = array[(p + r)/2];
- int i = p - 1, j = r + 1;
- while (true)
- {
- while (smallerThan(x, array[--j]))
- ;
- while (smallerThan(array[++i], x))
- ;
- if (i < j)
- {
- ConceptLocation t = array[i];
- array[i] = array[j];
- array[j] = t;
- }
- else
- return j;
- }
- }
-public:
- void quicksort(std::vector<ConceptLocation> &array, int p, int r)
- {
- while (p < r)
- {
- int q = partition(array, p, r);
- quicksort(array, p, q);
- p = q + 1;
- }
- }
-};
-
-class ConceptSorter : public ConceptLocationSorter
-{
-public:
- bool smallerThan(const ConceptLocation &a, const ConceptLocation &b)
- {
- return a._concept < b._concept;
- }
-};
-
-class PositionSorter : public ConceptLocationSorter
-{
-public:
- bool smallerThan(const ConceptLocation &a, const ConceptLocation &b)
- {
- return a._begin < b._begin || (a._begin == b._begin && a._end < b._end);
- }
-};
-
-#else
-
-class ConceptSorter
-{
-public:
- bool operator()(const ConceptLocation &a, const ConceptLocation &b) const
- {
- return a._concept < b._concept;
- }
-};
-
-class PositionSorter
-{
-public:
- bool operator()(const ConceptLocation &a, const ConceptLocation &b) const
- {
- return a._begin < b._begin || (a._begin == b._begin && a._end < b._end);
- }
-};
-
-#endif
-
-void ConceptLocation::sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2)
-{
-#ifdef EMULATEORIGINALSORT
- PositionSorter _pComp;
- _pComp.quicksort(array, i1, i2 - 1);
-#else
- std::vector<ConceptLocation>::iterator begin = array.begin();
- std::vector<ConceptLocation>::iterator end = begin;
- std::advance(begin, i1);
- std::advance(end, i2);
- std::sort(begin, end, PositionSorter());
-#endif
-}
-
-void ConceptLocation::sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2)
-{
-#ifdef EMULATEORIGINALSORT
- ConceptSorter _cComp;
- _cComp.quicksort(array, i1, i2 - 1);
-#else
- std::vector<ConceptLocation>::iterator begin = array.begin();
- std::vector<ConceptLocation>::iterator end = begin;
- std::advance(begin, i1);
- std::advance(end, i2);
- std::sort(begin, end, ConceptSorter());
-#endif
-}
-
-typedef std::map<xmlNodePtr, int> NodeHashtable;
-typedef std::hash_map<std::string, int, pref_hash> LinkHashTable;
-
-class IndexAdapter
-{
-private:
- static int StackSize;
- const char* _indexText_Name;
- const char* _indexElement_Name;
- const char* _indexAttribute_Name;
- const char* _nodeID_Name;
- const char* _tokenizer_Name;
- const char* _attributeName_Name;
- std::vector<bool> _indexOnOffStack;
- int _sp;
- int _tsp;
- std::vector< std::string > _attributeStack;
- xmlNodePtr _currentNode;
- int _attrSP;
- void storeLocation(const std::string &token, int number);
- void storeLocation(const std::string &token) { storeLocation(token, _lastWordNumber++); }
- void storeEdge(int relation, int seqNumber, int destination);
-
- void startElement(xmlNodePtr node);
- void attribute(const char *name, const char *value);
- void characters(const xmlChar *str) throw( HelpProcessingException );
- void endElement(xmlNodePtr node);
-
- void indexText(const xmlChar *str);
-
- Vector _textNodes;
- NodeHashtable _numberedNodes;
-public:
- HashSet _stoplist;
- LinkHashTable _linkCodes;
- std::vector<std::string> _linknames;
- static int CurrenMaxLinkCode;
- std::vector<ConceptLocation> _locations;
- int _availContextNumber;
- IntegerArray _initialWords;
- IntegerArray _links;
- IntegerArray _dests;
- IntegerArray _seqNumbers;
- int _lastWordNumber;
- int _firstWord;
- bool _anyLocationsStored;
- XmlIndex *_index;
-private:
- static int InitSize;
- int _size;
-public:
- IndexAdapter();
- void process(xmlNodePtr node, xmlDocPtr doc);
- void init();
- void finish();
- int intern(const std::string &name) { return _index->intern(name); }
- int getLinkCode(const std::string &linkName);
-};
-
-int IndexAdapter::StackSize = 64;
-int IndexAdapter::InitSize = 4096;
-int IndexAdapter::CurrenMaxLinkCode = 0;
-
-IndexAdapter::IndexAdapter()
- : _indexOnOffStack(StackSize), _attributeStack(StackSize),
- _anyLocationsStored(false), _size(InitSize)
-{
- _indexText_Name = "text";
- _indexElement_Name = "element";
- _indexAttribute_Name = "attribute";
- _nodeID_Name = "nodeID";
- _tokenizer_Name = "tokenizer";
- _attributeName_Name = "attributeName";
-}
-
-void IndexAdapter::storeLocation(const std::string &token, int number)
-{
- int concept = intern(token);
- HCDBG(std::cerr << "storeLocation of number " << number << "for token "
- << token << " as conceptlocation " << concept << std::endl);
- _locations.push_back(ConceptLocation(concept, number, number));
-}
-
-void IndexAdapter::storeEdge(int relation, int seqNumber, int destination)
-{
- _links.push_back(relation);
- _seqNumbers.push_back(seqNumber);
- _dests.push_back(destination);
- HCDBG(std::cerr << "storeEdge" << std::endl);
-}
-
-void IndexAdapter::finish()
-{
- _numberedNodes.clear();
- _dests.clear();
- _seqNumbers.clear();
- _links.clear();
-
- int nTextNodes = _textNodes.size();
- _availContextNumber = nTextNodes;
- // vector to hold parents of text nodes
- Vector parents;
- /*****
- for each of the text nodes its sequence number is stored
- as well as the index of its parent (in _dests)
- _link is not stored as it is always "text()"
- _availContextNumber only used to number parent element contexts
- ******/
- for (int i = 0; i < nTextNodes; i++)
- {
- xmlNodePtr node = _textNodes[i];
- xmlNodePtr parent = node->parent;
- // find this text node's seq number
- int counter = 1;
- xmlNodePtr sibling = parent->xmlChildrenNode;
- while (sibling && sibling != node)
- {
- if (xmlNodeIsText(sibling))
- ++counter;
- sibling = sibling->next;
- }
- _seqNumbers.push_back(counter);
- // check whether parent already encountered
- NodeHashtable::const_iterator number = _numberedNodes.find(parent);
- if (number == _numberedNodes.end()) // not yet seen
- {
- int newContext = _availContextNumber++;
- _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext;
- _dests.push_back(newContext);
- // enqueue parent: its parent will need a number too
- parents.push_back(parent);
- // System.out.println(parent.getName().toString() +
- // " -> " + newContext);
- }
- else
- {
- _dests.push_back(number->second);
- }
- } // end for
-
- _textNodes.clear();
-
- // store info about element ancestry of the above text nodes
- // grandparents are added to the end of the vector
- int rootElementPos = 0;
- for (size_t i = 0; i < parents.size(); i++)
- {
- xmlNodePtr node = parents[i];
-
- std::string name((const char*)(node->name));
-
- xmlNodePtr parent = node->parent;
-
- _links.push_back(getLinkCode(name));
-
-// if (parent.getType() == Node.ELEMENT) // not ROOT
- if (parent && parent->parent) // not ROOT
- {
- // find sequence number
- xmlNodePtr sibling = parent->xmlChildrenNode;
- int counter = 1;
- while (sibling && sibling != node)
- {
- if (strcmp((const char*)sibling->name, (const char*)name.c_str()) == 0)
- ++counter;
- sibling = sibling->next;
- }
-
- _seqNumbers.push_back(counter);
-
- // check whether parent already known
- NodeHashtable::iterator number = _numberedNodes.find(parent);
- if (number == _numberedNodes.end())
- {
- int newContext = _availContextNumber++;
- _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext;
- _dests.push_back(newContext);
- // enqueue parent: its parent will need a number too
- parents.push_back(parent);
- //System.out.println(parent.getName().toString() +
- // " -> " + newContext);
- }
- else
- {
- _dests.push_back(number->second);
- }
- }
- else
- {
- _dests.push_back(0); // placeholder
- _seqNumbers.push_back(1);
- rootElementPos = i + nTextNodes;
- // System.out.println("rootElementPos = " + i);
- }
- } // end for
-
- if (_dests.empty())
- _dests.push_back(0);
-
- // index to sentinel
- _dests[rootElementPos] = _availContextNumber;
-} // end public void finish
-
-void IndexAdapter::init()
-{
- _sp = -1;
- _tsp = -1;
- _attrSP = -1;
- _lastWordNumber = 0;
- _anyLocationsStored = false;
- _availContextNumber = 0;
- // all the contexts' tables
- _initialWords.clear();
- _locations.clear();
-}
-
-void IndexAdapter::attribute(const char *name, const char *value)
-{
- HCDBG(std::cerr << "attribute: " << name << " = " << value << std::endl);
- if (strcmp(name, _nodeID_Name) == 0)
- _currentNode = (xmlNodePtr)(strtol(value, NULL, 10));
- else if (strcmp(name, _tokenizer_Name) == 0)
- {
- if (strcmp(value, "com.sun.xmlsearch.util.SimpleTokenizer") != 0 && !isExtensionMode() )
- std::cerr << "changing tokenizers not implemented in C++ version of HelpLinker"
- << " because no other tokenizers were referenced in the helpcontent2 source"
- << std::endl;
- }
- else if (strcmp(name, _attributeName_Name) == 0)
- {
- //namespace prefix ?
- std::string attrVal = std::string("index:") + value;
- if( !isExtensionMode() )
- std::cout << "attrVal = " << attrVal << std::endl;
- _attributeStack[_attrSP] = std::string(name) + '<' + value + '<' + attrVal;
- storeLocation("+<" + _attributeStack[_attrSP]);
- }
-}
-
-void IndexAdapter::indexText(const xmlChar *text)
-{
- static Tokenizer tokenizer;
- tokenizer.setText(text);
- _firstWord = _lastWordNumber;
- _anyLocationsStored = false;
-
- std::string lowercaseToken = tokenizer.nextToken();
- while (!lowercaseToken.empty())
- {
- HCDBG(std::cerr << "token is: " << lowercaseToken << std::endl);
-#ifdef EMULATEORIGINAL
- if ((lowercaseToken.size() == 1) && isdigit(lowercaseToken[0]))
- {
- lowercaseToken = tokenizer.nextToken();
- continue;
- }
-#endif
- if (std::find(_stoplist.begin(),
- _stoplist.end(), lowercaseToken) == _stoplist.end())
- {
- storeLocation(lowercaseToken);
- _anyLocationsStored = true;
- }
- else
- _lastWordNumber++;
- lowercaseToken = tokenizer.nextToken();
- }
-
- if (_anyLocationsStored && _firstWord > -1)
- {
- _initialWords.push_back(_firstWord);
- HCDBG(std::cerr << "appending " << _firstWord << std::endl);
- _textNodes.push_back(_currentNode);
- }
- // reset before next batch
- _firstWord = -1;
-}
-
-void IndexAdapter::characters(const xmlChar *str) throw( HelpProcessingException )
-{
- if (!str)
- {
- std::stringstream aStrStream;
- aStrStream << "no characters!" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
-
- HCDBG(std::cerr << "IndexAdapter::characters of " << str << std::endl);
- HCDBG(std::cerr << _sp << " : " << _indexOnOffStack[_sp] << std::endl);
-
- if (_sp >= 0 && _indexOnOffStack[_sp])
- {
- indexText( str );
- }
-}
-
-void IndexAdapter::startElement(xmlNodePtr node)
-{
- const char *name = (const char*)(node->name);
-
- HCDBG(std::cerr << "startElement is " << name << std::endl);
-
- if (strcmp(name, _indexElement_Name) == 0)
- {
- _indexOnOffStack[++_sp] = true;
- // pop Tokenizer stack
- // following attribute can push selected Tokenizer
- if (_tsp != -1)
- _tsp--;
- }
- else if (strcmp(name, _indexText_Name) == 0)
- {
- }
- else if (strcmp(name, _indexAttribute_Name) == 0)
- {
- _attrSP++;
- }
-}
-
-void IndexAdapter::endElement(xmlNodePtr node)
-{
- const char *name = (const char*)(node->name);
- HCDBG(std::cerr << "endElement is " << name << std::endl);
- if (strcmp(name, _indexElement_Name) == 0)
- _sp--;
- else if (strcmp(name, _indexText_Name) == 0)
- {
- // reset
- }
- else if (strcmp(name, _indexAttribute_Name) == 0)
- storeLocation("-<" + _attributeStack[_attrSP--]);
-}
-
-int IndexAdapter::getLinkCode(const std::string &linkName)
-{
- LinkHashTable::iterator code = _linkCodes.find(linkName);
- if (code != _linkCodes.end())
- return code->second;
- else
- {
- _linknames.push_back(linkName);
- int newCode = CurrenMaxLinkCode++;
- _linkCodes.insert(LinkHashTable::value_type(linkName, newCode)).first->second = newCode;
- return newCode;
- }
-}
-
-void IndexAdapter::process(xmlNodePtr node, xmlDocPtr doc)
-{
- startElement(node);
-
- for (xmlAttrPtr attr = node->properties; attr; attr = attr->next)
- {
- xmlChar *value = xmlNodeListGetString(doc, attr->children, 0);
- attribute((const char*)(attr->name), (const char*)value);
- xmlFree(value);
- }
-
- if (xmlNodeIsText(node))
- {
- xmlChar *str = xmlNodeListGetString(doc, node, 1);
- characters(str);
- xmlFree(str);
- }
-
- for (xmlNodePtr test = node->xmlChildrenNode; test; test = test->next)
- process(test, doc);
-
- endElement(node);
-}
-
-class XmlIndexBuilder
-{
-private:
- fs::path _transformLocation;
- xsltStylesheetPtr _indexingTransform;
- IndexAdapter _indexAdapter;
- int _currentDocID;
- void reset();
- xsltStylesheetPtr getTransform(const std::string &stylesheetName);
-public:
- XmlIndexBuilder() : _indexingTransform(0) {}
- XmlIndexBuilder(const fs::path &dir);
- ~XmlIndexBuilder();
- void clearIndex();
- void setTransformLocation(const fs::path &filelocation);
- void init(const std::string &transform);
- void initXmlProcessor(const std::string &transform);
- void indexDocument(xmlDocPtr document, const std::string &docURL, const std::string &title);
- int intern(const std::string &name);
- void openDocument(const std::string &name) throw( HelpProcessingException );
- void closeDocument(const std::string &name) throw( HelpProcessingException );
- void close();
-};
-
-void XmlIndexBuilder::close()
-{
- fs::path fullname = _indexAdapter._index->indexFile("LINKNAMES");
- std::fstream _linkFile(fullname.native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary);
-
-#ifdef EMULATEORIGINAL
- static const unsigned char vectorheader[] =
- {
- 0xAC, 0xED, 0x00, 0x05, 0x75, 0x72, 0x00, 0x13,
- 0x5B, 0x4C, 0x6A, 0x61, 0x76, 0x61, 0x2E, 0x6C,
- 0x61, 0x6E, 0x67, 0x2E, 0x53, 0x74, 0x72, 0x69,
- 0x6E, 0x67, 0x3B, 0xAD, 0xD2, 0x56, 0xE7, 0xE9,
- 0x1D, 0x7B, 0x47, 0x02, 0x00, 0x00, 0x78, 0x70
- };
-
- _linkFile.write((const char*)(&vectorheader[0]), sizeof(vectorheader));
- writeInt(_linkFile, _indexAdapter._linknames.size());
- std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end();
- for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin();
- aIter != aEnd; ++aIter)
- {
- HCDBG(std::cerr << "linkname is " << *aIter << std::endl);
- _linkFile << 't';
- writeShort(_linkFile, aIter->size());
- _linkFile << *aIter;
- }
-#else
- std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end();
- for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin();
- aIter != aEnd; ++aIter)
- {
- _linkFile << *aIter << '\n';
- }
-#endif
-#if 0
-
- // output link codes
- /*
- Enumeration keys = _linknames.elements();
- while (keys.hasMoreElements())
- System.out.println((String)keys.nextElement());
- */
-#endif
- _indexAdapter._index->close();
- std::cout << "done" << std::endl;
-}
-
-int XmlIndexBuilder::intern(const std::string &name)
-{
- return _indexAdapter.intern(name);
-}
-
-void XmlIndexBuilder::openDocument(const std::string &name) throw( HelpProcessingException )
-{
- if (_currentDocID != 0)
- {
- std::stringstream aStrStream;
- aStrStream << "document already open" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- _currentDocID = intern( PrefixTranslator::translatePrefix(name) );
- reset(); // reset context gathering state
-}
-
-int BitBuffer::InitSize = 256;
-int BitBuffer::NBits = 32;
-int BitBuffer::BitsInByte = 8;
-int BitBuffer::BytesInInt = 4;
-
-void Compressor::encode(const IntegerArray &pos, int k)
-{
- HCDBG(std::cerr << "1:start this encode of " << k << "size of "
- << pos.size() << std::endl);
- unsigned int n1 = 0;
- unsigned int power = 1 << k;
- for (size_t i = 0; i < pos.size(); i++)
- {
- HCDBG(std::cerr << "1: loop " << i << std::endl);
- unsigned int n2 = pos[i] >> k;
- int rem = pos[i] % power;
- HCDBG(std::cerr << "1: n1, n2 : " << n1 << "," << n2 << std::endl);
- if (n2 != n1)
- {
- unsigned int min = n1;
- unsigned int a = n1;
- int lev = 0, power2 = 1;
- if (n2 > n1)
- for (size_t max = n1; max < n2; a >>= 1, power2 <<= 1, lev++)
- if ((a & 1) != 0)
- min -= power2;
- else
- max += power2;
- else
- for ( ; min > n2; a >>= 1, power2 <<= 1, lev++)
- if ((a & 1) != 0)
- min -= power2;
- // lev 0s, 1, lev bits of (n2 - min) plus following value
- // no 'V' symbol needed here
- if (lev*2 + 1 + k <= NBits)
- _buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k);
- else
- {
- if (lev*2 + 1 <= NBits)
- _buffer.append(1 << lev | (n2 - min), lev*2 + 1);
- else
- {
- _buffer.append(0, lev);
- _buffer.append(1 << lev | (n2 - min), lev + 1);
- }
- _buffer.append(rem, k);
- }
- n1 = n2;
- }
- else
- _buffer.append(rem | power, k + 1); // 'V' + value
- }
- _buffer.append(2 | (n1 & 1), 3); // marking end
- _buffer.close();
- HCDBG(std::cerr << "1:end this encode of " << k << std::endl);
-}
-
-void Compressor::encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2)
-{
- HCDBG(std::cerr << "2:start this encode of " << k << "size of "
- << pos.size() << std::endl);
- int power = 1 << k, n1 = 0;
- for (size_t i = 0; i < pos.size(); i++)
- {
- HCDBG(std::cerr << "2: loop " << i << std::endl);
- int n2 = pos[i] >> k;
- int rem = pos[i] % power;
- HCDBG(std::cerr << "2: n1, n2 : " << n1 << "," << n2 << std::endl);
- if (n2 != n1)
- {
- int min = n1, a = n1;
- int lev = 0, power2 = 1;
- if (n2 > n1)
- for (int max = n1; max < n2; a >>= 1, power2 <<= 1, lev++)
- if ((a & 1) != 0)
- min -= power2;
- else
- max += power2;
- else
- for ( ; min > n2; a >>= 1, power2 <<= 1, lev++)
- if ((a & 1) != 0)
- min -= power2;
- // lev 0s, 1, lev bits of (n2 - min) plus following value
- if (lev*2 + 1 + k <= NBits)
- _buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k);
- else
- {
- if (lev*2 + 1 <= NBits)
- _buffer.append(1 << lev | (n2 - min), lev*2 + 1);
- else
- {
- _buffer.append(0, lev);
- _buffer.append(1 << lev | (n2 - min), lev + 1);
- }
- _buffer.append(rem, k);
- }
- _buffer.append(len[i], k2);
- n1 = n2;
- }
- else
- _buffer.append((rem|power)<<k2 | len[i], k+k2+1); // 'V' + v1,v2
- }
- _buffer.append(2 | (n1 & 1), 3); // marking end
- _buffer.close();
- HCDBG(std::cerr << "2:end this encode of " << k << std::endl);
-}
-
-// k: starting value for minimization
-int Compressor::minimize(const IntegerArray &array, int startK)
-{
- BitBuffer saved;
- int minK = startK;
- _buffer.clear();
- encode(array, startK);
- int min = _buffer.bitCount(); // init w/ first value
- saved.setFrom(_buffer);
-
- _buffer.clear();
- encode(array, startK + 1);
-
- if (_buffer.bitCount() < min)
- {
- int k = startK + 1;
- do
- {
- saved.setFrom(_buffer);
- min = _buffer.bitCount();
- minK = k;
- _buffer.clear();
- encode(array, ++k);
- }
- while (_buffer.bitCount() < min);
- }
- else // try smaller values through 1
- {
- for (int k = startK - 1; k > 0; k--)
- {
- _buffer.clear();
- encode(array, k);
- if (_buffer.bitCount() < min)
- {
- saved.setFrom(_buffer);
- min = _buffer.bitCount();
- minK = k;
- }
- else
- break;
- }
- }
-
- _buffer.setFrom(saved);
- return minK;
-}
-
-int Compressor::compressAscending(const IntegerArray &array)
-{
- IntegerArray differences(array.size());
- toDifferences(array, differences);
- return minimize(differences, BeginK);
-}
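compressAscending ties the pieces together: delta-encode the ascending list, then pick the parameter k that minimizes the encoded size, which is what minimize() does by re-encoding with neighbouring k values. The cost model is easiest to see with a plain Rice-style code, where a value v costs (v >> k) + 1 + k bits; this is a simplified analogue of the group code above, not its exact bit format:

#include <climits>
#include <vector>

// Bits needed to Rice-code one value with parameter k:
// unary quotient + stop bit + k remainder bits.
long riceBits(int v, int k)
{
    return (long)(v >> k) + 1 + k;
}

// Pick the k in [0, 16) that minimizes the total coded size of the gaps.
int chooseRiceParameter(const std::vector<int> &gaps)
{
    int bestK = 0;
    long bestBits = LONG_MAX;
    for (int k = 0; k < 16; ++k)
    {
        long total = 0;
        for (size_t i = 0; i < gaps.size(); ++i)
            total += riceBits(gaps[i], k);
        if (total < bestBits)
        {
            bestBits = total;
            bestK = k;
        }
    }
    return bestK;
}

minimize() performs a local search around its starting k instead of this exhaustive scan, but the objective being minimized is the same.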
-
-int Compressor::NBits = 32;
-int Compressor::BeginK = 5;
-
-class DocumentCompressor
-{
-public:
- static int NConceptsInGroup;
- static int BitsInLabel;
- static int DefaultSize;
-private:
- int _nGroups;
- int _nExtents;
- unsigned int _freeComp;
- int _kk;
- Compressor *_currentCompressor;
- std::vector<Compressor> _compressors;
- Compressor _kCompr;
- Compressor _lCompr;
- Compressor _mCompr;
- Compressor _posCompressor;
- IntegerArray _kTable; // k's for the series
- IntegerArray _lTable; // lengths of the C/P groups
- IntegerArray _maxConcepts; // maximal concepts in CP
- IntegerArray _concepts;
- IntegerArray _documents;
- IntegerArray _microIndexOffsets;
- IntegerArray _titles;
- // _contextsOffsets for use in XML indexing
- IntegerArray _contextsOffsets;
- IntegerArray _positions;
- IntegerArray _labels;
-
-public:
- DocumentCompressor() : _currentCompressor(0), _compressors(DefaultSize) {}
- void writeOutMicroIndex(std::fstream &output,
- std::vector<ConceptLocation> &locations,
- std::vector<ConceptLocation> &extents)
- {
- HCDBG(std::cerr << "writeOutMicroIndex start" << std::endl);
- encode(locations, NConceptsInGroup);
- HCDBG(std::cerr << "writeOutMicroIndex end encode" << std::endl);
- if (!extents.empty())
- encodeExtents(extents);
- HCDBG(std::cerr << "writeOutMicroIndex finalize" << std::endl);
- finalizeEncoding();
- HCDBG(std::cerr << "writeOutMicroIndex write" << std::endl);
- writeOut(output);
- HCDBG(std::cerr << "writeOutMicroIndex end" << std::endl);
- }
-private:
- void encode(std::vector<ConceptLocation> &locations, int nConcepts)
- {
- int initK = 4;
- // first sort by concept only
-#ifdef CMCDEBUG
- for (size_t i = 0; i < locations.size(); ++i)
- fprintf(stderr, "unsorted is %d\n", locations[i].getConcept());
-#endif
- HCDBG(std::cerr << "start sort" << std::endl);
- ConceptLocation::sortByConcept(locations, 0, locations.size());
- HCDBG(std::cerr << "end sort" << std::endl);
-#ifdef CMCDEBUG
- for (size_t i = 0; i < locations.size(); ++i)
- fprintf(stderr, "sorted is %d\n", locations[i].getConcept());
-#endif
-
- // using the fact that concepts are already sorted
- // count of groups of 'nConcepts'
- // go for differences directly
-
- // clear the state
- _nGroups = 0;
- _nExtents = 0;
- _kTable.clear();
- _lTable.clear();
- _concepts.clear();
- _maxConcepts.clear();
- _kCompr.clear();
- _lCompr.clear();
- _mCompr.clear();
- for (size_t i = 0; i < _compressors.size(); i++)
- _compressors[i].clear();
- _freeComp = 0;
- _currentCompressor = NULL;
- // end of resetting state
-
- int conceptCounter = 0;
- int fromIndex = 0;
- int prevMax = 0;
- int last = locations[0].getConcept(); // init w/ first ID
- nextCompressor();
- _concepts.push_back(last);
- for (size_t i = 0;;)
- {
- for (; i < locations.size() && locations[i].getConcept() == last; i++)
- locations[i].setConcept(conceptCounter);
- if (i == locations.size())
- {
- if (!_concepts.empty())
- {
- ++_nGroups;
- _kTable.push_back(_currentCompressor->minimize(_concepts, initK));
- }
- encodePositions(locations, fromIndex, i, BitsInLabel);
- break;
- }
- else
- { // new concept (group?)
- if (++conceptCounter == nConcepts)
- {
- ++_nGroups;
- // we are looking at the beginning of a new group
- // last is maximal for the group just finished
- // it won't be stored in concepts array but maxConcepts
- _concepts.pop_back();
- HCDBG(fprintf(stderr, "_maxConcepts %d %d -> %d\n", last, prevMax, last - prevMax));
- _maxConcepts.push_back(last - prevMax);
- prevMax = last;
- _kTable.push_back(_currentCompressor->minimize(_concepts, initK));
-
-#ifdef CMCDEBUG
- for(size_t p = 0; p < locations.size(); ++p)
- std::cerr << "microindex2 this testing is " << locations[p].getBegin() <<
- locations[p].getEnd() << " : " << locations[p].getConcept() << std::endl;
-#endif
-
- HCDBG(std::cerr << "two encodePositions " << fromIndex << " " << i << std::endl);
- encodePositions(locations, fromIndex, i, BitsInLabel);
- fromIndex = i;
- nextCompressor();
- _concepts.clear();
- conceptCounter = 0;
- }
- _concepts.push_back(locations[i].getConcept() - last);
- last = locations[i].getConcept();
- }
- }
- }
-
- void encodePositions(std::vector<ConceptLocation> &locations, int from, int to, int cK)
- {
- int initK = 3;
- int lastPos, k;
-        // sort in place by positions only
-#ifdef CMCDEBUG
- for (int i = from; i < to; ++i)
- fprintf(stderr, "unsorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd());
-#endif
- ConceptLocation::sortByPosition(locations, from, to);
-#ifdef CMCDEBUG
- for (int i = from; i < to; ++i)
- fprintf(stderr, "sorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd());
-#endif
- _positions.clear();
- _labels.clear();
- _positions.push_back(lastPos = locations[from].getBegin());
- _labels.push_back(locations[from].getConcept()); // now: a label
- // skip duplicates
- for (int i = from, j = from + 1; j < to; j++)
- {
- if (locations[i].equals(locations[j]) == false)
- {
- i = j;
- HCDBG(std::cerr << "i is " << i << "locations begin is "
- << locations[i].getBegin() << "last pos is " << lastPos << std::endl);
- _positions.push_back(locations[i].getBegin() - lastPos);
- lastPos = locations[i].getBegin();
- _labels.push_back(locations[i].getConcept()); // now: a label
- }
- }
- // first find k by minimizing just positions w/o labels
- _kTable.push_back(k = _posCompressor.minimize(_positions, initK));
- _posCompressor.clear();
- HCDBG(std::cerr << "start encodePositions" << std::endl);
- _posCompressor.encode(_positions, _labels, k, cK);
- HCDBG(std::cerr << "end encodePositions" << std::endl);
- _currentCompressor->concatenate(_posCompressor);
- }
-
- void encodeExtents(std::vector<ConceptLocation> &extents)
- {
- // side effects:
- // 'k3' added to _kTable
- // a number of compressors populated: header + lengths' lists
- int initK = 4;
- int c = 0;
- IntegerArray concepts; //difference
- IntegerArray lengths;
- IntegerArray kTable;
- IntegerArray lTable;
- // reserve a compressor for concatenated tables
- nextCompressor();
- Compressor *extentsHeader = _currentCompressor;
- std::vector<ConceptLocation>::const_iterator aEnd = extents.end();
- for (std::vector<ConceptLocation>::const_iterator aIter = extents.begin();
- aIter != aEnd; ++aIter)
- {
- if (aIter->getConcept() != c)
- {
- if (c != 0)
- {
- _nExtents++;
- nextCompressor();
- kTable.push_back(_currentCompressor->minimize(lengths, initK));
- lTable.push_back(_currentCompressor->byteCount());
- }
- concepts.push_back(aIter->getConcept() - c);
- c = aIter->getConcept();
- lengths.clear();
- lengths.push_back(aIter->getLength());
- }
- else
- lengths.push_back(aIter->getLength());
- }
- // last table of lengths
- nextCompressor();
- kTable.push_back(_currentCompressor->minimize(lengths, initK));
- lTable.push_back(_currentCompressor->byteCount());
- Compressor compressor1;
- kTable.push_back(compressor1.minimize(lTable, initK));
- Compressor compressor2;
- kTable.push_back(compressor2.minimize(concepts, initK));
- _kTable.push_back(extentsHeader->minimize(kTable, initK)); // k3
- extentsHeader->concatenate(compressor1);
- extentsHeader->concatenate(compressor2);
- }
-
- void finalizeEncoding()
- {
- if (_nGroups > 1)
- {
- // if extents follow C/P groups we need the length of the last group
- int limit = _nExtents > 0 ? _freeComp : _freeComp - 1;
- for (int j = 0; j < limit; j++) // length of last not saved
- _lTable.push_back(_compressors[j].byteCount());
-
- _kTable.push_back(_mCompr.minimize(_maxConcepts, 3));
- _kTable.push_back(_lCompr.minimize(_lTable, 3));
- _kk = _kCompr.minimize(_kTable, 3);
- _kCompr.concatenate(_lCompr);
- _kCompr.concatenate(_mCompr);
- }
- else if (_nGroups == 1 && _nExtents > 0)
- {
- // length of the single C/P group packed with k-s
- _kTable.push_back(_compressors[0].byteCount());
- _kk = _kCompr.minimize(_kTable, 3);
- }
- }
-
- void writeOut(std::fstream &out)
- {
- if (_nExtents == 0)
- {
- if (_nGroups > 1)
- {
- unsigned char byte = static_cast<unsigned char>((0x80 | _kk));
- out.write( (const char*)&byte, 1 );
- HCDBG(std::cerr << "writeOut of " << int(byte) << std::endl);
- _kCompr.write(out); // concatenated k,l,m
- for (size_t j = 0; j < _freeComp; j++)
- _compressors[j].write(out);
- }
- else // single group, no extents; code: 00
- {
- unsigned char k1 = (unsigned char)(_kTable[0]);
- unsigned char k2 = (unsigned char)(_kTable[1]);
- out.write( (const char*)&k1, 1 );
- out.write( (const char*)&k2, 1 );
- _compressors[0].write(out); // C/P
- }
- }
- else
- { // extents
- unsigned char byte = static_cast<unsigned char>(
- (_nGroups > 1 ? 0xC0 : 0x40) | _kk);
- out.write( (const char*)&byte, 1 );
- _kCompr.write(out);
- for (size_t j = 0; j < _freeComp; j++)
- _compressors[j].write(out);
- }
- }
-
- Compressor* nextCompressor()
- {
- if (_freeComp == _compressors.size())
- _compressors.push_back(Compressor());
- return _currentCompressor = &_compressors[_freeComp++];
- }
-
- int byteCount()
- {
- if (_nGroups == 1 && _nExtents == 0)
- return 2 + _compressors[0].byteCount();
- else
- {
- int result = 1; // initial kk
- result += _kCompr.byteCount();
- for (size_t j = 0; j < _freeComp; j++)
- result += _compressors[j].byteCount();
- return result;
- }
- }
-};
-
-int DocumentCompressor::NConceptsInGroup = 16;
-int DocumentCompressor::BitsInLabel = 4;
-int DocumentCompressor::DefaultSize = 32;
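The encode() method above relies on gap encoding: concept IDs are sorted and only the difference to the previous ID is handed to the bit-level Compressor, which keeps the stored values small. The following is a minimal standalone sketch of that idea only; it does not use the real Compressor or ConceptLocation classes, and gapEncode is a hypothetical helper name.

    // Gap (difference) encoding sketch; illustration only, the real code
    // feeds the deltas into Compressor::minimize()/encode().
    #include <algorithm>
    #include <iostream>
    #include <vector>

    static std::vector<int> gapEncode(std::vector<int> ids) // hypothetical helper
    {
        std::sort(ids.begin(), ids.end());
        std::vector<int> gaps;
        int prev = 0;
        for (size_t i = 0; i < ids.size(); ++i)
        {
            gaps.push_back(ids[i] - prev); // small deltas instead of raw IDs
            prev = ids[i];
        }
        return gaps;
    }

    int main()
    {
        int raw[] = { 1021, 17, 1019, 240, 256 };
        std::vector<int> ids(raw, raw + 5);
        std::vector<int> gaps = gapEncode(ids); // yields 17 223 16 763 2
        for (size_t i = 0; i < gaps.size(); ++i)
            std::cout << gaps[i] << " ";
        std::cout << std::endl;
        return 0;
    }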
-
-DocumentCompressor& Index::getDocumentCompressor()
-{
- if (!_documentCompressor)
- _documentCompressor = new DocumentCompressor();
- return *_documentCompressor;
-}
-
-void Index::compress(int docID, int titleID,
- std::vector<ConceptLocation> &locations,
- std::vector<ConceptLocation> &extents)
-{
- std::fstream &positions = getPositionsFile();
-
- positions.seekg(0, std::ios::end);
- long currentEnd = positions.tellg();
- if (currentEnd < 0) currentEnd = 0;
- positions.clear();
- positions.seekg(currentEnd, std::ios::beg);
-
- _documents.push_back(docID);
- _microIndexOffsets.push_back(currentEnd);
- HCDBG(std::cerr << "_microIndexOffsets pushed back " << currentEnd << std::endl);
- HCDBG(std::cerr << "added title id of " << titleID << std::endl);
- _titles.push_back(titleID);
-
- getDocumentCompressor().writeOutMicroIndex(positions,
- locations, extents);
-}
-
-void Index::writeOutOffsets()
-{
- Compressor documents;
- int k1 = documents.minimize(_documents, 8);
- Compressor offsets;
- int k2 = offsets.compressAscending(_microIndexOffsets);
- Compressor titles;
- int k3 = titles.minimize(_titles, 8); // 8 is the starting k
- std::fstream &out = getOffsetsFile();
- out.seekp(0); // position at beginning
- out.clear();
- unsigned char byte;
- byte = static_cast<unsigned char>(k1);
- out.write( (const char*)&byte, 1 );
- HCDBG(fprintf(stderr, "a: offset dump of %x\n", byte));
- documents.write(out);
- byte = static_cast<unsigned char>(k2);
- out.write( (const char*)&byte, 1 );
- HCDBG(fprintf(stderr, "b: offset dump of %x\n", byte));
- offsets.write(out);
- byte = static_cast<unsigned char>(k3);
- out.write( (const char*)&byte, 1 );
- HCDBG(fprintf(stderr, "c: offset dump of %x\n", byte));
- titles.write(out);
-}
-
-Index::~Index()
-{
- delete _schema;
- delete _dictParams;
- delete _dict;
- delete _positionsFile;
- delete _offsetsFile;
- delete _documentCompressor;
-}
-
-void XmlIndex::compress(int docID, int titleID,
- std::vector<ConceptLocation> &locations,
- std::vector<ConceptLocation> &extents,
- int k, const Compressor &contextTables)
-{
- HCDBG(std::cerr << "start compress" << std::endl);
- HCDBG(std::cerr << "docID : " << docID << " titleID : " << titleID <<
- "locations size : " << locations.size() << "extents size : " << extents.size() << std::endl);
- Index::compress(docID, titleID, locations, extents);
- HCDBG(std::cerr << "end compress" << std::endl);
-
- std::fstream& contexts = getContextsFile();
-
- contexts.seekp(0, std::ios::end);
- long currentEnd = contexts.tellp();
- if (currentEnd < 0) currentEnd = 0;
- contexts.clear();
- contexts.seekp(currentEnd);
- writeByte(contexts, static_cast<unsigned char>(k));
- contextTables.write(contexts);
- _contextsOffsets.push_back(currentEnd);
-}
-
-void XmlIndexBuilder::closeDocument(const std::string &title) throw( HelpProcessingException )
-{
- if (_currentDocID == 0)
- {
- std::stringstream aStrStream;
- aStrStream << "no document open" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
- else if (!_indexAdapter._locations.empty())
- {
- IntegerArray kTable;
-
- Compressor compressor1;
- Compressor compressor2;
- Compressor compressor3;
- Compressor compressor4;
-
- kTable.push_back(compressor1.compressAscending(_indexAdapter._initialWords));
- kTable.push_back(compressor2.minimize(_indexAdapter._dests, 2));
- kTable.push_back(compressor3.minimize(_indexAdapter._links, 2));
- kTable.push_back(compressor4.minimize(_indexAdapter._seqNumbers, 2));
-
- Compressor compressor0;
- int k0 = compressor0.minimize(kTable, 4);
-
- compressor0.concatenate(compressor1);
- compressor0.concatenate(compressor2);
- compressor0.concatenate(compressor3);
- compressor0.concatenate(compressor4);
-
- std::vector<ConceptLocation> dummy;
- _indexAdapter._index->compress(_currentDocID, intern(title),
- _indexAdapter._locations, dummy, k0, compressor0);
- }
- else
- {
- // System.out.println("no indexable content");
- }
- _indexAdapter._locations.clear();
- _currentDocID = 0; // state: nothing open
-}
-
-void XmlIndexBuilder::indexDocument(xmlDocPtr doc, const std::string &docURL, const std::string &title)
-{
- HCDBG(std::cerr << "Indexing " << docURL << std::endl);
-
- xmlNodePtr root = xmlDocGetRootElement(doc);
-
- openDocument(docURL);
-
-// xmlDocDump(stdout, doc);
- xmlDocPtr res = xsltApplyStylesheet(_indexingTransform, doc, NULL);
-
- _indexAdapter.init();
-
- // start = System.currentTimeMillis();
- root = xmlDocGetRootElement(res);
- if (root)
- {
-// xmlDocDump(stdout, res);
- for (xmlNodePtr test = root; test; test = test->next)
- _indexAdapter.process(test, res);
- }
- xmlFreeDoc(res);
-
- // System.out.println((System.currentTimeMillis()-start)+" transform");
- // start = System.currentTimeMillis();
- _indexAdapter.finish();
- // System.out.println((System.currentTimeMillis()-start)+" finish");
- // start = System.currentTimeMillis();
- closeDocument(title);
- // System.out.println((System.currentTimeMillis()-start)+" close");
-}
-
-XmlIndexBuilder::~XmlIndexBuilder()
-{
- delete _indexAdapter._index;
-}
-
-void XmlIndexBuilder::setTransformLocation(const fs::path &filelocation)
-{
- _transformLocation = filelocation;
-}
-
-xsltStylesheetPtr XmlIndexBuilder::getTransform(const std::string &stylesheetName)
-{
- fs::path stylesheet = _transformLocation / (stylesheetName + ".xsl");
- return xsltParseStylesheetFile((const xmlChar *)stylesheet.native_file_string().c_str());
-}
-
-void XmlIndexBuilder::initXmlProcessor(const std::string &transform)
-{
- _indexingTransform = getTransform(transform);
-}
-
-void XmlIndexBuilder::init(const std::string &transform)
-{
- _indexAdapter._index->init();
-#ifdef EMULATEORIGINAL
- //some kind of bug in the original AFAICS
- _indexAdapter._stoplist.push_back("andnull");
-#endif
- reset();
-
- // initialize vector and hashtable
- const std::vector<std::string> &linkNames = _indexAdapter._index->getLinkNames();
- std::vector<std::string>::const_iterator aEnd = linkNames.end();
- for (std::vector<std::string>::const_iterator aIter = linkNames.begin();
- aIter != aEnd; ++aIter)
- {
- _indexAdapter.getLinkCode(*aIter);
- }
-
- initXmlProcessor(transform);
-}
-
-void XmlIndexBuilder::reset()
-{
- _indexAdapter._availContextNumber = 0;
- _indexAdapter._lastWordNumber = 0;
- _indexAdapter._locations.clear();
- _indexAdapter._anyLocationsStored = false;
- // all the contexts' tables
- _indexAdapter._initialWords.clear();
- _indexAdapter._dests.clear();
- _indexAdapter._links.clear();
- _indexAdapter._seqNumbers.clear();
-}
-
-XmlIndexBuilder::XmlIndexBuilder(const fs::path &indexDir)
- : _indexingTransform(0), _currentDocID(0)
-{
- HCDBG(std::cerr << "indexDir is " << indexDir.native_directory_string() << std::endl);
- _indexAdapter._index = new XmlIndex(indexDir, true);
-}
-
-void XmlIndexBuilder::clearIndex()
-{
- _indexAdapter._index->clear();
-}
-
class HelpLinker
{
public:
- static void main(std::vector<std::string> &args, std::string* pExtensionPath = NULL )
+ void main(std::vector<std::string> &args, std::string* pExtensionPath = NULL )
throw( HelpProcessingException );
- static bool isExtensionMode( void )
- {return bExtensionMode; }
+
+ HelpLinker()
+ : init(true)
+ , m_pIndexerPreProcessor(NULL)
+ {}
+ ~HelpLinker()
+ { delete m_pIndexerPreProcessor; }
+
private:
- HelpLinker() : init(true), xmlIndexBuilder(NULL) {}
- ~HelpLinker() { delete xmlIndexBuilder; }
- JarOutputStream jarOutputStream;
- static int locCount, totCount;
- static Stringtable additionalFiles;
- static HashSet helpFiles;
- static fs::path sourceRoot;
- static fs::path embeddStylesheet;
- static fs::path indexStylesheet;
- static fs::path outputFile;
- static std::string module;
- static std::string lang;
- static std::string hid;
- static std::string extensionPath;
- static bool bExtensionMode;
+ int locCount, totCount;
+ Stringtable additionalFiles;
+ HashSet helpFiles;
+ fs::path sourceRoot;
+ fs::path embeddStylesheet;
+ fs::path idxCaptionStylesheet;
+ fs::path idxContentStylesheet;
+ fs::path zipdir;
+ fs::path outputFile;
+ std::string module;
+ std::string lang;
+ std::string hid;
+ std::string extensionPath;
+ bool bExtensionMode;
fs::path indexDirName;
Stringtable hidlistTranslation;
fs::path indexDirParentName;
bool init;
- XmlIndexBuilder* xmlIndexBuilder;
- void initXMLIndexBuilder();
- void createFileFromBytes(const std::string &fileName,
- const std::string &defaultXSL);
- void closeXMLIndexBuilder()
- {
- xmlIndexBuilder->close();
- }
+ IndexerPreProcessor* m_pIndexerPreProcessor;
+ void initIndexerPreProcessor();
void link() throw( HelpProcessingException );
void addBookmark( DB* dbBase, std::string thishid,
const std::string& fileB, const std::string& anchorB,
@@ -4712,11 +257,6 @@ private:
#endif
};
-bool isExtensionMode( void )
-{
- return HelpLinker::isExtensionMode();
-}
-
namespace URLEncoder
{
static std::string encode(const std::string &rIn)
@@ -4740,76 +280,6 @@ namespace URLEncoder
}
}
-JarOutputStream::JarOutputStream()
-{
- perlline << "use Archive::Zip qw(:ERROR_CODES); ";
- perlline << "my $zip = Archive::Zip->new(); ";
-}
-
-std::string replaceAll(std::string result,
- const std::string &search, const std::string &replace)
-{
- std::string::size_type pos = 0;
- while(1)
- {
- pos = result.find(search, pos);
- if (pos == std::string::npos) break;
- result.replace(pos, search.size(), replace);
- pos += replace.size();
- }
- return result;
-}
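replaceAll() is used below to normalize Windows path separators before a path is embedded in the generated Perl script. A small usage sketch, assuming the definition above is compiled in; the sample path is made up.

    #include <iostream>
    #include <string>

    std::string replaceAll(std::string result,
        const std::string &search, const std::string &replace);

    int main()
    {
        std::string path = "C:\\help\\swriter.jar";             // illustrative path
        std::cout << replaceAll(path, "\\", "/") << std::endl;  // C:/help/swriter.jar
        return 0;
    }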
-
-void JarOutputStream::addFile(const std::string &fileName, const std::string &name)
-{
- perlline << "$zip->addFile(\"" << replaceAll(fileName, "\\", "/") << "\", \"" << name << "\"); ";
-}
-
-void JarOutputStream::addTree(const std::string &tree, const std::string &name)
-{
- perlline << "$zip->addTree(\"" << replaceAll(tree, "\\", "/") << "\", \"" << name << "\"); ";
-}
-
-void JarOutputStream::dontCompress(const std::string &key)
-{
- perlline << "my $member = $zip->memberNamed(\"" << key << "\"); ";
- perlline << "if ($member) { $member->desiredCompressionMethod( COMPRESSION_STORED ); } ";
-}
-
-void JarOutputStream::commit()
-{
- perlline << "print $zip->writeToFileNamed(\"" << replaceAll(getname().native_file_string(), "\\", "/") << "\").\"\\n\"; ";
-
- fs::path tmp = getname();
- tmp.append(".perl");
- std::string perlfile = replaceAll( tmp.native_file_string(), "\\", "/");
- std::ofstream fos(perlfile.c_str());
- fos << perlline.str();
- fos.close();
- std::string myperl("perl");
- std::string is4nt;
- char* use_shell = getenv( "USE_SHELL" );
- if ( use_shell )
- is4nt = use_shell;
- if( !is4nt.empty() && is4nt == "4nt" )
- {
- // in SO windows environment perl isn't in the path and
- // needs to be fetched from the environment. this doesn't
- // work in a cygwin shell as "/usr/bin/perl" will fail in a
- // native shell (see system call).
- myperl = getenv( "PERL" );
- }
- std::string commandline;
- commandline = myperl + " " + perlfile;
- HCDBG(std::cerr << "command line 3 is" << commandline << std::endl);
- // on windows, calling perl (either cygwin or native) from a native
- // shell the only chance to survive is using "c:/foo" notation
- if ( system(commandline.c_str()) )
- fprintf (stderr, "ERROR: calling generated perl script failed!\n");
-
- fs::remove(tmp);
-}
-
void HelpLinker::addBookmark( DB* dbBase, std::string thishid,
const std::string& fileB, const std::string& anchorB,
const std::string& jarfileB, const std::string& titleB)
@@ -4863,104 +333,14 @@ void HelpLinker::addBookmark( DB* dbBase, std::string thishid,
dbBase->put(dbBase, NULL, &key, &data, 0);
}
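addBookmark() above ends by storing the assembled record in the Berkeley DB file through the C API. A minimal sketch of that key/data put pattern, with hypothetical key and record strings and no error handling:

    #include <cstring>
    #include <string>
    #include <db.h>

    // Sketch only: dbBase is assumed to be an already opened DB handle.
    static int storeBookmark(DB* dbBase, const std::string& hid,
                             const std::string& record)
    {
        DBT key, data;
        memset(&key, 0, sizeof(DBT));
        memset(&data, 0, sizeof(DBT));
        key.data  = const_cast<char*>(hid.c_str());
        key.size  = static_cast<u_int32_t>(hid.length());
        data.data = const_cast<char*>(record.c_str());
        data.size = static_cast<u_int32_t>(record.length());
        return dbBase->put(dbBase, NULL, &key, &data, 0); // 0 = default flags
    }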
-void HelpLinker::createFileFromBytes(const std::string &fileName,
- const std::string &defaultXSL)
-{
- std::ofstream fos((indexDirParentName / fileName).native_file_string().c_str());
- fos << defaultXSL;
-}
-
-void HelpLinker::initXMLIndexBuilder()
+void HelpLinker::initIndexerPreProcessor()
{
+ if( m_pIndexerPreProcessor )
+ delete m_pIndexerPreProcessor;
std::string mod = module;
std::transform (mod.begin(), mod.end(), mod.begin(), tolower);
- indexDirName = indexDirParentName / (mod + ".idx");
- fs::create_directory(indexDirName);
-
- if (xmlIndexBuilder) delete xmlIndexBuilder;
- xmlIndexBuilder = new XmlIndexBuilder(indexDirName);
-
- std::string defaultXSL =
- "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
- "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n"
- "\t<xsl:template match=\"*|/\"/>\n"
- "</xsl:stylesheet>";
- createFileFromBytes("default.xsl", defaultXSL);
- xmlIndexBuilder->clearIndex(); // Build index from scratch
- xmlIndexBuilder->setTransformLocation(indexDirParentName);
-}
-
-namespace
-{
- fs::path gettmppath()
- {
- fs::path ret;
- osl::File::createTempFile(0, 0, &ret.data);
- fs::remove(ret);
- return ret;
- }
-}
-
-extern "C" void function_orig_pointer(xmlXPathParserContextPtr ctxt, int nargs)
-{
- if (nargs > 1)
- {
- // TODO: Change when used for extensions, no exception possible here
- std::cerr << "function_orig_pointer, too many args" << std::endl;
- exit(-1);
- }
-
- xmlNodePtr cur = NULL;
- if (nargs == 0)
- cur = ctxt->context->node;
- else if (nargs == 1)
- {
- xmlXPathObjectPtr obj = valuePop(ctxt);
- xmlNodeSetPtr nodelist = obj->nodesetval;
-
- if ((nodelist == NULL) || (nodelist->nodeNr <= 0))
- {
- // TODO: Change when used for extensions, no exception possible here
- std::cerr << "function_orig_pointer, bad nodeset" << std::endl;
- exit(-1);
- }
-
- cur = nodelist->nodeTab[0];
- for (int i = 1; i < nodelist->nodeNr; ++i)
- {
- int ret = xmlXPathCmpNodes(cur, nodelist->nodeTab[i]);
- if (ret == -1)
- cur = nodelist->nodeTab[i];
- }
-
- xmlXPathFreeObject(obj);
- }
-
- if (cur == NULL)
- {
- // TODO: Change when used for extensions, no exception possible here
- std::cerr << "function_orig_pointer, bad node" << std::endl;
- exit(-1);
- }
-
- static xmlChar str[20];
- sprintf((char *)str, "%ld", (sal_uIntPtr)(cur));
- valuePush(ctxt, xmlXPathNewString(str));
-}
-
-extern "C" void* cmc_module_init(xsltTransformContextPtr ctxt, const xmlChar* uri)
-{
- if (xsltRegisterExtFunction(ctxt, (const xmlChar*)"orig-pointer", uri, function_orig_pointer))
- {
- // TODO: Change when used for extensions, no exception possible here
- std::cerr << "failure to register function_orig_pointer" << std::endl;
- exit(-1);
- }
- return NULL;
-}
-
-extern "C" void cmc_module_term(xsltTransformContextPtr, const xmlChar*, void*)
-{
+ m_pIndexerPreProcessor = new IndexerPreProcessor( mod, indexDirParentName,
+ idxCaptionStylesheet, idxContentStylesheet );
}
/**
@@ -4976,7 +356,7 @@ void HelpLinker::link() throw( HelpProcessingException )
}
else
{
- indexDirParentName = gettmppath();
+ indexDirParentName = zipdir;
fs::create_directory(indexDirParentName);
}
@@ -4987,15 +367,6 @@ void HelpLinker::link() throw( HelpProcessingException )
std::string mod = module;
std::transform (mod.begin(), mod.end(), mod.begin(), tolower);
- // Determine the outputstream
- fs::path outputTmpFile;
- if( !bExtensionMode )
- {
- outputTmpFile = outputFile;
- outputTmpFile.append(".tmp");
- jarOutputStream.setname(outputTmpFile);
- }
-
// do the work here
// continue with the overall process: here all hzip files will be worked on
@@ -5042,7 +413,7 @@ void HelpLinker::link() throw( HelpProcessingException )
// lastly, initialize the indexBuilder
if ( (!bExtensionMode || bIndexForExtension) && !helpFiles.empty())
- initXMLIndexBuilder();
+ initIndexerPreProcessor();
if( !bExtensionMode )
{
@@ -5056,6 +427,7 @@ void HelpLinker::link() throw( HelpProcessingException )
{
std::cout << ".";
std::cout.flush();
+
// process one file
// streamTable contains the streams in the hzip file
StreamTable streamTable;
@@ -5074,6 +446,7 @@ void HelpLinker::link() throw( HelpProcessingException )
fs::path langsourceRoot(sourceRoot);
fs::path xhpFile;
+
if( bExtensionMode )
{
// langsourceRoot == sourceRoot for extensions
@@ -5086,6 +459,7 @@ void HelpLinker::link() throw( HelpProcessingException )
langsourceRoot.append('/' + lang + '/');
xhpFile = fs::path(xhpFileName, fs::native);
}
+
HelpCompiler hc( streamTable, xhpFile, langsourceRoot,
embeddStylesheet, module, lang, bExtensionMode );
@@ -5130,33 +504,6 @@ void HelpLinker::link() throw( HelpProcessingException )
// add once this as its own id.
addBookmark(dbBase, documentPath, fileB, std::string(), jarfileB, titleB);
- if ( (!bExtensionMode || bIndexForExtension) && init)
- {
- std::ifstream indexXSLFile(indexStylesheet.native_file_string().c_str());
- std::ostringstream baos;
- baos << indexXSLFile.rdbuf();
- std::string xsl = baos.str();
-
-        // I see that we later generate a map of generate-id values to nodes which we will use
- //to link the results of generate-id in the transformed document back to the nodes
- //in the original document, so let's cut out the middle-men and make an extension
- //which does exactly what we want, and give us a pointer to the original node
- xsl.replace(xsl.find("<xsl:stylesheet"), strlen("<xsl:stylesheet"),
- "<xsl:stylesheet extension-element-prefixes=\"CMC\" xmlns:CMC=\"http://www.cunninghack.org\"");
- xsl.replace(xsl.find("generate-id"), strlen("generate-id"), "CMC:orig-pointer");
-
- if (xsltRegisterExtModule((const xmlChar*)"http://www.cunninghack.org", cmc_module_init, cmc_module_term))
- {
- std::stringstream aStrStream;
- aStrStream << "fatal error on registering xslt module" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() );
- }
-
- createFileFromBytes("index.xsl", xsl);
- xmlIndexBuilder->init("index");
- init = false;
- }
-
// first the database *.db
// ByteArrayInputStream bais = null;
// ObjectInputStream ois = null;
@@ -5246,6 +593,7 @@ void HelpLinker::link() throw( HelpProcessingException )
}
}
+        // IndexerPreProcessor: pre-process the document for index generation
if( !bExtensionMode || bIndexForExtension )
{
// now the indexing
@@ -5256,14 +604,10 @@ void HelpLinker::link() throw( HelpProcessingException )
{
std::string temp = module;
std::transform (temp.begin(), temp.end(), temp.begin(), tolower);
- xmlIndexBuilder->indexDocument(document,
- std::string("vnd.sun.star.help://")
- + temp
- + "/"
- + URLEncoder::encode(documentPath),
- "");
+ m_pIndexerPreProcessor->processDocument(document, URLEncoder::encode(documentPath) );
}
}
+
} // while loop over hzip files ending
if( !bExtensionMode )
@@ -5284,46 +628,32 @@ void HelpLinker::link() throw( HelpProcessingException )
helpKeyword.dump(keyWord);
keyWord->close(keyWord, 0);
- if (!bExtensionMode && !helpFiles.empty())
- {
- closeXMLIndexBuilder();
- HCDBG(std::cerr << "dir is " << indexDirName.native_directory_string() << std::endl);
- jarOutputStream.addTree(indexDirName.native_file_string(), mod + ".idx");
- }
-
if( !bExtensionMode )
{
- jarOutputStream.addFile(helpTextFileName.native_file_string(), mod + ".ht");
- jarOutputStream.addFile(dbBaseFileName.native_file_string(), mod + ".db");
- jarOutputStream.addFile(keyWordFileName.native_file_string(), mod + ".key");
-
- /////////////////////////////////////////////////////////////////////////
- // last, all files which should be copied into the jar file
- /////////////////////////////////////////////////////////////////////////
-
+            // New index: copy the additional files into the index directory
Stringtable::iterator aEnd = additionalFiles.end();
for (Stringtable::iterator enumer = additionalFiles.begin(); enumer != aEnd;
++enumer)
{
- const std::string &additionalFileKey = enumer->first;
const std::string &additionalFileName = enumer->second;
- jarOutputStream.addFile(additionalFileName, additionalFileKey);
- }
+ const std::string &additionalFileKey = enumer->first;
- jarOutputStream.dontCompress(mod + ".jar");
- jarOutputStream.commit();
+ fs::path fsAdditionalFileName( additionalFileName, fs::native );
+ std::string aNativeStr = fsAdditionalFileName.native_file_string();
+ const char* pStr = aNativeStr.c_str();
+ std::cerr << pStr;
- HCDBG(std::cerr << "like to rename " << outputTmpFile.native_file_string() << " as " <<
- outputFile.native_file_string() << std::endl);
- fs::rename(outputTmpFile, outputFile);
- if (!fs::exists(outputFile))
- {
- std::stringstream aStrStream;
- aStrStream << "can't rename file '" << outputTmpFile.native_file_string() << "'" << std::endl;
- throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
+ fs::path fsTargetName( indexDirParentName / additionalFileKey );
+
+ fs::copy( fsAdditionalFileName, fsTargetName );
}
}
+#ifdef SOLARIS
+ if( !bExtensionMode )
+ _exit( 0 );
+#endif
+/*
/////////////////////////////////////////////////////////////////////////
/// remove temporary directory for index creation
/////////////////////////////////////////////////////////////////////////
@@ -5331,24 +661,10 @@ void HelpLinker::link() throw( HelpProcessingException )
if( !bExtensionMode )
fs::remove_all( indexDirParentName );
#endif
+*/
}
-int HelpLinker::locCount;
-int HelpLinker::totCount;
-Stringtable HelpLinker::additionalFiles;
-HashSet HelpLinker::helpFiles;
-fs::path HelpLinker::sourceRoot;
-fs::path HelpLinker::embeddStylesheet, HelpLinker::indexStylesheet;
-fs::path HelpLinker::outputFile;
-std::string HelpLinker::module;
-std::string HelpLinker::lang;
-std::string HelpLinker::hid;
-std::string HelpLinker::extensionPath;
-bool HelpLinker::bExtensionMode;
-
-int GnTmpFileCounter = 0;
-
void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPath)
throw( HelpProcessingException )
{
@@ -5406,17 +722,41 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat
embeddStylesheet = fs::path(args[i], fs::native);
}
- else if (args[i].compare("-idx") == 0)
+ else if (args[i].compare("-zipdir") == 0)
{
++i;
if (i >= args.size())
{
std::stringstream aStrStream;
- aStrStream << "indexstylesheet missing" << std::endl;
+                aStrStream << "zip directory missing" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
- indexStylesheet = fs::path(args[i], fs::native);
+ zipdir = fs::path(args[i], fs::native);
+ }
+ else if (args[i].compare("-idxcaption") == 0)
+ {
+ ++i;
+ if (i >= args.size())
+ {
+ std::stringstream aStrStream;
+ aStrStream << "idxcaption stylesheet missing" << std::endl;
+ throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
+ }
+
+ idxCaptionStylesheet = fs::path(args[i], fs::native);
+ }
+ else if (args[i].compare("-idxcontent") == 0)
+ {
+ ++i;
+ if (i >= args.size())
+ {
+ std::stringstream aStrStream;
+ aStrStream << "idxcontent stylesheet missing" << std::endl;
+ throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
+ }
+
+ idxContentStylesheet = fs::path(args[i], fs::native);
}
else if (args[i].compare("-o") == 0)
{
@@ -5494,10 +834,22 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat
++i;
}
- if (!bExtensionMode && indexStylesheet.empty())
+ if (!bExtensionMode && zipdir.empty())
+ {
+ std::stringstream aStrStream;
+ aStrStream << "no index dir given" << std::endl;
+ throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
+ }
+ if (!bExtensionMode && idxCaptionStylesheet.empty())
+ {
+ std::stringstream aStrStream;
+ aStrStream << "no index caption stylesheet given" << std::endl;
+ throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
+ }
+ if (!bExtensionMode && idxContentStylesheet.empty())
{
std::stringstream aStrStream;
- aStrStream << "no index file given" << std::endl;
+ aStrStream << "no index content stylesheet given" << std::endl;
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
if (!bExtensionMode && embeddStylesheet.empty())
@@ -5537,7 +889,7 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat
throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() );
}
- HelpLinker().link();
+ link();
}
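With the -idx stylesheet replaced by -zipdir, -idxcaption and -idxcontent, a caller now has to pass the temporary zip/index directory plus the two indexer stylesheets. Below is a hedged sketch of driving the linker programmatically; the module name and all paths are placeholders, and the remaining required options (source root, embedding stylesheet, xhp files) are elided.

    #include <string>
    #include <vector>

    // Assumes it is compiled into this file, after the HelpLinker definition.
    void runHelpLinkerExample()
    {
        std::vector<std::string> args;
        args.push_back("-mod");        args.push_back("swriter");
        args.push_back("-zipdir");     args.push_back("/tmp/swriter.idxtemp");
        args.push_back("-idxcaption"); args.push_back("/path/to/idxcaption.xsl");
        args.push_back("-idxcontent"); args.push_back("/path/to/idxcontent.xsl");
        args.push_back("-o");          args.push_back("/tmp/swriter.zip");
        // ... source root, embedding stylesheet and xhp files as before ...

        HelpLinker aLinker;
        aLinker.main(args); // throws HelpProcessingException on bad arguments
    }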
int main(int argc, char**argv)
@@ -5548,7 +900,9 @@ int main(int argc, char**argv)
args.push_back(std::string(argv[i]));
try
{
- HelpLinker::main(args);
+ HelpLinker* pHelpLinker = new HelpLinker();
+ pHelpLinker->main( args );
+ delete pHelpLinker;
}
catch( const HelpProcessingException& e )
{
@@ -5584,9 +938,9 @@ HelpProcessingErrorInfo& HelpProcessingErrorInfo::operator=( const struct HelpPr
{
m_eErrorClass = e.m_eErrorClass;
rtl::OString tmpErrorMsg( e.m_aErrorMsg.c_str() );
- m_aErrorMsg = rtl::OStringToOUString( tmpErrorMsg, osl_getThreadTextEncoding() );
+ m_aErrorMsg = rtl::OStringToOUString( tmpErrorMsg, fs::getThreadTextEncoding() );
rtl::OString tmpXMLParsingFile( e.m_aXMLParsingFile.c_str() );
- m_aXMLParsingFile = rtl::OStringToOUString( tmpXMLParsingFile, osl_getThreadTextEncoding() );
+ m_aXMLParsingFile = rtl::OStringToOUString( tmpXMLParsingFile, fs::getThreadTextEncoding() );
m_nXMLParsingLine = e.m_nXMLParsingLine;
return *this;
}
@@ -5607,14 +961,14 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp
const char** argv = new const char*[argc];
argv[0] = "";
argv[1] = "-mod";
- rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, osl_getThreadTextEncoding() );
+ rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, fs::getThreadTextEncoding() );
argv[2] = aOExtensionName.getStr();
for( sal_Int32 iXhp = 0 ; iXhp < nXhpFileCount ; ++iXhp )
{
rtl::OUString aXhpFile = pXhpFiles[iXhp];
- rtl::OString aOXhpFile = rtl::OUStringToOString( aXhpFile, osl_getThreadTextEncoding() );
+ rtl::OString aOXhpFile = rtl::OUStringToOString( aXhpFile, fs::getThreadTextEncoding() );
char* pArgStr = new char[aOXhpFile.getLength() + 1];
strcpy( pArgStr, aOXhpFile.getStr() );
argv[iXhp + 3] = pArgStr;
@@ -5628,7 +982,7 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp
delete argv[iXhp + 3];
delete[] argv;
- rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, osl_getThreadTextEncoding() );
+ rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, fs::getThreadTextEncoding() );
const char* pExtensionPath = aOExtensionLanguageRoot.getStr();
std::string aStdStrExtensionPath = pExtensionPath;
@@ -5636,7 +990,9 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp
xmlSetStructuredErrorFunc( NULL, (xmlStructuredErrorFunc)StructuredXMLErrorFunction );
try
{
- HelpLinker::main(args,&aStdStrExtensionPath);
+ HelpLinker* pHelpLinker = new HelpLinker();
+ pHelpLinker->main( args,&aStdStrExtensionPath );
+ delete pHelpLinker;
}
catch( const HelpProcessingException& e )
{