| author | Kurt Zenker <kz@openoffice.org> | 2008-06-24 15:18:42 +0000 |
| --- | --- | --- |
| committer | Kurt Zenker <kz@openoffice.org> | 2008-06-24 15:18:42 +0000 |
| commit | e82de9330ef0f972a4544f1841c3fee1049aac15 (patch) | |
| tree | dfd713b21ae78c1d93fb4f1c6b95a8af1546e96b /xmlhelp/source/com | |
| parent | d9c33e48174f32f2010c40bbb42081a689c2d319 (diff) | |
INTEGRATION: CWS ab52 (1.13.6); FILE MERGED
2008/06/18 10:35:42 ab 1.13.6.1: #i83625# Migration to Lucene
Diffstat (limited to 'xmlhelp/source/com')
| -rw-r--r-- | xmlhelp/source/com/sun/star/help/HelpLinker.cxx | 5038 |
1 file changed, 197 insertions, 4841 deletions
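The IndexerPreProcessor added in the diff below replaces the old in-process btree/dictionary index with a pre-processing step for the Lucene-based help search: each help document is run through a caption stylesheet and a content stylesheet, and the resulting plain text is written into per-module `caption/` and `content/` directories for the indexer to pick up. The sketch below reproduces only that transformation step with plain libxml2/libxslt, outside the HelpLinker's own `fs::path` and `rtl` string wrappers; the file names `idxcaption.xsl`, `somehelp.xhp`, and `caption.txt` are placeholders for illustration and do not come from the commit.

```cpp
// Minimal sketch of the caption extraction done by
// IndexerPreProcessor::processDocument, using plain libxml2/libxslt.
// File names are placeholders, not paths from the commit.
#include <cstdio>
#include <libxml/parser.h>
#include <libxslt/xslt.h>
#include <libxslt/transform.h>
#include <libxslt/xsltutils.h>

int main()
{
    // Parse the stylesheet that reduces a help document to its caption text.
    xsltStylesheetPtr captionSheet =
        xsltParseStylesheetFile((const xmlChar*)"idxcaption.xsl");
    // Parse one help document (the real HelpLinker already holds an xmlDocPtr
    // for the document being streamed).
    xmlDocPtr doc = xmlParseFile("somehelp.xhp");
    if (!captionSheet || !doc)
        return 1;

    // Apply the stylesheet; the result document's first child carries the
    // plain text that the Lucene indexer later consumes.
    xmlDocPtr res = xsltApplyStylesheet(captionSheet, doc, NULL);
    xmlNodePtr resNode = res ? res->xmlChildrenNode : NULL;
    if (resNode && resNode->content)
    {
        FILE* out = fopen("caption.txt", "w");
        if (out)
        {
            fprintf(out, "%s\n", resNode->content);
            fclose(out);
        }
    }

    // Tidy up, as ~IndexerPreProcessor does for its two stylesheets.
    if (res)
        xmlFreeDoc(res);
    xmlFreeDoc(doc);
    xsltFreeStylesheet(captionSheet);
    xsltCleanupGlobals();
    xmlCleanupParser();
    return 0;
}
```

The same pattern is applied twice in the new code, once with the caption stylesheet and once with the content stylesheet, writing into `<indexBaseDir>/caption/` and `<indexBaseDir>/content/` respectively.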
diff --git a/xmlhelp/source/com/sun/star/help/HelpLinker.cxx b/xmlhelp/source/com/sun/star/help/HelpLinker.cxx index ddd5efb4055f..d35ae39e2e89 100644 --- a/xmlhelp/source/com/sun/star/help/HelpLinker.cxx +++ b/xmlhelp/source/com/sun/star/help/HelpLinker.cxx @@ -7,7 +7,7 @@ * OpenOffice.org - a multi-platform office productivity suite * * $RCSfile: HelpLinker.cxx,v $ - * $Revision: 1.13 $ + * $Revision: 1.14 $ * * This file is part of OpenOffice.org. * @@ -35,9 +35,6 @@ #include <string.h> #include <limits.h> -#include <boost/shared_ptr.hpp> -#include <boost/tokenizer.hpp> - #include <libxslt/xslt.h> #include <libxslt/transform.h> #include <libxslt/xsltutils.h> @@ -57,21 +54,101 @@ #include <expat/xmlparse.h> #endif -class JarOutputStream +class IndexerPreProcessor { private: - fs::path filename; - std::ostringstream perlline; + std::string m_aModuleName; + fs::path m_fsIndexBaseDir; + fs::path m_fsCaptionFilesDirName; + fs::path m_fsContentFilesDirName; + + xsltStylesheetPtr m_xsltStylesheetPtrCaption; + xsltStylesheetPtr m_xsltStylesheetPtrContent; + public: - JarOutputStream(); - void setname(const fs::path &name) { filename = name; } - const fs::path& getname() const { return filename; } - void addFile(const std::string &name, const std::string &key); - void addTree(const std::string &dir, const std::string &key); - void dontCompress(const std::string &key); - void commit(); + IndexerPreProcessor( const std::string& aModuleName, const fs::path& fsIndexBaseDir, + const fs::path& idxCaptionStylesheet, const fs::path& idxContentStylesheet ); + ~IndexerPreProcessor(); + + void processDocument( xmlDocPtr doc, const std::string& EncodedDocPath ); }; +IndexerPreProcessor::IndexerPreProcessor + ( const std::string& aModuleName, const fs::path& fsIndexBaseDir, + const fs::path& idxCaptionStylesheet, const fs::path& idxContentStylesheet ) + : m_aModuleName( aModuleName ) + , m_fsIndexBaseDir( fsIndexBaseDir ) +{ + m_fsCaptionFilesDirName = fsIndexBaseDir / "caption"; + fs::create_directory( m_fsCaptionFilesDirName ); + + m_fsContentFilesDirName = fsIndexBaseDir / "content"; + fs::create_directory( m_fsContentFilesDirName ); + + m_xsltStylesheetPtrCaption = xsltParseStylesheetFile + ((const xmlChar *)idxCaptionStylesheet.native_file_string().c_str()); + m_xsltStylesheetPtrContent = xsltParseStylesheetFile + ((const xmlChar *)idxContentStylesheet.native_file_string().c_str()); +} + +IndexerPreProcessor::~IndexerPreProcessor() +{ + if( m_xsltStylesheetPtrCaption ) + xsltFreeStylesheet( m_xsltStylesheetPtrCaption ); + if( m_xsltStylesheetPtrContent ) + xsltFreeStylesheet( m_xsltStylesheetPtrContent ); +} + + +std::string getEncodedPath( const std::string& Path ) +{ + rtl::OString aOStr_Path( Path.c_str() ); + rtl::OUString aOUStr_Path( rtl::OStringToOUString + ( aOStr_Path, fs::getThreadTextEncoding() ) ); + rtl::OUString aPathURL; + osl::File::getFileURLFromSystemPath( aOUStr_Path, aPathURL ); + rtl::OString aOStr_PathURL( rtl::OUStringToOString + ( aPathURL, fs::getThreadTextEncoding() ) ); + std::string aStdStr_PathURL( aOStr_PathURL.getStr() ); + return aStdStr_PathURL; +} + +void IndexerPreProcessor::processDocument + ( xmlDocPtr doc, const std::string &EncodedDocPath ) +{ + std::string aStdStr_EncodedDocPathURL = getEncodedPath( EncodedDocPath ); + + xmlDocPtr resCaption = xsltApplyStylesheet( m_xsltStylesheetPtrCaption, doc, NULL ); + xmlNodePtr pResNodeCaption = resCaption->xmlChildrenNode; + if( pResNodeCaption ) + { + fs::path fsCaptionPureTextFile_docURL = m_fsCaptionFilesDirName / 
aStdStr_EncodedDocPathURL; + std::string aCaptionPureTextFileStr_docURL = fsCaptionPureTextFile_docURL.native_file_string(); + FILE* pFile_docURL = fopen( aCaptionPureTextFileStr_docURL.c_str(), "w" ); + if( pFile_docURL ) + { + fprintf( pFile_docURL, "%s\n", pResNodeCaption->content ); + fclose( pFile_docURL ); + } + } + xmlFreeDoc(resCaption); + + xmlDocPtr resContent = xsltApplyStylesheet( m_xsltStylesheetPtrContent, doc, NULL ); + xmlNodePtr pResNodeContent = resContent->xmlChildrenNode; + if( pResNodeContent ) + { + fs::path fsContentPureTextFile_docURL = m_fsContentFilesDirName / aStdStr_EncodedDocPathURL; + std::string aContentPureTextFileStr_docURL = fsContentPureTextFile_docURL.native_file_string(); + FILE* pFile_docURL = fopen( aContentPureTextFileStr_docURL.c_str(), "w" ); + if( pFile_docURL ) + { + fprintf( pFile_docURL, "%s\n", pResNodeContent->content ); + fclose( pFile_docURL ); + } + } + xmlFreeDoc(resContent); +} + struct Data { std::vector<std::string> _idList; @@ -128,4572 +205,40 @@ public: } }; -namespace PrefixTranslator -{ - std::string translatePrefix(const std::string &input) - { - if (input.find("vnd.sun.star.help://") == 0) - return std::string("#HLP#") + input.substr(strlen("vnd.sun.star.help://")); - else - return input; - } -} - -class IndexAccessor -{ - fs::path _dirName; -public: - IndexAccessor(const fs::path &dirName) : _dirName(dirName) {} - IndexAccessor(const IndexAccessor &another) { _dirName = another._dirName; } - fs::path indexFile(const std::string &name) const { return _dirName / name; } - std::ifstream* getLineInput(const std::string &name); - std::fstream* getOutputStream(const std::string &name); - std::vector<unsigned char> readByteArray(const std::string &fileName); - void clear(); - std::fstream *getRAF(const std::string &name, bool update) throw( HelpProcessingException ); - void createIfNeeded() {} -}; - -std::ifstream* IndexAccessor::getLineInput(const std::string &name) -{ - return new std::ifstream(indexFile(name).native_file_string().c_str()); -} - -std::fstream* IndexAccessor::getOutputStream(const std::string &name) -{ - return new std::fstream(indexFile(name).native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); -} - -std::vector<unsigned char> IndexAccessor::readByteArray(const std::string &fileName) -{ - std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); - std::vector<unsigned char> ret(1024*16); - int i=0; - while (in.good()) - { - int len = in.readsome((char *)&ret[i], 1024*16); - if (!len) - break; - i += len; - ret.resize(i+1024*16); - } - ret.resize(i); - return ret; -} - -std::fstream* IndexAccessor::getRAF(const std::string &name, bool update) - throw( HelpProcessingException ) -{ - std::fstream *_file = new std::fstream; - fs::path fullname = indexFile(name); - if (!update) - { - _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::binary); - } - else - { - _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); - if (!_file->is_open()) - { - HCDBG(std::cerr << "didn't exist" << std::endl); - _file->open(fullname.native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); - } - if (!_file->is_open()) - { - std::stringstream aStrStream; - aStrStream << "Cannot open " << name << std::endl; - throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); - } - } - return _file; -} - -void IndexAccessor::clear() -{ -#if 0 - File thisDir = 
indexFile("."); - File[] components = thisDir.listFiles(); - if (components != null) - for (int i = 0; i < components.length; i++) - components[i].delete(); -#endif -} - -typedef std::vector< std::string > VectorLines; - -class Schema : public IndexAccessor -{ -private: - static std::string PartName; - bool _update; - VectorLines _lines; -public: - Schema(const IndexAccessor &index, bool update); - std::ifstream* getSchemaLineInput() { return getLineInput(PartName); } - void read(); - Stringtable parameters(const std::string &name) const; - void update(const std::string &partName, const std::string ¶meters); - void save(); -}; - -std::string Schema::PartName = "SCHEMA"; - - -class startsWith -{ -public: - startsWith(const std::string &in) : str(in) {} - bool operator() ( const std::string &in ) const { return (in.find(str) == 0); } -private: - const std::string &str; -}; - -void Schema::update(const std::string &partName, const std::string &inparameters) -{ - VectorLines::iterator aEnd = std::remove_if(_lines.begin(), _lines.end(), startsWith(partName)); - if (aEnd != _lines.end()) _lines.erase(aEnd, _lines.end()); - _lines.push_back(partName + " " + inparameters); -} - -Stringtable Schema::parameters(const std::string &name) const -{ - Stringtable result; - VectorLines::const_iterator aEnd = _lines.end(); - for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) - { - if (aIter->find(name) == 0) - { - boost::char_separator<char> sep(" ="); - boost::tokenizer< boost::char_separator<char> > tokens(name, sep); - boost::tokenizer< boost::char_separator<char> >::const_iterator it = tokens.begin(); - ++it; // skip name - while(it != tokens.end()) - { - const std::string &part1 = *it; - ++it; - if (it == tokens.end()) - break; - const std::string &part2 = *it; - result[part1] = part2; - ++it; - } - break; - } - } - return result; -} - -Schema::Schema(const IndexAccessor &index, bool inupdate) : IndexAccessor(index), - _update(inupdate) -{ - read(); -} - -#ifdef UNX -#define MAX_LINE PATH_MAX -#else -#define MAX_LINE _MAX_PATH -#endif - -void Schema::read() -{ - std::ifstream* in = getSchemaLineInput(); - char line[MAX_LINE]; - // This needs to be replaced with our XML Parser - while (in->getline(line, MAX_LINE)) - _lines.push_back(line); - delete in; -} - -void Schema::save() -{ - if (_update) - { - std::fstream* out = getOutputStream(PartName); - *out << "JavaSearch 1.0\n"; - VectorLines::const_iterator aEnd = _lines.end(); - for (VectorLines::const_iterator aIter = _lines.begin(); aIter != aEnd; ++aIter) - *out << *aIter << '\n'; - delete out; - } -} - -class DBPartParameters -{ - Schema &_schema; - std::string _partName; - Stringtable _parameters; -protected: - bool parametersKnown() const; - void updateSchema(const std::string ¶meters) { _schema.update(_partName, parameters); } -public: - DBPartParameters(Schema &schema, const std::string &partName); - int integerParameter(const std::string &name); -}; - -DBPartParameters::DBPartParameters(Schema &schema, const std::string &partName) - : _schema(schema), _partName(partName) -{ - _parameters = schema.parameters(partName); -} - -bool DBPartParameters::parametersKnown() const -{ - return !_parameters.empty(); -} - -int DBPartParameters::integerParameter(const std::string &name) -{ - std::istringstream converter(_parameters[name]); - int ret; - converter >> ret; - return ret; -} - -class BlockManagerParameters : public DBPartParameters -{ -private: - fs::path _file; - int _blockSize; -protected: - int _root; -public: - 
BlockManagerParameters(Schema &schema, const std::string &partName); - bool readState(); - const fs::path& getFile() const { return _file; } - int getBlockSize() const { return _blockSize; } - void setBlockSize(int size) { _blockSize = size; } - int getRootPosition() const { return _root; } - void setRoot(int root) { _root = root; } - void updateSchema(const std::string ¶ms); -}; - -void BlockManagerParameters::updateSchema(const std::string ¶ms) -{ - std::ostringstream tmp; - tmp << "bs=" << _blockSize << " rt=" << _root << " fl=-1 " << params; - DBPartParameters::updateSchema(tmp.str()); -} - -BlockManagerParameters::BlockManagerParameters(Schema &schema, const std::string &partName) - : DBPartParameters(schema, partName), _root(0) -{ - _file = schema.indexFile(partName); - HCDBG(std::cerr << "file name set to " << _file.native_file_string()); - readState(); -} - -bool BlockManagerParameters::readState() -{ - if (parametersKnown()) - { - _blockSize = integerParameter("bs"); - _root = integerParameter("rt"); - return true; - } - else - return false; -} - -class BtreeDictParameters : public BlockManagerParameters -{ -private: - int _id1; -public: - BtreeDictParameters(Schema &schema, const std::string &partName); - int getFreeID() const { return _id1; } - void setFreeID(int id) { _id1 = id; } - void updateSchema(); -}; - -void BtreeDictParameters::updateSchema() -{ - std::ostringstream tmp; - tmp << "id1=" << _id1 << " id2=1"; - BlockManagerParameters::updateSchema(tmp.str()); -} - -BtreeDictParameters::BtreeDictParameters(Schema &schema, const std::string &partName) - : BlockManagerParameters(schema, partName) -{ -} - -int readInt(std::fstream &in) -{ - HCDBG(std::cerr << "want to read at " << in.tellg() << std::endl); - int ret = 0; - for (int i = 3; i >= 0; --i) - { - unsigned char byte; - in.read( (char*)&byte, 1 ); - ret |= (static_cast<unsigned int>(byte) << (i*8)); - HCDBG(fprintf(stderr, "inputting %x ret is now %x\n", byte, ret)); - } - return ret; -} - -void writeByte(std::fstream &out, unsigned char byte) -{ - out.write( (const char *)&byte, 1 ); -} - -void writeShort(std::fstream &out, int item) -{ - for (int i = 1; i >= 0; --i) - { - unsigned char byte = static_cast<unsigned char>((item >> (i*8))); - out.write( (const char*)&byte, 1 ); - } -} - -void writeInt(std::fstream &out, int item) -{ - HCDBG(std::cerr << "want to write at " << out.tellp() << std::endl); - for (int i = 3; i >= 0; --i) - { - unsigned char byte = static_cast<unsigned char>((item >> (i*8))); - HCDBG(fprintf(stderr, "outputting %x in is %x\n", byte, item)); - out.write( (const char*)&byte, 1 ); - } -} - -void readFully(std::fstream &in, std::vector<unsigned char> &_data) -{ - in.read((char*)(&_data[0]), _data.size()); -} - -/** - - Base class for (typically btree) blocks to hold either - byte vectors representing graph/tree edges, - or pairs (key, id) for dictionaries - - Each block has a header and a data section - - */ - -class Block -{ -public: - static int HEADERLEN; - // length of Block ID in bytes - static int IDLEN; - - // number of the block - // used for both referring to the block - // and addresssing the block in file - unsigned int _number; - bool _isLeaf; - // first available byte in data section - int _free; - std::vector<unsigned char> _data; - - Block(int blocksize) : _number(0), _isLeaf(true), _free(0) - { - _data.resize(blocksize - HEADERLEN); - } - - virtual ~Block() {} - - void setBlockNumber(int n) { _number = n; } - virtual void setFree(int free) { _free = free; } - // interpret 4 bytes 
at 'i' as an integer - int integerAt(int i) const - { - int result = ((((((_data[i]&0xFF)<<8) - |_data[i+1]&0xFF)<<8) - |_data[i+2]&0xFF)<<8) - |_data[i+3]&0xFF; - return result; - } - void setIntegerAt(int i, int value) - { - /* - for (int j = i + 3; j >= i; j--, value >>= 8) - _data[j] = (unsigned char)(value & 0xFF); - */ - _data[i++] = (unsigned char)((value >> 24) & 0xFF); - _data[i++] = (unsigned char)((value >> 16) & 0xFF); - _data[i++] = (unsigned char)((value >> 8) & 0xFF); - _data[i] = (unsigned char)(value & 0xFF); - } - void readIn(std::fstream &in) - { - _number = readInt(in); - int twoFields = readInt(in); - _isLeaf = (twoFields & 0x80000000) != 0; - HCDBG(std::cerr << "read leaf as " << _isLeaf << std::endl); - _free = twoFields & 0x7FFFFFFF; - readFully(in, _data); - } - void writeOut(std::fstream &out) const - { - writeInt(out, _number); - writeInt(out, _free | (_isLeaf ? 0x80000000 : 0)); - out.write((const char*)(&_data[0]), _data.size()); - } -}; - -int Block::HEADERLEN = 8; -// length of Block ID in bytes -int Block::IDLEN = 4; - -class BtreeDict; -class EntryProcessor; -typedef std::vector<int> IntegerArray; - -class DictBlock : public Block -{ -public: - DictBlock(); - int free() const { return _free + firstEntry(); } - int numberOfEntries() const { return integerAt(0); } - int nthPointer(int n) const { return integerAt(4*(n + 1)); } - int getChildIdx(int index) const; - int entryKeyLength(int i) const { return _data[i] & 0xFF; } - int entryCompression(int i) const { return _data[i + 1] & 0xFF; } - int entryID(int i) const { return integerAt(i + 2); } - int entryLength(int entry) const; - int entryKey(int entry) const; - int firstEntry() const { return 4; } - int nextEntry(int entry) const { return entry + entryLength(entry); } - void restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer); - std::string restoreKey(int entry, std::vector<unsigned char> &buffer); - std::string findID(int id) throw( HelpProcessingException ); - void setBlockNumbers(std::vector<int> &blocks) const; - void listBlock(); - void doMap(BtreeDict &owner, const EntryProcessor &processor); - void withPrefix(BtreeDict &owner, const std::string &prefix, - size_t prefLen, IntegerArray &result); -}; - -class BlockFactory; - -class BlockProcessor; - -class BlockDescriptor -{ -public: - Block *_block; - bool _modf; - BlockDescriptor(Block *block) : _block(block), _modf(false) {} -}; // end of BlockDescriptor - -class BlockManager -{ -private: - static int INCR; - std::fstream _file; - long _blockSize; - bool _update; - BlockFactory *_blockFactory; - std::vector<BlockDescriptor> _blockTab; -public: - BlockManager(const BlockManagerParameters *params, - bool update, BlockFactory *bfactory) throw( HelpProcessingException ); - ~BlockManager(); - Block& accessBlock(int blockNumber); - void setModified(int blNum); - void close(); - Block& getNewBlock(); - void processBlocks(BlockProcessor &processor); - void mapBlock(Block* block); - void addDescriptor(Block* block) throw( HelpProcessingException ); -private: - void writeBlock(const Block &bl); -}; - -int BlockManager::INCR = 64; // size increment - -class EntryProcessor -{ -public: - virtual void processEntry(const std::string &string, int id) const = 0; - virtual ~EntryProcessor() {}; -}; - -class BtreeDict -{ -public: - static int ENTHEADERLEN; - static int BLOCKSIZE; - static int DATALEN; - static int MaxKeyLength; - static int lastPtrIndex; -protected: - BlockManager *blockManager; - int root; - std::vector<int> blocks; - - BtreeDict() 
{/*empty*/} - ~BtreeDict() { delete blockManager; } - BtreeDict(const BtreeDictParameters *params); - void init(const BtreeDictParameters *params, bool update, - BlockFactory *bfactory); -public: - int fetch(const std::string &key); - void close(); -private: - std::string fetch(int conceptID); - IntegerArray withPrefix(const std::string &prefix); -public: - DictBlock& accessBlock(int index); - DictBlock& child(const DictBlock &bl, int index) throw( HelpProcessingException ); -private: - std::string findID(int blNum, int id); - int find(const DictBlock &bl, std::vector<unsigned char> &key, int index); - int find(const DictBlock &bl, std::vector<unsigned char> &key); - void setBlocks(std::vector<int> &blocks); - void map(const EntryProcessor &processor); -public: - void dumpnode(DictBlock &bl, int level); -}; - -class BlockFactory -{ -public: - virtual Block* makeBlock() const = 0; - virtual ~BlockFactory() {} -}; - -static int dictcount; - -class DictBlockFactory : public BlockFactory -{ -public: - Block* makeBlock() const - { - dictcount++; - return new DictBlock; - } -}; - -BtreeDict::BtreeDict(const BtreeDictParameters *params) -{ - init(params, false, new DictBlockFactory()); - blocks.resize(params->getFreeID()); - setBlocks(blocks); -} - -void BtreeDict::dumpnode(DictBlock &bl, int level) -{ - if (!bl._isLeaf) - { - fprintf(stderr, "\n"); - for (int i = 0; i < level; ++i) - fprintf(stderr, "\t"); - fprintf(stderr, "there are %d entries\n", bl.numberOfEntries()); - for (int i = 0; i < level; ++i) - fprintf(stderr, "\t"); - for (int i = 0; i < bl.numberOfEntries(); ++i) - { - int index = bl.getChildIdx(i); - fprintf(stderr, " %d ", index); - DictBlock &thischild = accessBlock(index); - dumpnode(thischild, level + 1); - } - fprintf(stderr, "\n"); - } -} - -int BtreeDict::fetch(const std::string &key) -{ - HCDBG(std::cerr << "fetching " << key << " from root " << root << std::endl); - DictBlock &bl = accessBlock(root); - - int length = key.size(); - std::vector<unsigned char> Key(length + 1); - memcpy(&(Key[0]), key.c_str(), length); - Key[length] = 0; // sentinel - - return find(bl, Key); -} - -std::string BtreeDict::fetch(int conceptID) -{ - return findID(blocks[conceptID], conceptID); -} - -IntegerArray BtreeDict::withPrefix(const std::string &prefix) -{ - IntegerArray result; - accessBlock(root).withPrefix(*this, prefix, prefix.size(), result); - return result; -} - -void BtreeDict::close() -{ - blockManager->close(); -} - -void BtreeDict::init(const BtreeDictParameters *params, bool update, - BlockFactory *bfactory) -{ - blockManager = new BlockManager(params, update, bfactory); - root = params->getRootPosition(); -} - -DictBlock& BtreeDict::accessBlock(int index) -{ - return (DictBlock&)blockManager->accessBlock(index); -} - -DictBlock& BtreeDict::child(const DictBlock &bl, int index) throw( HelpProcessingException ) -{ - if (bl._isLeaf) - { - std::stringstream aStrStream; - aStrStream << "leaf's can't have children, screwed!" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - return accessBlock(bl.getChildIdx(index)); -} - -std::string BtreeDict::findID(int blNum, int id) -{ - return accessBlock(blNum).findID(id); -} - -int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key, int index) -{ - HCDBG(std::cerr << "find2: " << bl._isLeaf << " : " << index << " : " << std::endl); - - return bl._isLeaf ? 
0 : find(child(bl, index), key); -} - -int BtreeDict::find(const DictBlock &bl, std::vector<unsigned char> &key) -{ - int inputKeyLen = key.size() - 1; - int entryPtr = bl.firstEntry(); - int freeSpace = bl.free(); - int nCharsEqual = 0; - int compression = 0; - - HCDBG(std::cerr << "find1: " << inputKeyLen << " : " - << entryPtr << " : " << freeSpace << " : " << nCharsEqual << " " - << compression << std::endl); - - for (int entryIdx = 0;;) - { - if (entryPtr == freeSpace) - return find(bl, key, bl.numberOfEntries()); - else if (compression == nCharsEqual) - { - int keyLen = bl.entryKeyLength(entryPtr); - int keyPtr = bl.entryKey(entryPtr), i; - for (i = 0; i < keyLen && key[nCharsEqual] == bl._data[keyPtr + i]; i++) - ++nCharsEqual; - if (i == keyLen) - { - if (nCharsEqual == inputKeyLen) - return bl.entryID(entryPtr); - } - else if ((key[nCharsEqual]&0xFF) < (bl._data[keyPtr + i]&0xFF)) - return find(bl, key, entryIdx); - } - else if (compression < nCharsEqual) // compression dropped - return find(bl, key, entryPtr == freeSpace - ? bl.numberOfEntries() : entryIdx); - do - { - entryPtr = bl.nextEntry(entryPtr); - ++entryIdx; - } - while (bl.entryCompression(entryPtr) > nCharsEqual); - compression = bl.entryCompression(entryPtr); - } -} - -class BlockProcessor -{ -protected: - std::vector<int> &blocks; -public: - BlockProcessor(std::vector<int> &_blocks) : blocks(_blocks) {} - virtual void process(const Block &block) = 0; - virtual ~BlockProcessor() {} -}; - - -class DictBlockProcessor : public BlockProcessor -{ -public: - DictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {} - void process(const Block &block) - { - ((const DictBlock&)block).setBlockNumbers(blocks); - } -}; - -BlockManager::BlockManager(const BlockManagerParameters *params, - bool update, BlockFactory *bfactory) throw( HelpProcessingException ) - : _blockFactory(bfactory) -{ - _update = update; - // params.readState(); - _blockSize = params->getBlockSize(); - HCDBG(std::cerr << "opening " << params->getFile().native_file_string() << std::endl); - if (!update) - { - _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::binary); - } - else - { - _file.open(params->getFile().native_file_string().c_str(), std::ios::in | std::ios::out | std::ios::binary); - if (!_file.is_open()) - { - HCDBG(std::cerr << "didn't exist" << std::endl); - _file.open(params->getFile().native_file_string().c_str(), - std::ios::in | std::ios::out | std::ios::trunc | std::ios::binary); - } - if (!_file.is_open()) - { - std::stringstream aStrStream; - aStrStream << "Cannot open " << params->getFile().native_file_string() << std::endl; - throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); - } - } - - _file.seekg(0, std::ios::end); - long length = _file.tellg(); - if (length < 0) length = 0; - _file.seekg(0, std::ios::beg); - _file.clear(); - - HCDBG(std::cerr << "len is " << length << std::endl); - - if (length <= 0 && update) - { - Block* _dummy = bfactory->makeBlock(); - _dummy->setBlockNumber(0); - writeBlock(*_dummy); - delete _dummy; - length = _blockSize; - } - - _file.seekg(0, std::ios::beg); - - int _blockTableSize = (length/_blockSize); - HCDBG(std::cerr << "len is now " << _blockTableSize << std::endl); - for (int i = 0; i < _blockTableSize; ++i) - mapBlock(bfactory->makeBlock()); -} - -Block& BlockManager::getNewBlock() -{ - unsigned int number = _blockTab.size(); - - Block *bl = _blockFactory->makeBlock(); - bl->setBlockNumber(number); - writeBlock(*bl); - 
addDescriptor(bl); - - return *(_blockTab[number]._block); -} - -void BlockManager::setModified(int blNum) -{ - _blockTab[blNum]._modf = true; -} - -void BlockManager::close() -{ - if (_update) - { - std::vector<BlockDescriptor>::const_iterator aEnd = _blockTab.end(); - for (std::vector<BlockDescriptor>::const_iterator aIter = _blockTab.begin(); - aIter != aEnd; ++aIter) - { - if (aIter->_modf) - writeBlock(*(aIter->_block)); - } - } - _file.close(); -} - -void BlockManager::processBlocks(BlockProcessor &processor) -{ - std::vector<BlockDescriptor>::const_iterator aEnd = _blockTab.end(); - for (std::vector<BlockDescriptor>::const_iterator aIter = _blockTab.begin(); - aIter != aEnd; ++aIter) - { - processor.process(*(aIter->_block)); - } -} - -void BlockManager::mapBlock(Block* block) -{ - block->readIn(_file); - addDescriptor(block); -} - -void BlockManager::addDescriptor(Block *block) throw( HelpProcessingException ) -{ - BlockDescriptor desc(block); - _blockTab.push_back(desc); - HCDBG(std::cerr << "numbers are " << block->_number << " " << (_blockTab.size()-1) << std::endl); - if (block->_number != _blockTab.size() - 1) - { - std::stringstream aStrStream; - aStrStream << "totally screwed" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - HCDBG(std::cerr << "addDescriptor blocks are now " << _blockTab.size() << std::endl); -} - -void BlockManager::writeBlock(const Block &bl) -{ - _file.seekp(_blockSize * bl._number); - bl.writeOut(_file); -} - -Block& BlockManager::accessBlock(int blockNumber) -{ - return *(_blockTab[blockNumber]._block); -} - -BlockManager::~BlockManager() -{ - std::vector<BlockDescriptor>::iterator aEnd = _blockTab.end(); - for (std::vector<BlockDescriptor>::iterator aIter = _blockTab.begin(); - aIter != aEnd; ++aIter) - { - delete aIter->_block; - } - delete _blockFactory; -} - -void BtreeDict::setBlocks(std::vector<int> &inblocks) -{ - DictBlockProcessor foo(inblocks); - blockManager->processBlocks(foo); -} - -// can go to Full -void BtreeDict::map(const EntryProcessor &processor) -{ - accessBlock(root).doMap(*this, processor); -} - -void DictBlock::restoreKeyInBuffer(int entry, std::vector<unsigned char> &buffer) -{ - int howMany = entryKeyLength(entry); - int where = entryCompression(entry); - int from = entryKey(entry); - while (howMany-- > 0) - buffer[where++] = _data[from++]; -} - -std::string DictBlock::restoreKey(int entry, std::vector<unsigned char> &buffer) -{ - int howMany = entryKeyLength(entry); - int where = entryCompression(entry); - int from = entryKey(entry); - while (howMany-- > 0) - buffer[where++] = _data[from++]; - return std::string((const char*)(&buffer[0]), 0, where); -} - -std::string DictBlock::findID(int id) throw( HelpProcessingException ) -{ - std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength); - int freeSpace = free(); - for (int ent = firstEntry(); ent < freeSpace; ent = nextEntry(ent)) - { - if (entryID(ent) == id) // found - return restoreKey(ent, buffer); - else - restoreKeyInBuffer(ent, buffer); - } - std::stringstream aStrStream; - aStrStream << "ID not found in block" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); -} - -void DictBlock::setBlockNumbers(std::vector<int> &blocks) const -{ - for (int e = firstEntry(); e < _free; e = nextEntry(e)) - blocks[entryID(e)] = _number; -} - -void DictBlock::listBlock() -{ - std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength); - int freeSpace = free(); - int entryPtr = 
firstEntry(); - if (_isLeaf) - { - while (entryPtr < freeSpace) - { - std::cout << restoreKey(entryPtr, buffer) << " " << - entryID(entryPtr); - entryPtr = nextEntry(entryPtr); - } - } - else - std::cout << "not leaf" << std::endl; -} - -void DictBlock::doMap(BtreeDict &owner, const EntryProcessor &processor) -{ - std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength); - int freeSpace = free(); - int entryPtr = firstEntry(); - if (_isLeaf) - { - while (entryPtr < freeSpace) - { - processor.processEntry(restoreKey(entryPtr, buffer), - entryID(entryPtr)); - entryPtr = nextEntry(entryPtr); - } - } - else - { - int entryIdx = 0; - while (entryPtr < freeSpace) - { - owner.accessBlock(getChildIdx(entryIdx)).doMap(owner,processor); - processor.processEntry(restoreKey(entryPtr, buffer), - entryID(entryPtr)); - entryPtr = nextEntry(entryPtr); - ++entryIdx; - } - owner.accessBlock(getChildIdx(entryIdx)).doMap(owner, processor); - } -} - -void DictBlock::withPrefix(BtreeDict &owner, const std::string &prefix, - size_t prefLen, IntegerArray &result) -{ - std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength); - int freeSpace = free(); - int entryPtr = firstEntry(); - if (_isLeaf) - { - while (entryPtr < freeSpace) - { - if (restoreKey(entryPtr, buffer).find(prefix) == 0) - result.push_back(entryID(entryPtr)); - entryPtr = nextEntry(entryPtr); - } - } - else - { - int entryIndex = 0; - while (entryPtr < freeSpace) - { - std::string key = restoreKey(entryPtr, buffer); - if (key.size() > prefLen) - key = key.substr(0, prefLen); - int cmp = key.compare(prefix); - if (cmp < 0) - { - entryPtr = nextEntry(entryPtr); - ++entryIndex; - } - else if (cmp == 0) - { - result.push_back(entryID(entryPtr)); - owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); - entryPtr = nextEntry(entryPtr); - ++entryIndex; - } - else - { - owner.accessBlock(getChildIdx(entryIndex)).withPrefix(owner, prefix, prefLen, result); - return; - } - } - owner.accessBlock(getChildIdx(numberOfEntries())).withPrefix(owner, prefix, prefLen, result); - } -} - -int BtreeDict::ENTHEADERLEN = 6; -int BtreeDict::BLOCKSIZE = 2048; -int BtreeDict::DATALEN = BtreeDict::BLOCKSIZE - Block::HEADERLEN; -int BtreeDict::MaxKeyLength = 255; - //!!! 
Careful with that number, Eugene -int BtreeDict::lastPtrIndex = 508; - -DictBlock::DictBlock() : Block(BtreeDict::BLOCKSIZE) -{ -} - -int DictBlock::getChildIdx(int index) const -{ - return nthPointer(BtreeDict::lastPtrIndex - index); -} - -int DictBlock::entryLength(int entry) const -{ - return BtreeDict::ENTHEADERLEN + entryKeyLength(entry); -} - -int DictBlock::entryKey(int entry) const -{ - return entry + BtreeDict::ENTHEADERLEN; -} - -void setBlockNumber2(std::vector<int> &blocks, size_t index, int number) -{ - if (index >= blocks.size()) - blocks.resize(index + 1000); - blocks[index] = number; -} - -class Entry -{ -public: - std::vector<unsigned char> key; - int id; - int block; - - Entry(const std::vector<unsigned char> &keyin, int length, int idin) : key(length+1), id(idin), block(-1) - { - memcpy(&key[0], &keyin[0], length); - } - - Entry(const std::string &keyin, int idin) : key(keyin.size()+1), id(idin), block(-1) - { - memcpy(&key[0], keyin.c_str(), keyin.size()); - } - - bool smallerThan(const Entry &other) - { - for (size_t i = 0; i < std::min(key.size(), other.key.size()); i++) - if (key[i] != other.key[i]) - return (key[i]&0xFF) < (other.key[i]&0xFF); - return false; - } -}; // end of internal class Entry - -class FullDictBlock; - -class FullBtreeDict : public BtreeDict -{ -protected: - BtreeDictParameters *_params; - bool update; -public: - FullBtreeDict(BtreeDictParameters ¶ms, bool update); - void store(const std::string &bla, int id) throw( HelpProcessingException ); - boost::shared_ptr<Entry> insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent); - boost::shared_ptr<Entry> insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent) - throw( HelpProcessingException ); - FullDictBlock& getNewBlock(); - void setModified(Block &bl); - void close(int freeID); -}; - -class FullDictBlock : public DictBlock -{ -public: - virtual void setFree(int free); - void setNumberOfEntries(int n) { setIntegerAt(0, n); } - void setChildIndex(int index, int value) - { - setIntegerAt(4*(BtreeDict::lastPtrIndex - index + 1), value); - } - void setEntryID(int i, int id) { setIntegerAt(i + 2, id); } - void setBlockNumbers(std::vector<int> &blocks) const; - bool insert(const Entry &entry); - void makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr); - bool insert(const Entry &ent, int entryPtr, int compr1, int compr2, int index); - int insertInternal(const Entry &entry); - boost::shared_ptr<Entry> split(FullDictBlock &newbl); - void initInternal(int leftBlock, const Entry &entry); - bool insert(boost::shared_ptr<Entry> entry); - bool insert(boost::shared_ptr<Entry> ent, int entryPtr, - int compr1, int compr2, int index); - -}; - -void FullDictBlock::initInternal(int leftBlock, const Entry &entry) -{ - _isLeaf = false; - setNumberOfEntries(1); - setChildIndex(0, leftBlock); - setChildIndex(1, entry.block); - int ent = firstEntry(); - makeEntry(ent, entry.key, entry.id, entry.key.size() - 1, 0); - setFree(nextEntry(ent)); -} - -void FullDictBlock::setFree(int infree) -{ - _free = infree - firstEntry(); - _data[infree] = _data[infree + 1] = 0; // sentinel -} - -boost::shared_ptr<Entry> FullDictBlock::split(FullDictBlock& newbl) -{ - std::vector<unsigned char> buffer(BtreeDict::MaxKeyLength); - int freeSpace = free(); - int half = freeSpace/2; - int index = 0; // of middle entry - newbl._isLeaf = _isLeaf; - int ent; - for (ent = firstEntry(); ent < half; ent = nextEntry(ent)) - { - restoreKeyInBuffer(ent, buffer); - ++index; - } - int entriesToMove = 
numberOfEntries() - index - 1; - // middle entry - restoreKeyInBuffer(ent, buffer); - int len = entryKeyLength(ent) + entryCompression(ent); - boost::shared_ptr<Entry> result(new Entry(buffer, len, entryID(ent))); - result->block = newbl._number; - int newFree = ent; - // rest goes to the new block - ent = nextEntry(ent); - restoreKeyInBuffer(ent, buffer); - len = entryKeyLength(ent) + entryCompression(ent); - int nptr = firstEntry(); - newbl.makeEntry(nptr, buffer, entryID(ent), len, 0); - ent = nextEntry(ent); - memmove(&(newbl._data[newbl.nextEntry(nptr)]), &(_data[ent]), freeSpace - ent); - newbl.setNumberOfEntries(entriesToMove); - newbl.setFree(newbl.nextEntry(nptr) + freeSpace - ent); - if (_isLeaf == false) // need to split pointers - { - int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1); - int to = from + 4*(index + 1); - memmove(&(newbl._data[to]), &(_data[from]), 4*(entriesToMove + 1)); - } - // this entry will end here - setFree(newFree); - setNumberOfEntries(index); - return result; - //!!!remember updating ID -> string association -} - -void FullDictBlock::setBlockNumbers(std::vector<int> &blocks) const -{ - for (int e = firstEntry(); e < _free; e = nextEntry(e)) - setBlockNumber2(blocks, entryID(e), _number); -} - -bool FullDictBlock::insert(boost::shared_ptr<Entry> ent, int entryPtr, - int compr1, int compr2, int index) -{ - const std::vector<unsigned char> &key = ent->key; - int keyLen = key.size() - 1 - compr1; - int freeSpace = free(); - // calculate how much space is needed to add the new entry - // first, how many bytes are needed for just the new entry - int demand = BtreeDict::ENTHEADERLEN + keyLen; - // adding an entry can increase compression in the following entry - - int increase = 0; - if (entryPtr < freeSpace) - if (entryCompression(entryPtr) < compr2) - increase = compr2 - entryCompression(entryPtr); - /* - std::cerr << "key " << key << std::endl; - std::cerr << "entryPtr " << entryPtr << std::endl; - std::cerr << "compr1 " << compr1) << std::endl; - std::cerr << "compr2 " << compr2) << std::endl; - std::cerr << "index " << index) << std::endl; - std::cerr << "demand " << demand) << std::endl; - std::cerr << "increase " << increase) << std::endl; - */ - // check if enough space is available - int limit = _isLeaf ? BtreeDict::DATALEN-2 : 4*(BtreeDict::lastPtrIndex-numberOfEntries()-1); - - if (freeSpace + demand - increase <= limit) // 2 for sentinel - { - if (entryPtr < freeSpace) - { - // need to shift extant entries forward - int toMove = increase > 0 ? 
entryPtr + BtreeDict::ENTHEADERLEN + increase : entryPtr; - // move entries - memmove(&(_data[toMove + demand - increase]), &(_data[toMove]), freeSpace - toMove); - - if (increase > 0) - { - // update header - unsigned char tmp = static_cast<unsigned char>(increase); - _data[entryPtr] = _data[entryPtr] - tmp; - _data[entryPtr + 1] = _data[entryPtr + 1] + tmp; - // shift header - memmove(&(_data[entryPtr + demand]), &(_data[entryPtr]), BtreeDict::ENTHEADERLEN); - } - } - // now write the new entry in the space made above - makeEntry(entryPtr, key, ent->id, keyLen, compr1); - - if (_isLeaf == false) - { - int from = 4*(BtreeDict::lastPtrIndex - numberOfEntries() + 1); - memmove(&(_data[from - 4]), &(_data[from]), 4*(numberOfEntries() - index)); - setChildIndex(index + 1, ent->block); - } - setFree(freeSpace + demand - increase); - setNumberOfEntries(numberOfEntries() + 1); - - /* - System.err.println("------------list--------------"); - byte[] buffer = new byte[MaxKeyLength]; - final int freeSpace2 = free(); - int entryPtr2 = firstEntry(); - while (entryPtr2 < freeSpace2) - { - System.err.println(entryPtr2); - System.err.println(entryKeyLength(entryPtr2)); - System.err.println(entryCompression(entryPtr2)); - System.err.println(new String(_data, - entryKey(entryPtr2), - entryKeyLength(entryPtr2))); - System.err.println(restoreKey(entryPtr2, buffer)+" "+ - entryID(entryPtr2)); - entryPtr2 = nextEntry(entryPtr2); - } - System.err.println("------------end--------------"); - */ - return true; - } - else - return false; -} - -// finds the place and context -bool FullDictBlock::insert(boost::shared_ptr<Entry> entry) -{ - const std::vector<unsigned char> &inkey = entry->key; - int inputKeyLen = inkey.size() - 1; - int freeSpace = free(); - int entryPtr = firstEntry(); - int nCharsEqual = 0; - int prevNCEqual = 0; - int compression = 0; - - for (int entryIndex = 0;;) - { - if (entryPtr == freeSpace) - return insert(entry, entryPtr, nCharsEqual, 0, numberOfEntries()); - else if (compression == nCharsEqual) - { - int keyLen = entryKeyLength(entryPtr); - int keyPtr = entryKey(entryPtr), i; - prevNCEqual = nCharsEqual; - for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++) - ++nCharsEqual; - if (i == keyLen) - { - if (nCharsEqual == inputKeyLen) - { - HCDBG(std::cerr << "setting to " << entry->id << std::endl); - setEntryID(entryPtr, entry->id); - return true; - } - } - else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF)) - return insert(entry, entryPtr, prevNCEqual, nCharsEqual, entryIndex); - } - else if (compression < nCharsEqual) // compression dropped - { - int index = entryPtr == freeSpace ? 
numberOfEntries() : entryIndex; - return insert(entry, entryPtr, nCharsEqual, compression, index); - } - do - { - entryPtr = nextEntry(entryPtr); - ++entryIndex; - } - while (entryCompression(entryPtr) > nCharsEqual); - compression = entryCompression(entryPtr); - } -} - -static int fulldictcount; - -class FullDictBlockFactory : public BlockFactory -{ -public: - Block* makeBlock() const - { - fulldictcount++; - return new FullDictBlock; - } -}; - -class FullDictBlockProcessor : public BlockProcessor -{ -public: - FullDictBlockProcessor(std::vector<int> &_blocks) : BlockProcessor(_blocks) {} - void process(const Block &block) - { - ((const FullDictBlock&)block).setBlockNumbers(blocks); - } -}; - -FullBtreeDict::FullBtreeDict(BtreeDictParameters ¶ms, bool _update) : - _params(¶ms), update(_update) -{ - init(_params, update, new FullDictBlockFactory()); - HCDBG(std::cerr << "id is " << params.getFreeID() << std::endl); - blocks.resize(params.getFreeID()); - - FullDictBlockProcessor foo(blocks); - blockManager->processBlocks(foo); - /* - if (logging) - log = new FileWriter("/tmp/FullBtreeDict.log"); - */ -} - -void FullBtreeDict::setModified(Block &bl) -{ - blockManager->setModified(bl._number); -} - -FullDictBlock& FullBtreeDict::getNewBlock() -{ - FullDictBlock &nbl = (FullDictBlock&)blockManager->getNewBlock(); - setModified(nbl); - return nbl; -} - -boost::shared_ptr<Entry> FullBtreeDict::insertHere(FullDictBlock &bl, boost::shared_ptr<Entry> ent) - throw( HelpProcessingException ) -{ - setModified(bl); // to be modified in any case - if (bl.insert(ent)) - return boost::shared_ptr<Entry>(); - else - { - FullDictBlock &nbl = getNewBlock(); - boost::shared_ptr<Entry> middle = bl.split(nbl); - nbl.setBlockNumbers(blocks); - if ((middle->smallerThan(*ent) ? nbl : bl).insert(ent) == false) - { - std::stringstream aStrStream; - aStrStream << "entry didn't fit into a freshly split block" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - return middle; - } -} - -void FullDictBlock::makeEntry(int entry, const std::vector<unsigned char> &key, int id, int length, int compr) -{ - _data[entry] = static_cast<unsigned char>(length); - _data[entry + 1] = static_cast<unsigned char>(compr); - setEntryID(entry, id); - memmove(&(_data[entryKey(entry)]), &(key[compr]), length); -} - -int FullDictBlock::insertInternal(const Entry &entry) -{ - const std::vector<unsigned char> &inkey = entry.key; - int inputKeyLen = inkey.size() - 1; - int entryPtr = firstEntry(); - int freeSpace = free(); - int nCharsEqual = 0; - int compression = 0; - - for (int entryIndex = 0;;) - { - if (entryPtr == freeSpace) - return numberOfEntries(); - else if (compression == nCharsEqual) - { - int i; - int keyLen = entryKeyLength(entryPtr); - int keyPtr = entryKey(entryPtr); - for (i = 0; i < keyLen && inkey[nCharsEqual] == _data[keyPtr + i]; i++) - ++nCharsEqual; - if (i == keyLen) - { - if (nCharsEqual == inputKeyLen) - { - setEntryID(entryPtr, entry.id); - return -1; - } - } - else if ((inkey[nCharsEqual]&0xFF) < (_data[keyPtr + i]&0xFF)) - return entryIndex; - } - else if (compression < nCharsEqual) // compression dropped - return entryPtr >= freeSpace ? 
numberOfEntries() : entryIndex; - - do - { - entryPtr = nextEntry(entryPtr); - ++entryIndex; - } - while (entryCompression(entryPtr) > nCharsEqual); - compression = entryCompression(entryPtr); - } -} - -/* - delegation to powerful primitives at the FullDictBlock level lets us - express the insertion algorithm very succintly here -*/ -boost::shared_ptr<Entry> FullBtreeDict::insert(FullDictBlock &bl, boost::shared_ptr<Entry> ent) -{ - if (bl._isLeaf) - ent = insertHere(bl, ent); - else - { - int index = bl.insertInternal(*ent); - if (index != -1) - { - ent = insert((FullDictBlock&)child(bl, index), ent); - if (ent.get()) - ent = insertHere(bl, ent); - } - } - return ent; -} - -void FullBtreeDict::store(const std::string &key, int id) throw( HelpProcessingException ) -{ - HCDBG(std::cerr << "so storing " << key << " id " << id << std::endl); - - if (key.size() >= 250) - { - std::stringstream aStrStream; - aStrStream << "token " << key << " too long" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - boost::shared_ptr<Entry> aTemp(new Entry(key, id)); - FullDictBlock &rBlock = (FullDictBlock&)accessBlock(root); - boost::shared_ptr<Entry> entry = insert(rBlock, aTemp); - if (entry.get()) - { - // new root; writing to params needed - FullDictBlock &nbl = getNewBlock(); - nbl.initInternal(root, *entry); - setBlockNumber2(blocks, entry->id, root = nbl._number); - _params->setRoot(root); - } -} - -void FullBtreeDict::close(int freeID) -{ - _params->setFreeID(freeID); - if (update) - _params->updateSchema(); - BtreeDict::close(); - /* - if (logging) - log.close(); - */ -} - -class ConceptLocation -{ -public: - int _concept; - int _begin; - int _end; -public: - ConceptLocation(int conceptID, int begin, int end); - static void sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2); - static void sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2); - int getConcept() const { return _concept; } - void setConcept(int concept) { _concept = concept; } - int getBegin() const { return _begin; } - int getEnd() const { return _end; } - int getLength() const { return _end - _begin; } - bool equals(const ConceptLocation &other) const - { - return _concept==other._concept&&_begin==other._begin&&_end==other._end; - } -}; - -class DocumentCompressor; - -class Index : public IndexAccessor -{ -protected: - typedef std::hash_map<std::string, int, pref_hash> IndexHashtable; - bool _update; - IndexHashtable _cache; - Schema *_schema; -private: - BtreeDictParameters *_dictParams; - FullBtreeDict *_dict; - int _freeID; - std::fstream *_positionsFile; - std::fstream *_offsetsFile; - DocumentCompressor *_documentCompressor; - IntegerArray _concepts; - IntegerArray _offsets; - std::vector<unsigned char> _allLists; // POSITIONS - void readDocumentsTable(const std::string &fileName); - void readOffsetsTables(const std::string &fileName); - void readPositions(); -protected: - IntegerArray _microIndexOffsets; - IntegerArray _documents; - IntegerArray _titles; - std::vector<unsigned char> _positions; -private: - int _positionsCacheSize; - int _currentBatchOffset; - bool _allInCache; -protected: - virtual void writeOutOffsets(); -public: - Index(const fs::path &indexName, bool update); - virtual ~Index(); - void init(); - int intern(const std::string &name); - std::fstream& getPositionsFile(); - std::fstream& getOffsetsFile(); - DocumentCompressor& getDocumentCompressor(); - virtual void compress(int docID, int titleID, - std::vector<ConceptLocation> 
&locations, - std::vector<ConceptLocation> &extents); - void close(); -}; - -Index::Index(const fs::path &indexName, bool update) : IndexAccessor(indexName), - _update(update), _cache(256), _schema(NULL), _dictParams(NULL), _dict(NULL), _positionsFile(0), _offsetsFile(0), _documentCompressor(0), - _positionsCacheSize(0), _currentBatchOffset(0), _allInCache(false) -{ -} - -class CompressorIterator; -class Decompressor -{ -private: - static int BitsInByte; - static int NBits; - - int _readByte; - int _toRead; - int _path; - -protected: - virtual int getNextByte() = 0; - virtual void initReading() { _toRead = 0; _path = 0; } - -private: - int countZeroes(); - // reads 1 bit; returns non-0 for bit "1" - int read(); - -public: - int read(int kBits); - void beginIteration() { _path = 0; } - bool readNext(int k, CompressorIterator &it); - void decode(int k, IntegerArray &array); - void ascDecode(int k, IntegerArray &array); - int ascendingDecode(int k, int start, std::vector<int> &array); - virtual ~Decompressor() {} -}; - -int Decompressor::BitsInByte = 8; -int Decompressor::NBits = 32; - -class ByteArrayDecompressor : public Decompressor -{ -private: - const std::vector<unsigned char> *_array; - int _index; - int _index0; -public: - ByteArrayDecompressor(const std::vector<unsigned char> *array, int index) { initReading(array, index); } - using Decompressor::initReading; - virtual void initReading(const std::vector<unsigned char> *array, int index) - { - _array = array; - _index = _index0 = index; - Decompressor::initReading(); - } - int bytesRead() { return _index - _index0; } -protected: - int getNextByte() - { - int ret = (*_array)[_index] & 0xFF; - HCDBG(fprintf(stderr, "ByteArrayDecompressor::getNextByte of %d at index %d\n", ret, _index)); - _index++; - return ret; - } -}; - -bool isExtensionMode( void ); - -class IndexInverter; - -class MicroIndex -{ -public: - static int RANGE; - static int NConcepts; -private: - int _currentRange; - int _documentNumber; - std::vector<int> _concepts; - short _group; - short _ix; - IntegerArray _kTable; - IntegerArray _offsets; - IntegerArray _maxConcepts; - const std::vector<unsigned char> *_data; - int _base; - int _limit; - int _nc; - ByteArrayDecompressor _decmp; -public: - MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index); - bool smallerThan(const MicroIndex &other) - { - return _currentRange < other._currentRange || - _currentRange == other._currentRange && - _documentNumber < other._documentNumber; - } - -private: - bool next() - { - if (_group <= _limit) - { - int shift, index; - if (_group > 0) - { - index = _base + _offsets[_group - 1]; - shift = _maxConcepts[_group - 1]; - } - else - { - index = _base; - shift = 0; - } - - _decmp.initReading(_data, index); - _nc = _decmp.ascendingDecode(_kTable[_group*2], shift, _concepts); - HCDBG(std::cerr << "nc b set to " << _nc << std::endl); - if (_group < _limit) - { - HCDBG(fprintf(stderr, "microindex concept index %d set to %d\n", _nc, _maxConcepts[_group])); - _concepts[_nc++] = _maxConcepts[_group]; - } - _currentRange = _concepts[_ix = 0]/RANGE; - _group++; - return true; - } - else - return false; - } - - void openDocumentIndex() - { - unsigned int kk = (*_data)[_base] & 0xFF; - HCDBG(std::cerr << "openDocumentIndex, kk is " << kk - << " base is " << _base << std::endl); - switch (kk >> 6) // get type - { - case 0: // single group, no extents - _decmp.initReading(_data, _base += 2); - _nc = _decmp.ascendingDecode(kk & 0x3F, 0, _concepts); - HCDBG(std::cerr << 
"nc a set to " << _nc << std::endl); - _currentRange = _concepts[_ix = 0]/RANGE; - _limit = 0; - _group = 1; - break; - case 2: // multi group, no extents - { - _decmp.initReading(_data, _base + 1); - _decmp.decode(kk & 0x3F, _kTable); - int last = _kTable.back(); - _kTable.pop_back(); - _decmp.ascDecode(last, _offsets); - last = _kTable.back(); - _kTable.pop_back(); - _decmp.ascDecode(last, _maxConcepts); - _base += 1 + _decmp.bytesRead(); - _limit = _maxConcepts.size(); - _group = 0; - next(); - } - break; - case 1: // single group, extents - case 3: // multi group, extents - if( !isExtensionMode() ) - std::cerr << "extents not yet implemented" << std::endl; - break; - } - } - -public: - bool process(IndexInverter &lists); -}; - -int MicroIndex::RANGE = 1024; -int MicroIndex::NConcepts = 16; - -class BitBuffer -{ -private: - static int InitSize; - static int NBits; - static int BitsInByte; - static int BytesInInt; - - int _avail; - unsigned int _word; - int _free; - int _size; - std::vector<unsigned int> _array; - -public: - BitBuffer() : _avail(NBits), _word(0), _free(0), _size(InitSize) - { - _array.resize(InitSize); - } - - void close() - { - if (_avail < NBits) - store(_word << _avail); - else - _avail = 0; - } - - void write(std::fstream &out) const - { - for (int i = 0; i < _free - 1; i++) - writeInt(out, _array[i]); - unsigned int word = _array[_free - 1]; - int bytes = BytesInInt - _avail/BitsInByte; - int shift = NBits; - while (bytes-- > 0) - writeByte(out, static_cast<unsigned char>((word >> (shift -= BitsInByte)) & 0xFF)); - } - - void clear() - { - _word = 0; - _avail = NBits; - _free = 0; - } - - int byteCount() { return _free*BytesInInt - _avail/BitsInByte; } - int bitCount() { return _free*NBits - _avail; } - - void setFrom(const BitBuffer &rhs) - { - _word = rhs._word; - _avail = rhs._avail; - if ((_free = rhs._free) > _size) - _array.resize(_size = rhs._free); - _array = rhs._array; - } -private: - void growArray(int newSize) - { - _array.resize(newSize); - _size = newSize; - } - - void store(unsigned int value) - { - if (_free == _size) - growArray(_size * 2); - HCDBG(fprintf(stderr, "store of %x to %d\n", (int)value, _free)); - _array[_free++] = value; - } - -public: - void append(int bit) - { - _word = (_word << 1) | bit; - if (--_avail == 0) - { - store(_word); - _word = 0; - _avail = NBits; - } - } - - void append(unsigned int source, int kBits) - { - if (kBits < _avail) - { - _word = (_word << kBits) | source; - _avail -= kBits; - } - else if (kBits > _avail) - { - int leftover = kBits - _avail; - store((_word << _avail) | (source >> leftover)); - _word = source; - _avail = NBits - leftover; - } - else - { - store((_word << kBits) | source); - _word = 0; - _avail = NBits; - } - } - - void concatenate(const BitBuffer &bb) - { - if (_size - _free < bb._free) - growArray(_free + bb._free + 1); - - if (_avail == 0) - { - memmove(&_array[_free], &bb._array[0], bb._free * sizeof(unsigned int)); - _avail = bb._avail; - _free += bb._free; - HCDBG(fprintf(stderr, "free bumped to %d\n", _free)); - } - else - { - int tp = _free - 1; // target - int sp = 0; // source - do - { - _array[tp] |= bb._array[sp] >> (NBits - _avail); - _array[++tp] = bb._array[sp++] << _avail; - } - while (sp < bb._free); - _free += bb._free; - if ((_avail += bb._avail) >= NBits) - { - _avail -= NBits; - _free--; - } - HCDBG(fprintf(stderr, "other free bumped to %d\n", _free)); - } - } -}; - -class Compressor -{ -private: - static int NBits; - static int BeginK; - BitBuffer _buffer; -public: - void 
write(std::fstream &out) const { _buffer.write(out); } - int byteCount() { return _buffer.byteCount(); } - void clear() { _buffer.clear(); } - void concatenate(const Compressor &other) { _buffer.concatenate(other._buffer); } - void encode(const IntegerArray &pos, int k); - void encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2); - // k: starting value for minimization - int minimize(const IntegerArray &array, int startK); - int compressAscending(const IntegerArray &array); -}; - -void toDifferences(const IntegerArray &in, IntegerArray &out) -{ - if (out.size() < in.size()) - out.resize(in.size()); - if (in.empty()) - return; - out[0] = in[0]; - for (size_t i = 1; i < in.size(); ++i) - out[i] = in[i] - in[i - 1]; -} - -class IndexInverter -{ -private: - static int K; - std::vector<IntegerArray> _arrays; - int _minConcept; - int _limit; - IntegerArray _concepts; - IntegerArray _offsets; - Compressor _compr; - IntegerArray _diffs; - std::fstream *_mainFile; - // heap - int _heapSize; - std::vector<MicroIndex*> _heap; - - Index &_index; - -public: - IndexInverter(Index &index) : _arrays(MicroIndex::RANGE), - _minConcept(0), _limit(MicroIndex::RANGE), - _mainFile(0), _heapSize(0), _index(index) {} - ~IndexInverter() - { - delete _mainFile; - for (int i = 0; i < _heapSize; i++) - { - HCDBG(fprintf(stderr, "deleting number %d\n", i)); - delete _heap[i]; - } - } - void invertIndex(int nDocuments, const IntegerArray µIndexOffsets) - { - _mainFile = _index.getOutputStream("DOCS"); - for (int i = 0; i < MicroIndex::RANGE; i++) - _arrays[i] = IntegerArray(); - - // read in the whole POSITIONS file - std::vector<unsigned char> positions = _index.readByteArray("POSITIONS"); - // build heap - _heap.clear(); - _heap.resize(_heapSize = nDocuments); - for (int i = 0; i < nDocuments; i++) - _heap[i] = new MicroIndex(i, &positions, microIndexOffsets[i]); - for (int i = _heapSize/2; i >= 0; i--) - heapify(i); - // process till exhausted - while (!_heap.empty()) - if (_heap[0]->process(*this)) - heapify(0); - else if (_heapSize > 1) - { - delete _heap[0]; - _heap[0] = _heap[--_heapSize]; - heapify(0); - } - else - break; - // closing - flush(); - _mainFile->close(); - // compress index file - std::fstream *indexFile = _index.getOutputStream("DOCS.TAB"); - unsigned char byte = static_cast<unsigned char>( - _compr.compressAscending(_concepts)); - indexFile->write( (const char*)&byte, 1 ); // write k - _compr.write(*indexFile); - _compr.clear(); - byte = static_cast<unsigned char>(_compr.minimize(_offsets, K)); - indexFile->write( (const char*)&byte, 1 ); // write k - _compr.write(*indexFile); - indexFile->close(); - delete indexFile; - } - - short process(int documentNumber, std::vector<int> &concepts, - int n, short start, bool firstTime) - { - if (firstTime && concepts[start] >= _limit) - flush(); - concepts[n] = _limit; // sentinel - while (concepts[start] < _limit) - { - _arrays[concepts[start++] - _minConcept].push_back(documentNumber); - } - return start; - } - -private: - void heapify(int i) - { - int r = (i + 1) << 1, l = r - 1; - int smallest = l < _heapSize && _heap[l]->smallerThan(*_heap[i]) ? 
l : i; - if (r < _heapSize && _heap[r]->smallerThan(*_heap[smallest])) - smallest = r; - if (smallest != i) - { - MicroIndex *temp = _heap[smallest]; - _heap[smallest] = _heap[i]; - _heap[i] = temp; - heapify(smallest); - } - } - - void flush() - { - for (int i = 0; i < MicroIndex::RANGE; ++i) - { - if (!_arrays[i].empty()) - { - toDifferences(_arrays[i], _diffs); - unsigned char byte = static_cast<unsigned char>( - _compr.minimize(_diffs, K)); - _mainFile->write( (const char*)&byte, 1 ); // write k - _offsets.push_back(_compr.byteCount() + 1); - _compr.write(*_mainFile); - _concepts.push_back(_minConcept + i); - _arrays[i].clear(); - _diffs.clear(); - _compr.clear(); - } - } - _limit += MicroIndex::RANGE; - _minConcept += MicroIndex::RANGE; - } -}; - -int IndexInverter::K = 3; - -MicroIndex::MicroIndex(int documentNumber, const std::vector<unsigned char> *positions, int index) - : _concepts(NConcepts + 1), _data(positions), _decmp(NULL, 0) -{ - _documentNumber = documentNumber; - _base = index; - openDocumentIndex(); -} - -bool MicroIndex::process(IndexInverter &lists) -{ - bool firstTime = true; - while (true) - { - short stop = lists.process(_documentNumber, _concepts, _nc, _ix, firstTime); - if (stop < _nc) - { - _currentRange = _concepts[_ix = stop]/RANGE; - return true; - } - else if (next()) - firstTime = false; - else - return false; - } -} - -void Index::close() -{ - /* - BtreeDictCompactor source = new BtreeDictCompactor(_dictParams, false); - - URL url = new URL("file", "", _indexDir + "compacted"); - BtreeDictParameters params = - new BtreeDictParameters(url, _dictParams.getBlockSize(), 0, _freeID); - source.compact(params); - URL tmapURL = new URL("file", "", _indexDir + "DICTIONARY"); - File tmap = new File(tmapURL.getFile()); - File compacted = new File(url.getFile()); - compacted.renameTo(tmap); - _dictParams.setRoot(params.getRootPosition()); - _dictParams.updateSchema(); - */ - _dict->close(_freeID); - if (_positionsFile) - { - delete _positionsFile; - _positionsFile = NULL; - } - - if (_update) - { - writeOutOffsets(); - _dictParams->setFreeID(_freeID); - _dictParams->updateSchema(); - _schema->save(); - IndexInverter inverter(*this); - inverter.invertIndex(_documents.size(), _microIndexOffsets); - } - if (_offsetsFile) - { - delete _offsetsFile; - _offsetsFile = NULL; - } -} - -void Index::init() -{ - bool indexExists = false; - if (_update) - { - createIfNeeded(); - _cache.clear(); - } - if (_schema) delete _schema; - _schema = new Schema(*this, _update); - - if (_dictParams) delete _dictParams; - _dictParams = new BtreeDictParameters(*_schema, "DICTIONARY"); - - if (_dictParams->readState() == false) - { - _dictParams->setBlockSize(2048); - _dictParams->setRoot(0); - _dictParams->setFreeID(1); - } - else - indexExists = true; - - if (_dict) delete _dict; - _dict = new FullBtreeDict(*_dictParams, _update); - - _freeID = _dictParams->getFreeID(); - - _documents.clear(); - if (indexExists) - { - // read in index parts - _allLists = readByteArray("DOCS"); - readDocumentsTable("DOCS.TAB"); - readOffsetsTables("OFFSETS"); - readPositions(); - } - else - { - _microIndexOffsets.clear(); - _titles.clear(); - } -} - -namespace -{ - std::string cliptoken(const std::string &name) - { - std::string key = name; - int length = key.size(); - while(key.size() >= 250) - key = name.substr(--length); - return key; - } -} - -int Index::intern(const std::string &name) -{ - std::string key = cliptoken(name); - IndexHashtable::const_iterator aIter = _cache.find(key); - if (aIter != 
_cache.end()) - return aIter->second; - else - { - //Seeing as we always start off with an empty dictionary, - //our entries will always be in the _cache, so don't ever - //search the underlying dictionary - int id = _freeID++; - _dict->store(key, id); - _cache.insert(IndexHashtable::value_type(key, id)).first->second = id; - return id; - } -} - -std::fstream& Index::getPositionsFile() -{ - if (!_positionsFile) - _positionsFile = getRAF("POSITIONS", _update); - return *_positionsFile; -} - -std::fstream& Index::getOffsetsFile() -{ - if (!_offsetsFile) - _offsetsFile = getRAF("OFFSETS", _update); - return *_offsetsFile; -} - -class VectorBtreeParameters : public BlockManagerParameters -{ -private: - int _vectorLength; -public: - VectorBtreeParameters(Schema &schema, const std::string &partName) : - BlockManagerParameters(schema, partName) - { - _vectorLength = integerParameter("vl"); - } - - void updateSchema() - { - std::ostringstream tmp; - tmp << "vl=" << _vectorLength; - BlockManagerParameters::updateSchema(tmp.str()); - } - - VectorBtreeParameters(Schema &schema, const std::string &partName, int vecLen) - : BlockManagerParameters(schema, partName) - { - _vectorLength = vecLen; - } - - int getVectorLength() { return _vectorLength; } -}; - -enum outerbreak { dobreak, docontinue, donothing }; - -class VectorProcessor -{ - std::vector<unsigned char> _vector; -public: - virtual bool processVector() = 0; - std::vector<unsigned char>& getVectorBuffer() { return _vector; } - virtual ~VectorProcessor() {} -}; - -class VectorBlock; - -class VectorBtree -{ -protected: - VectorBlock *_root; - BlockManager *_blockManager; - VectorBtreeParameters *_params; - int _blockSize; -public: - int _maxEntries; - int _leafDataLimit; -protected: - int _vectorsOffset; - VectorBlock& accessBlock(int index); - VectorBtree() {/*empty*/} -public: - int _vecLen; - int vector(int index) const; - static int memcmp(const std::vector<unsigned char> &v1, - const std::vector<unsigned char> &v2, int i2, int n); - VectorBtree(VectorBtreeParameters *params); - ~VectorBtree() { delete _blockManager; } -}; - -class VectorBlockFactory : public BlockFactory -{ -private: - int _blockSize; -public: - VectorBlockFactory(int blockSize) : _blockSize(blockSize) {} - Block* makeBlock() const; -}; - -VectorBtree::VectorBtree(VectorBtreeParameters *params) -{ - _params = params; - _vecLen = params->getVectorLength(); - _blockSize = params->getBlockSize(); - _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN); - if ((_maxEntries & 1) == 0) // needs to be odd - _maxEntries--; - - _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN; - - _vectorsOffset = (_maxEntries + 1)*Block::IDLEN; - _blockManager = new BlockManager(_params, false, new VectorBlockFactory(_blockSize)); - _root = &(accessBlock(params->getRootPosition())); -} - -VectorBlock& VectorBtree::accessBlock(int index) -{ - return (VectorBlock&)_blockManager->accessBlock(index); -} - -int VectorBtree::memcmp(const std::vector<unsigned char> &v1, - const std::vector<unsigned char> &v2, int i2, int n) -{ - for (int i = 0; i < n; i++, i2++) - if (v1[i] != v2[i2]) - return (v1[i]&0xFF) - (v2[i2]&0xFF); - return 0; -} - -class VectorBlock : public Block -{ -public: - VectorBlock(int size) : Block(size) {} -protected: - int findIndex(const std::vector<unsigned char> &key, const VectorBtree &tree) - { - int i = 0, j = _free - 1; - while (i <= j) - { - int k = (i + j)/2; - int test = VectorBtree::memcmp(key, _data, tree.vector(k),tree._vecLen); 
- // std::cerr << "k = " << k << ", test = " << test << std::endl; - if (test > 0) - i = k + 1; - else if (test < 0) - j = k - 1; - else - return -1 - k; // result always negative; "k" encoded - } - return i; - } -private: - int FindVectorsInLeaf(const std::vector<unsigned char> &lo, - const std::vector<unsigned char> &hi, int commLen, int prefLen, - std::vector<unsigned char> &buffer, int size, const VectorBtree &tree) - { - int idx = 0, start; - for (int nBytesEq = 0;;) - { - // std::cout << "idx = " << idx << std::endl; - if (_data[idx] == nBytesEq) // at compression byte - { - int i; - outerbreak hack(donothing); - for (i = nBytesEq; i < tree._vecLen; i++) - { - if (lo[i] == _data[++idx]) - ++nBytesEq; - else if ((lo[i]&0xFF) < (_data[idx]&0xFF)) - if (nBytesEq >= commLen && (i >= prefLen || (hi[i]&0xFF) >= (_data[idx]&0xFF))) - { - start = nBytesEq; - hack = dobreak; - break; - } - else - return 0; - else - { - idx += tree._vecLen - i; // skip - hack = docontinue; - break; - } - } - - if (hack == dobreak) - break; - else if (hack == docontinue) - continue; - - if (i == tree._vecLen) // eq vec found - if ((_data[++idx]&0xFF) >= prefLen) - { - start = _data[idx++]&0xFF; - break; - } - else - return 0; - } - else if (_data[idx] < nBytesEq) // drop - { - std::cout << idx << std::endl; - nBytesEq = (_data[idx++]); - std::cout << nBytesEq << std::endl; - if (nBytesEq < commLen) - return 0; - else if (lo[nBytesEq] < (_data[idx]&0xFF)) - if (hi[nBytesEq] < (_data[idx]&0xFF)) - return 0; - else - { - start = nBytesEq; // found - break; - } - else - idx += tree._vecLen - nBytesEq; - } - else if ((_data[idx]&0xFF) == 0xFF) - return 0; - else // compression is bigger - idx += tree._vecLen + 1 - _data[idx]; - } - - int length = std::min(size - start, _free - idx); - buffer[0] = static_cast<unsigned char>(start); - memcpy(&(buffer[1]), &(_data[idx]), length); - buffer[length + 1] = 0; - return length + 1; - } -protected: - bool searchLeafBlock(const std::vector<unsigned char> &key, const VectorBtree &tree) - { -#if 0 - processLeafBlock(_printer); -#endif - int nBytesEq = 0; - for (int idx = 0;; idx += tree._vecLen + 1 - _data[idx]) - { - if (_data[idx] == nBytesEq) - { - int i, j; - outerbreak hack(donothing); - for (i = _data[idx], j = idx + 1; i < tree._vecLen; i++, j++) - { - if (key[i] == _data[j]) - ++nBytesEq; - else if ((key[i]&0xFF) < (_data[j]&0xFF)) - return false; - else /* key[i] > _data[j] */ - { - hack = dobreak; - break; - } - } - - if (hack == dobreak) - break; - - if (i == tree._vecLen) /* or nBytesEq == _vecLen */ - return true; /* equal vector found */ - } - else if (_data[idx] < nBytesEq) - return false; - } - return false; - } -public: - bool processLeafBlock(VectorProcessor &processor, const VectorBtree &tree) - { - std::vector<unsigned char> &buffer = processor.getVectorBuffer(); - for (int ix = 0; ix < _free; ix += tree._vecLen - _data[ix] + 1) - { - // cmc: the below line was a comment in the original java, somewhere along - // the line I suspect this was written in c++, then into java - // and now I'm putting it back to c++ :-( - // ::memcpy(&buffer[_data[ix]], &_data[ix + 1], _vecLen - _data[ix]); - memcpy(&(buffer[_data[ix]]), &(_data[ix + 1]), tree._vecLen - _data[ix]); - if (processor.processVector()) - return true; - } - return false; - } -}; // VectorBlock - -Block* VectorBlockFactory::makeBlock() const -{ - return new VectorBlock(_blockSize); -} - -class FullVectorBlock : public VectorBlock -{ -public: - FullVectorBlock(int size) : VectorBlock(size) {} - bool 
isFull(const VectorBtree &tree) const - { - //return pbl->_leaf ? pbl->_free > _leafDataLimit : pbl->_free == _maxEntries; - return _isLeaf ? _free > tree._leafDataLimit : _free == tree._maxEntries; - } -}; - -class FullVectorBtree : public VectorBtree -{ -private: - static int MaxVeclen; - static double SplitRatio; -public: - FullVectorBtree(VectorBtreeParameters* params, bool update); - bool insertVector(const std::vector<unsigned char> &key); -private: - bool treeInsertNonfull(const FullVectorBlock &bl, const std::vector<unsigned char> &key); - bool treeInsertNonfullRoot(const std::vector<unsigned char> &key); - FullVectorBlock& getNewBlock(); - void enableModif(const Block &bl); - void declareModif(const Block &bl); -public: - void close() { _blockManager->close(); } -}; - -int FullVectorBtree::MaxVeclen = 128; -double FullVectorBtree::SplitRatio = 0.5; - -class FullVectorBlockFactory : public BlockFactory -{ -private: - int _blockSize; -public: - FullVectorBlockFactory(int blockSize) : _blockSize(blockSize) {} - Block* makeBlock() const - { - return new FullVectorBlock(_blockSize); - } -}; - -FullVectorBtree::FullVectorBtree(VectorBtreeParameters *params, bool update) -{ - _params = params; - _vecLen = params->getVectorLength(); - _blockSize = params->getBlockSize(); - _blockManager = new BlockManager(params, update, new FullVectorBlockFactory(_blockSize)); - _maxEntries=(_blockSize-Block::HEADERLEN-Block::IDLEN)/(_vecLen+Block::IDLEN); - // System.out.println("_maxEntries = " + _maxEntries); - if ((_maxEntries & 1) == 0) // needs to be odd - _maxEntries--; - _leafDataLimit = _blockSize - _vecLen - Block::HEADERLEN - Block::IDLEN; - _vectorsOffset = (_maxEntries + 1)*Block::IDLEN; - _root = &(accessBlock(params->getRootPosition())); -} - -class CompressorIterator -{ -public: - virtual void value(int value) = 0; - virtual ~CompressorIterator() {} -}; - -int Decompressor::countZeroes() -{ - for (int count = 0;; _readByte = getNextByte(), _toRead = BitsInByte) - { - HCDBG(fprintf(stderr, "count is %d\n", count)); - HCDBG(fprintf(stderr, "Decompressor::countZeroes is %x\n", _readByte)); - HCDBG(fprintf(stderr, "_toRead is %d\n", _toRead)); - HCDBG(fprintf(stderr, "_readByte is %x\n", _readByte)); - while (_toRead-- > 0) - { - if ((_readByte & (1 << _toRead)) != 0) - { - HCDBG(fprintf(stderr, "returning count of %d\n", count)); - return count; - } - else - { - ++count; - HCDBG(fprintf(stderr, "int count to %d\n", count)); - } - } - } - //return 0; -} - -// reads 1 bit; returns non-0 for bit "1" -int Decompressor::read() -{ - if (_toRead-- > 0) - return _readByte & (1 << _toRead); - else - { // get next word - _toRead = BitsInByte - 1; - return (_readByte = getNextByte()) & 0x80; - } -} - -int Decompressor::read(int kBits) -{ - int shift = BitsInByte - _toRead; - if (kBits <= _toRead) - { - HCDBG(fprintf(stderr, "leg 1\n")); - return ((_readByte<<shift) & 0xFF) >> (shift + (_toRead-=kBits)); - } - else - { - HCDBG(fprintf(stderr, "leg 2 _readByte is %d, shift %d\n", _readByte, shift)); - int result = _toRead > 0 ? 
((_readByte << shift) & 0xFF) >> shift : 0; - HCDBG(fprintf(stderr, "result is %d\n", result)); - for (kBits -= _toRead; kBits >= BitsInByte; kBits -= BitsInByte) - { - int foo = getNextByte(); - HCDBG(fprintf(stderr, "byte is %d\n", foo)); - result = (result << BitsInByte) | foo; - HCDBG(fprintf(stderr, "and result is %d\n", result)); - } - if (kBits > 0) - { - int foo = getNextByte(); - HCDBG(fprintf(stderr, "and byte is %d\n", foo)); - int thing = BitsInByte - kBits; - HCDBG(fprintf(stderr, "thing is %d\n", thing)); - _toRead = thing; - _readByte = foo; - int right = (_readByte >> _toRead); - HCDBG(fprintf(stderr, "right is %d\n", right)); - int left = result << kBits; - HCDBG(fprintf(stderr, "kbits are %d\n", kBits)); - HCDBG(fprintf(stderr, "left is %d\n", left)); - int ret = left | right; -// int ret = (result << kBits) | ((_readByte = foo) >> (_toRead = BitsInByte - kBits)); - HCDBG(fprintf(stderr, "and final is %d\n", ret)); - return ret; - } - else - { - _toRead = 0; - HCDBG(fprintf(stderr, "and this result says %d\n", result)); - return result; - } - } -} - -bool Decompressor::readNext(int k, CompressorIterator &it) -{ - if (read() != 0) - { - it.value(_path | read(k)); - return true; - } - else - { - for (int count = 1;; _readByte = getNextByte(), _toRead = BitsInByte) - { - while (_toRead-- > 0) - { - if ((_readByte & (1 << _toRead)) != 0) - { - int saved = _path; - _path = ((_path >> (k + count) << count) | read(count)) << k; - if (_path != saved) - { - it.value(_path | read(k)); - return true; - } - else - { - return false; - } - } - else - { - ++count; - } - } - } - } -} - -void Decompressor::decode(int k, IntegerArray &array) -{ - for (int path = 0;;) - { - if (read() != 0) - { - array.push_back(path | read(k)); - } - else - { - int count = countZeroes() + 1; - int saved = path; - path = ((path >> (k + count) << count) | read(count)) << k; - if (path != saved) // convention for end - array.push_back(path | read(k)); - else - break; - } - } -} - -void Decompressor::ascDecode(int k, IntegerArray &array) -{ - for (int path = 0, start = 0;;) - { - HCDBG(fprintf(stderr, "path is %d, start is %d\n", path, start)); - if (read() != 0) - { - int inread = read(k); - start += path | inread; - HCDBG(fprintf(stderr, "inread is %d\n", inread)); - int final = start; - HCDBG(fprintf(stderr, "1:Decompressor::ascDecode to %d\n", final)); - array.push_back(final); - } - else - { - int count = countZeroes() + 1; - HCDBG(fprintf(stderr, "count is %d\n", count)); - int saved = path; - int inread = read(count); - HCDBG(fprintf(stderr, "inread is %d, k is %d, path is %d\n", inread, - k, path)); - path = ((path >> (k + count) << count) | inread) << k; - if (path != saved) // convention for end - { - int anotherread = read(k); - HCDBG(fprintf(stderr, "newinread is %d\n", anotherread)); - start += path | anotherread; - int final = start; - HCDBG(fprintf(stderr, "2:Decompressor::ascDecode to %d\n", final)); - array.push_back(final); - } - else - { - break; - } - } - } -} - -int Decompressor::ascendingDecode(int k, int start, std::vector<int> &array) -{ - int path = 0, index = 0; - while (true) - { - if (read() != 0) - array[index++] = (start += path | read(k)); - else - { - outerbreak hack = donothing; - for (int cnt = 0;; _readByte = getNextByte(), _toRead = BitsInByte) - { - while (_toRead-- > 0) - { - if ((_readByte & (1 << _toRead)) != 0) - { - ++cnt; - int Path = ((path >> (k + cnt) << cnt) | read(cnt)) << k; - if (Path != path) - { - array[index++] = (start += (path = Path) | read(k)); - hack = 
docontinue; - break; - } - else - return index; - } - else - ++cnt; - } - if (hack == docontinue) - break; - } - } - } -} - -class StreamDecompressor : public Decompressor -{ -private: - std::ifstream *_input; -public: - StreamDecompressor(std::ifstream &input) { initReading(input); } - using Decompressor::initReading; - virtual void initReading(std::ifstream &input) { _input = &input; Decompressor::initReading(); } - int getNextByte() - { - unsigned char ret; - _input->read( (char*)&ret, 1 ); - HCDBG(fprintf(stderr, "StreamDecompressor::getNextByte of %d\n", ret)); - return ret; - } -}; - -void Index::readPositions() -{ - getPositionsFile(); - //!!! temporary: better than fixed large value, worse than 'intelligent' size mgt - _positionsFile->seekg(0, std::ios::end); - _positionsCacheSize = _positionsFile->tellg(); - if (_positionsCacheSize < 0) _positionsCacheSize = 0; - _positionsFile->clear(); - _positionsFile->seekg(0, std::ios::beg); - - if (_positionsCacheSize <= _positionsCacheSize) - { - _allInCache = true; - _positions.resize(_positionsCacheSize); - _positionsFile->readsome((char*)(&_positions[0]), _positionsCacheSize); - std::cout << "POS fits in cache" << std::endl; - } -} - -void Index::readOffsetsTables(const std::string &fileName) -{ - std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); - unsigned char k1; - in.read( (char*)&k1, 1 ); - StreamDecompressor sddocs(in); - sddocs.decode(k1, _documents); - unsigned char k2; - in.read( (char*)&k2, 1 ); - _microIndexOffsets.clear(); - StreamDecompressor sdoffsets(in); - sdoffsets.ascDecode(k2, _microIndexOffsets); - // decompress titles' ids table - unsigned char k3; - in.read( (char*)&k3, 1 ); - _titles.clear(); - StreamDecompressor sdtitles(in); - sdtitles.decode(k3, _titles); -} - -void Index::readDocumentsTable(const std::string &fileName) -{ - std::ifstream in(indexFile(fileName).native_file_string().c_str(), std::ios::binary); - unsigned char k1; - in.read( (char*)&k1, 1 ); - _concepts.clear(); - StreamDecompressor sddocs(in); - sddocs.ascDecode(k1, _concepts); - unsigned char k2; - in.read( (char*)&k2, 1 ); - _offsets.clear(); - _offsets.push_back(0); - StreamDecompressor sdoffsets(in); - sdoffsets.ascDecode(k2, _offsets); - in.close(); -} - -class ContextTables; - -class Tables -{ -private: - std::vector<int> _initialWordsCached; - std::vector<int> _destsCached; - std::vector<int> _linkTypesCached; - std::vector<int> _seqNumbersCached; -public: - Tables(const std::vector<int> &initialWords, - std::vector<int> &dests, - std::vector<int> &linkTypes, - std::vector<int> &seqNumbers) - { - _initialWordsCached = initialWords; - _destsCached = dests; - _linkTypesCached = linkTypes; - _seqNumbersCached = seqNumbers; - } - void setTables(ContextTables &context); -}; // end of Tables - -class ContextTables -{ -public: - std::vector<int> _initialWords; - std::vector<int> _dests; - std::vector<int> _linkTypes; - std::vector<int> _seqNumbers; - int _nTextNodes; -private: - std::vector<Tables*> _cache; - // cached last position for linear search - int _initialWordsIndex; - // link names are shared between all microindexes in an index - std::vector<std::string> _linkNames; - // offsets to tables' storage in file (or memory) - std::vector<int> _offsets; - std::vector<unsigned char> _contextData; // !!! 
fully cached for now - // auxillary - IntegerArray _kTable; - // _auxArray will be used as an auxillary to decode arrays - IntegerArray _auxArray; - int _lastDocNo; - - std::vector<int> _markers; - -public: - ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData, - const std::vector<std::string> &linkNames); - ~ContextTables(); - void setMicroindex(int docNo); - int parentContext(int context); - const std::string& linkName(int context); - int linkCode(const std::string &linkName); - std::vector<bool> getIgnoredElementsSet(const std::vector<std::string> &ignoredElements); - bool notIgnored(int ctx, const std::vector<bool> &ignoredElements); - int firstParentWithCode(int pos, int linkCode); - int firstParentWithCode2(int pos, int linkCode, int parentCode); - int firstParentWithCode3(int pos, int linkCode, int ancestorCode); - int firstParentWithCode4(int pos, const std::vector<int> &linkCodes); - int firstParentWithCode5(int pos, const std::vector<int> &pathCodes); - int firstParentWithCode7(int pos, int linkCode, int seq); - bool isGoverning(int context) { return linkName(context) == "TITLE"; } - void resetContextSearch() { _initialWordsIndex = 0; } -private: - void appendSegment(int context, std::string &result); - int findIndexBin(int wordNumber); -public: - int wordContextLin(int wordNumber); -}; - -ContextTables::ContextTables(const std::vector<int> &offsets, const std::vector<unsigned char> &contextData, - const std::vector<std::string> &linkNames) : _kTable(5), _auxArray(4096), _lastDocNo(-1) -{ - _offsets = offsets; - _contextData = contextData; - _linkNames = linkNames; - _cache.resize(_offsets.size()); -} - -ContextTables::~ContextTables() -{ - for (size_t i = 0; i < _cache.size(); ++i) - delete _cache[i]; -} - -void ContextTables::setMicroindex(int docNo) -{ - if (docNo != _lastDocNo) // check if we need to do anything - { - if (_cache[docNo]) - _cache[docNo]->setTables(*this); - else - { - int offset = _offsets[docNo]; - int k0 = _contextData[offset] & 0xFF; - ByteArrayDecompressor compr(&_contextData, offset + 1); - _kTable.clear(); - compr.decode(k0, _kTable); - // decompress initialWords into auxiliary array - _auxArray.clear(); - compr.ascDecode(_kTable[0], _auxArray); // _initialWords - _initialWords = _auxArray; - _nTextNodes = _initialWords.size(); - // decompress destinations into auxiliary array - _auxArray.clear(); - compr.decode(_kTable[1], _auxArray); // _dests - _auxArray.push_back(-1); // sentinel, root - _dests = _auxArray; - _linkTypes.clear(); - compr.decode(_kTable[2], _linkTypes); - _seqNumbers.clear(); - compr.decode(_kTable[3], _seqNumbers); - - _cache[docNo] = new Tables(_initialWords, _dests, _linkTypes, _seqNumbers); - - /* - System.out.println("|_initialWords| = " + _nTextNodes); - System.out.println("|_dests| -1 = " + (_dests.length - 1)); - System.out.println("|_seqNumbers| = " + _seqNumbers.length); - System.out.println("|_linkTypes| = " + _linkTypes.length); - */ - } - _lastDocNo = docNo; - _markers.resize(_dests.size()); - } - _initialWordsIndex = 0; -} - -int ContextTables::parentContext(int context) -{ - return _dests[context]; -} - -const std::string& ContextTables::linkName(int context) -{ - return _linkNames[_linkTypes[context]]; -} - -int ContextTables::linkCode(const std::string &inlinkName) -{ - for (size_t i = 0; i < _linkNames.size(); i++) - if (inlinkName == _linkNames[i]) - return i; - return -1; // when not found -} - -std::vector<bool> ContextTables::getIgnoredElementsSet(const 
std::vector<std::string> &ignoredElements) -{ - std::vector<bool> result; - bool noValidIgnoredElements = true; - if (!ignoredElements.empty()) - { - result.resize(_linkNames.size()); - for (size_t i = 0; i < ignoredElements.size(); i++) - { - int code = linkCode(ignoredElements[i]); - if (code > -1) - { - result[code] = true; - noValidIgnoredElements = false; - } - } - } - return noValidIgnoredElements ? std::vector<bool>() : result; -} - -bool ContextTables::notIgnored(int ctx, const std::vector<bool> &ignoredElements) -{ - do - { - if (ignoredElements[_linkTypes[ctx]]) - { - std::cout << "hit ignored" << std::endl; - return false; - } - } - while ((ctx = _dests[ctx]) > -1); // parentContext 'hand inlined' - return true; -} - -/** starting with ctx and going up the ancestry tree look for the first - context with the given linkCode */ -int ContextTables::firstParentWithCode(int pos, int inlinkCode) -{ - int ctx = _dests[wordContextLin(pos)]; // first parent of text node - int shift = _nTextNodes; - int limit = _dests.size() - 1; - while (_linkTypes[ctx - shift] != inlinkCode) - if ((ctx = _dests[ctx]) == limit) - return -1; - return ctx; -} - -/** starting with ctx and going up the ancestry tree look for the first - context with the given linkCode and given parent code */ -int ContextTables::firstParentWithCode2(int pos, int inlinkCode, int parentCode) -{ - int ctx = _dests[wordContextLin(pos)]; // first parent of text node - int shift = _nTextNodes; - int limit = _dests.size() - 1; - for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent]) - if (_linkTypes[parent - shift] == parentCode && _linkTypes[ctx - shift] == inlinkCode) - return ctx; - else - ctx = parent; - return -1; -} - -/** starting with ctx and going up the ancestry tree look for the first - context with the given linkCode and given ancestor code */ -int ContextTables::firstParentWithCode3(int pos, int inlinkCode, int ancestorCode) -{ - int ctx = _dests[wordContextLin(pos)]; - int shift = _nTextNodes; - int limit = _dests.size() - 1; - // find first instance of linkCode - while (ctx < limit && _linkTypes[ctx - shift] != inlinkCode) - ctx = _dests[ctx]; - if (ctx < limit) // found linkCode, check ancestry - for (int ancestor = _dests[ctx]; - ancestor < limit; - ancestor = _dests[ancestor]) - if (_linkTypes[ancestor - shift] == ancestorCode) // ancestor confirmed - return ctx; // match found, return successful ctx - return -1; // match NOT found -} - -/** starting with ctx and going up the ancestry tree look for the first - context with any of the given linkCode */ -int ContextTables::firstParentWithCode4(int pos, const std::vector<int> &linkCodes) -{ - int nCodes = linkCodes.size(); - int shift = _nTextNodes; - int limit = _dests.size() - 1; - for (int ctx = _dests[wordContextLin(pos)]; ctx < limit; ctx = _dests[ctx]) - { - int code = _linkTypes[ctx - shift]; - for (int i = 0; i < nCodes; i++) - if (code == linkCodes[i]) - return ctx; - } - return -1; -} - -/** starting with ctx and going up the ancestry tree look for the first - context with the given path */ -int ContextTables::firstParentWithCode5(int pos, const std::vector<int> &pathCodes) -{ - int nCodes = pathCodes.size(); - int lastCode = pathCodes[nCodes - 1]; - int shift = _nTextNodes; - int limit = _dests.size() - 1; - int ctx = _dests[wordContextLin(pos)]; - for (int parent = _dests[ctx]; parent < limit; parent = _dests[parent]) - { - if (_linkTypes[ctx - shift] == lastCode) - { - // try to match the entire path - outerbreak hack = donothing; - for 
(int i = nCodes - 2, parent2 = parent; i >= 0; i--) - if (_linkTypes[parent2 - shift] != pathCodes[i]) // match failure - { - hack = docontinue; - break; // try to match higher - } - else if ((parent2 = _dests[parent2]) == limit) - return -1; - if (hack == docontinue) - continue; - return ctx; - } - else - ctx = parent; - } - return -1; -} - -/** starting with ctx and going up the ancestry tree look for the first - context with the given linkCode */ -int ContextTables::firstParentWithCode7(int pos, int inlinkCode, int seq) -{ - int ctx = _dests[wordContextLin(pos)]; // first parent of text node - int shift = _nTextNodes; - int limit = _dests.size() - 1; - while (_linkTypes[ctx - shift] != inlinkCode || _seqNumbers[ctx] != seq) - if ((ctx = _dests[ctx]) == limit) - return -1; - return ctx; -} - -void ContextTables::appendSegment(int context, std::string &result) -{ - result.append(context < _nTextNodes ? "text()" : _linkNames[_linkTypes[context - _nTextNodes]]); - result.push_back('['); - std::ostringstream tmp; - tmp << _seqNumbers[context]; - result.append(tmp.str()); - result.append("]/"); -} - -int ContextTables::findIndexBin(int wordNumber) -{ - int i = 0, j = _nTextNodes - 1; - while (i <= j) - { - int k = (i + j) >> 1; - if (_initialWords[k] < wordNumber) - i = k + 1; - else if (_initialWords[k] > wordNumber) - j = k - 1; - else - return k; - } - return i - 1; -} - -int ContextTables::wordContextLin(int wordNumber) -{ - for (int i = _initialWordsIndex; i < _nTextNodes; i++) - if (_initialWords[i] > wordNumber) // first such i - { - // - 1 if wordNumbers can be the same - _initialWordsIndex = i; // cached to speed up next search - return i - 1; - } - return _nTextNodes - 1; -} - -void Tables::setTables(ContextTables &context) -{ - context._initialWords = _initialWordsCached; - context._dests = _destsCached; - context._linkTypes = _linkTypesCached; - context._seqNumbers = _seqNumbersCached; - context._nTextNodes = context._initialWords.size(); -} - -class Compressor; - -class XmlIndex : public Index -{ -private: - VectorBtreeParameters *_edgesParams; - FullVectorBtree *_edges; - ContextTables *_contextTables; - std::fstream *_contextsFile; - IntegerArray _contextsOffsets; - std::vector<unsigned char> _contextsData; - std::vector<std::string> _linkNames; -protected: - virtual void writeOutOffsets(); -public: - XmlIndex(const fs::path &index, bool update) - : Index(index, update), _edgesParams(0), _edges(0), _contextTables(0), _contextsFile(0) {} - void init(); - void close(); - virtual ~XmlIndex() { delete _edgesParams; delete _edges; delete _contextTables; } - std::fstream& getContextsFile(); - using Index::compress; - virtual void compress(int docID, int titleID, - std::vector<ConceptLocation> &locations, - std::vector<ConceptLocation> &extents, - int k, const Compressor &contextTables); - const std::vector<std::string>& getLinkNames() { return _linkNames; } -}; - -void XmlIndex::init() -{ - Index::init(); - if (_edgesParams) delete _edgesParams; - _edgesParams = new VectorBtreeParameters(*_schema, "EDGE", 9); - if (_edgesParams->readState() == false) - _edgesParams->setBlockSize(1024); - _edges = new FullVectorBtree(_edgesParams, _update); - if (!_contextsOffsets.empty()) - { - _contextsData = readByteArray("CONTEXTS"); -#if 0 - _linkNames = (String[])readObject("LINKNAMES"); -#endif - _contextTables = new ContextTables(_contextsOffsets, _contextsData, _linkNames); - } -} - -void XmlIndex::writeOutOffsets() -{ - Index::writeOutOffsets(); - if (!_contextsOffsets.empty()) - { - 
std::fstream &out = getOffsetsFile(); - Compressor offsets2; - char k = static_cast<char>(offsets2.compressAscending(_contextsOffsets)); - out.write( (const char*)&k, 1 ); - offsets2.write(out); - } -} - -std::fstream& XmlIndex::getContextsFile() -{ - if (!_contextsFile) - _contextsFile = getRAF("CONTEXTS", _update); - return *_contextsFile; -} - -void XmlIndex::close() -{ - if (_contextsFile) - { - _contextsFile->close(); - delete _contextsFile; - _contextsFile = 0; - } - _edges->close(); - if (_update) - _edgesParams->updateSchema(); - Index::close(); -} - -class Tokenizer -{ -private: - UnicodeString s; - BreakIterator *bi; - int32_t start; - UConverter *utf8; - std::vector<char> utfbuffer; -public: - Tokenizer(); - ~Tokenizer(); - void setText(const xmlChar *text); - std::string nextToken(); -}; - -Tokenizer::Tokenizer() : start(BreakIterator::DONE), utfbuffer(64) -{ - UErrorCode status = U_ZERO_ERROR; - bi = BreakIterator::createWordInstance("en_US", status); - utf8 = ucnv_open("utf-8", &status); -} - -Tokenizer::~Tokenizer() -{ -#if !defined(SOLARIS) - delete bi; - ucnv_close(utf8); -#endif -} - -void Tokenizer::setText(const xmlChar *text) -{ - UErrorCode status = U_ZERO_ERROR; - s = UnicodeString((const char*)text, -1, utf8, status); - bi->setText(s); - start = ubrk_first(bi); -} - -std::string Tokenizer::nextToken() -{ - std::string ret; - - int32_t end = ubrk_next(bi); - while (end != BreakIterator::DONE) - { - if (ubrk_getRuleStatus(bi) != UBRK_WORD_NONE) - break; - start = end; - end = ubrk_next(bi); - } - - if (end != -1 && end != start) - { - UnicodeString token(s, start, end-start); - token = token.toLower(); - size_t needed = 0; - - UErrorCode status = U_ZERO_ERROR; - while ((needed = token.extract(&utfbuffer[0], utfbuffer.size(), utf8, status)) > utfbuffer.size()) - utfbuffer.resize(utfbuffer.size() * 2); - - ret = std::string(&utfbuffer[0], needed); - start = end; - } - - return ret; -} - -typedef std::vector<xmlNodePtr> Vector; - -ConceptLocation::ConceptLocation(int conceptID, int begin, int end) : - _concept(conceptID), _begin(begin), _end(end) -{ -} - -#ifdef EMULATEORIGINALSORT -class ConceptLocationSorter -{ -public: - virtual bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) = 0; -private: - // part of quicksearch - int partition(std::vector<ConceptLocation> &array, int p, int r) - { - ConceptLocation x = array[(p + r)/2]; - int i = p - 1, j = r + 1; - while (true) - { - while (smallerThan(x, array[--j])) - ; - while (smallerThan(array[++i], x)) - ; - if (i < j) - { - ConceptLocation t = array[i]; - array[i] = array[j]; - array[j] = t; - } - else - return j; - } - } -public: - void quicksort(std::vector<ConceptLocation> &array, int p, int r) - { - while (p < r) - { - int q = partition(array, p, r); - quicksort(array, p, q); - p = q + 1; - } - } -}; - -class ConceptSorter : public ConceptLocationSorter -{ -public: - bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) - { - return a._concept < b._concept; - } -}; - -class PositionSorter : public ConceptLocationSorter -{ -public: - bool smallerThan(const ConceptLocation &a, const ConceptLocation &b) - { - return a._begin < b._begin || a._begin == b._begin && a._end < b._end; - } -}; - -#else - -class ConceptSorter -{ -public: - bool operator()(const ConceptLocation &a, const ConceptLocation &b) const - { - return a._concept < b._concept; - } -}; - -class PositionSorter -{ -public: - bool operator()(const ConceptLocation &a, const ConceptLocation &b) const - { - return a._begin < 
b._begin || (a._begin == b._begin && a._end < b._end); - } -}; - -#endif - -void ConceptLocation::sortByPosition(std::vector<ConceptLocation> &array, int i1, int i2) -{ -#ifdef EMULATEORIGINALSORT - PositionSorter _pComp; - _pComp.quicksort(array, i1, i2 - 1); -#else - std::vector<ConceptLocation>::iterator begin = array.begin(); - std::vector<ConceptLocation>::iterator end = begin; - std::advance(begin, i1); - std::advance(end, i2); - std::sort(begin, end, PositionSorter()); -#endif -} - -void ConceptLocation::sortByConcept(std::vector<ConceptLocation> &array, int i1, int i2) -{ -#ifdef EMULATEORIGINALSORT - ConceptSorter _cComp; - _cComp.quicksort(array, i1, i2 - 1); -#else - std::vector<ConceptLocation>::iterator begin = array.begin(); - std::vector<ConceptLocation>::iterator end = begin; - std::advance(begin, i1); - std::advance(end, i2); - std::sort(begin, end, ConceptSorter()); -#endif -} - -typedef std::map<xmlNodePtr, int> NodeHashtable; -typedef std::hash_map<std::string, int, pref_hash> LinkHashTable; - -class IndexAdapter -{ -private: - static int StackSize; - const char* _indexText_Name; - const char* _indexElement_Name; - const char* _indexAttribute_Name; - const char* _nodeID_Name; - const char* _tokenizer_Name; - const char* _attributeName_Name; - std::vector<bool> _indexOnOffStack; - int _sp; - int _tsp; - std::vector< std::string > _attributeStack; - xmlNodePtr _currentNode; - int _attrSP; - void storeLocation(const std::string &token, int number); - void storeLocation(const std::string &token) { storeLocation(token, _lastWordNumber++); } - void storeEdge(int relation, int seqNumber, int destination); - - void startElement(xmlNodePtr node); - void attribute(const char *name, const char *value); - void characters(const xmlChar *str) throw( HelpProcessingException ); - void endElement(xmlNodePtr node); - - void indexText(const xmlChar *str); - - Vector _textNodes; - NodeHashtable _numberedNodes; -public: - HashSet _stoplist; - LinkHashTable _linkCodes; - std::vector<std::string> _linknames; - static int CurrenMaxLinkCode; - std::vector<ConceptLocation> _locations; - int _availContextNumber; - IntegerArray _initialWords; - IntegerArray _links; - IntegerArray _dests; - IntegerArray _seqNumbers; - int _lastWordNumber; - int _firstWord; - bool _anyLocationsStored; - XmlIndex *_index; -private: - static int InitSize; - int _size; -public: - IndexAdapter(); - void process(xmlNodePtr node, xmlDocPtr doc); - void init(); - void finish(); - int intern(const std::string &name) { return _index->intern(name); } - int getLinkCode(const std::string &linkName); -}; - -int IndexAdapter::StackSize = 64; -int IndexAdapter::InitSize = 4096; -int IndexAdapter::CurrenMaxLinkCode = 0; - -IndexAdapter::IndexAdapter() - : _indexOnOffStack(StackSize), _attributeStack(StackSize), - _anyLocationsStored(false), _size(InitSize) -{ - _indexText_Name = "text"; - _indexElement_Name = "element"; - _indexAttribute_Name = "attribute"; - _nodeID_Name = "nodeID"; - _tokenizer_Name = "tokenizer"; - _attributeName_Name = "attributeName"; -} - -void IndexAdapter::storeLocation(const std::string &token, int number) -{ - int concept = intern(token); - HCDBG(std::cerr << "storeLocation of number " << number << "for token " - << token << " as conceptlocation " << concept << std::endl); - _locations.push_back(ConceptLocation(concept, number, number)); -} - -void IndexAdapter::storeEdge(int relation, int seqNumber, int destination) -{ - _links.push_back(relation); - _seqNumbers.push_back(seqNumber); - 
_dests.push_back(destination); - HCDBG(std::cerr << "storeEdge" << std::endl); -} - -void IndexAdapter::finish() -{ - _numberedNodes.clear(); - _dests.clear(); - _seqNumbers.clear(); - _links.clear(); - - int nTextNodes = _textNodes.size(); - _availContextNumber = nTextNodes; - // vector to hold parents of text nodes - Vector parents; - /***** - for each of the text nodes its sequence number is stored - as well as the index of its parent (in _dests) - _link is not stored as it is always "text()" - _availContextNumber only used to number parent element contexts - ******/ - for (int i = 0; i < nTextNodes; i++) - { - xmlNodePtr node = _textNodes[i]; - xmlNodePtr parent = node->parent; - // find this text node's seq number - int counter = 1; - xmlNodePtr sibling = parent->xmlChildrenNode; - while (sibling && sibling != node) - { - if (xmlNodeIsText(sibling)) - ++counter; - sibling = sibling->next; - } - _seqNumbers.push_back(counter); - // check whether parent already encountered - NodeHashtable::const_iterator number = _numberedNodes.find(parent); - if (number == _numberedNodes.end()) // not yet seen - { - int newContext = _availContextNumber++; - _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; - _dests.push_back(newContext); - // enqueue parent: its parent will need a number too - parents.push_back(parent); - // System.out.println(parent.getName().toString() + - // " -> " + newContext); - } - else - { - _dests.push_back(number->second); - } - } // end for - - _textNodes.clear(); - - // store info about element ancestry of the above text nodes - // grandparents are added to the end of the vector - int rootElementPos = 0; - for (size_t i = 0; i < parents.size(); i++) - { - xmlNodePtr node = parents[i]; - - std::string name((const char*)(node->name)); - - xmlNodePtr parent = node->parent; - - _links.push_back(getLinkCode(name)); - -// if (parent.getType() == Node.ELEMENT) // not ROOT - if (parent && parent->parent) // not ROOT - { - // find sequence number - xmlNodePtr sibling = parent->xmlChildrenNode; - int counter = 1; - while (sibling && sibling != node) - { - if (strcmp((const char*)sibling->name, (const char*)name.c_str()) == 0) - ++counter; - sibling = sibling->next; - } - - _seqNumbers.push_back(counter); - - // check whether parent already known - NodeHashtable::iterator number = _numberedNodes.find(parent); - if (number == _numberedNodes.end()) - { - int newContext = _availContextNumber++; - _numberedNodes.insert(NodeHashtable::value_type(parent, newContext)).first->second = newContext; - _dests.push_back(newContext); - // enqueue parent: its parent will need a number too - parents.push_back(parent); - //System.out.println(parent.getName().toString() + - // " -> " + newContext); - } - else - { - _dests.push_back(number->second); - } - } - else - { - _dests.push_back(0); // placeholder - _seqNumbers.push_back(1); - rootElementPos = i + nTextNodes; - // System.out.println("rootElementPos = " + i); - } - } // end for - - if (_dests.empty()) - _dests.push_back(0); - - // index to sentinel - _dests[rootElementPos] = _availContextNumber; -} // end public void finish - -void IndexAdapter::init() -{ - _sp = -1; - _tsp = -1; - _attrSP = -1; - _lastWordNumber = 0; - _anyLocationsStored = false; - _availContextNumber = 0; - // all the contexts' tables - _initialWords.clear(); - _locations.clear(); -} - -void IndexAdapter::attribute(const char *name, const char *value) -{ - HCDBG(std::cerr << "attribute: " << name << " = " << value << std::endl); - 
if (strcmp(name, _nodeID_Name) == 0) - _currentNode = (xmlNodePtr)(strtol(value, NULL, 10)); - else if (strcmp(name, _tokenizer_Name) == 0) - { - if (strcmp(value, "com.sun.xmlsearch.util.SimpleTokenizer") != 0 && !isExtensionMode() ) - std::cerr << "changing tokenizers not implemented in C++ version of HelpLinker" - << " because no other tokenizers were referenced in the helpcontent2 source" - << std::endl; - } - else if (strcmp(name, _attributeName_Name) == 0) - { - //namespace prefix ? - std::string attrVal = std::string("index:") + value; - if( !isExtensionMode() ) - std::cout << "attrVal = " << attrVal << std::endl; - _attributeStack[_attrSP] = std::string(name) + '<' + value + '<' + attrVal; - storeLocation("+<" + _attributeStack[_attrSP]); - } -} - -void IndexAdapter::indexText(const xmlChar *text) -{ - static Tokenizer tokenizer; - tokenizer.setText(text); - _firstWord = _lastWordNumber; - _anyLocationsStored = false; - - std::string lowercaseToken = tokenizer.nextToken(); - while (!lowercaseToken.empty()) - { - HCDBG(std::cerr << "token is: " << lowercaseToken << std::endl); -#ifdef EMULATEORIGINAL - if ((lowercaseToken.size() == 1) && isdigit(lowercaseToken[0])) - { - lowercaseToken = tokenizer.nextToken(); - continue; - } -#endif - if (std::find(_stoplist.begin(), - _stoplist.end(), lowercaseToken) == _stoplist.end()) - { - storeLocation(lowercaseToken); - _anyLocationsStored = true; - } - else - _lastWordNumber++; - lowercaseToken = tokenizer.nextToken(); - } - - if (_anyLocationsStored && _firstWord > -1) - { - _initialWords.push_back(_firstWord); - HCDBG(std::cerr << "appending " << _firstWord << std::endl); - _textNodes.push_back(_currentNode); - } - // reset before next batch - _firstWord = -1; -} - -void IndexAdapter::characters(const xmlChar *str) throw( HelpProcessingException ) -{ - if (!str) - { - std::stringstream aStrStream; - aStrStream << "no characters!" 
<< std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - - HCDBG(std::cerr << "IndexAdapter::characters of " << str << std::endl); - HCDBG(std::cerr << _sp << " : " << _indexOnOffStack[_sp] << std::endl); - - if (_sp >= 0 && _indexOnOffStack[_sp]) - { - indexText( str ); - } -} - -void IndexAdapter::startElement(xmlNodePtr node) -{ - const char *name = (const char*)(node->name); - - HCDBG(std::cerr << "startElement is " << name << std::endl); - - if (strcmp(name, _indexElement_Name) == 0) - { - _indexOnOffStack[++_sp] = true; - // pop Tokenizer stack - // following attribute can push selected Tokenizer - if (_tsp != -1) - _tsp--; - } - else if (strcmp(name, _indexText_Name) == 0) - { - } - else if (strcmp(name, _indexAttribute_Name) == 0) - { - _attrSP++; - } -} - -void IndexAdapter::endElement(xmlNodePtr node) -{ - const char *name = (const char*)(node->name); - HCDBG(std::cerr << "endElement is " << name << std::endl); - if (strcmp(name, _indexElement_Name) == 0) - _sp--; - else if (strcmp(name, _indexText_Name) == 0) - { - // reset - } - else if (strcmp(name, _indexAttribute_Name) == 0) - storeLocation("-<" + _attributeStack[_attrSP--]); -} - -int IndexAdapter::getLinkCode(const std::string &linkName) -{ - LinkHashTable::iterator code = _linkCodes.find(linkName); - if (code != _linkCodes.end()) - return code->second; - else - { - _linknames.push_back(linkName); - int newCode = CurrenMaxLinkCode++; - _linkCodes.insert(LinkHashTable::value_type(linkName, newCode)).first->second = newCode; - return newCode; - } -} - -void IndexAdapter::process(xmlNodePtr node, xmlDocPtr doc) -{ - startElement(node); - - for (xmlAttrPtr attr = node->properties; attr; attr = attr->next) - { - xmlChar *value = xmlNodeListGetString(doc, attr->children, 0); - attribute((const char*)(attr->name), (const char*)value); - xmlFree(value); - } - - if (xmlNodeIsText(node)) - { - xmlChar *str = xmlNodeListGetString(doc, node, 1); - characters(str); - xmlFree(str); - } - - for (xmlNodePtr test = node->xmlChildrenNode; test; test = test->next) - process(test, doc); - - endElement(node); -} - -class XmlIndexBuilder -{ -private: - fs::path _transformLocation; - xsltStylesheetPtr _indexingTransform; - IndexAdapter _indexAdapter; - int _currentDocID; - void reset(); - xsltStylesheetPtr getTransform(const std::string &stylesheetName); -public: - XmlIndexBuilder() : _indexingTransform(0) {} - XmlIndexBuilder(const fs::path &dir); - ~XmlIndexBuilder(); - void clearIndex(); - void setTransformLocation(const fs::path &filelocation); - void init(const std::string &transform); - void initXmlProcessor(const std::string &transform); - void indexDocument(xmlDocPtr document, const std::string &docURL, const std::string &title); - int intern(const std::string &name); - void openDocument(const std::string &name) throw( HelpProcessingException ); - void closeDocument(const std::string &name) throw( HelpProcessingException ); - void close(); -}; - -void XmlIndexBuilder::close() -{ - fs::path fullname = _indexAdapter._index->indexFile("LINKNAMES"); - std::fstream _linkFile(fullname.native_file_string().c_str(), std::ios::out | std::ios::trunc | std::ios::binary); - -#ifdef EMULATEORIGINAL - static const unsigned char vectorheader[] = - { - 0xAC, 0xED, 0x00, 0x05, 0x75, 0x72, 0x00, 0x13, - 0x5B, 0x4C, 0x6A, 0x61, 0x76, 0x61, 0x2E, 0x6C, - 0x61, 0x6E, 0x67, 0x2E, 0x53, 0x74, 0x72, 0x69, - 0x6E, 0x67, 0x3B, 0xAD, 0xD2, 0x56, 0xE7, 0xE9, - 0x1D, 0x7B, 0x47, 0x02, 0x00, 0x00, 0x78, 0x70 - }; - - 
_linkFile.write((const char*)(&vectorheader[0]), sizeof(vectorheader)); - writeInt(_linkFile, _indexAdapter._linknames.size()); - std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end(); - for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin(); - aIter != aEnd; ++aIter) - { - HCDBG(std::cerr << "linkname is " << *aIter << std::endl); - _linkFile << 't'; - writeShort(_linkFile, aIter->size()); - _linkFile << *aIter; - } -#else - std::vector<std::string>::iterator aEnd = _indexAdapter._linknames.end(); - for (std::vector<std::string>::iterator aIter = _indexAdapter._linknames.begin(); - aIter != aEnd; ++aIter) - { - _linkFile << *aIter << '\n'; - } -#endif -#if 0 - - // output link codes - /* - Enumeration keys = _linknames.elements(); - while (keys.hasMoreElements()) - System.out.println((String)keys.nextElement()); - */ -#endif - _indexAdapter._index->close(); - std::cout << "done" << std::endl; -} - -int XmlIndexBuilder::intern(const std::string &name) -{ - return _indexAdapter.intern(name); -} - -void XmlIndexBuilder::openDocument(const std::string &name) throw( HelpProcessingException ) -{ - if (_currentDocID != 0) - { - std::stringstream aStrStream; - aStrStream << "document already open" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - _currentDocID = intern( PrefixTranslator::translatePrefix(name) ); - reset(); // reset context gathering state -} - -int BitBuffer::InitSize = 256; -int BitBuffer::NBits = 32; -int BitBuffer::BitsInByte = 8; -int BitBuffer::BytesInInt = 4; - -void Compressor::encode(const IntegerArray &pos, int k) -{ - HCDBG(std::cerr << "1:start this encode of " << k << "size of " - << pos.size() << std::endl); - unsigned int n1 = 0; - unsigned int power = 1 << k; - for (size_t i = 0; i < pos.size(); i++) - { - HCDBG(std::cerr << "1: loop " << i << std::endl); - unsigned int n2 = pos[i] >> k; - int rem = pos[i] % power; - HCDBG(std::cerr << "1: n1, n2 : " << n1 << "," << n2 << std::endl); - if (n2 != n1) - { - unsigned int min = n1; - unsigned int a = n1; - int lev = 0, power2 = 1; - if (n2 > n1) - for (size_t max = n1; max < n2; a >>= 1, power2 <<= 1, lev++) - if ((a & 1) != 0) - min -= power2; - else - max += power2; - else - for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) - if ((a & 1) != 0) - min -= power2; - // lev 0s, 1, lev bits of (n2 - min) plus following value - // no 'V' symbol needed here - if (lev*2 + 1 + k <= NBits) - _buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k); - else - { - if (lev*2 + 1 <= NBits) - _buffer.append(1 << lev | (n2 - min), lev*2 + 1); - else - { - _buffer.append(0, lev); - _buffer.append(1 << lev | (n2 - min), lev + 1); - } - _buffer.append(rem, k); - } - n1 = n2; - } - else - _buffer.append(rem | power, k + 1); // 'V' + value - } - _buffer.append(2 | n1 & 1, 3); // marking end - _buffer.close(); - HCDBG(std::cerr << "1:end this encode of " << k << std::endl); -} - -void Compressor::encode(const IntegerArray &pos, const IntegerArray &len, int k, int k2) -{ - HCDBG(std::cerr << "2:start this encode of " << k << "size of " - << pos.size() << std::endl); - int power = 1 << k, n1 = 0; - for (size_t i = 0; i < pos.size(); i++) - { - HCDBG(std::cerr << "2: loop " << i << std::endl); - int n2 = pos[i] >> k; - int rem = pos[i] % power; - HCDBG(std::cerr << "2: n1, n2 : " << n1 << "," << n2 << std::endl); - if (n2 != n1) - { - int min = n1, a = n1; - int lev = 0, power2 = 1; - if (n2 > n1) - for (int max = n1; max < n2; a >>= 1, power2 
<<= 1, lev++) - if ((a & 1) != 0) - min -= power2; - else - max += power2; - else - for ( ; min > n2; a >>= 1, power2 <<= 1, lev++) - if ((a & 1) != 0) - min -= power2; - // lev 0s, 1, lev bits of (n2 - min) plus following value - if (lev*2 + 1 + k <= NBits) - _buffer.append((1<<lev | (n2 - min)) << k | rem, lev*2+1+k); - else - { - if (lev*2 + 1 <= NBits) - _buffer.append(1 << lev | (n2 - min), lev*2 + 1); - else - { - _buffer.append(0, lev); - _buffer.append(1 << lev | (n2 - min), lev + 1); - } - _buffer.append(rem, k); - } - _buffer.append(len[i], k2); - n1 = n2; - } - else - _buffer.append((rem|power)<<k2 | len[i], k+k2+1); // 'V' + v1,v2 - } - _buffer.append(2 | n1 & 1, 3); // marking end - _buffer.close(); - HCDBG(std::cerr << "2:end this encode of " << k << std::endl); -} - -// k: starting value for minimization -int Compressor::minimize(const IntegerArray &array, int startK) -{ - BitBuffer saved; - int minK = startK; - _buffer.clear(); - encode(array, startK); - int min = _buffer.bitCount(); // init w/ first value - saved.setFrom(_buffer); - - _buffer.clear(); - encode(array, startK + 1); - - if (_buffer.bitCount() < min) - { - int k = startK + 1; - do - { - saved.setFrom(_buffer); - min = _buffer.bitCount(); - minK = k; - _buffer.clear(); - encode(array, ++k); - } - while (_buffer.bitCount() < min); - } - else // try smaller values through 1 - { - for (int k = startK - 1; k > 0; k--) - { - _buffer.clear(); - encode(array, k); - if (_buffer.bitCount() < min) - { - saved.setFrom(_buffer); - min = _buffer.bitCount(); - minK = k; - } - else - break; - } - } - - _buffer.setFrom(saved); - return minK; -} - -int Compressor::compressAscending(const IntegerArray &array) -{ - IntegerArray differences(array.size()); - toDifferences(array, differences); - return minimize(differences, BeginK); -} - -int Compressor::NBits = 32; -int Compressor::BeginK = 5; - -class DocumentCompressor -{ -public: - static int NConceptsInGroup; - static int BitsInLabel; - static int DefaultSize; -private: - int _nGroups; - int _nExtents; - unsigned int _freeComp; - int _kk; - Compressor *_currentCompressor; - std::vector<Compressor> _compressors; - Compressor _kCompr; - Compressor _lCompr; - Compressor _mCompr; - Compressor _posCompressor; - IntegerArray _kTable; // k's for the series - IntegerArray _lTable; // lengths of the C/P groups - IntegerArray _maxConcepts; // maximal concepts in CP - IntegerArray _concepts; - IntegerArray _documents; - IntegerArray _microIndexOffsets; - IntegerArray _titles; - // _contextsOffsets for use in XML indexing - IntegerArray _contextsOffsets; - IntegerArray _positions; - IntegerArray _labels; - -public: - DocumentCompressor() : _currentCompressor(0), _compressors(DefaultSize) {} - void writeOutMicroIndex(std::fstream &output, - std::vector<ConceptLocation> &locations, - std::vector<ConceptLocation> &extents) - { - HCDBG(std::cerr << "writeOutMicroIndex start" << std::endl); - encode(locations, NConceptsInGroup); - HCDBG(std::cerr << "writeOutMicroIndex end encode" << std::endl); - if (!extents.empty()) - encodeExtents(extents); - HCDBG(std::cerr << "writeOutMicroIndex finalize" << std::endl); - finalizeEncoding(); - HCDBG(std::cerr << "writeOutMicroIndex write" << std::endl); - writeOut(output); - HCDBG(std::cerr << "writeOutMicroIndex end" << std::endl); - } -private: - void encode(std::vector<ConceptLocation> &locations, int nConcepts) - { - int initK = 4; - // first sort by concept only -#ifdef CMCDEBUG - for (size_t i = 0; i < locations.size(); ++i) - fprintf(stderr, 
"unsorted is %d\n", locations[i].getConcept()); -#endif - HCDBG(std::cerr << "start sort" << std::endl); - ConceptLocation::sortByConcept(locations, 0, locations.size()); - HCDBG(std::cerr << "end sort" << std::endl); -#ifdef CMCDEBUG - for (size_t i = 0; i < locations.size(); ++i) - fprintf(stderr, "sorted is %d\n", locations[i].getConcept()); -#endif - - // using the fact that concepts are already sorted - // count of groups of 'nConcepts' - // go for differences directly - - // clear the state - _nGroups = 0; - _nExtents = 0; - _kTable.clear(); - _lTable.clear(); - _concepts.clear(); - _maxConcepts.clear(); - _kCompr.clear(); - _lCompr.clear(); - _mCompr.clear(); - for (size_t i = 0; i < _compressors.size(); i++) - _compressors[i].clear(); - _freeComp = 0; - _currentCompressor = NULL; - // end of resetting state - - int conceptCounter = 0; - int fromIndex = 0; - int prevMax = 0; - int last = locations[0].getConcept(); // init w/ first ID - nextCompressor(); - _concepts.push_back(last); - for (size_t i = 0;;) - { - for (; i < locations.size() && locations[i].getConcept() == last; i++) - locations[i].setConcept(conceptCounter); - if (i == locations.size()) - { - if (!_concepts.empty()) - { - ++_nGroups; - _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); - } - encodePositions(locations, fromIndex, i, BitsInLabel); - break; - } - else - { // new concept (group?) - if (++conceptCounter == nConcepts) - { - ++_nGroups; - // we are looking at the beginning of a new group - // last is maximal for the group just finished - // it won't be stored in concepts array but maxConcepts - _concepts.pop_back(); - HCDBG(fprintf(stderr, "_maxConcepts %d %d -> %d\n", last, prevMax, last - prevMax)); - _maxConcepts.push_back(last - prevMax); - prevMax = last; - _kTable.push_back(_currentCompressor->minimize(_concepts, initK)); - -#ifdef CMCDEBUG - for(size_t p = 0; p < locations.size(); ++p) - std::cerr << "microindex2 this testing is " << locations[p].getBegin() << - locations[p].getEnd() << " : " << locations[p].getConcept() << std::endl; -#endif - - HCDBG(std::cerr << "two encodePositions " << fromIndex << " " << i << std::endl); - encodePositions(locations, fromIndex, i, BitsInLabel); - fromIndex = i; - nextCompressor(); - _concepts.clear(); - conceptCounter = 0; - } - _concepts.push_back(locations[i].getConcept() - last); - last = locations[i].getConcept(); - } - } - } - - void encodePositions(std::vector<ConceptLocation> &locations, int from, int to, int cK) - { - int initK = 3; - int lastPos, k; - // sort in place by psitions only -#ifdef CMCDEBUG - for (int i = from; i < to; ++i) - fprintf(stderr, "unsorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); -#endif - ConceptLocation::sortByPosition(locations, from, to); -#ifdef CMCDEBUG - for (int i = from; i < to; ++i) - fprintf(stderr, "sorted is %d %d\n", locations[i].getBegin(), locations[i].getEnd()); -#endif - _positions.clear(); - _labels.clear(); - _positions.push_back(lastPos = locations[from].getBegin()); - _labels.push_back(locations[from].getConcept()); // now: a label - // skip duplicates - for (int i = from, j = from + 1; j < to; j++) - { - if (locations[i].equals(locations[j]) == false) - { - i = j; - HCDBG(std::cerr << "i is " << i << "locations begin is " - << locations[i].getBegin() << "last pos is " << lastPos << std::endl); - _positions.push_back(locations[i].getBegin() - lastPos); - lastPos = locations[i].getBegin(); - _labels.push_back(locations[i].getConcept()); // now: a label - } - } - // first find 
k by minimizing just positions w/o labels - _kTable.push_back(k = _posCompressor.minimize(_positions, initK)); - _posCompressor.clear(); - HCDBG(std::cerr << "start encodePositions" << std::endl); - _posCompressor.encode(_positions, _labels, k, cK); - HCDBG(std::cerr << "end encodePositions" << std::endl); - _currentCompressor->concatenate(_posCompressor); - } - - void encodeExtents(std::vector<ConceptLocation> &extents) - { - // side effects: - // 'k3' added to _kTable - // a number of compressors populated: header + lengths' lists - int initK = 4; - int c = 0; - IntegerArray concepts; //difference - IntegerArray lengths; - IntegerArray kTable; - IntegerArray lTable; - // reserve a compressor for concatenated tables - nextCompressor(); - Compressor *extentsHeader = _currentCompressor; - std::vector<ConceptLocation>::const_iterator aEnd = extents.end(); - for (std::vector<ConceptLocation>::const_iterator aIter = extents.begin(); - aIter != aEnd; ++aIter) - { - if (aIter->getConcept() != c) - { - if (c != 0) - { - _nExtents++; - nextCompressor(); - kTable.push_back(_currentCompressor->minimize(lengths, initK)); - lTable.push_back(_currentCompressor->byteCount()); - } - concepts.push_back(aIter->getConcept() - c); - c = aIter->getConcept(); - lengths.clear(); - lengths.push_back(aIter->getLength()); - } - else - lengths.push_back(aIter->getLength()); - } - // last table of lengths - nextCompressor(); - kTable.push_back(_currentCompressor->minimize(lengths, initK)); - lTable.push_back(_currentCompressor->byteCount()); - Compressor compressor1; - kTable.push_back(compressor1.minimize(lTable, initK)); - Compressor compressor2; - kTable.push_back(compressor2.minimize(concepts, initK)); - _kTable.push_back(extentsHeader->minimize(kTable, initK)); // k3 - extentsHeader->concatenate(compressor1); - extentsHeader->concatenate(compressor2); - } - - void finalizeEncoding() - { - if (_nGroups > 1) - { - // if extents follow C/P groups we need the length of the last group - int limit = _nExtents > 0 ? _freeComp : _freeComp - 1; - for (int j = 0; j < limit; j++) // length of last not saved - _lTable.push_back(_compressors[j].byteCount()); - - _kTable.push_back(_mCompr.minimize(_maxConcepts, 3)); - _kTable.push_back(_lCompr.minimize(_lTable, 3)); - _kk = _kCompr.minimize(_kTable, 3); - _kCompr.concatenate(_lCompr); - _kCompr.concatenate(_mCompr); - } - else if (_nGroups == 1 && _nExtents > 0) - { - // length of the single C/P group packed with k-s - _kTable.push_back(_compressors[0].byteCount()); - _kk = _kCompr.minimize(_kTable, 3); - } - } - - void writeOut(std::fstream &out) - { - if (_nExtents == 0) - { - if (_nGroups > 1) - { - unsigned char byte = static_cast<unsigned char>((0x80 | _kk)); - out.write( (const char*)&byte, 1 ); - HCDBG(std::cerr << "writeOut of " << int(byte) << std::endl); - _kCompr.write(out); // concatenated k,l,m - for (size_t j = 0; j < _freeComp; j++) - _compressors[j].write(out); - } - else // single group, no extents; code: 00 - { - unsigned char k1 = (unsigned char)(_kTable[0]); - unsigned char k2 = (unsigned char)(_kTable[1]); - out.write( (const char*)&k1, 1 ); - out.write( (const char*)&k2, 1 ); - _compressors[0].write(out); // C/P - } - } - else - { // extents - unsigned char byte = static_cast<unsigned char>( - (_nGroups > 1 ? 
0xC0 : 0x40) | _kk); - out.write( (const char*)&byte, 1 ); - _kCompr.write(out); - for (size_t j = 0; j < _freeComp; j++) - _compressors[j].write(out); - } - } - - Compressor* nextCompressor() - { - if (_freeComp == _compressors.size()) - _compressors.push_back(Compressor()); - return _currentCompressor = &_compressors[_freeComp++]; - } - - int byteCount() - { - if (_nGroups == 1 && _nExtents == 0) - return 2 + _compressors[0].byteCount(); - else - { - int result = 1; // initial kk - result += _kCompr.byteCount(); - for (size_t j = 0; j < _freeComp; j++) - result += _compressors[j].byteCount(); - return result; - } - } -}; - -int DocumentCompressor::NConceptsInGroup = 16; -int DocumentCompressor::BitsInLabel = 4; -int DocumentCompressor::DefaultSize = 32; - -DocumentCompressor& Index::getDocumentCompressor() -{ - if (!_documentCompressor) - _documentCompressor = new DocumentCompressor(); - return *_documentCompressor; -} - -void Index::compress(int docID, int titleID, - std::vector<ConceptLocation> &locations, - std::vector<ConceptLocation> &extents) -{ - std::fstream &positions = getPositionsFile(); - - positions.seekg(0, std::ios::end); - long currentEnd = positions.tellg(); - if (currentEnd < 0) currentEnd = 0; - positions.clear(); - positions.seekg(currentEnd, std::ios::beg); - - _documents.push_back(docID); - _microIndexOffsets.push_back(currentEnd); - HCDBG(std::cerr << "_microIndexOffsets pushed back " << currentEnd << std::endl); - HCDBG(std::cerr << "added title id of " << titleID << std::endl); - _titles.push_back(titleID); - - getDocumentCompressor().writeOutMicroIndex(positions, - locations, extents); -} - -void Index::writeOutOffsets() -{ - Compressor documents; - int k1 = documents.minimize(_documents, 8); - Compressor offsets; - int k2 = offsets.compressAscending(_microIndexOffsets); - Compressor titles; - int k3 = titles.minimize(_titles, 8); // 8 is the starting k - std::fstream &out = getOffsetsFile(); - out.seekp(0); // position at beginning - out.clear(); - unsigned char byte; - byte = static_cast<unsigned char>(k1); - out.write( (const char*)&byte, 1 ); - HCDBG(fprintf(stderr, "a: offset dump of %x\n", byte)); - documents.write(out); - byte = static_cast<unsigned char>(k2); - out.write( (const char*)&byte, 1 ); - HCDBG(fprintf(stderr, "b: offset dump of %x\n", byte)); - offsets.write(out); - byte = static_cast<unsigned char>(k3); - out.write( (const char*)&byte, 1 ); - HCDBG(fprintf(stderr, "c: offset dump of %x\n", byte)); - titles.write(out); -} - -Index::~Index() -{ - delete _schema; - delete _dictParams; - delete _dict; - delete _positionsFile; - delete _offsetsFile; - delete _documentCompressor; -} - -void XmlIndex::compress(int docID, int titleID, - std::vector<ConceptLocation> &locations, - std::vector<ConceptLocation> &extents, - int k, const Compressor &contextTables) -{ - HCDBG(std::cerr << "start compress" << std::endl); - HCDBG(std::cerr << "docID : " << docID << " titleID : " << titleID << - "locations size : " << locations.size() << "extents size : " << extents.size() << std::endl); - Index::compress(docID, titleID, locations, extents); - HCDBG(std::cerr << "end compress" << std::endl); - - std::fstream& contexts = getContextsFile(); - - contexts.seekp(0, std::ios::end); - long currentEnd = contexts.tellp(); - if (currentEnd < 0) currentEnd = 0; - contexts.clear(); - contexts.seekp(currentEnd); - writeByte(contexts, static_cast<unsigned char>(k)); - contextTables.write(contexts); - _contextsOffsets.push_back(currentEnd); -} - -void 
XmlIndexBuilder::closeDocument(const std::string &title) throw( HelpProcessingException ) -{ - if (_currentDocID == 0) - { - std::stringstream aStrStream; - aStrStream << "no document open" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - else if (!_indexAdapter._locations.empty()) - { - IntegerArray kTable; - - Compressor compressor1; - Compressor compressor2; - Compressor compressor3; - Compressor compressor4; - - kTable.push_back(compressor1.compressAscending(_indexAdapter._initialWords)); - kTable.push_back(compressor2.minimize(_indexAdapter._dests, 2)); - kTable.push_back(compressor3.minimize(_indexAdapter._links, 2)); - kTable.push_back(compressor4.minimize(_indexAdapter._seqNumbers, 2)); - - Compressor compressor0; - int k0 = compressor0.minimize(kTable, 4); - - compressor0.concatenate(compressor1); - compressor0.concatenate(compressor2); - compressor0.concatenate(compressor3); - compressor0.concatenate(compressor4); - - std::vector<ConceptLocation> dummy; - _indexAdapter._index->compress(_currentDocID, intern(title), - _indexAdapter._locations, dummy, k0, compressor0); - } - else - { - // System.out.println("no indexable content"); - } - _indexAdapter._locations.clear(); - _currentDocID = 0; // state: nothing open -} - -void XmlIndexBuilder::indexDocument(xmlDocPtr doc, const std::string &docURL, const std::string &title) -{ - HCDBG(std::cerr << "Indexing " << docURL << std::endl); - - xmlNodePtr root = xmlDocGetRootElement(doc); - - openDocument(docURL); - -// xmlDocDump(stdout, doc); - xmlDocPtr res = xsltApplyStylesheet(_indexingTransform, doc, NULL); - - _indexAdapter.init(); - - // start = System.currentTimeMillis(); - root = xmlDocGetRootElement(res); - if (root) - { -// xmlDocDump(stdout, res); - for (xmlNodePtr test = root; test; test = test->next) - _indexAdapter.process(test, res); - } - xmlFreeDoc(res); - - // System.out.println((System.currentTimeMillis()-start)+" transform"); - // start = System.currentTimeMillis(); - _indexAdapter.finish(); - // System.out.println((System.currentTimeMillis()-start)+" finish"); - // start = System.currentTimeMillis(); - closeDocument(title); - // System.out.println((System.currentTimeMillis()-start)+" close"); -} - -XmlIndexBuilder::~XmlIndexBuilder() -{ - delete _indexAdapter._index; -} - -void XmlIndexBuilder::setTransformLocation(const fs::path &filelocation) -{ - _transformLocation = filelocation; -} - -xsltStylesheetPtr XmlIndexBuilder::getTransform(const std::string &stylesheetName) -{ - fs::path stylesheet = _transformLocation / (stylesheetName + ".xsl"); - return xsltParseStylesheetFile((const xmlChar *)stylesheet.native_file_string().c_str()); -} - -void XmlIndexBuilder::initXmlProcessor(const std::string &transform) -{ - _indexingTransform = getTransform(transform); -} - -void XmlIndexBuilder::init(const std::string &transform) -{ - _indexAdapter._index->init(); -#ifdef EMULATEORIGINAL - //some kind of bug in the original AFAICS - _indexAdapter._stoplist.push_back("andnull"); -#endif - reset(); - - // initialize vector and hashtable - const std::vector<std::string> &linkNames = _indexAdapter._index->getLinkNames(); - std::vector<std::string>::const_iterator aEnd = linkNames.end(); - for (std::vector<std::string>::const_iterator aIter = linkNames.begin(); - aIter != aEnd; ++aIter) - { - _indexAdapter.getLinkCode(*aIter); - } - - initXmlProcessor(transform); -} - -void XmlIndexBuilder::reset() -{ - _indexAdapter._availContextNumber = 0; - _indexAdapter._lastWordNumber = 0; 
- _indexAdapter._locations.clear(); - _indexAdapter._anyLocationsStored = false; - // all the contexts' tables - _indexAdapter._initialWords.clear(); - _indexAdapter._dests.clear(); - _indexAdapter._links.clear(); - _indexAdapter._seqNumbers.clear(); -} - -XmlIndexBuilder::XmlIndexBuilder(const fs::path &indexDir) - : _indexingTransform(0), _currentDocID(0) -{ - HCDBG(std::cerr << "indexDir is " << indexDir.native_directory_string() << std::endl); - _indexAdapter._index = new XmlIndex(indexDir, true); -} - -void XmlIndexBuilder::clearIndex() -{ - _indexAdapter._index->clear(); -} - class HelpLinker { public: - static void main(std::vector<std::string> &args, std::string* pExtensionPath = NULL ) + void main(std::vector<std::string> &args, std::string* pExtensionPath = NULL ) throw( HelpProcessingException ); - static bool isExtensionMode( void ) - {return bExtensionMode; } + + HelpLinker() + : init(true) + , m_pIndexerPreProcessor(NULL) + {} + ~HelpLinker() + { delete m_pIndexerPreProcessor; } + private: - HelpLinker() : init(true), xmlIndexBuilder(NULL) {} - ~HelpLinker() { delete xmlIndexBuilder; } - JarOutputStream jarOutputStream; - static int locCount, totCount; - static Stringtable additionalFiles; - static HashSet helpFiles; - static fs::path sourceRoot; - static fs::path embeddStylesheet; - static fs::path indexStylesheet; - static fs::path outputFile; - static std::string module; - static std::string lang; - static std::string hid; - static std::string extensionPath; - static bool bExtensionMode; + int locCount, totCount; + Stringtable additionalFiles; + HashSet helpFiles; + fs::path sourceRoot; + fs::path embeddStylesheet; + fs::path idxCaptionStylesheet; + fs::path idxContentStylesheet; + fs::path zipdir; + fs::path outputFile; + std::string module; + std::string lang; + std::string hid; + std::string extensionPath; + bool bExtensionMode; fs::path indexDirName; Stringtable hidlistTranslation; fs::path indexDirParentName; bool init; - XmlIndexBuilder* xmlIndexBuilder; - void initXMLIndexBuilder(); - void createFileFromBytes(const std::string &fileName, - const std::string &defaultXSL); - void closeXMLIndexBuilder() - { - xmlIndexBuilder->close(); - } + IndexerPreProcessor* m_pIndexerPreProcessor; + void initIndexerPreProcessor(); void link() throw( HelpProcessingException ); void addBookmark( DB* dbBase, std::string thishid, const std::string& fileB, const std::string& anchorB, @@ -4712,11 +257,6 @@ private: #endif }; -bool isExtensionMode( void ) -{ - return HelpLinker::isExtensionMode(); -} - namespace URLEncoder { static std::string encode(const std::string &rIn) @@ -4740,76 +280,6 @@ namespace URLEncoder } } -JarOutputStream::JarOutputStream() -{ - perlline << "use Archive::Zip qw(:ERROR_CODES); "; - perlline << "my $zip = Archive::Zip->new(); "; -} - -std::string replaceAll(std::string result, - const std::string &search, const std::string &replace) -{ - std::string::size_type pos = 0; - while(1) - { - pos = result.find(search, pos); - if (pos == std::string::npos) break; - result.replace(pos, search.size(), replace); - pos += replace.size(); - } - return result; -} - -void JarOutputStream::addFile(const std::string &fileName, const std::string &name) -{ - perlline << "$zip->addFile(\"" << replaceAll(fileName, "\\", "/") << "\", \"" << name << "\"); "; -} - -void JarOutputStream::addTree(const std::string &tree, const std::string &name) -{ - perlline << "$zip->addTree(\"" << replaceAll(tree, "\\", "/") << "\", \"" << name << "\"); "; -} - -void 
JarOutputStream::dontCompress(const std::string &key) -{ - perlline << "my $member = $zip->memberNamed(\"" << key << "\"); "; - perlline << "if ($member) { $member->desiredCompressionMethod( COMPRESSION_STORED ); } "; -} - -void JarOutputStream::commit() -{ - perlline << "print $zip->writeToFileNamed(\"" << replaceAll(getname().native_file_string(), "\\", "/") << "\").\"\\n\"; "; - - fs::path tmp = getname(); - tmp.append(".perl"); - std::string perlfile = replaceAll( tmp.native_file_string(), "\\", "/"); - std::ofstream fos(perlfile.c_str()); - fos << perlline.str(); - fos.close(); - std::string myperl("perl"); - std::string is4nt; - char* use_shell = getenv( "USE_SHELL" ); - if ( use_shell ) - is4nt = use_shell; - if( !is4nt.empty() && is4nt == "4nt" ) - { - // in SO windows environment perl isn't in the path and - // needs to be fetched from the environment. this doesn't - // work in a cygwin shell as "/usr/bin/perl" will fail in a - // native shell (see system call). - myperl = getenv( "PERL" ); - } - std::string commandline; - commandline = myperl + " " + perlfile; - HCDBG(std::cerr << "command line 3 is" << commandline << std::endl); - // on windows, calling perl (either cygwin or native) from a native - // shell the only chance to survive is using "c:/foo" notation - if ( system(commandline.c_str()) ) - fprintf (stderr, "ERROR: calling generated perl script failed!\n"); - - fs::remove(tmp); -} - void HelpLinker::addBookmark( DB* dbBase, std::string thishid, const std::string& fileB, const std::string& anchorB, const std::string& jarfileB, const std::string& titleB) @@ -4863,104 +333,14 @@ void HelpLinker::addBookmark( DB* dbBase, std::string thishid, dbBase->put(dbBase, NULL, &key, &data, 0); } -void HelpLinker::createFileFromBytes(const std::string &fileName, - const std::string &defaultXSL) -{ - std::ofstream fos((indexDirParentName / fileName).native_file_string().c_str()); - fos << defaultXSL; -} - -void HelpLinker::initXMLIndexBuilder() +void HelpLinker::initIndexerPreProcessor() { + if( m_pIndexerPreProcessor ) + delete m_pIndexerPreProcessor; std::string mod = module; std::transform (mod.begin(), mod.end(), mod.begin(), tolower); - indexDirName = indexDirParentName / (mod + ".idx"); - fs::create_directory(indexDirName); - - if (xmlIndexBuilder) delete xmlIndexBuilder; - xmlIndexBuilder = new XmlIndexBuilder(indexDirName); - - std::string defaultXSL = - "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" - "<xsl:stylesheet version=\"1.0\" xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\">\n" - "\t<xsl:template match=\"*|/\"/>\n" - "</xsl:stylesheet>"; - createFileFromBytes("default.xsl", defaultXSL); - xmlIndexBuilder->clearIndex(); // Build index from scratch - xmlIndexBuilder->setTransformLocation(indexDirParentName); -} - -namespace -{ - fs::path gettmppath() - { - fs::path ret; - osl::File::createTempFile(0, 0, &ret.data); - fs::remove(ret); - return ret; - } -} - -extern "C" void function_orig_pointer(xmlXPathParserContextPtr ctxt, int nargs) -{ - if (nargs > 1) - { - // TODO: Change when used for extensions, no exception possible here - std::cerr << "function_orig_pointer, too many args" << std::endl; - exit(-1); - } - - xmlNodePtr cur = NULL; - if (nargs == 0) - cur = ctxt->context->node; - else if (nargs == 1) - { - xmlXPathObjectPtr obj = valuePop(ctxt); - xmlNodeSetPtr nodelist = obj->nodesetval; - - if ((nodelist == NULL) || (nodelist->nodeNr <= 0)) - { - // TODO: Change when used for extensions, no exception possible here - std::cerr << "function_orig_pointer, bad 
nodeset" << std::endl; - exit(-1); - } - - cur = nodelist->nodeTab[0]; - for (int i = 1; i < nodelist->nodeNr; ++i) - { - int ret = xmlXPathCmpNodes(cur, nodelist->nodeTab[i]); - if (ret == -1) - cur = nodelist->nodeTab[i]; - } - - xmlXPathFreeObject(obj); - } - - if (cur == NULL) - { - // TODO: Change when used for extensions, no exception possible here - std::cerr << "function_orig_pointer, bad node" << std::endl; - exit(-1); - } - - static xmlChar str[20]; - sprintf((char *)str, "%ld", (sal_uIntPtr)(cur)); - valuePush(ctxt, xmlXPathNewString(str)); -} - -extern "C" void* cmc_module_init(xsltTransformContextPtr ctxt, const xmlChar* uri) -{ - if (xsltRegisterExtFunction(ctxt, (const xmlChar*)"orig-pointer", uri, function_orig_pointer)) - { - // TODO: Change when used for extensions, no exception possible here - std::cerr << "failure to register function_orig_pointer" << std::endl; - exit(-1); - } - return NULL; -} - -extern "C" void cmc_module_term(xsltTransformContextPtr, const xmlChar*, void*) -{ + m_pIndexerPreProcessor = new IndexerPreProcessor( mod, indexDirParentName, + idxCaptionStylesheet, idxContentStylesheet ); } /** @@ -4976,7 +356,7 @@ void HelpLinker::link() throw( HelpProcessingException ) } else { - indexDirParentName = gettmppath(); + indexDirParentName = zipdir; fs::create_directory(indexDirParentName); } @@ -4987,15 +367,6 @@ void HelpLinker::link() throw( HelpProcessingException ) std::string mod = module; std::transform (mod.begin(), mod.end(), mod.begin(), tolower); - // Determine the outputstream - fs::path outputTmpFile; - if( !bExtensionMode ) - { - outputTmpFile = outputFile; - outputTmpFile.append(".tmp"); - jarOutputStream.setname(outputTmpFile); - } - // do the work here // continue with introduction of the overall process thing into the // here all hzip files will be worked on @@ -5042,7 +413,7 @@ void HelpLinker::link() throw( HelpProcessingException ) // lastly, initialize the indexBuilder if ( (!bExtensionMode || bIndexForExtension) && !helpFiles.empty()) - initXMLIndexBuilder(); + initIndexerPreProcessor(); if( !bExtensionMode ) { @@ -5056,6 +427,7 @@ void HelpLinker::link() throw( HelpProcessingException ) { std::cout << "."; std::cout.flush(); + // process one file // streamTable contains the streams in the hzip file StreamTable streamTable; @@ -5074,6 +446,7 @@ void HelpLinker::link() throw( HelpProcessingException ) fs::path langsourceRoot(sourceRoot); fs::path xhpFile; + if( bExtensionMode ) { // langsourceRoot == sourceRoot for extensions @@ -5086,6 +459,7 @@ void HelpLinker::link() throw( HelpProcessingException ) langsourceRoot.append('/' + lang + '/'); xhpFile = fs::path(xhpFileName, fs::native); } + HelpCompiler hc( streamTable, xhpFile, langsourceRoot, embeddStylesheet, module, lang, bExtensionMode ); @@ -5130,33 +504,6 @@ void HelpLinker::link() throw( HelpProcessingException ) // add once this as its own id. 
addBookmark(dbBase, documentPath, fileB, std::string(), jarfileB, titleB); - if ( (!bExtensionMode || bIndexForExtension) && init) - { - std::ifstream indexXSLFile(indexStylesheet.native_file_string().c_str()); - std::ostringstream baos; - baos << indexXSLFile.rdbuf(); - std::string xsl = baos.str(); - - //I see that we later generate a map of generateids to nodes which we will use - //to link the results of generate-id in the transformed document back to the nodes - //in the original document, so let's cut out the middle-men and make an extension - //which does exactly what we want, and give us a pointer to the original node - xsl.replace(xsl.find("<xsl:stylesheet"), strlen("<xsl:stylesheet"), - "<xsl:stylesheet extension-element-prefixes=\"CMC\" xmlns:CMC=\"http://www.cunninghack.org\""); - xsl.replace(xsl.find("generate-id"), strlen("generate-id"), "CMC:orig-pointer"); - - if (xsltRegisterExtModule((const xmlChar*)"http://www.cunninghack.org", cmc_module_init, cmc_module_term)) - { - std::stringstream aStrStream; - aStrStream << "fatal error on registering xslt module" << std::endl; - throw HelpProcessingException( HELPPROCESSING_INTERNAL_ERROR, aStrStream.str() ); - } - - createFileFromBytes("index.xsl", xsl); - xmlIndexBuilder->init("index"); - init = false; - } - // first the database *.db // ByteArrayInputStream bais = null; // ObjectInputStream ois = null; @@ -5246,6 +593,7 @@ void HelpLinker::link() throw( HelpProcessingException ) } } + //IndexerPreProcessor if( !bExtensionMode || bIndexForExtension ) { // now the indexing @@ -5256,14 +604,10 @@ void HelpLinker::link() throw( HelpProcessingException ) { std::string temp = module; std::transform (temp.begin(), temp.end(), temp.begin(), tolower); - xmlIndexBuilder->indexDocument(document, - std::string("vnd.sun.star.help://") - + temp - + "/" - + URLEncoder::encode(documentPath), - ""); + m_pIndexerPreProcessor->processDocument(document, URLEncoder::encode(documentPath) ); } } + } // while loop over hzip files ending if( !bExtensionMode ) @@ -5284,46 +628,32 @@ void HelpLinker::link() throw( HelpProcessingException ) helpKeyword.dump(keyWord); keyWord->close(keyWord, 0); - if (!bExtensionMode && !helpFiles.empty()) - { - closeXMLIndexBuilder(); - HCDBG(std::cerr << "dir is " << indexDirName.native_directory_string() << std::endl); - jarOutputStream.addTree(indexDirName.native_file_string(), mod + ".idx"); - } - if( !bExtensionMode ) { - jarOutputStream.addFile(helpTextFileName.native_file_string(), mod + ".ht"); - jarOutputStream.addFile(dbBaseFileName.native_file_string(), mod + ".db"); - jarOutputStream.addFile(keyWordFileName.native_file_string(), mod + ".key"); - - ///////////////////////////////////////////////////////////////////////// - // last, all files which should be copied into the jar file - ///////////////////////////////////////////////////////////////////////// - + // New index Stringtable::iterator aEnd = additionalFiles.end(); for (Stringtable::iterator enumer = additionalFiles.begin(); enumer != aEnd; ++enumer) { - const std::string &additionalFileKey = enumer->first; const std::string &additionalFileName = enumer->second; - jarOutputStream.addFile(additionalFileName, additionalFileKey); - } + const std::string &additionalFileKey = enumer->first; - jarOutputStream.dontCompress(mod + ".jar"); - jarOutputStream.commit(); + fs::path fsAdditionalFileName( additionalFileName, fs::native ); + std::string aNativeStr = fsAdditionalFileName.native_file_string(); + const char* pStr = aNativeStr.c_str(); + std::cerr << 
pStr; - HCDBG(std::cerr << "like to rename " << outputTmpFile.native_file_string() << " as " << - outputFile.native_file_string() << std::endl); - fs::rename(outputTmpFile, outputFile); - if (!fs::exists(outputFile)) - { - std::stringstream aStrStream; - aStrStream << "can't rename file '" << outputTmpFile.native_file_string() << "'" << std::endl; - throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); + fs::path fsTargetName( indexDirParentName / additionalFileKey ); + + fs::copy( fsAdditionalFileName, fsTargetName ); } } +#ifdef SOLARIS + if( !bExtensionMode ) + _exit( 0 ); +#endif +/* ///////////////////////////////////////////////////////////////////////// /// remove temprary directory for index creation ///////////////////////////////////////////////////////////////////////// @@ -5331,24 +661,10 @@ void HelpLinker::link() throw( HelpProcessingException ) if( !bExtensionMode ) fs::remove_all( indexDirParentName ); #endif +*/ } -int HelpLinker::locCount; -int HelpLinker::totCount; -Stringtable HelpLinker::additionalFiles; -HashSet HelpLinker::helpFiles; -fs::path HelpLinker::sourceRoot; -fs::path HelpLinker::embeddStylesheet, HelpLinker::indexStylesheet; -fs::path HelpLinker::outputFile; -std::string HelpLinker::module; -std::string HelpLinker::lang; -std::string HelpLinker::hid; -std::string HelpLinker::extensionPath; -bool HelpLinker::bExtensionMode; - -int GnTmpFileCounter = 0; - void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPath) throw( HelpProcessingException ) { @@ -5406,17 +722,41 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat embeddStylesheet = fs::path(args[i], fs::native); } - else if (args[i].compare("-idx") == 0) + else if (args[i].compare("-zipdir") == 0) { ++i; if (i >= args.size()) { std::stringstream aStrStream; - aStrStream << "indexstylesheet missing" << std::endl; + aStrStream << "idxtemp missing" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } - indexStylesheet = fs::path(args[i], fs::native); + zipdir = fs::path(args[i], fs::native); + } + else if (args[i].compare("-idxcaption") == 0) + { + ++i; + if (i >= args.size()) + { + std::stringstream aStrStream; + aStrStream << "idxcaption stylesheet missing" << std::endl; + throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); + } + + idxCaptionStylesheet = fs::path(args[i], fs::native); + } + else if (args[i].compare("-idxcontent") == 0) + { + ++i; + if (i >= args.size()) + { + std::stringstream aStrStream; + aStrStream << "idxcontent stylesheet missing" << std::endl; + throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); + } + + idxContentStylesheet = fs::path(args[i], fs::native); } else if (args[i].compare("-o") == 0) { @@ -5494,10 +834,22 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat ++i; } - if (!bExtensionMode && indexStylesheet.empty()) + if (!bExtensionMode && zipdir.empty()) + { + std::stringstream aStrStream; + aStrStream << "no index dir given" << std::endl; + throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); + } + if (!bExtensionMode && idxCaptionStylesheet.empty()) + { + std::stringstream aStrStream; + aStrStream << "no index caption stylesheet given" << std::endl; + throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); + } + if (!bExtensionMode && idxContentStylesheet.empty()) { std::stringstream aStrStream; - 
aStrStream << "no index file given" << std::endl; + aStrStream << "no index content stylesheet given" << std::endl; throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } if (!bExtensionMode && embeddStylesheet.empty()) @@ -5537,7 +889,7 @@ void HelpLinker::main(std::vector<std::string> &args, std::string* pExtensionPat throw HelpProcessingException( HELPPROCESSING_GENERAL_ERROR, aStrStream.str() ); } - HelpLinker().link(); + link(); } int main(int argc, char**argv) @@ -5548,7 +900,9 @@ int main(int argc, char**argv) args.push_back(std::string(argv[i])); try { - HelpLinker::main(args); + HelpLinker* pHelpLinker = new HelpLinker(); + pHelpLinker->main( args ); + delete pHelpLinker; } catch( const HelpProcessingException& e ) { @@ -5584,9 +938,9 @@ HelpProcessingErrorInfo& HelpProcessingErrorInfo::operator=( const struct HelpPr { m_eErrorClass = e.m_eErrorClass; rtl::OString tmpErrorMsg( e.m_aErrorMsg.c_str() ); - m_aErrorMsg = rtl::OStringToOUString( tmpErrorMsg, osl_getThreadTextEncoding() ); + m_aErrorMsg = rtl::OStringToOUString( tmpErrorMsg, fs::getThreadTextEncoding() ); rtl::OString tmpXMLParsingFile( e.m_aXMLParsingFile.c_str() ); - m_aXMLParsingFile = rtl::OStringToOUString( tmpXMLParsingFile, osl_getThreadTextEncoding() ); + m_aXMLParsingFile = rtl::OStringToOUString( tmpXMLParsingFile, fs::getThreadTextEncoding() ); m_nXMLParsingLine = e.m_nXMLParsingLine; return *this; } @@ -5607,14 +961,14 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp const char** argv = new const char*[argc]; argv[0] = ""; argv[1] = "-mod"; - rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, osl_getThreadTextEncoding() ); + rtl::OString aOExtensionName = rtl::OUStringToOString( aExtensionName, fs::getThreadTextEncoding() ); argv[2] = aOExtensionName.getStr(); for( sal_Int32 iXhp = 0 ; iXhp < nXhpFileCount ; ++iXhp ) { rtl::OUString aXhpFile = pXhpFiles[iXhp]; - rtl::OString aOXhpFile = rtl::OUStringToOString( aXhpFile, osl_getThreadTextEncoding() ); + rtl::OString aOXhpFile = rtl::OUStringToOString( aXhpFile, fs::getThreadTextEncoding() ); char* pArgStr = new char[aOXhpFile.getLength() + 1]; strcpy( pArgStr, aOXhpFile.getStr() ); argv[iXhp + 3] = pArgStr; @@ -5628,7 +982,7 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp delete argv[iXhp + 3]; delete[] argv; - rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, osl_getThreadTextEncoding() ); + rtl::OString aOExtensionLanguageRoot = rtl::OUStringToOString( aExtensionLanguageRoot, fs::getThreadTextEncoding() ); const char* pExtensionPath = aOExtensionLanguageRoot.getStr(); std::string aStdStrExtensionPath = pExtensionPath; @@ -5636,7 +990,9 @@ HELPLINKER_DLLPUBLIC bool compileExtensionHelp xmlSetStructuredErrorFunc( NULL, (xmlStructuredErrorFunc)StructuredXMLErrorFunction ); try { - HelpLinker::main(args,&aStdStrExtensionPath); + HelpLinker* pHelpLinker = new HelpLinker(); + pHelpLinker->main( args,&aStdStrExtensionPath ); + delete pHelpLinker; } catch( const HelpProcessingException& e ) { |