diff options
author | Gert van Valkenhoef <g.h.m.van.valkenhoef@rug.nl> | 2012-02-14 19:31:18 +0100 |
---|---|---|
committer | Caolán McNamara <caolanm@redhat.com> | 2012-02-23 10:31:16 +0000 |
commit | 25caea6f0d6806b44c611bef448b97e260306258 (patch) | |
tree | 7dfde8a977bffaa6c2a1b5d1668dc506a6ea47f9 | |
parent | 15704e6319c1194944df6c25a9d1ce413f309610 (diff) |
Add C++ HelpIndexer
-rw-r--r-- | l10ntools/prj/build.lst | 2 | ||||
-rw-r--r-- | l10ntools/prj/d.lst | 6 | ||||
-rw-r--r-- | l10ntools/source/help/helpindexer.cxx | 247 | ||||
-rw-r--r-- | l10ntools/source/help/makefile.mk | 30 |
4 files changed, 263 insertions, 22 deletions
diff --git a/l10ntools/prj/build.lst b/l10ntools/prj/build.lst index ed919a5ec078..8e3ea7041bc6 100644 --- a/l10ntools/prj/build.lst +++ b/l10ntools/prj/build.lst @@ -1,4 +1,4 @@ -tr l10ntools : BERKELEYDB:berkeleydb EXPAT:expat LIBXSLT:libxslt LUCENE:lucene sal NULL +tr l10ntools : BERKELEYDB:berkeleydb EXPAT:expat LIBXSLT:libxslt sal NULL tr l10ntools usr1 - all tr_mkout NULL tr l10ntools\inc nmake - all tr_inc NULL tr l10ntools\source nmake - all tr_src tr_inc NULL diff --git a/l10ntools/prj/d.lst b/l10ntools/prj/d.lst index eded848af832..174bb6c6d99f 100644 --- a/l10ntools/prj/d.lst +++ b/l10ntools/prj/d.lst @@ -26,12 +26,14 @@ mkdir: %_DEST%\bin\help\com\sun\star\help ..\%__SRC%\bin\txtconv %_DEST%\bin\txtconv ..\%__SRC%\bin\ulfconv %_DEST%\bin\ulfconv ..\%__SRC%\class\FCFGMerge.jar %_DEST%\bin\FCFGMerge.jar -..\%__SRC%\class\HelpIndexerTool.jar %_DEST%\bin\HelpIndexerTool.jar -..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker ..\%__SRC%\bin\HelpCompiler %_DEST%\bin\HelpCompiler ..\%__SRC%\bin\HelpCompiler.exe %_DEST%\bin\HelpCompiler.exe +..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker ..\%__SRC%\bin\HelpLinker.exe %_DEST%\bin\HelpLinker.exe ..\%__SRC%\bin\HelpLinker* %_DEST%\bin +..\%__SRC%\bin\HelpIndexer %_DEST%\bin\HelpIndexer +..\%__SRC%\bin\HelpIndexer.exe %_DEST%\bin\HelpIndexer.exe +..\%__SRC%\bin\HelpIndexer* %_DEST%\bin ..\scripts\localize %_DEST%\bin\localize ..\scripts\fast_merge.pl %_DEST%\bin\fast_merge.pl
diff --git a/l10ntools/source/help/helpindexer.cxx b/l10ntools/source/help/helpindexer.cxx
new file mode 100644
index 000000000000..c3271194bb30
--- /dev/null
+++ b/l10ntools/source/help/helpindexer.cxx
@@ -0,0 +1,255 @@
+#include <CLucene/StdHeader.h>
+#include <CLucene.h>
+#ifdef TODO
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
+#endif
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <set>
+
+// I assume that TCHAR is defined as wchar_t throughout
+
+using namespace lucene::document;
+
+/**
+ * Builds a CLucene index from the caption and content files of one help module.
+ */
+class HelpIndexer {
+    private:
+        std::string d_lang;
+        std::string d_module;
+        std::string d_captionDir;
+        std::string d_contentDir;
+        std::string d_indexDir;
+        std::string d_error;
+        std::set<std::string> d_files;
+
+    public:
+
+        /**
+         * @param lang Help files language.
+         * @param module The module of the helpfiles.
+         * @param captionDir The directory to scan for caption files.
+         * @param contentDir The directory to scan for content files.
+         * @param indexDir The directory to write the index to.
+         */
+        HelpIndexer(std::string const &lang, std::string const &module,
+            std::string const &captionDir, std::string const &contentDir,
+            std::string const &indexDir);
+
+        /**
+         * Run the indexer.
+         * @return true if index successfully generated.
+         */
+        bool indexDocuments();
+
+        /**
+         * Get the last recorded error string (empty if none was recorded).
+         */
+        std::string const & getErrorMessage();
+
+    private:
+
+        /**
+         * Scan the caption & contents directories for help files.
+         */
+        bool scanForFiles();
+
+        /**
+         * Scan for files in the given directory.
+         */
+        bool scanForFiles(std::string const &path);
+
+        /**
+         * Fill the Document with information on the given help file.
+         */
+        bool helpDocument(std::string const & fileName, Document *doc);
+
+        /**
+         * Create a reader for the given file, and create an "empty" reader in case the file doesn't exist.
+         */
+        lucene::util::Reader *helpFileReader(std::string const & path);
+
+        /**
+         * Widen a narrow string byte by byte (no character set decoding is done).
+         */
+        std::wstring string2wstring(std::string const &source);
+};
+
+HelpIndexer::HelpIndexer(std::string const &lang, std::string const &module,
+    std::string const &captionDir, std::string const &contentDir, std::string const &indexDir) :
+d_lang(lang), d_module(module), d_captionDir(captionDir), d_contentDir(contentDir), d_indexDir(indexDir), d_error(""), d_files() {}
+
+bool HelpIndexer::indexDocuments() {
+    if (!scanForFiles()) {
+        return false;
+    }
+
+#ifdef TODO
+    // Construct the analyzer appropriate for the given language
+    lucene::analysis::Analyzer *analyzer = (
+        d_lang.compare("ja") == 0 ?
+        (lucene::analysis::Analyzer*)new lucene::analysis::LanguageBasedAnalyzer(L"cjk") :
+        (lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#else
+    lucene::analysis::Analyzer *analyzer = (
+        (lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#endif
+
+    bool success = true;
+    try {
+        // The writer is scoped to this block so that it is destroyed before
+        // the analyzer it was given is deleted below.
+        lucene::index::IndexWriter writer(d_indexDir.c_str(), analyzer, true);
+
+        // Index the identified help files
+        Document doc;
+        for (std::set<std::string>::iterator i = d_files.begin(); success && i != d_files.end(); ++i) {
+            doc.clear();
+            if (helpDocument(*i, &doc)) {
+                writer.addDocument(&doc);
+            } else {
+                success = false;
+            }
+        }
+
+        // Optimize the index
+        if (success) {
+            writer.optimize();
+        }
+    } catch (CLuceneError &e) {
+        // Report CLucene failures through the error string instead of
+        // letting the exception escape to main().
+        d_error = e.what();
+        success = false;
+    }
+
+    delete analyzer;
+    return success;
+}
+
+std::string const & HelpIndexer::getErrorMessage() {
+    return d_error;
+}
+
+bool HelpIndexer::scanForFiles() {
+    return scanForFiles(d_contentDir) && scanForFiles(d_captionDir);
+}
+
+bool HelpIndexer::scanForFiles(std::string const & path) {
+    DIR *dir = opendir(path.c_str());
+    if (dir == 0) {
+        // NOTE(review): an unreadable directory is recorded but not treated
+        // as fatal (the scan still reports success) - confirm this is intended.
+        d_error = "Error reading directory " + path + ": " + strerror(errno);
+        return true;
+    }
+
+    struct dirent *ent;
+    struct stat info;
+    while ((ent = readdir(dir)) != 0) {
+        if (stat((path + "/" + ent->d_name).c_str(), &info) == 0 && S_ISREG(info.st_mode)) {
+            d_files.insert(ent->d_name);
+        }
+    }
+
+    closedir(dir);
+
+    return true;
+}
+
+bool HelpIndexer::helpDocument(std::string const & fileName, Document *doc) {
+    // Add the help path as an indexed, untokenized field.
+    std::wstring path(L"#HLP#" + string2wstring(d_module) + L"/" + string2wstring(fileName));
+    doc->add(*new Field(_T("path"), path.c_str(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+
+    // Add the caption as a field.
+    std::string captionPath = d_captionDir + "/" + fileName;
+    doc->add(*new Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+    // FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+    // Add the content as a field.
+    std::string contentPath = d_contentDir + "/" + fileName;
+    doc->add(*new Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+    // FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+    return true;
+}
+
+lucene::util::Reader *HelpIndexer::helpFileReader(std::string const & path) {
+    if (access(path.c_str(), R_OK) == 0) {
+        return new lucene::util::FileReader(path.c_str(), "UTF-8");
+    } else {
+        return new lucene::util::StringReader(L"");
+    }
+}
+
+std::wstring HelpIndexer::string2wstring(std::string const &source) {
+    std::wstring target(source.length(), L' ');
+    for (std::string::size_type k = 0; k < source.length(); ++k) {
+        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
+        target[k] = static_cast<wchar_t>(static_cast<unsigned char>(source[k]));
+    }
+    return target;
+}
+
+int main(int argc, char **argv) {
+    const std::string pLang("-lang");
+    const std::string pModule("-mod");
+    const std::string pOutDir("-zipdir");
+    const std::string pSrcDir("-srcdir");
+
+    std::string lang;
+    std::string module;
+    std::string srcDir;
+    std::string outDir;
+
+    bool error = false;
+    for (int i = 1; i < argc; ++i) {
+        // Pick the variable that receives the value following this option.
+        std::string *value = 0;
+        if (pLang.compare(argv[i]) == 0) {
+            value = &lang;
+        } else if (pModule.compare(argv[i]) == 0) {
+            value = &module;
+        } else if (pOutDir.compare(argv[i]) == 0) {
+            value = &outDir;
+        } else if (pSrcDir.compare(argv[i]) == 0) {
+            value = &srcDir;
+        }
+
+        // Unknown option, or an option without a value after it.
+        if (value != 0 && i + 1 < argc) {
+            *value = argv[++i];
+        } else {
+            error = true;
+        }
+    }
+
+    if (error) {
+        std::cerr << "Error parsing command-line arguments" << std::endl;
+    }
+
+    if (error || lang.empty() || module.empty() || srcDir.empty() || outDir.empty()) {
+        std::cerr << "Usage: HelpIndexer -lang ISOLangCode -mod HelpModule -srcdir SourceDir -zipdir OutputDir" << std::endl;
+        return 1;
+    }
+
+    std::string captionDir(srcDir + "/caption");
+    std::string contentDir(srcDir + "/content");
+    std::string indexDir(outDir + "/" + module + ".idxl");
+    HelpIndexer indexer(lang, module, captionDir, contentDir, indexDir);
+    if (!indexer.indexDocuments()) {
+        std::cerr << indexer.getErrorMessage() << std::endl;
+        return 2;
+    }
+    return 0;
+}
diff --git
a/l10ntools/source/help/makefile.mk b/l10ntools/source/help/makefile.mk index bab01b8f9a66..e22c6a3dbb4a 100644 --- a/l10ntools/source/help/makefile.mk +++ b/l10ntools/source/help/makefile.mk @@ -60,8 +60,10 @@ SLOFILES=\ EXCEPTIONSFILES=\ $(OBJ)$/HelpLinker.obj \ $(OBJ)$/HelpCompiler.obj \ + $(OBJ)$/helpindexer.obj \ $(SLO)$/HelpLinker.obj \ $(SLO)$/HelpCompiler.obj + .IF "$(OS)" == "MACOSX" && "$(CPU)" == "P" && "$(COM)" == "GCC" # There appears to be a GCC 4.0.1 optimization error causing _file:good() to # report true right before the call to writeOut at HelpLinker.cxx:1.12 l. 954 @@ -72,6 +74,9 @@ NOOPTFILES=\ $(SLO)$/HelpLinker.obj .ENDIF +PKGCONFIG_MODULES=libclucene-core +.INCLUDE : pkg_config.mk + APP1TARGET= $(TARGET) APP1OBJS=\ $(OBJ)$/HelpLinker.obj \ @@ -79,6 +84,12 @@ APP1OBJS=\ APP1RPATH = NONE APP1STDLIBS+=$(SALLIB) $(BERKELEYLIB) $(XSLTLIB) $(EXPATASCII3RDLIB) +APP2TARGET=HelpIndexer +APP2OBJS=\ + $(OBJ)$/helpindexer.obj +APP2RPATH = NONE +APP2STDLIBS+=$(SALLIB) $(PKGCONFIG_LIBS) + SHL1TARGET =$(LIBBASENAME)$(DLLPOSTFIX) SHL1LIBS= $(SLB)$/$(TARGET).lib .IF "$(COM)" == "MSC" @@ -93,26 +104,7 @@ SHL1USE_EXPORTS =ordinal DEF1NAME =$(SHL1TARGET) DEFLIB1NAME =$(TARGET) -JAVAFILES = \ - HelpIndexerTool.java \ - HelpFileDocument.java - - -JAVACLASSFILES = \ - $(CLASSDIR)$/$(PACKAGE)$/HelpIndexerTool.class \ - $(CLASSDIR)$/$(PACKAGE)$/HelpFileDocument.class -.IF "$(SYSTEM_LUCENE)" == "YES" -EXTRAJARFILES += $(LUCENE_CORE_JAR) $(LUCENE_ANALYZERS_JAR) -.ELSE -JARFILES += lucene-core-2.3.jar lucene-analyzers-2.3.jar -.ENDIF -JAVAFILES = $(subst,$(CLASSDIR)$/$(PACKAGE)$/, $(subst,.class,.java $(JAVACLASSFILES))) - -JARCLASSDIRS = $(PACKAGE)/* -JARTARGET = HelpIndexerTool.jar -JARCOMPRESS = TRUE - # --- Targets ------------------------------------------------------ .INCLUDE : target.mk |