// Source path: root/l10ntools/source/help/HelpIndexer.cxx
// Blob: fdae9e63d273aae102525a7b615646f7d2ac3084
#include <l10ntools/HelpIndexer.hxx>
#include "LuceneHelper.hxx"
#include <CLucene/analysis/LanguageBasedAnalyzer.h>

#include <rtl/string.hxx>
#include <osl/file.hxx>

#include <algorithm>

using namespace lucene::document;

/**
 * Records the parameters of an indexing run. The constructor only copies its
 * arguments into members; no directory is touched until indexDocuments().
 *
 * @param lang        language tag; indexDocuments() inspects the part before
 *                    the first '-' to choose the analyzer
 * @param module      help module name, used as part of each document's "path" field
 * @param captionDir  directory holding the caption files
 * @param contentDir  directory holding the content files
 * @param indexDir    location handed to the Lucene IndexWriter
 */
HelpIndexer::HelpIndexer(
	rtl::OUString const &lang,
	rtl::OUString const &module,
	rtl::OUString const &captionDir,
	rtl::OUString const &contentDir,
	rtl::OUString const &indexDir)
	: d_lang(lang)
	, d_module(module)
	, d_captionDir(captionDir)
	, d_contentDir(contentDir)
	, d_indexDir(indexDir)
	, d_error()   // empty until a step records an error message
	, d_files()   // filled by scanForFiles()
{
}

bool HelpIndexer::indexDocuments() {
	if (!scanForFiles()) {
		return false;
	}

	rtl::OUString sLang = d_lang.getToken(0, '-');
	bool bUseCJK =
		sLang.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("ja")) ||
		sLang.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("ko")) ||
		sLang.equalsAsciiL(RTL_CONSTASCII_STRINGPARAM("zh"));

	// Construct the analyzer appropriate for the given language
	lucene::analysis::Analyzer *analyzer = (
		bUseCJK ?
		(lucene::analysis::Analyzer*)new lucene::analysis::LanguageBasedAnalyzer(L"cjk") :
		(lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());

	rtl::OString indexDirStr;
	d_indexDir.convertToString(&indexDirStr, RTL_TEXTENCODING_ASCII_US, 0);
	lucene::index::IndexWriter writer(indexDirStr.getStr(), analyzer, true);

	// Index the identified help files
	Document doc;
	for (std::set<rtl::OUString>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
		doc.clear();
		if (!helpDocument(*i, &doc)) {
			delete analyzer;
			return false;
		}
		writer.addDocument(&doc);
	}

	// Optimize the index
	writer.optimize();

	delete analyzer;
	return true;
}

/**
 * Gives access to the stored error text.
 *
 * @return a reference to d_error; it stays empty unless one of the indexing
 *         steps has recorded a message.
 */
rtl::OUString const & HelpIndexer::getErrorMessage()
{
	return this->d_error;
}

/**
 * Collects file names from the content directory and then from the caption
 * directory into d_files.
 *
 * @return true only if both scans report success. The caption directory is
 *         not scanned when the content directory scan fails.
 */
bool HelpIndexer::scanForFiles() {
	// && short-circuits, so the order and the early exit match two guarded calls.
	return scanForFiles(d_contentDir) && scanForFiles(d_captionDir);
}

/**
 * Adds the name of every regular file directly inside the given directory to
 * d_files. Sub-directories are not descended into.
 *
 * @param path  directory to scan, in the form expected by osl::Directory
 * @return true if the directory could be opened and was scanned; false if it
 *         could not be opened, in which case d_error holds a message.
 */
bool HelpIndexer::scanForFiles(rtl::OUString const & path) {
	osl::Directory dir(path);
	if (osl::FileBase::E_None != dir.open()) {
		d_error = rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("Error reading directory ")) + path;
		// Report the failure so that indexDocuments() stops instead of
		// silently building an index from an incomplete file list.
		return false;
	}

	osl::DirectoryItem item;
	osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
	while (dir.getNextItem(item) == osl::FileBase::E_None) {
		// The status object is only filled in by getFileStatus(); without
		// this call the type and name queried below would never be set.
		if (item.getFileStatus(fileStatus) != osl::FileBase::E_None) {
			continue; // entry cannot be inspected, skip it
		}
		if (fileStatus.getFileType() == osl::FileStatus::Regular) {
			d_files.insert(fileStatus.getFileName());
		}
	}

	return true;
}

/**
 * Fills a Lucene document with the three fields describing one help file:
 *   - "path":    "#HLP#" + module + "/" + fileName, stored and indexed untokenized
 *   - "caption": reader over captionDir + "/" + fileName, tokenized, not stored
 *   - "content": reader over contentDir + "/" + fileName, tokenized, not stored
 *
 * @param fileName  bare file name as collected by scanForFiles()
 * @param doc       document that receives the fields
 * @return always true in the current implementation
 */
bool HelpIndexer::helpDocument(rtl::OUString const & fileName, Document *doc) {
	rtl::OUString const slash(RTL_CONSTASCII_USTRINGPARAM("/"));

	// Help path: indexed, untokenized field.
	rtl::OUString const helpPath =
		rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("#HLP#")) + d_module + slash + fileName;
	std::vector<TCHAR> helpPathChars(OUStringToTCHARVec(helpPath));
	Field *pathField = new Field(
		_T("path"), &helpPathChars[0], Field::STORE_YES | Field::INDEX_UNTOKENIZED);
	doc->add(*pathField);

	// FIXME (applies to both reader-backed fields below): does the Document
	// take responsibility for the reader, or does it have to be freed somewhere?

	// Caption field.
	rtl::OUString const captionFile = d_captionDir + slash + fileName;
	Field *captionField = new Field(
		_T("caption"), helpFileReader(captionFile), Field::STORE_NO | Field::INDEX_TOKENIZED);
	doc->add(*captionField);

	// Content field.
	rtl::OUString const contentFile = d_contentDir + slash + fileName;
	Field *contentField = new Field(
		_T("content"), helpFileReader(contentFile), Field::STORE_NO | Field::INDEX_TOKENIZED);
	doc->add(*contentField);

	return true;
}

/**
 * Creates a reader for the text of one help file.
 *
 * @param path  location of the file, in the form accepted by osl::File
 * @return a newly allocated reader: a UTF-8 FileReader when the file can be
 *         opened for reading, otherwise a StringReader over the empty string
 *         so that a missing caption or content file yields an empty field.
 */
lucene::util::Reader *HelpIndexer::helpFileReader(rtl::OUString const & path) {
	// Probe with osl::File first; the handle is only used as a readability check.
	osl::File file(path);
	if (osl::FileBase::E_None != file.open(osl_File_OpenFlag_Read)) {
		return new lucene::util::StringReader(_T(""));
	}
	file.close();

	// osl::File addresses files by file URL, whereas the Lucene FileReader is
	// given a native path. Convert so that both refer to the same file; if the
	// string cannot be converted, fall back to passing it through unchanged.
	rtl::OUString systemPath;
	if (osl::FileBase::E_None != osl::FileBase::getSystemPathFromFileURL(path, systemPath)) {
		systemPath = path;
	}

	rtl::OString pathStr;
	systemPath.convertToString(&pathStr, RTL_TEXTENCODING_ASCII_US, 0); // FIXME: path encoding?
	return new lucene::util::FileReader(pathStr.getStr(), "UTF-8");
}