1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3  * This file is part of the LibreOffice project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  */
9 
10 #include <helpcompiler/HelpIndexer.hxx>
11 
12 #include <rtl/string.hxx>
13 #include <rtl/uri.hxx>
14 #include <o3tl/runtimetooustring.hxx>
15 #include <osl/file.hxx>
16 #include <osl/thread.h>
17 #include <memory>
18 
19 #include "LuceneHelper.hxx"
20 #include <CLucene.h>
21 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
22 
23 #if defined _WIN32
24 #include <o3tl/char16_t2wchar_t.hxx>
25 #include <prewin.h>
26 #include <postwin.h>
27 #endif
28 
29 using namespace lucene::document;
30 
HelpIndexer(OUString const & lang,OUString const & module,std::u16string_view srcDir,std::u16string_view outDir)31 HelpIndexer::HelpIndexer(OUString const &lang, OUString const &module,
32     std::u16string_view srcDir, std::u16string_view outDir)
33     : d_lang(lang), d_module(module)
34 {
35     d_indexDir = outDir + OUStringChar('/') + module + ".idxl";
36     d_captionDir = OUString::Concat(srcDir) + "/caption";
37     d_contentDir = OUString::Concat(srcDir) + "/content";
38 }
39 
#if defined _WIN32
namespace
{
// Invokes constructor with ustrPath converted to an 8-bit string and returns its result.
//
// The first attempt uses the path in the thread text encoding (ACP in case of Windows). If that
// attempt throws CLuceneError, the path may contain characters that are not representable in
// ACP: lucene only offers APIs taking 8-bit strings, which it passes to CRT library functions
// without conversion. As a workaround the call is repeated once with the short (8.3) name of
// the path, which should only contain ASCII characters. This does not help (the original long
// name is returned) if short file name creation is disabled in OS or volume settings.
//
// If the short name cannot be obtained, the original CLuceneError is rethrown; an exception
// thrown by the second attempt propagates to the caller.
template <class Constructor>
auto TryWithUnicodePathWorkaround(const OUString& ustrPath, const Constructor& constructor)
{
    const rtl_TextEncoding eEncoding = osl_getThreadTextEncoding();
    const OString sLongPath = OUStringToOString(ustrPath, eEncoding);
    try
    {
        return constructor(sLongPath);
    }
    catch (const CLuceneError&)
    {
        wchar_t aShortName[32767];
        const auto nLength
            = GetShortPathNameW(o3tl::toW(ustrPath.getStr()), aShortName, std::size(aShortName));
        if (nLength == 0)
            throw; // no short name available: report the original failure

        const OString sShortPath = OUStringToOString(o3tl::toU(aShortName), eEncoding);
        return constructor(sShortPath);
    }
}
}
#endif
71 
indexDocuments()72 bool HelpIndexer::indexDocuments()
73 {
74     if (!scanForFiles())
75         return false;
76 
77     try
78     {
79         OUString sLang = d_lang.getToken(0, '-');
80         bool bUseCJK = sLang == "ja" || sLang == "ko" || sLang == "zh";
81 
82         // Construct the analyzer appropriate for the given language
83         std::unique_ptr<lucene::analysis::Analyzer> analyzer;
84         if (bUseCJK)
85             analyzer.reset(new lucene::analysis::LanguageBasedAnalyzer(L"cjk"));
86         else
87             analyzer.reset(new lucene::analysis::standard::StandardAnalyzer());
88 
89         OUString ustrSystemPath;
90         osl::File::getSystemPathFromFileURL(d_indexDir, ustrSystemPath);
91 
92 #if defined _WIN32
93         // Make sure the path exists, or GetShortPathNameW (if attempted) will fail.
94         osl::Directory::createPath(d_indexDir);
95         auto writer = TryWithUnicodePathWorkaround(ustrSystemPath, [&analyzer](const OString& s) {
96             return std::make_unique<lucene::index::IndexWriter>(s.getStr(), analyzer.get(), true);
97         });
98 #else
99         OString indexDirStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
100         auto writer = std::make_unique<lucene::index::IndexWriter>(indexDirStr.getStr(),
101                                                                    analyzer.get(), true);
102 #endif
103 
104         //Double limit of tokens allowed, otherwise we'll get a too-many-tokens
105         //exception for ja help. Could alternative ignore the exception and get
106         //truncated results as per java-Lucene apparently
107         writer->setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH*2);
108 
109         // Index the identified help files
110         Document doc;
111         for (auto const& elem : d_files)
112         {
113             helpDocument(elem, &doc);
114             writer->addDocument(&doc);
115             doc.clear();
116         }
117 
118         // Optimize the index
119         writer->optimize();
120     }
121     catch (CLuceneError &e)
122     {
123         d_error = o3tl::runtimeToOUString(e.what());
124         return false;
125     }
126 
127     return true;
128 }
129 
130 
scanForFiles()131 bool HelpIndexer::scanForFiles() {
132     if (!scanForFiles(d_contentDir)) {
133         return false;
134     }
135     if (!scanForFiles(d_captionDir)) {
136         return false;
137     }
138     return true;
139 }
140 
scanForFiles(OUString const & path)141 bool HelpIndexer::scanForFiles(OUString const & path) {
142 
143     osl::Directory dir(path);
144     if (osl::FileBase::E_None != dir.open()) {
145         d_error = "Error reading directory " + path;
146         return false;
147     }
148 
149     osl::DirectoryItem item;
150     osl::FileStatus fileStatus(osl_FileStatus_Mask_FileName | osl_FileStatus_Mask_Type);
151     while (dir.getNextItem(item) == osl::FileBase::E_None) {
152         item.getFileStatus(fileStatus);
153         if (fileStatus.getFileType() == osl::FileStatus::Regular) {
154             d_files.insert(fileStatus.getFileName());
155         }
156     }
157 
158     return true;
159 }
160 
helpDocument(OUString const & fileName,Document * doc) const161 void HelpIndexer::helpDocument(OUString const & fileName, Document *doc) const {
162     // Add the help path as an indexed, untokenized field.
163 
164     OUString path = "#HLP#" + d_module + "/" + fileName;
165     std::vector<TCHAR> aPath(OUStringToTCHARVec(path));
166     doc->add(*_CLNEW Field(_T("path"), aPath.data(), int(Field::STORE_YES) | int(Field::INDEX_UNTOKENIZED)));
167 
168     OUString sEscapedFileName =
169         rtl::Uri::encode(fileName,
170         rtl_UriCharClassUric, rtl_UriEncodeIgnoreEscapes, RTL_TEXTENCODING_UTF8);
171 
172     // Add the caption as a field.
173     OUString captionPath = d_captionDir + "/" + sEscapedFileName;
174     doc->add(*_CLNEW Field(_T("caption"), helpFileReader(captionPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
175 
176     // Add the content as a field.
177     OUString contentPath = d_contentDir + "/" + sEscapedFileName;
178     doc->add(*_CLNEW Field(_T("content"), helpFileReader(contentPath), int(Field::STORE_NO) | int(Field::INDEX_TOKENIZED)));
179 }
180 
helpFileReader(OUString const & path)181 lucene::util::Reader *HelpIndexer::helpFileReader(OUString const & path) {
182     osl::File file(path);
183     if (osl::FileBase::E_None == file.open(osl_File_OpenFlag_Read)) {
184         file.close();
185         OUString ustrSystemPath;
186         osl::File::getSystemPathFromFileURL(path, ustrSystemPath);
187 #if defined _WIN32
188         return TryWithUnicodePathWorkaround(ustrSystemPath, [](const OString& s) {
189             return _CLNEW lucene::util::FileReader(s.getStr(), "UTF-8");
190         });
191 #else
192         OString pathStr = OUStringToOString(ustrSystemPath, osl_getThreadTextEncoding());
193         return _CLNEW lucene::util::FileReader(pathStr.getStr(), "UTF-8");
194 #endif
195     } else {
196         return _CLNEW lucene::util::StringReader(L"");
197     }
198 }
199 
200 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
201