1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/tools/convert_dict/dic_reader.h"
6
7 #include <stddef.h>
8
9 #include <algorithm>
10 #include <set>
11
12 #include "base/files/file_util.h"
13 #include "base/strings/string_util.h"
14 #include "chrome/tools/convert_dict/aff_reader.h"
15 #include "chrome/tools/convert_dict/hunspell_reader.h"
16
17 namespace convert_dict {
18
19 namespace {
20
// Word -> set of affix group IDs seen for that word. Keys are unique and the
// IDs of each word are de-duplicated by the set.
using WordSet = std::map<std::string, std::set<int>>;
23
SplitDicLine(const std::string & line,std::vector<std::string> * output)24 void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
25 // We split the line on a slash not preceded by a backslash. A slash at the
26 // beginning of the line is not a separator either.
27 size_t slash_index = line.size();
28 for (size_t i = 0; i < line.size(); i++) {
29 if (line[i] == '/' && i > 0 && line[i - 1] != '\\') {
30 slash_index = i;
31 break;
32 }
33 }
34
35 output->clear();
36
37 // Everything before the slash index is the first term. We also need to
38 // convert all escaped slashes ("\/" sequences) to regular slashes.
39 std::string word = line.substr(0, slash_index);
40 base::ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/");
41 output->push_back(word);
42
43 // Everything (if anything) after the slash is the second.
44 if (slash_index < line.size() - 1)
45 output->push_back(line.substr(slash_index + 1));
46 }
47
48 // This function reads words from a .dic file, or a .dic_delta file. Note that
49 // we read 'all' the words in the file, irrespective of the word count given
50 // in the first non empty line of a .dic file. Also note that, for a .dic_delta
51 // file, the first line actually does _not_ have the number of words. In order
52 // to control this, we use the |file_has_word_count_in_the_first_line|
53 // parameter to tell this method whether the first non empty line in the file
54 // contains the number of words or not. If it does, skip the first line. If it
55 // does not, then the first line contains a word.
PopulateWordSet(WordSet * word_set,FILE * file,AffReader * aff_reader,const char * file_type,const char * encoding,bool file_has_word_count_in_the_first_line)56 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
57 const char* file_type, const char* encoding,
58 bool file_has_word_count_in_the_first_line) {
59 int line_number = 0;
60 while (!feof(file)) {
61 std::string line = ReadLine(file);
62 line_number++;
63 StripComment(&line);
64 if (line.empty())
65 continue;
66
67 if (file_has_word_count_in_the_first_line) {
68 // Skip the first nonempty line, this is the line count. We don't bother
69 // with it and just read all the lines.
70 file_has_word_count_in_the_first_line = false;
71 continue;
72 }
73
74 std::vector<std::string> split;
75 SplitDicLine(line, &split);
76 if (split.empty() || split.size() > 2) {
77 printf("Line %d has extra slashes in the %s file\n", line_number,
78 file_type);
79 return false;
80 }
81
82 // The first part is the word, the second (optional) part is the affix. We
83 // always use UTF-8 as the encoding to simplify life.
84 std::string utf8word;
85 std::string encoding_string(encoding);
86 if (encoding_string == "UTF-8") {
87 utf8word = split[0];
88 } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
89 printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
90 line_number, encoding, file_type);
91 return false;
92 }
93
94 // We always convert the affix to an index. 0 means no affix.
95 int affix_index = 0;
96 if (split.size() == 2) {
97 // Got a rule, which is the stuff after the slash. The line may also have
98 // an optional term separated by a tab. This is the morphological
99 // description. We don't care about this (it is used in the tests to
100 // generate a nice dump), so we remove it.
101 size_t split1_tab_offset = split[1].find('\t');
102 if (split1_tab_offset != std::string::npos)
103 split[1] = split[1].substr(0, split1_tab_offset);
104
105 if (aff_reader->has_indexed_affixes())
106 affix_index = atoi(split[1].c_str());
107 else
108 affix_index = aff_reader->GetAFIndexForAFString(split[1]);
109 }
110
111 // Discard the morphological description if it is attached to the first
112 // token. (It is attached to the first token if a word doesn't have affix
113 // rules.)
114 size_t word_tab_offset = utf8word.find('\t');
115 if (word_tab_offset != std::string::npos)
116 utf8word = utf8word.substr(0, word_tab_offset);
117
118 auto found = word_set->find(utf8word);
119 std::set<int> affix_vector;
120 affix_vector.insert(affix_index);
121
122 if (found == word_set->end())
123 word_set->insert(std::make_pair(utf8word, affix_vector));
124 else
125 found->second.insert(affix_index);
126 }
127
128 return true;
129 }
130
131 } // namespace
132
DicReader(const base::FilePath & path)133 DicReader::DicReader(const base::FilePath& path) {
134 file_ = base::OpenFile(path, "r");
135
136 base::FilePath additional_path =
137 path.ReplaceExtension(FILE_PATH_LITERAL("dic_delta"));
138 additional_words_file_ = base::OpenFile(additional_path, "r");
139
140 if (additional_words_file_)
141 printf("Reading %" PRFilePath " ...\n", additional_path.value().c_str());
142 else
143 printf("%" PRFilePath " not found.\n", additional_path.value().c_str());
144 }
145
~DicReader()146 DicReader::~DicReader() {
147 if (file_)
148 base::CloseFile(file_);
149 if (additional_words_file_)
150 base::CloseFile(additional_words_file_);
151 }
152
Read(AffReader * aff_reader)153 bool DicReader::Read(AffReader* aff_reader) {
154 if (!file_)
155 return false;
156
157 WordSet word_set;
158
159 // Add words from the dic file to the word set.
160 // Note that the first line is the word count in the file.
161 if (!PopulateWordSet(&word_set, file_, aff_reader, "dic",
162 aff_reader->encoding(), true))
163 return false;
164
165 // Add words from the .dic_delta file to the word set, if it exists.
166 // The first line is the first word to add. Word count line is not present.
167 // NOTE: These additional words should be encoded as UTF-8.
168 if (additional_words_file_) {
169 PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
170 "UTF-8", false);
171 }
172 // Make sure the words are sorted, they may be unsorted in the input.
173 for (auto word = word_set.begin(); word != word_set.end(); ++word) {
174 std::vector<int> affixes;
175 for (auto aff = word->second.begin(); aff != word->second.end(); ++aff)
176 affixes.push_back(*aff);
177
178 // Double check that the affixes are sorted. This isn't strictly necessary
179 // but it's nice for the file to have a fixed layout.
180 std::sort(affixes.begin(), affixes.end());
181 std::reverse(affixes.begin(), affixes.end());
182 words_.push_back(std::make_pair(word->first, affixes));
183 }
184
185 // Double-check that the words are sorted.
186 std::sort(words_.begin(), words_.end());
187 return true;
188 }
189
190 } // namespace convert_dict
191