1 /*
2 * This file is part of StarDict.
3 *
4 * StarDict is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * StarDict is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with StarDict. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21
22 #include <algorithm>
23 #include <string>
24 #include "lib_dict_repair.h"
25 #include "lib_chars.h"
26
compare_article_data_by_key(const article_data_t & left,const article_data_t & right)27 static bool compare_article_data_by_key(const article_data_t& left, const article_data_t& right)
28 {
29 return 0 > stardict_strcmp(left.key.c_str(), right.key.c_str());
30 }
31
repair_text_data(std::string & text)32 static void repair_text_data(std::string& text)
33 {
34 if(!g_utf8_validate(text.c_str(), -1, NULL)) {
35 text = fix_utf8_str(text, 0);
36 }
37 typedef std::list<const char*> str_list_t;
38 str_list_t invalid_chars;
39 if(check_xml_string_chars(text.c_str(), invalid_chars)) {
40 std::string tmp;
41 fix_xml_string_chars(text.c_str(), tmp);
42 text = tmp;
43 }
44 }
45
repair_key(std::string & key)46 static void repair_key(std::string& key)
47 {
48 if(key.empty())
49 return;
50 repair_text_data(key);
51 if(key.length()>=(size_t)MAX_INDEX_KEY_SIZE) {
52 size_t wordlen = truncate_utf8_string(key.c_str(), key.length(), MAX_INDEX_KEY_SIZE-1);
53 key.resize(wordlen);
54 }
55 if(!key.empty()) {
56 if(g_ascii_isspace(key[0]) || g_ascii_isspace(key[key.length()-1])) {
57 const char* new_beg = NULL;
58 size_t new_len;
59 trim_spaces(key.c_str(), new_beg, new_len);
60 std::string new_key(new_beg, new_len);
61 key = new_key;
62 }
63 }
64 if (check_stardict_key_chars(key.c_str())) {
65 std::string tmp;
66 fix_stardict_key_chars(key.c_str(), tmp);
67 key = tmp;
68 }
69 }
70
71 /* return value:
72 * EXIT_FAILURE - unrecoverable error occurred, for example file read error.
73 * Errors related to article contents are do not lead to EXIT_FAILURE.
74 * In case the article contents is broken and cannot be recovered
75 * we clear the article key, that in practise mean that this article will ignored. */
repair_article(article_data_t & article,common_dict_t & norm_dict)76 static int repair_article(article_data_t& article, common_dict_t& norm_dict)
77 {
78 repair_key(article.key);
79 // We check that the key is blank after processing synonyms
80 // synonyms
81 {
82 std::vector<std::string> synonyms2;
83 synonyms2.reserve(article.synonyms.size());
84 for(std::vector<std::string>::iterator it=article.synonyms.begin(); it!=article.synonyms.end(); ++it) {
85 repair_key(*it);
86 if(it->empty())
87 continue;
88 if(*it == article.key)
89 continue;
90 // ignore duplicates
91 if(std::find(article.synonyms.begin(), it, *it) != it)
92 continue;
93 synonyms2.push_back(*it);
94 }
95 std::swap(article.synonyms, synonyms2);
96 }
97 if(article.key.empty()) {
98 if(article.synonyms.empty())
99 return EXIT_SUCCESS;
100 // if the key is empty, replace it with the first synonym
101 article.key = article.synonyms[0];
102 article.synonyms.erase(article.synonyms.begin());
103 }
104 // definitions
105 {
106 std::vector<article_def_t> defs2;
107 std::vector<char> buf;
108 defs2.reserve(article.definitions.size());
109 for(std::vector<article_def_t>::iterator it=article.definitions.begin(); it!=article.definitions.end(); ++it) {
110 if(it->type == 'r') {
111 if(it->resources.empty())
112 continue;
113 defs2.push_back(*it);
114 continue;
115 }
116 if(it->size == 0)
117 continue;
118 if(g_ascii_isupper(it->type)) {
119 defs2.push_back(*it);
120 continue;
121 }
122 if(g_ascii_islower(it->type)) {
123 buf.resize(it->size);
124 if(norm_dict.read_data(&buf[0], it->size, it->offset))
125 return EXIT_FAILURE;
126 std::string def(&buf[0], buf.size());
127 const std::string def_orig(def);
128 repair_text_data(def);
129 if(def.empty())
130 continue;
131 if(def != def_orig) {
132 size_t offset;
133 if(norm_dict.write_data(def.c_str(), def.length(), offset))
134 return EXIT_FAILURE;
135 it->size = def.length();
136 it->offset = offset;
137 }
138 defs2.push_back(*it);
139 continue;
140 }
141 // unknown type
142 }
143 std::swap(article.definitions, defs2);
144 if(article.definitions.empty()) {
145 article.key.clear();
146 return EXIT_SUCCESS;
147 }
148 }
149 return EXIT_SUCCESS;
150 }
151
repair_dict(common_dict_t & norm_dict)152 int repair_dict(common_dict_t& norm_dict)
153 {
154 for(std::vector<article_data_t>::iterator it=norm_dict.articles.begin(); it!=norm_dict.articles.end(); ++it)
155 if(repair_article(*it, norm_dict))
156 return EXIT_FAILURE;
157 std::sort(norm_dict.articles.begin(), norm_dict.articles.end(), compare_article_data_by_key);
158 // remove empty articles
159 article_data_t empty_article;
160 typedef std::vector<article_data_t>::iterator article_iter_t;
161 std::pair<article_iter_t, article_iter_t> range
162 = std::equal_range(norm_dict.articles.begin(), norm_dict.articles.end(), empty_article,
163 compare_article_data_by_key);
164 norm_dict.articles.erase(range.first, range.second);
165 if(norm_dict.articles.empty()) {
166 g_critical("Dictionary contains no articles");
167 return EXIT_FAILURE;
168 }
169 norm_dict.dict_info.set_wordcount(norm_dict.articles.size());
170 return EXIT_SUCCESS;
171 }
172