1 /*
2  * This file is part of StarDict.
3  *
4  * StarDict is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * StarDict is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with StarDict.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #ifdef HAVE_CONFIG_H
19 #  include "config.h"
20 #endif
21 
22 #include <algorithm>
23 #include <string>
24 #include "lib_dict_repair.h"
25 #include "lib_chars.h"
26 
compare_article_data_by_key(const article_data_t & left,const article_data_t & right)27 static bool compare_article_data_by_key(const article_data_t& left, const article_data_t& right)
28 {
29 	return 0 > stardict_strcmp(left.key.c_str(), right.key.c_str());
30 }
31 
repair_text_data(std::string & text)32 static void repair_text_data(std::string& text)
33 {
34 	if(!g_utf8_validate(text.c_str(), -1, NULL)) {
35 		text = fix_utf8_str(text, 0);
36 	}
37 	typedef std::list<const char*> str_list_t;
38 	str_list_t invalid_chars;
39 	if(check_xml_string_chars(text.c_str(), invalid_chars)) {
40 		std::string tmp;
41 		fix_xml_string_chars(text.c_str(), tmp);
42 		text = tmp;
43 	}
44 }
45 
repair_key(std::string & key)46 static void repair_key(std::string& key)
47 {
48 	if(key.empty())
49 		return;
50 	repair_text_data(key);
51 	if(key.length()>=(size_t)MAX_INDEX_KEY_SIZE) {
52 		size_t wordlen = truncate_utf8_string(key.c_str(), key.length(), MAX_INDEX_KEY_SIZE-1);
53 		key.resize(wordlen);
54 	}
55 	if(!key.empty()) {
56 		if(g_ascii_isspace(key[0]) || g_ascii_isspace(key[key.length()-1])) {
57 			const char* new_beg = NULL;
58 			size_t new_len;
59 			trim_spaces(key.c_str(), new_beg, new_len);
60 			std::string new_key(new_beg, new_len);
61 			key = new_key;
62 		}
63 	}
64 	if (check_stardict_key_chars(key.c_str())) {
65 		std::string tmp;
66 		fix_stardict_key_chars(key.c_str(), tmp);
67 		key = tmp;
68 	}
69 }
70 
71 /* return value:
72  * EXIT_FAILURE - unrecoverable error occurred, for example file read error.
73  * Errors related to article contents are do not lead to EXIT_FAILURE.
74  * In case the article contents is broken and cannot be recovered
75  * we clear the article key, that in practise mean that this article will ignored. */
repair_article(article_data_t & article,common_dict_t & norm_dict)76 static int repair_article(article_data_t& article, common_dict_t& norm_dict)
77 {
78 	repair_key(article.key);
79 	// We check that the key is blank after processing synonyms
80 	// synonyms
81 	{
82 		std::vector<std::string> synonyms2;
83 		synonyms2.reserve(article.synonyms.size());
84 		for(std::vector<std::string>::iterator it=article.synonyms.begin(); it!=article.synonyms.end(); ++it) {
85 			repair_key(*it);
86 			if(it->empty())
87 				continue;
88 			if(*it == article.key)
89 				continue;
90 			// ignore duplicates
91 			if(std::find(article.synonyms.begin(), it, *it) != it)
92 				continue;
93 			synonyms2.push_back(*it);
94 		}
95 		std::swap(article.synonyms, synonyms2);
96 	}
97 	if(article.key.empty()) {
98 		if(article.synonyms.empty())
99 			return EXIT_SUCCESS;
100 		// if the key is empty, replace it with the first synonym
101 		article.key = article.synonyms[0];
102 		article.synonyms.erase(article.synonyms.begin());
103 	}
104 	// definitions
105 	{
106 		std::vector<article_def_t> defs2;
107 		std::vector<char> buf;
108 		defs2.reserve(article.definitions.size());
109 		for(std::vector<article_def_t>::iterator it=article.definitions.begin(); it!=article.definitions.end(); ++it) {
110 			if(it->type == 'r') {
111 				if(it->resources.empty())
112 					continue;
113 				defs2.push_back(*it);
114 				continue;
115 			}
116 			if(it->size == 0)
117 				continue;
118 			if(g_ascii_isupper(it->type)) {
119 				defs2.push_back(*it);
120 				continue;
121 			}
122 			if(g_ascii_islower(it->type)) {
123 				buf.resize(it->size);
124 				if(norm_dict.read_data(&buf[0], it->size, it->offset))
125 					return EXIT_FAILURE;
126 				std::string def(&buf[0], buf.size());
127 				const std::string def_orig(def);
128 				repair_text_data(def);
129 				if(def.empty())
130 					continue;
131 				if(def != def_orig) {
132 					size_t offset;
133 					if(norm_dict.write_data(def.c_str(), def.length(), offset))
134 						return EXIT_FAILURE;
135 					it->size = def.length();
136 					it->offset = offset;
137 				}
138 				defs2.push_back(*it);
139 				continue;
140 			}
141 			// unknown type
142 		}
143 		std::swap(article.definitions, defs2);
144 		if(article.definitions.empty()) {
145 			article.key.clear();
146 			return EXIT_SUCCESS;
147 		}
148 	}
149 	return EXIT_SUCCESS;
150 }
151 
repair_dict(common_dict_t & norm_dict)152 int repair_dict(common_dict_t& norm_dict)
153 {
154 	for(std::vector<article_data_t>::iterator it=norm_dict.articles.begin(); it!=norm_dict.articles.end(); ++it)
155 		if(repair_article(*it, norm_dict))
156 			return EXIT_FAILURE;
157 	std::sort(norm_dict.articles.begin(), norm_dict.articles.end(), compare_article_data_by_key);
158 	// remove empty articles
159 	article_data_t empty_article;
160 	typedef std::vector<article_data_t>::iterator article_iter_t;
161 	std::pair<article_iter_t, article_iter_t> range
162 		= std::equal_range(norm_dict.articles.begin(), norm_dict.articles.end(), empty_article,
163 			compare_article_data_by_key);
164 	norm_dict.articles.erase(range.first, range.second);
165 	if(norm_dict.articles.empty()) {
166 		g_critical("Dictionary contains no articles");
167 		return EXIT_FAILURE;
168 	}
169 	norm_dict.dict_info.set_wordcount(norm_dict.articles.size());
170 	return EXIT_SUCCESS;
171 }
172