1 /*
2 * This file is part of StarDict.
3 *
4 * StarDict is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * StarDict is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with StarDict. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21
22 #include <libxml/xmlreader.h>
23 #include <libxml/xinclude.h>
24 #include <cstring>
25 #include <iostream>
26 #include <vector>
27 #include <algorithm>
28 #include "lib_stardict_text2bin.h"
29 #include "ifo_file.h"
30 #include "libcommon.h"
31 #include "lib_chars.h"
32 #include "lib_common_dict.h"
33 #include "lib_textual_dict_parser.h"
34 #include "lib_dict_verify.h"
35
/* Format strings of the diagnostics emitted by the parser.
 * In every string that starts with "%u:" the first argument is a line number
 * of the xml file being processed; the remaining arguments follow the
 * conversion specifiers of the particular string. */

/* Messages reported as errors or warnings. */
#define parser_err \
	"Parser error."
#define duplicate_elem_err \
	"%u: duplicate element %s."
#define attribute_value_not_specified_err \
	"%u: attribute '%s' is not specified."
#define attribute_value_unknown_err \
	"%u: unknown value of the attribute '%s': %s."
#define element_blank_value \
	"%u: element '%s' has blank value."
#define attribute_content_type_absent_err \
	"%u: Content type attribute is not specified neither for the definition element, " \
	"nor for any element above."
#define attribute_value_empty_err \
	"%u: value of the attribute '%s' is an empty string."
#define save_into_temp_file_err \
	"Unable to save into a temporary file."
#define duplicate_key_err \
	"%u: duplicate key: %s."
#define duplicate_synonym_err \
	"%u: duplicate synonym: %s."
#define incorrect_dir_sep_err \
	"%u: Incorrect directory separator, use '/' char."
#define resource_list_empty_err \
	"%u: empty resource list."
#define article_key_list_empty_err \
	"%u: article: empty key list."
#define article_definition_list_empty_err \
	"%u: article: empty definition list."
#define missing_info_section_err \
	"missing info section."
#define article_list_empty \
	"article list empty."
#define unknown_content_type_str_err \
	"%u: unknown content type '%s'. It must be one ASCII char."
#define unknown_content_type_chr_err \
	"%u: unknown content type '%c'."
#define content_type_r_in_definition_err \
	"%u: use 'definition-r' element for content type 'r'."
#define element_invalid_utf8_value_err \
	"%u: element %s. Invalid utf-8 text value: %s."
#define article_key_forbidden_chars_err \
	"%u: key contains forbidden chars: %s."
#define article_synonym_forbidden_chars_err \
	"%u: synonym contains forbidden chars: %s."
#define normalization_failed_err \
	"%u: utf8 normalization failed. String: %s."
/* Arguments: line number, the offending string, its length, the maximum length. */
#define article_key_long_err \
	"%u: Key is too long: %s. Key length = %u, maximum length = %d."
#define article_synonym_long_err \
	"%u: Synonym is too long: %s. Synonym length = %u, maximum length = %d."
/* The tail of this message, invalid_chars_in_textual_data_msg, is not defined
 * in this file; it is appended by string literal concatenation. */
#define element_invalid_text_err \
	"%u: element '%s'. Invalid text: ''\n%s\n'''\n" \
	invalid_chars_in_textual_data_msg

/* Informational messages. */
#define missing_req_info_item_msg \
	"missing required info item %s."
#define parse_xml_done_msg \
	"xml parsing done."
#define allow_unknown_content_type_msg \
	"OK, allow unknown content type."
#define xinclude_process_msg \
	"%u: processing xinclude: %s"
99
namespace xml {
	/* ResourceWrapper needs an address of a function with external linkage.
	 * xmlFree is a global variable - a pointer to a function, so it cannot be
	 * used as the template argument directly.
	 * This wrapper simply forwards p to xmlFree. */
	void stardict_xmlFree(void* p)
	{
		xmlFree(p);
	}
	/* Wrapper for xmlChar strings returned by libxml2 functions.
	 * NOTE(review): ResourceWrapper is declared elsewhere; presumably it calls
	 * stardict_xmlFree on the held pointer when the wrapper is destroyed - confirm. */
	typedef ResourceWrapper<xmlChar, void*, void, stardict_xmlFree> CharStr;
}
110
note_type_str(xmlReaderTypes type)111 static const char* note_type_str(xmlReaderTypes type)
112 {
113 return note_type_str(type);
114 }
115
/* Return the printable name of a reader node type given as an integer,
 * for example the result of xmlTextReaderNodeType.
 *
 * parameters:
 * type - node type code; the table index equals the code
 *
 * return value:
 * the name of the type, or "ERROR" when type is outside of the table
 * (this includes the negative value the reader uses to signal an error). */
static const char* note_type_str(int type)
{
	/* The table is built once; the order of the names must follow the numeric
	 * values of the node types, "NONE" being 0 and "XML_DECLARATION" 17. */
	static const char* const type_names[] = {
		"NONE",
		"ELEMENT",
		"ATTRIBUTE",
		"TEXT",
		"CDATA",
		"ENTITY_REFERENCE",
		"ENTITY",
		"PROCESSING_INSTRUCTION",
		"COMMENT",
		"DOCUMENT",
		"DOCUMENT_TYPE",
		"DOCUMENT_FRAGMENT",
		"NOTATION",
		"WHITESPACE",
		"SIGNIFICANT_WHITESPACE",
		"END_ELEMENT",
		"END_ENTITY",
		"XML_DECLARATION"
	};
	/* The bound is derived from the table itself, so the check cannot get out
	 * of sync with the number of names. */
	const int type_count = static_cast<int>(sizeof(type_names) / sizeof(type_names[0]));
	if(type < 0 || type >= type_count)
		return "ERROR";
	return type_names[type];
}
142
#if 0
/* Debugging aid, compiled out.
 * Prints one line describing the node the reader is positioned on:
 * line number, depth, node type name, node name, the "is empty element" flag,
 * the "has attributes" flag, the attribute count and, when the node has
 * a value, the value in single quotes. */
static void processNode(xmlTextReaderPtr reader) {
	xmlChar *name, *value;

	name = xmlTextReaderName(reader);
	if (name == NULL)
		name = xmlStrdup(BAD_CAST "--"); // placeholder printed for a node without a name
	value = xmlTextReaderValue(reader);
	xmlNodePtr node = xmlTextReaderCurrentNode(reader);

	// no trailing new line here: it is printed below, after the optional value
	printf("%hu: %d %s %s %d %d %d",
		node ? node->line : 0,
		xmlTextReaderDepth(reader),
		note_type_str(xmlTextReaderNodeType(reader)),
		name,
		xmlTextReaderIsEmptyElement(reader),
		xmlTextReaderHasAttributes(reader),
		xmlTextReaderAttributeCount(reader)
		);
	xmlFree(name);
	if (value == NULL)
		printf("\n");
	else {
		printf(" '%s'\n", value);
		xmlFree(value);
	}
}
#endif
171
/* Implementation details
 *
 * Most of the elements have a special method that reads that particular element.
 * For example, we have read_article, read_contents, etc.
 * The method is invoked after the start tag of the element has been read.
 * For example, after reading the article start tag we invoke the read_article method.
 * The method in question must read its element completely and return a logical result:
 * EXIT_FAILURE or EXIT_SUCCESS.
 * A method may correct small errors that do not affect the rest of the document.
 * If a method returns EXIT_SUCCESS, the following elements can be read without problems.
 * If a method returns EXIT_FAILURE, we should terminate the parsing process immediately.
 * */
/* Reads a textual dictionary (an xml file) with the libxml2 xmlTextReader
 * interface and stores the result in a common_dict_t object supplied by
 * the caller of parse(). */
class textual_dict_parser_t
{
public:
	textual_dict_parser_t(void);
	/* Parse the file xmlfilename into norm_dict.
	 * Return value: EXIT_FAILURE or EXIT_SUCCESS */
	int parse(const std::string& xmlfilename, common_dict_t* norm_dict);
	/* true removes XML_PARSE_XINCLUDE from the reader options, false adds it. */
	void set_custom_include(bool b);
	/* true when XML_PARSE_XINCLUDE is not among the reader options. */
	bool get_custom_include(void) const;
private:
	/* Negative result codes of the read_xml_* methods.
	 * read_xml_element returns a non-negative index into its exp_elems
	 * argument when an expected element was read. */
	enum ReadResult { rrEOF = -1, rrError = -2, rrEndElement = -3};
	typedef unsigned int line_number_t;
	/* creation and destruction of the reader, see parse() */
	int prepare_parser(void);
	void close_parser(void);
	void remove_reader(void);
	/* one method per element of the document, see "Implementation details" */
	int read_all(void);
	int read_stardict(void);
	int read_info(void);
	int read_info_items(void);
	/* elem - element name used in messages,
	 * is_item - tells whether the item is already set in DictInfo,
	 * set_item - stores the value that was read */
	int read_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
		void (DictInfo:: *set_item)(const std::string& ));
	int read_contents(void);
	int read_article(void);
	int read_contents_r(article_def_t& article_def);
	/* read the 'type' attribute of the current element */
	TLoadResult read_att_type(char& type);
	/* checks of the info section performed after it was read */
	int check_blank_info_items(void);
	int check_new_line_info_items(void);
	int check_mandatory_info_items(void);
	/* checks of a key and of a synonym of an article;
	 * article_item_line_number is the line used in the error messages */
	int check_article_key(const std::string& value, line_number_t article_item_line_number);
	int check_article_synonym(const std::string& value, line_number_t article_item_line_number);
	/* low level reading; exp_elems is a NULL-terminated array of element names,
	 * open_elem is the name of the element that is open at the moment */
	int read_xml_element(const char** exp_elems, const char* open_elem);
	int read_xml_end_element(const char* open_elem);
	int read_xml_text_item(const char* elem, std::string& value);
	int read_xml_text(const char* open_elem);
	int read_xml_attribute(const char* att_name, std::string& value);
	void check_blank_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
		const std::string& (DictInfo:: *get_item)(void) const, void (DictInfo:: *unset_item)(void));
	int check_new_line_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
		const std::string& (DictInfo:: *get_item)(void) const);
	/* NOTE(review): defined outside of this part of the file; presumably returns
	 * the content type inherited from the elements on elem_stack, 0 when
	 * there is none - confirm. */
	char get_effective_content_type(void) const;
	/* reporting of unexpected input; the definitions are outside of this
	 * part of the file */
	void unexpected_node_type_element(const char** exp_elems);
	void unexpected_node_type_text(void);
	void unexpected_node_type_end_element(const char* open_elem);
	void unexpected_element(const char** exp_elems);
	void unexpected_eof(const char** exp_elems);
	void unexpected_eof(void);
	void unexpected_end_element(const char** exp_elems, const char* open_elem);
	void unmatched_end_element(const char* open_elem, const char* end_elem);
	void unexpected_empty_element(void);
	void unexpected_non_empty_element(void);
	line_number_t get_line_number(void);
	/* advance the reader; callers treat 0 as end of file and
	 * a negative value as an error */
	int next_node(void);
	/* file being parsed, set by parse() and cleared by close_parser() */
	std::string xmlfilename;
	/* both are NULL when there is no open reader, see remove_reader() */
	xmlTextReaderPtr xml_reader;
	xmlXIncludeCtxtPtr xincctxt;
	static const int default_reader_options =
		XML_PARSE_NOENT | XML_PARSE_NOBLANKS | XML_PARSE_NOCDATA | XML_PARSE_XINCLUDE;
	/* options passed to xmlReaderForFile */
	int reader_options;
	/* An item of elem_stack: name of an open element and its content type. */
	struct elem_data_t
	{
		elem_data_t(const char* name, char content_type = 0)
		:
			name(name),
			content_type(content_type)
		{

		}
		const char* name;
		char content_type;
	private:
		/* declared only, an item cannot be created without a name */
		elem_data_t(void);
	};
	/* stack of elements
	 * Each time new element start tag is read, name of the read element is added to the stack.
	 * When an end element is read, the top element of the stack is removed.
	 * Some elements (article, contents, definition) may have a content type attribute.
	 * The value of this attribute is one char that is assigned to content_type field.
	 * For other elements and when the attribute in question is not specified, content_type field
	 * must be = 0. */
	std::vector<elem_data_t> elem_stack;
	/* destination of the parsed data, not owned; valid only inside parse() */
	common_dict_t* norm_dict;
};
264
textual_dict_parser_t(void)265 textual_dict_parser_t::textual_dict_parser_t(void)
266 :
267 xml_reader(NULL),
268 xincctxt(NULL),
269 reader_options(default_reader_options),
270 norm_dict(NULL)
271 {
272
273 }
274
275 /* Return value:
276 * EXIT_FAILURE or EXIT_SUCCESS */
parse(const std::string & xmlfilename,common_dict_t * norm_dict)277 int textual_dict_parser_t::parse(const std::string& xmlfilename,
278 common_dict_t* norm_dict)
279 {
280 this->xmlfilename = xmlfilename;
281 this->norm_dict = norm_dict;
282
283 auto_executor_t<textual_dict_parser_t> auto_exec(*this, &textual_dict_parser_t::close_parser);
284 if(prepare_parser())
285 return EXIT_FAILURE;
286 g_message("processing %s...", xmlfilename.c_str());
287 if(read_all())
288 return EXIT_FAILURE;
289 #if 0
290 int ret = next_node();
291 while (ret == 1) {
292 processNode(xml_reader);
293 ret = next_node();
294 }
295 if (ret != 0) {
296 g_critical("%s : failed to parse.", xmlfilename.c_str());
297 return EXIT_FAILURE;
298 }
299 #endif
300 g_message(parse_xml_done_msg);
301 return EXIT_SUCCESS;
302 }
303
set_custom_include(bool b)304 void textual_dict_parser_t::set_custom_include(bool b)
305 {
306 if(b)
307 reader_options &= ~XML_PARSE_XINCLUDE;
308 else
309 reader_options |= XML_PARSE_XINCLUDE;
310 }
311
get_custom_include(void) const312 bool textual_dict_parser_t::get_custom_include(void) const
313 {
314 return (reader_options & XML_PARSE_XINCLUDE) == 0;
315 }
316
317 /* Return value:
318 * EXIT_FAILURE or EXIT_SUCCESS */
prepare_parser(void)319 int textual_dict_parser_t::prepare_parser(void)
320 {
321 remove_reader();
322 xml_reader = xmlReaderForFile(xmlfilename.c_str(), NULL,
323 reader_options);
324 if (!xml_reader) {
325 g_critical(open_read_file_err, xmlfilename.c_str(), "");
326 return EXIT_FAILURE;
327 }
328 if(!norm_dict) {
329 g_critical("norm_dict == NULL.");
330 return EXIT_FAILURE;
331 }
332 if(norm_dict->reset())
333 return EXIT_FAILURE;
334 return EXIT_SUCCESS;
335 }
336
close_parser(void)337 void textual_dict_parser_t::close_parser(void)
338 {
339 remove_reader();
340 norm_dict = NULL;
341 xmlfilename.clear();
342 }
343
remove_reader(void)344 void textual_dict_parser_t::remove_reader(void)
345 {
346 if(xml_reader)
347 xmlFreeTextReader(xml_reader);
348 xml_reader = NULL;
349 if(xincctxt)
350 xmlXIncludeFreeContext(xincctxt);
351 xincctxt = NULL;
352 }
353
354 /* Return value:
355 * EXIT_FAILURE or EXIT_SUCCESS */
read_all(void)356 int textual_dict_parser_t::read_all(void)
357 {
358 const char* const open_elem = "";
359 static const char* exp_elems[] = {
360 "stardict",
361 NULL
362 };
363 int ret = read_xml_element(exp_elems, open_elem);
364 if(ret == rrEOF) {
365 unexpected_eof(exp_elems);
366 return EXIT_FAILURE;
367 }
368 if(ret == rrError)
369 return EXIT_FAILURE;
370 if(ret == rrEndElement) {
371 unexpected_end_element(exp_elems, open_elem);
372 return EXIT_FAILURE;
373 }
374 if(xmlTextReaderIsEmptyElement(xml_reader)) {
375 unexpected_empty_element();
376 return EXIT_FAILURE;
377 }
378 g_assert(ret == 0);
379 elem_stack.push_back(elem_data_t(exp_elems[0]));
380 if(read_stardict())
381 return EXIT_FAILURE;
382 return EXIT_SUCCESS;
383 }
384
385 /* read root stardict element
386 * Return value:
387 * EXIT_FAILURE or EXIT_SUCCESS */
read_stardict(void)388 int textual_dict_parser_t::read_stardict(void)
389 {
390 static const char* exp_elems[] = {
391 "info",
392 "contents",
393 "article",
394 NULL
395 };
396 bool have_info = false;
397 while(true) {
398 int ret = read_xml_element(exp_elems, elem_stack.back().name);
399 if(ret == rrEOF) {
400 unexpected_eof(exp_elems);
401 return EXIT_FAILURE;
402 }
403 if(ret == rrError)
404 return EXIT_FAILURE;
405 if(ret == rrEndElement) {
406 elem_stack.pop_back();
407 break;
408 }
409 if(xmlTextReaderIsEmptyElement(xml_reader)) {
410 unexpected_empty_element();
411 return EXIT_FAILURE;
412 }
413 elem_stack.push_back(elem_data_t(exp_elems[ret]));
414 switch(ret) {
415 case 0:
416 if(have_info) {
417 g_critical(duplicate_elem_err, get_line_number(), exp_elems[ret]);
418 return EXIT_FAILURE;
419 }
420 if(read_info())
421 return EXIT_FAILURE;
422 have_info = true;
423 break;
424 case 1:
425 if(read_contents())
426 return EXIT_FAILURE;
427 break;
428 case 2:
429 if(read_article())
430 return EXIT_FAILURE;
431 break;
432 default:
433 g_assert_not_reached();
434 }
435 }
436 if(!have_info) {
437 g_critical(missing_info_section_err);
438 return EXIT_FAILURE;
439 }
440 if(norm_dict->articles.empty()) {
441 g_critical(article_list_empty);
442 return EXIT_FAILURE;
443 }
444 g_message("total articles: %lu.", static_cast<unsigned long>(norm_dict->articles.size()));
445 return EXIT_SUCCESS;
446 }
447
448 /* Return value:
449 * EXIT_FAILURE or EXIT_SUCCESS */
read_info(void)450 int textual_dict_parser_t::read_info(void)
451 {
452 norm_dict->dict_info.clear();
453 if(read_info_items())
454 return EXIT_FAILURE;
455 if(check_blank_info_items())
456 return EXIT_FAILURE;
457 if(check_new_line_info_items())
458 return EXIT_FAILURE;
459 if(check_mandatory_info_items())
460 return EXIT_FAILURE;
461 return EXIT_SUCCESS;
462 }
463
464 /* Return value:
465 * EXIT_FAILURE or EXIT_SUCCESS */
read_info_items(void)466 int textual_dict_parser_t::read_info_items(void)
467 {
468 static const char* exp_elems[] = {
469 "version",
470 "bookname",
471 "author",
472 "email",
473 "website",
474 "description",
475 "date",
476 "dicttype",
477 NULL
478 };
479 while(true) {
480 int ret = read_xml_element(exp_elems, elem_stack.back().name);
481 if(ret == rrEOF) {
482 unexpected_eof(exp_elems);
483 return EXIT_FAILURE;
484 }
485 if(ret == rrError)
486 return EXIT_FAILURE;
487 if(ret == rrEndElement) {
488 elem_stack.pop_back();
489 break;
490 }
491 if(xmlTextReaderIsEmptyElement(xml_reader)) {
492 unexpected_empty_element();
493 return EXIT_FAILURE;
494 }
495 // do not change elem_stack for leaf elements
496 switch(ret) {
497 case 0:
498 if(read_info_item("version", &DictInfo::is_version, &DictInfo::set_version))
499 return EXIT_FAILURE;
500 break;
501 case 1:
502 if(read_info_item("bookname", &DictInfo::is_bookname, &DictInfo::set_bookname))
503 return EXIT_FAILURE;
504 break;
505 case 2:
506 if(read_info_item("author", &DictInfo::is_author, &DictInfo::set_author))
507 return EXIT_FAILURE;
508 break;
509 case 3:
510 if(read_info_item("email", &DictInfo::is_email, &DictInfo::set_email))
511 return EXIT_FAILURE;
512 break;
513 case 4:
514 if(read_info_item("website", &DictInfo::is_website, &DictInfo::set_website))
515 return EXIT_FAILURE;
516 break;
517 case 5:
518 if(read_info_item("description", &DictInfo::is_description, &DictInfo::set_description))
519 return EXIT_FAILURE;
520 break;
521 case 6:
522 if(read_info_item("date", &DictInfo::is_date, &DictInfo::set_date))
523 return EXIT_FAILURE;
524 break;
525 case 7:
526 if(read_info_item("dicttype", &DictInfo::is_dicttype, &DictInfo::set_dicttype))
527 return EXIT_FAILURE;
528 break;
529 default:
530 g_assert_not_reached();
531 }
532 }
533 return EXIT_SUCCESS;
534 }
535
536 /* Return value:
537 * EXIT_FAILURE or EXIT_SUCCESS */
read_info_item(const char * elem,bool (DictInfo::* is_item)(void)const,void (DictInfo::* set_item)(const std::string &))538 int textual_dict_parser_t::read_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
539 void (DictInfo:: *set_item)(const std::string& ))
540 {
541 bool ignore = false;
542 if((norm_dict->dict_info.*is_item)()) {
543 g_warning(duplicate_elem_err, get_line_number(), elem);
544 g_message(fixed_ignore_msg);
545 ignore = true;
546 }
547 std::string value;
548 if(read_xml_text_item(elem, value))
549 return EXIT_FAILURE;
550 if(value.empty()) {
551 g_warning(element_blank_value, get_line_number(), elem);
552 g_message(fixed_ignore_msg);
553 return EXIT_SUCCESS;
554 }
555 if(!ignore) {
556 (norm_dict->dict_info.*set_item)(value.c_str());
557 }
558 return EXIT_SUCCESS;
559 }
560
561 /* Return value:
562 * EXIT_FAILURE or EXIT_SUCCESS */
read_contents(void)563 int textual_dict_parser_t::read_contents(void)
564 {
565 {
566 char content_type;
567 switch(read_att_type(content_type))
568 {
569 case lrOK:
570 elem_stack.back().content_type = content_type;
571 break;
572 case lrError:
573 return EXIT_FAILURE;
574 case lrNotFound:
575 break;
576 }
577 }
578 static const char* exp_elems[] = {
579 "contents",
580 "article",
581 NULL
582 };
583 while(true) {
584 int ret = read_xml_element(exp_elems, elem_stack.back().name);
585 if(ret == rrEOF) {
586 unexpected_eof(exp_elems);
587 return EXIT_FAILURE;
588 }
589 if(ret == rrError)
590 return EXIT_FAILURE;
591 if(ret == rrEndElement) {
592 elem_stack.pop_back();
593 break;
594 }
595 if(xmlTextReaderIsEmptyElement(xml_reader)) {
596 unexpected_empty_element();
597 return EXIT_FAILURE;
598 }
599 elem_stack.push_back(elem_data_t(exp_elems[ret]));
600 switch(ret) {
601 case 0:
602 if(read_contents())
603 return EXIT_FAILURE;
604 break;
605 case 1:
606 if(read_article())
607 return EXIT_FAILURE;
608 break;
609 default:
610 g_assert_not_reached();
611 }
612 }
613 return EXIT_SUCCESS;
614 }
615
/* Read an 'article' element up to its end tag and add the article to the dictionary.
 * The optional 'type' attribute of the article is stored as the content type
 * of the element on top of elem_stack.
 * The element may contain 'key', 'synonym', 'definition' and 'definition-r' elements.
 * An article that ends up without keys or without definitions is skipped
 * with a warning, that is not an error.
 *
 * Return value:
 * EXIT_FAILURE or EXIT_SUCCESS */
int textual_dict_parser_t::read_article(void)
{
	// line of the article start tag, used in the messages about the article as a whole
	const line_number_t article_beg_line_number = get_line_number();
	{
		char content_type;
		switch(read_att_type(content_type))
		{
		case lrOK:
			elem_stack.back().content_type = content_type;
			break;
		case lrError:
			return EXIT_FAILURE;
		case lrNotFound:
			break;
		}
	}
	static const char* exp_elems[] = {
		"key",
		"synonym",
		"definition",
		"definition-r",
		NULL
	};
	article_data_t article;
	/* for simplicity we do not check the order of elements */
	while(true) {
		// ret is an index in exp_elems or one of the negative ReadResult codes
		int ret = read_xml_element(exp_elems, elem_stack.back().name);
		if(ret == rrEOF) {
			unexpected_eof(exp_elems);
			return EXIT_FAILURE;
		}
		if(ret == rrError)
			return EXIT_FAILURE;
		if(ret == rrEndElement) {
			// end of the article
			elem_stack.pop_back();
			break;
		}
		if(xmlTextReaderIsEmptyElement(xml_reader)) {
			unexpected_empty_element();
			return EXIT_FAILURE;
		}
		const line_number_t article_item_line_number = get_line_number();
		// assigned and used only for the 'definition' element (ret == 2)
		char content_type;
		if(ret == 2) { // read 'type' attribute of the 'definition' element
			switch(read_att_type(content_type))
			{
			case lrOK:
				break;
			case lrError:
				return EXIT_FAILURE;
			case lrNotFound:
				// no attribute on the definition itself, fall back to the effective type
				content_type = get_effective_content_type();
				break;
			}
			if(content_type == 0) {
				g_critical(attribute_content_type_absent_err,
					get_line_number());
				return EXIT_FAILURE;
			}
		}
		switch(ret) {
		case 0:
		case 1:
		case 2:
		{
			// key, synonym and definition are text elements
			std::string value;
			if(read_xml_text_item(exp_elems[ret], value))
				return EXIT_FAILURE;
			if(value.empty()) {
				// a blank item is dropped, the article is read further
				g_warning(element_blank_value, get_line_number(), exp_elems[ret]);
				g_message(fixed_ignore_msg);
				break;
			}
			if(ret == 0) {
				if(check_article_key(value, article_item_line_number))
					return EXIT_FAILURE;
				if(article.add_key(value)) {
					g_critical(duplicate_key_err, get_line_number(), value.c_str());
					return EXIT_FAILURE;
				}
			}
			if(ret == 1) {
				if(check_article_synonym(value, article_item_line_number))
					return EXIT_FAILURE;
				if(article.add_synonym(value)) {
					g_critical(duplicate_synonym_err, get_line_number(), value.c_str());
					return EXIT_FAILURE;
				}
			}
			if(ret == 2) {
				// the text is handed to the dictionary, the article keeps
				// the content type, the offset and the size
				size_t size = value.length(), offset;
				if(norm_dict->write_data(value.c_str(), size, offset)) {
					g_critical(save_into_temp_file_err);
					return EXIT_FAILURE;
				}
				if(article.add_definition(article_def_t(content_type, offset, size)))
					return EXIT_FAILURE;
			}
			break;
		}
		case 3:
		{
			// 'definition-r' has child elements, so it goes onto the stack;
			// read_contents_r pops it at the end tag
			elem_stack.push_back(elem_data_t(exp_elems[ret]));
			article_def_t def;
			if(read_contents_r(def))
				return EXIT_FAILURE;
			if(article.add_definition(def))
				return EXIT_FAILURE;
			break;
		}
		default:
			g_assert_not_reached();
		}
	}
	if(article.key.empty()) {
		g_warning(article_key_list_empty_err, article_beg_line_number);
		g_message(fixed_ignore_msg);
		return EXIT_SUCCESS;
	}
	if(article.definitions.empty()) {
		g_warning(article_definition_list_empty_err, article_beg_line_number);
		g_message(fixed_ignore_msg);
		return EXIT_SUCCESS;
	}
	if(norm_dict->add_article(article))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}
746
747 /* Return value:
748 * EXIT_FAILURE or EXIT_SUCCESS */
read_contents_r(article_def_t & article_def)749 int textual_dict_parser_t::read_contents_r(article_def_t& article_def)
750 {
751 const line_number_t contents_beg_line_number = get_line_number();
752 article_def.type = 'r';
753 static const char* exp_elems[] = {
754 "resource",
755 NULL
756 };
757 while(true) {
758 int ret = read_xml_element(exp_elems, elem_stack.back().name);
759 if(ret == rrEOF) {
760 unexpected_eof(exp_elems);
761 return EXIT_FAILURE;
762 }
763 if(ret == rrError)
764 return EXIT_FAILURE;
765 if(ret == rrEndElement) {
766 elem_stack.pop_back();
767 break;
768 }
769 if(!xmlTextReaderIsEmptyElement(xml_reader)) {
770 unexpected_non_empty_element();
771 return EXIT_FAILURE;
772 }
773 g_assert(ret == 0);
774 const char* att_name = "key";
775 std::string key;
776 if(read_xml_attribute(att_name, key)) {
777 g_critical(attribute_value_not_specified_err,
778 get_line_number(), att_name);
779 return EXIT_FAILURE;
780 }
781 if(key.empty()) {
782 g_critical(attribute_value_empty_err,
783 get_line_number(), att_name);
784 return EXIT_FAILURE;
785 }
786 if(key.find_first_of('\\') != std::string::npos) {
787 g_critical(incorrect_dir_sep_err, get_line_number());
788 return EXIT_FAILURE;
789 }
790 att_name = "type";
791 std::string type;
792 if(read_xml_attribute(att_name, type)) {
793 g_critical(attribute_value_not_specified_err,
794 get_line_number(), att_name);
795 return EXIT_FAILURE;
796 }
797 if(!is_known_resource_type(type.c_str())) {
798 g_critical(attribute_value_unknown_err,
799 get_line_number(), att_name, type.c_str());
800 return EXIT_FAILURE;
801 }
802 article_def.resources.push_back(resource_t(type, key));
803 }
804 if(article_def.resources.empty()) {
805 g_critical(resource_list_empty_err, contents_beg_line_number);
806 return EXIT_FAILURE;
807 }
808 return EXIT_SUCCESS;
809 }
810
811
read_att_type(char & type)812 TLoadResult textual_dict_parser_t::read_att_type(char& type)
813 {
814 std::string value;
815 static const char* const att_name = "type";
816 if(read_xml_attribute(att_name, value))
817 return lrNotFound;
818 if(value.length() != 1) {
819 g_critical(unknown_content_type_str_err,
820 get_line_number(), value.c_str());
821 return lrError;
822 }
823 type = value[0];
824 g_assert(type != 0);
825 if(strchr(known_type_ids, type) == NULL) {
826 g_warning(unknown_content_type_chr_err, get_line_number(), type);
827 g_message(allow_unknown_content_type_msg);
828 }
829 if(type == 'r') {
830 g_critical(content_type_r_in_definition_err,
831 get_line_number());
832 return lrError;
833 }
834 return lrOK;
835 }
836
837 /* Return value:
838 * EXIT_FAILURE or EXIT_SUCCESS */
check_blank_info_items(void)839 int textual_dict_parser_t::check_blank_info_items(void)
840 {
841 check_blank_info_item("version", &DictInfo::is_version,
842 &DictInfo::get_version, &DictInfo::unset_version);
843 check_blank_info_item("bookname", &DictInfo::is_bookname,
844 &DictInfo::get_bookname, &DictInfo::unset_bookname);
845 check_blank_info_item("author", &DictInfo::is_author,
846 &DictInfo::get_author, &DictInfo::unset_author);
847 check_blank_info_item("email", &DictInfo::is_email,
848 &DictInfo::get_email, &DictInfo::unset_email);
849 check_blank_info_item("website", &DictInfo::is_website,
850 &DictInfo::get_website, &DictInfo::unset_website);
851 check_blank_info_item("description", &DictInfo::is_description,
852 &DictInfo::get_description, &DictInfo::unset_description);
853 check_blank_info_item("date", &DictInfo::is_date,
854 &DictInfo::get_date, &DictInfo::unset_date);
855 check_blank_info_item("dicttype", &DictInfo::is_dicttype,
856 &DictInfo::get_dicttype, &DictInfo::unset_dicttype);
857 return EXIT_SUCCESS;
858 }
859
860 /* Return value:
861 * EXIT_FAILURE or EXIT_SUCCESS */
check_new_line_info_items(void)862 int textual_dict_parser_t::check_new_line_info_items(void)
863 {
864 if(check_new_line_info_item("version", &DictInfo::is_version,
865 &DictInfo::get_version))
866 return EXIT_FAILURE;
867 if(check_new_line_info_item("bookname", &DictInfo::is_bookname,
868 &DictInfo::get_bookname))
869 return EXIT_FAILURE;
870 if(check_new_line_info_item("author", &DictInfo::is_author,
871 &DictInfo::get_author))
872 return EXIT_FAILURE;
873 if(check_new_line_info_item("email", &DictInfo::is_email,
874 &DictInfo::get_email))
875 return EXIT_FAILURE;
876 if(check_new_line_info_item("website", &DictInfo::is_website,
877 &DictInfo::get_website))
878 return EXIT_FAILURE;
879 if(check_new_line_info_item("date", &DictInfo::is_date,
880 &DictInfo::get_date))
881 return EXIT_FAILURE;
882 if(check_new_line_info_item("dicttype", &DictInfo::is_dicttype,
883 &DictInfo::get_dicttype))
884 return EXIT_FAILURE;
885 return EXIT_SUCCESS;
886 }
887
888 /* Return value:
889 * EXIT_FAILURE or EXIT_SUCCESS */
check_mandatory_info_items(void)890 int textual_dict_parser_t::check_mandatory_info_items(void)
891 {
892 if(!norm_dict->dict_info.is_version()) {
893 g_critical(missing_req_info_item_msg, "version");
894 return EXIT_FAILURE;
895 }
896 if(!norm_dict->dict_info.is_bookname()) {
897 g_critical(missing_req_info_item_msg, "bookname");
898 return EXIT_FAILURE;
899 }
900 return EXIT_SUCCESS;
901 }
902
903 /* Return value:
904 * EXIT_FAILURE or EXIT_SUCCESS */
check_article_key(const std::string & value,line_number_t article_item_line_number)905 int textual_dict_parser_t::check_article_key(const std::string& value, line_number_t article_item_line_number)
906 {
907 if(value.find_first_of(key_forbidden_chars) != std::string::npos) {
908 g_critical(article_key_forbidden_chars_err,
909 article_item_line_number, value.c_str());
910 return EXIT_FAILURE;
911 }
912 if(value.length() >= (size_t)MAX_INDEX_KEY_SIZE) {
913 g_critical(article_key_long_err,
914 get_line_number(), value.c_str(), static_cast<unsigned>(value.length()), MAX_INDEX_KEY_SIZE-1);
915 return EXIT_FAILURE;
916 }
917 return EXIT_SUCCESS;
918 }
919
920
921 /* Return value:
922 * EXIT_FAILURE or EXIT_SUCCESS */
check_article_synonym(const std::string & value,line_number_t article_item_line_number)923 int textual_dict_parser_t::check_article_synonym(const std::string& value, line_number_t article_item_line_number)
924 {
925 if(value.find_first_of(key_forbidden_chars) != std::string::npos) {
926 g_critical(article_synonym_forbidden_chars_err,
927 article_item_line_number, value.c_str());
928 return EXIT_FAILURE;
929 }
930 if(value.length() >= (size_t)MAX_INDEX_KEY_SIZE) {
931 g_critical(article_synonym_long_err,
932 get_line_number(), value.c_str(), static_cast<unsigned>(value.length()), MAX_INDEX_KEY_SIZE-1);
933 return EXIT_FAILURE;
934 }
935 return EXIT_SUCCESS;
936 }
937
938 /* Read the next element. It must be one of exp_elems.
939 * parameters:
940 * open_elem - expected end element, may be NULL
941 *
942 * return value:
943 * >= 0 - index in the exp_elems array
944 * rrEOF, rrError, rrEndElement
945 * */
read_xml_element(const char ** exp_elems,const char * open_elem)946 int textual_dict_parser_t::read_xml_element(const char** exp_elems, const char* open_elem)
947 {
948 int ret = next_node();
949 if(ret == 0)
950 return rrEOF;
951 if(ret < 0)
952 return rrError;
953 ret = xmlTextReaderNodeType(xml_reader);
954 if(ret < 0) {
955 g_critical(parser_err);
956 return rrError;
957 }
958 const xmlReaderTypes node_type = static_cast<xmlReaderTypes>(ret);
959 if(node_type == XML_READER_TYPE_END_ELEMENT) {
960 xml::CharStr node_name(xmlTextReaderName(xml_reader));
961 if(xmlStrEqual(get_impl(node_name), BAD_CAST open_elem))
962 return rrEndElement;
963 else {
964 unmatched_end_element(open_elem, (const char*)get_impl(node_name));
965 return rrError;
966 }
967 }
968 if(node_type != XML_READER_TYPE_ELEMENT) {
969 unexpected_node_type_element(exp_elems);
970 return rrError;
971 }
972 xml::CharStr node_name(xmlTextReaderName(xml_reader));
973 for(size_t i=0; exp_elems[i]; ++i)
974 if(xmlStrEqual(get_impl(node_name), BAD_CAST exp_elems[i])) {
975 return i;
976 }
977 unexpected_element(exp_elems);
978 return rrError;
979 }
980
981 /* Read the next node. It must be end element.
982 * parameters:
983 * open_elem - opened element, != NULL
984 *
985 * return value:
986 * rrEOF, rrError, rrEndElement
987 * */
read_xml_end_element(const char * open_elem)988 int textual_dict_parser_t::read_xml_end_element(const char* open_elem)
989 {
990 int ret = next_node();
991 if(ret == 0)
992 return rrEOF;
993 if(ret < 0)
994 return rrError;
995 ret = xmlTextReaderNodeType(xml_reader);
996 if(ret < 0) {
997 g_critical(parser_err);
998 return rrError;
999 }
1000 const xmlReaderTypes node_type = static_cast<xmlReaderTypes>(ret);
1001 if(node_type != XML_READER_TYPE_END_ELEMENT) {
1002 unexpected_node_type_end_element(open_elem);
1003 return rrError;
1004 }
1005 xml::CharStr node_name(xmlTextReaderName(xml_reader));
1006 if(!xmlStrEqual(get_impl(node_name), BAD_CAST open_elem)) {
1007 unmatched_end_element(open_elem, (const char*)get_impl(node_name));
1008 return rrError;
1009 }
1010 return rrEndElement;
1011 }
1012
1013 /* Return value:
1014 * EXIT_FAILURE or EXIT_SUCCESS */
read_xml_text_item(const char * elem,std::string & value)1015 int textual_dict_parser_t::read_xml_text_item(const char* elem, std::string& value)
1016 {
1017 value.clear();
1018 const line_number_t element_beg_line_number = get_line_number();
1019 int ret = read_xml_text(elem);
1020 if(ret == rrEOF) {
1021 unexpected_eof();
1022 return EXIT_FAILURE;
1023 }
1024 if(ret == rrError)
1025 return EXIT_FAILURE;
1026 if(ret == rrEndElement)
1027 return EXIT_SUCCESS;
1028 xml::CharStr tvalue(xmlTextReaderValue(xml_reader));
1029 if(!g_utf8_validate((const char*)get_impl(tvalue), -1, NULL)) {
1030 g_critical(element_invalid_utf8_value_err,
1031 element_beg_line_number, elem, get_impl(tvalue));
1032 return EXIT_FAILURE;
1033 }
1034 glib::CharStr t2value(g_utf8_normalize((const char*)get_impl(tvalue), -1, G_NORMALIZE_ALL_COMPOSE));
1035 if(!t2value) {
1036 g_critical(normalization_failed_err,
1037 element_beg_line_number, get_impl(tvalue));
1038 return EXIT_FAILURE;
1039 }
1040 std::string data_str;
1041 { // check for invalid chars
1042 typedef std::list<const char*> str_list_t;
1043 str_list_t invalid_chars;
1044 if(check_stardict_string_chars(get_impl(t2value), invalid_chars)) {
1045 g_message(element_invalid_text_err,
1046 element_beg_line_number, elem, get_impl(t2value),
1047 print_char_codes(invalid_chars).c_str());
1048 fix_stardict_string_chars(get_impl(t2value), data_str);
1049 g_message(fixed_drop_invalid_char_msg);
1050 } else
1051 data_str.assign(get_impl(t2value));
1052 }
1053 {
1054 const char* beg;
1055 size_t len;
1056 trim_spaces(data_str.c_str(), beg, len);
1057 value.assign(beg, len);
1058 }
1059 // read end element
1060 ret = read_xml_end_element(elem);
1061 if(ret == rrEOF) {
1062 unexpected_eof();
1063 return EXIT_FAILURE;
1064 }
1065 if(ret == rrError)
1066 return EXIT_FAILURE;
1067 return EXIT_SUCCESS;
1068 }
1069
1070 /* Read the next node. It must be text.
1071 * parameters:
1072 * open_elem - opened element, != NULL
1073 *
1074 * return value:
1075 * 0 - text node is found
1076 * rrEOF, rrError, rrEndElement
1077 * */
read_xml_text(const char * open_elem)1078 int textual_dict_parser_t::read_xml_text(const char* open_elem)
1079 {
1080 int ret = next_node();
1081 if(ret == 0)
1082 return rrEOF;
1083 if(ret < 0)
1084 return rrError;
1085 ret = xmlTextReaderNodeType(xml_reader);
1086 if(ret < 0) {
1087 g_critical(parser_err);
1088 return rrError;
1089 }
1090 const xmlReaderTypes node_type = static_cast<xmlReaderTypes>(ret);
1091 if(node_type == XML_READER_TYPE_END_ELEMENT) {
1092 xml::CharStr node_name(xmlTextReaderName(xml_reader));
1093 if(xmlStrEqual(get_impl(node_name), BAD_CAST open_elem))
1094 return rrEndElement;
1095 else {
1096 unmatched_end_element(open_elem, (const char*)get_impl(node_name));
1097 return rrError;
1098 }
1099 }
1100 if(node_type != XML_READER_TYPE_TEXT) {
1101 unexpected_node_type_text();
1102 return rrError;
1103 }
1104 return 0;
1105 }
1106
1107 /* Return value:
1108 * EXIT_FAILURE or EXIT_SUCCESS
1109 *
1110 * EXIT_FAILURE - attribute is missing,
1111 * EXIT_SUCCESS - attribute value read successfully */
read_xml_attribute(const char * att_name,std::string & value)1112 int textual_dict_parser_t::read_xml_attribute(const char* att_name, std::string& value)
1113 {
1114 xmlChar * att_val = xmlTextReaderGetAttribute(xml_reader, (const xmlChar*) att_name);
1115 if(!att_val)
1116 return EXIT_FAILURE;
1117 value.assign((const char*) att_val);
1118 xmlFree(att_val);
1119 return EXIT_SUCCESS;
1120 }
1121
check_blank_info_item(const char * elem,bool (DictInfo::* is_item)(void)const,const std::string & (DictInfo::* get_item)(void)const,void (DictInfo::* unset_item)(void))1122 void textual_dict_parser_t::check_blank_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
1123 const std::string& (DictInfo:: *get_item)(void) const, void (DictInfo:: *unset_item)(void))
1124 {
1125 if((norm_dict->dict_info.*is_item)() && (norm_dict->dict_info.*get_item)().empty()) {
1126 g_warning("info item %s is assigned an empty string.", elem);
1127 g_message(fixed_msg2 "Unset the item.");
1128 (norm_dict->dict_info.*unset_item)();
1129 }
1130 }
1131
check_new_line_info_item(const char * elem,bool (DictInfo::* is_item)(void)const,const std::string & (DictInfo::* get_item)(void)const)1132 int textual_dict_parser_t::check_new_line_info_item(const char* elem, bool (DictInfo::* is_item)(void) const,
1133 const std::string& (DictInfo:: *get_item)(void) const)
1134 {
1135 if((norm_dict->dict_info.*is_item)() && (norm_dict->dict_info.*get_item)().find_first_of("\n\r") != std::string::npos) {
1136 g_critical("info item %s contains new line character.", elem);
1137 return EXIT_FAILURE;
1138 }
1139 return EXIT_SUCCESS;
1140 }
1141
1142 /* return effective content type for the current element.
1143 * return 0 if content type is not specified for all elements in the element stack. */
get_effective_content_type(void) const1144 char textual_dict_parser_t::get_effective_content_type(void) const
1145 {
1146 for(size_t i=elem_stack.size()-1; i>=0; --i)
1147 if(elem_stack[i].content_type)
1148 return elem_stack[i].content_type;
1149 return 0;
1150 }
1151
unexpected_node_type_element(const char ** exp_elems)1152 void textual_dict_parser_t::unexpected_node_type_element(const char** exp_elems)
1153 {
1154 std::string buf;
1155 for(size_t i=0; exp_elems[i]; ++i) {
1156 buf += " ";
1157 buf += exp_elems[i];
1158 }
1159 g_warning("%u: Unexpected node type: %s, name: %s. Expecting elements:%s.",
1160 get_line_number(),
1161 note_type_str(xmlTextReaderNodeType(xml_reader)),
1162 xmlTextReaderName(xml_reader),
1163 buf.c_str()
1164 );
1165 }
1166
unexpected_node_type_text(void)1167 void textual_dict_parser_t::unexpected_node_type_text(void)
1168 {
1169 g_warning("%u: Unexpected node type: %s, name: %s. Expecting text.",
1170 get_line_number(),
1171 note_type_str(xmlTextReaderNodeType(xml_reader)),
1172 xmlTextReaderName(xml_reader)
1173 );
1174 }
1175
unexpected_node_type_end_element(const char * open_elem)1176 void textual_dict_parser_t::unexpected_node_type_end_element(const char* open_elem)
1177 {
1178 g_warning("%u: Unexpected node type: %s, name: %s. Expecting end element: %s.",
1179 get_line_number(),
1180 note_type_str(xmlTextReaderNodeType(xml_reader)),
1181 xmlTextReaderName(xml_reader),
1182 open_elem
1183 );
1184 }
1185
unexpected_element(const char ** exp_elems)1186 void textual_dict_parser_t::unexpected_element(const char** exp_elems)
1187 {
1188 std::string buf;
1189 for(size_t i=0; exp_elems[i]; ++i) {
1190 buf += " ";
1191 buf += exp_elems[i];
1192 }
1193 g_warning("%u: Unexpected element: %s. Expecting elements:%s.",
1194 get_line_number(),
1195 xmlTextReaderName(xml_reader),
1196 buf.c_str()
1197 );
1198 }
1199
unexpected_eof(const char ** exp_elems)1200 void textual_dict_parser_t::unexpected_eof(const char** exp_elems)
1201 {
1202 std::string buf;
1203 for(size_t i=0; exp_elems[i]; ++i) {
1204 buf += " ";
1205 buf += exp_elems[i];
1206 }
1207 g_warning("Unexpected end of file. Expecting: %s.", buf.c_str());
1208 }
1209
unexpected_eof(void)1210 void textual_dict_parser_t::unexpected_eof(void)
1211 {
1212 g_warning("Unexpected end of file.");
1213 }
1214
unexpected_end_element(const char ** exp_elems,const char * open_elem)1215 void textual_dict_parser_t::unexpected_end_element(const char** exp_elems, const char* open_elem)
1216 {
1217 std::string buf;
1218 for(size_t i=0; exp_elems[i]; ++i) {
1219 buf += " ";
1220 buf += exp_elems[i];
1221 }
1222 g_warning("%u: Unexpected end element %s. Expecting:%s.",
1223 get_line_number(),
1224 open_elem,
1225 buf.c_str());
1226 }
1227
unmatched_end_element(const char * open_elem,const char * end_elem)1228 void textual_dict_parser_t::unmatched_end_element(const char* open_elem, const char* end_elem)
1229 {
1230 g_warning("%u: open element %s, end element %s.",
1231 get_line_number(),
1232 open_elem,
1233 end_elem
1234 );
1235 }
1236
unexpected_empty_element(void)1237 void textual_dict_parser_t::unexpected_empty_element(void)
1238 {
1239 g_warning("%u: Unexpected empty element: %s.",
1240 get_line_number(),
1241 xmlTextReaderName(xml_reader)
1242 );
1243 }
1244
unexpected_non_empty_element(void)1245 void textual_dict_parser_t::unexpected_non_empty_element(void)
1246 {
1247 g_warning("%u: Unexpected non-empty element: %s.",
1248 get_line_number(),
1249 xmlTextReaderName(xml_reader)
1250 );
1251 }
1252
get_line_number(void)1253 textual_dict_parser_t::line_number_t textual_dict_parser_t::get_line_number(void)
1254 {
1255 xmlNodePtr node = xmlTextReaderCurrentNode(xml_reader);
1256 return node ? node->line : 0;
1257 }
1258
1259 /* Read next node, skipping comments and processing instructions.
1260 * Return value: see xmlTextReaderRead */
next_node(void)1261 int textual_dict_parser_t::next_node(void)
1262 {
1263 while(true) {
1264 int ret = xmlTextReaderRead(xml_reader);
1265 if(ret != 1)
1266 return ret;
1267 ret = xmlTextReaderNodeType(xml_reader);
1268 if(ret < 0)
1269 return -1;
1270 xmlReaderTypes reader_type = static_cast<xmlReaderTypes>(ret);
1271 if(reader_type == XML_READER_TYPE_COMMENT
1272 || reader_type == XML_READER_TYPE_PROCESSING_INSTRUCTION) {
1273 continue;
1274 }
1275 if(get_custom_include()) {
1276 xmlNodePtr node = xmlTextReaderCurrentNode(xml_reader);
1277 if(!node)
1278 return -1;
1279 // see xmlreader.c, xmlTextReaderRead function, #ifdef LIBXML_XINCLUDE_ENABLED block
1280 if((node->type == XML_ELEMENT_NODE) &&
1281 (node->ns != NULL) &&
1282 ((xmlStrEqual(node->ns->href, XINCLUDE_NS)) ||
1283 (xmlStrEqual(node->ns->href, XINCLUDE_OLD_NS)))) {
1284 if (xincctxt == NULL) {
1285 xincctxt = xmlXIncludeNewContext(xmlTextReaderCurrentDoc(xml_reader));
1286 xmlXIncludeSetFlags(xincctxt, reader_options);
1287 }
1288 const char* att_name = "href";
1289 std::string href;
1290 if(!read_xml_attribute(att_name, href)) {
1291 g_message(xinclude_process_msg,
1292 get_line_number(), href.c_str());
1293 } else {
1294 g_message(xinclude_process_msg,
1295 get_line_number(), href.c_str());
1296 }
1297 /*
1298 * expand that node and process it
1299 */
1300 if (xmlTextReaderExpand(xml_reader) == NULL)
1301 return -1;
1302 xmlXIncludeProcessNode(xincctxt, node);
1303 continue;
1304 }
1305 }
1306 break;
1307 }
1308 return 1;
1309 }
1310
1311
parse_textual_dict(const std::string & xmlfilename,common_dict_t * norm_dict,bool show_xincludes)1312 int parse_textual_dict(const std::string& xmlfilename, common_dict_t* norm_dict,
1313 bool show_xincludes)
1314 {
1315 textual_dict_parser_t parser;
1316 parser.set_custom_include(show_xincludes);
1317 return parser.parse(xmlfilename, norm_dict);
1318 }
1319