1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3 *
4 * Copyright (C) 2002-2017 Németh László
5 *
6 * The contents of this file are subject to the Mozilla Public License Version
7 * 1.1 (the "License"); you may not use this file except in compliance with
8 * the License. You may obtain a copy of the License at
9 * http://www.mozilla.org/MPL/
10 *
11 * Software distributed under the License is distributed on an "AS IS" basis,
12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13 * for the specific language governing rights and limitations under the
14 * License.
15 *
16 * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17 *
18 * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19 * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20 * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21 * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22 * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38 #include <cstdlib>
39 #include <cstring>
40 #include <cstdio>
41 #include <ctype.h>
42
43 #include "../hunspell/csutil.hxx"
44 #include "xmlparser.hxx"
45
46 #ifndef W32
47 using namespace std;
48 #endif
49
// States of the tokenizer implemented in XMLParser::next_token().
enum {
  ST_NON_WORD,     // between words, outside of any tag
  ST_WORD,         // inside a word that is being collected
  ST_TAG,          // inside a region opened by one of the tag patterns
  ST_CHAR_ENTITY,  // after '&', waiting for the closing ';'
  ST_OTHER_TAG,    // not referenced by the code visible in this file
  ST_ATTRIB        // inside a quoted attribute value of a tag
};
51
// Opening/closing delimiter pairs of the regions that are skipped as markup.
// look_pattern() lower-cases the input before comparing, so the patterns have
// to be written in lower case. Order matters: the first matching row wins, so
// the more specific openers must precede the generic "<".
static const char* __PATTERN__[][2] = {
    {"<!--", "-->"},       // XML comment
    {"<![cdata[", "]]>"},  // CDATA section (opener is "<![CDATA[" in XML)
    {"<", ">"}};           // any other tag

// Number of rows in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))
57
// Patterns for tags whose attribute values should be checked as text
// (for example <img alt="text"> in HTML). No such patterns are defined
// here: the table pointer is null and its length is zero.
static const char* (*__PATTERN2__)[2] = NULL;

#define __PATTERN_LEN2__ 0
62
// Patterns for markup that may occur inside a word and must not split it
// (for example "exam<text:span>p</text:span>le" in ODT). No such patterns
// are defined here: the table pointer is null and its length is zero.
static const char* (*__PATTERN3__)[2] = NULL;

#define __PATTERN_LEN3__ 0
68
// Apostrophe spellings handled by the parser.
// ENTITY_APOS must be the XML character entity, not the bare character:
// change_token() replaces APOSTROPHE by ENTITY_APOS when escaping, and
// next_token() accepts ENTITY_APOS inside a word.
#define ENTITY_APOS "&apos;"
#define UTF8_APOS "\xe2\x80\x99"  // U+2019 encoded as UTF-8
#define APOSTROPHE "'"
72
XMLParser(const char * wordchars)73 XMLParser::XMLParser(const char* wordchars)
74 : TextParser(wordchars)
75 , pattern_num(0), pattern2_num(0), pattern3_num(0), prevstate(0), checkattr(0), quotmark(0) {
76 }
77
XMLParser(const w_char * wordchars,int len)78 XMLParser::XMLParser(const w_char* wordchars, int len)
79 : TextParser(wordchars, len)
80 , pattern_num(0), pattern2_num(0), pattern3_num(0), prevstate(0), checkattr(0), quotmark(0) {
81 }
82
~XMLParser()83 XMLParser::~XMLParser() {}
84
look_pattern(const char * p[][2],unsigned int len,int column)85 int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
86 for (unsigned int i = 0; i < len; i++) {
87 const char* j = line[actual].c_str() + head;
88 const char* k = p[i][column];
89 while ((*k != '\0') && (tolower(*j) == *k)) {
90 j++;
91 k++;
92 }
93 if (*k == '\0')
94 return i;
95 }
96 return -1;
97 }
98
99 /*
100 * XML parser
101 *
102 */
103
next_token(const char * PATTERN[][2],unsigned int PATTERN_LEN,const char * PATTERN2[][2],unsigned int PATTERN_LEN2,const char * PATTERN3[][2],unsigned int PATTERN_LEN3,std::string & t)104 bool XMLParser::next_token(const char* PATTERN[][2],
105 unsigned int PATTERN_LEN,
106 const char* PATTERN2[][2],
107 unsigned int PATTERN_LEN2,
108 const char* PATTERN3[][2],
109 unsigned int PATTERN_LEN3,
110 std::string& t) {
111 t.clear();
112 const char* latin1;
113
114 for (;;) {
115 switch (state) {
116 case ST_NON_WORD: // non word chars
117 prevstate = ST_NON_WORD;
118 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
119 checkattr = 0;
120 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
121 checkattr = 1;
122 }
123 state = ST_TAG;
124 } else if (is_wordchar(line[actual].c_str() + head)) {
125 state = ST_WORD;
126 token = head;
127 } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
128 state = ST_WORD;
129 token = head;
130 head += strlen(latin1);
131 } else if (line[actual][head] == '&') {
132 state = ST_CHAR_ENTITY;
133 }
134 break;
135 case ST_WORD: // wordchar
136 if ((latin1 = get_latin1(line[actual].c_str() + head))) {
137 head += strlen(latin1);
138 } else if ((is_wordchar((char*)APOSTROPHE) ||
139 (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
140 strncmp(line[actual].c_str() + head, ENTITY_APOS,
141 strlen(ENTITY_APOS)) == 0 &&
142 is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS))) {
143 head += strlen(ENTITY_APOS) - 1;
144 } else if (is_utf8() &&
145 is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe
146 // to the WORDCHARS, if
147 // needed
148 strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
149 0 &&
150 is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
151 head += strlen(UTF8_APOS) - 1;
152 } else if (!is_wordchar(line[actual].c_str() + head)) {
153 // in-word patterns
154 if ((pattern3_num = look_pattern(PATTERN3, PATTERN_LEN3, 0)) != -1) {
155 size_t pos = line[actual].find(PATTERN3[pattern3_num][1], head);
156 if (pos != std::string::npos) {
157 size_t endpos = pos + strlen(PATTERN3[pattern3_num][1]) - 1;
158 if (is_wordchar(line[actual].c_str() + endpos + 1)) {
159 head = endpos;
160 break;
161 }
162 }
163 }
164 state = prevstate;
165 // return with the token, except in the case of in-word patterns
166 if (alloc_token(token, &head, t))
167 return true;
168 }
169 break;
170 case ST_TAG: // comment, labels, etc
171 int i;
172 if ((checkattr == 1) &&
173 ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
174 (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
175 checkattr = 2;
176 } else if ((checkattr > 0) && (line[actual][head] == '>')) {
177 state = ST_NON_WORD;
178 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
179 (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
180 state = ST_NON_WORD;
181 head += strlen(PATTERN[pattern_num][1]) - 1;
182 } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
183 ((line[actual][head] == '"') ||
184 (line[actual][head] == '\''))) {
185 quotmark = line[actual][head];
186 state = ST_ATTRIB;
187 }
188 break;
189 case ST_ATTRIB: // non word chars
190 prevstate = ST_ATTRIB;
191 if (line[actual][head] == quotmark) {
192 state = ST_TAG;
193 if (checkattr == 2)
194 checkattr = 1;
195 // for IMG ALT
196 } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) {
197 state = ST_WORD;
198 token = head;
199 } else if (line[actual][head] == '&') {
200 state = ST_CHAR_ENTITY;
201 }
202 break;
203 case ST_CHAR_ENTITY: // SGML element
204 if ((tolower(line[actual][head]) == ';')) {
205 state = prevstate;
206 head--;
207 }
208 }
209 if (next_char(line[actual].c_str(), &head))
210 return false;
211 }
212 //FIXME No return, in function returning non-void
213 }
214
next_token(std::string & t)215 bool XMLParser::next_token(std::string& t) {
216 return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__,
217 __PATTERN_LEN2__, __PATTERN3__, __PATTERN_LEN3__, t);
218 }
219
220 // remove in-word patterns
get_word2(const char * PATTERN3[][2],unsigned int PATTERN_LEN3,const std::string & tok)221 std::string XMLParser::get_word2(
222 const char* PATTERN3[][2],
223 unsigned int PATTERN_LEN3,
224 const std::string &tok) {
225 std::string word = tok;
226 for (unsigned int i = 0; i < PATTERN_LEN3; i++) {
227 size_t pos;
228 while ((pos = word.find(PATTERN3[i][0])) != word.npos) {
229 size_t endpos = word.find(PATTERN3[i][1], pos);
230 if (endpos != word.npos) {
231 word.erase(pos, endpos + strlen(PATTERN3[i][1]) - pos);
232 } else
233 return word;
234 }
235 }
236 return word;
237 }
238
change_token(const char * word)239 int XMLParser::change_token(const char* word) {
240 if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
241 strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
242 strchr(word, '>') != NULL) {
243 std::string r(word);
244 mystrrep(r, "&", "__namp;__");
245 mystrrep(r, "__namp;__", "&");
246 mystrrep(r, APOSTROPHE, ENTITY_APOS);
247 mystrrep(r, "\"", """);
248 mystrrep(r, ">", ">");
249 mystrrep(r, "<", "<");
250 return TextParser::change_token(r.c_str());
251 }
252 return TextParser::change_token(word);
253 }
254