1 /* ***** BEGIN LICENSE BLOCK *****
2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
3  *
4  * Copyright (C) 2002-2017 Németh László
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
17  *
18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
23  *
24  * Alternatively, the contents of this file may be used under the terms of
25  * either the GNU General Public License Version 2 or later (the "GPL"), or
26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27  * in which case the provisions of the GPL or the LGPL are applicable instead
28  * of those above. If you wish to allow use of your version of this file only
29  * under the terms of either the GPL or the LGPL, and not to allow others to
30  * use your version of this file under the terms of the MPL, indicate your
31  * decision by deleting the provisions above and replace them with the notice
32  * and other provisions required by the GPL or the LGPL. If you do not delete
33  * the provisions above, a recipient may use your version of this file under
34  * the terms of any one of the MPL, the GPL or the LGPL.
35  *
36  * ***** END LICENSE BLOCK ***** */
37 
38 #include <cstdlib>
39 #include <cstring>
40 #include <cstdio>
41 #include <ctype.h>
42 
43 #include "../hunspell/csutil.hxx"
44 #include "latexparser.hxx"
45 
46 #ifndef W32
47 using namespace std;
48 #endif
49 
50 #define UTF8_APOS "\xe2\x80\x99"
51 #define APOSTROPHE "'"
52 
/*
 * Table of LaTeX constructs whose content is skipped by the tokenizer.
 *
 *   pat[0]  opening text, compared against the lower-cased input, so every
 *           entry must be written in lower case.
 *   pat[1]  closing text; when non-NULL everything up to and including it is
 *           skipped. NULL marks a command followed by brace arguments.
 *   arg     number of top-level {...} arguments to skip (only meaningful
 *           when pat[1] is NULL).
 *
 * Lookup is a linear scan that returns the FIRST entry whose text is a
 * prefix of the input, so a longer pattern must be listed before any shorter
 * pattern that is a prefix of it (e.g. "$$" before "$", "\\includeonly"
 * before "\\include", "\\begin{...}" before "\\begin").
 *
 * NOTE(review): three entries are currently shadowed by an earlier, shorter
 * prefix and are never selected: "\\bibliographystyle" (by "\\bibliography"),
 * "\\inputencoding" (by "\\input") and "\\colorbox" (by "\\color"). The first
 * two have the same `arg` as their shadowing entry; "\\colorbox" does not, so
 * moving it would change which arguments are skipped -- confirm the intended
 * behaviour before reordering.
 */
static struct {
  const char* pat[2];
  int arg;
} PATTERN[] = {
    // math environments
    {{"\\(", "\\)"}, 0},
    {{"$$", "$$"}, 0},
    {{"$", "$"}, 0},
    {{"\\begin{math}", "\\end{math}"}, 0},
    {{"\\[", "\\]"}, 0},
    {{"\\begin{displaymath}", "\\end{displaymath}"}, 0},
    {{"\\begin{equation}", "\\end{equation}"}, 0},
    {{"\\begin{equation*}", "\\end{equation*}"}, 0},
    // references and labels
    {{"\\cite", NULL}, 1},
    {{"\\nocite", NULL}, 1},
    {{"\\index", NULL}, 1},
    {{"\\label", NULL}, 1},
    {{"\\ref", NULL}, 1},
    {{"\\pageref", NULL}, 1},
    {{"\\autoref", NULL}, 1},
    {{"\\parbox", NULL}, 1},
    // verbatim text
    {{"\\begin{verbatim}", "\\end{verbatim}"}, 0},
    {{"\\verb+", "+"}, 0},
    {{"\\verb|", "|"}, 0},
    {{"\\verb#", "#"}, 0},
    {{"\\verb*", "*"}, 0},
    // preamble
    {{"\\documentstyle", "\\begin{document}"}, 0},
    {{"\\documentclass", "\\begin{document}"}, 0},
    //	{ { "\\documentclass", NULL } , 1 },
    {{"\\usepackage", NULL}, 1},
    {{"\\includeonly", NULL}, 1},
    {{"\\include", NULL}, 1},
    {{"\\input", NULL}, 1},
    // lengths and spacing
    {{"\\vspace", NULL}, 1},
    {{"\\setlength", NULL}, 2},
    {{"\\addtolength", NULL}, 2},
    {{"\\settowidth", NULL}, 2},
    {{"\\rule", NULL}, 2},
    {{"\\hspace", NULL}, 1},
    {{"\\\\[", "]"}, 0},
    {{"\\pagebreak[", "]"}, 0},
    {{"\\nopagebreak[", "]"}, 0},
    {{"\\enlargethispage", NULL}, 1},
    // environments, bibliography, page layout
    {{"\\begin{tabular}", NULL}, 1},
    {{"\\addcontentsline", NULL}, 2},
    {{"\\begin{thebibliography}", NULL}, 1},
    {{"\\bibliography", NULL}, 1},
    {{"\\bibliographystyle", NULL}, 1},
    {{"\\bibitem", NULL}, 1},
    {{"\\begin", NULL}, 1},
    {{"\\end", NULL}, 1},
    {{"\\pagestyle", NULL}, 1},
    {{"\\pagenumbering", NULL}, 1},
    {{"\\thispagestyle", NULL}, 1},
    // definitions and counters
    {{"\\newtheorem", NULL}, 2},
    {{"\\newcommand", NULL}, 2},
    {{"\\renewcommand", NULL}, 2},
    {{"\\setcounter", NULL}, 2},
    {{"\\addtocounter", NULL}, 1},
    {{"\\stepcounter", NULL}, 1},
    {{"\\selectlanguage", NULL}, 1},
    {{"\\inputencoding", NULL}, 1},
    {{"\\hyphenation", NULL}, 1},
    // colours, graphics, miscellaneous
    {{"\\definecolor", NULL}, 3},
    {{"\\color", NULL}, 1},
    {{"\\textcolor", NULL}, 1},
    {{"\\pagecolor", NULL}, 1},
    {{"\\colorbox", NULL}, 2},
    {{"\\fcolorbox", NULL}, 2},
    {{"\\declaregraphicsextensions", NULL}, 1},
    {{"\\psfig", NULL}, 1},
    {{"\\url", NULL}, 1},
    {{"\\eqref", NULL}, 1},
    {{"\\vskip", NULL}, 1},
    {{"\\vglue", NULL}, 1},
    {{"\'\'", NULL}, 1}};

// Number of entries in PATTERN.
#define PATTERN_LEN (sizeof(PATTERN) / sizeof(PATTERN[0]))
130 
LaTeXParser(const char * wordchars)131 LaTeXParser::LaTeXParser(const char* wordchars)
132     : TextParser(wordchars)
133     , pattern_num(0), depth(0), arg(0), opt(0) {
134 }
135 
LaTeXParser(const w_char * wordchars,int len)136 LaTeXParser::LaTeXParser(const w_char* wordchars, int len)
137     : TextParser(wordchars, len)
138     , pattern_num(0), depth(0), arg(0), opt(0) {
139 }
140 
~LaTeXParser()141 LaTeXParser::~LaTeXParser() {}
142 
look_pattern(int col)143 int LaTeXParser::look_pattern(int col) {
144   for (unsigned int i = 0; i < PATTERN_LEN; i++) {
145     const char* j = line[actual].c_str() + head;
146     const char* k = PATTERN[i].pat[col];
147     if (!k)
148       continue;
149     while ((*k != '\0') && (tolower(*j) == *k)) {
150       j++;
151       k++;
152     }
153     if (*k == '\0')
154       return i;
155   }
156   return -1;
157 }
158 
/*
 * LaTeXParser
 *
 * state 0: not wordchar
 * state 1: wordchar
 * state 2: skipped region that ends at a closing pattern (math, verbatim, ...)
 * state 3: commands
 * state 4: commands with arguments
 * state 5: % comment
 *
 */
170 
next_token(std::string & t)171 bool LaTeXParser::next_token(std::string& t) {
172   t.clear();
173   int i;
174   int slash = 0;
175   int apostrophe;
176   for (;;) {
177     // fprintf(stderr,"depth: %d, state: %d, , arg: %d, token:
178     // %s\n",depth,state,arg,line[actual]+head);
179 
180     switch (state) {
181       case 0:  // non word chars
182         if ((pattern_num = look_pattern(0)) != -1) {
183           if (PATTERN[pattern_num].pat[1]) {
184             state = 2;
185           } else {
186             state = 4;
187             depth = 0;
188             arg = 0;
189             opt = 1;
190           }
191           head += strlen(PATTERN[pattern_num].pat[0]) - 1;
192         } else if (line[actual][head] == '%') {
193           state = 5;
194         } else if (is_wordchar(line[actual].c_str() + head)) {
195           state = 1;
196           token = head;
197         } else if (line[actual][head] == '\\') {
198           if (line[actual][head + 1] == '\\' ||   // \\ (linebreak)
199               (line[actual][head + 1] == '$') ||  // \$ (dollar sign)
200               (line[actual][head + 1] == '%')) {  // \% (percent)
201             head++;
202             break;
203           }
204           state = 3;
205         }
206         break;
207       case 1:  // wordchar
208         apostrophe = 0;
209         if ((is_wordchar((char*)APOSTROPHE) ||
210              (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
211             !line[actual].empty() && line[actual][head] == '\'' &&
212             is_wordchar(line[actual].c_str() + head + 1)) {
213           head++;
214         } else if (is_utf8() &&
215                    is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
216                                                       // to the WORDCHARS, if
217                                                       // needed
218                    strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
219                    0 &&
220                    is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
221           head += strlen(UTF8_APOS) - 1;
222         } else if (!is_wordchar(line[actual].c_str() + head) ||
223             (line[actual][head] == '\'' && line[actual][head + 1] == '\'' &&
224              ++apostrophe)) {
225           state = 0;
226           bool ok = alloc_token(token, &head, t);
227           if (apostrophe)
228             head += 2;
229           if (ok)
230             return true;
231         }
232         break;
233       case 2:  // comment, labels, etc
234         if (((i = look_pattern(1)) != -1) &&
235             (strcmp(PATTERN[i].pat[1], PATTERN[pattern_num].pat[1]) == 0)) {
236           state = 0;
237           head += strlen(PATTERN[pattern_num].pat[1]) - 1;
238         }
239         break;
240       case 3:  // command
241         if ((tolower(line[actual][head]) < 'a') ||
242             (tolower(line[actual][head]) > 'z')) {
243           state = 0;
244           head--;
245         }
246         break;
247       case 4:  // command with arguments
248         if (slash && (line[actual][head] != '\0')) {
249           slash = 0;
250           head++;
251           break;
252         } else if (line[actual][head] == '\\') {
253           slash = 1;
254         } else if ((line[actual][head] == '{') ||
255                    ((opt) && (line[actual][head] == '['))) {
256           depth++;
257           opt = 0;
258         } else if (line[actual][head] == '}') {
259           depth--;
260           if (depth == 0) {
261             opt = 1;
262             arg++;
263           }
264           if (((depth == 0) && (arg == PATTERN[pattern_num].arg)) ||
265               (depth < 0)) {
266             state = 0;  // XXX not handles the last optional arg.
267           }
268         } else if (line[actual][head] == ']')
269           depth--;
270     }  // case
271     if (next_char(line[actual].c_str(), &head)) {
272       if (state == 5)
273         state = 0;
274       return false;
275     }
276   }
277 }
278