1
2 /******************************************************************************
3 * MODULE : tree_correct.cpp
4 * DESCRIPTION: make a tree syntactically match a drd
5 * COPYRIGHT : (C) 2005 Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11
12 #include "tree_correct.hpp"
13 #include "tree_analyze.hpp"
14 #include "scheme.hpp"
15 #include "packrat.hpp"
16
17 /******************************************************************************
18 * DRD based correction
19 ******************************************************************************/
20
21 tree
drd_correct(drd_info drd,tree t)22 drd_correct (drd_info drd, tree t) {
23 if (is_atomic (t)) return t;
24 else {
25 int i, n= N(t);
26 if (drd->contains (as_string (L(t))) &&
27 !drd->correct_arity (L(t), n))
28 return "";
29 tree r (t, n);
30 for (i=0; i<n; i++)
31 r[i]= drd_correct (drd, t[i]);
32 return r;
33 }
34 }
35
36 /******************************************************************************
37 * Add missing explicit document tags when necessary
38 ******************************************************************************/
39
40 static bool
is_block_pseudo_code(tree t)41 is_block_pseudo_code (tree t) {
42 if (!is_compound (t) || N(t) == 0) return false;
43 string s= as_string (L(t));
44 if (!starts (s, "algo-")) return false;
45 return
46 s == "algo-procedure" ||
47 s == "algo-function" ||
48 s == "algo-for" ||
49 s == "algo-forall" ||
50 s == "algo-foreach" ||
51 s == "algo-while" ||
52 s == "algo-repeat" ||
53 s == "algo-loop" ||
54 s == "algo-body" ||
55 s == "algo-begin" ||
56 s == "algo-inputs" ||
57 s == "algo-outputs" ||
58 s == "algo-if" ||
59 s == "algo-else-if" ||
60 s == "algo-else";
61 }
62
63 tree
correct_missing_block(tree t)64 correct_missing_block (tree t) {
65 if (is_atomic (t)) return t;
66 int i, n= N(t);
67 tree r (t, n);
68 for (i=0; i<n; i++)
69 r[i]= correct_missing_block (t[i]);
70 t= r;
71 if (is_block_pseudo_code (t))
72 if (!is_document (t[N(t)-1]))
73 t[N(t)-1]= tree (DOCUMENT, t[N(t)-1]);
74 return t;
75 }
76
77 /******************************************************************************
78 * Correct incorrectly concatenated block content
79 ******************************************************************************/
80
81 static tree
to_concat(tree t)82 to_concat (tree t) {
83 if (t == "") return tree (CONCAT);
84 else if (is_concat (t)) return t;
85 else return tree (CONCAT, t);
86 }
87
88 static tree
from_concat(tree t)89 from_concat (tree t) {
90 if (N(t) == 0) return "";
91 else if (N(t) == 1) return t[0];
92 else return t;
93 }
94
95 static bool
is_migratable(tree t)96 is_migratable (tree t) {
97 return
98 t == " " || is_compound (t, "mlx", 2) ||
99 is_func (t, VAR_VSPACE) || is_func (t, VSPACE) ||
100 is_func (t, VAR_PAGE_BREAK) || is_func (t, PAGE_BREAK) ||
101 is_func (t, VAR_NO_PAGE_BREAK) || is_func (t, NO_PAGE_BREAK) ||
102 is_func (t, VAR_NEW_PAGE) || is_func (t, NEW_PAGE) ||
103 is_func (t, VAR_NEW_DPAGE) || is_func (t, NEW_DPAGE);
104 }
105
106 bool expand_needs_surrounding (string s); // defined in upgrade.cpp
107
108 static bool
is_basic_environment(tree t)109 is_basic_environment (tree t) {
110 return // TODO: to be completed using DRD properties
111 (is_compound (t) && N(t) == 1 &&
112 expand_needs_surrounding (as_string (L(t)))) ||
113 is_compound (t, "equation", 1) || is_compound (t, "equation*", 1);
114 }
115
116 static tree
migrate_surround(tree bef,tree aft,tree body)117 migrate_surround (tree bef, tree aft, tree body) {
118 if (is_document (body) && N(body) > 0) {
119 if (N(body) == 1)
120 return tree (DOCUMENT, migrate_surround (bef, aft, body[0]));
121 tree first (DOCUMENT, migrate_surround (bef, "", body[0]));
122 tree last (DOCUMENT, migrate_surround ("", aft, body[N(body)-1]));
123 return first * body (1, N(body)-1) * last;
124 }
125 else
126 return from_concat (to_concat (bef) * to_concat (body) * to_concat (aft));
127 }
128
129 static tree
make_surround(tree bef,tree aft,tree t)130 make_surround (tree bef, tree aft, tree t) {
131 if (bef == "" && aft == "") return t;
132 else if (N(t) == 1 && is_basic_environment (t))
133 return tree (L(t), migrate_surround (bef, aft, t[0]));
134 return tree (SURROUND, bef, aft, t);
135 }
136
137 tree
correct_concat_block(tree t)138 correct_concat_block (tree t) {
139 if (is_atomic (t)) return t;
140 int i, n= N(t);
141 tree r (t, n);
142 for (i=0; i<n; i++)
143 r[i]= correct_concat_block (t[i]);
144 t= r;
145 if (is_concat (t)) {
146 tree doc (DOCUMENT);
147 int start= 0;
148 for (i=0; i<n; )
149 if (is_multi_paragraph (t[i])) {
150 int bef_i= i;
151 while (bef_i > start && is_migratable (t[bef_i-1])) bef_i--;
152 int aft_i= i+1;
153 while (aft_i < n && is_migratable (t[aft_i])) aft_i++;
154 if (start < bef_i) doc << from_concat (t (start, bef_i));
155 tree bef= from_concat (t (bef_i, i));
156 tree aft= from_concat (t (i+1, aft_i));
157 doc << make_surround (bef, aft, t[i]);
158 i= aft_i;
159 start= i;
160 }
161 else i++;
162 if (start < n) doc << from_concat (t (start, n));
163 if (N(doc) == 1) return doc[0];
164 else return doc;
165 }
166 else if (is_document (t)) {
167 tree doc (DOCUMENT);
168 for (i=0; i<n; i++)
169 if (is_document (t[i])) doc << A(t[i]);
170 else doc << t[i];
171 return doc;
172 }
173 else if (N(t) == 1 && is_basic_environment (t) && !is_document (t[0]))
174 return tree (L(t), tree (DOCUMENT, t[0]));
175 else return t;
176 }
177
178 /******************************************************************************
179 * Correct WITHs or WITH-like macros
180 ******************************************************************************/
181
182 tree
with_correct(tree t)183 with_correct (tree t) {
184 if (is_atomic (t)) return t;
185 else {
186 //cout << "Correcting " << t << LF << INDENT;
187 tree u (t, N(t));
188 for (int k=0; k<N(t); k++)
189 u[k]= with_correct (t[k]);
190 array<tree> a= concat_decompose (u);
191 int i, n= N(a);
192 array<tree> r;
193 for (i=0; i<n; i++) {
194 if (is_with_like (a[i])) {
195 array<tree> b= with_decompose (a[i], with_body (a[i]));
196 int p= N(b), k1, k2;
197 for (k1=0; k1<p ; k1++)
198 if (is_with_like (b[k1]) && with_similar_type (a[i], b[k1]));
199 else break;
200 for (k2=p; k2>k1; k2--)
201 if (is_with_like (b[k2-1]) && with_similar_type (a[i], b[k2-1]));
202 else break;
203 array<tree> x;
204 if (0 < k1) x << range (b, 0, k1);
205 if (k1 < k2) x << with_recompose (a[i], range (b, k1, k2));
206 if (k2 < p ) x << range (b, k2, p);
207 if (N(x) == 0) continue;
208 if (N(r) != 0 &&
209 is_with_like (r[N(r)-1]) &&
210 with_same_type (r[N(r)-1], x[0]))
211 {
212 array<tree> c= concat_decompose (with_body (r[N(r)-1]));
213 c << concat_decompose (with_body (x[0]));
214 r[N(r)-1]= with_recompose (x[0], c);
215 r << range (x, 1, N(x));
216 }
217 else r << x;
218 }
219 else r << a[i];
220 }
221 //cout << UNINDENT << "Corrected " << t << " -> "
222 //<< concat_recompose (r) << LF;
223 return concat_recompose (r);
224 }
225 }
226
227 static tree
superfluous_with_correct(tree t,tree env)228 superfluous_with_correct (tree t, tree env) {
229 if (is_atomic (t)) return t;
230 else {
231 //cout << "Superfluous correcting " << t << ", " << env << LF;
232 if (is_compound (t, "body", 1))
233 return compound ("body", superfluous_with_correct (t[0], env));
234 if (is_func (t, WITH) && ((N(t) & 1) == 0))
235 t= t * tree (WITH, "");
236 tree r (t, N(t));
237 for (int i=0; i<N(t); i++)
238 r[i]= superfluous_with_correct
239 (t[i], the_drd->get_env_child (t, i, env));
240 if (is_compound (r, "math", 1) && r[0] == "") return "";
241 else if (is_compound (r, "text", 1) && r[0] == "") return "";
242 else if (is_compound (r, "math", 1) && drd_env_read (env, MODE) == "math")
243 return r[0];
244 else if (is_compound (r, "text", 1) && drd_env_read (env, MODE) == "text")
245 return r[0];
246 else if (is_func (r, WITH)) {
247 for (int i=0; i+1<N(r); i+=2)
248 if (!is_atomic (r[i])) return r;
249 else if (drd_env_read (env, r[i]->label) != r[i+1]) return r;
250 return r[N(r)-1];
251 }
252 else if (is_func (r, CONCAT)) {
253 array<tree> a= concat_decompose (r);
254 return concat_recompose (a);
255 }
256 return r;
257 }
258 }
259
260 tree
superfluous_with_correct(tree t)261 superfluous_with_correct (tree t) {
262 with_drd drd (get_document_drd (t));
263 return superfluous_with_correct (t, tree (WITH, MODE, "text"));
264 }
265
266 /******************************************************************************
267 * Replace symbols by appropriate homoglyphs
268 ******************************************************************************/
269
270 static array<tree>
homoglyph_correct(array<tree> a)271 homoglyph_correct (array<tree> a) {
272 array<int> tp= symbol_types (a);
273 array<tree> r;
274 //cout << a << ", " << tp << "\n";
275 for (int i=0; i<N(a); i++)
276 if (a[i] == "<minus>") r << tree ("-");
277 else if (a[i] == "\\" || a[i] == "<backslash>") {
278 int j1, j2;
279 for (j1= i-1; j1>=0; j1--)
280 if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
281 for (j2= i+1; j2<N(a); j2++)
282 if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
283 if (j1 < 0 || j2 >= N(a));
284 else if ((a[i] == "\\" ||
285 a[i] == "<backslash>") &&
286 ((tp[j1] == SYMBOL_BASIC) ||
287 (tp[j1] == SYMBOL_POSTFIX)) &&
288 ((tp[j2] == SYMBOL_BASIC) ||
289 (tp[j2] == SYMBOL_PREFIX)))
290 r << tree ("<setminus>");
291 else r << a[i];
292 }
293 else if (is_func (a[i], NEG, 1) && is_atomic (a[i][0])) {
294 string s= a[i][0]->label;
295 if (s == "=") r << tree ("<neq>");
296 else if (s == "<less>") r << tree ("<nless>");
297 else if (s == "<gtr>") r << tree ("<ngtr>");
298 else if (s == "<leq>") r << tree ("<nleq>");
299 else if (s == "<geq>") r << tree ("<ngeq>");
300 else if (s == "<leqslant>") r << tree ("<nleqslant>");
301 else if (s == "<geqslant>") r << tree ("<ngeqslant>");
302 else if (s == "<prec>") r << tree ("<nprec>");
303 else if (s == "<succ>") r << tree ("<nsucc>");
304 else if (s == "<preceq>") r << tree ("<npreceq>");
305 else if (s == "<succeq>") r << tree ("<nsucceq>");
306 else if (s == "<preccurlyeq>") r << tree ("<npreccurlyeq>");
307 else if (s == "<succcurlyeq>") r << tree ("<nsucccurlyeq>");
308 else if (s == "<rightarrow>") r << tree ("<nrightarrow>");
309 else if (s == "<Rightarrow>") r << tree ("<nRightarrow>");
310 else if (s == "<leftarrow>") r << tree ("<nleftarrow>");
311 else if (s == "<Leftarrow>") r << tree ("<nLeftarrow>");
312 else if (s == "<leftrightarrow>") r << tree ("<nleftrightarrow>");
313 else if (s == "<Leftrightarrow>") r << tree ("<nLeftrightarrow>");
314 else if (s == "<equiv>") r << tree ("<nequiv>");
315 else if (s == "<sim>") r << tree ("<nsim>");
316 else if (s == "<simeq>") r << tree ("<nsimeq>");
317 else if (s == "<approx>") r << tree ("<napprox>");
318 else if (s == "<cong>") r << tree ("<ncong>");
319 else if (s == "<asymp>") r << tree ("<nasymp>");
320 else if (s == "<in>") r << tree ("<nin>");
321 else if (s == "<ni>") r << tree ("<nni>");
322 else if (s == "<subset>") r << tree ("<nsubset>");
323 else if (s == "<supset>") r << tree ("<nsupset>");
324 else if (s == "<subseteq>") r << tree ("<nsubseteq>");
325 else if (s == "<supseteq>") r << tree ("<nsupseteq>");
326 else if (s == "<sqsubset>") r << tree ("<nsqsubset>");
327 else if (s == "<sqsupset>") r << tree ("<nsqsupset>");
328 else if (s == "<sqsubseteq>") r << tree ("<nsqsubseteq>");
329 else if (s == "<sqsupseteq>") r << tree ("<nsqsupseteq>");
330 else if (s == "<leadsto>") r << tree ("<nleadsto>");
331 else r << a[i];
332 }
333 else if (a[i] == ":" && i+1 < N(a) && a[i+1] == "=") {
334 r << tree ("<assign>");
335 i++;
336 }
337 else r << a[i];
338 return r;
339 }
340
341 static string
get_submode(tree t,int i,string mode)342 get_submode (tree t, int i, string mode) {
343 if (is_func (t, WITH) && i == N(t)-1)
344 for (int j=0; j<N(t)-1; j+=2)
345 if (t[j] == MATH_FONT_FAMILY) return "text";
346 tree tmode= the_drd->get_env_child (t, i, MODE, mode);
347 return (is_atomic (tmode)? tmode->label: string ("text"));
348 }
349
350 static tree
homoglyph_correct(tree t,string mode)351 homoglyph_correct (tree t, string mode) {
352 //cout << "Correct " << t << ", " << mode << "\n";
353 tree r= t;
354 if (is_compound (t)) {
355 int i, n= N(t);
356 r= tree (t, n);
357 for (i=0; i<n; i++) {
358 string smode= get_submode (t, i, mode);
359 if (is_correctable_child (t, i))
360 r[i]= homoglyph_correct (t[i], smode);
361 else r[i]= t[i];
362 }
363 }
364
365 if (mode == "math") {
366 array<tree> a= concat_tokenize (r);
367 a= homoglyph_correct (a);
368 tree ret= concat_recompose (a);
369 //if (ret != r) cout << "< " << r << " >" << LF
370 //<< "> " << ret << " <" << LF;
371 return ret;
372 }
373 else return r;
374 }
375
376 tree
homoglyph_correct(tree t)377 homoglyph_correct (tree t) {
378 with_drd drd (get_document_drd (t));
379 return homoglyph_correct (t, "text");
380 }
381
382 /******************************************************************************
383 * Remove incorrect spaces and multiplications
384 ******************************************************************************/
385
386 static array<tree>
superfluous_invisible_correct(array<tree> a)387 superfluous_invisible_correct (array<tree> a) {
388 array<int> tp= symbol_types (a);
389 array<tree> r;
390 //cout << a << ", " << tp << "\n";
391 for (int i=0; i<N(a); i++)
392 if (a[i] == " " || a[i] == "*") {
393 int j1, j2;
394 for (j1= i-1; j1>=0; j1--)
395 if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
396 else if (a[j1] == " ") break;
397 for (j2= i+1; j2<N(a); j2++)
398 if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT)
399 if (a[j2] != " " && a[j2] != "*") break;
400 //cout << " " << i << ": " << j1 << ", " << j2
401 //<< "; " << tp[j1] << ", " << tp[j2] << "\n";
402 if (j1 < 0 || j2 >= N(a));
403 else if (a[j1] == " " || a[j1] == "*");
404 else if (tp[j1] == SYMBOL_PREFIX ||
405 tp[j1] == SYMBOL_INFIX ||
406 tp[j1] == SYMBOL_SEPARATOR ||
407 tp[j1] == SYMBOL_PROBABLE_MIDDLE);
408 else if (tp[j2] == SYMBOL_POSTFIX ||
409 tp[j2] == SYMBOL_INFIX ||
410 tp[j2] == SYMBOL_SEPARATOR ||
411 tp[j2] == SYMBOL_PROBABLE_MIDDLE);
412 else r << a[i];
413 }
414 else if (is_func (a[i], SQRT, 2) && a[i][1] == "")
415 r << tree (SQRT, a[i][0]);
416 else if (is_script (a[i]) && a[i][0] == "")
417 r << tree (L(a[i]), "<nosymbol>");
418 else r << a[i];
419 return r;
420 }
421
422 static tree
superfluous_invisible_correct(tree t,string mode)423 superfluous_invisible_correct (tree t, string mode) {
424 //cout << "Correct " << t << ", " << mode << "\n";
425 tree r= t;
426 if (is_compound (t)) {
427 int i, n= N(t);
428 r= tree (t, n);
429 for (i=0; i<n; i++) {
430 string smode= get_submode (t, i, mode);
431 //cout << " " << i << ": " << is_correctable_child (t, i)
432 //<< ", " << smode << "\n";
433 if (is_func (t, WITH) && i != N(t)-1)
434 r[i]= t[i];
435 else if (is_correctable_child (t, i))
436 r[i]= superfluous_invisible_correct (t[i], smode);
437 else r[i]= t[i];
438 }
439 }
440
441 if (is_func (r, CONCAT)) {
442 bool ok= true;
443 int i, found= -1;
444 for (i=0; i<N(r); i++)
445 if (is_compound (r[i], "hide-preamble") ||
446 is_compound (r[i], "show-preamble"))
447 {
448 ok= (found == -1);
449 found= i;
450 }
451 else if (!is_atomic (r[i])) ok= false;
452 else {
453 string s= r[i]->label;
454 for (int j=0; j<N(s); j++)
455 if (s[j] != ' ') ok= false;
456 }
457 if (ok) r= r[found];
458 }
459
460 if (is_func (r, INACTIVE, 1) && is_func (r[0], RIGID))
461 return r[0];
462 else if (mode == "math") {
463 array<tree> a= concat_tokenize (r);
464 a= superfluous_invisible_correct (a);
465 tree ret= concat_recompose (a);
466 //if (ret != r) cout << "< " << r << " >" << LF
467 //<< "> " << ret << " <" << LF;
468 return ret;
469 }
470 else return r;
471 }
472
473 tree
superfluous_invisible_correct(tree t)474 superfluous_invisible_correct (tree t) {
475 with_drd drd (get_document_drd (t));
476 return superfluous_invisible_correct (t, "text");
477 }
478
479 /******************************************************************************
480 * Insert missing multiplications or function applications
481 ******************************************************************************/
482
483 #define SURE_NOTHING 0
484 #define SURE_TIMES 1
485 #define SURE_SPACE 2
486 #define PROBABLE_TIMES 3
487 #define PROBABLE_SPACE 4
488 #define BOTH_WAYS 5
489
490 struct invisible_corrector {
491 int force;
492 hashmap<string,int> times_before;
493 hashmap<string,int> times_after;
494 hashmap<string,int> space_before;
495 hashmap<string,int> space_after;
496
497 protected:
498 bool is_letter_like (string s);
499 bool contains_infix (tree t);
500 bool contains_plus_like (tree t);
501 void count_invisible (array<tree> a);
502 void count_invisible (tree t, string mode);
503 int get_status (tree t, bool left, bool script_flag);
504 array<tree> correct (array<tree> a);
505
506 public:
invisible_correctorinvisible_corrector507 inline invisible_corrector (tree t, int force2):
508 force (force2), times_before (0), times_after (0), space_after (0) {
509 count_invisible (t, "text"); }
510 tree correct (tree t, string mode);
511 };
512
513 bool
is_letter_like(string s)514 invisible_corrector::is_letter_like (string s) {
515 static language lan= math_language ("std-math");
516 if (s != "" && is_iso_alpha (s)) return true;
517 return lan->get_group (s) == "Letter-symbol";
518 }
519
520 bool
contains_infix(tree t)521 invisible_corrector::contains_infix (tree t) {
522 array<int> tp= symbol_types (concat_tokenize (t));
523 for (int i=0; i<N(tp); i++)
524 if (tp[i] == SYMBOL_INFIX)
525 return true;
526 return false;
527 }
528
529 bool
contains_plus_like(tree t)530 invisible_corrector::contains_plus_like (tree t) {
531 array<tree> a= concat_tokenize (t);
532 for (int i=1; i<N(a)-1; i++)
533 if (a[i] == "+" || a[i] == "-")
534 return true;
535 return false;
536 }
537
538 void
count_invisible(array<tree> a)539 invisible_corrector::count_invisible (array<tree> a) {
540 array<int> tp= symbol_types (a);
541 for (int i=0; i<N(a); i++)
542 if (is_atomic (a[i]) && is_letter_like (a[i]->label)) {
543 int j1, j2;
544 for (j1= i-1; j1>=0; j1--)
545 if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
546 else if (a[j1] == " ") break;
547 for (j2= i+1; j2<N(a); j2++)
548 if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
549 else if (a[j2] == " ") break;
550 string s= a[i]->label;
551 if (j1 >= 0) {
552 if (a[j1] == "*")
553 times_before (s)= times_before[s] + 1;
554 if (a[j1] == " ")
555 space_before (s)= space_before[s] + 1;
556 }
557 if (j2 < N(a)) {
558 if (a[j2] == "*")
559 times_after (s)= times_after[s] + 1;
560 if (a[j2] == " ")
561 space_after (s)= space_after[s] + 1;
562 // NOTE: this heuristic might not be a good idea,
563 // because it inhibits the correction of QR -> Q*R,
564 // if Q is a polynomial which is applied somewhere Q(1).
565 // We might introduce a table 'apply_after'.
566 //if (is_around (a[j2]) && a[j2][0] == "(" &&
567 //!contains_infix (a[j2][1]))
568 //space_after (s)= space_after[s] + 1;
569 }
570 }
571 }
572
573 void
count_invisible(tree t,string mode)574 invisible_corrector::count_invisible (tree t, string mode) {
575 if (is_compound (t)) {
576 int i, n= N(t);
577 for (i=0; i<n; i++) {
578 string smode= get_submode (t, i, mode);
579 if (is_func (t, WITH) && i != N(t)-1);
580 else if (is_correctable_child (t, i))
581 count_invisible (t[i], smode);
582 }
583 }
584 if (mode == "math")
585 count_invisible (concat_tokenize (t));
586 }
587
588 int
get_status(tree t,bool left,bool script_flag)589 invisible_corrector::get_status (tree t, bool left, bool script_flag) {
590 if (is_atomic (t)) {
591 static language lan= math_language ("std-math");
592 string s= t->label;
593 string g= lan->get_group (t->label);
594 if (is_numeric (s))
595 return (left? SURE_TIMES: PROBABLE_TIMES);
596 else if (starts (g, "Unary-operator-textual"))
597 return (left? SURE_SPACE: BOTH_WAYS);
598 else if (starts (g, "Binary-operator"))
599 return SURE_SPACE;
600 else if (starts (g, "N-ary-operator"))
601 return (left? SURE_SPACE: BOTH_WAYS);
602 else if (is_letter_like (s)) {
603 if (left) {
604 if (times_after[s] > 0 && space_after[s] == 0)
605 return SURE_TIMES;
606 else if (space_after[s] > 0 && times_after[s] == 0)
607 return SURE_SPACE;
608 else if (times_after[s] > space_after[s])
609 return PROBABLE_TIMES;
610 else if (space_after[s] > times_after[s])
611 return PROBABLE_SPACE;
612 else if (N(s)>1 && is_iso_alpha (s))
613 return PROBABLE_SPACE;
614 else if (script_flag)
615 return PROBABLE_TIMES;
616 else return BOTH_WAYS;
617 }
618 else {
619 if (times_before[s] > space_before[s])
620 return PROBABLE_TIMES;
621 else if (times_after[s] > 0 && space_after[s] == 0)
622 return PROBABLE_TIMES;
623 else if (script_flag && (N(s) == 1 || !is_iso_alpha (s)))
624 return PROBABLE_TIMES;
625 else return BOTH_WAYS;
626 }
627 }
628 else if (s == "<cdots>" || s == "<ldots>")
629 return PROBABLE_TIMES;
630 else return ((force > 0)? BOTH_WAYS: SURE_NOTHING);
631 }
632 else {
633 if (is_around (t)) {
634 if (left && contains_plus_like (t[1]))
635 return ((force > 0)? SURE_TIMES: PROBABLE_TIMES);
636 else if (contains_plus_like (t[1]))
637 return ((force > 0)? PROBABLE_TIMES: BOTH_WAYS);
638 else if (!contains_infix (t[1]))
639 return (left? BOTH_WAYS: SURE_SPACE);
640 else return BOTH_WAYS;
641 }
642 else if (is_func (t, FRAC) ||
643 is_func (t, SQRT))
644 return (left? SURE_TIMES: BOTH_WAYS);
645 else if (!left && is_func (t, BIG_AROUND, 2) &&
646 (t[0] == "<sum>" || t[0] == "<amalg>" ||
647 t[0] == "<oplus>" || t[0] == "<uplus>" ||
648 t[0] == "<int>" || t[0] == "<oint>" ||
649 t[0] == "<intlim>" || t[0] == "<ointlim>" ||
650 t[0] == "<prod>" || t[0] == "<odot>" || t[0] == "<otimes>"))
651 return PROBABLE_TIMES;
652 else if (is_func (t, WIDE, 2))
653 return get_status (t[0], left, script_flag);
654 else if (is_func (t, WITH))
655 return get_status (t[N(t)-1], left, script_flag);
656 else if (N(t) == 0 && L(t) >= START_EXTENSIONS) {
657 tree def= the_drd->get_syntax (L(t));
658 if (is_func (def, MACRO, 1))
659 return get_status (def[0], left, script_flag);
660 else return SURE_NOTHING;
661 }
662 else return SURE_NOTHING;
663 }
664 }
665
666 static bool
admits_script(array<int> tp,int i)667 admits_script (array<int> tp, int i) {
668 i++;
669 while (i<N(tp))
670 if (tp[i] == SYMBOL_SCRIPT) return true;
671 else if (tp[i] == SYMBOL_SKIP) i++;
672 else return false;
673 return false;
674 }
675
676 array<tree>
correct(array<tree> a)677 invisible_corrector::correct (array<tree> a) {
678 //cout << "Correct " << a << "\n";
679 array<tree> r;
680 array<int> tp= symbol_types (a);
681 for (int i=0; i<N(a); i++) {
682 r << a[i];
683 if (a[i] != " " && tp[i] == SYMBOL_BASIC) {
684 int j;
685 for (j= i+1; j<N(a); j++)
686 if (tp[j] != SYMBOL_SKIP && tp[j] != SYMBOL_SCRIPT) break;
687 else if (a[j] == " ") break;
688 if (j >= N(a) || a[j] == " " || tp[j] != SYMBOL_BASIC)
689 continue;
690
691 string ins= "";
692 int sti= get_status (a[i], true, admits_script (tp, i));
693 int stj= get_status (a[j], false, admits_script (tp, j));
694 //cout << "Pair (" << a[i] << ", " << a[j] << ")"
695 //<< " -> (" << sti << ", " << stj << ")" << LF;
696 if (sti == SURE_NOTHING || stj == SURE_NOTHING)
697 ins= "";
698 else if (sti == SURE_TIMES && stj != SURE_SPACE)
699 ins= "*";
700 else if (sti == SURE_SPACE && stj != SURE_TIMES)
701 ins= " ";
702 else if (sti == PROBABLE_TIMES && stj == PROBABLE_TIMES)
703 ins= "*";
704 else if (sti == PROBABLE_SPACE && stj == PROBABLE_SPACE)
705 ins= " ";
706 else if (sti == PROBABLE_TIMES && stj == BOTH_WAYS)
707 ins= "*";
708 else if (sti == PROBABLE_SPACE && stj == BOTH_WAYS)
709 ins= " ";
710 else if (sti == BOTH_WAYS && stj == PROBABLE_TIMES)
711 ins= "*";
712 else if (sti == BOTH_WAYS && stj == PROBABLE_SPACE)
713 ins= " ";
714 else if (sti == BOTH_WAYS && stj == BOTH_WAYS && force == 1 &&
715 (is_atomic (a[i]) || is_around (a[i])) &&
716 (is_atomic (a[j]) || is_around (a[j])))
717 ins= "*";
718
719 if (is_around (a[j]))
720 if (ins == " " || (ins == "*" && force == -1))
721 ins= "";
722 if (a[j] == ".") ins= "";
723 while (i+1 < N(a) && (is_func (a[i+1], RSUB, 1) ||
724 is_func (a[i+1], RSUP, 1) ||
725 is_func (a[i+1], RPRIME, 1))) {
726 i++;
727 r << a[i];
728 }
729 if (ins != "") r << tree (ins);
730 }
731 }
732 return r;
733 }
734
735 tree
correct(tree t,string mode)736 invisible_corrector::correct (tree t, string mode) {
737 //cout << "Correct " << t << ", " << mode << "\n";
738 tree r= t;
739 if (is_compound (t)) {
740 int i, n= N(t);
741 r= tree (t, n);
742 for (i=0; i<n; i++) {
743 string smode= get_submode (t, i, mode);
744 if (is_func (t, WITH) && i != N(t)-1)
745 r[i]= t[i];
746 else if (is_correctable_child (t, i))
747 r[i]= correct (t[i], smode);
748 else r[i]= t[i];
749 }
750 }
751
752 if (mode == "math") {
753 array<tree> a= concat_tokenize (r);
754 a= correct (a);
755 tree ret= concat_recompose (a);
756 //if (ret != r)
757 // cout << "<< " << r << " >>" << LF
758 // << ">> " << ret << " <<" << LF;
759 return ret;
760 }
761 else return r;
762 }
763
764 tree
missing_invisible_correct(tree t,int force)765 missing_invisible_correct (tree t, int force) {
766 // force = -1, only correct when sure, and when old markup is incorrect
767 // force = 0 , only correct when pretty sure
768 // force = 1 , correct whenever reasonable (used for LaTeX import)
769 with_drd drd (get_document_drd (t));
770 invisible_corrector corrector (t, force);
771 //cout << "Times before " << corrector.times_before << "\n";
772 //cout << "Space before " << corrector.space_before << "\n";
773 //cout << "Times after " << corrector.times_after << "\n";
774 //cout << "Space after " << corrector.space_after << "\n";
775 return corrector.correct (t, "text");
776 }
777
778 tree
missing_invisible_correct_twice(tree t,int force=-1)779 missing_invisible_correct_twice (tree t, int force= -1) {
780 tree u= missing_invisible_correct (t, force);
781 if (u == t) return t;
782 return missing_invisible_correct (u, force);
783 }
784
785 /******************************************************************************
786 * Miscellaneous corrections
787 ******************************************************************************/
788
789 tree
misc_math_correct(tree t)790 misc_math_correct (tree t) {
791 if (is_atomic (t)) return t;
792 else if (is_compound (t, "math", 1) && is_func (t[0], RSUB, 1))
793 return tree (RSUB, compound ("math", misc_math_correct (t[0][0])));
794 else if (is_compound (t, "math", 1) && is_func (t[0], RSUP, 1))
795 return tree (RSUP, compound ("math", misc_math_correct (t[0][0])));
796 else if (is_func (t, RSUB, 1) && is_func (t[0], RSUB, 1))
797 return misc_math_correct (t[0]);
798 else if (is_func (t, RSUB, 1) && is_func (t[0], RSUP, 1))
799 return misc_math_correct (tree (RSUB, t[0][0]));
800 else if (is_func (t, RSUP, 1) && is_func (t[0], RSUB, 1))
801 return misc_math_correct (tree (RSUP, t[0][0]));
802 else if (is_func (t, RSUP, 1) && is_func (t[0], RSUP, 1))
803 return misc_math_correct (t[0]);
804 else if (is_func (t, RSUP, 1) && is_func (t[0], RPRIME, 1))
805 return misc_math_correct (t[0]);
806 else if (is_script (t) && is_compound (t[0], "text", 1) &&
807 is_atomic (t[0][0]) && is_alpha (t[0][0]->label))
808 {
809 if (N(t[0][0]->label) != 1) return tree (L(t), t[0][0]);
810 else return tree (L(t), tree (WITH, "math-font-family", "trm",
811 misc_math_correct (t[0])));
812 }
813 else if (is_compound (t, "math", 1)) {
814 tree arg = misc_math_correct (t[0]);
815 tree last= arg;
816 if (is_concat (last) && N(last) > 0) last= last[N(last)-1];
817 if (is_atomic (last) && N(last->label) > 0 &&
818 is_punctuation (last->label [N(last->label)-1]))
819 {
820 string s= last->label;
821 int i= N(s);
822 while (i>0 && is_punctuation (s[i-1])) i--;
823 if (i == N(s)) return compound ("math", arg);
824 string tail= s (i, N(s));
825 s= s (0, i);
826 if (last == arg) {
827 if (N(s) == 0) return tail;
828 else return concat (compound ("math", s), tail);
829 }
830 else {
831 tree cc= arg (0, N(arg) - 1);
832 if (N(s) != 0) cc << tree (s);
833 if (N(cc) == 1) cc= cc[0];
834 return concat (compound ("math", cc), tail);
835 }
836 }
837 else return compound ("math", arg);
838 }
839 else {
840 int i, n= N(t);
841 tree r (t, n);
842 for (i=0; i<n; i++)
843 r[i]= misc_math_correct (t[i]);
844 if (is_concat (r))
845 r= concat_recompose (concat_decompose (r));
846 return r;
847 }
848 }
849
850 /******************************************************************************
851 * Count errors
852 ******************************************************************************/
853
854 static int
count_math_formula_errors(tree t,int mode)855 count_math_formula_errors (tree t, int mode) {
856 if (mode == 1) return 1;
857 if (packrat_correct ("std-math", "Main", t)) return 0;
858 else {
859 if (mode == 2) cout << " ERROR> " << t << "\n";
860 return 1;
861 }
862 }
863
864 static int
count_math_table_errors(tree t,int mode)865 count_math_table_errors (tree t, int mode) {
866 if (is_atomic (t)) return 0;
867 else if (is_func (t, CELL, 1)) {
868 if (t[0] == "" || t[0] == tree (DOCUMENT, "")) return 0;
869 if (mode == 1) return 1;
870 if (packrat_correct ("std-math", "Cell", t[0])) return 0;
871 else {
872 if (mode == 2) cout << " ERROR> " << t << "\n";
873 return 1;
874 }
875 }
876 else {
877 int sum= 0;
878 for (int i=0; i<N(t); i++)
879 sum += count_math_table_errors (t[i], mode);
880 return sum;
881 }
882 }
883
884 int
count_math_errors(tree t,int mode)885 count_math_errors (tree t, int mode) {
886 if (is_atomic (t)) return 0;
887 else {
888 int sum= 0;
889 for (int i=0; i<N(t); i++) {
890 tree cmode= the_drd->get_env_child (t, i, MODE, "text");
891 if (cmode != "math") sum += count_math_errors (t[i], mode);
892 else {
893 tree u= t[i];
894 while (is_func (u, DOCUMENT, 1) ||
895 is_func (u, TFORMAT) ||
896 is_func (u, WITH))
897 u= u[N(u)-1];
898 if (is_func (u, TABLE)) sum += count_math_table_errors (u, mode);
899 else sum += count_math_formula_errors (u, mode);
900 }
901 }
902 return sum;
903 }
904 }
905
906 /******************************************************************************
907 * Print mathematical status
908 ******************************************************************************/
909
910 static int count_formula= 0;
911 static int count_initial_errors= 0;
912 static int count_final_errors= 0;
913
914 static int corrected_with= 0;
915 static int corrected_superfluous_with= 0;
916 static int corrected_brackets= 0;
917 static int corrected_move_brackets= 0;
918 static int corrected_misc= 0;
919 static int corrected_superfluous_invisible= 0;
920 static int corrected_homoglyph= 0;
921 static int corrected_missing_invisible= 0;
922 static int corrected_zealous_invisible= 0;
923
924 void
math_status_cumul_sub(tree t,int & cumul,int & errors)925 math_status_cumul_sub (tree t, int& cumul, int& errors) {
926 int new_errors= count_math_errors (t);
927 cumul += (errors - new_errors);
928 errors= new_errors;
929 }
930
931 void
math_status_cumul(tree t)932 math_status_cumul (tree t) {
933 with_drd drd (get_document_drd (t));
934 if (is_func (t, DOCUMENT))
935 for (int i=0; i<N(t); i++)
936 if (is_compound (t[i], "body", 1)) {
937 t= t[i][0];
938 break;
939 }
940
941 int errors= count_math_errors (t);
942 count_formula += count_math_errors (t, 1);
943 count_initial_errors += errors;
944 t= with_correct (t);
945 math_status_cumul_sub (t, corrected_with, errors);
946 t= superfluous_with_correct (t);
947 math_status_cumul_sub (t, corrected_superfluous_with, errors);
948 t= upgrade_brackets (t);
949 math_status_cumul_sub (t, corrected_brackets, errors);
950 t= move_brackets (t);
951 math_status_cumul_sub (t, corrected_move_brackets, errors);
952 t= misc_math_correct (t);
953 math_status_cumul_sub (t, corrected_misc, errors);
954 t= superfluous_invisible_correct (t);
955 math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
956 t= homoglyph_correct (t);
957 math_status_cumul_sub (t, corrected_homoglyph, errors);
958 t= superfluous_invisible_correct (t);
959 math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
960 t= missing_invisible_correct (t);
961 math_status_cumul_sub (t, corrected_missing_invisible, errors);
962 count_final_errors += errors;
963 //cout << "Errors= " << errors << "\n";
964 //(void) count_math_errors (t, 2);
965 t= missing_invisible_correct (t, 1);
966 math_status_cumul_sub (t, corrected_zealous_invisible, errors);
967 }
968
969 void
math_status_reset()970 math_status_reset () {
971 count_formula= 0;
972 count_initial_errors= 0;
973 count_final_errors= 0;
974 corrected_with= 0;
975 corrected_superfluous_with= 0;
976 corrected_brackets= 0;
977 corrected_move_brackets= 0;
978 corrected_misc= 0;
979 corrected_superfluous_invisible= 0;
980 corrected_homoglyph= 0;
981 corrected_missing_invisible= 0;
982 }
983
984 void
math_status_print()985 math_status_print () {
986 cout << "Formulas : " << count_formula << "\n";
987 cout << "Initial errors : " << count_initial_errors << "\n";
988 cout << "Final errors : " << count_final_errors << "\n";
989 cout << "\n";
990 cout << "With corrected : "
991 << corrected_with << "\n";
992 cout << "Superfluous with corrected : "
993 << corrected_superfluous_with << "\n";
994 cout << "Upgraded brackets : "
995 << corrected_brackets << "\n";
996 cout << "Moved brackets : "
997 << corrected_move_brackets << "\n";
998 cout << "Miscellaneous corrected : "
999 << corrected_misc << "\n";
1000 cout << "Superfluous invisible corrected : "
1001 << corrected_superfluous_invisible << "\n";
1002 cout << "Homoglyphs corrected : "
1003 << corrected_homoglyph << "\n";
1004 cout << "Missing invisible corrected : "
1005 << corrected_missing_invisible << "\n";
1006 cout << "Zealous invisible corrected : "
1007 << corrected_zealous_invisible << "\n";
1008 cout << "\n";
1009 }
1010
1011 /******************************************************************************
1012 * Master routines
1013 ******************************************************************************/
1014
1015 bool
enabled_preference(string s)1016 enabled_preference (string s) {
1017 return call ("get-preference", s) == object ("on");
1018 }
1019
1020 tree
latex_correct(tree t)1021 latex_correct (tree t) {
1022 // NOTE: matching brackets corrected in upgrade_tex
1023 t= misc_math_correct (t);
1024 //if (enabled_preference ("remove superfluous invisible"))
1025 t= superfluous_invisible_correct (t);
1026 //if (enabled_preference ("homoglyph correct"))
1027 t= homoglyph_correct (t);
1028 //if (enabled_preference ("remove superfluous invisible"))
1029 t= superfluous_invisible_correct (t);
1030 //if (enabled_preference ("insert missing invisible"))
1031 t= missing_invisible_correct_twice (t);
1032 //if (enabled_preference ("insert missing invisible"))
1033 t= missing_invisible_correct (t, 1);
1034 t= downgrade_big (t);
1035 t= correct_missing_block (t);
1036 t= correct_concat_block (t);
1037 return t;
1038 }
1039
1040 tree
automatic_correct(tree t,string version)1041 automatic_correct (tree t, string version) {
1042 if (version_inf_eq (version, "1.0.7.9")) {
1043 t= misc_math_correct (t);
1044 if (enabled_preference ("remove superfluous invisible"))
1045 t= superfluous_invisible_correct (t);
1046 if (enabled_preference ("homoglyph correct"))
1047 t= homoglyph_correct (t);
1048 if (enabled_preference ("remove superfluous invisible"))
1049 t= superfluous_invisible_correct (t);
1050 if (enabled_preference ("insert missing invisible"))
1051 t= missing_invisible_correct_twice (t);
1052 if (enabled_preference ("zealous invisible correct"))
1053 t= missing_invisible_correct (t, 1);
1054 }
1055 t= downgrade_big (t);
1056 return t;
1057 }
1058
1059 tree
manual_correct(tree t)1060 manual_correct (tree t) {
1061 t= with_correct (t);
1062 t= superfluous_with_correct (t);
1063 t= upgrade_brackets (t);
1064 t= misc_math_correct (t);
1065 if (enabled_preference ("manual remove superfluous invisible"))
1066 t= superfluous_invisible_correct (t);
1067 if (enabled_preference ("manual homoglyph correct"))
1068 t= homoglyph_correct (t);
1069 if (enabled_preference ("manual remove superfluous invisible"))
1070 t= superfluous_invisible_correct (t);
1071 if (enabled_preference ("manual insert missing invisible"))
1072 t= missing_invisible_correct_twice (t);
1073 if (enabled_preference ("manual zealous invisible correct"))
1074 t= missing_invisible_correct (t, 1);
1075 t= downgrade_big (t);
1076 return t;
1077 }
1078