1 
2 /******************************************************************************
3 * MODULE     : tree_correct.cpp
4 * DESCRIPTION: make a tree syntactically match a drd
5 * COPYRIGHT  : (C) 2005  Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11 
12 #include "tree_correct.hpp"
13 #include "tree_analyze.hpp"
14 #include "scheme.hpp"
15 #include "packrat.hpp"
16 
17 /******************************************************************************
18 * DRD based correction
19 ******************************************************************************/
20 
21 tree
drd_correct(drd_info drd,tree t)22 drd_correct (drd_info drd, tree t) {
23   if (is_atomic (t)) return t;
24   else {
25     int i, n= N(t);
26     if (drd->contains (as_string (L(t))) &&
27 	!drd->correct_arity (L(t), n))
28       return "";
29     tree r (t, n);
30     for (i=0; i<n; i++)
31       r[i]= drd_correct (drd, t[i]);
32     return r;
33   }
34 }
35 
36 /******************************************************************************
37 * Add missing explicit document tags when necessary
38 ******************************************************************************/
39 
40 static bool
is_block_pseudo_code(tree t)41 is_block_pseudo_code (tree t) {
42   if (!is_compound (t) || N(t) == 0) return false;
43   string s= as_string (L(t));
44   if (!starts (s, "algo-")) return false;
45   return
46     s == "algo-procedure" ||
47     s == "algo-function" ||
48     s == "algo-for" ||
49     s == "algo-forall" ||
50     s == "algo-foreach" ||
51     s == "algo-while" ||
52     s == "algo-repeat" ||
53     s == "algo-loop" ||
54     s == "algo-body" ||
55     s == "algo-begin" ||
56     s == "algo-inputs" ||
57     s == "algo-outputs" ||
58     s == "algo-if" ||
59     s == "algo-else-if" ||
60     s == "algo-else";
61 }
62 
63 tree
correct_missing_block(tree t)64 correct_missing_block (tree t) {
65   if (is_atomic (t)) return t;
66   int i, n= N(t);
67   tree r (t, n);
68   for (i=0; i<n; i++)
69     r[i]= correct_missing_block (t[i]);
70   t= r;
71   if (is_block_pseudo_code (t))
72     if (!is_document (t[N(t)-1]))
73       t[N(t)-1]= tree (DOCUMENT, t[N(t)-1]);
74   return t;
75 }
76 
77 /******************************************************************************
78 * Correct incorrectly concatenated block content
79 ******************************************************************************/
80 
81 static tree
to_concat(tree t)82 to_concat (tree t) {
83   if (t == "") return tree (CONCAT);
84   else if (is_concat (t)) return t;
85   else return tree (CONCAT, t);
86 }
87 
88 static tree
from_concat(tree t)89 from_concat (tree t) {
90   if (N(t) == 0) return "";
91   else if (N(t) == 1) return t[0];
92   else return t;
93 }
94 
95 static bool
is_migratable(tree t)96 is_migratable (tree t) {
97   return
98     t == " " || is_compound (t, "mlx", 2) ||
99     is_func (t, VAR_VSPACE) || is_func (t, VSPACE) ||
100     is_func (t, VAR_PAGE_BREAK) || is_func (t, PAGE_BREAK) ||
101     is_func (t, VAR_NO_PAGE_BREAK) || is_func (t, NO_PAGE_BREAK) ||
102     is_func (t, VAR_NEW_PAGE) || is_func (t, NEW_PAGE) ||
103     is_func (t, VAR_NEW_DPAGE) || is_func (t, NEW_DPAGE);
104 }
105 
106 bool expand_needs_surrounding (string s); // defined in upgrade.cpp
107 
108 static bool
is_basic_environment(tree t)109 is_basic_environment (tree t) {
110   return // TODO: to be completed using DRD properties
111     (is_compound (t) && N(t) == 1 &&
112      expand_needs_surrounding (as_string (L(t)))) ||
113     is_compound (t, "equation", 1) || is_compound (t, "equation*", 1);
114 }
115 
116 static tree
migrate_surround(tree bef,tree aft,tree body)117 migrate_surround (tree bef, tree aft, tree body) {
118   if (is_document (body) && N(body) > 0) {
119     if (N(body) == 1)
120       return tree (DOCUMENT, migrate_surround (bef, aft, body[0]));
121     tree first (DOCUMENT, migrate_surround (bef, "", body[0]));
122     tree last  (DOCUMENT, migrate_surround ("", aft, body[N(body)-1]));
123     return first * body (1, N(body)-1) * last;
124   }
125   else
126     return from_concat (to_concat (bef) * to_concat (body) * to_concat (aft));
127 }
128 
129 static tree
make_surround(tree bef,tree aft,tree t)130 make_surround (tree bef, tree aft, tree t) {
131   if (bef == "" && aft == "") return t;
132   else if (N(t) == 1 && is_basic_environment (t))
133     return tree (L(t), migrate_surround (bef, aft, t[0]));
134   return tree (SURROUND, bef, aft, t);
135 }
136 
137 tree
correct_concat_block(tree t)138 correct_concat_block (tree t) {
139   if (is_atomic (t)) return t;
140   int i, n= N(t);
141   tree r (t, n);
142   for (i=0; i<n; i++)
143     r[i]= correct_concat_block (t[i]);
144   t= r;
145   if (is_concat (t)) {
146     tree doc (DOCUMENT);
147     int start= 0;
148     for (i=0; i<n; )
149       if (is_multi_paragraph (t[i])) {
150         int bef_i= i;
151         while (bef_i > start && is_migratable (t[bef_i-1])) bef_i--;
152         int aft_i= i+1;
153         while (aft_i < n && is_migratable (t[aft_i])) aft_i++;
154         if (start < bef_i) doc << from_concat (t (start, bef_i));
155         tree bef= from_concat (t (bef_i, i));
156         tree aft= from_concat (t (i+1, aft_i));
157         doc << make_surround (bef, aft, t[i]);
158         i= aft_i;
159         start= i;
160       }
161       else i++;
162     if (start < n) doc << from_concat (t (start, n));
163     if (N(doc) == 1) return doc[0];
164     else return doc;
165   }
166   else if (is_document (t)) {
167     tree doc (DOCUMENT);
168     for (i=0; i<n; i++)
169       if (is_document (t[i])) doc << A(t[i]);
170       else doc << t[i];
171     return doc;
172   }
173   else if (N(t) == 1 && is_basic_environment (t) && !is_document (t[0]))
174     return tree (L(t), tree (DOCUMENT, t[0]));
175   else return t;
176 }
177 
178 /******************************************************************************
179 * Correct WITHs or WITH-like macros
180 ******************************************************************************/
181 
182 tree
with_correct(tree t)183 with_correct (tree t) {
184   if (is_atomic (t)) return t;
185   else {
186     //cout << "Correcting " << t << LF << INDENT;
187     tree u (t, N(t));
188     for (int k=0; k<N(t); k++)
189       u[k]= with_correct (t[k]);
190     array<tree> a= concat_decompose (u);
191     int i, n= N(a);
192     array<tree> r;
193     for (i=0; i<n; i++) {
194       if (is_with_like (a[i])) {
195 	array<tree> b= with_decompose (a[i], with_body (a[i]));
196 	int p= N(b), k1, k2;
197 	for (k1=0; k1<p ; k1++)
198 	  if (is_with_like (b[k1]) && with_similar_type (a[i], b[k1]));
199 	  else break;
200 	for (k2=p; k2>k1; k2--)
201 	  if (is_with_like (b[k2-1]) && with_similar_type (a[i], b[k2-1]));
202 	  else break;
203 	array<tree> x;
204 	if (0  < k1) x << range (b, 0, k1);
205 	if (k1 < k2) x << with_recompose (a[i], range (b, k1, k2));
206 	if (k2 < p ) x << range (b, k2, p);
207 	if (N(x) == 0) continue;
208 	if (N(r) != 0 &&
209 	    is_with_like (r[N(r)-1]) &&
210 	    with_same_type (r[N(r)-1], x[0]))
211 	  {
212 	    array<tree> c= concat_decompose (with_body (r[N(r)-1]));
213 	    c << concat_decompose (with_body (x[0]));
214 	    r[N(r)-1]= with_recompose (x[0], c);
215 	    r << range (x, 1, N(x));
216 	  }
217 	else r << x;
218       }
219       else r << a[i];
220     }
221     //cout << UNINDENT << "Corrected " << t << " -> "
222     //<< concat_recompose (r) << LF;
223     return concat_recompose (r);
224   }
225 }
226 
227 static tree
superfluous_with_correct(tree t,tree env)228 superfluous_with_correct (tree t, tree env) {
229   if (is_atomic (t)) return t;
230   else {
231     //cout << "Superfluous correcting " << t << ", " << env << LF;
232     if (is_compound (t, "body", 1))
233       return compound ("body", superfluous_with_correct (t[0], env));
234     if (is_func (t, WITH) && ((N(t) & 1) == 0))
235       t= t * tree (WITH, "");
236     tree r (t, N(t));
237     for (int i=0; i<N(t); i++)
238       r[i]= superfluous_with_correct
239 	      (t[i], the_drd->get_env_child (t, i, env));
240     if (is_compound (r, "math", 1) && r[0] == "") return "";
241     else if (is_compound (r, "text", 1) && r[0] == "") return "";
242     else if (is_compound (r, "math", 1) && drd_env_read (env, MODE) == "math")
243       return r[0];
244     else if (is_compound (r, "text", 1) && drd_env_read (env, MODE) == "text")
245       return r[0];
246     else if (is_func (r, WITH)) {
247       for (int i=0; i+1<N(r); i+=2)
248 	if (!is_atomic (r[i])) return r;
249 	else if (drd_env_read (env, r[i]->label) != r[i+1]) return r;
250       return r[N(r)-1];
251     }
252     else if (is_func (r, CONCAT)) {
253       array<tree> a= concat_decompose (r);
254       return concat_recompose (a);
255     }
256     return r;
257   }
258 }
259 
260 tree
superfluous_with_correct(tree t)261 superfluous_with_correct (tree t) {
262   with_drd drd (get_document_drd (t));
263   return superfluous_with_correct (t, tree (WITH, MODE, "text"));
264 }
265 
266 /******************************************************************************
267 * Replace symbols by appropriate homoglyphs
268 ******************************************************************************/
269 
270 static array<tree>
homoglyph_correct(array<tree> a)271 homoglyph_correct (array<tree> a) {
272   array<int>  tp= symbol_types (a);
273   array<tree> r;
274   //cout << a << ", " << tp << "\n";
275   for (int i=0; i<N(a); i++)
276     if (a[i] == "<minus>") r << tree ("-");
277     else if (a[i] == "\\" || a[i] == "<backslash>") {
278       int j1, j2;
279       for (j1= i-1; j1>=0; j1--)
280 	if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
281       for (j2= i+1; j2<N(a); j2++)
282 	if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
283       if (j1 < 0 || j2 >= N(a));
284       else if ((a[i] == "\\" ||
285 		a[i] == "<backslash>") &&
286 	       ((tp[j1] == SYMBOL_BASIC) ||
287 		(tp[j1] == SYMBOL_POSTFIX)) &&
288 	       ((tp[j2] == SYMBOL_BASIC) ||
289 		(tp[j2] == SYMBOL_PREFIX)))
290 	r << tree ("<setminus>");
291       else r << a[i];
292     }
293     else if (is_func (a[i], NEG, 1) && is_atomic (a[i][0])) {
294       string s= a[i][0]->label;
295       if (s == "=") r << tree ("<neq>");
296       else if (s == "<less>") r << tree ("<nless>");
297       else if (s == "<gtr>") r << tree ("<ngtr>");
298       else if (s == "<leq>") r << tree ("<nleq>");
299       else if (s == "<geq>") r << tree ("<ngeq>");
300       else if (s == "<leqslant>") r << tree ("<nleqslant>");
301       else if (s == "<geqslant>") r << tree ("<ngeqslant>");
302       else if (s == "<prec>") r << tree ("<nprec>");
303       else if (s == "<succ>") r << tree ("<nsucc>");
304       else if (s == "<preceq>") r << tree ("<npreceq>");
305       else if (s == "<succeq>") r << tree ("<nsucceq>");
306       else if (s == "<preccurlyeq>") r << tree ("<npreccurlyeq>");
307       else if (s == "<succcurlyeq>") r << tree ("<nsucccurlyeq>");
308       else if (s == "<rightarrow>") r << tree ("<nrightarrow>");
309       else if (s == "<Rightarrow>") r << tree ("<nRightarrow>");
310       else if (s == "<leftarrow>") r << tree ("<nleftarrow>");
311       else if (s == "<Leftarrow>") r << tree ("<nLeftarrow>");
312       else if (s == "<leftrightarrow>") r << tree ("<nleftrightarrow>");
313       else if (s == "<Leftrightarrow>") r << tree ("<nLeftrightarrow>");
314       else if (s == "<equiv>") r << tree ("<nequiv>");
315       else if (s == "<sim>") r << tree ("<nsim>");
316       else if (s == "<simeq>") r << tree ("<nsimeq>");
317       else if (s == "<approx>") r << tree ("<napprox>");
318       else if (s == "<cong>") r << tree ("<ncong>");
319       else if (s == "<asymp>") r << tree ("<nasymp>");
320       else if (s == "<in>") r << tree ("<nin>");
321       else if (s == "<ni>") r << tree ("<nni>");
322       else if (s == "<subset>") r << tree ("<nsubset>");
323       else if (s == "<supset>") r << tree ("<nsupset>");
324       else if (s == "<subseteq>") r << tree ("<nsubseteq>");
325       else if (s == "<supseteq>") r << tree ("<nsupseteq>");
326       else if (s == "<sqsubset>") r << tree ("<nsqsubset>");
327       else if (s == "<sqsupset>") r << tree ("<nsqsupset>");
328       else if (s == "<sqsubseteq>") r << tree ("<nsqsubseteq>");
329       else if (s == "<sqsupseteq>") r << tree ("<nsqsupseteq>");
330       else if (s == "<leadsto>") r << tree ("<nleadsto>");
331       else r << a[i];
332     }
333     else if (a[i] == ":" && i+1 < N(a) && a[i+1] == "=") {
334       r << tree ("<assign>");
335       i++;
336     }
337     else r << a[i];
338   return r;
339 }
340 
341 static string
get_submode(tree t,int i,string mode)342 get_submode (tree t, int i, string mode) {
343   if (is_func (t, WITH) && i == N(t)-1)
344     for (int j=0; j<N(t)-1; j+=2)
345       if (t[j] == MATH_FONT_FAMILY) return "text";
346   tree tmode= the_drd->get_env_child (t, i, MODE, mode);
347   return (is_atomic (tmode)? tmode->label: string ("text"));
348 }
349 
350 static tree
homoglyph_correct(tree t,string mode)351 homoglyph_correct (tree t, string mode) {
352   //cout << "Correct " << t << ", " << mode << "\n";
353   tree r= t;
354   if (is_compound (t)) {
355     int i, n= N(t);
356     r= tree (t, n);
357     for (i=0; i<n; i++) {
358       string smode= get_submode (t, i, mode);
359       if (is_correctable_child (t, i))
360 	r[i]= homoglyph_correct (t[i], smode);
361       else r[i]= t[i];
362     }
363   }
364 
365   if (mode == "math") {
366     array<tree> a= concat_tokenize (r);
367     a= homoglyph_correct (a);
368     tree ret= concat_recompose (a);
369     //if (ret != r) cout << "< " << r << " >" << LF
370     //<< "> " << ret << " <" << LF;
371     return ret;
372   }
373   else return r;
374 }
375 
376 tree
homoglyph_correct(tree t)377 homoglyph_correct (tree t) {
378   with_drd drd (get_document_drd (t));
379   return homoglyph_correct (t, "text");
380 }
381 
382 /******************************************************************************
383 * Remove incorrect spaces and multiplications
384 ******************************************************************************/
385 
386 static array<tree>
superfluous_invisible_correct(array<tree> a)387 superfluous_invisible_correct (array<tree> a) {
388   array<int>  tp= symbol_types (a);
389   array<tree> r;
390   //cout << a << ", " << tp << "\n";
391   for (int i=0; i<N(a); i++)
392     if (a[i] == " " || a[i] == "*") {
393       int j1, j2;
394       for (j1= i-1; j1>=0; j1--)
395 	if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
396 	else if (a[j1] == " ") break;
397       for (j2= i+1; j2<N(a); j2++)
398 	if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT)
399 	  if (a[j2] != " " && a[j2] != "*") break;
400       //cout << "  " << i << ": " << j1 << ", " << j2
401       //<< "; " << tp[j1] << ", " << tp[j2] << "\n";
402       if (j1 < 0 || j2 >= N(a));
403       else if (a[j1] == " " || a[j1] == "*");
404       else if (tp[j1] == SYMBOL_PREFIX ||
405 	       tp[j1] == SYMBOL_INFIX ||
406 	       tp[j1] == SYMBOL_SEPARATOR ||
407                tp[j1] == SYMBOL_PROBABLE_MIDDLE);
408       else if (tp[j2] == SYMBOL_POSTFIX ||
409 	       tp[j2] == SYMBOL_INFIX ||
410 	       tp[j2] == SYMBOL_SEPARATOR ||
411                tp[j2] == SYMBOL_PROBABLE_MIDDLE);
412       else r << a[i];
413     }
414     else if (is_func (a[i], SQRT, 2) && a[i][1] == "")
415       r << tree (SQRT, a[i][0]);
416     else if (is_script (a[i]) && a[i][0] == "")
417       r << tree (L(a[i]), "<nosymbol>");
418     else r << a[i];
419   return r;
420 }
421 
422 static tree
superfluous_invisible_correct(tree t,string mode)423 superfluous_invisible_correct (tree t, string mode) {
424   //cout << "Correct " << t << ", " << mode << "\n";
425   tree r= t;
426   if (is_compound (t)) {
427     int i, n= N(t);
428     r= tree (t, n);
429     for (i=0; i<n; i++) {
430       string smode= get_submode (t, i, mode);
431       //cout << "  " << i << ": " << is_correctable_child (t, i)
432       //<< ", " << smode << "\n";
433       if (is_func (t, WITH) && i != N(t)-1)
434 	r[i]= t[i];
435       else if (is_correctable_child (t, i))
436 	r[i]= superfluous_invisible_correct (t[i], smode);
437       else r[i]= t[i];
438     }
439   }
440 
441   if (is_func (r, CONCAT)) {
442     bool ok= true;
443     int i, found= -1;
444     for (i=0; i<N(r); i++)
445       if (is_compound (r[i], "hide-preamble") ||
446 	  is_compound (r[i], "show-preamble"))
447 	{
448 	  ok= (found == -1);
449 	  found= i;
450 	}
451       else if (!is_atomic (r[i])) ok= false;
452       else {
453 	string s= r[i]->label;
454 	for (int j=0; j<N(s); j++)
455 	  if (s[j] != ' ') ok= false;
456       }
457     if (ok) r= r[found];
458   }
459 
460   if (is_func (r, INACTIVE, 1) && is_func (r[0], RIGID))
461     return r[0];
462   else if (mode == "math") {
463     array<tree> a= concat_tokenize (r);
464     a= superfluous_invisible_correct (a);
465     tree ret= concat_recompose (a);
466     //if (ret != r) cout << "< " << r << " >" << LF
467     //<< "> " << ret << " <" << LF;
468     return ret;
469   }
470   else return r;
471 }
472 
473 tree
superfluous_invisible_correct(tree t)474 superfluous_invisible_correct (tree t) {
475   with_drd drd (get_document_drd (t));
476   return superfluous_invisible_correct (t, "text");
477 }
478 
479 /******************************************************************************
480 * Insert missing multiplications or function applications
481 ******************************************************************************/
482 
483 #define SURE_NOTHING     0
484 #define SURE_TIMES       1
485 #define SURE_SPACE       2
486 #define PROBABLE_TIMES   3
487 #define PROBABLE_SPACE   4
488 #define BOTH_WAYS        5
489 
490 struct invisible_corrector {
491   int force;
492   hashmap<string,int> times_before;
493   hashmap<string,int> times_after;
494   hashmap<string,int> space_before;
495   hashmap<string,int> space_after;
496 
497 protected:
498   bool is_letter_like (string s);
499   bool contains_infix (tree t);
500   bool contains_plus_like (tree t);
501   void count_invisible (array<tree> a);
502   void count_invisible (tree t, string mode);
503   int  get_status (tree t, bool left, bool script_flag);
504   array<tree> correct (array<tree> a);
505 
506 public:
invisible_correctorinvisible_corrector507   inline invisible_corrector (tree t, int force2):
508     force (force2), times_before (0), times_after (0), space_after (0) {
509       count_invisible (t, "text"); }
510   tree correct (tree t, string mode);
511 };
512 
513 bool
is_letter_like(string s)514 invisible_corrector::is_letter_like (string s) {
515   static language lan= math_language ("std-math");
516   if (s != "" && is_iso_alpha (s)) return true;
517   return lan->get_group (s) == "Letter-symbol";
518 }
519 
520 bool
contains_infix(tree t)521 invisible_corrector::contains_infix (tree t) {
522   array<int> tp= symbol_types (concat_tokenize (t));
523   for (int i=0; i<N(tp); i++)
524     if (tp[i] == SYMBOL_INFIX)
525       return true;
526   return false;
527 }
528 
529 bool
contains_plus_like(tree t)530 invisible_corrector::contains_plus_like (tree t) {
531   array<tree> a= concat_tokenize (t);
532   for (int i=1; i<N(a)-1; i++)
533     if (a[i] == "+" || a[i] == "-")
534       return true;
535   return false;
536 }
537 
538 void
count_invisible(array<tree> a)539 invisible_corrector::count_invisible (array<tree> a) {
540   array<int>  tp= symbol_types (a);
541   for (int i=0; i<N(a); i++)
542     if (is_atomic (a[i]) && is_letter_like (a[i]->label)) {
543       int j1, j2;
544       for (j1= i-1; j1>=0; j1--)
545 	if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break;
546 	else if (a[j1] == " ") break;
547       for (j2= i+1; j2<N(a); j2++)
548 	if (tp[j2] != SYMBOL_SKIP && tp[j2] != SYMBOL_SCRIPT) break;
549 	else if (a[j2] == " ") break;
550       string s= a[i]->label;
551       if (j1 >= 0) {
552 	if (a[j1] == "*")
553 	  times_before (s)= times_before[s] + 1;
554 	if (a[j1] == " ")
555 	  space_before (s)= space_before[s] + 1;
556       }
557       if (j2 < N(a)) {
558 	if (a[j2] == "*")
559 	  times_after (s)= times_after[s] + 1;
560 	if (a[j2] == " ")
561 	  space_after (s)= space_after[s] + 1;
562         // NOTE: this heuristic might not be a good idea,
563         // because it inhibits the correction of QR -> Q*R,
564         // if Q is a polynomial which is applied somewhere Q(1).
565         // We might introduce a table 'apply_after'.
566 	//if (is_around (a[j2]) && a[j2][0] == "(" &&
567         //!contains_infix (a[j2][1]))
568         //space_after (s)= space_after[s] + 1;
569       }
570     }
571 }
572 
573 void
count_invisible(tree t,string mode)574 invisible_corrector::count_invisible (tree t, string mode) {
575   if (is_compound (t)) {
576     int i, n= N(t);
577     for (i=0; i<n; i++) {
578       string smode= get_submode (t, i, mode);
579       if (is_func (t, WITH) && i != N(t)-1);
580       else if (is_correctable_child (t, i))
581 	count_invisible (t[i], smode);
582     }
583   }
584   if (mode == "math")
585     count_invisible (concat_tokenize (t));
586 }
587 
588 int
get_status(tree t,bool left,bool script_flag)589 invisible_corrector::get_status (tree t, bool left, bool script_flag) {
590   if (is_atomic (t)) {
591     static language lan= math_language ("std-math");
592     string s= t->label;
593     string g= lan->get_group (t->label);
594     if (is_numeric (s))
595       return (left? SURE_TIMES: PROBABLE_TIMES);
596     else if (starts (g, "Unary-operator-textual"))
597       return (left? SURE_SPACE: BOTH_WAYS);
598     else if (starts (g, "Binary-operator"))
599       return SURE_SPACE;
600     else if (starts (g, "N-ary-operator"))
601       return (left? SURE_SPACE: BOTH_WAYS);
602     else if (is_letter_like (s)) {
603       if (left) {
604 	if (times_after[s] > 0 && space_after[s] == 0)
605 	  return SURE_TIMES;
606 	else if (space_after[s] > 0 && times_after[s] == 0)
607 	  return SURE_SPACE;
608 	else if (times_after[s] > space_after[s])
609 	  return PROBABLE_TIMES;
610 	else if (space_after[s] > times_after[s])
611 	  return PROBABLE_SPACE;
612 	else if (N(s)>1 && is_iso_alpha (s))
613 	  return PROBABLE_SPACE;
614         else if (script_flag)
615           return PROBABLE_TIMES;
616 	else return BOTH_WAYS;
617       }
618       else {
619 	if (times_before[s] > space_before[s])
620 	  return PROBABLE_TIMES;
621 	else if (times_after[s] > 0 && space_after[s] == 0)
622 	  return PROBABLE_TIMES;
623         else if (script_flag && (N(s) == 1 || !is_iso_alpha (s)))
624           return PROBABLE_TIMES;
625 	else return BOTH_WAYS;
626       }
627     }
628     else if (s == "<cdots>" || s == "<ldots>")
629       return PROBABLE_TIMES;
630     else return ((force > 0)? BOTH_WAYS: SURE_NOTHING);
631   }
632   else {
633     if (is_around (t)) {
634       if (left && contains_plus_like (t[1]))
635 	return ((force > 0)? SURE_TIMES: PROBABLE_TIMES);
636       else if (contains_plus_like (t[1]))
637 	return ((force > 0)? PROBABLE_TIMES: BOTH_WAYS);
638       else if (!contains_infix (t[1]))
639 	return (left? BOTH_WAYS: SURE_SPACE);
640       else return BOTH_WAYS;
641     }
642     else if (is_func (t, FRAC) ||
643 	     is_func (t, SQRT))
644       return (left? SURE_TIMES: BOTH_WAYS);
645     else if (!left && is_func (t, BIG_AROUND, 2) &&
646              (t[0] == "<sum>" || t[0] == "<amalg>" ||
647               t[0] == "<oplus>" || t[0] == "<uplus>" ||
648               t[0] == "<int>" || t[0] == "<oint>" ||
649               t[0] == "<intlim>" || t[0] == "<ointlim>" ||
650               t[0] == "<prod>" || t[0] == "<odot>" || t[0] == "<otimes>"))
651       return PROBABLE_TIMES;
652     else if (is_func (t, WIDE, 2))
653       return get_status (t[0], left, script_flag);
654     else if (is_func (t, WITH))
655       return get_status (t[N(t)-1], left, script_flag);
656     else if (N(t) == 0 && L(t) >= START_EXTENSIONS) {
657       tree def= the_drd->get_syntax (L(t));
658       if (is_func (def, MACRO, 1))
659         return get_status (def[0], left, script_flag);
660       else return SURE_NOTHING;
661     }
662     else return SURE_NOTHING;
663   }
664 }
665 
666 static bool
admits_script(array<int> tp,int i)667 admits_script (array<int> tp, int i) {
668   i++;
669   while (i<N(tp))
670     if (tp[i] == SYMBOL_SCRIPT) return true;
671     else if (tp[i] == SYMBOL_SKIP) i++;
672     else return false;
673   return false;
674 }
675 
676 array<tree>
correct(array<tree> a)677 invisible_corrector::correct (array<tree> a) {
678   //cout << "Correct " << a << "\n";
679   array<tree> r;
680   array<int> tp= symbol_types (a);
681   for (int i=0; i<N(a); i++) {
682     r << a[i];
683     if (a[i] != " " && tp[i] == SYMBOL_BASIC) {
684       int j;
685       for (j= i+1; j<N(a); j++)
686 	if (tp[j] != SYMBOL_SKIP && tp[j] != SYMBOL_SCRIPT) break;
687 	else if (a[j] == " ") break;
688       if (j >= N(a) || a[j] == " " || tp[j] != SYMBOL_BASIC)
689 	continue;
690 
691       string ins= "";
692       int sti= get_status (a[i], true, admits_script (tp, i));
693       int stj= get_status (a[j], false, admits_script (tp, j));
694       //cout << "Pair (" << a[i] << ", " << a[j] << ")"
695       //<< " -> (" << sti << ", " << stj << ")" << LF;
696       if (sti == SURE_NOTHING || stj == SURE_NOTHING)
697 	ins= "";
698       else if (sti == SURE_TIMES && stj != SURE_SPACE)
699 	ins= "*";
700       else if (sti == SURE_SPACE && stj != SURE_TIMES)
701 	ins= " ";
702       else if (sti == PROBABLE_TIMES && stj == PROBABLE_TIMES)
703 	ins= "*";
704       else if (sti == PROBABLE_SPACE && stj == PROBABLE_SPACE)
705 	ins= " ";
706       else if (sti == PROBABLE_TIMES && stj == BOTH_WAYS)
707 	ins= "*";
708       else if (sti == PROBABLE_SPACE && stj == BOTH_WAYS)
709 	ins= " ";
710       else if (sti == BOTH_WAYS && stj == PROBABLE_TIMES)
711 	ins= "*";
712       else if (sti == BOTH_WAYS && stj == PROBABLE_SPACE)
713 	ins= " ";
714       else if (sti == BOTH_WAYS && stj == BOTH_WAYS && force == 1 &&
715 	       (is_atomic (a[i]) || is_around (a[i])) &&
716 	       (is_atomic (a[j]) || is_around (a[j])))
717 	ins= "*";
718 
719       if (is_around (a[j]))
720 	if (ins == " " || (ins == "*" && force == -1))
721 	  ins= "";
722       if (a[j] == ".") ins= "";
723       while (i+1 < N(a) && (is_func (a[i+1], RSUB, 1) ||
724 			    is_func (a[i+1], RSUP, 1) ||
725 			    is_func (a[i+1], RPRIME, 1))) {
726 	i++;
727 	r << a[i];
728       }
729       if (ins != "") r << tree (ins);
730     }
731   }
732   return r;
733 }
734 
735 tree
correct(tree t,string mode)736 invisible_corrector::correct (tree t, string mode) {
737   //cout << "Correct " << t << ", " << mode << "\n";
738   tree r= t;
739   if (is_compound (t)) {
740     int i, n= N(t);
741     r= tree (t, n);
742     for (i=0; i<n; i++) {
743       string smode= get_submode (t, i, mode);
744       if (is_func (t, WITH) && i != N(t)-1)
745 	r[i]= t[i];
746       else if (is_correctable_child (t, i))
747 	r[i]= correct (t[i], smode);
748       else r[i]= t[i];
749     }
750   }
751 
752   if (mode == "math") {
753     array<tree> a= concat_tokenize (r);
754     a= correct (a);
755     tree ret= concat_recompose (a);
756     //if (ret != r)
757     //  cout << "<< " << r << " >>" << LF
758     //       << ">> " << ret << " <<" << LF;
759     return ret;
760   }
761   else return r;
762 }
763 
764 tree
missing_invisible_correct(tree t,int force)765 missing_invisible_correct (tree t, int force) {
766   // force = -1, only correct when sure, and when old markup is incorrect
767   // force = 0 , only correct when pretty sure
768   // force = 1 , correct whenever reasonable (used for LaTeX import)
769   with_drd drd (get_document_drd (t));
770   invisible_corrector corrector (t, force);
771   //cout << "Times before " << corrector.times_before << "\n";
772   //cout << "Space before " << corrector.space_before << "\n";
773   //cout << "Times after  " << corrector.times_after  << "\n";
774   //cout << "Space after  " << corrector.space_after  << "\n";
775   return corrector.correct (t, "text");
776 }
777 
778 tree
missing_invisible_correct_twice(tree t,int force=-1)779 missing_invisible_correct_twice (tree t, int force= -1) {
780   tree u= missing_invisible_correct (t, force);
781   if (u == t) return t;
782   return missing_invisible_correct (u, force);
783 }
784 
785 /******************************************************************************
786 * Miscellaneous corrections
787 ******************************************************************************/
788 
789 tree
misc_math_correct(tree t)790 misc_math_correct (tree t) {
791   if (is_atomic (t)) return t;
792   else if (is_compound (t, "math", 1) && is_func (t[0], RSUB, 1))
793     return tree (RSUB, compound ("math", misc_math_correct (t[0][0])));
794   else if (is_compound (t, "math", 1) && is_func (t[0], RSUP, 1))
795     return tree (RSUP, compound ("math", misc_math_correct (t[0][0])));
796   else if (is_func (t, RSUB, 1) && is_func (t[0], RSUB, 1))
797     return misc_math_correct (t[0]);
798   else if (is_func (t, RSUB, 1) && is_func (t[0], RSUP, 1))
799     return misc_math_correct (tree (RSUB, t[0][0]));
800   else if (is_func (t, RSUP, 1) && is_func (t[0], RSUB, 1))
801     return misc_math_correct (tree (RSUP, t[0][0]));
802   else if (is_func (t, RSUP, 1) && is_func (t[0], RSUP, 1))
803     return misc_math_correct (t[0]);
804   else if (is_func (t, RSUP, 1) && is_func (t[0], RPRIME, 1))
805     return misc_math_correct (t[0]);
806   else if (is_script (t) && is_compound (t[0], "text", 1) &&
807            is_atomic (t[0][0]) && is_alpha (t[0][0]->label))
808     {
809       if (N(t[0][0]->label) != 1) return tree (L(t), t[0][0]);
810       else return tree (L(t), tree (WITH, "math-font-family", "trm",
811                                     misc_math_correct (t[0])));
812     }
813   else if (is_compound (t, "math", 1)) {
814     tree arg = misc_math_correct (t[0]);
815     tree last= arg;
816     if (is_concat (last) && N(last) > 0) last= last[N(last)-1];
817     if (is_atomic (last) && N(last->label) > 0 &&
818         is_punctuation (last->label [N(last->label)-1]))
819       {
820         string s= last->label;
821         int i= N(s);
822         while (i>0 && is_punctuation (s[i-1])) i--;
823         if (i == N(s)) return compound ("math", arg);
824         string tail= s (i, N(s));
825         s= s (0, i);
826         if (last == arg) {
827           if (N(s) == 0) return tail;
828           else return concat (compound ("math", s), tail);
829         }
830         else {
831           tree cc= arg (0, N(arg) - 1);
832           if (N(s) != 0) cc << tree (s);
833           if (N(cc) == 1) cc= cc[0];
834           return concat (compound ("math", cc), tail);
835         }
836       }
837     else return compound ("math", arg);
838   }
839   else {
840     int i, n= N(t);
841     tree r (t, n);
842     for (i=0; i<n; i++)
843       r[i]= misc_math_correct (t[i]);
844     if (is_concat (r))
845       r= concat_recompose (concat_decompose (r));
846     return r;
847   }
848 }
849 
850 /******************************************************************************
851 * Count errors
852 ******************************************************************************/
853 
854 static int
count_math_formula_errors(tree t,int mode)855 count_math_formula_errors (tree t, int mode) {
856   if (mode == 1) return 1;
857   if (packrat_correct ("std-math", "Main", t)) return 0;
858   else {
859     if (mode == 2) cout << "  ERROR> " << t << "\n";
860     return 1;
861   }
862 }
863 
864 static int
count_math_table_errors(tree t,int mode)865 count_math_table_errors (tree t, int mode) {
866   if (is_atomic (t)) return 0;
867   else if (is_func (t, CELL, 1)) {
868     if (t[0] == "" || t[0] == tree (DOCUMENT, "")) return 0;
869     if (mode == 1) return 1;
870     if (packrat_correct ("std-math", "Cell", t[0])) return 0;
871     else {
872       if (mode == 2) cout << "  ERROR> " << t << "\n";
873       return 1;
874     }
875   }
876   else {
877     int sum= 0;
878     for (int i=0; i<N(t); i++)
879       sum += count_math_table_errors (t[i], mode);
880     return sum;
881   }
882 }
883 
884 int
count_math_errors(tree t,int mode)885 count_math_errors (tree t, int mode) {
886   if (is_atomic (t)) return 0;
887   else {
888     int sum= 0;
889     for (int i=0; i<N(t); i++) {
890       tree cmode= the_drd->get_env_child (t, i, MODE, "text");
891       if (cmode != "math") sum += count_math_errors (t[i], mode);
892       else {
893         tree u= t[i];
894         while (is_func (u, DOCUMENT, 1) ||
895                is_func (u, TFORMAT) ||
896                is_func (u, WITH))
897           u= u[N(u)-1];
898         if (is_func (u, TABLE)) sum += count_math_table_errors (u, mode);
899         else sum += count_math_formula_errors (u, mode);
900       }
901     }
902     return sum;
903   }
904 }
905 
906 /******************************************************************************
907 * Print mathematical status
908 ******************************************************************************/
909 
910 static int count_formula= 0;
911 static int count_initial_errors= 0;
912 static int count_final_errors= 0;
913 
914 static int corrected_with= 0;
915 static int corrected_superfluous_with= 0;
916 static int corrected_brackets= 0;
917 static int corrected_move_brackets= 0;
918 static int corrected_misc= 0;
919 static int corrected_superfluous_invisible= 0;
920 static int corrected_homoglyph= 0;
921 static int corrected_missing_invisible= 0;
922 static int corrected_zealous_invisible= 0;
923 
924 void
math_status_cumul_sub(tree t,int & cumul,int & errors)925 math_status_cumul_sub (tree t, int& cumul, int& errors) {
926   int new_errors= count_math_errors (t);
927   cumul += (errors - new_errors);
928   errors= new_errors;
929 }
930 
931 void
math_status_cumul(tree t)932 math_status_cumul (tree t) {
933   with_drd drd (get_document_drd (t));
934   if (is_func (t, DOCUMENT))
935     for (int i=0; i<N(t); i++)
936       if (is_compound (t[i], "body", 1)) {
937         t= t[i][0];
938         break;
939       }
940 
941   int errors= count_math_errors (t);
942   count_formula += count_math_errors (t, 1);
943   count_initial_errors += errors;
944   t= with_correct (t);
945   math_status_cumul_sub (t, corrected_with, errors);
946   t= superfluous_with_correct (t);
947   math_status_cumul_sub (t, corrected_superfluous_with, errors);
948   t= upgrade_brackets (t);
949   math_status_cumul_sub (t, corrected_brackets, errors);
950   t= move_brackets (t);
951   math_status_cumul_sub (t, corrected_move_brackets, errors);
952   t= misc_math_correct (t);
953   math_status_cumul_sub (t, corrected_misc, errors);
954   t= superfluous_invisible_correct (t);
955   math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
956   t= homoglyph_correct (t);
957   math_status_cumul_sub (t, corrected_homoglyph, errors);
958   t= superfluous_invisible_correct (t);
959   math_status_cumul_sub (t, corrected_superfluous_invisible, errors);
960   t= missing_invisible_correct (t);
961   math_status_cumul_sub (t, corrected_missing_invisible, errors);
962   count_final_errors += errors;
963   //cout << "Errors= " << errors << "\n";
964   //(void) count_math_errors (t, 2);
965   t= missing_invisible_correct (t, 1);
966   math_status_cumul_sub (t, corrected_zealous_invisible, errors);
967 }
968 
969 void
math_status_reset()970 math_status_reset () {
971   count_formula= 0;
972   count_initial_errors= 0;
973   count_final_errors= 0;
974   corrected_with= 0;
975   corrected_superfluous_with= 0;
976   corrected_brackets= 0;
977   corrected_move_brackets= 0;
978   corrected_misc= 0;
979   corrected_superfluous_invisible= 0;
980   corrected_homoglyph= 0;
981   corrected_missing_invisible= 0;
982 }
983 
984 void
math_status_print()985 math_status_print () {
986   cout << "Formulas       : " << count_formula << "\n";
987   cout << "Initial errors : " << count_initial_errors << "\n";
988   cout << "Final errors   : " << count_final_errors << "\n";
989   cout << "\n";
990   cout << "With corrected                  : "
991        << corrected_with << "\n";
992   cout << "Superfluous with corrected      : "
993        << corrected_superfluous_with << "\n";
994   cout << "Upgraded brackets               : "
995        << corrected_brackets << "\n";
996   cout << "Moved brackets                  : "
997        << corrected_move_brackets << "\n";
998   cout << "Miscellaneous corrected         : "
999        << corrected_misc << "\n";
1000   cout << "Superfluous invisible corrected : "
1001        << corrected_superfluous_invisible << "\n";
1002   cout << "Homoglyphs corrected            : "
1003        << corrected_homoglyph << "\n";
1004   cout << "Missing invisible corrected     : "
1005        << corrected_missing_invisible << "\n";
1006   cout << "Zealous invisible corrected     : "
1007        << corrected_zealous_invisible << "\n";
1008   cout << "\n";
1009 }
1010 
1011 /******************************************************************************
1012 * Master routines
1013 ******************************************************************************/
1014 
1015 bool
enabled_preference(string s)1016 enabled_preference (string s) {
1017   return call ("get-preference", s) == object ("on");
1018 }
1019 
1020 tree
latex_correct(tree t)1021 latex_correct (tree t) {
1022   // NOTE: matching brackets corrected in upgrade_tex
1023   t= misc_math_correct (t);
1024   //if (enabled_preference ("remove superfluous invisible"))
1025   t= superfluous_invisible_correct (t);
1026   //if (enabled_preference ("homoglyph correct"))
1027   t= homoglyph_correct (t);
1028   //if (enabled_preference ("remove superfluous invisible"))
1029   t= superfluous_invisible_correct (t);
1030   //if (enabled_preference ("insert missing invisible"))
1031   t= missing_invisible_correct_twice (t);
1032   //if (enabled_preference ("insert missing invisible"))
1033   t= missing_invisible_correct (t, 1);
1034   t= downgrade_big (t);
1035   t= correct_missing_block (t);
1036   t= correct_concat_block (t);
1037   return t;
1038 }
1039 
1040 tree
automatic_correct(tree t,string version)1041 automatic_correct (tree t, string version) {
1042   if (version_inf_eq (version, "1.0.7.9")) {
1043     t= misc_math_correct (t);
1044     if (enabled_preference ("remove superfluous invisible"))
1045       t= superfluous_invisible_correct (t);
1046     if (enabled_preference ("homoglyph correct"))
1047       t= homoglyph_correct (t);
1048     if (enabled_preference ("remove superfluous invisible"))
1049       t= superfluous_invisible_correct (t);
1050     if (enabled_preference ("insert missing invisible"))
1051       t= missing_invisible_correct_twice (t);
1052     if (enabled_preference ("zealous invisible correct"))
1053       t= missing_invisible_correct (t, 1);
1054   }
1055   t= downgrade_big (t);
1056   return t;
1057 }
1058 
1059 tree
manual_correct(tree t)1060 manual_correct (tree t) {
1061   t= with_correct (t);
1062   t= superfluous_with_correct (t);
1063   t= upgrade_brackets (t);
1064   t= misc_math_correct (t);
1065   if (enabled_preference ("manual remove superfluous invisible"))
1066     t= superfluous_invisible_correct (t);
1067   if (enabled_preference ("manual homoglyph correct"))
1068     t= homoglyph_correct (t);
1069   if (enabled_preference ("manual remove superfluous invisible"))
1070     t= superfluous_invisible_correct (t);
1071   if (enabled_preference ("manual insert missing invisible"))
1072     t= missing_invisible_correct_twice (t);
1073   if (enabled_preference ("manual zealous invisible correct"))
1074     t= missing_invisible_correct (t, 1);
1075   t= downgrade_big (t);
1076   return t;
1077 }
1078