/****************************************************************************** * MODULE : tree_correct.cpp * DESCRIPTION: make a tree syntactically match a drd * COPYRIGHT : (C) 2005 Joris van der Hoeven ******************************************************************************* * This software falls under the GNU general public license version 3 or later. * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE * in the root directory or . ******************************************************************************/ #include "tree_correct.hpp" #include "tree_analyze.hpp" #include "scheme.hpp" #include "packrat.hpp" /****************************************************************************** * DRD based correction ******************************************************************************/ tree drd_correct (drd_info drd, tree t) { if (is_atomic (t)) return t; else { int i, n= N(t); if (drd->contains (as_string (L(t))) && !drd->correct_arity (L(t), n)) return ""; tree r (t, n); for (i=0; i 0) { if (N(body) == 1) return tree (DOCUMENT, migrate_surround (bef, aft, body[0])); tree first (DOCUMENT, migrate_surround (bef, "", body[0])); tree last (DOCUMENT, migrate_surround ("", aft, body[N(body)-1])); return first * body (1, N(body)-1) * last; } else return from_concat (to_concat (bef) * to_concat (body) * to_concat (aft)); } static tree make_surround (tree bef, tree aft, tree t) { if (bef == "" && aft == "") return t; else if (N(t) == 1 && is_basic_environment (t)) return tree (L(t), migrate_surround (bef, aft, t[0])); return tree (SURROUND, bef, aft, t); } tree correct_concat_block (tree t) { if (is_atomic (t)) return t; int i, n= N(t); tree r (t, n); for (i=0; i start && is_migratable (t[bef_i-1])) bef_i--; int aft_i= i+1; while (aft_i < n && is_migratable (t[aft_i])) aft_i++; if (start < bef_i) doc << from_concat (t (start, bef_i)); tree bef= from_concat (t (bef_i, i)); tree aft= from_concat (t (i+1, aft_i)); doc << make_surround (bef, aft, t[i]); i= aft_i; start= i; } else i++; if (start < n) doc << from_concat (t (start, n)); if (N(doc) == 1) return doc[0]; else return doc; } else if (is_document (t)) { tree doc (DOCUMENT); for (i=0; i a= concat_decompose (u); int i, n= N(a); array r; for (i=0; i b= with_decompose (a[i], with_body (a[i])); int p= N(b), k1, k2; for (k1=0; k1

k1; k2--) if (is_with_like (b[k2-1]) && with_similar_type (a[i], b[k2-1])); else break; array x; if (0 < k1) x << range (b, 0, k1); if (k1 < k2) x << with_recompose (a[i], range (b, k1, k2)); if (k2 < p ) x << range (b, k2, p); if (N(x) == 0) continue; if (N(r) != 0 && is_with_like (r[N(r)-1]) && with_same_type (r[N(r)-1], x[0])) { array c= concat_decompose (with_body (r[N(r)-1])); c << concat_decompose (with_body (x[0])); r[N(r)-1]= with_recompose (x[0], c); r << range (x, 1, N(x)); } else r << x; } else r << a[i]; } //cout << UNINDENT << "Corrected " << t << " -> " //<< concat_recompose (r) << LF; return concat_recompose (r); } } static tree superfluous_with_correct (tree t, tree env) { if (is_atomic (t)) return t; else { //cout << "Superfluous correcting " << t << ", " << env << LF; if (is_compound (t, "body", 1)) return compound ("body", superfluous_with_correct (t[0], env)); if (is_func (t, WITH) && ((N(t) & 1) == 0)) t= t * tree (WITH, ""); tree r (t, N(t)); for (int i=0; iget_env_child (t, i, env)); if (is_compound (r, "math", 1) && r[0] == "") return ""; else if (is_compound (r, "text", 1) && r[0] == "") return ""; else if (is_compound (r, "math", 1) && drd_env_read (env, MODE) == "math") return r[0]; else if (is_compound (r, "text", 1) && drd_env_read (env, MODE) == "text") return r[0]; else if (is_func (r, WITH)) { for (int i=0; i+1label) != r[i+1]) return r; return r[N(r)-1]; } else if (is_func (r, CONCAT)) { array a= concat_decompose (r); return concat_recompose (a); } return r; } } tree superfluous_with_correct (tree t) { with_drd drd (get_document_drd (t)); return superfluous_with_correct (t, tree (WITH, MODE, "text")); } /****************************************************************************** * Replace symbols by appropriate homoglyphs ******************************************************************************/ static array homoglyph_correct (array a) { array tp= symbol_types (a); array r; //cout << a << ", " << tp << "\n"; for (int i=0; i") r << tree ("-"); else if (a[i] == "\\" || a[i] == "") { int j1, j2; for (j1= i-1; j1>=0; j1--) if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break; for (j2= i+1; j2= N(a)); else if ((a[i] == "\\" || a[i] == "") && ((tp[j1] == SYMBOL_BASIC) || (tp[j1] == SYMBOL_POSTFIX)) && ((tp[j2] == SYMBOL_BASIC) || (tp[j2] == SYMBOL_PREFIX))) r << tree (""); else r << a[i]; } else if (is_func (a[i], NEG, 1) && is_atomic (a[i][0])) { string s= a[i][0]->label; if (s == "=") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else if (s == "") r << tree (""); else r << a[i]; } else if (a[i] == ":" && i+1 < N(a) && a[i+1] == "=") { r << tree (""); i++; } else r << a[i]; return r; } static string get_submode (tree t, int i, string mode) { if (is_func (t, WITH) && i == N(t)-1) for (int j=0; jget_env_child (t, i, MODE, mode); return (is_atomic (tmode)? tmode->label: string ("text")); } static tree homoglyph_correct (tree t, string mode) { //cout << "Correct " << t << ", " << mode << "\n"; tree r= t; if (is_compound (t)) { int i, n= N(t); r= tree (t, n); for (i=0; i a= concat_tokenize (r); a= homoglyph_correct (a); tree ret= concat_recompose (a); //if (ret != r) cout << "< " << r << " >" << LF //<< "> " << ret << " <" << LF; return ret; } else return r; } tree homoglyph_correct (tree t) { with_drd drd (get_document_drd (t)); return homoglyph_correct (t, "text"); } /****************************************************************************** * Remove incorrect spaces and multiplications ******************************************************************************/ static array superfluous_invisible_correct (array a) { array tp= symbol_types (a); array r; //cout << a << ", " << tp << "\n"; for (int i=0; i=0; j1--) if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break; else if (a[j1] == " ") break; for (j2= i+1; j2= N(a)); else if (a[j1] == " " || a[j1] == "*"); else if (tp[j1] == SYMBOL_PREFIX || tp[j1] == SYMBOL_INFIX || tp[j1] == SYMBOL_SEPARATOR || tp[j1] == SYMBOL_PROBABLE_MIDDLE); else if (tp[j2] == SYMBOL_POSTFIX || tp[j2] == SYMBOL_INFIX || tp[j2] == SYMBOL_SEPARATOR || tp[j2] == SYMBOL_PROBABLE_MIDDLE); else r << a[i]; } else if (is_func (a[i], SQRT, 2) && a[i][1] == "") r << tree (SQRT, a[i][0]); else if (is_script (a[i]) && a[i][0] == "") r << tree (L(a[i]), ""); else r << a[i]; return r; } static tree superfluous_invisible_correct (tree t, string mode) { //cout << "Correct " << t << ", " << mode << "\n"; tree r= t; if (is_compound (t)) { int i, n= N(t); r= tree (t, n); for (i=0; ilabel; for (int j=0; j a= concat_tokenize (r); a= superfluous_invisible_correct (a); tree ret= concat_recompose (a); //if (ret != r) cout << "< " << r << " >" << LF //<< "> " << ret << " <" << LF; return ret; } else return r; } tree superfluous_invisible_correct (tree t) { with_drd drd (get_document_drd (t)); return superfluous_invisible_correct (t, "text"); } /****************************************************************************** * Insert missing multiplications or function applications ******************************************************************************/ #define SURE_NOTHING 0 #define SURE_TIMES 1 #define SURE_SPACE 2 #define PROBABLE_TIMES 3 #define PROBABLE_SPACE 4 #define BOTH_WAYS 5 struct invisible_corrector { int force; hashmap times_before; hashmap times_after; hashmap space_before; hashmap space_after; protected: bool is_letter_like (string s); bool contains_infix (tree t); bool contains_plus_like (tree t); void count_invisible (array a); void count_invisible (tree t, string mode); int get_status (tree t, bool left, bool script_flag); array correct (array a); public: inline invisible_corrector (tree t, int force2): force (force2), times_before (0), times_after (0), space_after (0) { count_invisible (t, "text"); } tree correct (tree t, string mode); }; bool invisible_corrector::is_letter_like (string s) { static language lan= math_language ("std-math"); if (s != "" && is_iso_alpha (s)) return true; return lan->get_group (s) == "Letter-symbol"; } bool invisible_corrector::contains_infix (tree t) { array tp= symbol_types (concat_tokenize (t)); for (int i=0; i a= concat_tokenize (t); for (int i=1; i a) { array tp= symbol_types (a); for (int i=0; ilabel)) { int j1, j2; for (j1= i-1; j1>=0; j1--) if (tp[j1] != SYMBOL_SKIP && tp[j1] != SYMBOL_SCRIPT) break; else if (a[j1] == " ") break; for (j2= i+1; j2label; if (j1 >= 0) { if (a[j1] == "*") times_before (s)= times_before[s] + 1; if (a[j1] == " ") space_before (s)= space_before[s] + 1; } if (j2 < N(a)) { if (a[j2] == "*") times_after (s)= times_after[s] + 1; if (a[j2] == " ") space_after (s)= space_after[s] + 1; // NOTE: this heuristic might not be a good idea, // because it inhibits the correction of QR -> Q*R, // if Q is a polynomial which is applied somewhere Q(1). // We might introduce a table 'apply_after'. //if (is_around (a[j2]) && a[j2][0] == "(" && //!contains_infix (a[j2][1])) //space_after (s)= space_after[s] + 1; } } } void invisible_corrector::count_invisible (tree t, string mode) { if (is_compound (t)) { int i, n= N(t); for (i=0; ilabel; string g= lan->get_group (t->label); if (is_numeric (s)) return (left? SURE_TIMES: PROBABLE_TIMES); else if (starts (g, "Unary-operator-textual")) return (left? SURE_SPACE: BOTH_WAYS); else if (starts (g, "Binary-operator")) return SURE_SPACE; else if (starts (g, "N-ary-operator")) return (left? SURE_SPACE: BOTH_WAYS); else if (is_letter_like (s)) { if (left) { if (times_after[s] > 0 && space_after[s] == 0) return SURE_TIMES; else if (space_after[s] > 0 && times_after[s] == 0) return SURE_SPACE; else if (times_after[s] > space_after[s]) return PROBABLE_TIMES; else if (space_after[s] > times_after[s]) return PROBABLE_SPACE; else if (N(s)>1 && is_iso_alpha (s)) return PROBABLE_SPACE; else if (script_flag) return PROBABLE_TIMES; else return BOTH_WAYS; } else { if (times_before[s] > space_before[s]) return PROBABLE_TIMES; else if (times_after[s] > 0 && space_after[s] == 0) return PROBABLE_TIMES; else if (script_flag && (N(s) == 1 || !is_iso_alpha (s))) return PROBABLE_TIMES; else return BOTH_WAYS; } } else if (s == "" || s == "") return PROBABLE_TIMES; else return ((force > 0)? BOTH_WAYS: SURE_NOTHING); } else { if (is_around (t)) { if (left && contains_plus_like (t[1])) return ((force > 0)? SURE_TIMES: PROBABLE_TIMES); else if (contains_plus_like (t[1])) return ((force > 0)? PROBABLE_TIMES: BOTH_WAYS); else if (!contains_infix (t[1])) return (left? BOTH_WAYS: SURE_SPACE); else return BOTH_WAYS; } else if (is_func (t, FRAC) || is_func (t, SQRT)) return (left? SURE_TIMES: BOTH_WAYS); else if (!left && is_func (t, BIG_AROUND, 2) && (t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "" || t[0] == "")) return PROBABLE_TIMES; else if (is_func (t, WIDE, 2)) return get_status (t[0], left, script_flag); else if (is_func (t, WITH)) return get_status (t[N(t)-1], left, script_flag); else if (N(t) == 0 && L(t) >= START_EXTENSIONS) { tree def= the_drd->get_syntax (L(t)); if (is_func (def, MACRO, 1)) return get_status (def[0], left, script_flag); else return SURE_NOTHING; } else return SURE_NOTHING; } } static bool admits_script (array tp, int i) { i++; while (i invisible_corrector::correct (array a) { //cout << "Correct " << a << "\n"; array r; array tp= symbol_types (a); for (int i=0; i= N(a) || a[j] == " " || tp[j] != SYMBOL_BASIC) continue; string ins= ""; int sti= get_status (a[i], true, admits_script (tp, i)); int stj= get_status (a[j], false, admits_script (tp, j)); //cout << "Pair (" << a[i] << ", " << a[j] << ")" //<< " -> (" << sti << ", " << stj << ")" << LF; if (sti == SURE_NOTHING || stj == SURE_NOTHING) ins= ""; else if (sti == SURE_TIMES && stj != SURE_SPACE) ins= "*"; else if (sti == SURE_SPACE && stj != SURE_TIMES) ins= " "; else if (sti == PROBABLE_TIMES && stj == PROBABLE_TIMES) ins= "*"; else if (sti == PROBABLE_SPACE && stj == PROBABLE_SPACE) ins= " "; else if (sti == PROBABLE_TIMES && stj == BOTH_WAYS) ins= "*"; else if (sti == PROBABLE_SPACE && stj == BOTH_WAYS) ins= " "; else if (sti == BOTH_WAYS && stj == PROBABLE_TIMES) ins= "*"; else if (sti == BOTH_WAYS && stj == PROBABLE_SPACE) ins= " "; else if (sti == BOTH_WAYS && stj == BOTH_WAYS && force == 1 && (is_atomic (a[i]) || is_around (a[i])) && (is_atomic (a[j]) || is_around (a[j]))) ins= "*"; if (is_around (a[j])) if (ins == " " || (ins == "*" && force == -1)) ins= ""; if (a[j] == ".") ins= ""; while (i+1 < N(a) && (is_func (a[i+1], RSUB, 1) || is_func (a[i+1], RSUP, 1) || is_func (a[i+1], RPRIME, 1))) { i++; r << a[i]; } if (ins != "") r << tree (ins); } } return r; } tree invisible_corrector::correct (tree t, string mode) { //cout << "Correct " << t << ", " << mode << "\n"; tree r= t; if (is_compound (t)) { int i, n= N(t); r= tree (t, n); for (i=0; i a= concat_tokenize (r); a= correct (a); tree ret= concat_recompose (a); //if (ret != r) // cout << "<< " << r << " >>" << LF // << ">> " << ret << " <<" << LF; return ret; } else return r; } tree missing_invisible_correct (tree t, int force) { // force = -1, only correct when sure, and when old markup is incorrect // force = 0 , only correct when pretty sure // force = 1 , correct whenever reasonable (used for LaTeX import) with_drd drd (get_document_drd (t)); invisible_corrector corrector (t, force); //cout << "Times before " << corrector.times_before << "\n"; //cout << "Space before " << corrector.space_before << "\n"; //cout << "Times after " << corrector.times_after << "\n"; //cout << "Space after " << corrector.space_after << "\n"; return corrector.correct (t, "text"); } tree missing_invisible_correct_twice (tree t, int force= -1) { tree u= missing_invisible_correct (t, force); if (u == t) return t; return missing_invisible_correct (u, force); } /****************************************************************************** * Miscellaneous corrections ******************************************************************************/ tree misc_math_correct (tree t) { if (is_atomic (t)) return t; else if (is_compound (t, "math", 1) && is_func (t[0], RSUB, 1)) return tree (RSUB, compound ("math", misc_math_correct (t[0][0]))); else if (is_compound (t, "math", 1) && is_func (t[0], RSUP, 1)) return tree (RSUP, compound ("math", misc_math_correct (t[0][0]))); else if (is_func (t, RSUB, 1) && is_func (t[0], RSUB, 1)) return misc_math_correct (t[0]); else if (is_func (t, RSUB, 1) && is_func (t[0], RSUP, 1)) return misc_math_correct (tree (RSUB, t[0][0])); else if (is_func (t, RSUP, 1) && is_func (t[0], RSUB, 1)) return misc_math_correct (tree (RSUP, t[0][0])); else if (is_func (t, RSUP, 1) && is_func (t[0], RSUP, 1)) return misc_math_correct (t[0]); else if (is_func (t, RSUP, 1) && is_func (t[0], RPRIME, 1)) return misc_math_correct (t[0]); else if (is_script (t) && is_compound (t[0], "text", 1) && is_atomic (t[0][0]) && is_alpha (t[0][0]->label)) { if (N(t[0][0]->label) != 1) return tree (L(t), t[0][0]); else return tree (L(t), tree (WITH, "math-font-family", "trm", misc_math_correct (t[0]))); } else if (is_compound (t, "math", 1)) { tree arg = misc_math_correct (t[0]); tree last= arg; if (is_concat (last) && N(last) > 0) last= last[N(last)-1]; if (is_atomic (last) && N(last->label) > 0 && is_punctuation (last->label [N(last->label)-1])) { string s= last->label; int i= N(s); while (i>0 && is_punctuation (s[i-1])) i--; if (i == N(s)) return compound ("math", arg); string tail= s (i, N(s)); s= s (0, i); if (last == arg) { if (N(s) == 0) return tail; else return concat (compound ("math", s), tail); } else { tree cc= arg (0, N(arg) - 1); if (N(s) != 0) cc << tree (s); if (N(cc) == 1) cc= cc[0]; return concat (compound ("math", cc), tail); } } else return compound ("math", arg); } else { int i, n= N(t); tree r (t, n); for (i=0; i " << t << "\n"; return 1; } } static int count_math_table_errors (tree t, int mode) { if (is_atomic (t)) return 0; else if (is_func (t, CELL, 1)) { if (t[0] == "" || t[0] == tree (DOCUMENT, "")) return 0; if (mode == 1) return 1; if (packrat_correct ("std-math", "Cell", t[0])) return 0; else { if (mode == 2) cout << " ERROR> " << t << "\n"; return 1; } } else { int sum= 0; for (int i=0; iget_env_child (t, i, MODE, "text"); if (cmode != "math") sum += count_math_errors (t[i], mode); else { tree u= t[i]; while (is_func (u, DOCUMENT, 1) || is_func (u, TFORMAT) || is_func (u, WITH)) u= u[N(u)-1]; if (is_func (u, TABLE)) sum += count_math_table_errors (u, mode); else sum += count_math_formula_errors (u, mode); } } return sum; } } /****************************************************************************** * Print mathematical status ******************************************************************************/ static int count_formula= 0; static int count_initial_errors= 0; static int count_final_errors= 0; static int corrected_with= 0; static int corrected_superfluous_with= 0; static int corrected_brackets= 0; static int corrected_move_brackets= 0; static int corrected_misc= 0; static int corrected_superfluous_invisible= 0; static int corrected_homoglyph= 0; static int corrected_missing_invisible= 0; static int corrected_zealous_invisible= 0; void math_status_cumul_sub (tree t, int& cumul, int& errors) { int new_errors= count_math_errors (t); cumul += (errors - new_errors); errors= new_errors; } void math_status_cumul (tree t) { with_drd drd (get_document_drd (t)); if (is_func (t, DOCUMENT)) for (int i=0; i