1 /*
2  cmark_trie_line_autolinking_preprocessor.cpp     MindForger thinking notebook
3 
4  Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  as published by the Free Software Foundation; either version 2
9  of the License, or (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "cmark_trie_line_autolinking_preprocessor.h"
20 
21 /*
22  * DEPRECATED
23  *
24  * This line-based autolinker has been deprecated by block-based autlinker.
25  */
26 
27 /*
28  * High priority tasks:
29  *
30  * - unit tests first, manual tests then
31  * - move trie to memory (or mind) and keep it up to date there (based on mind state)
32  * - switch from row based MD>HTML TO block based MD2>HTML
33  *   - the reason is that MD lines cannot be correctly converted w/o context
34  *     e.g. multi-item deep bullet lists (tab prefix causes parser to make it code block)
35  *   - the solution is to let CMARK parse MD, traverse and inject links to AST as needed
36  * - O or N edit: always DELETE old name and add new name to TRIE
37  *
38  * - autolink tags: if no N/O found on click, then open tags dialog
39  *
40  * Plan:
41  *
42  * - ensure correctness FIRST ~ unit tests:
43  *    x protection of bullet lists
44  *    - protection of deep bullet lists
45  *      (the problem list that deep bullet list cannot be
46  *      autolinked as line, but whole block > consider
47  *      injecting AST, rather than lines - CMARK needs
48  *      to know context of previous/next line)
49  *      >>> perhaps it could be faster than line by line autoinjections
50  *    x no trailing spaces
51  *    x protection of links/images/...
52  *    - protection of inlined MATH $..$
53  *    - blacklist ~ don't autolink e.g. http (to protect cmark's URLs autolinking)
54  *
55  * - bugs
56  *    - click link > dialog > choose > crash
57  *
58  * - code style
59  *    - copy/paste code to methods
60  *
61  * - performance
62  *    - avoid trie rebuild - on N save():
63  *      ! implement delete operation on trie:
64  *       - enhance trie w/ refcount for words so that delete knows whether to remove or not
65  *       - implement delete() operation:
66  *        - finds word mark, decreases refcount, if it's leaf than it goes up deletes orphan
67  *      OR:
68  *      1) synchronous: add new N name to trie on its save()
69  *      2) async: after save let trie rebuild async in other
70  *         thread to update it (trie access must be synchronized)
71  *    - avoid autolinking whole O on its load - it's not needed > debug why it happens
72  *    - map search structure instead of Aho
73  *    - benchmark on C++ repo
74  *    - configurable time limit on autolinking and leave on exceeding it
75  */
76 
77 namespace m8r {
78 
79 using namespace std;
80 
81 const string CmarkTrieLineAutolinkingPreprocessor::TRAILING_CHARS = string{" \t,:;.!?<>{}&()-+/*\\_=%~#$^[]'\""};
82 
CmarkTrieLineAutolinkingPreprocessor(Mind & mind)83 CmarkTrieLineAutolinkingPreprocessor::CmarkTrieLineAutolinkingPreprocessor(Mind& mind)
84     : AutolinkingPreprocessor{mind}
85 {
86 }
87 
~CmarkTrieLineAutolinkingPreprocessor()88 CmarkTrieLineAutolinkingPreprocessor::~CmarkTrieLineAutolinkingPreprocessor()
89 {
90 }
91 
processProtectedBlock(vector<string * > & block,string & amd)92 void CmarkTrieLineAutolinkingPreprocessor::processProtectedBlock(
93         vector<string*>& block,
94         string& amd)
95 {
96     if(block.size()) {
97         string blockString{};
98         toString(block, blockString);
99         amd.append(blockString);
100         block.clear();
101     }
102     MF_DEBUG("Appended PROTECTED block:" << endl << "'" << amd << "'" << endl);
103 }
104 
processAndAutolinkBlock(vector<string * > & block,string & amd)105 void CmarkTrieLineAutolinkingPreprocessor::processAndAutolinkBlock(
106         vector<string*>& block,
107         string& amd)
108 {
109     if(block.size()) {
110         string blockString{}, autolinkedBlock{};
111         toString(block, blockString);
112         MF_DEBUG("111");
113         parseMarkdownLine(&blockString, &autolinkedBlock);
114         MF_DEBUG("222");
115         amd.append(autolinkedBlock);
116         MF_DEBUG("333");
117         block.clear();
118     }
119     MF_DEBUG("Appended AUTOLINKED block:" << endl << "'" << amd << "'" << endl);
120 }
121 
process(const vector<string * > & md,string & amd)122 void CmarkTrieLineAutolinkingPreprocessor::process(
123         const vector<string*>& md,
124         string& amd)
125 {
126 #ifdef MF_MD_2_HTML_CMARK
127 
128 #ifdef DO_MF_DEBUG
129     MF_DEBUG("[Autolinking] begin CMARK" << endl);
130     string ds{};
131     toString(md, ds);
132     MF_DEBUG("[Autolinking] input:" << endl << ">>>" << ds << "<<<" << endl);
133 
134     auto begin = chrono::high_resolution_clock::now();
135 #endif
136 
137     insensitive = Configuration::getInstance().isAutolinkingCaseInsensitive();
138 
139     vector<string*> block{};
140     if(md.size()) {
141 
142         // IMPROVE measure time in here and if over give limit, than STOP injecting
143         // and leave i.e. what happens is that a time SLA will be fulfilled and
144         // some part (prefix) of the input MD will be autolinked.
145 
146         bool inCodeBlock=false, inMathBlock=false;
147         for(string* l:md) {
148             if(l && stringStartsWith(*l, CODE_BLOCK)) {
149                 block.push_back(l);
150                 if(inCodeBlock) {
151                     processProtectedBlock(block, amd);
152                 } else {
153                     processAndAutolinkBlock(block, amd);
154                 }
155                 inCodeBlock = !inCodeBlock;
156             } else if(l && stringStartsWith(*l, MATH_BLOCK)) {
157                 block.push_back(l);
158                 if(inMathBlock) {
159                     processProtectedBlock(block, amd);
160                 } else {
161                     processAndAutolinkBlock(block, amd);
162                 }
163                 inMathBlock= !inMathBlock;
164             } else {
165                 block.push_back(l);
166             }
167         }
168     }
169 
170     processAndAutolinkBlock(block, amd);
171 
172 #ifdef DO_MF_DEBUG
173     MF_DEBUG("[Autolinking] output:" << endl << ">>>" << amd << "<<<" << endl);
174 
175     auto end = chrono::high_resolution_clock::now();
176     MF_DEBUG("[Autolinking] MD autolinked in: " << chrono::duration_cast<chrono::microseconds>(end-begin).count()/1000000.0 << "ms" << endl);
177 #endif
178 
179 #else
180     toString(md, amd);
181 #endif
182 }
183 
184 
185 
processLineByLine(const std::vector<std::string * > & md,std::string & amd)186 void CmarkTrieLineAutolinkingPreprocessor::processLineByLine(
187         const std::vector<std::string*>& md,
188         std::string& amd)
189 {
190 #ifdef MF_MD_2_HTML_CMARK
191 
192 #ifdef DO_MF_DEBUG
193     MF_DEBUG("[Autolinking] begin CMARK-AHO" << endl);
194     string ds{};
195     toString(md, ds);
196     MF_DEBUG("[Autolinking] input:" << endl << ">>" << ds << "<<" << endl);
197 
198     auto begin = chrono::high_resolution_clock::now();
199 #endif
200 
201     // TODO rewrite
202     std::vector<std::string*> amdl{};
203 
204     insensitive = Configuration::getInstance().isAutolinkingCaseInsensitive();
205 
206     if(md.size()) {
207 
208         // IMPROVE measure time in here and if over give limit, than STOP injecting
209         // and leave i.e. what happens is that a time SLA will be fulfilled and
210         // some part (prefix) of the input MD will be autolinked.
211 
212         bool inCodeBlock=false, inMathBlock=false;
213         for(string* l:md) {
214             // every line is autolinked SEPARATELY
215             string* nl = new string{};
216 
217             // skip code/math/... blocks
218             if(stringStartsWith(*l, CODE_BLOCK)) {
219                 inCodeBlock = !inCodeBlock;
220 
221                 nl->assign(*l);
222                 amdl.push_back(nl);
223             } else if(stringStartsWith(*l, MATH_BLOCK)) {
224                 inMathBlock= !inMathBlock;
225 
226                 nl->assign(*l);
227                 amdl.push_back(nl);
228             } else if(l) {
229                 if(l->size() && !inCodeBlock && !inMathBlock) {
230                     parseMarkdownLine(l, nl);
231                     amdl.push_back(nl);
232                 } else {
233                     nl->assign(*l);
234                     amdl.push_back(nl);
235                 }
236             } else {
237                 delete nl;
238                 amdl.push_back(nullptr);
239             }
240         }
241     }
242 
243     toString(amdl, amd);
244 
245 #ifdef DO_MF_DEBUG
246     MF_DEBUG("[Autolinking] output:" << endl << ">>" << amd << "<<" << endl);
247 
248     auto end = chrono::high_resolution_clock::now();
249     MF_DEBUG("[Autolinking] MD autolinked in: " << chrono::duration_cast<chrono::microseconds>(end-begin).count()/1000000.0 << "ms" << endl);
250 #endif
251 
252 #else
253     toString(md, amd);
254 #endif
255 }
256 
parseMarkdownLine(const std::string * md,std::string * amd)257 void CmarkTrieLineAutolinkingPreprocessor::parseMarkdownLine(const std::string* md, std::string* amd)
258 {
259 #ifdef DO_MF_DEBUG
260     MF_DEBUG("[Autolinking] parsing line:" << endl << ">>" << *md << "<<" << endl);
261 #endif
262 
263 #ifdef MF_MD_2_HTML_CMARK
264     const char* smd = md->c_str();
265 
266     // cmark identifies '   * my bullet' as code block, which is wrong > workaround
267     string attic{};
268     if(stringStartsWith(smd, "   ")) {
269         attic.assign(*md);
270         attic[0] = '@';
271         smd = attic.c_str();
272         MF_DEBUG("[Autolinking] avoiding CODE block interpretation:" << endl << ">>" << attic << "<<" << endl);
273     }
274 
275     cmark_node* document = cmark_parse_document(
276         smd,
277         strlen(smd),
278         CMARK_OPT_DEFAULT);
279 
280     // AST iteration
281     cmark_iter *i = cmark_iter_new(document);
282     cmark_event_type eventType;
283     cmark_node* zombieNode{};
284 
285     bool inLinkImgOrCode = false;
286 
287     while ((eventType = cmark_iter_next(i)) != CMARK_EVENT_DONE) {
288         cmark_node *node = cmark_iter_get_node(i);
289 
290         if(zombieNode) {
291             cmark_node_unlink(zombieNode);
292             cmark_node_free(zombieNode);
293             zombieNode = nullptr;
294         }
295 
296         // IMPROVE make this a debug method
297         // do something with `cur` and `ev_type`
298         switch(eventType) {
299         case CMARK_EVENT_ENTER:
300             MF_DEBUG("ENTER");
301             break;
302         case CMARK_EVENT_EXIT:
303             MF_DEBUG("LEAVE");
304             break;
305         case CMARK_EVENT_DONE:
306             MF_DEBUG("DONE");
307             break;
308         case CMARK_EVENT_NONE:
309             MF_DEBUG("NONE");
310             break;
311         default:
312             MF_DEBUG(".");
313         }
314 
315         // Nodes must only be modified after an `EXIT` event,
316         // or an `ENTER` event for leaf nodes.
317 
318         // inlined autolinking constructions to avoid - auto-link in:
319         // - link
320         // - image
321         // - code
322         // ... iterate nodes and skip <text/> if UNDER link/image/code,
323         // otherwise trim text and try to autolink it.
324 
325         switch(cmark_node_get_type(node)) {
326         case CMARK_NODE_CODE:
327             MF_DEBUG(" code");
328         case CMARK_NODE_LINK:
329             MF_DEBUG(" link");
330         case CMARK_NODE_IMAGE:
331             MF_DEBUG(" image");
332             if(eventType == CMARK_EVENT_ENTER) {
333                 inLinkImgOrCode = true;
334             } else if (eventType == CMARK_EVENT_EXIT) {
335                 inLinkImgOrCode = false;
336             }
337             break;
338         case CMARK_NODE_TEXT:
339             MF_DEBUG(" text '" << cmark_node_get_literal(node) << "'" << endl);
340 
341             // TODO: block - split lines by new line
342 
343             if(!inLinkImgOrCode) {
344                 // replace text node w/ sequence of text and link nodes
345                 injectThingsLinks(node);
346                 zombieNode = node;
347             }
348             break;
349         default:
350             MF_DEBUG(" .");
351         }
352 
353         MF_DEBUG(endl);
354     }
355 
356     cmark_iter_free(i);
357 
358     char* cmm = cmark_render_commonmark(document, 0, 0);
359     amd->assign(cmm);
360     amd->pop_back();
361 
362     if(attic.size()) {
363         (*amd)[4] = ' ';
364         amd->erase(0, 1);
365     }
366 
367 #ifdef DO_MF_DEBUG
368     char* xml = cmark_render_xml(document, 0);
369     MF_DEBUG("[Autolinking] Line's cmark AST as XML:" << endl << endl);
370     MF_DEBUG(xml << endl);
371     free(xml);
372 
373     MF_DEBUG("[Autolinking] Line's cmark AST as MD:" << endl << ">>" << *amd << "<<" << endl);
374 #endif
375 
376     free(cmm);
377     cmark_node_free(document);
378 
379 #else
380     amd->assign(*md);
381 #endif
382 }
383 
384 // TODO move this to helper parent class ~ Cmark autolinker
addAstLinkNode(cmark_node * origNode,cmark_node * node,string & pre)385 cmark_node* CmarkTrieLineAutolinkingPreprocessor::addAstLinkNode(
386         cmark_node* origNode,
387         cmark_node* node,
388         string& pre)
389 {
390     cmark_node* linkNode{};
391     cmark_node* txtNode{};
392 
393     linkNode = cmark_node_new(CMARK_NODE_LINK);
394     string link{MF_URL_PROTOCOL};
395     link.append(pre);
396     cmark_node_set_url(linkNode, link.c_str());
397     txtNode = cmark_node_new(CMARK_NODE_TEXT);
398     cmark_node_set_literal(txtNode, pre.c_str());
399     cmark_node_append_child(linkNode, txtNode);
400     if(node) {
401         cmark_node_insert_after(node, linkNode);
402     } else {
403         cmark_node_insert_before(origNode, linkNode);
404     }
405     return linkNode;
406 }
407 
408 // TODO move this to helper parent class
addAstTxtNode(cmark_node * origNode,cmark_node * node,string & at)409 cmark_node* CmarkTrieLineAutolinkingPreprocessor::addAstTxtNode(
410         cmark_node* origNode,
411         cmark_node* node,
412         string& at)
413 {
414     cmark_node* txtNode{};
415 
416     txtNode = cmark_node_new(CMARK_NODE_TEXT);
417     cmark_node_set_literal(txtNode, at.c_str());
418     if(node) {
419         cmark_node_insert_after(node, txtNode);
420     } else {
421         cmark_node_insert_before(origNode, txtNode);
422     }
423 
424     at.clear();
425 
426     return txtNode;
427 }
428 
injectThingsLinks(cmark_node * origNode)429 void CmarkTrieLineAutolinkingPreprocessor::injectThingsLinks(cmark_node* origNode)
430 {
431     // copy w to t as it will be chopped word/match by word/match from head to tail
432     string txt{cmark_node_get_literal(origNode)};
433     string pre{}, at{}, chop{};
434     size_t preSize{};
435 
436     cmark_node* node{};
437 
438 #ifdef DO_MF_DEBUG
439     MF_DEBUG("[Autolinking] Injecting links to: '" << txt << "'" << endl);
440 #endif
441 
442     while(txt.size()>0) {
443         // skip trailing chars and append them
444         preSize = 0;
445         while(preSize < txt.size()) {
446             if(TRAILING_CHARS.find(txt.at(preSize)) != string::npos) {
447                 preSize++;
448             } else {
449                 break;
450             }
451         }
452         if(preSize) {
453             // chop trailing chars prefix from input and append it to result
454             at.append(txt.substr(0, preSize));
455             txt = txt.substr(preSize);
456 
457             MF_DEBUG("  Skipping trailing chars: '" << txt.substr(0, preSize) << "'" << endl);
458             MF_DEBUG("     txt: '" << txt << "'" << endl);
459             MF_DEBUG("     at : '" << at  << "'" << endl);
460         }
461 
462         // try to match word
463         pre.clear();
464         MF_DEBUG("  Trie search txt: '" << txt << "'" << endl);
465         if(mind.autolinkFindLongestPrefixWord(txt, pre)) {
466             MF_DEBUG("    Matched prefix: '" << pre << "'" << endl);
467 
468             // avoid word PREFIX matches ~ ensure that WHOLE world is matched:
469             // - match followed by trailing char
470             // - match until EOL
471 
472             // determine trailing char
473             char tChar{txt.size()==pre.size()?' ':txt.at(pre.size())};
474             MF_DEBUG("    Match's trailing char: '" << tChar << "'" << endl);
475             if(TRAILING_CHARS.find(tChar) != string::npos
476                  ||
477                txt.size() == pre.size()) {
478 
479                 // AST: add text node w/ content preceding link
480                 // IMPROVE make this method
481                 if(at.size()) {
482                     node = addAstTxtNode(origNode, node, at);
483                 }
484 
485                 // AST: add link
486                 node = addAstLinkNode(origNode, node, pre);
487 
488                 // chop linked prefix word from input word w
489                 if(txt.size() == pre.size()) {
490                     txt.clear();
491                 } else {
492                     txt = txt.substr(pre.size());
493                 }
494                 // trailing char will be handled later
495                 at.clear();
496 
497                 MF_DEBUG("    txt: '" << txt << "'" << endl);
498                 MF_DEBUG("    at : '" << at  << "'" << endl);
499             } else {
500                 // invalid trailing char > matched a NOT-whole-world prefix > skip and append one word
501 
502                 // TODO make it method
503                 // current w prefix was NOT linked > chop it and append it
504                 size_t begin = txt.find_first_of(" \t");
505                 if(begin != string::npos) {
506                     chop = txt.substr(0, begin);
507                     txt = txt.substr(begin+1);
508                     at.append(chop);
509                     // TODO IMPORTANT append what was found - space or tab! simply index there
510                     at.append(" ");
511 
512                     MF_DEBUG("  Skiping word: '" << chop << "'" << endl);
513                     MF_DEBUG("     txt: '" << txt << "'" << endl);
514                     MF_DEBUG("     at : '" << at  << "'" << endl);
515                 } else {
516                     // no more words (prefix already checked) > DONE
517                     at.append(txt);
518 
519                     MF_DEBUG("  DONE no-more words: '" << chop << "'" << endl);
520                     MF_DEBUG("     txt: '" << txt << "'" << endl);
521                     MF_DEBUG("     at : '" << at  << "'" << endl);
522 
523                     break;
524                 }
525             }
526         } else {
527             // didn't match prefix > skip and append one word
528 
529             // TODO make it method
530             // current w prefix was NOT linked > chop it and append it
531             size_t begin = txt.find_first_of(" \t");
532             if(begin != string::npos) {
533                 chop = txt.substr(0, begin);
534                 txt = txt.substr(begin+1);
535                 at.append(chop);
536                 // TODO IMPORTANT append what was found - space or tab! simply index there
537                 at.append(" ");
538 
539                 MF_DEBUG("  Skiping word: '" << chop << "'" << endl);
540                 MF_DEBUG("     txt: '" << txt << "'" << endl);
541                 MF_DEBUG("     at : '" << at  << "'" << endl);
542             } else {
543                 // no more words (prefix already checked) > DONE
544                 at.append(txt);
545 
546                 MF_DEBUG("  DONE no-more words: '" << chop << "'" << endl);
547                 MF_DEBUG("     txt: '" << txt << "'" << endl);
548                 MF_DEBUG("     at : '" << at  << "'" << endl);
549 
550                 break;
551             }
552         }
553     }
554 
555     // AST: add text node w/ content preceding link
556     if(at.size()) {
557         node = addAstTxtNode(origNode, node, at);
558     }
559 }
560 
561 } // m8r namespace
562