1 /*
2 cmark_trie_line_autolinking_preprocessor.cpp MindForger thinking notebook
3
4 Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation; either version 2
9 of the License, or (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "cmark_trie_line_autolinking_preprocessor.h"
20
21 /*
22 * DEPRECATED
23 *
24 * This line-based autolinker has been deprecated by block-based autlinker.
25 */
26
27 /*
28 * High priority tasks:
29 *
30 * - unit tests first, manual tests then
31 * - move trie to memory (or mind) and keep it up to date there (based on mind state)
32 * - switch from row based MD>HTML TO block based MD2>HTML
33 * - the reason is that MD lines cannot be correctly converted w/o context
34 * e.g. multi-item deep bullet lists (tab prefix causes parser to make it code block)
35 * - the solution is to let CMARK parse MD, traverse and inject links to AST as needed
36 * - O or N edit: always DELETE old name and add new name to TRIE
37 *
38 * - autolink tags: if no N/O found on click, then open tags dialog
39 *
40 * Plan:
41 *
42 * - ensure correctness FIRST ~ unit tests:
43 * x protection of bullet lists
44 * - protection of deep bullet lists
45 * (the problem list that deep bullet list cannot be
46 * autolinked as line, but whole block > consider
47 * injecting AST, rather than lines - CMARK needs
48 * to know context of previous/next line)
49 * >>> perhaps it could be faster than line by line autoinjections
50 * x no trailing spaces
51 * x protection of links/images/...
52 * - protection of inlined MATH $..$
53 * - blacklist ~ don't autolink e.g. http (to protect cmark's URLs autolinking)
54 *
55 * - bugs
56 * - click link > dialog > choose > crash
57 *
58 * - code style
59 * - copy/paste code to methods
60 *
61 * - performance
62 * - avoid trie rebuild - on N save():
63 * ! implement delete operation on trie:
64 * - enhance trie w/ refcount for words so that delete knows whether to remove or not
65 * - implement delete() operation:
66 * - finds word mark, decreases refcount, if it's leaf than it goes up deletes orphan
67 * OR:
68 * 1) synchronous: add new N name to trie on its save()
69 * 2) async: after save let trie rebuild async in other
70 * thread to update it (trie access must be synchronized)
71 * - avoid autolinking whole O on its load - it's not needed > debug why it happens
72 * - map search structure instead of Aho
73 * - benchmark on C++ repo
74 * - configurable time limit on autolinking and leave on exceeding it
75 */
76
77 namespace m8r {
78
79 using namespace std;
80
81 const string CmarkTrieLineAutolinkingPreprocessor::TRAILING_CHARS = string{" \t,:;.!?<>{}&()-+/*\\_=%~#$^[]'\""};
82
CmarkTrieLineAutolinkingPreprocessor(Mind & mind)83 CmarkTrieLineAutolinkingPreprocessor::CmarkTrieLineAutolinkingPreprocessor(Mind& mind)
84 : AutolinkingPreprocessor{mind}
85 {
86 }
87
~CmarkTrieLineAutolinkingPreprocessor()88 CmarkTrieLineAutolinkingPreprocessor::~CmarkTrieLineAutolinkingPreprocessor()
89 {
90 }
91
processProtectedBlock(vector<string * > & block,string & amd)92 void CmarkTrieLineAutolinkingPreprocessor::processProtectedBlock(
93 vector<string*>& block,
94 string& amd)
95 {
96 if(block.size()) {
97 string blockString{};
98 toString(block, blockString);
99 amd.append(blockString);
100 block.clear();
101 }
102 MF_DEBUG("Appended PROTECTED block:" << endl << "'" << amd << "'" << endl);
103 }
104
processAndAutolinkBlock(vector<string * > & block,string & amd)105 void CmarkTrieLineAutolinkingPreprocessor::processAndAutolinkBlock(
106 vector<string*>& block,
107 string& amd)
108 {
109 if(block.size()) {
110 string blockString{}, autolinkedBlock{};
111 toString(block, blockString);
112 MF_DEBUG("111");
113 parseMarkdownLine(&blockString, &autolinkedBlock);
114 MF_DEBUG("222");
115 amd.append(autolinkedBlock);
116 MF_DEBUG("333");
117 block.clear();
118 }
119 MF_DEBUG("Appended AUTOLINKED block:" << endl << "'" << amd << "'" << endl);
120 }
121
process(const vector<string * > & md,string & amd)122 void CmarkTrieLineAutolinkingPreprocessor::process(
123 const vector<string*>& md,
124 string& amd)
125 {
126 #ifdef MF_MD_2_HTML_CMARK
127
128 #ifdef DO_MF_DEBUG
129 MF_DEBUG("[Autolinking] begin CMARK" << endl);
130 string ds{};
131 toString(md, ds);
132 MF_DEBUG("[Autolinking] input:" << endl << ">>>" << ds << "<<<" << endl);
133
134 auto begin = chrono::high_resolution_clock::now();
135 #endif
136
137 insensitive = Configuration::getInstance().isAutolinkingCaseInsensitive();
138
139 vector<string*> block{};
140 if(md.size()) {
141
142 // IMPROVE measure time in here and if over give limit, than STOP injecting
143 // and leave i.e. what happens is that a time SLA will be fulfilled and
144 // some part (prefix) of the input MD will be autolinked.
145
146 bool inCodeBlock=false, inMathBlock=false;
147 for(string* l:md) {
148 if(l && stringStartsWith(*l, CODE_BLOCK)) {
149 block.push_back(l);
150 if(inCodeBlock) {
151 processProtectedBlock(block, amd);
152 } else {
153 processAndAutolinkBlock(block, amd);
154 }
155 inCodeBlock = !inCodeBlock;
156 } else if(l && stringStartsWith(*l, MATH_BLOCK)) {
157 block.push_back(l);
158 if(inMathBlock) {
159 processProtectedBlock(block, amd);
160 } else {
161 processAndAutolinkBlock(block, amd);
162 }
163 inMathBlock= !inMathBlock;
164 } else {
165 block.push_back(l);
166 }
167 }
168 }
169
170 processAndAutolinkBlock(block, amd);
171
172 #ifdef DO_MF_DEBUG
173 MF_DEBUG("[Autolinking] output:" << endl << ">>>" << amd << "<<<" << endl);
174
175 auto end = chrono::high_resolution_clock::now();
176 MF_DEBUG("[Autolinking] MD autolinked in: " << chrono::duration_cast<chrono::microseconds>(end-begin).count()/1000000.0 << "ms" << endl);
177 #endif
178
179 #else
180 toString(md, amd);
181 #endif
182 }
183
184
185
processLineByLine(const std::vector<std::string * > & md,std::string & amd)186 void CmarkTrieLineAutolinkingPreprocessor::processLineByLine(
187 const std::vector<std::string*>& md,
188 std::string& amd)
189 {
190 #ifdef MF_MD_2_HTML_CMARK
191
192 #ifdef DO_MF_DEBUG
193 MF_DEBUG("[Autolinking] begin CMARK-AHO" << endl);
194 string ds{};
195 toString(md, ds);
196 MF_DEBUG("[Autolinking] input:" << endl << ">>" << ds << "<<" << endl);
197
198 auto begin = chrono::high_resolution_clock::now();
199 #endif
200
201 // TODO rewrite
202 std::vector<std::string*> amdl{};
203
204 insensitive = Configuration::getInstance().isAutolinkingCaseInsensitive();
205
206 if(md.size()) {
207
208 // IMPROVE measure time in here and if over give limit, than STOP injecting
209 // and leave i.e. what happens is that a time SLA will be fulfilled and
210 // some part (prefix) of the input MD will be autolinked.
211
212 bool inCodeBlock=false, inMathBlock=false;
213 for(string* l:md) {
214 // every line is autolinked SEPARATELY
215 string* nl = new string{};
216
217 // skip code/math/... blocks
218 if(stringStartsWith(*l, CODE_BLOCK)) {
219 inCodeBlock = !inCodeBlock;
220
221 nl->assign(*l);
222 amdl.push_back(nl);
223 } else if(stringStartsWith(*l, MATH_BLOCK)) {
224 inMathBlock= !inMathBlock;
225
226 nl->assign(*l);
227 amdl.push_back(nl);
228 } else if(l) {
229 if(l->size() && !inCodeBlock && !inMathBlock) {
230 parseMarkdownLine(l, nl);
231 amdl.push_back(nl);
232 } else {
233 nl->assign(*l);
234 amdl.push_back(nl);
235 }
236 } else {
237 delete nl;
238 amdl.push_back(nullptr);
239 }
240 }
241 }
242
243 toString(amdl, amd);
244
245 #ifdef DO_MF_DEBUG
246 MF_DEBUG("[Autolinking] output:" << endl << ">>" << amd << "<<" << endl);
247
248 auto end = chrono::high_resolution_clock::now();
249 MF_DEBUG("[Autolinking] MD autolinked in: " << chrono::duration_cast<chrono::microseconds>(end-begin).count()/1000000.0 << "ms" << endl);
250 #endif
251
252 #else
253 toString(md, amd);
254 #endif
255 }
256
parseMarkdownLine(const std::string * md,std::string * amd)257 void CmarkTrieLineAutolinkingPreprocessor::parseMarkdownLine(const std::string* md, std::string* amd)
258 {
259 #ifdef DO_MF_DEBUG
260 MF_DEBUG("[Autolinking] parsing line:" << endl << ">>" << *md << "<<" << endl);
261 #endif
262
263 #ifdef MF_MD_2_HTML_CMARK
264 const char* smd = md->c_str();
265
266 // cmark identifies ' * my bullet' as code block, which is wrong > workaround
267 string attic{};
268 if(stringStartsWith(smd, " ")) {
269 attic.assign(*md);
270 attic[0] = '@';
271 smd = attic.c_str();
272 MF_DEBUG("[Autolinking] avoiding CODE block interpretation:" << endl << ">>" << attic << "<<" << endl);
273 }
274
275 cmark_node* document = cmark_parse_document(
276 smd,
277 strlen(smd),
278 CMARK_OPT_DEFAULT);
279
280 // AST iteration
281 cmark_iter *i = cmark_iter_new(document);
282 cmark_event_type eventType;
283 cmark_node* zombieNode{};
284
285 bool inLinkImgOrCode = false;
286
287 while ((eventType = cmark_iter_next(i)) != CMARK_EVENT_DONE) {
288 cmark_node *node = cmark_iter_get_node(i);
289
290 if(zombieNode) {
291 cmark_node_unlink(zombieNode);
292 cmark_node_free(zombieNode);
293 zombieNode = nullptr;
294 }
295
296 // IMPROVE make this a debug method
297 // do something with `cur` and `ev_type`
298 switch(eventType) {
299 case CMARK_EVENT_ENTER:
300 MF_DEBUG("ENTER");
301 break;
302 case CMARK_EVENT_EXIT:
303 MF_DEBUG("LEAVE");
304 break;
305 case CMARK_EVENT_DONE:
306 MF_DEBUG("DONE");
307 break;
308 case CMARK_EVENT_NONE:
309 MF_DEBUG("NONE");
310 break;
311 default:
312 MF_DEBUG(".");
313 }
314
315 // Nodes must only be modified after an `EXIT` event,
316 // or an `ENTER` event for leaf nodes.
317
318 // inlined autolinking constructions to avoid - auto-link in:
319 // - link
320 // - image
321 // - code
322 // ... iterate nodes and skip <text/> if UNDER link/image/code,
323 // otherwise trim text and try to autolink it.
324
325 switch(cmark_node_get_type(node)) {
326 case CMARK_NODE_CODE:
327 MF_DEBUG(" code");
328 case CMARK_NODE_LINK:
329 MF_DEBUG(" link");
330 case CMARK_NODE_IMAGE:
331 MF_DEBUG(" image");
332 if(eventType == CMARK_EVENT_ENTER) {
333 inLinkImgOrCode = true;
334 } else if (eventType == CMARK_EVENT_EXIT) {
335 inLinkImgOrCode = false;
336 }
337 break;
338 case CMARK_NODE_TEXT:
339 MF_DEBUG(" text '" << cmark_node_get_literal(node) << "'" << endl);
340
341 // TODO: block - split lines by new line
342
343 if(!inLinkImgOrCode) {
344 // replace text node w/ sequence of text and link nodes
345 injectThingsLinks(node);
346 zombieNode = node;
347 }
348 break;
349 default:
350 MF_DEBUG(" .");
351 }
352
353 MF_DEBUG(endl);
354 }
355
356 cmark_iter_free(i);
357
358 char* cmm = cmark_render_commonmark(document, 0, 0);
359 amd->assign(cmm);
360 amd->pop_back();
361
362 if(attic.size()) {
363 (*amd)[4] = ' ';
364 amd->erase(0, 1);
365 }
366
367 #ifdef DO_MF_DEBUG
368 char* xml = cmark_render_xml(document, 0);
369 MF_DEBUG("[Autolinking] Line's cmark AST as XML:" << endl << endl);
370 MF_DEBUG(xml << endl);
371 free(xml);
372
373 MF_DEBUG("[Autolinking] Line's cmark AST as MD:" << endl << ">>" << *amd << "<<" << endl);
374 #endif
375
376 free(cmm);
377 cmark_node_free(document);
378
379 #else
380 amd->assign(*md);
381 #endif
382 }
383
384 // TODO move this to helper parent class ~ Cmark autolinker
addAstLinkNode(cmark_node * origNode,cmark_node * node,string & pre)385 cmark_node* CmarkTrieLineAutolinkingPreprocessor::addAstLinkNode(
386 cmark_node* origNode,
387 cmark_node* node,
388 string& pre)
389 {
390 cmark_node* linkNode{};
391 cmark_node* txtNode{};
392
393 linkNode = cmark_node_new(CMARK_NODE_LINK);
394 string link{MF_URL_PROTOCOL};
395 link.append(pre);
396 cmark_node_set_url(linkNode, link.c_str());
397 txtNode = cmark_node_new(CMARK_NODE_TEXT);
398 cmark_node_set_literal(txtNode, pre.c_str());
399 cmark_node_append_child(linkNode, txtNode);
400 if(node) {
401 cmark_node_insert_after(node, linkNode);
402 } else {
403 cmark_node_insert_before(origNode, linkNode);
404 }
405 return linkNode;
406 }
407
408 // TODO move this to helper parent class
addAstTxtNode(cmark_node * origNode,cmark_node * node,string & at)409 cmark_node* CmarkTrieLineAutolinkingPreprocessor::addAstTxtNode(
410 cmark_node* origNode,
411 cmark_node* node,
412 string& at)
413 {
414 cmark_node* txtNode{};
415
416 txtNode = cmark_node_new(CMARK_NODE_TEXT);
417 cmark_node_set_literal(txtNode, at.c_str());
418 if(node) {
419 cmark_node_insert_after(node, txtNode);
420 } else {
421 cmark_node_insert_before(origNode, txtNode);
422 }
423
424 at.clear();
425
426 return txtNode;
427 }
428
injectThingsLinks(cmark_node * origNode)429 void CmarkTrieLineAutolinkingPreprocessor::injectThingsLinks(cmark_node* origNode)
430 {
431 // copy w to t as it will be chopped word/match by word/match from head to tail
432 string txt{cmark_node_get_literal(origNode)};
433 string pre{}, at{}, chop{};
434 size_t preSize{};
435
436 cmark_node* node{};
437
438 #ifdef DO_MF_DEBUG
439 MF_DEBUG("[Autolinking] Injecting links to: '" << txt << "'" << endl);
440 #endif
441
442 while(txt.size()>0) {
443 // skip trailing chars and append them
444 preSize = 0;
445 while(preSize < txt.size()) {
446 if(TRAILING_CHARS.find(txt.at(preSize)) != string::npos) {
447 preSize++;
448 } else {
449 break;
450 }
451 }
452 if(preSize) {
453 // chop trailing chars prefix from input and append it to result
454 at.append(txt.substr(0, preSize));
455 txt = txt.substr(preSize);
456
457 MF_DEBUG(" Skipping trailing chars: '" << txt.substr(0, preSize) << "'" << endl);
458 MF_DEBUG(" txt: '" << txt << "'" << endl);
459 MF_DEBUG(" at : '" << at << "'" << endl);
460 }
461
462 // try to match word
463 pre.clear();
464 MF_DEBUG(" Trie search txt: '" << txt << "'" << endl);
465 if(mind.autolinkFindLongestPrefixWord(txt, pre)) {
466 MF_DEBUG(" Matched prefix: '" << pre << "'" << endl);
467
468 // avoid word PREFIX matches ~ ensure that WHOLE world is matched:
469 // - match followed by trailing char
470 // - match until EOL
471
472 // determine trailing char
473 char tChar{txt.size()==pre.size()?' ':txt.at(pre.size())};
474 MF_DEBUG(" Match's trailing char: '" << tChar << "'" << endl);
475 if(TRAILING_CHARS.find(tChar) != string::npos
476 ||
477 txt.size() == pre.size()) {
478
479 // AST: add text node w/ content preceding link
480 // IMPROVE make this method
481 if(at.size()) {
482 node = addAstTxtNode(origNode, node, at);
483 }
484
485 // AST: add link
486 node = addAstLinkNode(origNode, node, pre);
487
488 // chop linked prefix word from input word w
489 if(txt.size() == pre.size()) {
490 txt.clear();
491 } else {
492 txt = txt.substr(pre.size());
493 }
494 // trailing char will be handled later
495 at.clear();
496
497 MF_DEBUG(" txt: '" << txt << "'" << endl);
498 MF_DEBUG(" at : '" << at << "'" << endl);
499 } else {
500 // invalid trailing char > matched a NOT-whole-world prefix > skip and append one word
501
502 // TODO make it method
503 // current w prefix was NOT linked > chop it and append it
504 size_t begin = txt.find_first_of(" \t");
505 if(begin != string::npos) {
506 chop = txt.substr(0, begin);
507 txt = txt.substr(begin+1);
508 at.append(chop);
509 // TODO IMPORTANT append what was found - space or tab! simply index there
510 at.append(" ");
511
512 MF_DEBUG(" Skiping word: '" << chop << "'" << endl);
513 MF_DEBUG(" txt: '" << txt << "'" << endl);
514 MF_DEBUG(" at : '" << at << "'" << endl);
515 } else {
516 // no more words (prefix already checked) > DONE
517 at.append(txt);
518
519 MF_DEBUG(" DONE no-more words: '" << chop << "'" << endl);
520 MF_DEBUG(" txt: '" << txt << "'" << endl);
521 MF_DEBUG(" at : '" << at << "'" << endl);
522
523 break;
524 }
525 }
526 } else {
527 // didn't match prefix > skip and append one word
528
529 // TODO make it method
530 // current w prefix was NOT linked > chop it and append it
531 size_t begin = txt.find_first_of(" \t");
532 if(begin != string::npos) {
533 chop = txt.substr(0, begin);
534 txt = txt.substr(begin+1);
535 at.append(chop);
536 // TODO IMPORTANT append what was found - space or tab! simply index there
537 at.append(" ");
538
539 MF_DEBUG(" Skiping word: '" << chop << "'" << endl);
540 MF_DEBUG(" txt: '" << txt << "'" << endl);
541 MF_DEBUG(" at : '" << at << "'" << endl);
542 } else {
543 // no more words (prefix already checked) > DONE
544 at.append(txt);
545
546 MF_DEBUG(" DONE no-more words: '" << chop << "'" << endl);
547 MF_DEBUG(" txt: '" << txt << "'" << endl);
548 MF_DEBUG(" at : '" << at << "'" << endl);
549
550 break;
551 }
552 }
553 }
554
555 // AST: add text node w/ content preceding link
556 if(at.size()) {
557 node = addAstTxtNode(origNode, node, at);
558 }
559 }
560
561 } // m8r namespace
562