1 /*
2 markdown_lexer-sections.cpp MindForger thinking notebook
3
4 Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation; either version 2
9 of the License, or (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19 #include "markdown_lexer_sections.h"
20
21 using namespace std;
22
23 namespace m8r {
24
25 /*
26 * FileLineProvider
27 */
28
// IMPROVE consider lazy loading of lines using FileLineProvider instead of pre-loading
/**
 * Sequential reader returning the lines of a file one by one.
 *
 * The input stream is stored by value, so it is closed and released by its
 * own destructor. Copying is disabled: the previous raw-pointer member was
 * shared by implicit copies and deleted twice.
 */
class FileLineProvider
{
private:
    std::ifstream infile;

public:
    /**
     * Open the file for reading.
     *
     * @param filename path of the file to read.
     */
    explicit FileLineProvider(const char* filename);
    FileLineProvider(const FileLineProvider&) = delete;
    FileLineProvider& operator=(const FileLineProvider&) = delete;
    virtual ~FileLineProvider();

    /**
     * Read the next line of the file.
     *
     * @return newly allocated string owned by the caller; it is empty when
     *         nothing could be read (e.g. at the end of the file).
     */
    std::string* getLine();
};

FileLineProvider::FileLineProvider(const char* filename)
    : infile{filename}
{
}

std::string* FileLineProvider::getLine()
{
    // IMPROVE this is slow heap operation
    std::string* line = new std::string();
    std::getline(infile, *line);
    return line;
}

// the stream member closes the file when it is destroyed
FileLineProvider::~FileLineProvider() = default;
60
61 /*
62 * MarkdownLexemTable
63 */
64
MarkdownLexemTable()65 MarkdownLexemTable::MarkdownLexemTable()
66 {
67 BEGIN_DOC = new MarkdownLexem { MarkdownLexemType::BEGIN_DOC };
68 lexems.insert(BEGIN_DOC);
69 META_BEGIN = new MarkdownLexem { MarkdownLexemType::META_BEGIN };
70 lexems.insert(META_BEGIN);
71 META_PROPERTY_DELIMITER = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_DELIMITER};
72 lexems.insert(META_PROPERTY_DELIMITER);
73 META_PROPERTY_type = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_type};
74 lexems.insert(META_PROPERTY_type);
75 META_PROPERTY_created = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_created};
76 lexems.insert(META_PROPERTY_created);
77 META_PROPERTY_reads = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_reads};
78 lexems.insert(META_PROPERTY_reads);
79 META_PROPERTY_read = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_read};
80 lexems.insert(META_PROPERTY_read);
81 META_PROPERTY_revision = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_revision};
82 lexems.insert(META_PROPERTY_revision);
83 META_PROPERTY_modified = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_modified};
84 lexems.insert(META_PROPERTY_modified);
85 META_PROPERTY_importance = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_importance};
86 lexems.insert(META_PROPERTY_importance);
87 META_PROPERTY_urgency = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_urgency};
88 lexems.insert(META_PROPERTY_urgency);
89 META_PROPERTY_progress = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_progress};
90 lexems.insert(META_PROPERTY_progress);
91 META_PROPERTY_tags = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_tags};
92 lexems.insert(META_PROPERTY_tags);
93 META_PROPERTY_links= new MarkdownLexem{MarkdownLexemType::META_PROPERTY_links};
94 lexems.insert(META_PROPERTY_links);
95 META_PROPERTY_deadline = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_deadline};
96 lexems.insert(META_PROPERTY_deadline);
97 META_PROPERTY_scope = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_scope};
98 lexems.insert(META_PROPERTY_scope);
99 META_NAMEVALUE_DELIMITER = new MarkdownLexem{MarkdownLexemType::META_NAMEVALUE_DELIMITER};
100 lexems.insert(META_NAMEVALUE_DELIMITER);
101 HTML_COMMENT_BEGIN = new MarkdownLexem{MarkdownLexemType::HTML_COMMENT_BEGIN};
102 lexems.insert(HTML_COMMENT_BEGIN);
103 HTML_COMMENT_END = new MarkdownLexem{MarkdownLexemType::HTML_COMMENT_END};
104 lexems.insert(HTML_COMMENT_END);
105 BR = new MarkdownLexem{MarkdownLexemType::BR};
106 lexems.insert(BR);
107 END_DOC = new MarkdownLexem{MarkdownLexemType::END_DOC};
108 lexems.insert(END_DOC);
109 }
110
~MarkdownLexemTable()111 MarkdownLexemTable::~MarkdownLexemTable()
112 {
113 delete BEGIN_DOC;
114 delete META_BEGIN;
115 delete META_PROPERTY_DELIMITER;
116 delete META_PROPERTY_type;
117 delete META_PROPERTY_created;
118 delete META_PROPERTY_reads;
119 delete META_PROPERTY_read;
120 delete META_PROPERTY_revision;
121 delete META_PROPERTY_modified;
122 delete META_PROPERTY_importance;
123 delete META_PROPERTY_urgency;
124 delete META_PROPERTY_progress;
125 delete META_PROPERTY_tags;
126 delete META_PROPERTY_links;
127 delete META_PROPERTY_deadline;
128 delete META_PROPERTY_scope;
129 delete META_NAMEVALUE_DELIMITER;
130 delete HTML_COMMENT_BEGIN;
131 delete HTML_COMMENT_END;
132 delete BR;
133 delete END_DOC;
134 }
135
136 /*
137 * MarkdownSymbolTable
138 */
139
140 const MarkdownLexemTable& MarkdownSymbolTable::LEXEM = MarkdownLexemTable();
141
MarkdownSymbolTable()142 MarkdownSymbolTable::MarkdownSymbolTable()
143 {
144 }
145
lookup(string * symbol) const146 bool MarkdownSymbolTable::lookup(string* symbol) const
147 {
148 if(symbol!=nullptr && symbols.find(symbol)!=symbols.end()) {
149 return true;
150 } else {
151 return false;
152 }
153 }
154
addSymbol(string * symbol)155 void MarkdownSymbolTable::addSymbol(string* symbol)
156 {
157 symbols.insert(symbol);
158 }
159
~MarkdownSymbolTable()160 MarkdownSymbolTable::~MarkdownSymbolTable()
161 {
162 if(symbols.size()) {
163 // TODO delete set members
164 }
165 }
166
167 /*
168 * MarkdownLexerSections
169 */
170
MarkdownLexerSections(const string * filePath)171 MarkdownLexerSections::MarkdownLexerSections(const string* filePath)
172 {
173 this->filePath = filePath;
174 this->fileSize = 0;
175 this->inCodeBlock = false;
176 this->lastBrTokensOffset = 0;
177 }
178
~MarkdownLexerSections()179 MarkdownLexerSections::~MarkdownLexerSections()
180 {
181 // lines
182 for(string*& line:lines) {
183 if(line!=nullptr) {
184 delete line;
185 }
186 }
187
188 // lexems
189 for(MarkdownLexem*& lexem:lexems) {
190 if(lexem!=nullptr) {
191 if(!MarkdownSymbolTable::LEXEM.contains(lexem)) {
192 delete lexem;
193 lexem=nullptr;
194 }
195 }
196 }
197 }
198
tokenize()199 void MarkdownLexerSections::tokenize()
200 {
201 fileSize = 0;
202 if(fileToLines(filePath, lines, fileSize)) {
203 // IMPROVE body of this function can be shared by file & text
204 lexems.push_back(MarkdownSymbolTable::LEXEM.BEGIN_DOC);
205
206 unsigned offset = 0;
207 while(nextToken(offset)) {
208 offset++;
209 }
210
211 if(lexems.size()==1) {
212 lexems.clear();
213 } else {
214 lexems.push_back(MarkdownSymbolTable::LEXEM.END_DOC);
215 }
216 }
217 }
218
tokenize(const string * text)219 void MarkdownLexerSections::tokenize(const string* text)
220 {
221 if(stringToLines(text, lines)) {
222 // IMPROVE body of this function can be shared by file & text
223 lexems.push_back(MarkdownSymbolTable::LEXEM.BEGIN_DOC);
224
225 unsigned offset = 0;
226 while(nextToken(offset)) {
227 offset++;
228 }
229
230 if(lexems.size()==1) {
231 lexems.clear();
232 } else {
233 lexems.push_back(MarkdownSymbolTable::LEXEM.END_DOC);
234 }
235 }
236 }
237
lexWhitespaces(const unsigned offset,unsigned short int & idx)238 bool MarkdownLexerSections::lexWhitespaces(const unsigned offset, unsigned short int& idx)
239 {
240 unsigned short int i = idx+1;
241 if(lines[offset]!=nullptr) {
242 while(lines[offset]->size()>i && isspace(lines[offset]->at(i))) {
243 i++;
244 }
245 if(i != idx+1) {
246 lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,idx+1,i-1-idx));
247 idx = i-1;
248 return true;
249 }
250 }
251 return false;
252 }
253
startsWithCodeBlockSymbol(const unsigned offset) const254 bool MarkdownLexerSections::startsWithCodeBlockSymbol(const unsigned offset) const
255 {
256 if(lines[offset]!=nullptr && lines[offset]->size()>=3
257 &&
258 lines[offset]->at(0)=='`' && lines[offset]->at(1)=='`' && lines[offset]->at(2)=='`'
259 ){
260 return true;
261 } else {
262 return false;
263 }
264 }
265
startsWithHtmlCommentEndSymbol(const unsigned offset,const unsigned short idx) const266 bool MarkdownLexerSections::startsWithHtmlCommentEndSymbol(const unsigned offset, const unsigned short idx) const
267 {
268 if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+3)
269 &&
270 lines[offset]->at(idx)=='-' && lines[offset]->at(idx+1)=='-' && lines[offset]->at(idx+2)=='>'
271 ){
272 return true;
273 } else {
274 return false;
275 }
276 }
277
lexSectionSymbol(const unsigned offset,unsigned short int & idx)278 bool MarkdownLexerSections::lexSectionSymbol(const unsigned offset, unsigned short int& idx)
279 {
280 unsigned depth = 0; // depth = [0,n)
281 if(lines[offset]!=nullptr) {
282 while(lines[offset]->size()>depth && lines[offset]->at(depth)=='#') {
283 ++depth;
284 }
285 if(depth
286 &&
287 (lines[offset]->size()>=depth || isspace(lines[offset]->at(depth))))
288 {
289 idx = depth-1;
290 lexems.push_back(new MarkdownLexem(MarkdownLexemType::SECTION,depth-1));
291 return true;
292 }
293 }
294 return false;
295 }
296
lexHtmlCommentBeginSymbol(const unsigned offset,unsigned short int & idx)297 bool MarkdownLexerSections::lexHtmlCommentBeginSymbol(const unsigned offset, unsigned short int& idx)
298 {
299 if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+4)
300 &&
301 lines[offset]->at(idx)=='<' && lines[offset]->at(idx+1)=='!' && lines[offset]->at(idx+2)=='-' && lines[offset]->at(idx+3)=='-'
302 ){
303 idx+=4;
304 lexems.push_back(symbolTable.LEXEM.HTML_COMMENT_BEGIN);
305 return true;
306 } else {
307 return false;
308 }
309 }
310
lexHtmlCommentEndSymbol(const unsigned offset,unsigned short int & idx)311 bool MarkdownLexerSections::lexHtmlCommentEndSymbol(const unsigned offset, unsigned short int& idx)
312 {
313 if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+3)
314 &&
315 lines[offset]->at(idx)=='-' && lines[offset]->at(idx+1)=='-' && lines[offset]->at(idx+2)=='>'
316 ){
317 idx+=3;
318 lexems.push_back(symbolTable.LEXEM.HTML_COMMENT_END);
319 return true;
320 } else {
321 return false;
322 }
323 }
324
lexMetadataSymbol(const unsigned offset,unsigned short int & idx)325 bool MarkdownLexerSections::lexMetadataSymbol(const unsigned offset, unsigned short int& idx)
326 {
327 // case insensitive 'metadata'
328 if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+9)
329 &&
330 (lines[offset]->at(idx+1)=='M' || lines[offset]->at(idx+1)=='m') &&
331 (lines[offset]->at(idx+2)=='e' || lines[offset]->at(idx+2)=='E') &&
332 (lines[offset]->at(idx+3)=='t' || lines[offset]->at(idx+3)=='T') &&
333 (lines[offset]->at(idx+4)=='a' || lines[offset]->at(idx+4)=='A') &&
334 (lines[offset]->at(idx+5)=='d' || lines[offset]->at(idx+5)=='D') &&
335 (lines[offset]->at(idx+6)=='a' || lines[offset]->at(idx+6)=='A') &&
336 (lines[offset]->at(idx+7)=='t' || lines[offset]->at(idx+7)=='T') &&
337 (lines[offset]->at(idx+8)=='a' || lines[offset]->at(idx+8)=='A') &&
338 lines[offset]->at(idx+9)==':'
339 ){
340 idx+=9;
341 lexems.push_back(symbolTable.LEXEM.META_BEGIN);
342 return true;
343 } else {
344 return false;
345 }
346 }
347
lexMetaPropertyName(const unsigned offset,unsigned short int & idx)348 bool MarkdownLexerSections::lexMetaPropertyName(const unsigned offset, unsigned short int& idx)
349 {
350 if(lines[offset]->size() > (size_t)(idx+1)) {
351 switch(lines[offset]->at(idx+1)) {
352 case 't':
353 if(lines[offset]->at(idx+2)=='y' &&
354 lines[offset]->at(idx+3)=='p' &&
355 lines[offset]->at(idx+4)=='e' &&
356 (lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
357 idx+=4;
358 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_type);
359 return true;
360 } else {
361 if(lines[offset]->at(idx+2)=='a' &&
362 lines[offset]->at(idx+3)=='g' &&
363 lines[offset]->at(idx+4)=='s' &&
364 (lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
365 idx+=4;
366 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_tags);
367 return true;
368 } else {
369 return false;
370 }
371 }
372 case 'c':
373 if(lines[offset]->at(idx+2)=='r' &&
374 lines[offset]->at(idx+3)=='e' &&
375 lines[offset]->at(idx+4)=='a' &&
376 lines[offset]->at(idx+5)=='t' &&
377 lines[offset]->at(idx+6)=='e' &&
378 lines[offset]->at(idx+7)=='d' &&
379 (lines[offset]->at(idx+8)==':' || !isspace(idx+8))) {
380 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_created);
381 idx+=7;
382 return true;
383 } else {
384 return false;
385 }
386 case 'r':
387 if(lines[offset]->at(idx+2)=='e') {
388 if(lines[offset]->at(idx+3)=='a' &&
389 lines[offset]->at(idx+4)=='d')
390 {
391 if(lines[offset]->at(idx+5)=='s' &&
392 (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
393 idx+=5;
394 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_reads);
395 return true;
396 } else {
397 if((lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
398 idx+=4;
399 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_read);
400 return true;
401 }
402 }
403 } else {
404 if(lines[offset]->at(idx+3)=='v' &&
405 lines[offset]->at(idx+4)=='i' &&
406 lines[offset]->at(idx+5)=='s' &&
407 lines[offset]->at(idx+6)=='i' &&
408 lines[offset]->at(idx+7)=='o' &&
409 lines[offset]->at(idx+8)=='n' &&
410 (lines[offset]->at(idx+9)==':' || !isspace(idx+9)))
411 {
412 idx+=8;
413 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_revision);
414 return true;
415 }
416 }
417 }
418 return false;
419 case 'i':
420 if(lines[offset]->at(idx+2)=='m' &&
421 lines[offset]->at(idx+3)=='p' &&
422 lines[offset]->at(idx+4)=='o' &&
423 lines[offset]->at(idx+5)=='r' &&
424 lines[offset]->at(idx+6)=='t' &&
425 lines[offset]->at(idx+7)=='a' &&
426 lines[offset]->at(idx+8)=='n' &&
427 lines[offset]->at(idx+9)=='c' &&
428 lines[offset]->at(idx+10)=='e' &&
429 (lines[offset]->at(idx+11)==':' || !isspace(idx+11))) {
430 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_importance);
431 idx+=10;
432 return true;
433 } else {
434 return false;
435 }
436 case 'u':
437 if(lines[offset]->at(idx+2)=='r' &&
438 lines[offset]->at(idx+3)=='g' &&
439 lines[offset]->at(idx+4)=='e' &&
440 lines[offset]->at(idx+5)=='n' &&
441 lines[offset]->at(idx+6)=='c' &&
442 lines[offset]->at(idx+7)=='y' &&
443 (lines[offset]->at(idx+8)==':' || !isspace(idx+8))) {
444 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_urgency);
445 idx+=7;
446 return true;
447 } else {
448 return false;
449 }
450 case 'p':
451 if(lines[offset]->at(idx+2)=='r' &&
452 lines[offset]->at(idx+3)=='o' &&
453 lines[offset]->at(idx+4)=='g' &&
454 lines[offset]->at(idx+5)=='r' &&
455 lines[offset]->at(idx+6)=='e' &&
456 lines[offset]->at(idx+7)=='s' &&
457 lines[offset]->at(idx+8)=='s' &&
458 (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
459 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_progress);
460 idx+=8;
461 return true;
462 } else {
463 return false;
464 }
465 case 'm':
466 if(lines[offset]->at(idx+2)=='o' &&
467 lines[offset]->at(idx+3)=='d' &&
468 lines[offset]->at(idx+4)=='i' &&
469 lines[offset]->at(idx+5)=='f' &&
470 lines[offset]->at(idx+6)=='i' &&
471 lines[offset]->at(idx+7)=='e' &&
472 lines[offset]->at(idx+8)=='d' &&
473 (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
474 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_modified);
475 idx+=8;
476 return true;
477 } else {
478 return false;
479 }
480 case 'l':
481 // key for relationships is 'links' because a) there are clashes for 'r' b) links is shorter than relationships
482 if(lines[offset]->at(idx+2)=='i' &&
483 lines[offset]->at(idx+3)=='n' &&
484 lines[offset]->at(idx+4)=='k' &&
485 lines[offset]->at(idx+5)=='s' &&
486 (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
487 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_links);
488 idx+=5;
489 return true;
490 } else {
491 return false;
492 }
493 case 's':
494 if(lines[offset]->at(idx+2)=='c' &&
495 lines[offset]->at(idx+3)=='o' &&
496 lines[offset]->at(idx+4)=='p' &&
497 lines[offset]->at(idx+5)=='e' &&
498 (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
499 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_scope);
500 idx+=5;
501 return true;
502 } else {
503 return false;
504 }
505 case 'd':
506 if(lines[offset]->at(idx+2)=='e' &&
507 lines[offset]->at(idx+3)=='a' &&
508 lines[offset]->at(idx+4)=='d' &&
509 lines[offset]->at(idx+5)=='l' &&
510 lines[offset]->at(idx+6)=='i' &&
511 lines[offset]->at(idx+7)=='n' &&
512 lines[offset]->at(idx+8)=='e' &&
513 (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
514 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_deadline);
515 idx+=8;
516 return true;
517 } else {
518 return false;
519 }
520 default:
521 return false;
522 }
523 }
524 return false;
525 }
526
527 /**
528 * Tokenize the remaining part of the line regardless what's there.
529 */
lexToEndOfHtmlComment(const unsigned offset,unsigned short int & idx)530 bool MarkdownLexerSections::lexToEndOfHtmlComment(const unsigned offset, unsigned short int& idx)
531 {
532 if(lines[offset]!=nullptr && lines[offset]->size()>(size_t)(idx+1)) {
533 unsigned short int i;
534 for(i=idx+1;
535 i<lines[offset]->size();
536 i++) {
537 if(lines[offset]->at(i)=='-') {
538 if(startsWithHtmlCommentEndSymbol(offset,i)) {
539 if(i > idx+1) {
540 lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_TEXT,offset,idx,i-idx)); // note: ushort-ushort narrowing ({} > ())
541 idx=i;
542 }
543 lexHtmlCommentEndSymbol(offset,i);
544 if(lines[offset]->size()>=i) {
545 lexems.push_back(symbolTable.LEXEM.BR);
546 }
547 return true;
548 }
549 }
550 }
551 if(i > idx+1) {
552 lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_TEXT,offset,idx,i-idx)); // note: ushort-ushort narrowing ({} > ())
553 lexems.push_back(symbolTable.LEXEM.BR);
554 idx=i;
555 return true;
556 }
557 }
558 lexems.push_back(symbolTable.LEXEM.BR);
559 return false;
560 }
561
lexPostDeclaredSectionHeader(const unsigned int offset,const char delimiter)562 bool MarkdownLexerSections::lexPostDeclaredSectionHeader(const unsigned int offset, const char delimiter)
563 {
564 if (offset == 0 || inCodeBlock) {
565 // if the first MD document line starts with '=' > markdown document w/ preamble || in code > ignore
566 addLineToLexems(offset);
567 return false;
568 } else {
569 // previous line is valid section name && current line is header line for that name
570 if(lines[offset-1]!=nullptr && lines[offset-1]->size()>=2 && !isspace(lines[offset-1]->at(0))
571 &&
572 isSameCharsLine(offset, delimiter))
573 {
574 // Patch processed lexems and inject post declared section marker - before:
575 //
576 // #0 BEGIN_DOC 0 0 0
577 // #1 LINE 0 0 *
578 // #2 BR 0 0 0
579 // #3 ... current line...
580 // after:
581 // #0 BEGIN_DOC 0 0 0
582 // #1 SECTION= 0 0 0 # note === marker: SECTION vs SECTION= vs SECTION-
583 // #2 LINE 0 0 *
584 // #3 BR 0 0 0
585 //
586 // or before:
587 //
588 // #3 ...
589 // #4 BR 0 0 0
590 // #5 BR 0 0 0
591 // #6 BR 0 0 0
592 // #7 LINE 8 0 *
593 // #8 BR 0 0 0
594 // #9 ... current line...
595 // after:
596 // #3 ...
597 // #4 BR 0 0 0
598 // #5 BR 0 0 0
599 // #6 BR 0 0 0
600 // #7 SECTION= 0 0 0 # note === marker: SECTION vs SECTION= vs SECTION-
601 // #8 LINE 8 0 *
602 // #9 BR 0 0 0
603 //
604 // Conlusion: there MUST be BR and LINE, insert section right after, otherwise normal line
605
606 if(lexems.size()>2 // BEGIN_DOC LINE BR
607 &&
608 lexems[lexems.size()-1]->getType()==MarkdownLexemType::BR
609 &&
610 lexems[lexems.size()-2]->getType()==MarkdownLexemType::LINE)
611 {
612 if(delimiter=='=') {
613 lexems.insert(lexems.begin()+lexems.size()-2, new MarkdownLexem(MarkdownLexemType::SECTION_equals,0));
614 } else {
615 lexems.insert(lexems.begin()+lexems.size()-2, new MarkdownLexem(MarkdownLexemType::SECTION_hyphens,1));
616 }
617 } else {
618 addLineToLexems(offset);
619 return false;
620 }
621 // insert marker > parser will create section w/ section type marker (lexem lookahead)
622 return true;
623 } else {
624 addLineToLexems(offset);
625 return false;
626 }
627 }
628 }
629
addLineToLexems(const unsigned int offset)630 void MarkdownLexerSections::addLineToLexems(const unsigned int offset)
631 {
632 lexems.push_back(new MarkdownLexem{MarkdownLexemType::LINE, offset, 0, MarkdownLexem::WHOLE_LINE});
633 lexems.push_back(symbolTable.LEXEM.BR);
634 }
635
nextToken(const unsigned int offset)636 bool MarkdownLexerSections::nextToken(const unsigned int offset) {
637 if(offset<lines.size()) {
638 if(lines[offset]->size()==0) {
639 lexems.push_back(symbolTable.LEXEM.BR);
640 return true;
641 } else {
642 switch(lines[offset]->at(0)) {
643 case '`':
644 if(startsWithCodeBlockSymbol(offset)) {
645 // sections lexer just needs to detect code block to avoid detection of false sections, but no need to tokenize it
646 toggleInCodeBlock();
647 }
648 addLineToLexems(offset);
649 return true;
650 case '#': // IF #+ space THEN section ELSE line
651 if(!inCodeBlock) {
652 unsigned short int idx = 0;
653 if(!lexSectionSymbol(offset,idx)) {
654 addLineToLexems(offset);
655 return true;
656 } else {
657 // #+ parsed > process the rest: [:whitespace]+ (TEXT [:whitespace]+)* METADATA?
658 lexWhitespaces(offset, idx);
659 char cc;
660 unsigned short int ws=0, text=0, x = idx+1;
661 while(lookahead(offset,idx)) {
662 cc = lines[offset]->at(++idx);
663 if(isspace(cc)) {
664 // a) whitespaces
665 if(ws==0 && text) {
666 lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,x,idx-x)); // note: ushort-ushort narrowing ({} > ())
667 text = 0;
668 x = idx;
669 }
670 ws++;
671 } else {
672 // b1) text as section metadata
673 if(cc=='<' && lexHtmlCommentBeginSymbol(offset,idx)) {
674 lexWhitespaces(offset,idx);
675 if(lexMetadataSymbol(offset,idx)) {
676 do {
677 lexWhitespaces(offset,idx);
678 if(lexMetaPropertyName(offset,idx)) {
679 lexWhitespaces(offset,idx);
680 if(lexMetaPropertyNameValueDelimiter(offset,idx)) {
681 lexWhitespaces(offset,idx);
682 if(lexMetaPropertyValue(offset,idx)) {
683 lexMetaPropertyDelimiter(offset,idx);
684 } else {
685 // a mess inside comment > seek before the end of comment and finish line parsing
686 lexToEndOfHtmlComment(offset,idx);
687 return true;
688 }
689 } else {
690 lexToEndOfHtmlComment(offset,idx);
691 return true;
692 }
693 } else {
694 lexToEndOfHtmlComment(offset,idx);
695 return true;
696 }
697 } while(lookahead(offset,idx));
698 lexWhitespaces(offset,idx);
699
700 unsigned short int mess = 0;
701 char ccc;
702 while(lookahead(offset,idx)) {
703 ccc = lines[offset]->at(++idx);
704 if(ccc=='-' && lexHtmlCommentEndSymbol(offset,idx)) {
705 if(mess) {
706 // TODO BUG add text BEFORE last lexem
707 lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,idx-mess,mess)); // note: ushort-ushort narrowing ({} > ())
708 }
709 // IMPROVE process the rest of line after --> (ignored for now)
710
711 // TODO FIX
712 lexems.push_back(symbolTable.LEXEM.BR);
713 // TODO FIX
714
715 return true;
716 } else {
717 mess++;
718 }
719 }
720 if(mess) {
721 lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,idx-mess,mess)); // note: ushort-ushort narrowing ({} > ())
722 }
723
724 // TODO FIX
725 lexems.push_back(symbolTable.LEXEM.BR);
726 // TODO FIX
727
728 return true;
729 } else {
730 // parse HTML comment (as it's not HTML comment w/ metadata)
731 // TODO
732 }
733 } else {
734 // b2) text
735 if(text==0 && ws) {
736 lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,x,idx-x)); // note: ushort-ushort narrowing ({} > ())
737 ws = 0;
738 x = idx;
739 }
740 text++;
741 }
742 }
743 } // while
744 if(ws) {
745 lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,x,idx+1-x)); // note: ushort-ushort narrowing ({} > ())
746 }
747 if(text) {
748 lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,x,idx+1-x)); // note: ushort-ushort narrowing ({} > ())
749 }
750 lexems.push_back(symbolTable.LEXEM.BR);
751 return true;
752 }
753 } else {
754 // in code block
755 addLineToLexems(offset);
756 }
757 return true;
758 case '=': // IF [=]+ > 1st level header (not in ``` and prev text)
759 lexPostDeclaredSectionHeader(offset, '=');
760 return true;
761 case '-':
762 lexPostDeclaredSectionHeader(offset, '-');
763 return true;
764 default:
765 addLineToLexems(offset);
766 return true;
767 }
768 }
769 } else {
770 return false;
771 }
772 }
773
isSameCharsLine(const unsigned offset,const char c) const774 bool MarkdownLexerSections::isSameCharsLine(const unsigned offset, const char c) const
775 {
776 // fail fast
777 if(lines[offset]!=nullptr && lines[offset]->size()
778 &&
779 lines[offset]->at(0)==c && lines[offset]->at(lines[offset]->size()-1)==c)
780 {
781 for(unsigned i=1; i<lines[offset]->size()-1; i++) {
782 if(lines[offset]->at(i)!=c) {
783 return false;
784 }
785 }
786 return true;
787 }
788 return false;
789 }
790
lookahead(const unsigned offset,const unsigned short idx) const791 bool MarkdownLexerSections::lookahead(const unsigned offset, const unsigned short idx) const
792 {
793 if(lines[offset]->size() > (size_t)(idx+1)) {
794 return true;
795 } else {
796 return false;
797 }
798 }
799
lexMetaPropertyNameValueDelimiter(const unsigned offset,unsigned short int & idx)800 bool MarkdownLexerSections::lexMetaPropertyNameValueDelimiter(const unsigned offset, unsigned short int& idx)
801 {
802 if(lines[offset]->size()>(size_t)(idx+1) && lines[offset]->at(idx+1)==':') {
803 idx++;
804 lexems.push_back(symbolTable.LEXEM.META_NAMEVALUE_DELIMITER);
805 return true;
806 } else {
807 return false;
808 }
809 }
810
lexMetaPropertyValue(const unsigned offset,unsigned short int & idx)811 bool MarkdownLexerSections::lexMetaPropertyValue(const unsigned offset, unsigned short int& idx)
812 {
813 if(lines[offset]!=nullptr && lines[offset]->size()>(size_t)(idx+1)) {
814 unsigned short int i;
815 for(i=idx+1;
816 i<lines[offset]->size() && lines[offset]->at(i)!=';';
817 i++)
818 {}
819 if(i>idx+1) {
820 lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_PROPERTY_VALUE,offset,idx+1,i-idx-1));
821 idx=i-1;
822 return true;
823 }
824 }
825 return false;
826 }
827
lexMetaPropertyDelimiter(const unsigned offset,unsigned short int & idx)828 bool MarkdownLexerSections::lexMetaPropertyDelimiter(const unsigned offset, unsigned short int& idx)
829 {
830 if(lines[offset]->size()>(size_t)(idx+1) && lines[offset]->at(idx+1)==';') {
831 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_DELIMITER);
832 idx++;
833 return true;
834 } else {
835 return false;
836 }
837 }
838
getText(const MarkdownLexem * lexem)839 string* MarkdownLexerSections::getText(const MarkdownLexem* lexem)
840 {
841 if(lexem!=nullptr && lines.size()) {
842 if(lexem->getOff()<lines.size()) {
843 if(lexem->getLng()==MarkdownLexem::WHOLE_LINE) {
844 string *result = lines[lexem->getOff()];
845 lines[lexem->getOff()] = nullptr;
846 return result;
847 } else {
848 if(lexem->getLng()==0) {
849 return new string{};
850 } else {
851 return new string{lines[lexem->getOff()]->substr(lexem->getIdx(),lexem->getLng())};
852 }
853 }
854 }
855 }
856 return nullptr;
857 }
858
859 } // m8r namespace
860