1 /*
2  markdown_lexer-sections.cpp     MindForger thinking notebook
3 
4  Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  as published by the Free Software Foundation; either version 2
9  of the License, or (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program. If not, see <http://www.gnu.org/licenses/>.
18  */
19 #include "markdown_lexer_sections.h"
20 
21 using namespace std;
22 
23 namespace m8r {
24 
25 /*
26  * FileLineProvider
27  */
28 
29 // IMPROVE consider lazy loading of lines using FileLineProvider instead of pre-loading
/**
 * Sequential reader of text file lines.
 *
 * The provider owns the input stream which it allocates in the constructor
 * and deletes in the destructor. Copying is disabled: a copy would share the
 * raw stream pointer and the stream would be deleted twice.
 */
class FileLineProvider
{
private:
    // owned input stream - allocated by the constructor, deleted by the destructor
    std::ifstream* infile;

public:
    explicit FileLineProvider(const char* filename);
    FileLineProvider(const FileLineProvider&) = delete;
    FileLineProvider& operator=(const FileLineProvider&) = delete;
    virtual ~FileLineProvider();

    // Reads the next line into a newly allocated string; the caller must delete it.
    std::string* getLine();
};
41 
FileLineProvider(const char * filename)42 FileLineProvider::FileLineProvider(const char* filename)
43 {
44     infile = new ifstream{filename};
45 }
46 
getLine()47 string* FileLineProvider::getLine()
48 {
49     // IMPROVE this is slow heap operation
50     string* line = new string();
51     getline(*infile, *line);
52     return line;
53 }
54 
~FileLineProvider()55 FileLineProvider::~FileLineProvider()
56 {
57     infile->close();
58     delete infile;
59 }
60 
61 /*
62  * MarkdownLexemTable
63  */
64 
MarkdownLexemTable()65 MarkdownLexemTable::MarkdownLexemTable()
66 {
67     BEGIN_DOC = new MarkdownLexem { MarkdownLexemType::BEGIN_DOC };
68     lexems.insert(BEGIN_DOC);
69     META_BEGIN = new MarkdownLexem { MarkdownLexemType::META_BEGIN };
70     lexems.insert(META_BEGIN);
71     META_PROPERTY_DELIMITER = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_DELIMITER};
72     lexems.insert(META_PROPERTY_DELIMITER);
73     META_PROPERTY_type = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_type};
74     lexems.insert(META_PROPERTY_type);
75     META_PROPERTY_created = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_created};
76     lexems.insert(META_PROPERTY_created);
77     META_PROPERTY_reads = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_reads};
78     lexems.insert(META_PROPERTY_reads);
79     META_PROPERTY_read = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_read};
80     lexems.insert(META_PROPERTY_read);
81     META_PROPERTY_revision = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_revision};
82     lexems.insert(META_PROPERTY_revision);
83     META_PROPERTY_modified = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_modified};
84     lexems.insert(META_PROPERTY_modified);
85     META_PROPERTY_importance = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_importance};
86     lexems.insert(META_PROPERTY_importance);
87     META_PROPERTY_urgency = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_urgency};
88     lexems.insert(META_PROPERTY_urgency);
89     META_PROPERTY_progress = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_progress};
90     lexems.insert(META_PROPERTY_progress);
91     META_PROPERTY_tags = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_tags};
92     lexems.insert(META_PROPERTY_tags);
93     META_PROPERTY_links= new MarkdownLexem{MarkdownLexemType::META_PROPERTY_links};
94     lexems.insert(META_PROPERTY_links);
95     META_PROPERTY_deadline = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_deadline};
96     lexems.insert(META_PROPERTY_deadline);
97     META_PROPERTY_scope = new MarkdownLexem{MarkdownLexemType::META_PROPERTY_scope};
98     lexems.insert(META_PROPERTY_scope);
99     META_NAMEVALUE_DELIMITER = new MarkdownLexem{MarkdownLexemType::META_NAMEVALUE_DELIMITER};
100     lexems.insert(META_NAMEVALUE_DELIMITER);
101     HTML_COMMENT_BEGIN = new MarkdownLexem{MarkdownLexemType::HTML_COMMENT_BEGIN};
102     lexems.insert(HTML_COMMENT_BEGIN);
103     HTML_COMMENT_END = new MarkdownLexem{MarkdownLexemType::HTML_COMMENT_END};
104     lexems.insert(HTML_COMMENT_END);
105     BR = new MarkdownLexem{MarkdownLexemType::BR};
106     lexems.insert(BR);
107     END_DOC = new MarkdownLexem{MarkdownLexemType::END_DOC};
108     lexems.insert(END_DOC);
109 }
110 
~MarkdownLexemTable()111 MarkdownLexemTable::~MarkdownLexemTable()
112 {
113     delete BEGIN_DOC;
114     delete META_BEGIN;
115     delete META_PROPERTY_DELIMITER;
116     delete META_PROPERTY_type;
117     delete META_PROPERTY_created;
118     delete META_PROPERTY_reads;
119     delete META_PROPERTY_read;
120     delete META_PROPERTY_revision;
121     delete META_PROPERTY_modified;
122     delete META_PROPERTY_importance;
123     delete META_PROPERTY_urgency;
124     delete META_PROPERTY_progress;
125     delete META_PROPERTY_tags;
126     delete META_PROPERTY_links;
127     delete META_PROPERTY_deadline;
128     delete META_PROPERTY_scope;
129     delete META_NAMEVALUE_DELIMITER;
130     delete HTML_COMMENT_BEGIN;
131     delete HTML_COMMENT_END;
132     delete BR;
133     delete END_DOC;
134 }
135 
136 /*
137  * MarkdownSymbolTable
138  */
139 
// Table of the shared parameter-less lexems (BEGIN_DOC, BR, END_DOC, ...).
// The temporary is bound to a reference with static storage duration, so its
// lifetime is extended to the lifetime of the reference.
const MarkdownLexemTable& MarkdownSymbolTable::LEXEM = MarkdownLexemTable();
141 
MarkdownSymbolTable()142 MarkdownSymbolTable::MarkdownSymbolTable()
143 {
144 }
145 
lookup(string * symbol) const146 bool MarkdownSymbolTable::lookup(string* symbol) const
147 {
148     if(symbol!=nullptr && symbols.find(symbol)!=symbols.end()) {
149         return true;
150     } else {
151         return false;
152     }
153 }
154 
addSymbol(string * symbol)155 void MarkdownSymbolTable::addSymbol(string* symbol)
156 {
157     symbols.insert(symbol);
158 }
159 
~MarkdownSymbolTable()160 MarkdownSymbolTable::~MarkdownSymbolTable()
161 {
162     if(symbols.size()) {
163         // TODO delete set members
164     }
165 }
166 
167 /*
168  * MarkdownLexerSections
169  */
170 
MarkdownLexerSections(const string * filePath)171 MarkdownLexerSections::MarkdownLexerSections(const string* filePath)
172 {
173     this->filePath = filePath;
174     this->fileSize = 0;
175     this->inCodeBlock = false;
176     this->lastBrTokensOffset = 0;
177 }
178 
~MarkdownLexerSections()179 MarkdownLexerSections::~MarkdownLexerSections()
180 {
181     // lines
182     for(string*& line:lines) {
183         if(line!=nullptr) {
184             delete line;
185         }
186     }
187 
188     // lexems
189     for(MarkdownLexem*& lexem:lexems) {
190         if(lexem!=nullptr) {
191             if(!MarkdownSymbolTable::LEXEM.contains(lexem)) {
192                 delete lexem;
193                 lexem=nullptr;
194             }
195         }
196     }
197 }
198 
tokenize()199 void MarkdownLexerSections::tokenize()
200 {
201     fileSize = 0;
202     if(fileToLines(filePath, lines, fileSize)) {
203         // IMPROVE body of this function can be shared by file & text
204         lexems.push_back(MarkdownSymbolTable::LEXEM.BEGIN_DOC);
205 
206         unsigned offset = 0;
207         while(nextToken(offset)) {
208             offset++;
209         }
210 
211         if(lexems.size()==1) {
212             lexems.clear();
213         } else {
214             lexems.push_back(MarkdownSymbolTable::LEXEM.END_DOC);
215         }
216     }
217 }
218 
tokenize(const string * text)219 void MarkdownLexerSections::tokenize(const string* text)
220 {
221     if(stringToLines(text, lines)) {
222         // IMPROVE body of this function can be shared by file & text
223         lexems.push_back(MarkdownSymbolTable::LEXEM.BEGIN_DOC);
224 
225         unsigned offset = 0;
226         while(nextToken(offset)) {
227             offset++;
228         }
229 
230         if(lexems.size()==1) {
231             lexems.clear();
232         } else {
233             lexems.push_back(MarkdownSymbolTable::LEXEM.END_DOC);
234         }
235     }
236 }
237 
lexWhitespaces(const unsigned offset,unsigned short int & idx)238 bool MarkdownLexerSections::lexWhitespaces(const unsigned offset, unsigned short int& idx)
239 {
240     unsigned short int i = idx+1;
241     if(lines[offset]!=nullptr) {
242         while(lines[offset]->size()>i && isspace(lines[offset]->at(i))) {
243             i++;
244         }
245         if(i != idx+1) {
246             lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,idx+1,i-1-idx));
247             idx = i-1;
248             return true;
249         }
250     }
251     return false;
252 }
253 
startsWithCodeBlockSymbol(const unsigned offset) const254 bool MarkdownLexerSections::startsWithCodeBlockSymbol(const unsigned offset) const
255 {
256     if(lines[offset]!=nullptr && lines[offset]->size()>=3
257          &&
258        lines[offset]->at(0)=='`' && lines[offset]->at(1)=='`' && lines[offset]->at(2)=='`'
259     ){
260         return true;
261     } else {
262         return false;
263     }
264 }
265 
startsWithHtmlCommentEndSymbol(const unsigned offset,const unsigned short idx) const266 bool MarkdownLexerSections::startsWithHtmlCommentEndSymbol(const unsigned offset, const unsigned short idx) const
267 {
268     if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+3)
269          &&
270        lines[offset]->at(idx)=='-' && lines[offset]->at(idx+1)=='-' && lines[offset]->at(idx+2)=='>'
271     ){
272         return true;
273     } else {
274         return false;
275     }
276 }
277 
lexSectionSymbol(const unsigned offset,unsigned short int & idx)278 bool MarkdownLexerSections::lexSectionSymbol(const unsigned offset, unsigned short int& idx)
279 {
280     unsigned depth = 0; // depth = [0,n)
281     if(lines[offset]!=nullptr) {
282         while(lines[offset]->size()>depth && lines[offset]->at(depth)=='#') {
283             ++depth;
284         }
285         if(depth
286              &&
287            (lines[offset]->size()>=depth || isspace(lines[offset]->at(depth))))
288         {
289             idx = depth-1;
290             lexems.push_back(new MarkdownLexem(MarkdownLexemType::SECTION,depth-1));
291             return true;
292         }
293     }
294     return false;
295 }
296 
lexHtmlCommentBeginSymbol(const unsigned offset,unsigned short int & idx)297 bool MarkdownLexerSections::lexHtmlCommentBeginSymbol(const unsigned offset, unsigned short int& idx)
298 {
299     if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+4)
300          &&
301        lines[offset]->at(idx)=='<' && lines[offset]->at(idx+1)=='!' && lines[offset]->at(idx+2)=='-' && lines[offset]->at(idx+3)=='-'
302     ){
303         idx+=4;
304         lexems.push_back(symbolTable.LEXEM.HTML_COMMENT_BEGIN);
305         return true;
306     } else {
307         return false;
308     }
309 }
310 
lexHtmlCommentEndSymbol(const unsigned offset,unsigned short int & idx)311 bool MarkdownLexerSections::lexHtmlCommentEndSymbol(const unsigned offset, unsigned short int& idx)
312 {
313     if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+3)
314          &&
315        lines[offset]->at(idx)=='-' && lines[offset]->at(idx+1)=='-' && lines[offset]->at(idx+2)=='>'
316     ){
317         idx+=3;
318         lexems.push_back(symbolTable.LEXEM.HTML_COMMENT_END);
319         return true;
320     } else {
321         return false;
322     }
323 }
324 
lexMetadataSymbol(const unsigned offset,unsigned short int & idx)325 bool MarkdownLexerSections::lexMetadataSymbol(const unsigned offset, unsigned short int& idx)
326 {
327     // case insensitive 'metadata'
328     if(lines[offset]!=nullptr && lines[offset]->size()>=(size_t)(idx+9)
329          &&
330        (lines[offset]->at(idx+1)=='M' || lines[offset]->at(idx+1)=='m') &&
331        (lines[offset]->at(idx+2)=='e' || lines[offset]->at(idx+2)=='E') &&
332        (lines[offset]->at(idx+3)=='t' || lines[offset]->at(idx+3)=='T') &&
333        (lines[offset]->at(idx+4)=='a' || lines[offset]->at(idx+4)=='A') &&
334        (lines[offset]->at(idx+5)=='d' || lines[offset]->at(idx+5)=='D') &&
335        (lines[offset]->at(idx+6)=='a' || lines[offset]->at(idx+6)=='A') &&
336        (lines[offset]->at(idx+7)=='t' || lines[offset]->at(idx+7)=='T') &&
337        (lines[offset]->at(idx+8)=='a' || lines[offset]->at(idx+8)=='A') &&
338        lines[offset]->at(idx+9)==':'
339     ){
340         idx+=9;
341         lexems.push_back(symbolTable.LEXEM.META_BEGIN);
342         return true;
343     } else {
344         return false;
345     }
346 }
347 
lexMetaPropertyName(const unsigned offset,unsigned short int & idx)348 bool MarkdownLexerSections::lexMetaPropertyName(const unsigned offset, unsigned short int& idx)
349 {
350     if(lines[offset]->size() > (size_t)(idx+1)) {
351         switch(lines[offset]->at(idx+1)) {
352         case 't':
353             if(lines[offset]->at(idx+2)=='y' &&
354                lines[offset]->at(idx+3)=='p' &&
355                lines[offset]->at(idx+4)=='e' &&
356                (lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
357                 idx+=4;
358                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_type);
359                 return true;
360             } else {
361                 if(lines[offset]->at(idx+2)=='a' &&
362                    lines[offset]->at(idx+3)=='g' &&
363                    lines[offset]->at(idx+4)=='s' &&
364                    (lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
365                     idx+=4;
366                     lexems.push_back(symbolTable.LEXEM.META_PROPERTY_tags);
367                     return true;
368                 } else {
369                     return false;
370                 }
371             }
372         case 'c':
373             if(lines[offset]->at(idx+2)=='r' &&
374                lines[offset]->at(idx+3)=='e' &&
375                lines[offset]->at(idx+4)=='a' &&
376                lines[offset]->at(idx+5)=='t' &&
377                lines[offset]->at(idx+6)=='e' &&
378                lines[offset]->at(idx+7)=='d' &&
379                (lines[offset]->at(idx+8)==':' || !isspace(idx+8))) {
380                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_created);
381                 idx+=7;
382                 return true;
383             } else {
384                 return false;
385             }
386         case 'r':
387             if(lines[offset]->at(idx+2)=='e') {
388                 if(lines[offset]->at(idx+3)=='a' &&
389                    lines[offset]->at(idx+4)=='d')
390                 {
391                     if(lines[offset]->at(idx+5)=='s' &&
392                        (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
393                         idx+=5;
394                         lexems.push_back(symbolTable.LEXEM.META_PROPERTY_reads);
395                         return true;
396                     } else {
397                         if((lines[offset]->at(idx+5)==':' || !isspace(idx+5))) {
398                             idx+=4;
399                             lexems.push_back(symbolTable.LEXEM.META_PROPERTY_read);
400                             return true;
401                         }
402                     }
403                 } else {
404                     if(lines[offset]->at(idx+3)=='v' &&
405                        lines[offset]->at(idx+4)=='i' &&
406                        lines[offset]->at(idx+5)=='s' &&
407                        lines[offset]->at(idx+6)=='i' &&
408                        lines[offset]->at(idx+7)=='o' &&
409                        lines[offset]->at(idx+8)=='n' &&
410                        (lines[offset]->at(idx+9)==':' || !isspace(idx+9)))
411                     {
412                         idx+=8;
413                         lexems.push_back(symbolTable.LEXEM.META_PROPERTY_revision);
414                         return true;
415                     }
416                 }
417             }
418             return false;
419         case 'i':
420             if(lines[offset]->at(idx+2)=='m' &&
421                lines[offset]->at(idx+3)=='p' &&
422                lines[offset]->at(idx+4)=='o' &&
423                lines[offset]->at(idx+5)=='r' &&
424                lines[offset]->at(idx+6)=='t' &&
425                lines[offset]->at(idx+7)=='a' &&
426                lines[offset]->at(idx+8)=='n' &&
427                lines[offset]->at(idx+9)=='c' &&
428                lines[offset]->at(idx+10)=='e' &&
429                (lines[offset]->at(idx+11)==':' || !isspace(idx+11))) {
430                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_importance);
431                 idx+=10;
432                 return true;
433             } else {
434                 return false;
435             }
436         case 'u':
437             if(lines[offset]->at(idx+2)=='r' &&
438                lines[offset]->at(idx+3)=='g' &&
439                lines[offset]->at(idx+4)=='e' &&
440                lines[offset]->at(idx+5)=='n' &&
441                lines[offset]->at(idx+6)=='c' &&
442                lines[offset]->at(idx+7)=='y' &&
443                (lines[offset]->at(idx+8)==':' || !isspace(idx+8))) {
444                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_urgency);
445                 idx+=7;
446                 return true;
447             } else {
448                 return false;
449             }
450         case 'p':
451             if(lines[offset]->at(idx+2)=='r' &&
452                lines[offset]->at(idx+3)=='o' &&
453                lines[offset]->at(idx+4)=='g' &&
454                lines[offset]->at(idx+5)=='r' &&
455                lines[offset]->at(idx+6)=='e' &&
456                lines[offset]->at(idx+7)=='s' &&
457                lines[offset]->at(idx+8)=='s' &&
458                (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
459                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_progress);
460                 idx+=8;
461                 return true;
462             } else {
463                 return false;
464             }
465         case 'm':
466             if(lines[offset]->at(idx+2)=='o' &&
467                lines[offset]->at(idx+3)=='d' &&
468                lines[offset]->at(idx+4)=='i' &&
469                lines[offset]->at(idx+5)=='f' &&
470                lines[offset]->at(idx+6)=='i' &&
471                lines[offset]->at(idx+7)=='e' &&
472                lines[offset]->at(idx+8)=='d' &&
473                (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
474                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_modified);
475                 idx+=8;
476                 return true;
477             } else {
478                 return false;
479             }
480         case 'l':
481             // key for relationships is 'links' because a) there are clashes for 'r' b) links is shorter than relationships
482             if(lines[offset]->at(idx+2)=='i' &&
483                lines[offset]->at(idx+3)=='n' &&
484                lines[offset]->at(idx+4)=='k' &&
485                lines[offset]->at(idx+5)=='s' &&
486                (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
487                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_links);
488                 idx+=5;
489                 return true;
490             } else {
491                 return false;
492             }
493         case 's':
494             if(lines[offset]->at(idx+2)=='c' &&
495                lines[offset]->at(idx+3)=='o' &&
496                lines[offset]->at(idx+4)=='p' &&
497                lines[offset]->at(idx+5)=='e' &&
498                (lines[offset]->at(idx+6)==':' || !isspace(idx+6))) {
499                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_scope);
500                 idx+=5;
501                 return true;
502             } else {
503                 return false;
504             }
505         case 'd':
506             if(lines[offset]->at(idx+2)=='e' &&
507                lines[offset]->at(idx+3)=='a' &&
508                lines[offset]->at(idx+4)=='d' &&
509                lines[offset]->at(idx+5)=='l' &&
510                lines[offset]->at(idx+6)=='i' &&
511                lines[offset]->at(idx+7)=='n' &&
512                lines[offset]->at(idx+8)=='e' &&
513                (lines[offset]->at(idx+9)==':' || !isspace(idx+9))) {
514                 lexems.push_back(symbolTable.LEXEM.META_PROPERTY_deadline);
515                 idx+=8;
516                 return true;
517             } else {
518                 return false;
519             }
520         default:
521             return false;
522         }
523     }
524     return false;
525 }
526 
/**
 * Tokenize the remaining part of the line regardless what's there.
 *
 * The line is scanned from idx+1 for the HTML comment end symbol (-->):
 *
 *  - symbol found: META_TEXT lexem is appended for the skipped characters (if any),
 *    then HTML_COMMENT_END and BR lexems are appended and true is returned
 *  - symbol not found: META_TEXT for the rest of the line and BR are appended,
 *    idx is moved to the end of the line and true is returned
 *  - null line or nothing after idx: only BR is appended and false is returned
 */
bool MarkdownLexerSections::lexToEndOfHtmlComment(const unsigned offset, unsigned short int& idx)
{
    if(lines[offset]!=nullptr && lines[offset]->size()>(size_t)(idx+1)) {
        unsigned short int i;
        for(i=idx+1;
            i<lines[offset]->size();
            i++) {
            if(lines[offset]->at(i)=='-') {
                if(startsWithHtmlCommentEndSymbol(offset,i)) {
                    if(i > idx+1) {
                        // text between idx and the comment end symbol
                        lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_TEXT,offset,idx,i-idx)); // note: ushort-ushort narrowing ({} > ())
                        idx=i;
                    }
                    // appends HTML_COMMENT_END and advances the local i (not idx) behind -->
                    lexHtmlCommentEndSymbol(offset,i);
                    if(lines[offset]->size()>=i) {
                        lexems.push_back(symbolTable.LEXEM.BR);
                    }
                    return true;
                }
            }
        }
        // comment end symbol not found > the rest of the line is text
        if(i > idx+1) {
            lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_TEXT,offset,idx,i-idx)); // note: ushort-ushort narrowing ({} > ())
            lexems.push_back(symbolTable.LEXEM.BR);
            idx=i;
            return true;
        }
    }
    // nothing to lex > just terminate the line
    lexems.push_back(symbolTable.LEXEM.BR);
    return false;
}
561 
lexPostDeclaredSectionHeader(const unsigned int offset,const char delimiter)562 bool MarkdownLexerSections::lexPostDeclaredSectionHeader(const unsigned int offset, const char delimiter)
563 {
564     if (offset == 0 || inCodeBlock) {
565         // if the first MD document line starts with '=' > markdown document w/ preamble || in code > ignore
566         addLineToLexems(offset);
567         return false;
568     } else {
569         // previous line is valid section name && current line is header line for that name
570         if(lines[offset-1]!=nullptr && lines[offset-1]->size()>=2 && !isspace(lines[offset-1]->at(0))
571              &&
572            isSameCharsLine(offset, delimiter))
573         {
574             // Patch processed lexems and inject post declared section marker - before:
575             //
576             //  #0 BEGIN_DOC          0 0 0
577             //  #1 LINE               0 0 *
578             //  #2 BR                 0 0 0
579             //  #3 ... current line...
580             // after:
581             //  #0 BEGIN_DOC          0 0 0
582             //  #1 SECTION=           0 0 0         # note === marker: SECTION vs SECTION= vs SECTION-
583             //  #2 LINE               0 0 *
584             //  #3 BR                 0 0 0
585             //
586             // or before:
587             //
588             //  #3 ...
589             //  #4 BR                 0 0 0
590             //  #5 BR                 0 0 0
591             //  #6 BR                 0 0 0
592             //  #7 LINE               8 0 *
593             //  #8 BR                 0 0 0
594             //  #9 ... current line...
595             // after:
596             //  #3 ...
597             //  #4 BR                 0 0 0
598             //  #5 BR                 0 0 0
599             //  #6 BR                 0 0 0
600             //  #7 SECTION=           0 0 0         # note === marker: SECTION vs SECTION= vs SECTION-
601             //  #8 LINE               8 0 *
602             //  #9 BR                 0 0 0
603             //
604             // Conlusion: there MUST be BR and LINE, insert section right after, otherwise normal line
605 
606             if(lexems.size()>2 // BEGIN_DOC LINE BR
607                  &&
608                lexems[lexems.size()-1]->getType()==MarkdownLexemType::BR
609                  &&
610                lexems[lexems.size()-2]->getType()==MarkdownLexemType::LINE)
611             {
612                 if(delimiter=='=') {
613                     lexems.insert(lexems.begin()+lexems.size()-2, new MarkdownLexem(MarkdownLexemType::SECTION_equals,0));
614                 } else {
615                     lexems.insert(lexems.begin()+lexems.size()-2, new MarkdownLexem(MarkdownLexemType::SECTION_hyphens,1));
616                 }
617             } else {
618                 addLineToLexems(offset);
619                 return false;
620             }
621             // insert  marker > parser will create section w/ section type marker (lexem lookahead)
622             return true;
623         } else {
624             addLineToLexems(offset);
625             return false;
626         }
627     }
628 }
629 
addLineToLexems(const unsigned int offset)630 void MarkdownLexerSections::addLineToLexems(const unsigned int offset)
631 {
632     lexems.push_back(new MarkdownLexem{MarkdownLexemType::LINE, offset, 0, MarkdownLexem::WHOLE_LINE});
633     lexems.push_back(symbolTable.LEXEM.BR);
634 }
635 
nextToken(const unsigned int offset)636 bool MarkdownLexerSections::nextToken(const unsigned int offset) {
637     if(offset<lines.size()) {
638         if(lines[offset]->size()==0) {
639             lexems.push_back(symbolTable.LEXEM.BR);
640             return true;
641         } else {
642             switch(lines[offset]->at(0)) {
643             case '`':
644                 if(startsWithCodeBlockSymbol(offset)) {
645                     // sections lexer just needs to detect code block to avoid detection of false sections, but no need to tokenize it
646                     toggleInCodeBlock();
647                 }
648                 addLineToLexems(offset);
649                 return true;
650             case '#': // IF #+ space THEN section ELSE line
651                 if(!inCodeBlock) {
652                     unsigned short int idx = 0;
653                     if(!lexSectionSymbol(offset,idx)) {
654                         addLineToLexems(offset);
655                         return true;
656                     } else {
657                         // #+ parsed > process the rest: [:whitespace]+ (TEXT [:whitespace]+)* METADATA?
658                         lexWhitespaces(offset, idx);
659                         char cc;
660                         unsigned short int ws=0, text=0, x = idx+1;
661                         while(lookahead(offset,idx)) {
662                             cc = lines[offset]->at(++idx);
663                             if(isspace(cc)) {
664                                 // a) whitespaces
665                                 if(ws==0 && text) {
666                                     lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,x,idx-x)); // note: ushort-ushort narrowing ({} > ())
667                                     text = 0;
668                                     x = idx;
669                                 }
670                                 ws++;
671                             } else {
672                                 // b1) text as section metadata
673                                 if(cc=='<' && lexHtmlCommentBeginSymbol(offset,idx)) {
674                                     lexWhitespaces(offset,idx);
675                                     if(lexMetadataSymbol(offset,idx)) {
676                                         do {
677                                             lexWhitespaces(offset,idx);
678                                             if(lexMetaPropertyName(offset,idx)) {
679                                                 lexWhitespaces(offset,idx);
680                                                 if(lexMetaPropertyNameValueDelimiter(offset,idx)) {
681                                                     lexWhitespaces(offset,idx);
682                                                     if(lexMetaPropertyValue(offset,idx)) {
683                                                         lexMetaPropertyDelimiter(offset,idx);
684                                                     } else {
685                                                         // a mess inside comment > seek before the end of comment and finish line parsing
686                                                         lexToEndOfHtmlComment(offset,idx);
687                                                         return true;
688                                                     }
689                                                 } else {
690                                                     lexToEndOfHtmlComment(offset,idx);
691                                                     return true;
692                                                 }
693                                             } else {
694                                                 lexToEndOfHtmlComment(offset,idx);
695                                                 return true;
696                                             }
697                                         } while(lookahead(offset,idx));
698                                         lexWhitespaces(offset,idx);
699 
700                                         unsigned short int mess = 0;
701                                         char ccc;
702                                         while(lookahead(offset,idx)) {
703                                             ccc = lines[offset]->at(++idx);
704                                             if(ccc=='-' && lexHtmlCommentEndSymbol(offset,idx)) {
705                                                 if(mess) {
706                                                     // TODO BUG add text BEFORE last lexem
707                                                     lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,idx-mess,mess)); // note: ushort-ushort narrowing ({} > ())
708                                                 }
709                                                 // IMPROVE process the rest of line after --> (ignored for now)
710 
711                                                 // TODO FIX
712                                                 lexems.push_back(symbolTable.LEXEM.BR);
713                                                 // TODO FIX
714 
715                                                 return true;
716                                             } else {
717                                                 mess++;
718                                             }
719                                         }
720                                         if(mess) {
721                                             lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,idx-mess,mess)); // note: ushort-ushort narrowing ({} > ())
722                                         }
723 
724                                         // TODO FIX
725                                         lexems.push_back(symbolTable.LEXEM.BR);
726                                         // TODO FIX
727 
728                                         return true;
729                                     } else {
730                                         // parse HTML comment (as it's not HTML comment w/ metadata)
731                                         // TODO
732                                     }
733                                 } else {
734                                     // b2) text
735                                     if(text==0 && ws) {
736                                         lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,x,idx-x)); // note: ushort-ushort narrowing ({} > ())
737                                         ws = 0;
738                                         x = idx;
739                                     }
740                                     text++;
741                                 }
742                             }
743                         } // while
744                         if(ws) {
745                             lexems.push_back(new MarkdownLexem(MarkdownLexemType::WHITESPACES,offset,x,idx+1-x)); // note: ushort-ushort narrowing ({} > ())
746                         }
747                         if(text) {
748                             lexems.push_back(new MarkdownLexem(MarkdownLexemType::TEXT,offset,x,idx+1-x)); // note: ushort-ushort narrowing ({} > ())
749                         }
750                         lexems.push_back(symbolTable.LEXEM.BR);
751                         return true;
752                     }
753                 } else {
754                     // in code block
755                     addLineToLexems(offset);
756                 }
757                 return true;
758             case '=': // IF [=]+ > 1st level header (not in ``` and prev text)
759                 lexPostDeclaredSectionHeader(offset, '=');
760                 return true;
761             case '-':
762                 lexPostDeclaredSectionHeader(offset, '-');
763                 return true;
764             default:
765                 addLineToLexems(offset);
766                 return true;
767             }
768         }
769     } else {
770         return false;
771     }
772 }
773 
isSameCharsLine(const unsigned offset,const char c) const774 bool MarkdownLexerSections::isSameCharsLine(const unsigned offset, const char c) const
775 {
776     // fail fast
777     if(lines[offset]!=nullptr && lines[offset]->size()
778          &&
779        lines[offset]->at(0)==c && lines[offset]->at(lines[offset]->size()-1)==c)
780     {
781         for(unsigned i=1; i<lines[offset]->size()-1; i++) {
782             if(lines[offset]->at(i)!=c) {
783                 return false;
784             }
785         }
786         return true;
787     }
788     return false;
789 }
790 
lookahead(const unsigned offset,const unsigned short idx) const791 bool MarkdownLexerSections::lookahead(const unsigned offset, const unsigned short idx) const
792 {
793     if(lines[offset]->size() > (size_t)(idx+1)) {
794         return true;
795     } else {
796         return false;
797     }
798 }
799 
lexMetaPropertyNameValueDelimiter(const unsigned offset,unsigned short int & idx)800 bool MarkdownLexerSections::lexMetaPropertyNameValueDelimiter(const unsigned offset, unsigned short int& idx)
801 {
802     if(lines[offset]->size()>(size_t)(idx+1) && lines[offset]->at(idx+1)==':') {
803         idx++;
804         lexems.push_back(symbolTable.LEXEM.META_NAMEVALUE_DELIMITER);
805         return true;
806     } else {
807         return false;
808     }
809 }
810 
lexMetaPropertyValue(const unsigned offset,unsigned short int & idx)811 bool MarkdownLexerSections::lexMetaPropertyValue(const unsigned offset, unsigned short int& idx)
812 {
813     if(lines[offset]!=nullptr && lines[offset]->size()>(size_t)(idx+1)) {
814         unsigned short int i;
815         for(i=idx+1;
816             i<lines[offset]->size() && lines[offset]->at(i)!=';';
817             i++)
818         {}
819         if(i>idx+1) {
820             lexems.push_back(new MarkdownLexem(MarkdownLexemType::META_PROPERTY_VALUE,offset,idx+1,i-idx-1));
821             idx=i-1;
822             return true;
823         }
824     }
825     return false;
826 }
827 
lexMetaPropertyDelimiter(const unsigned offset,unsigned short int & idx)828 bool MarkdownLexerSections::lexMetaPropertyDelimiter(const unsigned offset, unsigned short int& idx)
829 {
830     if(lines[offset]->size()>(size_t)(idx+1) && lines[offset]->at(idx+1)==';') {
831         lexems.push_back(symbolTable.LEXEM.META_PROPERTY_DELIMITER);
832         idx++;
833         return true;
834     } else {
835         return false;
836     }
837 }
838 
getText(const MarkdownLexem * lexem)839 string* MarkdownLexerSections::getText(const MarkdownLexem* lexem)
840 {
841     if(lexem!=nullptr && lines.size()) {
842         if(lexem->getOff()<lines.size()) {
843             if(lexem->getLng()==MarkdownLexem::WHOLE_LINE) {
844                 string *result = lines[lexem->getOff()];
845                 lines[lexem->getOff()] = nullptr;
846                 return result;
847             } else {
848                 if(lexem->getLng()==0) {
849                     return new string{};
850                 } else {
851                     return new string{lines[lexem->getOff()]->substr(lexem->getIdx(),lexem->getLng())};
852                 }
853             }
854         }
855     }
856     return nullptr;
857 }
858 
859 } // m8r namespace
860