1 /*
2  markdown_lexer_sections.h     MindForger thinking notebook
3 
4  Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  as published by the Free Software Foundation; either version 2
9  of the License, or (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program. If not, see <http://www.gnu.org/licenses/>.
18  */
19 #ifndef M8R_MARKDOWN_LEXER_SECTIONS_H_
20 #define M8R_MARKDOWN_LEXER_SECTIONS_H_
21 
22 #include <set>
23 #include <string>
24 #include <vector>
25 #include <unordered_set>
26 
27 #include "../../gear/lang_utils.h"
28 #include "../../gear/file_utils.h"
29 #include "markdown_lexem.h"
30 
31 namespace m8r {
32 
33 /**
34  * @brief Managed table of reusable lexems.
35  */
36 class MarkdownLexemTable {
37 private:
38     // IMPROVE unordered set
39     std::set<MarkdownLexem*> lexems;
40 
41 public:
42     MarkdownLexem* BEGIN_DOC;
43     MarkdownLexem* META_BEGIN;
44     MarkdownLexem* META_PROPERTY_DELIMITER;
45     MarkdownLexem* META_PROPERTY_type;
46     MarkdownLexem* META_PROPERTY_created;
47     MarkdownLexem* META_PROPERTY_reads;
48     MarkdownLexem* META_PROPERTY_read;
49     MarkdownLexem* META_PROPERTY_revision;
50     MarkdownLexem* META_PROPERTY_modified;
51     MarkdownLexem* META_PROPERTY_importance;
52     MarkdownLexem* META_PROPERTY_urgency;
53     MarkdownLexem* META_PROPERTY_progress;
54     MarkdownLexem* META_PROPERTY_tags;
55     MarkdownLexem* META_PROPERTY_links;
56     MarkdownLexem* META_PROPERTY_deadline;
57     MarkdownLexem* META_PROPERTY_scope;
58     MarkdownLexem* META_NAMEVALUE_DELIMITER;
59     MarkdownLexem* HTML_COMMENT_BEGIN;
60     MarkdownLexem* HTML_COMMENT_END;
61     MarkdownLexem* BR;
62     MarkdownLexem* END_DOC;
63 
64     MarkdownLexemTable();
65     MarkdownLexemTable(const MarkdownLexemTable&) = delete;
66     MarkdownLexemTable(const MarkdownLexemTable&&) = delete;
67     MarkdownLexemTable& operator=(const MarkdownLexemTable&) = delete;
68     MarkdownLexemTable& operator=(const MarkdownLexemTable&&) = delete;
69     ~MarkdownLexemTable();
70 
contains(MarkdownLexem * lexem)71     bool contains(MarkdownLexem *lexem) const { return lexems.find(lexem)!=lexems.end(); }
72 };
73 
74 class MarkdownSymbolTable
75 {
76 private:
77     /**
78      * Symbols that are reused through the lexing and parsing
79      * process (function names, variable names, ...) are kept
80      * from loading file to final entity (Outline, Note) creation.
81      */
82     // IMPROVE use unordered_set once I found out how to compile it
83     std::set<std::string*> symbols;
84 
85 public:
86     // reusable lexems
87     static const MarkdownLexemTable& LEXEM;
88 
89 public:
90     explicit MarkdownSymbolTable();
91     MarkdownSymbolTable(const MarkdownSymbolTable&) = delete;
92     MarkdownSymbolTable(const MarkdownSymbolTable&&) = delete;
93     MarkdownSymbolTable& operator=(const MarkdownSymbolTable&) = delete;
94     MarkdownSymbolTable& operator=(const MarkdownSymbolTable&&) = delete;
95     ~MarkdownSymbolTable();
96 
97     void addSymbol(std::string *symbol);
98     bool lookup(std::string* symbol) const;
clearSymbols()99     void clearSymbols() { symbols.clear(); }
100 };
101 
102 /**
103  * @brief Markdown lexical analyzer for section-level granularity parser.
104  */
105 class MarkdownLexerSections
106 {
107 private:
108     const std::string* filePath;
109     unsigned lastBrTokensOffset;
110     bool inCodeBlock;
111 
112     size_t fileSize;
113     std::vector<std::string*> lines;
114     // IMPROVE prepare a LexemPool: vector + MarkdownLexem[1000] and allocate from there (performance)
115     std::vector<MarkdownLexem*> lexems;
116     MarkdownSymbolTable symbolTable;
117 
118 public:
119     explicit MarkdownLexerSections(const std::string* filePath=nullptr);
120     MarkdownLexerSections(const MarkdownLexerSections &) = delete;
121     MarkdownLexerSections(const MarkdownLexerSections &&);
122     MarkdownLexerSections &operator=(const MarkdownLexerSections &) = delete;
123     MarkdownLexerSections &operator=(const MarkdownLexerSections &&) = delete;
124     virtual ~MarkdownLexerSections();
125 
126     void tokenize();
127     void tokenize(const std::string* text);
128 
129     /**
130      * Returns text, caller is expected to destroy it.
131      */
132     std::string* getText(const MarkdownLexem*);
133 
setFilePath(const std::string * & filePath)134     void setFilePath(const std::string*& filePath) { this->filePath = filePath; }
getFileSize()135     size_t getFileSize() const { return fileSize; }
getLexems()136     const std::vector<MarkdownLexem*>& getLexems() const { return lexems; }
getLines()137     const std::vector<std::string*>& getLines() const { return lines; }
getSymbolTable()138     const MarkdownSymbolTable& getSymbolTable() const { return symbolTable; }
139     MarkdownLexem* operator[](size_t i) { return lexems[i]; }
140     const MarkdownLexem* operator[](size_t i) const { return lexems[i]; }
empty()141     bool empty() const { return lexems.empty(); }
size()142     size_t size() const { return lexems.size(); }
143 
144 private:
145     bool nextToken(const unsigned int offset);
146 
147     inline bool lookahead(const unsigned offset, const unsigned short idx) const;
toggleInCodeBlock()148     void toggleInCodeBlock() { inCodeBlock=!inCodeBlock; }
149 
150     inline bool isSameCharsLine(const unsigned offset, const char c) const;
151     inline bool startsWithCodeBlockSymbol(const unsigned offset) const;
152     inline bool startsWithHtmlCommentEndSymbol(const unsigned offset, const unsigned short int idx) const;
153 
154     inline bool lexWhitespaces(const unsigned offset, unsigned short int& idx);
155     inline bool lexSectionSymbol(const unsigned offset, unsigned short int& idx);
156     inline bool lexHtmlCommentBeginSymbol(const unsigned offset, unsigned short int& idx);
157     inline bool lexHtmlCommentEndSymbol(const unsigned offset, unsigned short int& idx);
158     inline bool lexMetadataSymbol(const unsigned offset, unsigned short int& idx);
159     inline bool lexMetaPropertyName(const unsigned offset, unsigned short int& idx);
160     inline bool lexMetaPropertyNameValueDelimiter(const unsigned offset, unsigned short int& idx);
161     inline bool lexMetaPropertyValue(const unsigned offset, unsigned short int& idx);
162     inline bool lexMetaPropertyDelimiter(const unsigned offset, unsigned short int& idx);
163     inline bool lexToEndOfHtmlComment(const unsigned offset, unsigned short int& idx);
164     inline bool lexPostDeclaredSectionHeader(const unsigned offset, const char delimiter);
165 
166     inline void addLineToLexems(const unsigned offset);
167 
168     /**
169      * @brief Insert back section lexem if "standalone line section declaration" found.
170      *
171      * Tokenize previous line as section header and prepend SECTION lexem with given depth.
172      */
173     void fixBackDeclaredSection(const unsigned offset, const unsigned short sectionDepth);
174 };
175 
176 } // m8r namespace
177 
178 #endif /* M8R_MARKDOWN_LEXER_SECTIONS_H_ */
179