1 /* 2 markdown_lexer_sections.h MindForger thinking notebook 3 4 Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com> 5 6 This program is free software; you can redistribute it and/or 7 modify it under the terms of the GNU General Public License 8 as published by the Free Software Foundation; either version 2 9 of the License, or (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 #ifndef M8R_MARKDOWN_LEXER_SECTIONS_H_ 20 #define M8R_MARKDOWN_LEXER_SECTIONS_H_ 21 22 #include <set> 23 #include <string> 24 #include <vector> 25 #include <unordered_set> 26 27 #include "../../gear/lang_utils.h" 28 #include "../../gear/file_utils.h" 29 #include "markdown_lexem.h" 30 31 namespace m8r { 32 33 /** 34 * @brief Managed table of reusable lexems. 35 */ 36 class MarkdownLexemTable { 37 private: 38 // IMPROVE unordered set 39 std::set<MarkdownLexem*> lexems; 40 41 public: 42 MarkdownLexem* BEGIN_DOC; 43 MarkdownLexem* META_BEGIN; 44 MarkdownLexem* META_PROPERTY_DELIMITER; 45 MarkdownLexem* META_PROPERTY_type; 46 MarkdownLexem* META_PROPERTY_created; 47 MarkdownLexem* META_PROPERTY_reads; 48 MarkdownLexem* META_PROPERTY_read; 49 MarkdownLexem* META_PROPERTY_revision; 50 MarkdownLexem* META_PROPERTY_modified; 51 MarkdownLexem* META_PROPERTY_importance; 52 MarkdownLexem* META_PROPERTY_urgency; 53 MarkdownLexem* META_PROPERTY_progress; 54 MarkdownLexem* META_PROPERTY_tags; 55 MarkdownLexem* META_PROPERTY_links; 56 MarkdownLexem* META_PROPERTY_deadline; 57 MarkdownLexem* META_PROPERTY_scope; 58 MarkdownLexem* META_NAMEVALUE_DELIMITER; 59 MarkdownLexem* HTML_COMMENT_BEGIN; 60 MarkdownLexem* HTML_COMMENT_END; 61 MarkdownLexem* BR; 62 MarkdownLexem* END_DOC; 63 64 MarkdownLexemTable(); 65 MarkdownLexemTable(const MarkdownLexemTable&) = delete; 66 MarkdownLexemTable(const MarkdownLexemTable&&) = delete; 67 MarkdownLexemTable& operator=(const MarkdownLexemTable&) = delete; 68 MarkdownLexemTable& operator=(const MarkdownLexemTable&&) = delete; 69 ~MarkdownLexemTable(); 70 contains(MarkdownLexem * lexem)71 bool contains(MarkdownLexem *lexem) const { return lexems.find(lexem)!=lexems.end(); } 72 }; 73 74 class MarkdownSymbolTable 75 { 76 private: 77 /** 78 * Symbols that are reused through the lexing and parsing 79 * process (function names, variable names, ...) are kept 80 * from loading file to final entity (Outline, Note) creation. 81 */ 82 // IMPROVE use unordered_set once I found out how to compile it 83 std::set<std::string*> symbols; 84 85 public: 86 // reusable lexems 87 static const MarkdownLexemTable& LEXEM; 88 89 public: 90 explicit MarkdownSymbolTable(); 91 MarkdownSymbolTable(const MarkdownSymbolTable&) = delete; 92 MarkdownSymbolTable(const MarkdownSymbolTable&&) = delete; 93 MarkdownSymbolTable& operator=(const MarkdownSymbolTable&) = delete; 94 MarkdownSymbolTable& operator=(const MarkdownSymbolTable&&) = delete; 95 ~MarkdownSymbolTable(); 96 97 void addSymbol(std::string *symbol); 98 bool lookup(std::string* symbol) const; clearSymbols()99 void clearSymbols() { symbols.clear(); } 100 }; 101 102 /** 103 * @brief Markdown lexical analyzer for section-level granularity parser. 104 */ 105 class MarkdownLexerSections 106 { 107 private: 108 const std::string* filePath; 109 unsigned lastBrTokensOffset; 110 bool inCodeBlock; 111 112 size_t fileSize; 113 std::vector<std::string*> lines; 114 // IMPROVE prepare a LexemPool: vector + MarkdownLexem[1000] and allocate from there (performance) 115 std::vector<MarkdownLexem*> lexems; 116 MarkdownSymbolTable symbolTable; 117 118 public: 119 explicit MarkdownLexerSections(const std::string* filePath=nullptr); 120 MarkdownLexerSections(const MarkdownLexerSections &) = delete; 121 MarkdownLexerSections(const MarkdownLexerSections &&); 122 MarkdownLexerSections &operator=(const MarkdownLexerSections &) = delete; 123 MarkdownLexerSections &operator=(const MarkdownLexerSections &&) = delete; 124 virtual ~MarkdownLexerSections(); 125 126 void tokenize(); 127 void tokenize(const std::string* text); 128 129 /** 130 * Returns text, caller is expected to destroy it. 131 */ 132 std::string* getText(const MarkdownLexem*); 133 setFilePath(const std::string * & filePath)134 void setFilePath(const std::string*& filePath) { this->filePath = filePath; } getFileSize()135 size_t getFileSize() const { return fileSize; } getLexems()136 const std::vector<MarkdownLexem*>& getLexems() const { return lexems; } getLines()137 const std::vector<std::string*>& getLines() const { return lines; } getSymbolTable()138 const MarkdownSymbolTable& getSymbolTable() const { return symbolTable; } 139 MarkdownLexem* operator[](size_t i) { return lexems[i]; } 140 const MarkdownLexem* operator[](size_t i) const { return lexems[i]; } empty()141 bool empty() const { return lexems.empty(); } size()142 size_t size() const { return lexems.size(); } 143 144 private: 145 bool nextToken(const unsigned int offset); 146 147 inline bool lookahead(const unsigned offset, const unsigned short idx) const; toggleInCodeBlock()148 void toggleInCodeBlock() { inCodeBlock=!inCodeBlock; } 149 150 inline bool isSameCharsLine(const unsigned offset, const char c) const; 151 inline bool startsWithCodeBlockSymbol(const unsigned offset) const; 152 inline bool startsWithHtmlCommentEndSymbol(const unsigned offset, const unsigned short int idx) const; 153 154 inline bool lexWhitespaces(const unsigned offset, unsigned short int& idx); 155 inline bool lexSectionSymbol(const unsigned offset, unsigned short int& idx); 156 inline bool lexHtmlCommentBeginSymbol(const unsigned offset, unsigned short int& idx); 157 inline bool lexHtmlCommentEndSymbol(const unsigned offset, unsigned short int& idx); 158 inline bool lexMetadataSymbol(const unsigned offset, unsigned short int& idx); 159 inline bool lexMetaPropertyName(const unsigned offset, unsigned short int& idx); 160 inline bool lexMetaPropertyNameValueDelimiter(const unsigned offset, unsigned short int& idx); 161 inline bool lexMetaPropertyValue(const unsigned offset, unsigned short int& idx); 162 inline bool lexMetaPropertyDelimiter(const unsigned offset, unsigned short int& idx); 163 inline bool lexToEndOfHtmlComment(const unsigned offset, unsigned short int& idx); 164 inline bool lexPostDeclaredSectionHeader(const unsigned offset, const char delimiter); 165 166 inline void addLineToLexems(const unsigned offset); 167 168 /** 169 * @brief Insert back section lexem if "standalone line section declaration" found. 170 * 171 * Tokenize previous line as section header and prepend SECTION lexem with given depth. 172 */ 173 void fixBackDeclaredSection(const unsigned offset, const unsigned short sectionDepth); 174 }; 175 176 } // m8r namespace 177 178 #endif /* M8R_MARKDOWN_LEXER_SECTIONS_H_ */ 179