1 #include <ot/utility/tokenizer.hpp>
2
3 // TODO
4 // 1. Consider removing the utf-8 bom (https://github.com/zer4tul/utf8-bom-strip)
5
6 namespace ot {
7
// Function: to_lower
// Returns a copy of `s` in which every character has been mapped through
// std::tolower (which follows the current C locale).
std::string to_lower(std::string s) {
  // std::tolower is only defined for values representable as unsigned char
  // (or EOF); taking the character as unsigned char keeps bytes >= 0x80
  // (e.g. UTF-8 payload) from being passed as negative ints.
  std::transform(s.begin(), s.end(), s.begin(), [] (unsigned char c) {
    return static_cast<char>(std::tolower(c));
  });
  return s;
}
15
// Function: to_upper
// Returns a copy of `s` in which every character has been mapped through
// std::toupper (which follows the current C locale).
std::string to_upper(std::string s) {
  // std::toupper is only defined for values representable as unsigned char
  // (or EOF); taking the character as unsigned char keeps bytes >= 0x80
  // (e.g. UTF-8 payload) from being passed as negative ints.
  std::transform(s.begin(), s.end(), s.begin(), [] (unsigned char c) {
    return static_cast<char>(std::toupper(c));
  });
  return s;
}
23
// Function: remove_quote
// Returns a copy of `s` with every double-quote character (") dropped,
// wherever it occurs in the string. All other characters keep their order.
std::string remove_quote(std::string s) {
  std::string stripped;
  stripped.reserve(s.size());
  for(const char ch : s) {
    if(ch != '\"') {
      stripped.push_back(ch);
    }
  }
  return stripped;
}
29
// Function: unquoted
// Strips one pair of enclosing double quotes. The string is returned
// unchanged unless it is at least two characters long and both its first
// and last characters are '"'; inner quotes are never touched.
std::string unquoted(std::string s) {
  const bool enclosed = s.size() >= 2 && s.front() == '\"' && s.back() == '\"';
  if(!enclosed) {
    return s;
  }
  s.pop_back();    // drop the closing quote
  s.erase(0, 1);   // drop the opening quote
  return s;
}
37
// Function: is_numeric
// Returns true when the whole token is a plain decimal number: an optional
// leading '+' or '-', then digits with at most one decimal point, ending in
// at least one digit (e.g. "42", "-1.5", "+.5"). Tokens such as "1.", "1e5"
// or the empty string are rejected.
bool is_numeric(const std::string& token) {
  // Compiling a std::regex is expensive; build it once and reuse it instead
  // of reconstructing it on every call.
  static const std::regex pattern("(\\+|-)?[0-9]*(\\.?([0-9]+))$");
  return std::regex_match(token, pattern);
}
42
// Function: is_array
// Returns true when the whole token is an identifier (a letter or '_'
// followed by letters, digits or '_') immediately followed by one or more
// bracketed non-empty decimal indices, e.g. "bus[3]" or "mem[1][20]".
bool is_array(const std::string& token) {
  // Compiling a std::regex is expensive; build it once and reuse it instead
  // of reconstructing it on every call.
  static const std::regex pattern("[a-zA-Z_][a-zA-Z_0-9]*(\\[[0-9]+\\])+");
  return std::regex_match(token, pattern);
}
47
// Function: is_word
// Returns true when the whole token is an identifier: a letter or '_'
// followed by any number of letters, digits or '_'.
bool is_word(const std::string& token) {
  // Compiling a std::regex is expensive; build it once and reuse it instead
  // of reconstructing it on every call.
  static const std::regex pattern("[a-zA-Z_][a-zA-Z_0-9]*");
  return std::regex_match(token, pattern);
}
52
53 // ------------------------------------------------------------------------------------------------
54
// Function: split
// Splits `str` into tokens. A token is a maximal run of characters that are
// neither whitespace (per std::isspace) nor contained in `dels`. Separators
// are discarded and empty tokens are never produced.
//   str  - the text to split
//   dels - extra separator characters, in addition to whitespace
// Returns the tokens in the order they appear in `str`.
std::vector<std::string> split(const std::string& str, std::string_view dels) {

  std::string token;
  std::vector<std::string> tokens;

  for(const char c : str) {
    const bool is_del = (dels.find(c) != std::string_view::npos);
    // std::isspace requires a value representable as unsigned char.
    if(is_del || std::isspace(static_cast<unsigned char>(c))) {
      if(!token.empty()) {  // Flush the token collected so far.
        tokens.push_back(std::move(token));
        // A moved-from string is only guaranteed to be in a valid but
        // unspecified state; reset it explicitly before reusing it.
        token.clear();
      }
    } else {
      token.push_back(c);   // Extend the current token.
    }
  }

  // Flush the trailing token, if any.
  if(!token.empty()) {
    tokens.push_back(std::move(token));
  }

  return tokens;
}
79
80 //-------------------------------------------------------------------------------------------------
81
// Function: tokenize
// Reads the file at `path`, blanks out its comments, and splits what remains
// into tokens.
//   path - the file to read
//   dels - delimiter characters; like whitespace, they terminate the current
//          token
//   exps - the subset of delimiters that are additionally emitted as
//          single-character tokens of their own
// Three comment forms are replaced by spaces before tokenizing: block
// comments (/* ... */), line comments (// ...) and pound comments (# ...);
// the latter two run up to, but not including, the next '\n' or '\r'.
// Comment markers are recognized anywhere in the file, including inside
// quoted text.
// Returns the tokens in file order. An empty vector is returned when the
// file cannot be opened or its size cannot be determined; no exception is
// thrown for those cases.
std::vector<std::string> tokenize(
  const std::filesystem::path& path,
  std::string_view dels,
  std::string_view exps
) {

  // Open positioned at the end so that tellg() reports the file size.
  std::ifstream ifs(path, std::ios::ate);

  if(!ifs.good()) {
    return {};
  }

  // tellg() reports -1 on failure; converting that to size_t would request
  // an enormous buffer.
  const std::streamoff end_off = ifs.tellg();
  if(end_off < 0) {
    return {};
  }

  // Read the file to a local buffer. One extra zeroed slot acts as a
  // sentinel so the comment scanner may always look one character ahead.
  size_t fsize = static_cast<size_t>(end_off);
  ifs.seekg(0, std::ios::beg);
  std::vector<char> buffer(fsize + 1);
  ifs.read(buffer.data(), static_cast<std::streamsize>(fsize));
  // Trust the number of characters actually delivered: a short read would
  // otherwise leave zero bytes at the tail that end up inside tokens.
  fsize = static_cast<size_t>(ifs.gcount());
  buffer[fsize] = 0;

  // Mask out the comments.
  for(size_t i=0; i<fsize; ++i) {

    // Block comment: blank everything through the closing "*/" (or to the
    // end of the buffer when the comment is never closed).
    if(buffer[i] == '/' && buffer[i+1] == '*') {
      buffer[i] = buffer[i+1] = ' ';
      for(i=i+2; i<fsize; buffer[i++]=' ') {
        if(buffer[i] == '*' && buffer[i+1] == '/') {
          buffer[i] = buffer[i+1] = ' ';
          i = i+1;
          break;
        }
      }
    }

    // Line comment: blank up to the end of the line.
    if(buffer[i] == '/' && buffer[i+1] == '/') {
      buffer[i] = buffer[i+1] = ' ';
      for(i=i+2; i<fsize; ++i) {
        if(buffer[i] == '\n' || buffer[i] == '\r') {
          break;
        }
        else buffer[i] = ' ';
      }
    }

    // Pound comment: blank up to the end of the line.
    if(buffer[i] == '#') {
      buffer[i] = ' ';
      for(i=i+1; i<fsize; ++i) {
        if(buffer[i] == '\n' || buffer[i] == '\r') {
          break;
        }
        else buffer[i] = ' ';
      }
    }
  }

  // Parse the tokens.
  std::string token;
  std::vector<std::string> tokens;

  for(size_t i=0; i<fsize; ++i) {

    const char c = buffer[i];
    const bool is_del = (dels.find(c) != std::string_view::npos);

    // std::isspace requires a value representable as unsigned char.
    if(is_del || std::isspace(static_cast<unsigned char>(c))) {
      if(!token.empty()) {  // Flush the token collected so far.
        tokens.push_back(std::move(token));
        token.clear();
      }
      // Exposed delimiters become tokens themselves. Build the string
      // directly rather than routing it through `token`, which would then
      // be reused in a moved-from (unspecified) state.
      if(is_del && exps.find(c) != std::string_view::npos) {
        tokens.push_back(std::string(1, c));
      }
    } else {
      token.push_back(c);   // Extend the current token.
    }
  }

  // Flush the trailing token, if any.
  if(!token.empty()) {
    tokens.push_back(std::move(token));
  }

  return tokens;
}
174
175 }; // end of namespace ot. -----------------------------------------------------------------------
176
177
178