/*
 * PROJECT: ReactOS TXT to NLS Converter
 * LICENSE: GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later)
 * FILE: sdk/tools/txt2nls/main.c
 * COPYRIGHT: Copyright 2021 Jérôme Gardou <jerome.gardou@reactos.org>
 */
7
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
15
/*
 * Index of the last line read by get_clean_line(), used by error() when
 * reporting a problem. It starts at -1 and is incremented once per line read,
 * so the first line of the input is reported as line 0.
 */
static long line_number = -1;

/* Characters treated as token separators and trimmed from the end of a line */
static const char whitespaces[] = " \t\f\v\n\r";
18
/* Size of NLS_FILE_HEADER::LeadByte: room for MAXIMUM_LEADBYTES / 2 (start, end) pairs */
#define MAXIMUM_LEADBYTES 12

#pragma pack(push, 1)
/*
 * Header written at the very beginning of the generated NLS file.
 * The structure is byte-packed and must be exactly 26 bytes long.
 */
struct NLS_FILE_HEADER
{
    /* Size of this header, counted in 16-bit units */
    uint16_t HeaderSize;
    /* Number given by the CODEPAGE statement */
    uint16_t CodePage;
    /* First CPINFO value; main() only accepts 1 or 2 */
    uint16_t MaximumCharacterSize;
    /* Second CPINFO value: default multibyte character */
    uint16_t DefaultChar;
    /* Third CPINFO value: default wide character */
    uint16_t UniDefaultChar;
    /* DefaultChar looked up in the multibyte (or DBCS) table */
    uint16_t TransDefaultChar;
    /* UniDefaultChar looked up in the wide character table */
    uint16_t TransUniDefaultChar;
    /* (first, last) lead byte pairs taken from the DBCSRANGE statement; unused entries stay 0 */
    uint8_t LeadByte[MAXIMUM_LEADBYTES];
};
#pragma pack(pop)

static_assert(sizeof(NLS_FILE_HEADER) == 26, "Wrong size for NLS_FILE_HEADER");
34
get_clean_line(std::istream & stream,std::string & str)35 static std::istream& get_clean_line(std::istream& stream, std::string& str)
36 {
37 do
38 {
39 std::istream& ret = std::getline(stream, str);
40 if (!ret)
41 return ret;
42
43 /* Ignore comments */
44 std::size_t comment_pos = str.find_first_of(';');
45 if (comment_pos != std::string::npos)
46 {
47 str.erase(comment_pos);
48 }
49
50 /* Remove trailing spaces */
51 std::size_t end_of_line = str.find_last_not_of(whitespaces);
52 if (end_of_line != std::string::npos)
53 str.erase(end_of_line + 1);
54 else
55 str.clear();
56
57 line_number++;
58 } while (str.empty());
59
60 return stream;
61 }
62
tokenize(std::string & str,std::string & token)63 static void tokenize(std::string& str, std::string& token)
64 {
65 std::size_t token_start = str.find_first_not_of(whitespaces);
66 if (token_start == std::string::npos)
67 {
68 token = "";
69 str.clear();
70 return;
71 }
72
73 std::size_t token_end = str.find_first_of(whitespaces, token_start);
74 if (token_end == std::string::npos)
75 {
76 token = str.substr(token_start);
77 str.clear();
78 return;
79 }
80
81 token = str.substr(token_start, token_end);
82 str.erase(0, str.find_first_not_of(whitespaces, token_end));
83 }
84
/*
 * Extracts the first token of str and converts it to the integer type T.
 *
 * base is forwarded to the string-to-integer conversion; 0 lets the prefix
 * of the token select the base.
 *
 * Throws std::invalid_argument when the token is not a number or when its
 * value does not fit in T. The conversion goes through long long, so T is
 * expected to be an integer type whose range fits in long long.
 */
template<typename T>
static void tokenize(std::string& str, T& int_token, int base = 0)
{
    std::string token;
    tokenize(str, token);

    /* Built lazily: the message is only needed on the failure paths */
    const auto range_error = [&token]()
    {
        return std::invalid_argument(token + " does not fit range ["
            + std::to_string(std::numeric_limits<T>::min()) + ":"
            + std::to_string(std::numeric_limits<T>::max()) + "]");
    };

    long long val = 0;
    try
    {
        val = std::stoll(token, nullptr, base);
    }
    catch (const std::out_of_range&)
    {
        /* Callers only handle std::invalid_argument */
        throw range_error();
    }
    catch (const std::invalid_argument&)
    {
        throw std::invalid_argument("\"" + token + "\" is not a valid number");
    }

    if ((val > static_cast<long long>(std::numeric_limits<T>::max()))
        || (val < static_cast<long long>(std::numeric_limits<T>::min())))
    {
        throw range_error();
    }

    int_token = static_cast<T>(val);
}
99
error(const std::string & err)100 void error(const std::string& err)
101 {
102 std::cerr << "Error parsing line " << line_number <<": " << err << std::endl;
103 std::exit(1);
104 }
105
main(int argc,char * argv[])106 int main(int argc, char* argv[])
107 {
108 if (argc != 3)
109 {
110 std::cerr << "Usage: " << argv[0] << " <txt_in> <nls_out>" << std::endl;
111 return 1;
112 }
113
114 std::ifstream input(argv[1]);
115 if (!input.is_open())
116 {
117 std::cerr << "Unable to open " << argv[1] << std::endl;
118 return 1;
119 }
120
121 NLS_FILE_HEADER FileHeader;
122 memset(&FileHeader, 0, sizeof(FileHeader));
123
124 std::string curr_line;
125 // Get code page
126 if (!get_clean_line(input, curr_line))
127 {
128 std::cerr << "ERROR: File is empty" << std::endl;
129 return 1;
130 }
131
132 std::string token;
133 tokenize(curr_line, token);
134 if (token != "CODEPAGE")
135 error("expected CODEPAGE, got \"" + token + "\" instead");
136 try
137 {
138 tokenize(curr_line, FileHeader.CodePage, 10);
139 }
140 catch(const std::invalid_argument& ia)
141 {
142 error(ia.what());
143 }
144
145 if (!curr_line.empty())
146 error("Garbage after CODEPAGE statement: \"" + curr_line + "\"");
147
148 /* Get CPINFO */
149 if (!get_clean_line(input, curr_line))
150 error("Nothing after CODEPAGE statement");
151
152 tokenize(curr_line, token);
153 if (token != "CPINFO")
154 error("Expected CPINFO, got \"" + token + "\" instead");
155 try
156 {
157 tokenize(curr_line, FileHeader.MaximumCharacterSize);
158 tokenize(curr_line, FileHeader.DefaultChar);
159 tokenize(curr_line, FileHeader.UniDefaultChar);
160 }
161 catch(const std::invalid_argument& ia)
162 {
163 error(ia.what());
164 return 1;
165 }
166 if (!curr_line.empty())
167 error("Garbage after CPINFO statement: \"" + curr_line + "\"");
168 if ((FileHeader.MaximumCharacterSize != 1) && (FileHeader.MaximumCharacterSize != 2))
169 error("Expected 1 or 2 as max char size in CPINFO, got \"" + std::to_string(FileHeader.MaximumCharacterSize) + "\" instead");
170 if ((FileHeader.MaximumCharacterSize == 1) && (FileHeader.DefaultChar > std::numeric_limits<uint8_t>::max()))
171 error("Default MB character " + std::to_string(FileHeader.DefaultChar) + " doesn't fit in a 8-bit value");
172
173 /* Setup tables & default values */
174 bool has_mbtable = false;
175 uint16_t mb_table[256] = {0};
176
177 bool has_wctable = false;
178 uint8_t* wc_table = new uint8_t[65536 * FileHeader.MaximumCharacterSize];
179 if (FileHeader.MaximumCharacterSize == 1)
180 {
181 for (int i = 0; i < 65536; i++)
182 wc_table[i] = FileHeader.DefaultChar;
183 }
184 else
185 {
186 uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
187 for (int i = 0; i < 65536; i++)
188 wc_table_dbcs[i] = FileHeader.DefaultChar;
189 }
190
191 std::vector<uint16_t> dbcs_table;
192 uint16_t lb_offsets[256] = {0};
193 uint16_t dbcs_range_count = 0;
194
195 uint16_t glyph_table[256] = {0};
196 bool has_glyphs = false;
197
198 /* Now parse */
199 while (get_clean_line(input, curr_line))
200 {
201 tokenize(curr_line, token);
202
203 if (token == "ENDCODEPAGE")
204 {
205 if (!curr_line.empty())
206 error("Garbage after ENDCODEPAGE statement: \"" + curr_line + "\"");
207 break;
208 }
209 else if (token == "MBTABLE")
210 {
211 uint16_t table_size;
212 try
213 {
214 tokenize(curr_line, table_size);
215 }
216 catch(const std::invalid_argument& ia)
217 {
218 error(ia.what());
219 }
220 if (has_mbtable)
221 error("MBTABLE can only be declared once");
222 if (table_size > 256)
223 error("MBTABLE size can't be larger than 256");
224 if (!curr_line.empty())
225 error("Garbage after MBTABLE statement: \"" + curr_line + "\"");
226
227 has_mbtable = true;
228 while (table_size--)
229 {
230 if (!get_clean_line(input, curr_line))
231 error("Expected " + std::to_string(table_size + 1) + " more lines after MBTABLE token");
232
233 uint8_t mb;
234 uint16_t wc;
235
236 try
237 {
238 tokenize(curr_line, mb);
239 tokenize(curr_line, wc);
240 }
241 catch(const std::invalid_argument& ia)
242 {
243 error(ia.what());
244 }
245 if (!curr_line.empty())
246 error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
247 mb_table[mb] = wc;
248 }
249 }
250 else if (token == "WCTABLE")
251 {
252 uint32_t table_size;
253 try
254 {
255 tokenize(curr_line, table_size);
256 }
257 catch(const std::invalid_argument& ia)
258 {
259 error(ia.what());
260 }
261 if (has_wctable)
262 error("WCTABLE can only be declared once");
263 if (!curr_line.empty())
264 error("Garbage after WCTABLE statement: \"" + curr_line + "\"");
265 if (table_size > 65536)
266 error("WCTABLE size can't be larger than 65536");
267
268 has_wctable = true;
269
270 if (FileHeader.MaximumCharacterSize == 1)
271 {
272 while (table_size--)
273 {
274 if (!get_clean_line(input, curr_line))
275 error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
276
277 uint8_t mb;
278 uint16_t wc;
279
280 try
281 {
282 tokenize(curr_line, wc);
283 tokenize(curr_line, mb);
284 }
285 catch(const std::invalid_argument& ia)
286 {
287 error(ia.what());
288 }
289 if (!curr_line.empty())
290 error("Garbage after WCTABLE entry: \"" + curr_line + "\"");
291 wc_table[wc] = mb;
292 }
293 }
294 else
295 {
296 uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
297 while (table_size--)
298 {
299 if (!get_clean_line(input, curr_line))
300 error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
301 uint16_t mb;
302 uint16_t wc;
303
304 try
305 {
306 tokenize(curr_line, wc);
307 tokenize(curr_line, mb);
308 }
309 catch(const std::invalid_argument& ia)
310 {
311 error(ia.what());
312 }
313 if (!curr_line.empty())
314 error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
315 wc_table_dbcs[wc] = mb;
316 }
317 }
318 }
319 else if (token == "DBCSRANGE")
320 {
321 if (dbcs_range_count != 0)
322 error("DBCSRANGE can only be declared once");
323
324 try
325 {
326 tokenize(curr_line, dbcs_range_count);
327 }
328 catch(const std::invalid_argument& ia)
329 {
330 error(ia.what());
331 }
332 if (dbcs_range_count > (MAXIMUM_LEADBYTES / 2))
333 error("DBCSRANGE count can't exceed " + std::to_string(MAXIMUM_LEADBYTES / 2));
334 if (!curr_line.empty())
335 error("Garbage after DBCSRANGE token");
336
337 std::size_t current_offset = 0;
338
339 uint16_t range_count = dbcs_range_count;
340 uint16_t current_range = 0;
341 while (range_count--)
342 {
343 if (!get_clean_line(input, curr_line))
344 error("Expected new range after DBCSRANGE");
345
346 uint8_t RangeStart, RangeEnd;
347 try
348 {
349 tokenize(curr_line, RangeStart);
350 tokenize(curr_line, RangeEnd);
351 }
352 catch(const std::invalid_argument& ia)
353 {
354 error(ia.what());
355 }
356 if (!curr_line.empty())
357 error("Garbage after DBCS range declaration");
358
359 if (RangeStart > RangeEnd)
360 error("Invalid range specified for DBCSRANGE");
361
362 FileHeader.LeadByte[current_range*2] = RangeStart;
363 FileHeader.LeadByte[current_range*2+1] = RangeEnd;
364 current_range++;
365
366 dbcs_table.resize(dbcs_table.size() + 256 * (RangeEnd - RangeStart + 1), FileHeader.UniDefaultChar);
367
368 for (uint8_t LeadByte = RangeStart; LeadByte <= RangeEnd; LeadByte++)
369 {
370 if (!get_clean_line(input, curr_line))
371 error("Expected new DBCSTABLE after DBCS range declaration");
372
373 tokenize(curr_line, token);
374 if (token != "DBCSTABLE")
375 error("Expected new DBCSTABLE after DBCS range declaration");
376
377 uint16_t table_size;
378 try
379 {
380 tokenize(curr_line, table_size);
381 }
382 catch(const std::invalid_argument& ia)
383 {
384 error(ia.what());
385 }
386 if (table_size > 256)
387 error("DBCSTABLE can't have more than 256 entries");
388 while (table_size--)
389 {
390 if (!get_clean_line(input, curr_line))
391 error("Expected " + std::to_string(table_size + 1) + " more lines after DBCSTABLE token");
392
393 uint8_t mb;
394 uint16_t wc;
395
396 try
397 {
398 tokenize(curr_line, mb);
399 tokenize(curr_line, wc);
400 }
401 catch(const std::invalid_argument& ia)
402 {
403 error(ia.what());
404 }
405 if (!curr_line.empty())
406 error("Garbage after DBCSTABLE entry: \"" + curr_line + "\"");
407
408 dbcs_table[current_offset + mb] = wc;
409 }
410 current_offset += 256;
411 /* Offsets start at 256 for the offset table. */
412 lb_offsets[LeadByte] = current_offset;
413 }
414 }
415 }
416 else if (token == "GLYPHTABLE")
417 {
418 uint16_t table_size;
419 try
420 {
421 tokenize(curr_line, table_size);
422 }
423 catch(const std::invalid_argument& ia)
424 {
425 error(ia.what());
426 }
427 if (has_glyphs)
428 error("GLYPHTABLE can only be declared once");
429 if (table_size > 256)
430 error("GLYPHTABLE size can't be larger than 256");
431 if (!curr_line.empty())
432 error("Garbage after GLYPHTABLE statement: \"" + curr_line + "\"");
433 has_glyphs = true;
434
435 while (table_size--)
436 {
437 if (!get_clean_line(input, curr_line))
438 error("Expected " + std::to_string(table_size + 1) + " more lines after GLYPHTABLE token");
439
440 uint8_t mb;
441 uint16_t wc;
442
443 try
444 {
445 tokenize(curr_line, mb);
446 tokenize(curr_line, wc);
447 }
448 catch(const std::invalid_argument& ia)
449 {
450 error(ia.what());
451 }
452 if (!curr_line.empty())
453 error("Garbage after GLYPHTABLE entry: \"" + curr_line + "\"");
454 glyph_table[mb] = wc;
455 }
456 }
457 else
458 {
459 error("Unexpected token \"" + token + "\"");
460 }
461 }
462
463 if (token != "ENDCODEPAGE")
464 error("Expected last token to be \"ENDCODEPAGE\"");
465
466 input.close();
467
468 /* Ensure this is minimally workable */
469 if (!has_mbtable)
470 error("File has no MBTABLE statement");
471 if (!has_wctable)
472 error("File has no WCTABLE statement");
473
474 /* Glyph table fixup */
475 if (has_glyphs)
476 {
477 for(int i = 0; i < 256; i++)
478 {
479 if (glyph_table[i] == 0)
480 glyph_table[i] = mb_table[i];
481 }
482 }
483
484 /* Translated default char fixup */
485 if (FileHeader.MaximumCharacterSize == 1)
486 {
487 FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
488 FileHeader.TransUniDefaultChar = wc_table[FileHeader.UniDefaultChar];
489 }
490 else
491 {
492 if (FileHeader.DefaultChar > 0xFF)
493 {
494 uint16_t offset = lb_offsets[FileHeader.DefaultChar >> 8];
495 if (!offset)
496 error("Default MB char is not translatable!");
497 FileHeader.TransDefaultChar = dbcs_table[(FileHeader.DefaultChar & 0xFF) + (offset - 256)];
498 }
499 else
500 {
501 FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
502 }
503 uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
504 FileHeader.TransUniDefaultChar = wc_table_dbcs[FileHeader.UniDefaultChar];
505 }
506 FileHeader.HeaderSize = sizeof(NLS_FILE_HEADER) / sizeof(uint16_t);
507
508 std::ofstream output(argv[2], std::ios_base::binary);
509
510 output.write(reinterpret_cast<char*>(&FileHeader), sizeof(FileHeader));
511
512 uint16_t wc_table_offset = sizeof(mb_table) / sizeof(uint16_t)
513 + 1 /* size of glyph table */
514 + (has_glyphs ? 256 : 0) /* Glyph table */
515 + 1 /* Number of DBCS LeadByte ranges */
516 + (dbcs_range_count ? 256 : 0) /* offsets of lead byte sub tables */
517 + dbcs_table.size() /* LeadByte sub tables */
518 + 1; /* Unknown flag */
519
520 output.write(reinterpret_cast<char*>(&wc_table_offset), sizeof(wc_table_offset));
521
522 output.write(reinterpret_cast<char*>(mb_table), sizeof(mb_table));
523
524 uint16_t glyph_table_size = has_glyphs ? 256 : 0;
525 output.write(reinterpret_cast<char*>(&glyph_table_size), sizeof(glyph_table_size));
526 if (has_glyphs)
527 output.write(reinterpret_cast<char*>(glyph_table), sizeof(glyph_table));
528
529 output.write(reinterpret_cast<char*>(&dbcs_range_count), sizeof(dbcs_range_count));
530 if (dbcs_range_count)
531 {
532 output.write(reinterpret_cast<char*>(lb_offsets), sizeof(lb_offsets));
533 }
534 if (dbcs_table.size())
535 {
536 output.write(reinterpret_cast<char*>(dbcs_table.data()), dbcs_table.size() * sizeof(uint16_t));
537 }
538
539 uint16_t unknown_flag = FileHeader.MaximumCharacterSize == 1 ? 0 : 4;
540 output.write(reinterpret_cast<char*>(&unknown_flag), sizeof(unknown_flag));
541
542 output.write(reinterpret_cast<char*>(wc_table), 65536 * FileHeader.MaximumCharacterSize);
543
544 output.close();
545 delete[] wc_table;
546
547 return 0;
548 }
549