/*
 * PROJECT:     ReactOS TXT to NLS Converter
 * LICENSE:     GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later)
 * FILE:        sdk/tools/txt2nls/main.c
 * COPYRIGHT:   Copyright 2021 Jérôme Gardou <jerome.gardou@reactos.org>
 */

#include <iostream>
#include <fstream>
#include <limits>
#include <vector>
#include <cstring>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <cstdint>

/* Characters treated as token separators / trailing blanks */
static const char whitespaces[] = " \t\f\v\n\r";

/* Zero-based index of the last line read by get_clean_line(); used by error() */
static long line_number = -1;

#pragma pack(push, 1)
#define MAXIMUM_LEADBYTES 12
/* Header written verbatim at the start of the output file */
struct NLS_FILE_HEADER
{
    uint16_t HeaderSize;
    uint16_t CodePage;
    uint16_t MaximumCharacterSize;
    uint16_t DefaultChar;
    uint16_t UniDefaultChar;
    uint16_t TransDefaultChar;
    uint16_t TransUniDefaultChar;
    uint8_t LeadByte[MAXIMUM_LEADBYTES];
};
static_assert(sizeof(NLS_FILE_HEADER) == 26, "Wrong size for NLS_FILE_HEADER");
#pragma pack(pop)

/*
 * Reads lines from `stream` until one is non-empty after stripping the
 * ';' comment and the trailing whitespace, and stores it in `str`.
 * Every line read (including skipped ones) increments line_number.
 * Returns the stream, which evaluates to false once no more line is available.
 */
static std::istream& get_clean_line(std::istream& stream, std::string& str)
{
    do
    {
        std::istream& ret = std::getline(stream, str);
        if (!ret)
            return ret;

        /* Ignore comments */
        std::size_t comment_pos = str.find(';');
        if (comment_pos != std::string::npos)
        {
            str.erase(comment_pos);
        }

        /* Remove trailing spaces */
        std::size_t end_of_line = str.find_last_not_of(whitespaces);
        if (end_of_line != std::string::npos)
            str.erase(end_of_line + 1);
        else
            str.clear();

        line_number++;
    } while (str.empty());

    return stream;
}

/*
 * Extracts the first whitespace-delimited token of `str` into `token` and
 * removes it (with the whitespace around it) from `str`.
 * `token` is empty when `str` holds nothing but whitespace.
 */
static void tokenize(std::string& str, std::string& token)
{
    std::size_t token_start = str.find_first_not_of(whitespaces);
    if (token_start == std::string::npos)
    {
        token = "";
        str.clear();
        return;
    }

    std::size_t token_end = str.find_first_of(whitespaces, token_start);
    if (token_end == std::string::npos)
    {
        token = str.substr(token_start);
        str.clear();
        return;
    }

    /* substr() takes a length, not an end position */
    token = str.substr(token_start, token_end - token_start);
    /* find_first_not_of() yields npos if only blanks remain: erase() then drops everything */
    str.erase(0, str.find_first_not_of(whitespaces, token_end));
}

/*
 * Extracts the first token of `str` and converts it to the integer type T.
 * `base` is forwarded to std::stoll (0 auto-detects decimal, 0x hex and 0 octal prefixes).
 * Throws std::invalid_argument when the token is not a number or when the
 * value cannot be represented by T.
 */
template<typename T>
static void tokenize(std::string& str, T& int_token, int base = 0)
{
    static_assert(std::numeric_limits<T>::is_integer && (sizeof(T) < sizeof(long long)),
                  "T must be an integer type narrower than long long");

    std::string token;
    tokenize(str, token);

    const long long min_val = static_cast<long long>(std::numeric_limits<T>::min());
    const long long max_val = static_cast<long long>(std::numeric_limits<T>::max());
    auto range_error = [&]()
    {
        return std::invalid_argument(token + " does not fit range ["
            + std::to_string(min_val) + ":" + std::to_string(max_val) + "]");
    };

    long long val = 0;
    try
    {
        val = std::stoll(token, nullptr, base);
    }
    catch (const std::out_of_range&)
    {
        /* Callers only handle std::invalid_argument */
        throw range_error();
    }
    catch (const std::invalid_argument&)
    {
        throw std::invalid_argument("\"" + token + "\" is not a valid number");
    }

    if ((val > max_val) || (val < min_val))
        throw range_error();

    int_token = static_cast<T>(val);
}

/* Reports `err` together with the current line number on stderr and exits with status 1 */
[[noreturn]] void error(const std::string& err)
{
    std::cerr << "Error parsing line " << line_number <<": " << err << std::endl;
    std::exit(1);
}

/* Same as the integer tokenize(), but a parse failure is fatal (reported through error()) */
template<typename T>
static void tokenize_or_die(std::string& str, T& int_token, int base = 0)
{
    try
    {
        tokenize(str, int_token, base);
    }
    catch (const std::invalid_argument& ia)
    {
        error(ia.what());
    }
}

/*
 * Reads the next clean line into `line` and parses the two integers of a table entry.
 * `table_name` and `remaining` (entries still expected, this one included) are only
 * used to build the error messages. Any failure is fatal.
 */
template<typename First, typename Second>
static void read_table_entry(std::istream& input, std::string& line, const char* table_name,
                             unsigned long remaining, First& first, Second& second)
{
    if (!get_clean_line(input, line))
        error("Expected " + std::to_string(remaining) + " more lines after " + table_name + " token");

    tokenize_or_die(line, first);
    tokenize_or_die(line, second);

    if (!line.empty())
        error(std::string("Garbage after ") + table_name + " entry: \"" + line + "\"");
}

/*
 * Usage: <program> <txt_in> <nls_out>
 * Parses the textual code page description <txt_in> and writes the binary
 * tables to <nls_out>. Returns 0 on success, 1 on any error.
 */
int main(int argc, char* argv[])
{
    if (argc != 3)
    {
        std::cerr << "Usage: " << argv[0] << " <txt_in> <nls_out>" << std::endl;
        return 1;
    }

    std::ifstream input(argv[1]);
    if (!input.is_open())
    {
        std::cerr << "Unable to open " << argv[1] << std::endl;
        return 1;
    }

    NLS_FILE_HEADER FileHeader = {};

    std::string curr_line;
    // Get code page
    if (!get_clean_line(input, curr_line))
    {
        std::cerr << "ERROR: File is empty" << std::endl;
        return 1;
    }

    std::string token;
    tokenize(curr_line, token);
    if (token != "CODEPAGE")
        error("expected CODEPAGE, got \"" + token + "\" instead");

    /* Parse into locals: the header is a packed struct, avoid binding references to its members */
    uint16_t code_page = 0;
    tokenize_or_die(curr_line, code_page, 10);
    FileHeader.CodePage = code_page;

    if (!curr_line.empty())
        error("Garbage after CODEPAGE statement: \"" + curr_line + "\"");

    /* Get CPINFO */
    if (!get_clean_line(input, curr_line))
        error("Nothing after CODEPAGE statement");

    tokenize(curr_line, token);
    if (token != "CPINFO")
        error("Expected CPINFO, got \"" + token + "\" instead");

    uint16_t max_char_size = 0;
    uint16_t default_char = 0;
    uint16_t uni_default_char = 0;
    tokenize_or_die(curr_line, max_char_size);
    tokenize_or_die(curr_line, default_char);
    tokenize_or_die(curr_line, uni_default_char);
    FileHeader.MaximumCharacterSize = max_char_size;
    FileHeader.DefaultChar = default_char;
    FileHeader.UniDefaultChar = uni_default_char;

    if (!curr_line.empty())
        error("Garbage after CPINFO statement: \"" + curr_line + "\"");
    if ((FileHeader.MaximumCharacterSize != 1) && (FileHeader.MaximumCharacterSize != 2))
        error("Expected 1 or 2 as max char size in CPINFO, got \"" + std::to_string(FileHeader.MaximumCharacterSize) + "\" instead");
    if ((FileHeader.MaximumCharacterSize == 1) && (FileHeader.DefaultChar > std::numeric_limits<uint8_t>::max()))
        error("Default MB character " + std::to_string(FileHeader.DefaultChar) + " doesn't fit in a 8-bit value");

    const bool single_byte = (FileHeader.MaximumCharacterSize == 1);

    /* Setup tables & default values */
    bool has_mbtable = false;
    uint16_t mb_table[256] = {0};

    /*
     * One entry per wide character, pre-filled with the default character.
     * Always kept as 16-bit values in memory; narrowed to bytes at write time
     * for single-byte code pages.
     */
    bool has_wctable = false;
    std::vector<uint16_t> wc_table(65536, FileHeader.DefaultChar);

    std::vector<uint16_t> dbcs_table;
    uint16_t lb_offsets[256] = {0};
    uint16_t dbcs_range_count = 0;

    uint16_t glyph_table[256] = {0};
    bool has_glyphs = false;

    /* Now parse */
    while (get_clean_line(input, curr_line))
    {
        tokenize(curr_line, token);

        if (token == "ENDCODEPAGE")
        {
            if (!curr_line.empty())
                error("Garbage after ENDCODEPAGE statement: \"" + curr_line + "\"");
            break;
        }
        else if (token == "MBTABLE")
        {
            uint16_t table_size = 0;
            tokenize_or_die(curr_line, table_size);
            if (has_mbtable)
                error("MBTABLE can only be declared once");
            if (table_size > 256)
                error("MBTABLE size can't be larger than 256");
            if (!curr_line.empty())
                error("Garbage after MBTABLE statement: \"" + curr_line + "\"");

            has_mbtable = true;
            for (unsigned remaining = table_size; remaining > 0; remaining--)
            {
                uint8_t mb = 0;
                uint16_t wc = 0;
                read_table_entry(input, curr_line, "MBTABLE", remaining, mb, wc);
                mb_table[mb] = wc;
            }
        }
        else if (token == "WCTABLE")
        {
            uint32_t table_size = 0;
            tokenize_or_die(curr_line, table_size);
            if (has_wctable)
                error("WCTABLE can only be declared once");
            if (!curr_line.empty())
                error("Garbage after WCTABLE statement: \"" + curr_line + "\"");
            if (table_size > 65536)
                error("WCTABLE size can't be larger than 65536");

            has_wctable = true;

            for (uint32_t remaining = table_size; remaining > 0; remaining--)
            {
                uint16_t wc = 0;
                if (single_byte)
                {
                    /* Parsed as 8-bit so that out-of-range values are rejected */
                    uint8_t mb = 0;
                    read_table_entry(input, curr_line, "WCTABLE", remaining, wc, mb);
                    wc_table[wc] = mb;
                }
                else
                {
                    uint16_t mb = 0;
                    read_table_entry(input, curr_line, "WCTABLE", remaining, wc, mb);
                    wc_table[wc] = mb;
                }
            }
        }
        else if (token == "DBCSRANGE")
        {
            if (dbcs_range_count != 0)
                error("DBCSRANGE can only be declared once");

            tokenize_or_die(curr_line, dbcs_range_count);
            if (dbcs_range_count > (MAXIMUM_LEADBYTES / 2))
                error("DBCSRANGE count can't exceed " + std::to_string(MAXIMUM_LEADBYTES / 2));
            if (!curr_line.empty())
                error("Garbage after DBCSRANGE token");

            std::size_t current_offset = 0;

            for (uint16_t current_range = 0; current_range < dbcs_range_count; current_range++)
            {
                if (!get_clean_line(input, curr_line))
                    error("Expected new range after DBCSRANGE");

                uint8_t RangeStart = 0;
                uint8_t RangeEnd = 0;
                tokenize_or_die(curr_line, RangeStart);
                tokenize_or_die(curr_line, RangeEnd);
                if (!curr_line.empty())
                    error("Garbage after DBCS range declaration");

                if (RangeStart > RangeEnd)
                    error("Invalid range specified for DBCSRANGE");

                FileHeader.LeadByte[current_range*2] = RangeStart;
                FileHeader.LeadByte[current_range*2+1] = RangeEnd;

                /* One 256-entry sub table per lead byte of the range */
                dbcs_table.resize(dbcs_table.size() + 256 * (RangeEnd - RangeStart + 1), FileHeader.UniDefaultChar);

                /* The counter must be wider than 8 bits, or the loop never ends when RangeEnd is 0xFF */
                for (unsigned LeadByte = RangeStart; LeadByte <= RangeEnd; LeadByte++)
                {
                    if (!get_clean_line(input, curr_line))
                        error("Expected new DBCSTABLE after DBCS range declaration");

                    tokenize(curr_line, token);
                    if (token != "DBCSTABLE")
                        error("Expected new DBCSTABLE after DBCS range declaration");

                    uint16_t table_size = 0;
                    tokenize_or_die(curr_line, table_size);
                    if (table_size > 256)
                        error("DBCSTABLE can't have more than 256 entries");

                    for (unsigned remaining = table_size; remaining > 0; remaining--)
                    {
                        uint8_t mb = 0;
                        uint16_t wc = 0;
                        read_table_entry(input, curr_line, "DBCSTABLE", remaining, mb, wc);
                        dbcs_table[current_offset + mb] = wc;
                    }
                    current_offset += 256;
                    /* Offsets start at 256 for the offset table. */
                    /* A value too large for 16 bits is caught by the size check done before writing */
                    lb_offsets[LeadByte] = static_cast<uint16_t>(current_offset);
                }
            }
        }
        else if (token == "GLYPHTABLE")
        {
            uint16_t table_size = 0;
            tokenize_or_die(curr_line, table_size);
            if (has_glyphs)
                error("GLYPHTABLE can only be declared once");
            if (table_size > 256)
                error("GLYPHTABLE size can't be larger than 256");
            if (!curr_line.empty())
                error("Garbage after GLYPHTABLE statement: \"" + curr_line + "\"");
            has_glyphs = true;

            for (unsigned remaining = table_size; remaining > 0; remaining--)
            {
                uint8_t mb = 0;
                uint16_t wc = 0;
                read_table_entry(input, curr_line, "GLYPHTABLE", remaining, mb, wc);
                glyph_table[mb] = wc;
            }
        }
        else
        {
            error("Unexpected token \"" + token + "\"");
        }
    }

    if (token != "ENDCODEPAGE")
        error("Expected last token to be \"ENDCODEPAGE\"");

    input.close();

    /* Ensure this is minimally workable */
    if (!has_mbtable)
        error("File has no MBTABLE statement");
    if (!has_wctable)
        error("File has no WCTABLE statement");

    /* Glyph table fixup */
    if (has_glyphs)
    {
        for (int i = 0; i < 256; i++)
        {
            if (glyph_table[i] == 0)
                glyph_table[i] = mb_table[i];
        }
    }

    /* Translated default char fixup */
    if (!single_byte && (FileHeader.DefaultChar > 0xFF))
    {
        uint16_t offset = lb_offsets[FileHeader.DefaultChar >> 8];
        if (!offset)
            error("Default MB char is not translatable!");
        FileHeader.TransDefaultChar = dbcs_table[(FileHeader.DefaultChar & 0xFF) + (offset - 256)];
    }
    else
    {
        FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
    }
    FileHeader.TransUniDefaultChar = wc_table[FileHeader.UniDefaultChar];
    FileHeader.HeaderSize = sizeof(NLS_FILE_HEADER) / sizeof(uint16_t);

    /* Offset of the WC table, counted in 16-bit words; it is stored as a 16-bit value */
    const std::size_t wc_table_offset_words = sizeof(mb_table) / sizeof(uint16_t)
        + 1                             /* size of glyph table */
        + (has_glyphs ? 256 : 0)        /* Glyph table */
        + 1                             /* Number of DBCS LeadByte ranges */
        + (dbcs_range_count ? 256 : 0)  /* offsets of lead byte sub tables */
        + dbcs_table.size()             /* LeadByte sub tables */
        + 1;                            /* Unknown flag */
    if (wc_table_offset_words > std::numeric_limits<uint16_t>::max())
        error("DBCS tables are too large: table offsets don't fit in 16 bits");
    const uint16_t wc_table_offset = static_cast<uint16_t>(wc_table_offset_words);

    std::ofstream output(argv[2], std::ios_base::binary);
    if (!output.is_open())
    {
        std::cerr << "Unable to open " << argv[2] << std::endl;
        return 1;
    }

    output.write(reinterpret_cast<const char*>(&FileHeader), sizeof(FileHeader));

    output.write(reinterpret_cast<const char*>(&wc_table_offset), sizeof(wc_table_offset));

    output.write(reinterpret_cast<const char*>(mb_table), sizeof(mb_table));

    const uint16_t glyph_table_size = has_glyphs ? 256 : 0;
    output.write(reinterpret_cast<const char*>(&glyph_table_size), sizeof(glyph_table_size));
    if (has_glyphs)
        output.write(reinterpret_cast<const char*>(glyph_table), sizeof(glyph_table));

    output.write(reinterpret_cast<const char*>(&dbcs_range_count), sizeof(dbcs_range_count));
    if (dbcs_range_count)
    {
        output.write(reinterpret_cast<const char*>(lb_offsets), sizeof(lb_offsets));
    }
    if (!dbcs_table.empty())
    {
        output.write(reinterpret_cast<const char*>(dbcs_table.data()),
                     static_cast<std::streamsize>(dbcs_table.size() * sizeof(uint16_t)));
    }

    const uint16_t unknown_flag = single_byte ? 0 : 4;
    output.write(reinterpret_cast<const char*>(&unknown_flag), sizeof(unknown_flag));

    if (single_byte)
    {
        /* Single-byte code pages store one byte per wide character */
        std::vector<uint8_t> wc_table_sbcs(wc_table.size());
        for (std::size_t i = 0; i < wc_table.size(); i++)
            wc_table_sbcs[i] = static_cast<uint8_t>(wc_table[i]);
        output.write(reinterpret_cast<const char*>(wc_table_sbcs.data()),
                     static_cast<std::streamsize>(wc_table_sbcs.size()));
    }
    else
    {
        output.write(reinterpret_cast<const char*>(wc_table.data()),
                     static_cast<std::streamsize>(wc_table.size() * sizeof(uint16_t)));
    }

    output.close();
    /* A failed write or close leaves the stream in a failed state */
    if (output.fail())
    {
        std::cerr << "Error writing " << argv[2] << std::endl;
        return 1;
    }

    return 0;
}