/*
 * PROJECT:     ReactOS TXT to NLS Converter
 * LICENSE:     GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later.html)
 * FILE:        sdk/tools/txt2nls/main.c
 * COPYRIGHT:   Copyright 2021 Jérôme Gardou <jerome.gardou@reactos.org>
 */

#include <iostream>
#include <fstream>
#include <limits>
#include <vector>
#include <cstring>
#include <string>
#include <cstdint>
#include <cstdlib>
#include <stdexcept>
#include <type_traits>

/* Characters treated as token separators / trimmable blanks. */
static const char whitespaces[] = " \t\f\v\n\r";

/*
 * Zero-based index of the last physical line read by get_clean_line()
 * (-1 until the first line has been read). Used by error() for reporting.
 */
static long line_number = -1;

#pragma pack(push, 1)
#define MAXIMUM_LEADBYTES 12
/* Header written at the very start of the generated file. */
struct NLS_FILE_HEADER
{
    uint16_t HeaderSize;            /* Written as sizeof(NLS_FILE_HEADER) / sizeof(uint16_t) */
    uint16_t CodePage;              /* From the CODEPAGE statement */
    uint16_t MaximumCharacterSize;  /* From CPINFO: 1 or 2 */
    uint16_t DefaultChar;           /* From CPINFO */
    uint16_t UniDefaultChar;        /* From CPINFO */
    uint16_t TransDefaultChar;      /* DefaultChar looked up in the MB (or DBCS) table */
    uint16_t TransUniDefaultChar;   /* UniDefaultChar looked up in the WC table */
    uint8_t LeadByte[MAXIMUM_LEADBYTES]; /* (start, end) pairs from DBCSRANGE */
};
static_assert(sizeof(NLS_FILE_HEADER) == 26, "Wrong size for NLS_FILE_HEADER");
#pragma pack(pop)

/*
 * Reads lines from 'stream' until one is non-empty after stripping the
 * ';' comment and the trailing whitespace, and stores it in 'str'.
 * Every physical line read bumps 'line_number'.
 * Returns the stream; it evaluates to false once no more lines can be read.
 */
static std::istream& get_clean_line(std::istream& stream, std::string& str)
{
    do
    {
        if (!std::getline(stream, str))
            return stream;

        /* Ignore comments */
        const std::size_t comment_pos = str.find(';');
        if (comment_pos != std::string::npos)
            str.erase(comment_pos);

        /* Remove trailing spaces */
        const std::size_t last_char = str.find_last_not_of(whitespaces);
        if (last_char == std::string::npos)
            str.clear();
        else
            str.erase(last_char + 1);

        line_number++;
    } while (str.empty());

    return stream;
}

/*
 * Extracts the first whitespace-delimited word of 'str' into 'token' and
 * removes it (together with the whitespace around it) from 'str'.
 * If 'str' holds no word, 'token' is set to "" and 'str' is cleared.
 */
static void tokenize(std::string& str, std::string& token)
{
    const std::size_t token_start = str.find_first_not_of(whitespaces);
    if (token_start == std::string::npos)
    {
        token = "";
        str.clear();
        return;
    }

    const std::size_t token_end = str.find_first_of(whitespaces, token_start);
    if (token_end == std::string::npos)
    {
        token = str.substr(token_start);
        str.clear();
        return;
    }

    /* substr() takes a length, not an end position. */
    token = str.substr(token_start, token_end - token_start);
    /* If only blanks follow, find_first_not_of() yields npos and everything is erased. */
    str.erase(0, str.find_first_not_of(whitespaces, token_end));
}

/*
 * Extracts the first word of 'str' and converts it to the integer type T.
 * 'base' is forwarded to std::stoll (0 = auto-detect from the prefix).
 * Throws std::invalid_argument if the word is missing, is not a number, or
 * does not fit into T. Range overflow reported by std::stoll is converted
 * to std::invalid_argument too, since that is what the callers catch.
 */
template<typename T>
static void tokenize(std::string& str, T& int_token, int base = 0)
{
    static_assert(std::is_integral<T>::value && (sizeof(T) < sizeof(long long)),
                  "tokenize() range check requires an integer type narrower than long long");

    std::string token;
    tokenize(str, token);

    if (token.empty())
        throw std::invalid_argument("expected a number but reached the end of the line");

    const long long min_val = static_cast<long long>(std::numeric_limits<T>::min());
    const long long max_val = static_cast<long long>(std::numeric_limits<T>::max());
    const std::string range_error = token + " does not fit range ["
        + std::to_string(std::numeric_limits<T>::min()) + ":" + std::to_string(std::numeric_limits<T>::max()) + "]";

    long long val = 0;
    try
    {
        val = std::stoll(token, nullptr, base);
    }
    catch (const std::out_of_range&)
    {
        throw std::invalid_argument(range_error);
    }
    catch (const std::invalid_argument&)
    {
        /* The standard library message is implementation-defined; give a useful one. */
        throw std::invalid_argument("\"" + token + "\" is not a valid number");
    }

    if ((val > max_val) || (val < min_val))
        throw std::invalid_argument(range_error);

    int_token = static_cast<T>(val);
}

/* Prints 'err' prefixed with the current line number and terminates with exit code 1. */
[[noreturn]] void error(const std::string& err)
{
    std::cerr << "Error parsing line " << line_number <<": " << err << std::endl;
    std::exit(1);
}

/* Parses the next word of 'line' as a T; any parsing failure is fatal (see error()). */
template<typename T>
static T parse_number(std::string& line, int base = 0)
{
    T value = 0;
    try
    {
        tokenize(line, value, base);
    }
    catch (const std::invalid_argument& ia)
    {
        error(ia.what());
    }
    return value;
}

/*
 * Reads 'count' table lines from 'input'. Each line must consist of exactly
 * two numbers, parsed as 'First' then 'Second', which are handed to 'store'.
 * 'table_name' is only used to build the error messages.
 */
template<typename First, typename Second, typename Store>
static void read_entries(std::istream& input, std::size_t count, const std::string& table_name, Store store)
{
    std::string line;
    for (std::size_t remaining = count; remaining > 0; remaining--)
    {
        if (!get_clean_line(input, line))
            error("Expected " + std::to_string(remaining) + " more lines after " + table_name + " token");

        const First first = parse_number<First>(line);
        const Second second = parse_number<Second>(line);

        if (!line.empty())
            error("Garbage after " + table_name + " entry: \"" + line + "\"");

        store(first, second);
    }
}

/*
 * Usage: <txt_in> <nls_out>
 *
 * Parses the text description of a code page (CODEPAGE, CPINFO, then any of
 * MBTABLE / WCTABLE / DBCSRANGE / GLYPHTABLE, terminated by ENDCODEPAGE) and
 * writes, in this order and in host byte order:
 *   - NLS_FILE_HEADER
 *   - one word: offset (in words) from this point to the WC table
 *   - MB table (256 words)
 *   - glyph table size (one word), then the glyph table (256 words) if present
 *   - number of DBCS ranges (one word), then the lead byte offset table
 *     (256 words) and the lead byte sub-tables if there are ranges
 *   - one flag word (0 for single-byte code pages, 4 otherwise)
 *   - WC table: 65536 entries of MaximumCharacterSize bytes each
 *
 * Returns 0 on success, 1 on any failure.
 */
int main(int argc, char* argv[])
{
    if (argc != 3)
    {
        std::cerr << "Usage: " << argv[0] << " <txt_in> <nls_out>" << std::endl;
        return 1;
    }

    std::ifstream input(argv[1]);
    if (!input.is_open())
    {
        std::cerr << "Unable to open " << argv[1] << std::endl;
        return 1;
    }

    NLS_FILE_HEADER FileHeader;
    memset(&FileHeader, 0, sizeof(FileHeader));

    std::string curr_line;
    std::string token;

    /* Get code page */
    if (!get_clean_line(input, curr_line))
    {
        std::cerr << "ERROR: File is empty" << std::endl;
        return 1;
    }

    tokenize(curr_line, token);
    if (token != "CODEPAGE")
        error("expected CODEPAGE, got \"" + token + "\" instead");
    FileHeader.CodePage = parse_number<uint16_t>(curr_line, 10);
    if (!curr_line.empty())
        error("Garbage after CODEPAGE statement: \"" + curr_line + "\"");

    /* Get CPINFO */
    if (!get_clean_line(input, curr_line))
        error("Nothing after CODEPAGE statement");

    tokenize(curr_line, token);
    if (token != "CPINFO")
        error("Expected CPINFO, got \"" + token + "\" instead");
    FileHeader.MaximumCharacterSize = parse_number<uint16_t>(curr_line);
    FileHeader.DefaultChar = parse_number<uint16_t>(curr_line);
    FileHeader.UniDefaultChar = parse_number<uint16_t>(curr_line);
    if (!curr_line.empty())
        error("Garbage after CPINFO statement: \"" + curr_line + "\"");
    if ((FileHeader.MaximumCharacterSize != 1) && (FileHeader.MaximumCharacterSize != 2))
        error("Expected 1 or 2 as max char size in CPINFO, got \"" + std::to_string(FileHeader.MaximumCharacterSize) + "\" instead");
    if ((FileHeader.MaximumCharacterSize == 1) && (FileHeader.DefaultChar > std::numeric_limits<uint8_t>::max()))
        error("Default MB character " + std::to_string(FileHeader.DefaultChar) + " doesn't fit in a 8-bit value");

    /* Setup tables & default values */
    bool has_mbtable = false;
    uint16_t mb_table[256] = {0};

    /*
     * The WC table is always kept as 16-bit entries in memory. For single-byte
     * code pages every stored value fits in 8 bits (checked above for the
     * default, enforced by the parser for the entries) and the table is
     * narrowed to bytes when written out.
     */
    bool has_wctable = false;
    std::vector<uint16_t> wc_table(65536, FileHeader.DefaultChar);

    std::vector<uint16_t> dbcs_table;
    uint16_t lb_offsets[256] = {0};
    uint16_t dbcs_range_count = 0;

    uint16_t glyph_table[256] = {0};
    bool has_glyphs = false;

    bool found_end = false;

    /* Now parse */
    while (get_clean_line(input, curr_line))
    {
        tokenize(curr_line, token);

        if (token == "ENDCODEPAGE")
        {
            if (!curr_line.empty())
                error("Garbage after ENDCODEPAGE statement: \"" + curr_line + "\"");
            found_end = true;
            break;
        }
        else if (token == "MBTABLE")
        {
            const uint16_t table_size = parse_number<uint16_t>(curr_line);
            if (has_mbtable)
                error("MBTABLE can only be declared once");
            if (table_size > 256)
                error("MBTABLE size can't be larger than 256");
            if (!curr_line.empty())
                error("Garbage after MBTABLE statement: \"" + curr_line + "\"");

            has_mbtable = true;
            read_entries<uint8_t, uint16_t>(input, table_size, "MBTABLE",
                [&](uint8_t mb, uint16_t wc) { mb_table[mb] = wc; });
        }
        else if (token == "WCTABLE")
        {
            const uint32_t table_size = parse_number<uint32_t>(curr_line);
            if (has_wctable)
                error("WCTABLE can only be declared once");
            if (!curr_line.empty())
                error("Garbage after WCTABLE statement: \"" + curr_line + "\"");
            if (table_size > 65536)
                error("WCTABLE size can't be larger than 65536");

            has_wctable = true;

            if (FileHeader.MaximumCharacterSize == 1)
            {
                /* Single-byte code page: the multibyte value must fit in 8 bits. */
                read_entries<uint16_t, uint8_t>(input, table_size, "WCTABLE",
                    [&](uint16_t wc, uint8_t mb) { wc_table[wc] = mb; });
            }
            else
            {
                read_entries<uint16_t, uint16_t>(input, table_size, "WCTABLE",
                    [&](uint16_t wc, uint16_t mb) { wc_table[wc] = mb; });
            }
        }
        else if (token == "DBCSRANGE")
        {
            if (dbcs_range_count != 0)
                error("DBCSRANGE can only be declared once");

            dbcs_range_count = parse_number<uint16_t>(curr_line);
            if (dbcs_range_count > (MAXIMUM_LEADBYTES / 2))
                error("DBCSRANGE count can't exceed " + std::to_string(MAXIMUM_LEADBYTES / 2));
            if (!curr_line.empty())
                error("Garbage after DBCSRANGE token");

            std::size_t current_offset = 0;

            for (uint16_t current_range = 0; current_range < dbcs_range_count; current_range++)
            {
                if (!get_clean_line(input, curr_line))
                    error("Expected new range after DBCSRANGE");

                const uint8_t RangeStart = parse_number<uint8_t>(curr_line);
                const uint8_t RangeEnd = parse_number<uint8_t>(curr_line);
                if (!curr_line.empty())
                    error("Garbage after DBCS range declaration");

                if (RangeStart > RangeEnd)
                    error("Invalid range specified for DBCSRANGE");

                FileHeader.LeadByte[current_range * 2] = RangeStart;
                FileHeader.LeadByte[current_range * 2 + 1] = RangeEnd;

                /* One 256-entry sub-table per lead byte of the range. */
                const std::size_t lead_byte_count = static_cast<std::size_t>(RangeEnd - RangeStart) + 1;
                dbcs_table.resize(dbcs_table.size() + 256 * lead_byte_count, FileHeader.UniDefaultChar);

                /* The counter is wider than 8 bits so that a range ending at 0xFF terminates. */
                for (unsigned int LeadByte = RangeStart; LeadByte <= RangeEnd; LeadByte++)
                {
                    if (!get_clean_line(input, curr_line))
                        error("Expected new DBCSTABLE after DBCS range declaration");

                    tokenize(curr_line, token);
                    if (token != "DBCSTABLE")
                        error("Expected new DBCSTABLE after DBCS range declaration");

                    const uint16_t table_size = parse_number<uint16_t>(curr_line);
                    if (table_size > 256)
                        error("DBCSTABLE can't have more than 256 entries");

                    read_entries<uint8_t, uint16_t>(input, table_size, "DBCSTABLE",
                        [&](uint8_t mb, uint16_t wc) { dbcs_table[current_offset + mb] = wc; });

                    current_offset += 256;
                    if (current_offset > std::numeric_limits<uint16_t>::max())
                        error("Too many DBCS lead bytes: sub-table offset doesn't fit in a 16-bit value");

                    /*
                     * Offsets start at 256 for the offset table: they are counted
                     * from the start of the 256-entry offset table, which is written
                     * immediately before the sub-tables.
                     */
                    lb_offsets[LeadByte] = static_cast<uint16_t>(current_offset);
                }
            }
        }
        else if (token == "GLYPHTABLE")
        {
            const uint16_t table_size = parse_number<uint16_t>(curr_line);
            if (has_glyphs)
                error("GLYPHTABLE can only be declared once");
            if (table_size > 256)
                error("GLYPHTABLE size can't be larger than 256");
            if (!curr_line.empty())
                error("Garbage after GLYPHTABLE statement: \"" + curr_line + "\"");

            has_glyphs = true;
            read_entries<uint8_t, uint16_t>(input, table_size, "GLYPHTABLE",
                [&](uint8_t mb, uint16_t wc) { glyph_table[mb] = wc; });
        }
        else
        {
            error("Unexpected token \"" + token + "\"");
        }
    }

    if (!found_end)
        error("Expected last token to be \"ENDCODEPAGE\"");

    input.close();

    /* Ensure this is minimally workable */
    if (!has_mbtable)
        error("File has no MBTABLE statement");
    if (!has_wctable)
        error("File has no WCTABLE statement");

    /* Glyph table fixup: entries left at 0 fall back to the MB table */
    if (has_glyphs)
    {
        for (int i = 0; i < 256; i++)
        {
            if (glyph_table[i] == 0)
                glyph_table[i] = mb_table[i];
        }
    }

    /* Translated default char fixup */
    if ((FileHeader.MaximumCharacterSize == 2) && (FileHeader.DefaultChar > 0xFF))
    {
        /* Double-byte default char: the high byte selects the lead byte sub-table. */
        const uint16_t offset = lb_offsets[FileHeader.DefaultChar >> 8];
        if (!offset)
            error("Default MB char is not translatable!");
        FileHeader.TransDefaultChar = dbcs_table[(FileHeader.DefaultChar & 0xFF) + (offset - 256)];
    }
    else
    {
        FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
    }
    FileHeader.TransUniDefaultChar = wc_table[FileHeader.UniDefaultChar];
    FileHeader.HeaderSize = sizeof(NLS_FILE_HEADER) / sizeof(uint16_t);

    const std::size_t wc_table_offset_full = sizeof(mb_table) / sizeof(uint16_t)
        + 1                               /* size of glyph table */
        + (has_glyphs ? 256 : 0)          /* Glyph table */
        + 1                               /* Number of DBCS LeadByte ranges */
        + (dbcs_range_count ? 256 : 0)    /* offsets of lead byte sub tables */
        + dbcs_table.size()               /* LeadByte sub tables */
        + 1;                              /* Unknown flag */
    if (wc_table_offset_full > std::numeric_limits<uint16_t>::max())
        error("Tables are too large: WC table offset doesn't fit in a 16-bit value");
    const uint16_t wc_table_offset = static_cast<uint16_t>(wc_table_offset_full);

    std::ofstream output(argv[2], std::ios_base::binary);
    if (!output.is_open())
    {
        std::cerr << "Unable to open " << argv[2] << std::endl;
        return 1;
    }

    const auto write_bytes = [&output](const void* data, std::size_t size)
    {
        output.write(static_cast<const char*>(data), static_cast<std::streamsize>(size));
    };

    write_bytes(&FileHeader, sizeof(FileHeader));
    write_bytes(&wc_table_offset, sizeof(wc_table_offset));
    write_bytes(mb_table, sizeof(mb_table));

    const uint16_t glyph_table_size = has_glyphs ? 256 : 0;
    write_bytes(&glyph_table_size, sizeof(glyph_table_size));
    if (has_glyphs)
        write_bytes(glyph_table, sizeof(glyph_table));

    write_bytes(&dbcs_range_count, sizeof(dbcs_range_count));
    if (dbcs_range_count)
        write_bytes(lb_offsets, sizeof(lb_offsets));
    if (!dbcs_table.empty())
        write_bytes(dbcs_table.data(), dbcs_table.size() * sizeof(uint16_t));

    const uint16_t unknown_flag = (FileHeader.MaximumCharacterSize == 1) ? 0 : 4;
    write_bytes(&unknown_flag, sizeof(unknown_flag));

    if (FileHeader.MaximumCharacterSize == 1)
    {
        /* Single-byte code page: the WC table is stored as one byte per entry. */
        std::vector<uint8_t> wc_table_sbcs(wc_table.size());
        for (std::size_t i = 0; i < wc_table.size(); i++)
            wc_table_sbcs[i] = static_cast<uint8_t>(wc_table[i]);
        write_bytes(wc_table_sbcs.data(), wc_table_sbcs.size());
    }
    else
    {
        write_bytes(wc_table.data(), wc_table.size() * sizeof(uint16_t));
    }

    output.close();
    if (output.fail())
    {
        std::cerr << "Failed to write " << argv[2] << std::endl;
        return 1;
    }

    return 0;
}