xref: /reactos/sdk/tools/txt2nls/main.cpp (revision d2aeaba5)
1 /*
2  * PROJECT:     ReactOS TXT to NLS Converter
3  * LICENSE:     GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later.html)
 * FILE:        sdk/tools/txt2nls/main.cpp
5  * COPYRIGHT:   Copyright 2021 Jérôme Gardou <jerome.gardou@reactos.org>
6  */
7 
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
14 
/* Characters treated as blanks when trimming lines and splitting them into tokens. */
static const char whitespaces[] = " \t\f\v\n\r";

/*
 * Index of the input line most recently returned by get_clean_line(), used in
 * error reports. It starts at -1 and is incremented once per line read, so the
 * first line of the input is reported as line 0.
 */
static long line_number = -1L;
17 
/* Number of bytes in the lead byte array of the header (start/end pairs). */
#define MAXIMUM_LEADBYTES 12

/*
 * Header written at the very start of the generated NLS file.
 * The structure is byte-packed so that it can be written to disk as is.
 */
#pragma pack(push, 1)
struct NLS_FILE_HEADER
{
    uint16_t HeaderSize;                    /* Size of this header, counted in 16-bit words */
    uint16_t CodePage;                      /* Value of the CODEPAGE statement */
    uint16_t MaximumCharacterSize;          /* First CPINFO value: 1 or 2 */
    uint16_t DefaultChar;                   /* Second CPINFO value: default multibyte character */
    uint16_t UniDefaultChar;                /* Third CPINFO value: default wide character */
    uint16_t TransDefaultChar;              /* DefaultChar looked up in the multibyte tables */
    uint16_t TransUniDefaultChar;           /* UniDefaultChar looked up in the wide char table */
    uint8_t LeadByte[MAXIMUM_LEADBYTES];    /* DBCSRANGE entries stored as (first, last) pairs */
};
#pragma pack(pop)

static_assert(sizeof(NLS_FILE_HEADER) == 26, "Wrong size for NLS_FILE_HEADER");
33 
34 static std::istream& get_clean_line(std::istream& stream, std::string& str)
35 {
36     do
37     {
38         std::istream& ret = std::getline(stream, str);
39         if (!ret)
40             return ret;
41 
42         /* Ignore comments */
43         std::size_t comment_pos = str.find_first_of(';');
44         if (comment_pos != std::string::npos)
45         {
46             str.erase(comment_pos);
47         }
48 
49         /* Remove trailing spaces */
50         std::size_t end_of_line = str.find_last_not_of(whitespaces);
51         if (end_of_line != std::string::npos)
52             str.erase(end_of_line + 1);
53         else
54             str.clear();
55 
56         line_number++;
57     } while (str.empty());
58 
59     return stream;
60 }
61 
62 static void tokenize(std::string& str, std::string& token)
63 {
64     std::size_t token_start = str.find_first_not_of(whitespaces);
65     if (token_start == std::string::npos)
66     {
67         token = "";
68         str.clear();
69         return;
70     }
71 
72     std::size_t token_end = str.find_first_of(whitespaces, token_start);
73     if (token_end == std::string::npos)
74     {
75         token = str.substr(token_start);
76         str.clear();
77         return;
78     }
79 
80     token = str.substr(token_start, token_end);
81     str.erase(0, str.find_first_not_of(whitespaces, token_end));
82 }
83 
/*
 * Extracts the first token of str and converts it to the integer type T.
 *
 * str:       line being parsed; the token is removed from it.
 * int_token: receives the converted value.
 * base:      numeric base handed to the conversion; 0 lets the prefix decide
 *            (0x for hexadecimal, leading 0 for octal, decimal otherwise).
 *
 * Throws std::invalid_argument when the token is missing, is not entirely a
 * number in the given base, or does not fit the range of T. Conversion
 * overflow is reported the same way so that callers only have one exception
 * type to handle.
 */
template<typename T>
static void tokenize(std::string& str, T& int_token, int base = 0)
{
    std::string token;
    tokenize(str, token);

    /* long long is at least 64 bits wide, so the bounds of the 8/16/32-bit targets are exact */
    const long long lowest = std::numeric_limits<T>::min();
    const long long highest = std::numeric_limits<T>::max();

    long long val = 0;
    std::size_t parsed_chars = 0;
    bool overflowed = false;
    try
    {
        val = std::stoll(token, &parsed_chars, base);
    }
    catch (const std::out_of_range&)
    {
        overflowed = true;
    }
    catch (const std::invalid_argument&)
    {
        throw std::invalid_argument("\"" + token + "\" is not a valid number");
    }

    /* Reject tokens such as "12abc" of which only a prefix was converted */
    if (!overflowed && (parsed_chars != token.size()))
        throw std::invalid_argument("\"" + token + "\" is not a valid number");

    if (overflowed || (val > highest) || (val < lowest))
        throw std::invalid_argument(token + " does not fit range ["
            + std::to_string(lowest) + ":" + std::to_string(highest) + "]");

    int_token = static_cast<T>(val);
}
98 
99 void error(const std::string& err)
100 {
101     std::cerr << "Error parsing line " << line_number <<": " << err << std::endl;
102     std::exit(1);
103 }
104 
105 int main(int argc, char* argv[])
106 {
107     if (argc != 3)
108     {
109         std::cerr << "Usage: " << argv[0] << " <txt_in> <nls_out>" << std::endl;
110         return 1;
111     }
112 
113     std::ifstream input(argv[1]);
114     if (!input.is_open())
115     {
116         std::cerr << "Unable to open " << argv[1] << std::endl;
117         return 1;
118     }
119 
120     NLS_FILE_HEADER FileHeader;
121     memset(&FileHeader, 0, sizeof(FileHeader));
122 
123     std::string curr_line;
124     // Get code page
125     if (!get_clean_line(input, curr_line))
126     {
127         std::cerr << "ERROR: File is empty" << std::endl;
128         return 1;
129     }
130 
131     std::string token;
132     tokenize(curr_line, token);
133     if (token != "CODEPAGE")
134         error("expected CODEPAGE, got \"" + token + "\" instead");
135     try
136     {
137         tokenize(curr_line, FileHeader.CodePage, 10);
138     }
139     catch(const std::invalid_argument& ia)
140     {
141         error(ia.what());
142     }
143 
144     if (!curr_line.empty())
145         error("Garbage after CODEPAGE statement: \"" + curr_line + "\"");
146 
147     /* Get CPINFO */
148     if (!get_clean_line(input, curr_line))
149         error("Nothing after CODEPAGE statement");
150 
151     tokenize(curr_line, token);
152     if (token != "CPINFO")
153         error("Expected CPINFO, got \"" + token + "\" instead");
154     try
155     {
156         tokenize(curr_line, FileHeader.MaximumCharacterSize);
157         tokenize(curr_line, FileHeader.DefaultChar);
158         tokenize(curr_line, FileHeader.UniDefaultChar);
159     }
160     catch(const std::invalid_argument& ia)
161     {
162         error(ia.what());
163         return 1;
164     }
165     if (!curr_line.empty())
166         error("Garbage after CPINFO statement: \"" + curr_line + "\"");
167     if ((FileHeader.MaximumCharacterSize != 1) && (FileHeader.MaximumCharacterSize != 2))
168         error("Expected 1 or 2 as max char size in CPINFO, got \"" + std::to_string(FileHeader.MaximumCharacterSize) + "\" instead");
169     if ((FileHeader.MaximumCharacterSize == 1) && (FileHeader.DefaultChar > std::numeric_limits<uint8_t>::max()))
170         error("Default MB character " + std::to_string(FileHeader.DefaultChar) + " doesn't fit in a 8-bit value");
171 
172     /* Setup tables & default values */
173     bool has_mbtable = false;
174     uint16_t mb_table[256] = {0};
175 
176     bool has_wctable = false;
177     uint8_t* wc_table = new uint8_t[65536 * FileHeader.MaximumCharacterSize];
178     if (FileHeader.MaximumCharacterSize == 1)
179     {
180         for (int i = 0; i < 65536; i++)
181             wc_table[i] = FileHeader.DefaultChar;
182     }
183     else
184     {
185         uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
186         for (int i = 0; i < 65536; i++)
187             wc_table_dbcs[i] = FileHeader.DefaultChar;
188     }
189 
190     std::vector<uint16_t> dbcs_table;
191     uint16_t lb_offsets[256] = {0};
192     uint16_t dbcs_range_count = 0;
193 
194     uint16_t glyph_table[256] = {0};
195     bool has_glyphs = false;
196 
197     /* Now parse */
198     while (get_clean_line(input, curr_line))
199     {
200         tokenize(curr_line, token);
201 
202         if (token == "ENDCODEPAGE")
203         {
204             if (!curr_line.empty())
205                 error("Garbage after ENDCODEPAGE statement: \"" + curr_line + "\"");
206             break;
207         }
208         else if (token == "MBTABLE")
209         {
210             uint16_t table_size;
211             try
212             {
213                 tokenize(curr_line, table_size);
214             }
215             catch(const std::invalid_argument& ia)
216             {
217                 error(ia.what());
218             }
219             if (has_mbtable)
220                 error("MBTABLE can only be declared once");
221             if (table_size > 256)
222                 error("MBTABLE size can't be larger than 256");
223             if (!curr_line.empty())
224                 error("Garbage after MBTABLE statement: \"" + curr_line + "\"");
225 
226             has_mbtable = true;
227             while (table_size--)
228             {
229                 if (!get_clean_line(input, curr_line))
230                     error("Expected " + std::to_string(table_size + 1) + " more lines after MBTABLE token");
231 
232                 uint8_t mb;
233                 uint16_t wc;
234 
235                 try
236                 {
237                     tokenize(curr_line, mb);
238                     tokenize(curr_line, wc);
239                 }
240                 catch(const std::invalid_argument& ia)
241                 {
242                     error(ia.what());
243                 }
244                 if (!curr_line.empty())
245                     error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
246                 mb_table[mb] = wc;
247             }
248         }
249         else if (token == "WCTABLE")
250         {
251             uint32_t table_size;
252             try
253             {
254                 tokenize(curr_line, table_size);
255             }
256             catch(const std::invalid_argument& ia)
257             {
258                 error(ia.what());
259             }
260             if (has_wctable)
261                 error("WCTABLE can only be declared once");
262             if (!curr_line.empty())
263                 error("Garbage after WCTABLE statement: \"" + curr_line + "\"");
264             if (table_size > 65536)
265                 error("WCTABLE size can't be larger than 65536");
266 
267             has_wctable = true;
268 
269             if (FileHeader.MaximumCharacterSize == 1)
270             {
271                 while (table_size--)
272                 {
273                     if (!get_clean_line(input, curr_line))
274                         error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
275 
276                     uint8_t mb;
277                     uint16_t wc;
278 
279                     try
280                     {
281                         tokenize(curr_line, wc);
282                         tokenize(curr_line, mb);
283                     }
284                     catch(const std::invalid_argument& ia)
285                     {
286                         error(ia.what());
287                     }
288                     if (!curr_line.empty())
289                         error("Garbage after WCTABLE entry: \"" + curr_line + "\"");
290                     wc_table[wc] = mb;
291                 }
292             }
293             else
294             {
295                 uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
296                 while (table_size--)
297                 {
298                     if (!get_clean_line(input, curr_line))
299                         error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
300                     uint16_t mb;
301                     uint16_t wc;
302 
303                     try
304                     {
305                         tokenize(curr_line, wc);
306                         tokenize(curr_line, mb);
307                     }
308                     catch(const std::invalid_argument& ia)
309                     {
310                         error(ia.what());
311                     }
312                     if (!curr_line.empty())
313                         error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
314                     wc_table_dbcs[wc] = mb;
315                 }
316             }
317         }
318         else if (token == "DBCSRANGE")
319         {
320             if (dbcs_range_count != 0)
321                 error("DBCSRANGE can only be declared once");
322 
323             try
324             {
325                 tokenize(curr_line, dbcs_range_count);
326             }
327             catch(const std::invalid_argument& ia)
328             {
329                 error(ia.what());
330             }
331             if (dbcs_range_count > (MAXIMUM_LEADBYTES / 2))
332                 error("DBCSRANGE count can't exceed " + std::to_string(MAXIMUM_LEADBYTES / 2));
333             if (!curr_line.empty())
334                 error("Garbage after DBCSRANGE token");
335 
336             std::size_t current_offset = 0;
337 
338             uint16_t range_count = dbcs_range_count;
339             uint16_t current_range = 0;
340             while (range_count--)
341             {
342                 if (!get_clean_line(input, curr_line))
343                     error("Expected new range after DBCSRANGE");
344 
345                 uint8_t RangeStart, RangeEnd;
346                 try
347                 {
348                     tokenize(curr_line, RangeStart);
349                     tokenize(curr_line, RangeEnd);
350                 }
351                 catch(const std::invalid_argument& ia)
352                 {
353                     error(ia.what());
354                 }
355                 if (!curr_line.empty())
356                     error("Garbage after DBCS range declaration");
357 
358                 if (RangeStart > RangeEnd)
359                     error("Invalid range specified for DBCSRANGE");
360 
361                 FileHeader.LeadByte[current_range*2] = RangeStart;
362                 FileHeader.LeadByte[current_range*2+1] = RangeEnd;
363                 current_range++;
364 
365                 dbcs_table.resize(dbcs_table.size() + 256 * (RangeEnd - RangeStart + 1), FileHeader.UniDefaultChar);
366 
367                 for (uint8_t LeadByte = RangeStart; LeadByte <= RangeEnd; LeadByte++)
368                 {
369                     if (!get_clean_line(input, curr_line))
370                         error("Expected new DBCSTABLE after DBCS range declaration");
371 
372                     tokenize(curr_line, token);
373                     if (token != "DBCSTABLE")
374                         error("Expected new DBCSTABLE after DBCS range declaration");
375 
376                     uint16_t table_size;
377                     try
378                     {
379                         tokenize(curr_line, table_size);
380                     }
381                     catch(const std::invalid_argument& ia)
382                     {
383                         error(ia.what());
384                     }
385                     if (table_size > 256)
386                         error("DBCSTABLE can't have more than 256 entries");
387                     while (table_size--)
388                     {
389                         if (!get_clean_line(input, curr_line))
390                             error("Expected " + std::to_string(table_size + 1) + " more lines after DBCSTABLE token");
391 
392                         uint8_t mb;
393                         uint16_t wc;
394 
395                         try
396                         {
397                             tokenize(curr_line, mb);
398                             tokenize(curr_line, wc);
399                         }
400                         catch(const std::invalid_argument& ia)
401                         {
402                             error(ia.what());
403                         }
404                         if (!curr_line.empty())
405                             error("Garbage after DBCSTABLE entry: \"" + curr_line + "\"");
406 
407                         dbcs_table[current_offset + mb] = wc;
408                     }
409                     current_offset += 256;
410                     /* Offsets start at 256 for the offset table. */
411                     lb_offsets[LeadByte] = current_offset;
412                 }
413             }
414         }
415         else if (token == "GLYPHTABLE")
416         {
417             uint16_t table_size;
418             try
419             {
420                 tokenize(curr_line, table_size);
421             }
422             catch(const std::invalid_argument& ia)
423             {
424                 error(ia.what());
425             }
426             if (has_glyphs)
427                 error("GLYPHTABLE can only be declared once");
428             if (table_size > 256)
429                 error("GLYPHTABLE size can't be larger than 256");
430             if (!curr_line.empty())
431                 error("Garbage after GLYPHTABLE statement: \"" + curr_line + "\"");
432             has_glyphs = true;
433 
434             while (table_size--)
435             {
436                 if (!get_clean_line(input, curr_line))
437                     error("Expected " + std::to_string(table_size + 1) + " more lines after GLYPHTABLE token");
438 
439                 uint8_t mb;
440                 uint16_t wc;
441 
442                 try
443                 {
444                     tokenize(curr_line, mb);
445                     tokenize(curr_line, wc);
446                 }
447                 catch(const std::invalid_argument& ia)
448                 {
449                     error(ia.what());
450                 }
451                 if (!curr_line.empty())
452                     error("Garbage after GLYPHTABLE entry: \"" + curr_line + "\"");
453                 glyph_table[mb] = wc;
454             }
455         }
456         else
457         {
458             error("Unexpected token \"" + token + "\"");
459         }
460     }
461 
462     if (token != "ENDCODEPAGE")
463         error("Expected last token to be \"ENDCODEPAGE\"");
464 
465     input.close();
466 
467     /* Ensure this is minimally workable */
468     if (!has_mbtable)
469         error("File has no MBTABLE statement");
470     if (!has_wctable)
471         error("File has no WCTABLE statement");
472 
473     /* Glyph table fixup */
474     if (has_glyphs)
475     {
476         for(int i = 0; i < 256; i++)
477         {
478             if (glyph_table[i] == 0)
479                 glyph_table[i] = mb_table[i];
480         }
481     }
482 
483     /* Translated default char fixup */
484     if (FileHeader.MaximumCharacterSize == 1)
485     {
486         FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
487         FileHeader.TransUniDefaultChar = wc_table[FileHeader.UniDefaultChar];
488     }
489     else
490     {
491         if (FileHeader.DefaultChar > 0xFF)
492         {
493             uint16_t offset = lb_offsets[FileHeader.DefaultChar >> 8];
494             if (!offset)
495                 error("Default MB char is not translatable!");
496             FileHeader.TransDefaultChar = dbcs_table[(FileHeader.DefaultChar & 0xFF) + (offset - 256)];
497         }
498         else
499         {
500             FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
501         }
502         uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
503         FileHeader.TransUniDefaultChar = wc_table_dbcs[FileHeader.UniDefaultChar];
504     }
505     FileHeader.HeaderSize = sizeof(NLS_FILE_HEADER) / sizeof(uint16_t);
506 
507     std::ofstream output(argv[2], std::ios_base::binary);
508 
509     output.write(reinterpret_cast<char*>(&FileHeader), sizeof(FileHeader));
510 
511     uint16_t wc_table_offset = sizeof(mb_table) / sizeof(uint16_t)
512                                + 1                                  /* size of glyph table */
513                                + (has_glyphs ? 256 : 0)             /* Glyph table */
514                                + 1                                  /* Number of DBCS LeadByte ranges */
515                                + (dbcs_range_count ? 256 : 0)       /* offsets of lead byte sub tables */
516                                + dbcs_table.size()                  /* LeadByte sub tables */
517                                + 1;                                 /* Unknown flag */
518 
519     output.write(reinterpret_cast<char*>(&wc_table_offset), sizeof(wc_table_offset));
520 
521     output.write(reinterpret_cast<char*>(mb_table), sizeof(mb_table));
522 
523     uint16_t glyph_table_size = has_glyphs ? 256 : 0;
524     output.write(reinterpret_cast<char*>(&glyph_table_size), sizeof(glyph_table_size));
525     if (has_glyphs)
526         output.write(reinterpret_cast<char*>(glyph_table), sizeof(glyph_table));
527 
528     output.write(reinterpret_cast<char*>(&dbcs_range_count), sizeof(dbcs_range_count));
529     if (dbcs_range_count)
530     {
531         output.write(reinterpret_cast<char*>(lb_offsets), sizeof(lb_offsets));
532     }
533     if (dbcs_table.size())
534     {
535         output.write(reinterpret_cast<char*>(dbcs_table.data()), dbcs_table.size() * sizeof(uint16_t));
536     }
537 
538     uint16_t unknown_flag = FileHeader.MaximumCharacterSize == 1 ? 0 : 4;
539     output.write(reinterpret_cast<char*>(&unknown_flag), sizeof(unknown_flag));
540 
541     output.write(reinterpret_cast<char*>(wc_table), 65536 * FileHeader.MaximumCharacterSize);
542 
543     output.close();
544     delete[] wc_table;
545 
546     return 0;
547 }
548