xref: /reactos/sdk/tools/txt2nls/main.cpp (revision b3194e32)
1 /*
2  * PROJECT:     ReactOS TXT to NLS Converter
3  * LICENSE:     GPL-2.0-or-later (https://spdx.org/licenses/GPL-2.0-or-later)
 * FILE:        sdk/tools/txt2nls/main.cpp
5  * COPYRIGHT:   Copyright 2021 Jérôme Gardou <jerome.gardou@reactos.org>
6  */
7 
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
15 
/* Characters treated as blanks when trimming and tokenizing input lines. */
static const char whitespaces[] = " \t\f\v\n\r";

/*
 * Number of input lines consumed so far. get_clean_line() bumps it once for
 * every line it reads, so starting from 0 makes the value printed by error()
 * the 1-based number of the line being parsed. (Starting from -1 made every
 * diagnostic report a number one lower than the one shown by text editors.)
 */
static long line_number = 0;
18 
/* Number of bytes reserved in the header for DBCS lead byte ranges. */
#define MAXIMUM_LEADBYTES 12

/*
 * Header written verbatim at the start of the generated file, hence the
 * 1-byte packing. main() zeroes it and then fills it from the input text.
 */
#pragma pack(push, 1)
struct NLS_FILE_HEADER
{
    /* Size of this header, stored by main() in 16-bit units */
    uint16_t HeaderSize;
    /* Value of the CODEPAGE statement */
    uint16_t CodePage;
    /* First CPINFO value; main() only accepts 1 or 2 */
    uint16_t MaximumCharacterSize;
    /* Second CPINFO value */
    uint16_t DefaultChar;
    /* Third CPINFO value */
    uint16_t UniDefaultChar;
    /* DefaultChar looked up in the MB (or DBCS) table */
    uint16_t TransDefaultChar;
    /* UniDefaultChar looked up in the WC table */
    uint16_t TransUniDefaultChar;
    /* Start/end pairs of the declared DBCS ranges; unused slots stay 0 */
    uint8_t LeadByte[MAXIMUM_LEADBYTES];
};
#pragma pack(pop)

static_assert(sizeof(NLS_FILE_HEADER) == 26, "Wrong size for NLS_FILE_HEADER");
34 
35 static std::istream& get_clean_line(std::istream& stream, std::string& str)
36 {
37     do
38     {
39         std::istream& ret = std::getline(stream, str);
40         if (!ret)
41             return ret;
42 
43         /* Ignore comments */
44         std::size_t comment_pos = str.find_first_of(';');
45         if (comment_pos != std::string::npos)
46         {
47             str.erase(comment_pos);
48         }
49 
50         /* Remove trailing spaces */
51         std::size_t end_of_line = str.find_last_not_of(whitespaces);
52         if (end_of_line != std::string::npos)
53             str.erase(end_of_line + 1);
54         else
55             str.clear();
56 
57         line_number++;
58     } while (str.empty());
59 
60     return stream;
61 }
62 
63 static void tokenize(std::string& str, std::string& token)
64 {
65     std::size_t token_start = str.find_first_not_of(whitespaces);
66     if (token_start == std::string::npos)
67     {
68         token = "";
69         str.clear();
70         return;
71     }
72 
73     std::size_t token_end = str.find_first_of(whitespaces, token_start);
74     if (token_end == std::string::npos)
75     {
76         token = str.substr(token_start);
77         str.clear();
78         return;
79     }
80 
81     token = str.substr(token_start, token_end);
82     str.erase(0, str.find_first_not_of(whitespaces, token_end));
83 }
84 
/*
 * Extracts the first word of `str` (see the string overload) and converts it
 * to an integer stored in `int_token`.
 *
 * `base` is handed to std::stoll; 0 lets the usual 0x / 0 prefixes select the
 * radix.
 *
 * Throws std::invalid_argument when the word is missing, is not entirely a
 * number, or does not fit in T. Every failure is reported with that single
 * exception type because it is the only one the callers catch.
 */
template<typename T>
static void tokenize(std::string& str, T& int_token, int base = 0)
{
    std::string token;
    tokenize(str, token);

    const auto range_message = [&token]()
    {
        return token + " does not fit range ["
            + std::to_string(std::numeric_limits<T>::min()) + ":" + std::to_string(std::numeric_limits<T>::max()) + "]";
    };

    /* long long rather than long: long is only 32 bits wide on LLP64 targets */
    long long val = 0;
    std::size_t parsed_length = 0;
    try
    {
        val = std::stoll(token, &parsed_length, base);
    }
    catch (const std::out_of_range&)
    {
        throw std::invalid_argument(range_message());
    }
    catch (const std::invalid_argument&)
    {
        throw std::invalid_argument("Expected a number, got \"" + token + "\"");
    }

    /* Reject words such as "12abc" that only start with a number */
    if (parsed_length != token.size())
        throw std::invalid_argument("Expected a number, got \"" + token + "\"");

    if ((val > std::numeric_limits<T>::max()) || (val < std::numeric_limits<T>::min()))
        throw std::invalid_argument(range_message());

    int_token = static_cast<T>(val);
}
99 
100 void error(const std::string& err)
101 {
102     std::cerr << "Error parsing line " << line_number <<": " << err << std::endl;
103     std::exit(1);
104 }
105 
106 int main(int argc, char* argv[])
107 {
108     if (argc != 3)
109     {
110         std::cerr << "Usage: " << argv[0] << " <txt_in> <nls_out>" << std::endl;
111         return 1;
112     }
113 
114     std::ifstream input(argv[1]);
115     if (!input.is_open())
116     {
117         std::cerr << "Unable to open " << argv[1] << std::endl;
118         return 1;
119     }
120 
121     NLS_FILE_HEADER FileHeader;
122     memset(&FileHeader, 0, sizeof(FileHeader));
123 
124     std::string curr_line;
125     // Get code page
126     if (!get_clean_line(input, curr_line))
127     {
128         std::cerr << "ERROR: File is empty" << std::endl;
129         return 1;
130     }
131 
132     std::string token;
133     tokenize(curr_line, token);
134     if (token != "CODEPAGE")
135         error("expected CODEPAGE, got \"" + token + "\" instead");
136     try
137     {
138         tokenize(curr_line, FileHeader.CodePage, 10);
139     }
140     catch(const std::invalid_argument& ia)
141     {
142         error(ia.what());
143     }
144 
145     if (!curr_line.empty())
146         error("Garbage after CODEPAGE statement: \"" + curr_line + "\"");
147 
148     /* Get CPINFO */
149     if (!get_clean_line(input, curr_line))
150         error("Nothing after CODEPAGE statement");
151 
152     tokenize(curr_line, token);
153     if (token != "CPINFO")
154         error("Expected CPINFO, got \"" + token + "\" instead");
155     try
156     {
157         tokenize(curr_line, FileHeader.MaximumCharacterSize);
158         tokenize(curr_line, FileHeader.DefaultChar);
159         tokenize(curr_line, FileHeader.UniDefaultChar);
160     }
161     catch(const std::invalid_argument& ia)
162     {
163         error(ia.what());
164         return 1;
165     }
166     if (!curr_line.empty())
167         error("Garbage after CPINFO statement: \"" + curr_line + "\"");
168     if ((FileHeader.MaximumCharacterSize != 1) && (FileHeader.MaximumCharacterSize != 2))
169         error("Expected 1 or 2 as max char size in CPINFO, got \"" + std::to_string(FileHeader.MaximumCharacterSize) + "\" instead");
170     if ((FileHeader.MaximumCharacterSize == 1) && (FileHeader.DefaultChar > std::numeric_limits<uint8_t>::max()))
171         error("Default MB character " + std::to_string(FileHeader.DefaultChar) + " doesn't fit in a 8-bit value");
172 
173     /* Setup tables & default values */
174     bool has_mbtable = false;
175     uint16_t mb_table[256] = {0};
176 
177     bool has_wctable = false;
178     uint8_t* wc_table = new uint8_t[65536 * FileHeader.MaximumCharacterSize];
179     if (FileHeader.MaximumCharacterSize == 1)
180     {
181         for (int i = 0; i < 65536; i++)
182             wc_table[i] = FileHeader.DefaultChar;
183     }
184     else
185     {
186         uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
187         for (int i = 0; i < 65536; i++)
188             wc_table_dbcs[i] = FileHeader.DefaultChar;
189     }
190 
191     std::vector<uint16_t> dbcs_table;
192     uint16_t lb_offsets[256] = {0};
193     uint16_t dbcs_range_count = 0;
194 
195     uint16_t glyph_table[256] = {0};
196     bool has_glyphs = false;
197 
198     /* Now parse */
199     while (get_clean_line(input, curr_line))
200     {
201         tokenize(curr_line, token);
202 
203         if (token == "ENDCODEPAGE")
204         {
205             if (!curr_line.empty())
206                 error("Garbage after ENDCODEPAGE statement: \"" + curr_line + "\"");
207             break;
208         }
209         else if (token == "MBTABLE")
210         {
211             uint16_t table_size;
212             try
213             {
214                 tokenize(curr_line, table_size);
215             }
216             catch(const std::invalid_argument& ia)
217             {
218                 error(ia.what());
219             }
220             if (has_mbtable)
221                 error("MBTABLE can only be declared once");
222             if (table_size > 256)
223                 error("MBTABLE size can't be larger than 256");
224             if (!curr_line.empty())
225                 error("Garbage after MBTABLE statement: \"" + curr_line + "\"");
226 
227             has_mbtable = true;
228             while (table_size--)
229             {
230                 if (!get_clean_line(input, curr_line))
231                     error("Expected " + std::to_string(table_size + 1) + " more lines after MBTABLE token");
232 
233                 uint8_t mb;
234                 uint16_t wc;
235 
236                 try
237                 {
238                     tokenize(curr_line, mb);
239                     tokenize(curr_line, wc);
240                 }
241                 catch(const std::invalid_argument& ia)
242                 {
243                     error(ia.what());
244                 }
245                 if (!curr_line.empty())
246                     error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
247                 mb_table[mb] = wc;
248             }
249         }
250         else if (token == "WCTABLE")
251         {
252             uint32_t table_size;
253             try
254             {
255                 tokenize(curr_line, table_size);
256             }
257             catch(const std::invalid_argument& ia)
258             {
259                 error(ia.what());
260             }
261             if (has_wctable)
262                 error("WCTABLE can only be declared once");
263             if (!curr_line.empty())
264                 error("Garbage after WCTABLE statement: \"" + curr_line + "\"");
265             if (table_size > 65536)
266                 error("WCTABLE size can't be larger than 65536");
267 
268             has_wctable = true;
269 
270             if (FileHeader.MaximumCharacterSize == 1)
271             {
272                 while (table_size--)
273                 {
274                     if (!get_clean_line(input, curr_line))
275                         error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
276 
277                     uint8_t mb;
278                     uint16_t wc;
279 
280                     try
281                     {
282                         tokenize(curr_line, wc);
283                         tokenize(curr_line, mb);
284                     }
285                     catch(const std::invalid_argument& ia)
286                     {
287                         error(ia.what());
288                     }
289                     if (!curr_line.empty())
290                         error("Garbage after WCTABLE entry: \"" + curr_line + "\"");
291                     wc_table[wc] = mb;
292                 }
293             }
294             else
295             {
296                 uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
297                 while (table_size--)
298                 {
299                     if (!get_clean_line(input, curr_line))
300                         error("Expected " + std::to_string(table_size + 1) + " more lines after WCTABLE token");
301                     uint16_t mb;
302                     uint16_t wc;
303 
304                     try
305                     {
306                         tokenize(curr_line, wc);
307                         tokenize(curr_line, mb);
308                     }
309                     catch(const std::invalid_argument& ia)
310                     {
311                         error(ia.what());
312                     }
313                     if (!curr_line.empty())
314                         error("Garbage after MBTABLE entry: \"" + curr_line + "\"");
315                     wc_table_dbcs[wc] = mb;
316                 }
317             }
318         }
319         else if (token == "DBCSRANGE")
320         {
321             if (dbcs_range_count != 0)
322                 error("DBCSRANGE can only be declared once");
323 
324             try
325             {
326                 tokenize(curr_line, dbcs_range_count);
327             }
328             catch(const std::invalid_argument& ia)
329             {
330                 error(ia.what());
331             }
332             if (dbcs_range_count > (MAXIMUM_LEADBYTES / 2))
333                 error("DBCSRANGE count can't exceed " + std::to_string(MAXIMUM_LEADBYTES / 2));
334             if (!curr_line.empty())
335                 error("Garbage after DBCSRANGE token");
336 
337             std::size_t current_offset = 0;
338 
339             uint16_t range_count = dbcs_range_count;
340             uint16_t current_range = 0;
341             while (range_count--)
342             {
343                 if (!get_clean_line(input, curr_line))
344                     error("Expected new range after DBCSRANGE");
345 
346                 uint8_t RangeStart, RangeEnd;
347                 try
348                 {
349                     tokenize(curr_line, RangeStart);
350                     tokenize(curr_line, RangeEnd);
351                 }
352                 catch(const std::invalid_argument& ia)
353                 {
354                     error(ia.what());
355                 }
356                 if (!curr_line.empty())
357                     error("Garbage after DBCS range declaration");
358 
359                 if (RangeStart > RangeEnd)
360                     error("Invalid range specified for DBCSRANGE");
361 
362                 FileHeader.LeadByte[current_range*2] = RangeStart;
363                 FileHeader.LeadByte[current_range*2+1] = RangeEnd;
364                 current_range++;
365 
366                 dbcs_table.resize(dbcs_table.size() + 256 * (RangeEnd - RangeStart + 1), FileHeader.UniDefaultChar);
367 
368                 for (uint8_t LeadByte = RangeStart; LeadByte <= RangeEnd; LeadByte++)
369                 {
370                     if (!get_clean_line(input, curr_line))
371                         error("Expected new DBCSTABLE after DBCS range declaration");
372 
373                     tokenize(curr_line, token);
374                     if (token != "DBCSTABLE")
375                         error("Expected new DBCSTABLE after DBCS range declaration");
376 
377                     uint16_t table_size;
378                     try
379                     {
380                         tokenize(curr_line, table_size);
381                     }
382                     catch(const std::invalid_argument& ia)
383                     {
384                         error(ia.what());
385                     }
386                     if (table_size > 256)
387                         error("DBCSTABLE can't have more than 256 entries");
388                     while (table_size--)
389                     {
390                         if (!get_clean_line(input, curr_line))
391                             error("Expected " + std::to_string(table_size + 1) + " more lines after DBCSTABLE token");
392 
393                         uint8_t mb;
394                         uint16_t wc;
395 
396                         try
397                         {
398                             tokenize(curr_line, mb);
399                             tokenize(curr_line, wc);
400                         }
401                         catch(const std::invalid_argument& ia)
402                         {
403                             error(ia.what());
404                         }
405                         if (!curr_line.empty())
406                             error("Garbage after DBCSTABLE entry: \"" + curr_line + "\"");
407 
408                         dbcs_table[current_offset + mb] = wc;
409                     }
410                     current_offset += 256;
411                     /* Offsets start at 256 for the offset table. */
412                     lb_offsets[LeadByte] = current_offset;
413                 }
414             }
415         }
416         else if (token == "GLYPHTABLE")
417         {
418             uint16_t table_size;
419             try
420             {
421                 tokenize(curr_line, table_size);
422             }
423             catch(const std::invalid_argument& ia)
424             {
425                 error(ia.what());
426             }
427             if (has_glyphs)
428                 error("GLYPHTABLE can only be declared once");
429             if (table_size > 256)
430                 error("GLYPHTABLE size can't be larger than 256");
431             if (!curr_line.empty())
432                 error("Garbage after GLYPHTABLE statement: \"" + curr_line + "\"");
433             has_glyphs = true;
434 
435             while (table_size--)
436             {
437                 if (!get_clean_line(input, curr_line))
438                     error("Expected " + std::to_string(table_size + 1) + " more lines after GLYPHTABLE token");
439 
440                 uint8_t mb;
441                 uint16_t wc;
442 
443                 try
444                 {
445                     tokenize(curr_line, mb);
446                     tokenize(curr_line, wc);
447                 }
448                 catch(const std::invalid_argument& ia)
449                 {
450                     error(ia.what());
451                 }
452                 if (!curr_line.empty())
453                     error("Garbage after GLYPHTABLE entry: \"" + curr_line + "\"");
454                 glyph_table[mb] = wc;
455             }
456         }
457         else
458         {
459             error("Unexpected token \"" + token + "\"");
460         }
461     }
462 
463     if (token != "ENDCODEPAGE")
464         error("Expected last token to be \"ENDCODEPAGE\"");
465 
466     input.close();
467 
468     /* Ensure this is minimally workable */
469     if (!has_mbtable)
470         error("File has no MBTABLE statement");
471     if (!has_wctable)
472         error("File has no WCTABLE statement");
473 
474     /* Glyph table fixup */
475     if (has_glyphs)
476     {
477         for(int i = 0; i < 256; i++)
478         {
479             if (glyph_table[i] == 0)
480                 glyph_table[i] = mb_table[i];
481         }
482     }
483 
484     /* Translated default char fixup */
485     if (FileHeader.MaximumCharacterSize == 1)
486     {
487         FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
488         FileHeader.TransUniDefaultChar = wc_table[FileHeader.UniDefaultChar];
489     }
490     else
491     {
492         if (FileHeader.DefaultChar > 0xFF)
493         {
494             uint16_t offset = lb_offsets[FileHeader.DefaultChar >> 8];
495             if (!offset)
496                 error("Default MB char is not translatable!");
497             FileHeader.TransDefaultChar = dbcs_table[(FileHeader.DefaultChar & 0xFF) + (offset - 256)];
498         }
499         else
500         {
501             FileHeader.TransDefaultChar = mb_table[FileHeader.DefaultChar];
502         }
503         uint16_t* wc_table_dbcs = reinterpret_cast<uint16_t*>(wc_table);
504         FileHeader.TransUniDefaultChar = wc_table_dbcs[FileHeader.UniDefaultChar];
505     }
506     FileHeader.HeaderSize = sizeof(NLS_FILE_HEADER) / sizeof(uint16_t);
507 
508     std::ofstream output(argv[2], std::ios_base::binary);
509 
510     output.write(reinterpret_cast<char*>(&FileHeader), sizeof(FileHeader));
511 
512     uint16_t wc_table_offset = sizeof(mb_table) / sizeof(uint16_t)
513                                + 1                                  /* size of glyph table */
514                                + (has_glyphs ? 256 : 0)             /* Glyph table */
515                                + 1                                  /* Number of DBCS LeadByte ranges */
516                                + (dbcs_range_count ? 256 : 0)       /* offsets of lead byte sub tables */
517                                + dbcs_table.size()                  /* LeadByte sub tables */
518                                + 1;                                 /* Unknown flag */
519 
520     output.write(reinterpret_cast<char*>(&wc_table_offset), sizeof(wc_table_offset));
521 
522     output.write(reinterpret_cast<char*>(mb_table), sizeof(mb_table));
523 
524     uint16_t glyph_table_size = has_glyphs ? 256 : 0;
525     output.write(reinterpret_cast<char*>(&glyph_table_size), sizeof(glyph_table_size));
526     if (has_glyphs)
527         output.write(reinterpret_cast<char*>(glyph_table), sizeof(glyph_table));
528 
529     output.write(reinterpret_cast<char*>(&dbcs_range_count), sizeof(dbcs_range_count));
530     if (dbcs_range_count)
531     {
532         output.write(reinterpret_cast<char*>(lb_offsets), sizeof(lb_offsets));
533     }
534     if (dbcs_table.size())
535     {
536         output.write(reinterpret_cast<char*>(dbcs_table.data()), dbcs_table.size() * sizeof(uint16_t));
537     }
538 
539     uint16_t unknown_flag = FileHeader.MaximumCharacterSize == 1 ? 0 : 4;
540     output.write(reinterpret_cast<char*>(&unknown_flag), sizeof(unknown_flag));
541 
542     output.write(reinterpret_cast<char*>(wc_table), 65536 * FileHeader.MaximumCharacterSize);
543 
544     output.close();
545     delete[] wc_table;
546 
547     return 0;
548 }
549