1#!/usr/bin/env slsh
2% The unicode database contains 15 fields
3
% Print the expected command line on stderr and terminate the script
% with exit status 1.
private define usage ()
{
   variable script = __argv[0];

   () = fprintf (stderr, "Usage: %s UnicodeData.txt EastAsianWidth.txt\n", script);
   exit (1);
}
9
% The script takes exactly two arguments; __argv[0] is the script itself.
!if (__argc == 3)
  {
     usage ();
  }

% Paths of the two input files, in command-line order.
private variable Unicode_Data_File = __argv[1];
private variable East_Asian_File = __argv[2];
15
% Create the character table: a structure with one array of length num
% per column.
%
%   code_point     Int32_Type, entry i is initialised to i
%   *_mapping      Int32_Type, left at the array default
%   is_mirrored    Char_Type, left at the array default
%   all others     String_Type; only char_name, general_cat,
%                  combining_class, bidirectional_cat, char_decomp_map
%                  and east_asian_prop are filled with "".
%
% Returns the new structure.
private define make_char_def_table (num)
{
   variable tbl = struct
     {
        code_point,
        char_name,
        general_cat,
        combining_class,
        bidirectional_cat,
        char_decomp_map,
        decimal_digit_val,
        digit_val,
        numeric_val,
        is_mirrored,
        unicode1_name,
        iso10646_comment,
        uppercase_mapping,
        lowercase_mapping,
        titlecase_mapping,
        east_asian_prop,
     };
   variable name;

   tbl.code_point = Int32_Type[num];
   tbl.code_point[*] = [0:num-1];

   % String columns whose entries start out as the empty string.
   foreach name (["char_name", "general_cat", "combining_class",
                  "bidirectional_cat", "char_decomp_map", "east_asian_prop"])
     {
        variable column = String_Type[num];
        column[*] = "";
        set_struct_field (tbl, name, column);
     }

   % String columns that are only allocated, not filled.
   foreach name (["decimal_digit_val", "digit_val", "numeric_val",
                  "unicode1_name", "iso10646_comment"])
     set_struct_field (tbl, name, String_Type[num]);

   tbl.is_mirrored = Char_Type[num];

   foreach name (["lowercase_mapping", "uppercase_mapping", "titlecase_mapping"])
     set_struct_field (tbl, name, Int32_Type[num]);

   return tbl;
}
56
% Fill in the interior of each code point range.
%
% starts[k] and stops[k] are the indices of the first and last entry of
% range k.  For every column of s except code_point, the entries
% strictly between the two are set to the value stored at the first
% entry.  The entry at stops[k] itself is not touched.
private define fixup_ranges (starts, stops, s)
{
   variable name, k;
   variable last_range = length (starts) - 1;

   foreach name (get_struct_field_names (s))
     {
        % code_point already holds the proper value for every index.
        if (name == "code_point")
          continue;

        variable column = get_struct_field (s, name);
        _for k (0, last_range, 1)
          {
             variable first = starts[k];
             column[[first+1:stops[k]-1]] = column[first];
          }
     }
}
79
% Convert a string of hexadecimal digits (surrounding whitespace is
% ignored) to an integer.  An empty string yields 0 because the digits
% are appended to "0x0" before conversion.
private define hexstr_to_int (s)
{
   variable digits = strtrim (s);
   return integer ("0x0" + digits);
}
84
% Read the semicolon separated unicode data file and return the
% character table built from it (see make_char_def_table).
%
% The code point on the last non-blank line determines the size of the
% table.  Entries whose general category is "Cs" (surrogates) are
% skipped.  Names of the form <xxx, First> / <xxx, Last> delimit ranges
% whose interior is filled in by fixup_ranges.  Case mappings that are
% absent from the file (read as 0) are replaced by the code point itself.
%
% An error is generated if the file cannot be opened or read, contains
% no data, has a line with fewer than 15 fields, or if the number of
% First and Last markers differ.
private define read_file (file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s: %s", file, errno_string (errno));

   variable lines = fgetslines (fp);
   () = fclose (fp);
   if (lines == NULL)
     verror ("Error reading %s", file);

   % Drop blank lines so that a trailing empty line cannot be mistaken
   % for the last record.
   if (length (lines))
     lines = lines[where (array_map (Int_Type, &strlen,
                                     array_map (String_Type, &strtrim, lines)))];
   if (length (lines) == 0)
     verror ("%s contains no data", file);

   % Get the code point of the last line since it determines the number of
   % code points
   variable num = 1+hexstr_to_int (strchop (lines[-1], ';', 0)[0]);

   variable s = make_char_def_table (num);
   variable is_range = Char_Type[num];   %  1: range start, -1: range end
   variable i, j;
   variable line;

   foreach line (lines)
     {
        variable fields = strchop (line, ';', 0);

        if (length (fields) < 15)
          verror ("%s: expected 15 fields but found %d: %s",
                  file, length (fields), strtrim (line));

        if (fields[2] == "Cs")
          continue;                    %  surrogate

        i = hexstr_to_int (fields[0]);

        variable field = fields[1];
        s.char_name[i] = field;
        % A range is specified if the field is of the form <xxx, First>
        % or <xxx, Last>
        if (field[0] == '<')
          {
             if (string_match (field, ", First>$", 1))
               is_range[i] = 1;
             else if (string_match (field, ", Last>$", 1))
               is_range[i] = -1;
          }

        s.general_cat[i] = fields[2];
        s.combining_class[i] = fields[3];
        s.bidirectional_cat[i] = fields[4];
        s.char_decomp_map[i] = fields[5];
        s.decimal_digit_val[i] = fields[6];
        s.digit_val[i] = fields[7];
        s.numeric_val[i] = fields[8];
        s.is_mirrored[i] = (fields[9] == "Y");
        s.unicode1_name[i] = fields[10];
        s.iso10646_comment[i] = fields[11];
        s.uppercase_mapping[i] = hexstr_to_int (fields[12]);
        s.lowercase_mapping[i] = hexstr_to_int (fields[13]);
        s.titlecase_mapping[i] = hexstr_to_int (fields[14]);
     }

   i = where (is_range == 1);
   if (length (i))
     {
        j = where (is_range == -1);
        if (length (i) != length (j))
          verror ("First and Last ranges do not match");

        fixup_ranges (i, j, s);
     }

   % An empty mapping field was converted to 0: such characters map to
   % themselves.
   i = where (s.lowercase_mapping == 0);
   s.lowercase_mapping[i] = s.code_point[i];
   i = where (s.uppercase_mapping == 0);
   s.uppercase_mapping[i] = s.code_point[i];
   i = where (s.titlecase_mapping == 0);
   s.titlecase_mapping[i] = s.code_point[i];

   return s;
}
153
% Read the east asian width file and store the property of each listed
% code point, or range of code points "XXXX..YYYY", in s.east_asian_prop.
%
% Blank lines and lines whose first non-blank character is '#' are
% ignored.  On the remaining lines the first two tokens separated by
% ';' or ' ' are the code (or range) and the property; anything after
% them is ignored.
%
% An error is generated if the file cannot be opened or if a line does
% not contain both tokens.
private define read_east_asian_file (s, file)
{
   variable fp = fopen (file, "r");
   if (fp == NULL)
     verror ("Unable to open %s: %s", file, errno_string (errno));

   variable line;
   foreach line (fp) using ("line")
     {
        % Trim before looking for '#' so that indented comments are
        % also recognized.
        line = strtrim (line);
        !if (strlen (line))
          continue;
        if (line[0] == '#')
          continue;

        variable fields = strtok (line, "; ");
        if (length (fields) < 2)
          verror ("%s: malformed line: %s", file, line);

        variable code = fields[0];
        variable prop = fields[1];

        if (is_substr (code, ".."))
          {
             variable limits = strtok (code, ".");
             variable code_start = hexstr_to_int (limits[0]);
             variable code_stop = hexstr_to_int (limits[1]);
             s.east_asian_prop[[code_start:code_stop]] = prop;
             continue;
          }

        s.east_asian_prop[hexstr_to_int (code)] = prop;
     }

   () = fclose (fp);
}
184
% Character class bit masks.  make_char_classes combines them into one
% value per code point and write_char_classes emits them as the
% SLCHARCLASS_* macros.
private variable LOWER	= (1 shl 0);	%  0x0001
private variable UPPER	= (1 shl 1);	%  0x0002
private variable ALPHA	= (1 shl 2);	%  0x0004
private variable XDIGIT	= (1 shl 3);	%  0x0008
private variable SPACE	= (1 shl 4);	%  0x0010
private variable BLANK	= (1 shl 5);	%  0x0020
private variable CNTRL	= (1 shl 6);	%  0x0040
private variable PRINT	= (1 shl 7);	%  0x0080

private variable DIGIT	= (1 shl 8);	%  0x0100
private variable GRAPH	= (1 shl 9);	%  0x0200
private variable ALNUM	= (1 shl 10);	%  0x0400
private variable PUNCT	= (1 shl 11);	%  0x0800
private variable ASCII	= (1 shl 12);	%  0x1000

% C element type and element format of the generated classification table.
private variable Classification_C_Table_Type = "_pSLuint16_Type";
private variable Classification_C_Table_Format = "0x%04X";
202
% Create (or truncate) an output file, write the "automatically
% created" banner comment naming this script, and return the open file
% pointer.  An error is generated if the file cannot be opened for
% writing.
private define init_file (file)
{
   variable fp = fopen (file, "w");
   if (fp == NULL)
     verror ("Unable to create %s: %s", file, errno_string (errno));

   () = fprintf (fp, "/* This file was automatically created by %s */\n", __argv[0]);

   return fp;
}
211
% Warn on stderr when the values in `what' do not fit the C type named
% by datatype.
%
% The warning names table_name and the code point (taken from
% s.code_point) of the first value that is out of range.  Nothing is
% checked for "bit".  An unknown datatype produces a "not supported"
% message instead.  The function never generates an error and returns
% nothing.
private define check_data_type (datatype, s, what, table_name)
{
   variable lo, hi;

   switch (datatype)
     { case "bit": return; }
     { case "char": (lo, hi) = (-128, 127); }
     { case "unsigned char": (lo, hi) = (0, 255); }
     { case "_pSLint16_Type": (lo, hi) = (-32768, 32767); }
     { case "_pSLuint16_Type": (lo, hi) = (0, 0xFFFF); }
     { case "_pSLint32_Type": (lo, hi) = (-2147483648, 0x7FFFFFFF); }
     { case "_pSLuint32_Type": (lo, hi) = (0, 0xFFFFFFFFUL); }
     {
        () = fprintf (stderr, "check_data_type: %s not supported\n", datatype);
        return;
     }

   variable bad = wherenot (lo <= what <= hi);
   if (length (bad))
     () = fprintf (stderr, "***WARNING: table for %s needs a larger type for char 0x%04X\n", table_name, s.code_point[bad[0]]);
}
257
% Write a two-level lookup table for the values in `what' as C source.
%
%   fp          name of the file to create (via init_file) or an already
%               open file pointer.  The file is closed before returning.
%   s           character table.  s.code_point supplies the index of
%               each value; s is also handed to check_data_type.
%   what        array with one value per code point.
%   datatype    C element type of the tables, or "bit" to pack the
%               values using 1, 2, 4 or 8 bits each into unsigned chars.
%   table_name  used to form the macro names SL_<NAME>_LOOKUP,
%               SL_<NAME>_ALOOKUP, SL_<NAME>_MAX_CHAR and the C table
%               _pSLwc_<name>_Table.
%   format      printf format of a single table element.
%   shift_bits  each sub-table has (1 shl shift_bits) elements; must be
%               at least 3 since elements are written 8 per line.
%   greater_than_max_value
%               value that the non-boolean lookup macros produce for
%               characters that are not covered by the table; also used
%               to pad the input of a bit table when it is non-zero.
%
% Identical sub-tables are written only once.  For bit tables the
% all-zero sub-table is not written at all and is represented by NULL.
% An estimate of the table size is reported via vmessage.
private define write_toxxx_table (fp, s, what, datatype,
				 table_name, format, shift_bits,
				 greater_than_max_value)
{
   variable ch = s.code_point;
   variable use_bitmap = 0;
   variable i, j, k;
   variable bits_per_value;

   check_data_type (datatype, s, what, table_name);

   if (datatype == "bit")
     {
        % Find the smallest number of bits able to hold max(what).
        % shift_bits_offset ends up as log2 of the number of values per
        % byte: 3, 2, 1, 0 for 1, 2, 4, 8 bits per value.
        variable max_what = max(what);
        bits_per_value = -1;
        variable shift_bits_offset = 4;
        foreach ([1,2,4,8])
          {
             i = ();
             shift_bits_offset--;
             if (max_what >=  (1 shl i))
               continue;

             bits_per_value = i;
             break;
          }

        if (bits_per_value == -1)
          verror ("bit data type cannot represent this object\n");

        datatype = "unsigned char";
        use_bitmap = 1;
     }

   if (use_bitmap)
     {
        % From here on ch is the index of the byte holding each value.
        variable num_values_per_8bits = 8/bits_per_value;
        ch = ch/num_values_per_8bits;
     }

   % Take advantage of the sparseness of the table.  To this end, write
   % N tables with nentries per table.
   variable nentries = (1 shl shift_bits);
   variable ntables = max(ch)/nentries + 1;

   variable data = Int_Type[ntables * nentries];

   if (use_bitmap)
     {
        % i = number of bytes needed to hold all of the values
        i = length(what)/num_values_per_8bits;
        if (i * num_values_per_8bits < length(what))
          i++;

        if (greater_than_max_value)
          {
             % Fill out the last byte with the correct value for
             % characters beyond the tabulated range.
             vmessage ("Padding table: num_values_per_8bits = %d", num_values_per_8bits);
             variable new_what = @Array_Type(_typeof(what), [i*num_values_per_8bits]);
             new_what[[0:length(what)-1]] = what;
             new_what[[length(what):]] = greater_than_max_value;
             what = new_what;
          }
        variable bitmap = UChar_Type[i];

        % Pack the values: value number b of each byte occupies bits
        % b*bits_per_value ... (b+1)*bits_per_value-1 of that byte.
        variable bit = 0;
        _for (0, num_values_per_8bits-1, 1)
          {
             variable b = ();
             variable values = what[[b::num_values_per_8bits]];
             _for (0, bits_per_value-1, 1)
               {
                  k = ();
                  i = where (values & (1 shl k));
                  bitmap[i] |= (1 shl bit);
                  bit++;
               }
          }
        what = bitmap;
     }

   data[[0:max(ch)]] = what;

   % unique_tables holds the distinct sub-tables; tables[i] is the index
   % into unique_tables of sub-table i.  Entry 0 is always the all-zero
   % table.
   variable unique_tables = Array_Type[ntables];
   variable tables = Int_Type[ntables];

   variable num_unique = 0;
   unique_tables[0] = [1:nentries]*0;
   num_unique = 1;

   _for (0, ntables-1, 1)
     {
        i = ();
        variable table = data[nentries*i + [0:nentries-1]];

        j = 0;
        while (j < num_unique)
          {
             if (0 == length (where (unique_tables[j] != table)))
               break;
             j++;
          }

        tables[i] = j;
        if (j == num_unique)
          {
             unique_tables[num_unique] = table;
             num_unique++;
          }
     }

   % How many tables do we really need?  Trailing all-zero tables are
   % dropped, but at least one entry is kept so that the generated C
   % array is never empty.
   i = where (tables != 0);
   if (length (i))
     ntables = 1 + i[-1];
   else
     ntables = 1;

   if (typeof (fp) == String_Type)
     fp = init_file (fp);

   variable bitmap_multiplier = 1;
   if (use_bitmap)
     bitmap_multiplier = num_values_per_8bits;

   variable table_lookup_name = sprintf ("SL_%s_LOOKUP", strup(table_name));
   variable max_char_name = sprintf ("SL_%s_MAX_CHAR", strup(table_name));
   variable assign_lookup_name = sprintf ("SL_%s_ALOOKUP", strup(table_name));

   table_name = sprintf ("_pSLwc_%s_Table", table_name);

   () = fprintf (fp, "#define %s 0x%Xul\n\n", max_char_name,
                 bitmap_multiplier * ntables * nentries);

   if (use_bitmap == 0)
     {
        () = fprintf (fp, "#define %s(x) \\\n", table_lookup_name);
        () = fprintf (fp, "  (((unsigned)(x)>=%s)?%d:(%s[(unsigned)(x)>>%d][(unsigned)(x)&0x%X]))\n\n",
                      max_char_name, greater_than_max_value, table_name, shift_bits, nentries-1);
     }
   else if (num_values_per_8bits == 8) %  boolean (0 or 1)
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, "   const %s *_t; \\\n", datatype);
        ()=fprintf(fp, "   (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, "	  && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, "	  && (_t[(unsigned)((x)>>%d)&0x%X] & (%d << ((x)&%d)))); \\\n",
                   shift_bits_offset, nentries - 1, int(2^bits_per_value-1), num_values_per_8bits-1);
        ()=fprintf(fp, "}\n");
     }
   else % bit mapped with num_values_per_8bits = 1,2, or 4
     {
        ()=fprintf(fp, "#define %s(y,x) \\\n", assign_lookup_name);
        ()=fprintf(fp, "{ \\\n");
        ()=fprintf(fp, "   const %s *_t; \\\n", datatype);
        ()=fprintf(fp, "   (y) = (((unsigned)(x) < %s) \\\n", max_char_name);
        ()=fprintf(fp, "	  && (NULL != (_t = %s[(unsigned)(x)>>%d])) \\\n",
                   table_name, shift_bits_offset + shift_bits);
        ()=fprintf(fp, "	  ? ((_t[(unsigned)((x)>>%d)&0x%X]>>(%d*((x)&%d)))&%d) : %d); \\\n",
                   shift_bits_offset, nentries - 1, bits_per_value,
                   num_values_per_8bits-1, int(2^bits_per_value-1),
                   greater_than_max_value);
        ()=fprintf(fp, "}\n");
     }

   () = fprintf (fp, "extern const %s *%s[%d];\n\n", datatype, table_name, ntables);

   () = fprintf (fp, "#ifdef DEFINE%s\n", strup (table_name));

   % One output line holds 8 elements preceded by their index range.
   format = [format, format, format, format, format, format, format, format];
   format = strcat ("  /*0x%02X-0x%02X*/ ", strjoin (format, ", "));

   _for (0, num_unique-1, 1)
     {
        i = ();
        % The all-zero table of a bitmap is represented by NULL.
        if ((i == 0) and use_bitmap)
          continue;

        () = fprintf (fp, "static const %s Table_%02d[%d] =\n{\n",
                      datatype, i, nentries);

        table = unique_tables[i];
        _for (0, nentries-1, 8)
          {
             j = ();
             if (j)
               () = fputs (",\n", fp);
             () = fprintf (fp, format,
                           j, (j+7),
                           table[j], table[j+1], table[j+2], table[j+3],
                           table[j+4], table[j+5], table[j+6], table[j+7]);
          }
        () = fputs ("\n};\n\n", fp);
     }

   () = fprintf (fp, "const %s *%s[%d] =\n{", datatype, table_name, ntables);
   i = 0;
   while (i < ntables)
     {
        if (i) () = fputs (",", fp);

        !if (i mod 6)
          () = fputs ("\n", fp);

        if (use_bitmap and (tables[i] == 0))
          () = fprintf (fp, "      NULL");
        else
          () = fprintf (fp, "  Table_%02d", tables[i]);

        i++;
     }
   () = fputs ("\n};\n", fp);
   () = fprintf (fp, "#endif /* DEFINE%s */\n", strup(table_name));

   () = fclose (fp);

   % Element size in bytes used for the size estimate.  The 16 bit types
   % are named _pSLint16_Type/_pSLuint16_Type, hence the test for "16".
   variable size;

   if (is_substr (datatype, "char"))
     size = 1;
   else if (is_substr (datatype, "16"))
     size = 2;
   else if (is_substr (datatype, "short"))
     size = 2;
   else size = 4;

   % The estimate assumes 4 bytes per pointer in the first-level table.
   if (use_bitmap == 0)
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*num_unique);
     }
   else
     {
        vmessage ("Estimated table size: %d bytes",
                  4*ntables + size*nentries*(num_unique-1));
     }
}
493
% Compute the character class bits of every code point in the table s
% and return them as a UShort_Type array indexed by code point.
%
% The classes are derived in order, later ones building upon the bits
% already set:
%   LOWER/UPPER  from the case mappings, then from general category
%                "Ll"/"Lu" unless the opposite case is already set
%   ALPHA        upper, lower, or general category starting with 'L'
%   XDIGIT       0-9, A-F, a-f
%   SPACE/BLANK  space and tab are both; \n \r \f \v are SPACE, as are
%                the 'Z' categories whose decomposition is not <noBreak>
%   CNTRL        characters named "<control>"
%   PRINT        named characters that are not CNTRL
%   DIGIT        XDIGIT but not ALPHA
%   GRAPH        PRINT but not SPACE
%   ALNUM        ALPHA or DIGIT
%   PUNCT        GRAPH but not ALNUM
%   ASCII        code points below 0x80
private define make_char_classes (s)
{
   variable cp = s.code_point;
   variable gcat = s.general_cat;
   variable gcat_initial = int (gcat);
   variable lower = s.lowercase_mapping;
   variable upper = s.uppercase_mapping;
   variable title = s.titlecase_mapping;
   variable cc = UShort_Type[length(cp)];
#iftrue
   % LOWER and UPPER according to the case mappings
   cc[where ((cp == lower) and (cp != upper))] |= LOWER;
   cc[where (((cp == upper) or (cp == title)) and (cp != lower))] |= UPPER;
#endif
   % LOWER and UPPER according to the general category
   cc[where ((gcat == "Ll") and (0 == (cc & UPPER)))] |= LOWER;
   cc[where ((gcat == "Lu") and (0 == (cc & LOWER)))] |= UPPER;

   % ALPHA
   cc[where ((cc & (UPPER|LOWER)) or (gcat_initial == 'L'))] |= ALPHA;

   % XDIGIT
   cc[where (('0' <= cp <= '9') or ('A' <= cp <= 'F') or ('a' <= cp <= 'f'))]
     |= XDIGIT;

   % SPACE, BLANK
   cc[[' ', '\t']] |= SPACE|BLANK;
   cc[['\n', '\r', '\f', '\v']] |= SPACE;
   % Disabled alternative: cc[where (s.bidirectional_cat == "WS")] |= SPACE;
   variable is_nobreak = array_map (Int_Type, &is_substr, s.char_decomp_map, "<noBreak>");
   cc[where ((gcat_initial == 'Z') and not is_nobreak)] |= SPACE;

   % CNTRL
   cc[where (s.char_name == "<control>")] |= CNTRL;

   % PRINT
   cc[where ((s.char_name != "") and not (cc & CNTRL))] |= PRINT;

   % DIGIT
   cc[where ((cc & XDIGIT) and not (cc & ALPHA))] |= DIGIT;

   % GRAPH
   cc[where ((cc & PRINT) and not (cc & SPACE))] |= GRAPH;

   % ALNUM
   cc[where (cc & (ALPHA|DIGIT))] |= ALNUM;

   % PUNCT
   cc[where ((cc & GRAPH) and not (cc & ALNUM))] |= PUNCT;

   % ASCII
   cc[where (cp < 0x80)] |= ASCII;

   return cc;
}
568
% Create `file' containing one SLCHARCLASS_* macro per character class
% bit followed by the classification lookup table for char_classes.
% The file is handed to write_toxxx_table as an open file pointer.
private define write_char_classes (file, s, char_classes)
{
   variable out = init_file (file);

   % Parallel arrays: macro definition format and the mask it prints.
   variable formats =
     ["#define SLCHARCLASS_LOWER\t0x%04X\n",
      "#define SLCHARCLASS_UPPER\t0x%04X\n",
      "#define SLCHARCLASS_ALPHA\t0x%04X\n",
      "#define SLCHARCLASS_XDIGIT\t0x%04X\n",
      "#define SLCHARCLASS_SPACE\t0x%04X\n",
      "#define SLCHARCLASS_BLANK\t0x%04X\n",
      "#define SLCHARCLASS_CNTRL\t0x%04X\n",
      "#define SLCHARCLASS_PRINT\t0x%04X\n",
      "#define SLCHARCLASS_DIGIT\t0x%04X\n",
      "#define SLCHARCLASS_GRAPH\t0x%04X\n",
      "#define SLCHARCLASS_ALNUM\t0x%04X\n",
      "#define SLCHARCLASS_PUNCT\t0x%04X\n",
      "#define SLCHARCLASS_ASCII\t0x%04X\n"];
   variable masks =
     [LOWER, UPPER, ALPHA, XDIGIT, SPACE, BLANK, CNTRL, PRINT,
      DIGIT, GRAPH, ALNUM, PUNCT, ASCII];

   variable k;
   _for k (0, length (masks)-1, 1)
     () = fprintf (out, formats[k], masks[k]);

   () = fputs ("\n\n", out);

   write_toxxx_table (out, s, char_classes, Classification_C_Table_Type,
                      "Classification", Classification_C_Table_Format, 8, 0);
}
589
% Read both input files and generate the headers slwcwidth.h,
% slcombin.h, sllower.h, slupper.h and slischar.h in the current
% directory.
private define main ()
{
   variable s = read_file (Unicode_Data_File);
   read_east_asian_file (s, East_Asian_File);

   variable char_classes = make_char_classes (s);
   variable ch = s.code_point;
   variable is_combining = ((s.general_cat == "Mn") or (s.general_cat == "Me"));
   % Note: "Mc" (combining, yet spacing) is omitted here since I do
   % not know what that means.

   % Apparently Hangul (Conjoining Jamo) characters 0x1160 - 0x11FF
   % _behave_ like combining characters, but are not flagged as such in
   % the database.
   is_combining[[0x1160:0x11FF]] = 1;
#ifnfalse
   % Width codes: 0, 1, 2 = number of columns, 3 = ambiguous,
   % 4 = displayed as <xx>.  Later assignments override earlier ones.
   variable width = UChar_Type[length(ch)];
   width[*] = 1;
   width[where (s.east_asian_prop == "W")] = 2;
   width[where (s.east_asian_prop == "F")] = 2;
   width[where (s.east_asian_prop == "A")] = 3;   %  ambiguous

   width[where (s.general_cat == "Cf")] = 0;
   width[0xAD] = 3;		       %  SOFT-HYPHEN -- mark it as ambiguous

   width[where(is_combining)] = 0;
   %width[where (s.bidirectional_cat == "NSM")] = 0;
   width[where (0 == array_map (Int_Type, &strncmp, s.char_name,
				"ZERO WIDTH", 10))]
     = 0;

   width[[0x80:0x9F]] = 4;	       %  displayed as <xx> by SLsmg

   write_toxxx_table ("slwcwidth.h", s, width, "bit",
		      "Width", "0x%02X", 8, 1);
#endif
   write_toxxx_table ("slcombin.h", s,
		      is_combining,
		      "bit", "Combining", "0x%02X", 6, 0);
   % The case tables store the offset from a character to its mapping.
   write_toxxx_table ("sllower.h", s, s.lowercase_mapping-ch, "_pSLint32_Type",
		      "Tolower", "% 5d", 7, 0);
   write_toxxx_table ("slupper.h", s, s.uppercase_mapping-ch, "_pSLint32_Type",
		      "Toupper", "% 5d", 7, 0);
   write_char_classes ("slischar.h", s, char_classes);

}

main();
639