1------------------------------------------------------------------------------
2--                                                                          --
3--                            Matreshka Project                             --
4--                                                                          --
5--         Localization, Internationalization, Globalization for Ada        --
6--                                                                          --
7--                        Runtime Library Component                         --
8--                                                                          --
9------------------------------------------------------------------------------
10--                                                                          --
11-- Copyright © 2009-2015, Vadim Godunko <vgodunko@gmail.com>                --
12-- All rights reserved.                                                     --
13--                                                                          --
14-- Redistribution and use in source and binary forms, with or without       --
15-- modification, are permitted provided that the following conditions       --
16-- are met:                                                                 --
17--                                                                          --
18--  * Redistributions of source code must retain the above copyright        --
19--    notice, this list of conditions and the following disclaimer.         --
20--                                                                          --
21--  * Redistributions in binary form must reproduce the above copyright     --
22--    notice, this list of conditions and the following disclaimer in the   --
23--    documentation and/or other materials provided with the distribution.  --
24--                                                                          --
25--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
26--    contributors may be used to endorse or promote products derived from  --
27--    this software without specific prior written permission.              --
28--                                                                          --
29-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
30-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
31-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
32-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
33-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
34-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
35-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
36-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
37-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
38-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
39-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
40--                                                                          --
41------------------------------------------------------------------------------
42--  $Revision: 5317 $ $Date: 2015-05-16 02:29:09 +0300 (Sat, 16 May 2015) $
43------------------------------------------------------------------------------
44
45package Matreshka.Internals.Unicode.Ucd is
46
47   pragma Preelaborate;
48
49   type First_Stage_Index is mod 16#1100#;
50   type Second_Stage_Index is mod 16#100#;
   --  Index types of the two-stage tables declared below. First stage has
   --  16#1100# components and each second stage has 16#100# components, so
   --  one pair of stages addresses 16#11_0000# elements in total.
51
52   type Sequence_Count is range 0 .. 2 ** 16 - 1;
53   for Sequence_Count'Size use 16;
   --  Count of elements of a sequence, or position in a sequence; Size is
   --  fixed to 16 bits.
54
55   subtype Sequence_Index is Sequence_Count range 1 .. Sequence_Count'Last;
   --  Index of an element of a sequence. Zero is excluded, thus sequences
   --  declared in this package are indexed starting from 1.
56
57   type Code_Point_Sequence is array (Sequence_Index range <>) of Code_Point;
   --  Unconstrained sequence of code points.
58
59   type Code_Point_Sequence_Access is access constant Code_Point_Sequence;
   --  Read-only reference to a sequence of code points; can be null.
60
61   ---------------------
62   -- Core properties --
63   ---------------------
64
65   type General_Category is
66    (Control,
67     Format,
68     Unassigned,
69     Private_Use,
70     Surrogate,
71
72     Lowercase_Letter,
73     Titlecase_Letter,
74     Uppercase_Letter,
75     Modifier_Letter,
76     Other_Letter,
77
78     Spacing_Mark,
79     Enclosing_Mark,
80     Nonspacing_Mark,
81
82     Decimal_Number,
83     Letter_Number,
84     Other_Number,
85
86     Connector_Punctuation,
87     Dash_Punctuation,
88     Close_Punctuation,
89     Final_Punctuation,
90     Initial_Punctuation,
91     Other_Punctuation,
92     Open_Punctuation,
93
94     Currency_Symbol,
95     Modifier_Symbol,
96     Math_Symbol,
97     Other_Symbol,
98
99     Line_Separator,
100     Paragraph_Separator,
101     Space_Separator);
102   for General_Category'Size use 8;
   --  Values of the General_Category property. Literals are grouped by major
   --  class: other, letter, mark, number, punctuation, symbol and separator.
   --  There are 30 literals (5 bits are enough), Size is fixed to 8 bits.
   --  Position numbers of literals are used as the internal representation,
   --  so literals must not be reordered independently of the data tables.
103
104   type Boolean_Properties is
105    (Composition_Exclusion,         --  XXX
106     Expands_On_NFC,                --  Derived
107     Expands_On_NFD,                --  Derived
108     Expands_On_NFKC,               --  Derived
109     Expands_On_NFKD,               --  Derived
110     Full_Composition_Exclusion,    --  Derived
111
112     ASCII_Hex_Digit,
113     Bidi_Control,
114     Dash,
115     Deprecated,
116     Diacritic,
117     Extender,
118     Hex_Digit,
119     Hyphen,
120     Ideographic,
121     IDS_Binary_Operator,
122     IDS_Trinary_Operator,
123     Join_Control,
124     Logical_Order_Exception,
125     Noncharacter_Code_Point,
126     Other_Alphabetic,
127     Other_Default_Ignorable_Code_Point,
128     Other_Grapheme_Extend,
129     Other_ID_Continue,
130     Other_ID_Start,
131     Other_Lowercase,
132     Other_Math,
133     Other_Uppercase,
134     Pattern_Syntax,
135     Pattern_White_Space,
136     Quotation_Mark,
137     Radical,
138     Soft_Dotted,
139     STerm,
140     Terminal_Punctuation,
141     Unified_Ideograph,
142     Variation_Selector,
143     White_Space,
144
145--     Bidi_Mirrored,                 --  XXX
146
147     --  Derived core properties. This list must include only properties
148     --  from the DerivedCoreProperties.txt file.
149
150     Alphabetic,                    --  Derived
151     Cased,                         --  Derived
152     Case_Ignorable,                --  Derived
153     Changes_When_Lowercased,       --  Derived
154     Changes_When_Uppercased,       --  Derived
155     Changes_When_Titlecased,       --  Derived
156     Changes_When_Casefolded,       --  Derived
157     Changes_When_Casemapped,       --  Derived
158     Default_Ignorable_Code_Point,  --  Derived
159     Grapheme_Base,                 --  Derived
160     Grapheme_Extend,               --  Derived
161     Grapheme_Link,                 --  Deprecated, derived
162     ID_Continue,                   --  Derived
163     ID_Start,                      --  Derived
164     Lowercase,                     --  Derived
165     Math,                          --  Derived
166     Uppercase,                     --  Derived
167     XID_Continue,                  --  Derived
168     XID_Start,                     --  Derived
169
170     --  Derived normalization properties.
171
172     Changes_When_NFKC_Casefolded); --  Derived
   --  Binary (boolean) properties of code points.
   --
   --  Order of literals is significant: the first six literals
   --  (Composition_Exclusion .. Full_Composition_Exclusion) form subtype
   --  Not_Overridable_Boolean_Properties, and the remaining 52 literals
   --  (ASCII_Hex_Digit .. Changes_When_NFKC_Casefolded) form subtype
   --  Overridable_Boolean_Properties. A new literal must be inserted into
   --  the group it belongs to, and the size comments on the packed arrays
   --  indexed by these subtypes must be updated accordingly.
173
174   subtype Overridable_Boolean_Properties is Boolean_Properties
175     range ASCII_Hex_Digit .. Changes_When_NFKC_Casefolded;
   --  All boolean properties except the six leading normalization related
   --  ones.
176
177   type Boolean_Values is array (Overridable_Boolean_Properties) of Boolean;
178   for Boolean_Values'Component_Size use 1;
179   for Boolean_Values'Size use 64;  --  52 bits used for now
   --  Packed set of values of overridable boolean properties, one bit per
   --  property. Size is fixed to 64 bits, which leaves room for 12 more
   --  properties before the layout of Core_Values has to be changed.
180
181   type East_Asian_Width is
182    (Ambiguous,
183     Fullwidth,
184     Halfwidth,
185     Neutral,
186     Narrow,
187     Wide);
188   for East_Asian_Width'Size use 8;
   --  Values of the East_Asian_Width property. There are 6 literals (3 bits
   --  are enough), Size is fixed to 8 bits.
189
190   type Grapheme_Cluster_Break is
191    (Control,
192     CR,
193     Extend,
194     L,
195     LF,
196     LV,
197     LVT,
198     Prepend,
199     Regional_Indicator,
200     Spacing_Mark,
201     T,
202     V,
203     Other);
204   for Grapheme_Cluster_Break'Size use 8;
   --  Values of the Grapheme_Cluster_Break property. There are 13 literals
   --  (4 bits are enough), Size is fixed to 8 bits.
205
206   type Word_Break is
207    (CR,
208     LF,
209     Newline,
210     Extend,
211     Regional_Indicator,
212     Format,
213     Katakana,
214     Hebrew_Letter,
215     A_Letter,
216     Single_Quote,
217     Double_Quote,
218     Mid_Num_Let,
219     Mid_Letter,
220     Mid_Num,
221     Numeric,
222     Extend_Num_Let,
223     Other);
224   for Word_Break'Size use 8;
   --  Values of the Word_Break property. There are 17 literals, so 5 bits
   --  are necessary to represent a value; Size is fixed to 8 bits.
225
226   type Sentence_Break is
227    (Other,
228     CR,
229     LF,
230     Sep,
231     Sp,
232     Lower,
233     Upper,
234     O_Letter,
235     Numeric,
236     A_Term,
237     S_Term,
238     Close,
239     S_Continue,
240     Format,
241     Extend);
242   for Sentence_Break'Size use 8;
   --  Values of the Sentence_Break property. There are 15 literals (4 bits
   --  are enough), Size is fixed to 8 bits. Unlike Grapheme_Cluster_Break
   --  and Word_Break, literal Other is the first one here.
243
244   type Line_Break is
245    (Ambiguous,
246     Alphabetic,
247     Break_Both,
248     Break_After,
249     Break_Before,
250     Mandatory_Break,
251     Contingent_Break,
252     Conditional_Japanese_Starter,
253     Close_Punctuation,
254     Combining_Mark,
255     Close_Parenthesis,
256     Carriage_Return,
257     Exclamation,
258     Glue,
259     H2,
260     H3,
261     Hebrew_Letter,
262     Hyphen,
263     Ideographic,
264     Inseparable,
265     Infix_Numeric,
266     JL,
267     JT,
268     JV,
269     Line_Feed,
270     Next_Line,
271     Nonstarter,
272     Numeric,
273     Open_Punctuation,
274     Postfix_Numeric,
275     Prefix_Numeric,
276     Quotation,
277     Regional_Indicator,
278     Complex_Context,
279     Surrogate,
280     Space,
281     Break_Symbols,
282     Word_Joiner,
283     Unknown,
284     ZW_Space);
285   for Line_Break'Size use 8;
   --  Values of the Line_Break property. There are 40 literals (6 bits are
   --  enough), Size is fixed to 8 bits.
286
287   type Core_Values is record
288      GC  : General_Category;            --   8  (5) bits
289      EA  : East_Asian_Width;            --   8  (3) bits
290      GCB : Grapheme_Cluster_Break;      --   8  (4) bits
291      WB  : Word_Break;                  --   8  (5) bits
292      SB  : Sentence_Break;              --   8  (4) bits
293      LB  : Line_Break;                  --   8  (6) bits
294      B   : Boolean_Values;              --  64 (52) bits
295   end record;
296   for Core_Values'Size use 128;
297   for Core_Values use record
298      B   at 0 range   0 ..  63;
299      GC  at 0 range  64 ..  71;
300      GCB at 0 range  72 ..  79;
301      WB  at 0 range  80 ..  87;
302      SB  at 0 range  88 ..  95;
303      LB  at 0 range  96 .. 103;
304      EA  at 0 range 104 .. 111;
305   end record;
   --  Values of core properties of a single code point.
   --
   --  Comments on components give the allocated size and, in parentheses,
   --  the number of bits actually necessary for the current set of values.
   --
   --  Layout is fixed by the representation clause: packed boolean
   --  properties occupy bits 0 .. 63 and enumerated properties occupy one
   --  byte each in bits 64 .. 111. Bits 112 .. 127 are not assigned to any
   --  component. Note that order of components in the representation clause
   --  differs from order of their declaration (EA is placed last).
306
307   type Core_Second_Stage is array (Second_Stage_Index) of Core_Values;
   --  Second stage of the core properties table: Core_Values for 16#100#
   --  consecutive indices.
308
309   type Core_Second_Stage_Access is not null access constant Core_Second_Stage;
   --  Read-only reference to a second stage; null is not allowed.
310
311   type Core_First_Stage is
312     array (First_Stage_Index) of Core_Second_Stage_Access;
   --  First stage of the core properties table. Because components are
   --  declared as not null, every component must designate some second
   --  stage.
313
314   type Core_First_Stage_Access is not null access constant Core_First_Stage;
   --  Read-only reference to the complete core properties table; null is
   --  not allowed.
315
316   ------------
317   -- Casing --
318   ------------
319
320   type Casing_Context is
321    (Final_Sigma,
322     After_Soft_Dotted,
323     More_Above,
324     Before_Dot,
325     After_I);
   --  Kinds of context which a Casing_Context_Mapping can be bound to.
326
327   type Case_Mapping_Range is record
328      First : Sequence_Count;
329      Last  : Sequence_Count;
330   end record;
   --  Pair of bounds of type Sequence_Count. Both components can be zero,
   --  which is outside of Sequence_Index.
   --  NOTE(review): presumably these are bounds of a slice of a
   --  Code_Point_Sequence and First > Last denotes absence of mapping;
   --  confirm against the code which uses the tables.
331
332   type Case_Mapping_Kinds is (Lower, Upper, Title, Folding);
   --  Kinds of case mapping.
333
334   type Case_Mapping_Ranges is
335     array (Case_Mapping_Kinds) of Case_Mapping_Range;
   --  One range per kind of case mapping.
336
337   type Simple_Case_Mappings is
338     array (Case_Mapping_Kinds) of Code_Point;
   --  One code point per kind of case mapping.
339
340   type Casing_Context_Mapping_Ranges is
341     array (Case_Mapping_Kinds range Lower .. Title) of Case_Mapping_Range;
   --  One range per kind of case mapping, excluding Folding.
342
343   type Case_Mapping is record
344      Simple        : Simple_Case_Mappings;
345      Ranges        : Case_Mapping_Ranges;
346      Context_First : Sequence_Count;
347      Context_Last  : Sequence_Count;
348   end record;
   --  Case mapping information of a single code point: a single code point
   --  and a range for each kind of mapping, plus one more pair of bounds.
   --  NOTE(review): Context_First .. Context_Last presumably selects
   --  elements of a Casing_Context_Mapping_Sequence; confirm.
349
350   type Casing_Context_Mapping is record
351      Context  : Casing_Context;
352      Negative : Boolean;
353      Ranges   : Casing_Context_Mapping_Ranges;
354   end record;
   --  Mapping ranges (lower, upper and title) bound to a casing context.
   --  NOTE(review): Negative presumably means that mapping is applied when
   --  the context does not match; confirm.
355
356   type Case_Mapping_Second_Stage is
357     array (Second_Stage_Index) of Case_Mapping;
   --  Second stage of the case mapping table.
358
359   type Case_Mapping_Second_Stage_Access is
360     not null access constant Case_Mapping_Second_Stage;
   --  Read-only reference to a second stage; null is not allowed.
361
362   type Case_Mapping_First_Stage is
363     array (First_Stage_Index) of Case_Mapping_Second_Stage_Access;
   --  First stage of the case mapping table.
364
365   type Case_Mapping_First_Stage_Access is
366     access constant Case_Mapping_First_Stage;
   --  Read-only reference to the complete case mapping table. Unlike
   --  Core_First_Stage_Access, null is allowed here.
367
368   type Casing_Context_Mapping_Sequence is
369     array (Sequence_Index range <>) of Casing_Context_Mapping;
   --  Unconstrained sequence of context dependent mappings.
370
371   type Casing_Context_Mapping_Sequence_Access is
372     access constant Casing_Context_Mapping_Sequence;
   --  Read-only reference to a sequence of context dependent mappings; can
   --  be null.
373
374   -------------------
375   -- Normalization --
376   -------------------
377
378   type Canonical_Combining_Class is mod 256;
   --  Value of the Canonical_Combining_Class property, in range 0 .. 255.
   --  Constants below give names to some values only; any other value of
   --  the type is allowed too.
379
380   Not_Reordered        : constant Canonical_Combining_Class := 0;
381   Overlay              : constant Canonical_Combining_Class := 1;
382   Nukta                : constant Canonical_Combining_Class := 7;
383   Kana_Voicing         : constant Canonical_Combining_Class := 8;
384   Virama               : constant Canonical_Combining_Class := 9;
385   Attached_Below_Left  : constant Canonical_Combining_Class := 200;
386   Attached_Below       : constant Canonical_Combining_Class := 202;
387   Attached_Above_Right : constant Canonical_Combining_Class := 216;
388   Below_Left           : constant Canonical_Combining_Class := 218;
389   Below                : constant Canonical_Combining_Class := 220;
390   Below_Right          : constant Canonical_Combining_Class := 222;
391   Left                 : constant Canonical_Combining_Class := 224;
392   Right                : constant Canonical_Combining_Class := 226;
393   Above_Left           : constant Canonical_Combining_Class := 228;
394   Above                : constant Canonical_Combining_Class := 230;
395   Above_Right          : constant Canonical_Combining_Class := 232;
396   Double_Below         : constant Canonical_Combining_Class := 233;
397   Double_Above         : constant Canonical_Combining_Class := 234;
398   Iota_Subscript       : constant Canonical_Combining_Class := 240;
399
400   type Decomposition_Type is
401    (None,
402     Canonical,
403     Font,
404     No_Break,
405     Initial,
406     Medial,
407     Final,
408     Isolated,
409     Circle,
410     Super,
411     Sub,
412     Vertical,
413     Wide,
414     Narrow,
415     Small,
416     Square,
417     Fraction,
418     Compat);
419   for Decomposition_Type'Size use 8;
   --  Values of the Decomposition_Type property. There are 18 literals
   --  (5 bits are enough), Size is fixed to 8 bits.
   --
   --  Literals Wide and Narrow are also literals of East_Asian_Width, and
   --  Canonical is also a literal of Decomposition_Kinds; they are
   --  overloaded and may require qualification where context is ambiguous.
420
421   subtype Not_Overridable_Boolean_Properties is Boolean_Properties
422     range Composition_Exclusion .. Full_Composition_Exclusion;
   --  The six leading literals of Boolean_Properties; complements
   --  Overridable_Boolean_Properties.
423
424   type Non_Overridable_Boolean_Values is
425     array (Not_Overridable_Boolean_Properties) of Boolean;
426   for Non_Overridable_Boolean_Values'Component_Size use 1;
427   for Non_Overridable_Boolean_Values'Size use 8;  --  6 bits used for now
   --  Packed set of values of not overridable boolean properties, one bit
   --  per property, Size is fixed to 8 bits.
   --
   --  Note: name of the subtype starts with "Not_" while name of this type
   --  starts with "Non_"; both names are visible to clients of the package.
428
429   type Normalization_Quick_Check is (No, Maybe, Yes);
430   for Normalization_Quick_Check'Size use 2;
   --  Value of a normalization quick check property, Size is fixed to 2
   --  bits.
431
432   type Normalization_Form is (NFC, NFD, NFKC, NFKD);
   --  Normalization forms.
433
434   type Normalization_Quick_Checks is
435     array (Normalization_Form) of Normalization_Quick_Check;
436   for Normalization_Quick_Checks'Size use 8;
437   for Normalization_Quick_Checks'Component_Size
438     use Normalization_Quick_Check'Size;
   --  Quick check value for each of the four normalization forms; four
   --  components of 2 bits each occupy exactly 8 bits.
439
440   type Normalization_Mapping_Range is record
441      First : Sequence_Count;
442      Last  : Sequence_Count;
443   end record;
   --  Pair of bounds of type Sequence_Count. Both components can be zero,
   --  which is outside of Sequence_Index.
   --  NOTE(review): presumably First > Last denotes absence of mapping;
   --  confirm against the code which uses the tables.
444
445   type Decomposition_Kinds is (Canonical, Compatibility);
   --  Kinds of decomposition mapping.
446
447   type Decomposition_Mapping is
448     array (Decomposition_Kinds) of Normalization_Mapping_Range;
   --  One range per kind of decomposition.
449
450   type Normalization_Mapping is record
451      Decomposition : Decomposition_Mapping;
452      Composition   : Normalization_Mapping_Range;
453      CCC           : Canonical_Combining_Class;   --   8      bits
454      NQC           : Normalization_Quick_Checks;  --   8      bits
455      DT            : Decomposition_Type;          --   8  (5) bits
456      B             : Non_Overridable_Boolean_Values;  --  8  (6) bits
457   end record;
458   for Normalization_Mapping'Size use 128;
   --  Normalization related information of a single code point. Size is
   --  fixed to 128 bits, but there is no record representation clause, so
   --  position of components is selected by the compiler.
   --  NOTE(review): 128 bits leave no spare space if each Sequence_Count
   --  component occupies 16 bits (6 * 16 + 4 * 8 = 128); confirm before
   --  adding components.
459
460   type Normalization_Mapping_Second_Stage is
461     array (Second_Stage_Index) of Normalization_Mapping;
   --  Second stage of the normalization table.
462
463   type Normalization_Mapping_Second_Stage_Access is
464     not null access constant Normalization_Mapping_Second_Stage;
   --  Read-only reference to a second stage; null is not allowed.
465
466   type Normalization_Mapping_First_Stage is
467     array (First_Stage_Index) of Normalization_Mapping_Second_Stage_Access;
   --  First stage of the normalization table.
468
469   type Composition_Mapping is
470     array (Sequence_Index range <>, Sequence_Index range <>) of Code_Point;
   --  Unconstrained two dimensional table of code points.
   --  NOTE(review): presumably it is indexed by values taken from component
   --  Composition of two Normalization_Mapping records; confirm.
471
472   ---------------
473   -- Collation --
474   ---------------
475
476   type Collation_Weight is mod 2**16;
477   for Collation_Weight'Size use 16;
478   pragma Assert (Code_Unit_16'Size = Collation_Weight'Size);
479   --  Note: at the identical strength level the collation algorithm appends
480   --  a copy of the source string in NFD form to the end of the produced sort
481   --  key, thus it is important that Code_Unit_16 and Collation_Weight types
   --  have equal size.
482
483   type Collation_Element is record
484      Primary   : Collation_Weight;
485      Secondary : Collation_Weight;
486      Trinary   : Collation_Weight;
487   end record;
   --  Collation element: three weights of 16 bits each.
   --  NOTE(review): component Trinary presumably holds the weight of the
   --  third (tertiary) level; confirm.
488
489   type Collation_Element_Sequence is
490     array (Sequence_Index range <>) of Collation_Element;
   --  Unconstrained sequence of collation elements.
491
492   type Collation_Element_Sequence_Access is
493     access constant Collation_Element_Sequence;
   --  Read-only reference to a sequence of collation elements; can be null.
494
495   type Contractor_Record is record
496      Code             : Code_Point;
497      Contractor_First : Sequence_Count;
498      Contractor_Last  : Sequence_Count;
499      Expansion_First  : Sequence_Count;
500      Expansion_Last   : Sequence_Count;
501   end record;
   --  Same pairs of bounds as in Collation_Record, with addition of a code
   --  point.
   --  NOTE(review): presumably Contractor_First .. Contractor_Last selects
   --  elements of a Contractor_Array and Expansion_First .. Expansion_Last
   --  selects elements of a Collation_Element_Sequence; confirm.
502
503   type Contractor_Array is
504     array (Sequence_Index range <>) of Contractor_Record;
   --  Unconstrained sequence of contractor records.
505
506   type Contractor_Array_Access is access constant Contractor_Array;
   --  Read-only reference to a sequence of contractor records; can be null.
507
508   type Collation_Record is record
509      Contractor_First : Sequence_Count;
510      Contractor_Last  : Sequence_Count;
511      Expansion_First  : Sequence_Count;
512      Expansion_Last   : Sequence_Count;
513   end record;
   --  Collation information of a single code point: two pairs of bounds of
   --  type Sequence_Count. All components can be zero, which is outside of
   --  Sequence_Index.
514
515   type Collation_Second_Stage is
516     array (Second_Stage_Index) of Collation_Record;
   --  Second stage of the collation table.
517
518   type Collation_Second_Stage_Access is
519     not null access constant Collation_Second_Stage;
   --  Read-only reference to a second stage; null is not allowed.
520
521   type Collation_First_Stage is
522     array (First_Stage_Index) of Collation_Second_Stage_Access;
   --  First stage of the collation table.
523
524   type Collation_First_Stage_Access is access constant Collation_First_Stage;
   --  Read-only reference to the complete collation table; null is allowed.
525
526   -------------------------------
527   -- Two stage table utilities --
528   -------------------------------
529
530   generic
531      type Element_Type is private;
532      type Second_Stage_Array is
533        array (Second_Stage_Index) of Element_Type;
534      type Second_Stage_Array_Access is
535        not null access constant Second_Stage_Array;
536      type First_Stage_Array is
537        array (First_Stage_Index) of Second_Stage_Array_Access;
538
539   function Generic_Element (Data : First_Stage_Array; Code : Code_Point)
540     return Element_Type;
   --  Returns the element of the two-stage table Data which corresponds to
   --  the code point Code.
   --
   --  Formal types describe the shape of the table: Element_Type is the
   --  type of stored elements, Second_Stage_Array is a second stage indexed
   --  by Second_Stage_Index, Second_Stage_Array_Access is a not null
   --  read-only reference to it, and First_Stage_Array is a first stage
   --  indexed by First_Stage_Index. Types of the core, case mapping,
   --  normalization and collation tables declared above match this shape.
   --
   --  NOTE(review): the body is not visible here; presumably Code is split
   --  into a First_Stage_Index (Code / 16#100#) and a Second_Stage_Index
   --  (Code mod 16#100#); confirm against the package body.
541
542end Matreshka.Internals.Unicode.Ucd;
543