------------------------------------------------------------------------------
--                                                                          --
--                            Matreshka Project                             --
--                                                                          --
--         Localization, Internationalization, Globalization for Ada        --
--                                                                          --
--                        Runtime Library Component                         --
--                                                                          --
------------------------------------------------------------------------------
--                                                                          --
-- Copyright © 2009-2015, Vadim Godunko <vgodunko@gmail.com>                --
-- All rights reserved.                                                     --
--                                                                          --
-- Redistribution and use in source and binary forms, with or without       --
-- modification, are permitted provided that the following conditions       --
-- are met:                                                                 --
--                                                                          --
--  * Redistributions of source code must retain the above copyright        --
--    notice, this list of conditions and the following disclaimer.         --
--                                                                          --
--  * Redistributions in binary form must reproduce the above copyright     --
--    notice, this list of conditions and the following disclaimer in the   --
--    documentation and/or other materials provided with the distribution.  --
--                                                                          --
--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
--    contributors may be used to endorse or promote products derived from  --
--    this software without specific prior written permission.              --
--                                                                          --
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
--                                                                          --
------------------------------------------------------------------------------
--  $Revision: 5317 $ $Date: 2015-05-16 02:29:09 +0300 (Sat, 16 May 2015) $
------------------------------------------------------------------------------
--  This package declares the data types used to represent Unicode
--  Character Database properties: core properties, case mappings,
--  normalization data and collation data. Per-code-point data is organized
--  as two stage tables (an array of accesses to second stage arrays); the
--  generic function Generic_Element at the end of the package is the
--  accessor for such tables.
------------------------------------------------------------------------------

package Matreshka.Internals.Unicode.Ucd is

   pragma Preelaborate;

   type First_Stage_Index is mod 16#1100#;
   type Second_Stage_Index is mod 16#100#;
   --  Index types of the first and second stages of the two stage tables
   --  declared in this package. 16#1100# * 16#100# = 16#11_0000#, which
   --  equals the number of code points in the Unicode code space
   --  (0 .. 16#10_FFFF#).
   --
   --  NOTE(review): the split of a code point into the two indices is done
   --  in the body of Generic_Element, which is not visible from this
   --  specification; presumably the high bits select the first stage
   --  element and the low 8 bits select the second stage element.

   type Sequence_Count is range 0 .. 2 ** 16 - 1;
   for Sequence_Count'Size use 16;
   --  Position in (or length of) one of the sequence types declared below.
   --  Value 0 is outside of Sequence_Index.
   --
   --  NOTE(review): First/Last components of this type are presumably set
   --  to an empty range (e.g. First = 1, Last = 0) when there is no data;
   --  confirm against the generated tables.

   subtype Sequence_Index is Sequence_Count range 1 .. Sequence_Count'Last;
   --  Index of an element of a sequence; all sequences are indexed from 1.

   type Code_Point_Sequence is array (Sequence_Index range <>) of Code_Point;

   type Code_Point_Sequence_Access is access constant Code_Point_Sequence;

   ---------------------
   -- Core properties --
   ---------------------

   --  Note: the order of literals of the enumeration types below defines
   --  their internal representation, so it must not be changed
   --  independently of the data tables built from these types.

   type General_Category is
     (Control,
      Format,
      Unassigned,
      Private_Use,
      Surrogate,

      Lowercase_Letter,
      Titlecase_Letter,
      Uppercase_Letter,
      Modifier_Letter,
      Other_Letter,

      Spacing_Mark,
      Enclosing_Mark,
      Nonspacing_Mark,

      Decimal_Number,
      Letter_Number,
      Other_Number,

      Connector_Punctuation,
      Dash_Punctuation,
      Close_Punctuation,
      Final_Punctuation,
      Initial_Punctuation,
      Other_Punctuation,
      Open_Punctuation,

      Currency_Symbol,
      Modifier_Symbol,
      Math_Symbol,
      Other_Symbol,

      Line_Separator,
      Paragraph_Separator,
      Space_Separator);
   for General_Category'Size use 8;

   type Boolean_Properties is
     (Composition_Exclusion,       --  XXX
      Expands_On_NFC,              --  Derived
      Expands_On_NFD,              --  Derived
      Expands_On_NFKC,             --  Derived
      Expands_On_NFKD,             --  Derived
      Full_Composition_Exclusion,  --  Derived

      --  Literals above form Not_Overridable_Boolean_Properties; literals
      --  below, up to the last one, form Overridable_Boolean_Properties.

      ASCII_Hex_Digit,
      Bidi_Control,
      Dash,
      Deprecated,
      Diacritic,
      Extender,
      Hex_Digit,
      Hyphen,
      Ideographic,
      IDS_Binary_Operator,
      IDS_Trinary_Operator,
      Join_Control,
      Logical_Order_Exception,
      Noncharacter_Code_Point,
      Other_Alphabetic,
      Other_Default_Ignorable_Code_Point,
      Other_Grapheme_Extend,
      Other_ID_Continue,
      Other_ID_Start,
      Other_Lowercase,
      Other_Math,
      Other_Uppercase,
      Pattern_Syntax,
      Pattern_White_Space,
      Quotation_Mark,
      Radical,
      Soft_Dotted,
      STerm,
      Terminal_Punctuation,
      Unified_Ideograph,
      Variation_Selector,
      White_Space,

--      Bidi_Mirrored,                       --  XXX

      --  Derived core properties. This list must include only properties
      --  from DerivedCoreProperties.txt file.

      Alphabetic,                    --  Derived
      Cased,                         --  Derived
      Case_Ignorable,                --  Derived
      Changes_When_Lowercased,       --  Derived
      Changes_When_Uppercased,       --  Derived
      Changes_When_Titlecased,       --  Derived
      Changes_When_Casefolded,       --  Derived
      Changes_When_Casemapped,       --  Derived
      Default_Ignorable_Code_Point,  --  Derived
      Grapheme_Base,                 --  Derived
      Grapheme_Extend,               --  Derived
      Grapheme_Link,                 --  Deprecated, derived
      ID_Continue,                   --  Derived
      ID_Start,                      --  Derived
      Lowercase,                     --  Derived
      Math,                          --  Derived
      Uppercase,                     --  Derived
      XID_Continue,                  --  Derived
      XID_Start,                     --  Derived

      --  Derived normalization properties.

      Changes_When_NFKC_Casefolded);  --  Derived

   subtype Overridable_Boolean_Properties is Boolean_Properties
     range ASCII_Hex_Digit .. Changes_When_NFKC_Casefolded;
   --  Properties stored in Boolean_Values (component B of Core_Values).

   type Boolean_Values is array (Overridable_Boolean_Properties) of Boolean;
   for Boolean_Values'Component_Size use 1;
   for Boolean_Values'Size use 64;  --  52 bits used for now
   --  Bit set with one bit per property. When literals are added to
   --  Overridable_Boolean_Properties the total must stay within 64.

   type East_Asian_Width is
     (Ambiguous,
      Fullwidth,
      Halfwidth,
      Neutral,
      Narrow,
      Wide);
   for East_Asian_Width'Size use 8;

   type Grapheme_Cluster_Break is
     (Control,
      CR,
      Extend,
      L,
      LF,
      LV,
      LVT,
      Prepend,
      Regional_Indicator,
      Spacing_Mark,
      T,
      V,
      Other);
   for Grapheme_Cluster_Break'Size use 8;

   type Word_Break is
     (CR,
      LF,
      Newline,
      Extend,
      Regional_Indicator,
      Format,
      Katakana,
      Hebrew_Letter,
      A_Letter,
      Single_Quote,
      Double_Quote,
      Mid_Num_Let,
      Mid_Letter,
      Mid_Num,
      Numeric,
      Extend_Num_Let,
      Other);
   for Word_Break'Size use 8;

   type Sentence_Break is
     (Other,
      CR,
      LF,
      Sep,
      Sp,
      Lower,
      Upper,
      O_Letter,
      Numeric,
      A_Term,
      S_Term,
      Close,
      S_Continue,
      Format,
      Extend);
   for Sentence_Break'Size use 8;

   type Line_Break is
     (Ambiguous,
      Alphabetic,
      Break_Both,
      Break_After,
      Break_Before,
      Mandatory_Break,
      Contingent_Break,
      Conditional_Japanese_Starter,
      Close_Punctuation,
      Combining_Mark,
      Close_Parenthesis,
      Carriage_Return,
      Exclamation,
      Glue,
      H2,
      H3,
      Hebrew_Letter,
      Hyphen,
      Ideographic,
      Inseparable,
      Infix_Numeric,
      JL,
      JT,
      JV,
      Line_Feed,
      Next_Line,
      Nonstarter,
      Numeric,
      Open_Punctuation,
      Postfix_Numeric,
      Prefix_Numeric,
      Quotation,
      Regional_Indicator,
      Complex_Context,
      Surrogate,
      Space,
      Break_Symbols,
      Word_Joiner,
      Unknown,
      ZW_Space);
   for Line_Break'Size use 8;

   --  Core property values of a single code point, packed into 128 bits.
   --  Component comments give the number of bits allocated and, in
   --  parentheses, the number of bits required by the current set of
   --  values of the component's type.

   type Core_Values is record
      GC  : General_Category;        --  8 (5) bits
      EA  : East_Asian_Width;        --  8 (3) bits
      GCB : Grapheme_Cluster_Break;  --  8 (4) bits
      WB  : Word_Break;              --  8 (5) bits
      SB  : Sentence_Break;          --  8 (4) bits
      LB  : Line_Break;              --  8 (6) bits
      B   : Boolean_Values;          --  64 (52) bits
   end record;
   for Core_Values'Size use 128;
   for Core_Values use record
      B   at 0 range   0 ..  63;
      GC  at 0 range  64 ..  71;
      GCB at 0 range  72 ..  79;
      WB  at 0 range  80 ..  87;
      SB  at 0 range  88 ..  95;
      LB  at 0 range  96 .. 103;
      EA  at 0 range 104 .. 111;
   end record;
   --  Bits 112 .. 127 are not assigned to any component. Note that the
   --  order of components in the representation clause differs from the
   --  order of their declaration (EA is placed last).

   type Core_Second_Stage is array (Second_Stage_Index) of Core_Values;

   type Core_Second_Stage_Access is not null access constant Core_Second_Stage;

   type Core_First_Stage is
     array (First_Stage_Index) of Core_Second_Stage_Access;

   type Core_First_Stage_Access is not null access constant Core_First_Stage;

   ------------
   -- Casing --
   ------------

   type Casing_Context is
     (Final_Sigma,
      After_Soft_Dotted,
      More_Above,
      Before_Dot,
      After_I);

   type Case_Mapping_Range is record
      First : Sequence_Count;
      Last  : Sequence_Count;
   end record;
   --  NOTE(review): presumably First .. Last is a slice of a
   --  Code_Point_Sequence holding the full mapping; confirm against users
   --  of this type.

   type Case_Mapping_Kinds is (Lower, Upper, Title, Folding);

   type Case_Mapping_Ranges is
     array (Case_Mapping_Kinds) of Case_Mapping_Range;

   type Simple_Case_Mappings is
     array (Case_Mapping_Kinds) of Code_Point;

   type Casing_Context_Mapping_Ranges is
     array (Case_Mapping_Kinds range Lower .. Title) of Case_Mapping_Range;
   --  Context dependent mappings are declared for Lower, Upper and Title
   --  only; Folding is excluded.

   type Case_Mapping is record
      Simple        : Simple_Case_Mappings;
      Ranges        : Case_Mapping_Ranges;
      Context_First : Sequence_Count;
      Context_Last  : Sequence_Count;
      --  NOTE(review): presumably Context_First .. Context_Last is a slice
      --  of a Casing_Context_Mapping_Sequence; confirm against users.
   end record;

   type Casing_Context_Mapping is record
      Context  : Casing_Context;
      Negative : Boolean;
      --  NOTE(review): presumably True when the mapping applies if Context
      --  is NOT satisfied; confirm against users.
      Ranges   : Casing_Context_Mapping_Ranges;
   end record;

   type Case_Mapping_Second_Stage is
     array (Second_Stage_Index) of Case_Mapping;

   type Case_Mapping_Second_Stage_Access is
     not null access constant Case_Mapping_Second_Stage;

   type Case_Mapping_First_Stage is
     array (First_Stage_Index) of Case_Mapping_Second_Stage_Access;

   type Case_Mapping_First_Stage_Access is
     access constant Case_Mapping_First_Stage;
   --  NOTE(review): unlike Core_First_Stage_Access this access type is
   --  not declared with "not null"; confirm whether null values are
   --  intentionally allowed here.

   type Casing_Context_Mapping_Sequence is
     array (Sequence_Index range <>) of Casing_Context_Mapping;

   type Casing_Context_Mapping_Sequence_Access is
     access constant Casing_Context_Mapping_Sequence;

   -------------------
   -- Normalization --
   -------------------

   type Canonical_Combining_Class is mod 256;

   --  Named values of Canonical_Combining_Class.

   Not_Reordered        : constant Canonical_Combining_Class := 0;
   Overlay              : constant Canonical_Combining_Class := 1;
   Nukta                : constant Canonical_Combining_Class := 7;
   Kana_Voicing         : constant Canonical_Combining_Class := 8;
   Virama               : constant Canonical_Combining_Class := 9;
   Attached_Below_Left  : constant Canonical_Combining_Class := 200;
   Attached_Below       : constant Canonical_Combining_Class := 202;
   Attached_Above_Right : constant Canonical_Combining_Class := 216;
   Below_Left           : constant Canonical_Combining_Class := 218;
   Below                : constant Canonical_Combining_Class := 220;
   Below_Right          : constant Canonical_Combining_Class := 222;
   Left                 : constant Canonical_Combining_Class := 224;
   Right                : constant Canonical_Combining_Class := 226;
   Above_Left           : constant Canonical_Combining_Class := 228;
   Above                : constant Canonical_Combining_Class := 230;
   Above_Right          : constant Canonical_Combining_Class := 232;
   Double_Below         : constant Canonical_Combining_Class := 233;
   Double_Above         : constant Canonical_Combining_Class := 234;
   Iota_Subscript       : constant Canonical_Combining_Class := 240;

   type Decomposition_Type is
     (None,
      Canonical,
      Font,
      No_Break,
      Initial,
      Medial,
      Final,
      Isolated,
      Circle,
      Super,
      Sub,
      Vertical,
      Wide,
      Narrow,
      Small,
      Square,
      Fraction,
      Compat);
   for Decomposition_Type'Size use 8;

   subtype Not_Overridable_Boolean_Properties is Boolean_Properties
     range Composition_Exclusion .. Full_Composition_Exclusion;
   --  Properties stored in Non_Overridable_Boolean_Values (component B of
   --  Normalization_Mapping).
   --
   --  NOTE(review): the subtype is named "Not_Overridable" while the array
   --  type below is named "Non_Overridable"; both names are part of the
   --  public interface and are left as is.

   type Non_Overridable_Boolean_Values is
     array (Not_Overridable_Boolean_Properties) of Boolean;
   for Non_Overridable_Boolean_Values'Component_Size use 1;
   for Non_Overridable_Boolean_Values'Size use 8;  --  6 bits used for now

   type Normalization_Quick_Check is (No, Maybe, Yes);
   for Normalization_Quick_Check'Size use 2;

   type Normalization_Form is (NFC, NFD, NFKC, NFKD);

   type Normalization_Quick_Checks is
     array (Normalization_Form) of Normalization_Quick_Check;
   for Normalization_Quick_Checks'Size use 8;
   for Normalization_Quick_Checks'Component_Size
     use Normalization_Quick_Check'Size;
   --  Four 2-bit quick check values, one per normalization form, packed
   --  into 8 bits.

   type Normalization_Mapping_Range is record
      First : Sequence_Count;
      Last  : Sequence_Count;
   end record;

   type Decomposition_Kinds is (Canonical, Compatibility);

   type Decomposition_Mapping is
     array (Decomposition_Kinds) of Normalization_Mapping_Range;

   type Normalization_Mapping is record
      Decomposition : Decomposition_Mapping;
      Composition   : Normalization_Mapping_Range;
      CCC           : Canonical_Combining_Class;   --  8 bits
      NQC           : Normalization_Quick_Checks;  --  8 bits
      DT            : Decomposition_Type;          --  8 (5) bits
      B             : Non_Overridable_Boolean_Values;
   end record;
   for Normalization_Mapping'Size use 128;
   --  No record representation clause is given for this type, so only the
   --  total size is fixed here; placement of components is left to the
   --  compiler.

   type Normalization_Mapping_Second_Stage is
     array (Second_Stage_Index) of Normalization_Mapping;

   type Normalization_Mapping_Second_Stage_Access is
     not null access constant Normalization_Mapping_Second_Stage;

   type Normalization_Mapping_First_Stage is
     array (First_Stage_Index) of Normalization_Mapping_Second_Stage_Access;

   type Composition_Mapping is
     array (Sequence_Index range <>, Sequence_Index range <>) of Code_Point;
   --  NOTE(review): presumably indexed by values taken from the
   --  Composition ranges of two code points to obtain their composite;
   --  confirm against users of this type.

   ---------------
   -- Collation --
   ---------------

   type Collation_Weight is mod 2**16;
   for Collation_Weight'Size use 16;
   pragma Assert (Code_Unit_16'Size = Collation_Weight'Size);
   --  Note: collation algorithm in identical strength level adds copy of the
   --  source string in NFD form at the end of produced sorting key, thus it is
   --  important that Code_Unit_16 and Collation_Weight types have equal size.

   type Collation_Element is record
      Primary   : Collation_Weight;
      Secondary : Collation_Weight;
      Trinary   : Collation_Weight;
      --  NOTE(review): presumably the third level ("tertiary") weight; the
      --  component name is part of the public interface and is left as is.
   end record;

   type Collation_Element_Sequence is
     array (Sequence_Index range <>) of Collation_Element;

   type Collation_Element_Sequence_Access is
     access constant Collation_Element_Sequence;

   type Contractor_Record is record
      Code             : Code_Point;
      Contractor_First : Sequence_Count;
      Contractor_Last  : Sequence_Count;
      Expansion_First  : Sequence_Count;
      Expansion_Last   : Sequence_Count;
   end record;
   --  NOTE(review): presumably Contractor_First .. Contractor_Last is a
   --  slice of a Contractor_Array (nested contractions following Code) and
   --  Expansion_First .. Expansion_Last is a slice of a
   --  Collation_Element_Sequence; confirm against users of this type.

   type Contractor_Array is
     array (Sequence_Index range <>) of Contractor_Record;

   type Contractor_Array_Access is access constant Contractor_Array;

   type Collation_Record is record
      Contractor_First : Sequence_Count;
      Contractor_Last  : Sequence_Count;
      Expansion_First  : Sequence_Count;
      Expansion_Last   : Sequence_Count;
   end record;
   --  Per code point collation data; has the same range components as
   --  Contractor_Record, but no Code component.

   type Collation_Second_Stage is
     array (Second_Stage_Index) of Collation_Record;

   type Collation_Second_Stage_Access is
     not null access constant Collation_Second_Stage;

   type Collation_First_Stage is
     array (First_Stage_Index) of Collation_Second_Stage_Access;

   type Collation_First_Stage_Access is access constant Collation_First_Stage;
   --  NOTE(review): unlike Core_First_Stage_Access this access type is
   --  not declared with "not null"; confirm whether null values are
   --  intentionally allowed here.

   -------------------------------
   -- Two stage table utilities --
   -------------------------------

   generic
      type Element_Type is private;
      type Second_Stage_Array is
        array (Second_Stage_Index) of Element_Type;
      type Second_Stage_Array_Access is
        not null access constant Second_Stage_Array;
      type First_Stage_Array is
        array (First_Stage_Index) of Second_Stage_Array_Access;

   function Generic_Element (Data : First_Stage_Array; Code : Code_Point)
     return Element_Type;
   --  Returns the element of the two stage table Data for code point Code.
   --  The generic formal types match the shape of the first/second stage
   --  table types declared above (Core, Case_Mapping, Normalization_Mapping
   --  and Collation).
   --
   --  NOTE(review): the body is not visible from this specification;
   --  presumably it computes the first and second stage indices from Code
   --  and dereferences the selected second stage array.

end Matreshka.Internals.Unicode.Ucd;