1 // Written in the D programming language.
2
3 /++
4 $(P The $(D std.uni) module provides an implementation
5 of fundamental Unicode algorithms and data structures.
6 This doesn't include UTF encoding and decoding primitives,
7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8 for this functionality. )
9
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14 $(LREF byCodePoint)
15 $(LREF byGrapheme)
16 $(LREF decodeGrapheme)
17 $(LREF graphemeStride)
18 ))
19 $(TR $(TD Comparison) $(TD
20 $(LREF icmp)
21 $(LREF sicmp)
22 ))
23 $(TR $(TD Classification) $(TD
24 $(LREF isAlpha)
25 $(LREF isAlphaNum)
26 $(LREF isCodepointSet)
27 $(LREF isControl)
28 $(LREF isFormat)
29 $(LREF isGraphical)
30 $(LREF isIntegralPair)
31 $(LREF isMark)
32 $(LREF isNonCharacter)
33 $(LREF isNumber)
34 $(LREF isPrivateUse)
35 $(LREF isPunctuation)
36 $(LREF isSpace)
37 $(LREF isSurrogate)
38 $(LREF isSurrogateHi)
39 $(LREF isSurrogateLo)
40 $(LREF isSymbol)
41 $(LREF isWhite)
42 ))
43 $(TR $(TD Normalization) $(TD
44 $(LREF NFC)
45 $(LREF NFD)
46 $(LREF NFKD)
47 $(LREF NormalizationForm)
48 $(LREF normalize)
49 ))
50 $(TR $(TD Decompose) $(TD
51 $(LREF decompose)
52 $(LREF decomposeHangul)
53 $(LREF UnicodeDecomposition)
54 ))
55 $(TR $(TD Compose) $(TD
56 $(LREF compose)
57 $(LREF composeJamo)
58 ))
59 $(TR $(TD Sets) $(TD
60 $(LREF CodepointInterval)
61 $(LREF CodepointSet)
62 $(LREF InversionList)
63 $(LREF unicode)
64 ))
65 $(TR $(TD Trie) $(TD
66 $(LREF codepointSetTrie)
67 $(LREF CodepointSetTrie)
68 $(LREF codepointTrie)
69 $(LREF CodepointTrie)
70 $(LREF toTrie)
71 $(LREF toDelegate)
72 ))
73 $(TR $(TD Casing) $(TD
74 $(LREF asCapitalized)
75 $(LREF asLowerCase)
76 $(LREF asUpperCase)
77 $(LREF isLower)
78 $(LREF isUpper)
79 $(LREF toLower)
80 $(LREF toLowerInPlace)
81 $(LREF toUpper)
82 $(LREF toUpperInPlace)
83 ))
84 $(TR $(TD Utf8Matcher) $(TD
85 $(LREF isUtfMatcher)
86 $(LREF MatcherConcept)
87 $(LREF utfMatcher)
88 ))
89 $(TR $(TD Separators) $(TD
90 $(LREF lineSep)
91 $(LREF nelSep)
92 $(LREF paraSep)
93 ))
94 $(TR $(TD Building blocks) $(TD
95 $(LREF allowedIn)
96 $(LREF combiningClass)
97 $(LREF Grapheme)
98 ))
99 )
100
101 $(P All primitives listed operate on Unicode characters and
102 sets of characters. For functions which operate on ASCII characters
103 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
104 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
105 used throughout this module see the $(S_LINK Terminology, terminology) section
106 below.
107 )
108 $(P The focus of this module is the core needs of developing Unicode-aware
109 applications. To that effect it provides the following optimized primitives:
110 )
111 $(UL
112 $(LI Character classification by category and common properties:
113 $(LREF isAlpha), $(LREF isWhite) and others.
114 )
115 $(LI
116 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
117 )
118 $(LI
119 Converting text to any of the four normalization forms via $(LREF normalize).
120 )
121 $(LI
122 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
123 by user-perceived characters, that is by $(LREF Grapheme) clusters.
124 )
125 $(LI
126 Decomposing and composing of individual character(s) according to canonical
127 or compatibility rules, see $(LREF compose) and $(LREF decompose),
128 including the specific version for Hangul syllables $(LREF composeJamo)
129 and $(LREF decomposeHangul).
130 )
131 )
132 $(P It's recognized that an application may need further enhancements
133 and extensions, such as less commonly known algorithms,
134 or tailoring existing ones for region specific needs. To help users
135 with building any extra functionality beyond the core primitives,
136 the module provides:
137 )
138 $(UL
139 $(LI
140 $(LREF CodepointSet), a type for easy manipulation of sets of characters.
141 Besides the typical set algebra it provides an unusual feature:
142 a D source code generator for detection of $(CODEPOINTS) in this set.
143 This is a boon for meta-programming parser frameworks,
144 and is used internally to power classification in small
145 sets like $(LREF isWhite).
146 )
147 $(LI
148 A way to construct optimal packed multi-stage tables also known as a
149 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
150 The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
151 construct custom tries that map dchar to value.
152 The end result is a fast and predictable $(BIGOH 1) lookup that powers
153 functions like $(LREF isAlpha) and $(LREF combiningClass),
154 but for user-defined data sets.
155 )
156 $(LI
157 A useful technique for Unicode-aware parsers that perform
158 character classification of encoded $(CODEPOINTS)
159 is to avoid unnecessary decoding at all costs.
160 $(LREF utfMatcher) provides an improvement over the usual workflow
161 of decode-classify-process, combining the decoding and classification
162 steps. By extracting necessary bits directly from encoded
163 $(S_LINK Code unit, code units) matchers achieve
164 significant performance improvements. See $(LREF MatcherConcept) for
165 the common interface of UTF matchers.
166 )
167 $(LI
168 Generally useful building blocks for customized normalization:
169 $(LREF combiningClass) for querying combining class
170 and $(LREF allowedIn) for testing the Quick_Check
171 property of a given normalization form.
172 )
173 $(LI
174 Access to a large selection of commonly used sets of $(CODEPOINTS).
175 $(S_LINK Unicode properties, Supported sets) include Script,
176 Block and General Category. The exact contents of a set can be
177 observed in the CLDR utility, on the
178 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
179 of the Unicode website.
180 See $(LREF unicode) for easy and (optionally) compile-time checked set
181 queries.
182 )
183 )
184 $(SECTION Synopsis)
185 ---
186 import std.uni;
187 void main()
188 {
189 // initialize code point sets using script/block or property name
190 // now 'set' contains code points from both scripts.
191 auto set = unicode("Cyrillic") | unicode("Armenian");
192 // same thing but simpler and checked at compile-time
193 auto ascii = unicode.ASCII;
194 auto currency = unicode.Currency_Symbol;
195
196 // easy set ops
197 auto a = set & ascii;
198 assert(a.empty); // as it has no intersection with ascii
199 a = set | ascii;
200 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
201
202 // some properties of code point sets
203 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
204 // testing presence of a code point in a set
205 // is just fine, it is O(logN)
206 assert(!b['$']);
207 assert(!b['\u058F']); // Armenian dram sign
208 assert(b['¥']);
209
210 // building fast lookup tables, these guarantee O(1) complexity
211 // 1-level Trie lookup table essentially a huge bit-set ~262Kb
212 auto oneTrie = toTrie!1(b);
213 // 2-level far more compact but typically slightly slower
214 auto twoTrie = toTrie!2(b);
215 // 3-level even smaller, and a bit slower yet
216 auto threeTrie = toTrie!3(b);
217 assert(oneTrie['£']);
218 assert(twoTrie['£']);
219 assert(threeTrie['£']);
220
221 // build the trie with the most sensible trie level
222 // and bind it as a functor
223 auto cyrillicOrArmenian = toDelegate(set);
224 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
225 assert(balance == "ընկեր!");
226 // compatible with bool delegate(dchar)
227 bool delegate(dchar) bindIt = cyrillicOrArmenian;
228
229 // Normalization
230 string s = "Plain ascii (and not only), is always normalized!";
231 assert(s is normalize(s));// is the same string
232
233 string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis
234 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
235 assert(nS == "Äffin");
236 assert(nS != nonS);
237 string composed = "Äffin";
238
239 assert(normalize!NFD(composed) == "A\u0308ffin");
240 // to NFKD, compatibility decomposition useful for fuzzy matching/searching
241 assert(normalize!NFKD("2¹⁰") == "210");
242 }
243 ---
244 $(SECTION Terminology
245 )
246 $(P The following is a list of important Unicode notions
247 and definitions. Any conventions used specifically in this
248 module alone are marked as such. The descriptions are based on the formal
249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250 chapter three of The Unicode Standard Core Specification.)
251 )
252 $(P $(DEF Abstract character) A unit of information used for the organization,
253 control, or representation of textual data.
254 Note that:
255 $(UL
256 $(LI When representing data, the nature of that data
257 is generally symbolic as opposed to some other
258 kind of data (for example, visual).
259 )
260 $(LI An abstract character has no concrete form
261 and should not be confused with a $(S_LINK Glyph, glyph).
262 )
263 $(LI An abstract character does not necessarily
264 correspond to what a user thinks of as a “character”
265 and should not be confused with a $(LREF Grapheme).
266 )
267 $(LI The abstract characters encoded (see Encoded character)
268 are known as Unicode abstract characters.
269 )
270 $(LI Abstract characters not directly
271 encoded by the Unicode Standard can often be
272 represented by the use of combining character sequences.
273 )
274 )
275 )
276 $(P $(DEF Canonical decomposition)
277 The decomposition of a character or character sequence
278 that results from recursively applying the canonical
279 mappings found in the Unicode Character Database
280 and those described in Conjoining Jamo Behavior
281 (section 12 of
282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283 )
284 $(P $(DEF Canonical composition)
285 The precise definition of the Canonical composition
286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287 Unicode Conformance) section 11.
288 Informally it's the process that does the reverse of the canonical
289 decomposition with the addition of certain rules
290 that e.g. prevent legacy characters from appearing in the composed result.
291 )
292 $(P $(DEF Canonical equivalent)
293 Two character sequences are said to be canonical equivalents if
294 their full canonical decompositions are identical.
295 )
296 $(P $(DEF Character) Typically differs by context.
297 For the purpose of this documentation the term $(I character)
298 implies $(I encoded character), that is, a code point having
299 an assigned abstract character (a symbolic meaning).
300 )
301 $(P $(DEF Code point) Any value in the Unicode codespace;
302 that is, the range of integers from 0 to 10FFFF (hex).
303 Not all code points are assigned to encoded characters.
304 )
305 $(P $(DEF Code unit) The minimal bit combination that can represent
306 a unit of encoded text for processing or interchange.
307 Depending on the encoding this could be:
308 8-bit code units in the UTF-8 ($(D char)),
309 16-bit code units in the UTF-16 ($(D wchar)),
310 and 32-bit code units in the UTF-32 ($(D dchar)).
311 $(I Note that in UTF-32, a code unit is a code point
312 and is represented by the D $(D dchar) type.)
313 )
314 $(P $(DEF Combining character) A character with the General Category
315 of Combining Mark(M).
316 $(UL
317 $(LI All characters with non-zero canonical combining class
318 are combining characters, but the reverse is not the case:
319 there are combining characters with a zero combining class.
320 )
321 $(LI These characters are not normally used in isolation
322 unless they are being described. They include such characters
323 as accents, diacritics, Hebrew points, Arabic vowel signs,
324 and Indic matras.
325 )
326 )
327 )
328 $(P $(DEF Combining class)
329 A numerical value used by the Unicode Canonical Ordering Algorithm
330 to determine which sequences of combining marks are to be
331 considered canonically equivalent and which are not.
332 )
333 $(P $(DEF Compatibility decomposition)
334 The decomposition of a character or character sequence that results
335 from recursively applying both the compatibility mappings and
336 the canonical mappings found in the Unicode Character Database, and those
337 described in Conjoining Jamo Behavior, until no characters
338 can be further decomposed.
339 )
340 $(P $(DEF Compatibility equivalent)
341 Two character sequences are said to be compatibility
342 equivalents if their full compatibility decompositions are identical.
343 )
344 $(P $(DEF Encoded character) An association (or mapping)
345 between an abstract character and a code point.
346 )
347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348 having been rasterized or otherwise imaged onto some display surface.
349 )
350 $(P $(DEF Grapheme base) A character with the property
351 Grapheme_Base, or any standard Korean syllable block.
352 )
353 $(P $(DEF Grapheme cluster) Defined as the text between
354 grapheme boundaries as specified by Unicode Standard Annex #29,
355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356 Important general properties of a grapheme:
357 $(UL
358 $(LI The grapheme cluster represents a horizontally segmentable
359 unit of text, consisting of some grapheme base (which may
360 consist of a Korean syllable) together with any number of
361 nonspacing marks applied to it.
362 )
363 $(LI A grapheme cluster typically starts with a grapheme base
364 and then extends across any subsequent sequence of nonspacing marks.
365 A grapheme cluster is most directly relevant to text rendering and
366 processes such as cursor placement and text selection in editing,
367 but may also be relevant to comparison and searching.
368 )
369 $(LI For many processes, a grapheme cluster behaves as if it was a
370 single character with the same properties as its grapheme base.
371 Effectively, nonspacing marks apply $(I graphically) to the base,
372 but do not change its properties.
373 )
374 )
375 $(P This module defines a number of primitives that work with graphemes:
376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377 All of them are using $(I extended grapheme) boundaries
378 as defined in the aforementioned standard annex.
379 )
380 )
381 $(P $(DEF Nonspacing mark) A combining character with the
382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383 )
384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385 )
386 $(SECTION Normalization
387 )
388 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
389 or $(S_LINK Compatibility equivalent, compatibility equivalent)
390 characters in the Unicode Standard make it necessary to have a full, formal
391 definition of equivalence for Unicode strings.
392 String equivalence is determined by a process called normalization,
393 whereby strings are converted into forms which are compared
394 directly for identity. This is the primary goal of the normalization process,
395 see the function $(LREF normalize) to convert into any of
396 the four defined forms.
397 )
398 $(P A very important attribute of the Unicode Normalization Forms
399 is that they must remain stable between versions of the Unicode Standard.
400 A Unicode string normalized to a particular Unicode Normalization Form
401 in one version of the standard is guaranteed to remain in that Normalization
402 Form for implementations of future versions of the standard.
403 )
404 $(P The Unicode Standard specifies four normalization forms.
405 Informally, two of these forms are defined by maximal decomposition
406 of equivalent sequences, and two of these forms are defined
407 by maximal $(I composition) of equivalent sequences.
408 $(UL
409 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
410 canonical decomposition) of a character sequence.)
411 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
412 compatibility decomposition) of a character sequence.)
413 $(LI Normalization Form C (NFC): The canonical composition of the
414 $(S_LINK Canonical decomposition, canonical decomposition)
415 of a coded character sequence.)
416 $(LI Normalization Form KC (NFKC): The canonical composition
417 of the $(S_LINK Compatibility decomposition,
418 compatibility decomposition) of a character sequence.)
419 )
420 )
421 $(P The choice of the normalization form depends on the particular use case.
422 NFC is the best form for general text, since it's more compatible with
423 strings converted from legacy encodings. NFKC is the preferred form for
424 identifiers, especially where there are security concerns. NFD and NFKD
425 are the most useful for internal processing.
426 )
427 $(SECTION Construction of lookup tables
428 )
429 $(P The Unicode standard describes a set of algorithms that
430 depend on having the ability to quickly look up various properties
431 of a code point. Given the codespace of about 1 million $(CODEPOINTS),
432 it is not a trivial task to provide a space-efficient solution for
433 the multitude of properties.
434 )
435 $(P Common approaches such as hash-tables or binary search over
436 sorted code point intervals (as in $(LREF InversionList)) are insufficient.
437 Hash-tables have enormous memory footprint and binary search
438 over intervals is not fast enough for some heavy-duty algorithms.
439 )
440 $(P The recommended solution (see Unicode Implementation Guidelines)
441 is using multi-stage tables that are an implementation of the
442 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
443 keys and a fixed number of stages. For the remainder of the section
444 this will be called a fixed trie. The following describes a particular
445 implementation that is aimed at speed of access at the expense
446 of ideal size savings.
447 )
448 $(P Taking a 2-level Trie as an example the principle of operation is as follows.
449 Split the number of bits in a key (code point, 21 bits) into 2 components
450 (e.g. 13 and 8). The first is the number of bits in the index of the trie
451 and the other is the number of bits in each page of the trie.
452 The layout of the trie is then an array of size 2^^bits-of-index followed
453 by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
454 )
455 $(P The number of pages is variable (but not less than 1)
456 unlike the number of entries in the index. The slots of the index
457 all have to contain a number of a page that is present. The lookup is then
458 just a couple of operations - slice the upper bits,
459 lookup an index for these, take a page at this index and use
460 the lower bits as an offset within this page.
461
462 Assuming that pages are laid out consecutively
463 in one array at $(D pages), the pseudo-code is:
464 )
465 ---
466 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
467 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
468 ---
469 $(P Where if $(D elemsPerPage) is a power of 2 the whole process is
470 a handful of simple instructions and 2 array reads. Subsequent levels
471 of the trie are introduced by recursing on this notion - the index array
472 is treated as values. The number of bits in index is then again
473 split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
474 )
475
476 $(P For completeness a level 1 trie is simply an array.
477 The current implementation takes advantage of bit-packing values
478 when the range is known to be limited in advance (such as $(D bool)).
479 See also $(LREF BitPacked) for enforcing it manually.
480 The major size advantage however comes from the fact
481 that multiple $(B identical pages on every level are merged) by construction.
482 )
483 $(P The process of constructing a trie is more involved and is hidden from
484 the user in the form of the convenience functions $(LREF codepointTrie),
485 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
486 In general a set or built-in AA with $(D dchar) type
487 can be turned into a trie. The trie object in this module
488 is read-only (immutable); it's effectively frozen after construction.
489 )
490 $(SECTION Unicode properties
491 )
492 $(P This is a full list of Unicode properties accessible through $(LREF unicode)
493 with specific helpers per category nested within. Consult the
494 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
495 when in doubt about the contents of a particular set.
496 )
497 $(P General category sets listed below are only accessible with the
498 $(LREF unicode) shorthand accessor.)
499 $(BOOKTABLE $(B General category ),
500 $(TR $(TH Abb.) $(TH Long form)
501 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
502 $(TR $(TD L) $(TD Letter)
503 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation))
504 $(TR $(TD Ll) $(TD Lowercase_Letter)
505 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
506 $(TR $(TD Lm) $(TD Modifier_Letter)
507 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol))
508 $(TR $(TD Lo) $(TD Other_Letter)
509 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol))
510 $(TR $(TD Lt) $(TD Titlecase_Letter)
511 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol))
512 $(TR $(TD Lu) $(TD Uppercase_Letter)
513 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol))
514 $(TR $(TD M) $(TD Mark)
515 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol))
516 $(TR $(TD Mc) $(TD Spacing_Mark)
517 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
518 $(TR $(TD Me) $(TD Enclosing_Mark)
519 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator))
520 $(TR $(TD Mn) $(TD Nonspacing_Mark)
521 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator))
522 $(TR $(TD C) $(TD Other)
523 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
524 $(TR $(TD Cc) $(TD Control) $(TD Pf)
525 $(TD Final_Punctuation) $(TD -) $(TD Any))
526 $(TR $(TD Cf) $(TD Format)
527 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
528 )
529 $(P Sets for other commonly useful properties that are
530 accessible with $(LREF unicode):)
531 $(BOOKTABLE $(B Common binary properties),
532 $(TR $(TH Name) $(TH Name) $(TH Name))
533 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase))
534 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
535 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space))
536 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark))
537 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical))
538 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted))
539 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm))
540 $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation))
541 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
542 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase))
543 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector))
544 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space))
545 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue))
546 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start))
547 $(TR $(TD Hyphen) $(TD Other_Lowercase) )
548 $(TR $(TD ID_Continue) $(TD Other_Math) )
549 )
550 $(P Below is the table with block names accepted by $(LREF unicode.block).
551 Note that the shorthand version $(LREF unicode) requires "In"
552 to be prepended to the names of blocks so as to disambiguate
553 scripts and blocks.
554 )
555 $(BOOKTABLE $(B Blocks),
556 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian))
557 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols))
558 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar))
559 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A))
560 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue))
561 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo))
562 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms))
563 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham))
564 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki))
565 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic))
566 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian))
567 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian))
568 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic))
569 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
570 $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya))
571 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya))
572 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa))
573 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc))
574 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician))
575 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions))
576 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement))
577 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards))
578 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area))
579 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang))
580 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols))
581 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic))
582 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan))
583 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra))
584 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada))
585 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian))
586 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala))
587 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants))
588 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng))
589 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters))
590 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials))
591 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese))
592 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement))
593 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts))
594 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A))
595 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B))
596 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
597 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation))
598 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A))
599 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B))
600 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri))
601 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac))
602 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog))
603 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa))
604 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags))
605 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le))
606 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
607 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet))
608 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols))
609 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri))
610 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil))
611 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu))
612 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana))
613 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
614 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan))
615 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh))
616 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
617 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic))
618 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics))
619 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended))
620 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai))
621 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors))
622 $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement))
623 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions))
624 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms))
625 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
626 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals))
627 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
628 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) )
629 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) )
630 )
631 $(P Below is the table with script names accepted by $(LREF unicode.script)
632 and by the shorthand version $(LREF unicode):)
633 $(BOOKTABLE $(B Scripts),
634 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic))
635 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian))
636 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian))
637 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic))
638 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya))
639 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya))
640 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa))
641 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician))
642 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang))
643 $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
644 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan))
645 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra))
646 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada))
647 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian))
648 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala))
649 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng))
650 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese))
651 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri))
652 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac))
653 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog))
654 $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa))
655 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le))
656 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham))
657 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet))
658 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri))
659 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil))
660 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu))
661 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana))
662 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai))
663 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan))
664 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh))
665 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic))
666 $(TR $(TD Han) $(TD Ogham) $(TD Vai))
667 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi))
668 )
669 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
670 $(BOOKTABLE $(B Hangul syllable type),
671 $(TR $(TH Abb.) $(TH Long form))
672 $(TR $(TD L) $(TD Leading_Jamo))
673 $(TR $(TD LV) $(TD LV_Syllable))
674 $(TR $(TD LVT) $(TD LVT_Syllable) )
675 $(TR $(TD T) $(TD Trailing_Jamo))
676 $(TR $(TD V) $(TD Vowel_Jamo))
677 )
678 References:
679 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
680 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
681 $(HTTP www.unicode.org, The Unicode Consortium),
682 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
$(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
$(HTTP www.unicode.org/uni2book/ch05.pdf,
Unicode Implementation Guidelines),
$(HTTP www.unicode.org/uni2book/ch03.pdf,
Unicode Conformance)
688 Trademarks:
689 Unicode(tm) is a trademark of Unicode, Inc.
690
691 Copyright: Copyright 2013 -
692 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
693 Authors: Dmitry Olshansky
694 Source: $(PHOBOSSRC std/_uni.d)
695 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
696
697 Macros:
698
699 SECTION = <h3><a id="$1">$0</a></h3>
700 DEF = <div><a id="$1"><i>$0</i></a></div>
701 S_LINK = <a href="#$1">$+</a>
702 CODEPOINT = $(S_LINK Code point, code point)
703 CODEPOINTS = $(S_LINK Code point, code points)
704 CHARACTER = $(S_LINK Character, character)
705 CHARACTERS = $(S_LINK Character, characters)
706 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
707 +/
708 module std.uni;
709
710 import std.meta; // AliasSeq
711 import std.range.primitives; // back, ElementEncodingType, ElementType, empty,
712 // front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put,
713 // save
714 import std.traits; // isConvertibleToString, isIntegral, isSomeChar,
715 // isSomeString, Unqual
716 import std.exception;// : enforce;
717 import core.memory; //: pureMalloc, pureRealloc, pureFree;
718 import core.exception; // : onOutOfMemoryError;
719 static import std.ascii;
720 // debug = std_uni;
721
722 debug(std_uni) import std.stdio; // writefln, writeln
723
724 private:
725
version (unittest)
{
private:
    // Non-copyable wrapper around a string that is reachable only through
    // `alias this`; used to check that functions accept such types.
    struct TestAliasedString
    {
        string _s;
        @disable this(this);
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
    }

    // Calls `func` once with a TestAliasedString and once with the plain
    // string (forwarding `args` both times) and reports whether the two
    // results agree.
    bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        // For ranges, compare contents instead of object identity.
        enum comparableAsRanges = is(typeof(equal(viaWrapper, viaString)));
        static if (comparableAsRanges)
            return equal(viaWrapper, viaString);
        else
            return viaWrapper == viaString;
    }
}
753
// Element-wise copy of `src` into `dest`, walking from the last index down
// to index 0. Both slices must have the same length (asserted).
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
760
// Element-wise copy of `src` into `dest`, walking from index 0 upwards.
// Both slices must have the same length (asserted).
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
767
// Compile-time flag consulted by PackedPtrImpl when deciding whether packed
// elements may be accessed through a narrower-typed pointer.
// It is true only for the targets listed here; everything else takes the
// conservative default.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
777
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator (U+2028 LINE SEPARATOR).
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator (U+2029 PARAGRAPH SEPARATOR).
public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line (U+0085 NEXT LINE, NEL).
781
// test the intro example
// Walks through set construction, set algebra, trie building, delegate
// binding and normalization in the order used by the module introduction.
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    // 'A' followed by U+0308 (combining diaeresis): the decomposed spelling
    // of the first letter, not a ligature
    string nonS = "A\u0308ffin";
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
840
// Largest code point value in the Unicode code space (U+10FFFF).
enum lastDchar = 0x10FFFF;
842
// Converts `from` to the integral type `T` with an explicit cast.
// Asserts beforehand that the value compares within [T.min, T.max].
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(T.min <= from && T.max >= from);
    return cast(T) from;
}
849
// Converts `from` to the bit-packed type `T`: asserts that the value fits
// into bitSizeOf!T bits, then wraps it after casting to T's underlying type.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    enum largest = 2^^bitSizeOf!T-1;
    assert(from <= largest);
    alias Raw = TypeOfBitPacked!T;
    return T(cast(Raw) from);
}
856
// Identity overload: source and target types coincide, nothing to convert.
auto force(T, F)(F from)
if (is(F == T))
{
    return from;
}
862
// Repeats the `bits`-wide pattern held in `val` `times` times, packing the
// copies next to each other starting at the least significant bit.
//
// Params:
//   times = number of copies (compile-time)
//   bits  = width of the pattern in bits (compile-time)
//   val   = the pattern; when bits == 1 any non-zero value counts as a set bit
// Returns: a word holding `times` adjacent copies of the pattern.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t, not an int literal: an `int` shift cannot be
            // instantiated for times >= 32 on 64-bit targets
            return val ? (cast(size_t) 1 << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: replicate one fewer and append a single copy
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern, halve the repetitions
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
880
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    // AliasSeq makes `i` a compile-time value, as required for the
    // template argument of replicateBits
    foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 give 3*i set bits, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit position below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
893
// multiple arrays squashed into one memory block
//
// `Types` lists the element type of each array. Array i holds sz[i]
// elements and starts offsets[i] words into `storage`; elements are accessed
// in packed form through slice!i / ptr!i.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates one zero-initialized block large enough for all arrays,
    // array i holding sizes[i] elements. Exactly one size per type is expected.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid-out data: per-array offsets and sizes plus the raw
    // words. Nothing is copied except the two small bookkeeping arrays.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over array n.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer to the start of array n.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of elements in array n.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes array n to new_size elements, moving the arrays stored
        // after it so that their contents are preserved.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                // words are reserved for the added elements on their own,
                // which can be one word more than the minimum
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length += delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Release exactly the words no longer needed. Sizing the
                // release from the element delta alone could free a word
                // that is still partially in use.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array towards the start of storage;
                // the destination lies below the source, so an ascending
                // copy is safe for the overlapping regions
                static if (n != dim-1)
                {
                    immutable tail = offsets[n+1];
                    copyForward(storage[tail .. $],
                        storage[tail - delta .. $ - delta]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole block (default) or of the words that
    // belong to array n.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw storage to `sink` as three
    // comma-separated bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of array n.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
1020
// Resizes the individual levels of a MultiArray and verifies that the
// contents of the other levels survive. The body is a lambda so that it can
// be executed both at compile time and at run time (see the last two lines).
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // NOTE(review): this is the only place a level is shrunk (20 -> 15),
        // and the levels are refilled before being checked, so preservation
        // of data across a shrink is not exercised here
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // run during compilation (CTFE)
    auto rt = dg(); // run at run time
}
1093
// Five-level MultiArray with bit-packed levels of 3, 4, 3 and 6 bits plus a
// bool level; checks that writes to one level do not disturb the others.
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
        MultiArray!(BitPacked!(size_t, 3)
        , BitPacked!(size_t, 4)
        , BitPacked!(size_t, 3)
        , BitPacked!(size_t, 6)
        , bool);
    // extractors for the bit ranges of a 16-bit index that address each level
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6, 9);
    alias fn4 = sliceBits!( 0, 6);
    // expects element i of the level to equal i
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores i into element i of the level
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // grow the levels last-to-first so that every resize except the first
    // has to move the data that follows
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1158
// Number of machine words needed to store `new_len` elements of `_bits`
// bits each, given that every element occupies a power-of-two number of bits.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    // storage width of one element, see PackedArrayView
    enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);
    static if (bits <= wordBits)
    {
        // several elements share a word: divide rounding up
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord;
    }
    else
    {
        // one element spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1174
// True for the types accepted as packed-array elements: bool, integral
// types, character types and BitPacked wrappers.
template isBitPackableType(T)
{
    enum isBitPackableType = is(T == bool) || isIntegral!T
        || isSomeChar!T || isBitPacked!T;
}
1180
1181 //============================================================================
// Selects the PackedArrayViewImpl instance for element type T. The
// per-element storage width is derived from bitSizeOf!T via nextPow2; a
// declared width of 1 is kept as 1.
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum declaredBits = bitSizeOf!T;
    static if (declaredBits > 1)
        private enum size_t storedBits = nextPow2(declaredBits - 1);
    else
        private enum size_t storedBits = 1;
    alias PackedArrayView = PackedArrayViewImpl!(T, storedBits);
}
1190
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instance for element type T, using the same
// storage-width rule as PackedArrayView.
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum declaredBits = bitSizeOf!T;
    static if (declaredBits > 1)
        private enum size_t storedBits = nextPow2(declaredBits - 1);
    else
        private enum size_t storedBits = 1;
    alias PackedPtr = PackedPtrImpl!(T, storedBits);
}
1200
// Unchecked indexed access to values of type T packed `bits` bits apiece
// into consecutive machine words starting at `origin`.
// No length is stored; callers are responsible for staying in range.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: pick word n / factor, shift field n % factor down to
    // bit 0 and mask off the rest.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the target field inside its word, then OR the
    // new value in. For integral T the value must fit the field (checked in
    // the in-contract).
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    body
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the element width equals that of a native unsigned type U,
    // so on little-endian targets (and outside CTFE) the storage is
    // reinterpreted as U[] and indexed directly.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Any other width: always use shift-and-mask access.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord = size_t.sizeof;
    size_t* origin;
}
1307
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` elements starting `ofs` elements past the packed
// pointer's origin; all public indices are relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` elements that begins `offset` elements
    // after `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true if every element in [s, e) is zero.
    // Whole words in the middle of the range are tested in one comparison.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    body
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // No whole word fits between s and e: test element by element.
        // The guard has to use the rounded-up start (as opSliceAssign does);
        // otherwise the head loop below would run to pad_s, past e.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Element idx of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    body
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Stores val into element idx of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    body
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Fills elements [start, end) with val, writing whole words at once
    // wherever the range covers them completely.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    body
    {
        // account for offsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over elements [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` is relative to this view, so it is bounded by the element
        // count alone; adding `ofs` would wrongly reject valid slices of an
        // offsetted view
        assert(to <= limit);
    }
    body
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality; compares whole words when both views are
    // word-aligned and cover a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
            return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an element index up / down to a multiple of factor
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1465
1466
// Random-access range over the index window [from, to) of an indexable
// object that is held by pointer; nothing is copied or owned.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // range primitives
    @property bool empty()const { return to == from; }

    @property size_t length()const { return to - from; }

    auto opDollar()const { return to - from; }

    @property auto front()const { return (*arr)[from]; }

    @property auto back()const { return (*arr)[to-1]; }

    @property auto save() inout { return this; }

    void popFront() { ++from; }

    void popBack() { --to; }

    // Element idx, counted from the start of the window.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    body
    {
        return (*arr)[from+idx];
    }

    // mutators exist only if the underlying object supports item assignment
    static if (assignableIndex)
    {
        void opIndexAssign(Item val, size_t idx)
        in
        {
            assert(idx < to - from);
        }
        body
        {
            (*arr)[from+idx] = val;
        }

        @property void front(Item val) { (*arr)[from] = val; }

        @property void back(Item val) { (*arr)[to-1] = val; }
    }

    // Copy of the whole window.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    // Sub-window [a, b), relative to the current window.
    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[from+start .. from+end] = val;
    }

    // Equal when the lengths match and all items compare equal pairwise.
    bool opEquals(T)(auto ref T arr) const
    {
        if (length != arr.length)
            return false;
        foreach (i; 0 .. length)
            if (this[i] != arr[i])
                return false;
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}

static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
1546
// Builds a read-only window [a, b) over the object `x` points to.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias Window = SliceOverIndexed!(const(T));
    return Window(a, b, x);
}
1552
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Builds a mutable window [a, b) over the object `x` points to.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias Window = SliceOverIndexed!T;
    return Window(a, b, x);
}
1560
// Exercises SliceOverIndexed as a mutable random-access range over int[].
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // front/back access, assignment and popping
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing and re-slicing are relative to the current window
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // equality compares contents; slice assignment writes through
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // NOTE(review): nullArr is never used; the empty slice below is taken
    // over idxArray
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}
1596
// Wraps `items` packed elements of type T that start at `ptr` in a
// PackedArrayView beginning at element 0.
private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow
{
    enum size_t firstElement = 0;
    return inout(PackedArrayView!T)(ptr, firstElement, items);
}
1601
1602
1603 //============================================================================
1604 // Partially unrolled binary search using Shar's method
1605 //============================================================================
1606
// Generates the source of a switch statement that finishes a uniform binary
// search with every remaining halving step written out as a case label.
//
// The generated code is meant to be mixed into a function that has these
// names in scope: `range`, `needle`, `pred`, `idx` (current lower bound) and
// `m` (current step, expected to be smaller than `size`).
//
// Params:
//   size = upper limit for the step handled by the switch; must be a power
//          of two
// Returns: D source text for the switch.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr is undefined for 0, and m is 0 when the range has a single
    // element: enter the switch at case 0 directly in that situation
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    // one case per step size, largest first; each falls through to the next
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx += m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    // final comparison against the element the search has converged on
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1641
// True when `sz` has at most one bit set, i.e. it is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing behind in exactly those cases
    immutable withoutLowestBit = sz & (sz-1);
    return withoutLowestBit == 0;
}
1647
// Lower bound by uniform binary search: returns the number of leading
// elements of `range` for which pred(element, needle) holds.
// The length of `range` must be a power of two (asserted); the first
// element is read unconditionally.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t pos = 0;
    // halve the step each round, advancing whenever the probe satisfies pred
    for (size_t step = range.length/2; step != 0; step /= 2)
    {
        if (pred(range[pos+step], needle))
            pos += step;
    }
    return pred(range[pos], needle) ? pos + 1 : pos;
}
1663
// Same contract as uniformLowerBound, but steps smaller than `max` are
// handled by the switch produced by genUnrolledSwitchSearch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    enum max = 1 << 10;
    // `idx` and `m` must keep these names: the mixed-in code refers to them
    size_t idx = 0;
    size_t m = range.length/2;
    for (; m >= max; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1679
// Lower bound for ranges of arbitrary length, built on top of a search
// (`uniLowerBound`) that only handles power-of-two lengths.
//
// A range whose length is not a power of two is covered by two overlapping
// power-of-two windows: the leading truncPow2(length) elements and a window
// that ends at the last element. One probe decides which window to search.
template sharMethod(alias uniLowerBound)
{
    // Params:
    //   _pred  = binary predicate, "a<b" by default
    //   range  = random-access range, sorted with respect to the predicate
    //   needle = value to look for
    // Returns: the number of leading elements e with pred(e, needle).
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            // The window has to hold the (length - n) tail elements plus
            // element n-1. nextPow2 is strictly greater than its argument,
            // so it is applied to the tail length itself; passing
            // length - n + 1 gives k > length for lengths 3, 7, 15, ...
            size_t k = nextPow2(range.length - n);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
1702
// Lower-bound searches for ranges of any length: the first uses the plain
// loop, the second the loop plus the generated switch.
alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1705
// Compares both Shar searches against std.range's lowerBound for every
// needle around a sorted array of multiples of 5, and for an empty array.
// NOTE(review): only two array lengths are exercised (MAX/5-1 and 0).
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
1729 //============================================================================
1730
1731 @safe
1732 {
// hope to see similar stuff in public interface... once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only
1735
// Replaces dest[from .. to] with the items of `stuff`, growing or shrinking
// `dest` as required.
//
// Params:
//   Policy = void to resize through `dest.length`, otherwise a type whose
//            static `realloc` is used
//   dest   = container to modify in place
//   from   = start of the replaced span
//   to     = end (exclusive) of the replaced span
//   stuff  = replacement items; must expose `length`
// Returns: the index just past the inserted items.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    immutable removed = to - from;
    immutable inserted = stuff.length;
    immutable stuff_end = from + inserted;
    if (inserted == removed)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else if (inserted > removed)
    {// replace increases length
        immutable growth = inserted - removed;
        static if (is(Policy == void))
            dest.length = dest.length+growth;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+growth);
        // open the gap first (tail moves up, last item first), then fill it
        copyBackwards(dest[to .. dest.length-growth],
            dest[to+growth .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else
    {// replace decreases length
        immutable shrinkage = removed - inserted;
        copy(stuff, dest[from .. stuff_end]);
        // close the gap (tail moves down), then give the space back
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-shrinkage]);
        static if (is(Policy == void))
            dest.length = dest.length - shrinkage;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-shrinkage);
    }
    return stuff_end;
}
1770
1771
// Simple storage manipulation policy
// All operations go through the built-in dynamic array facilities.
@trusted private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Independent copy of arr.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // New array of `size` default-initialized items.
    static T[] alloc(T)(size_t size)
    {
        return new T[](size);
    }

    // arr resized to sz items.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with the items of stuff.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value, converted to T via force.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends all items of an input range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference to the array. In debug builds an array of
    // unqualified type is overwritten with a marker value first.
    static void destroy(T)(ref T arr)
        if (isDynamicArray!T)
    {
        debug
        {
            static if (is(Unqual!T == T))
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }
}
1826
1827 // ditto
1828 @trusted struct ReallocPolicy
1829 {
1830 import std.range.primitives : hasLength;
1831
// Copy of arr in memory obtained from alloc.
static T[] dup(T)(const T[] arr)
{
    T[] copied = alloc!T(arr.length);
    copied[] = arr[];
    return copied;
}
1838
// Allocates uninitialized room for `size` items of type T on the C heap.
//
// Returns: a slice of `size` items; null when size is 0.
// Throws: Exception if malloc fails.
// Aborts (assert(0)) if the byte count overflows size_t.
static T[] alloc(T)(size_t size)
{
    import core.stdc.stdlib : malloc;
    import std.exception : enforce;
    import core.checkedint : mulu;

    // malloc(0) is allowed to return null; that must not be reported as
    // running out of memory
    if (size == 0)
        return null;

    bool overflow;
    size_t nbytes = mulu(size, T.sizeof, overflow);
    if (overflow) assert(0);

    auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap");
    return ptr[0 .. size];
}
1852
realloc(T)1853 static T[] realloc(T)(T[] arr, size_t size)
1854 {
1855 import core.stdc.stdlib : realloc;
1856 import std.exception : enforce;
1857 if (!size)
1858 {
1859 destroy(arr);
1860 return null;
1861 }
1862
1863 import core.checkedint : mulu;
1864 bool overflow;
1865 size_t nbytes = mulu(size, T.sizeof, overflow);
1866 if (overflow) assert(0);
1867
1868 auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap");
1869 return ptr[0 .. size];
1870 }
1871
replaceImpl(T,Range)1872 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
1873 {
1874 genericReplace!(ReallocPolicy)(dest, from, to, stuff);
1875 }
1876
1877 static void append(T, V)(ref T[] arr, V value)
1878 if (!isInputRange!V)
1879 {
1880 if (arr.length == size_t.max) assert(0);
1881 arr = realloc(arr, arr.length+1);
1882 arr[$-1] = force!T(value);
1883 }
1884
1885 @safe unittest
1886 {
1887 int[] arr;
1888 ReallocPolicy.append(arr, 3);
1889
1890 import std.algorithm.comparison : equal;
1891 assert(equal(arr, [3]));
1892 }
1893
1894 static void append(T, V)(ref T[] arr, V value)
1895 if (isInputRange!V && hasLength!V)
1896 {
1897 import core.checkedint : addu;
1898 bool overflow;
1899 size_t nelems = addu(arr.length, value.length, overflow);
1900 if (overflow) assert(0);
1901
1902 arr = realloc(arr, nelems);
1903
1904 import std.algorithm.mutation : copy;
1905 copy(value, arr[$-value.length..$]);
1906 }
1907
1908 @safe unittest
1909 {
1910 int[] arr;
1911 ReallocPolicy.append(arr, [1,2,3]);
1912
1913 import std.algorithm.comparison : equal;
1914 assert(equal(arr, [1,2,3]));
1915 }
1916
destroy(T)1917 static void destroy(T)(ref T[] arr)
1918 {
1919 import core.stdc.stdlib : free;
1920 if (arr.ptr)
1921 free(arr.ptr);
1922 arr = null;
1923 }
1924 }
1925
1926 //build hack
1927 alias _RealArray = CowArray!ReallocPolicy;
1928
@safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Replaces orig[from .. to] with toReplace, compares against the
        // expected contents and releases the array afterwards.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
            string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }

        // Copies the arguments into an array owned by ReallocPolicy.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // pure insertion at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // pure removal
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // replacement that shrinks the array
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        // replacements that grow the array
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1958
1959 /**
Tests if T is some kind of a set of code points. Intended for template constraints.
1961 */
public template isCodepointSet(T)
{
    // true exactly when T is an instance of InversionList, whatever its arguments
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1969
1970 /**
1971 Tests if $(D T) is a pair of integers that implicitly convert to $(D V).
1972 The following code must compile for any pair $(D T):
1973 ---
1974 (T x){ V a = x[0]; V b = x[1];}
1975 ---
1976 The following must not compile:
1977 ---
1978 (T x){ V c = x[2];}
1979 ---
1980 */
public template isIntegralPair(T, V=uint)
{
    // both x[0] and x[1] must be usable to initialize a V ...
    static if (!is(typeof((T x){ V a = x[0]; V b = x[1];})))
        enum isIntegralPair = false;
    else
        // ... while a third element x[2] must not be
        enum isIntegralPair = !is(typeof((T x){ V c = x[2]; }));
}
1986
1987
1988 /**
1989 The recommended default type for set of $(CODEPOINTS).
1990 For details, see the current implementation: $(LREF InversionList).
1991 */
1992 public alias CodepointSet = InversionList!GcPolicy;
1993
1994
1995 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1996 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1997 // hence below doesn't seem to work
1998 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1999
2000 /**
2001 The recommended type of $(REF Tuple, std,_typecons)
2002 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
2003 Any interval type should pass $(LREF isIntegralPair) trait.
2004 */
public struct CodepointInterval
{
pure:
    // storage: _tuple[0] is the lower bound, _tuple[1] the upper bound
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    /// Creates the interval with bounds `low` and `high`.
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    /// Compares both bounds against the first two elements of `val`.
    bool opEquals(T)(T val) const
    {
        if (_tuple[0] != val[0])
            return false;
        return _tuple[1] == val[1];
    }

    /// Lower bound of the interval.
    @property ref inout(uint) a() inout
    {
        return _tuple[0];
    }

    /// Upper bound of the interval.
    @property ref inout(uint) b() inout
    {
        return _tuple[1];
    }
}
2025
2026 /**
2027 $(P
2028 $(D InversionList) is a set of $(CODEPOINTS)
2029 represented as an array of open-right [a, b$(RPAREN)
2030 intervals (see $(LREF CodepointInterval) above).
2031 The name comes from the way the representation reads left to right.
2032 For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2033 plus a singular value 60 looks like this:
2034 )
2035 ---
2036 10, 50, 60, 61, 80, 90
2037 ---
2038 $(P
2039 The way to read this is: start with negative meaning that all numbers
smaller than the next one are not present in this set (and positive
2041 - the contrary). Then switch positive/negative after each
2042 number passed from left to right.
2043 )
2044 $(P This way negative spans until 10, then positive until 50,
2045 then negative until 60, then positive until 61, and so on.
2046 As seen this provides a space-efficient storage of highly redundant data
2047 that comes in long runs. A description which Unicode $(CHARACTER)
2048 properties fit nicely. The technique itself could be seen as a variation
2049 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2050 )
2051
2052 $(P Sets are value types (just like $(D int) is) thus they
2053 are never aliased.
2054 )
2055 Example:
2056 ---
2057 auto a = CodepointSet('a', 'z'+1);
2058 auto b = CodepointSet('A', 'Z'+1);
2059 auto c = a;
2060 a = a | b;
2061 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2062 assert(a != c);
2063 ---
2064 $(P See also $(LREF unicode) for simpler construction of sets
2065 from predefined ones.
2066 )
2067
2068 $(P Memory usage is 8 bytes per each contiguous interval in a set.
2069 The value semantics are achieved by using the
2070 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2071 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2072 )
2073
2074 Note:
2075 $(P It's not recommended to rely on the template parameters
2076 or the exact type of a current $(CODEPOINT) set in $(D std.uni).
2077 The type and parameters may change when the standard
2078 allocators design is finalized.
2079 Use $(LREF isCodepointSet) with templates or just stick with the default
2080 alias $(LREF CodepointSet) throughout the whole code base.
2081 )
2082 */
2083 @trusted public struct InversionList(SP=GcPolicy)
2084 {
2085 import std.range : assumeSorted;
2086
2087 /**
2088 Construct from another code point set of any type.
2089 */
this(Set)(Set set) pure
    if (isCodepointSet!Set)
{
    // flatten the [a, b) intervals of the source set into one bounds array
    uint[] bounds;
    for (auto ivals = set.byInterval; !ivals.empty; ivals.popFront())
    {
        auto iv = ivals.front;
        bounds ~= iv.a;
        bounds ~= iv.b;
    }
    data = CowArray!(SP).reuse(bounds);
}
2101
2102 /**
2103 Construct a set from a forward range of code point intervals.
2104 */
this(Range)(Range intervals) pure
    if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
{
    // collect the bounds of every interval through the storage policy
    uint[] bounds;
    foreach (ival; intervals)
    {
        SP.append(bounds, ival.a);
        SP.append(bounds, ival.b);
    }
    data = CowArray!(SP).reuse(bounds);
    // the input may be unordered or overlapping: sort and merge it
    sanitize();
}
2117
2118 //helper function that avoids sanity check to be CTFE-friendly
private static fromIntervals(Range)(Range intervals) pure
{
    import std.algorithm.iteration : map;
    import std.range : roundRobin;
    // interleave lower and upper bounds: a0, b0, a1, b1, ...
    auto lows = intervals.save.map!"a[0]"();
    auto highs = intervals.save.map!"a[1]"();
    InversionList result;
    // no sanitize() call here: the caller's intervals are taken as they are
    result.data = CowArray!(SP)(roundRobin(lows, highs));
    return result;
}
//ditto until sort is CTFE-able
private static fromIntervals()(uint[] intervals...) pure
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    // every (lower, upper) pair must describe a non-empty interval
    for (uint i = 0; i < intervals.length; i += 2)
    {
        immutable lo = intervals[i];
        immutable hi = intervals[i+1];
        assert(lo < hi, text("illegal interval [a, b): ", lo, " > ", hi));
    }
}
body
{
    // bounds are stored as given, without a sanitize() pass
    InversionList result;
    result.data = CowArray!(SP)(intervals);
    return result;
}
2147
2148 /**
2149 Construct a set from plain values of code point intervals.
2150 */
this()(uint[] intervals...)
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    // every (lower, upper) pair must describe a non-empty interval
    for (uint i = 0; i < intervals.length; i += 2)
    {
        immutable lo = intervals[i];
        immutable hi = intervals[i+1];
        assert(lo < hi, text("illegal interval [a, b): ", lo, " > ", hi));
    }
}
body
{
    data = CowArray!(SP)(intervals);
    // intervals may come in any order and may overlap: sort and merge them
    sanitize();
}
2167
2168 ///
@safe unittest
{
    import std.algorithm.comparison : equal;

    // Latin lowercase plus Cyrillic lowercase, given as [a, b) bounds
    auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach (v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach (v; 'а'..'я'+1)
        assert(set[v]);
    //specific order is not required, intervals may intersect
    auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    //the same end result
    assert(set2.byInterval.equal(set.byInterval));
}
2184
2185 /**
2186 Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2187
2188 Example:
2189 -----------
2190 import std.algorithm.comparison : equal;
2191 import std.typecons : tuple;
2192
2193 auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2194
2195 assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2196 -----------
2197 */
@property auto byInterval()
{
    // wrap the underlying storage in a range of CodepointInterval values
    alias Storage = typeof(data);
    return Intervals!Storage(data);
}
2202
2203 /**
2204 Tests the presence of code point $(D val) in this set.
2205 */
bool opIndex(uint val) const
{
    // the <= ensures that searching in interval of [a, b) for 'a' you get an odd count;
    // equivalent to: assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1
    immutable below = sharSwitchLowerBound!"a <= b"(data[], val);
    // an odd result means val falls inside one of the intervals
    return (below & 1) != 0;
}
2212
2213 ///
@safe unittest
{
    // membership test through the index operator
    auto gothic = unicode.Gothic;
    // Gothic letter ahsa
    assert(gothic['\U00010330']);
    // no ascii in Gothic obviously
    assert(!gothic['$']);
}
2222
2223
2224 // Linear scan for $(D ch). Useful only for small sets.
2225 // TODO:
2226 // used internally in std.regex
2227 // should be properly exposed in a public API ?
package auto scanFor()(dchar ch) const
{
    immutable len = data.length;
    // find the first stored bound greater than ch; the parity of its
    // position is the result
    foreach (size_t i; 0 .. len)
    {
        if (ch < data[i])
            return i & 1;
    }
    // ch is not below any stored bound
    return 0;
}
2236
2237 /// Number of $(CODEPOINTS) in this set
@property size_t length()
{
    // add up the widths (b - a) of all intervals
    size_t total = 0;
    for (auto ivals = byInterval; !ivals.empty; ivals.popFront())
    {
        auto iv = ivals.front;
        total += iv.b - iv.a;
    }
    return total;
}
2247
2248 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2249 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2250 //============================================================================
2251 public:
2252 /**
2253 $(P Sets support natural syntax for set algebra, namely: )
2254 $(BOOKTABLE ,
2255 $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2256 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2257 $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2258 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2259 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2260 )
2261 */
This opBinary(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
{
    // '&', '|' and '~' are symmetric, so the operands may be swapped
    enum symmetric = op == "&" || op == "|" || op == "~";

    static if (symmetric && !is(U:dchar) && is(Unqual!U == U))
    {
        // try hard to reuse r-value: rhs is an unqualified set passed by
        // value, so it is updated in place and returned
        mixin("rhs "~op~"= this;");
        return rhs;
    }
    else static if (symmetric || op == "-")
    {
        // single code point, qualified set, or the anti-symmetric '-':
        // work on a copy of this set
        auto result = this;
        mixin("result "~op~"= rhs;");
        return result;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");
}
2298
2299 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    // exercise '&', '-', '~' and '|' on predefined sets
    auto lower = unicode.LowerCase;
    auto upper = unicode.UpperCase;
    auto ascii = unicode.ASCII;

    assert((lower & upper).empty); // no intersection
    auto lowerASCII = lower & ascii;
    assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
    // throw away all of the lowercase ASCII
    assert((ascii - lower).length == 128 - 26);

    auto onlyOneOf = lower ~ ascii;
    assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
    assert(onlyOneOf['$']); // ASCII and not lowercase
    assert(!onlyOneOf['a']); // ASCII and lowercase
    assert(onlyOneOf['я']); // not ASCII but lowercase

    // throw away all cased letters from ASCII
    auto noLetters = ascii - (lower | upper);
    assert(noLetters.length == 128 - 26*2);
}
2325
2326 /// The 'op=' versions of the above overloaded operators.
ref This opOpAssign(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
{
    static if (op == "&")        // intersection
    {
        return intersect(rhs);   // overloaded for sets and single code points
    }
    else static if (op == "-")   // set difference
    {
        return sub(rhs);         // overloaded for sets and single code points
    }
    else static if (op == "|")   // union
    {
        static if (is(U:dchar))
        {
            // a single code point is the interval [rhs, rhs+1)
            addInterval(rhs, rhs+1);
            return this;
        }
        else
        {
            return add(rhs);
        }
    }
    else static if (op == "~")   // symmetric set difference
    {
        // (this | rhs) - (this & rhs)
        auto common = this & rhs;
        this |= rhs;
        this -= common;
        return this;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");
}
2354
2355 /**
2356 Tests the presence of codepoint $(D ch) in this set,
2357 the same as $(LREF opIndex).
2358 */
bool opBinaryRight(string op: "in", U)(U ch) const
    if (is(U : dchar))
{
    // `ch in set` is the same membership test as `set[ch]`
    return opIndex(ch);
}
2364
2365 ///
@safe unittest
{
    // the `in` operator tests membership of a single code point
    assert('я' in unicode.Cyrillic);
    assert(!('z' in unicode.Cyrillic));
}
2371
2372
2373
2374 /**
2375 * Obtains a set that is the inversion of this set.
2376 *
2377 * See_Also: $(LREF inverted)
2378 */
auto opUnary(string op: "!")()
{
    // `!set` is shorthand for the `inverted` property
    return inverted;
}
2383
2384 /**
2385 A range that spans each $(CODEPOINT) in this set.
2386 */
@property auto byCodepoint()
{
    // Input range that walks the intervals of a set one code point at a time.
    @trusted static struct CodepointRange
    {
        this(This set)
        {
            ivals = set.byInterval;
            if (!ivals.empty)
                current = ivals.front.a;
        }

        @property dchar front() const
        {
            return cast(dchar) current;
        }

        // exhausted once no intervals are left
        @property bool empty() const
        {
            return ivals.empty;
        }

        void popFront()
        {
            ++current;
            for (;;)
            {
                // still inside the current interval
                if (current < ivals.front.b)
                    return;
                // ran off its end: continue at the start of the next one, if any
                ivals.popFront();
                if (ivals.empty)
                    return;
                current = ivals.front.a;
            }
        }
    private:
        uint current;
        typeof(This.init.byInterval) ivals;
    }

    return CodepointRange(this);
}
2426
2427 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    // the result of equal must be asserted, otherwise this test checks nothing
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}
2436
2437 /**
$(P Obtain textual representation of this set in form of
2439 open-right intervals and feed it to $(D sink).
2440 )
2441 $(P Used by various standard formatting facilities such as
2442 $(REF formattedWrite, std,_format), $(REF write, std,_stdio),
2443 $(REF writef, std,_stdio), $(REF to, std,_conv) and others.
2444 )
2445 Example:
2446 ---
2447 import std.conv;
2448 assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2449 ---
2450 */
2451
2452 private import std.format : FormatSpec;
2453
2454 /***************************************
2455 * Obtain a textual representation of this InversionList
2456 * in form of open-right intervals.
2457 *
2458 * The formatting flag is applied individually to each value, for example:
2459 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2460 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2461 * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2462 */
void toString(Writer)(scope Writer sink,
    FormatSpec!char fmt) /* const */
{
    import std.format : formatValue;

    // emits "[a..b)" per interval, separated by single spaces;
    // an empty set produces no output at all
    bool first = true;
    for (auto ivals = byInterval; !ivals.empty; ivals.popFront())
    {
        if (!first)
            put(sink, " ");
        first = false;

        auto iv = ivals.front;
        put(sink, "[");
        formatValue(sink, iv.a, fmt);
        put(sink, "..");
        formatValue(sink, iv.b, fmt);
        put(sink, ")");
    }
}
2485
2486 ///
@safe unittest
{
    import std.conv : to;
    import std.format : format;
    import std.uni : unicode;

    // default formatting prints decimal bounds
    assert(unicode.Cyrillic.to!string ==
        "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");

    // The specs '%s' and '%d' are equivalent to the to!string call above.
    assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);

    // the format spec is applied to every bound individually
    assert(format("%#x", unicode.Cyrillic) ==
        "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
        ~"[0xa640..0xa698) [0xa69f..0xa6a0)");

    assert(format("%#X", unicode.Cyrillic) ==
        "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
        ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
}
2507
@safe unittest
{
    import std.exception : assertThrown;
    import std.format : format, FormatException;
    // the "%a" spec is expected to be rejected with a FormatException
    assertThrown!FormatException(format("%a", unicode.ASCII));
}
2514
2515
2516 /**
2517 Add an interval [a, b$(RPAREN) to this set.
2518 */
ref add()(uint a, uint b)
{
    // the marker returned by addInterval is not needed for a single insertion
    addInterval(a, b);
    // returning this set by reference allows chained add calls
    return this;
}
2524
2525 ///
@safe unittest
{
    CodepointSet someSet;
    // add returns the set itself, so calls can be chained
    someSet.add('0', '5').add('A','Z'+1);
    // adjacent interval ['5', '9'+1) extends the digits to '0'..'9'
    someSet.add('5', '9'+1);
    assert(someSet['0']);
    assert(someSet['5']);
    assert(someSet['9']);
    assert(someSet['Z']);
}
2536
2537 private:
2538
package(std) // used from: std.regex.internal.parser
ref intersect(U)(U rhs)
    if (isCodepointSet!U)
{
    Marker cursor;
    for (auto ivals = rhs.byInterval; !ivals.empty; ivals.popFront())
    {
        auto keep = ivals.front;
        // drop everything before the interval, then skip over the interval itself
        cursor = this.dropUpTo(keep.a, cursor);
        cursor = this.skipUpTo(keep.b, cursor);
    }
    // drop whatever remains after the last interval of rhs
    this.dropUpTo(uint.max, cursor);
    return this;
}
2552
ref intersect()(dchar ch)
{
    // linear search for an interval that contains ch
    bool present = false;
    foreach (ival; byInterval)
    {
        if (ival.a <= ch && ch < ival.b)
        {
            present = true;
            break;
        }
    }
    // the result is either the single code point ch or the empty set
    if (present)
        this = This.init.add(ch, ch+1);
    else
        this = This.init;
    return this;
}
2561
@safe unittest
{
    // intersecting with a code point that is not in the set yields an empty set
    assert(unicode.Cyrillic.intersect('-').byInterval.empty);
}
2566
ref sub()(dchar ch)
{
    // single code point removal is delegated to subChar
    return subChar(ch);
}
2571
2572 // same as the above except that skip & drop parts are swapped
package(std) // used from: std.regex.internal.parser
ref sub(U)(U rhs)
    if (isCodepointSet!U)
{
    Marker cursor;
    for (auto ivals = rhs.byInterval; !ivals.empty; ivals.popFront())
    {
        auto gone = ivals.front;
        // skip everything before the interval, then drop the interval itself
        cursor = this.skipUpTo(gone.a, cursor);
        cursor = this.dropUpTo(gone.b, cursor);
    }
    return this;
}
2585
package(std) // used from: std.regex.internal.parser
ref add(U)(U rhs)
    if (isCodepointSet!U)
{
    // each insertion returns a marker that is passed as the hint for the next one
    Marker hint;
    for (auto ivals = rhs.byInterval; !ivals.empty; ivals.popFront())
    {
        auto extra = ivals.front;
        hint = addInterval(extra.a, extra.b, hint);
    }
    return this;
}
2597
2598 // end of mixin-able part
2599 //============================================================================
2600 public:
2601 /**
2602 Obtains a set that is the inversion of this set.
2603
2604 See the '!' $(LREF opUnary) for the same but using operators.
2605 */
@property auto inverted()
{
    InversionList result = this;
    // the inversion of the empty set is the whole code point range
    if (result.data.length == 0)
    {
        result.addInterval(0, lastDchar+1);
        return result;
    }

    // front: toggle the boundary at 0 (remove it if present, insert it otherwise)
    if (result.data[0] == 0)
        genericReplace(result.data, 0, 1, cast(uint[]) null);
    else
        genericReplace(result.data, 0, 0, [0]);

    // back: toggle the boundary at lastDchar+1 in the same way;
    // the last bound is read from this set, not from the modified copy
    if (data[data.length-1] == lastDchar+1)
        genericReplace(result.data,
            result.data.length-1, result.data.length, cast(uint[]) null);
    else
        genericReplace(result.data,
            result.data.length, result.data.length, [lastDchar+1]);

    return result;
}
2627
2628 ///
@safe unittest
{
    // a set and its inversion partition the whole code point range
    auto set = unicode.ASCII;
    // union with the inverse gets all of the code points in the Unicode
    assert((set | set.inverted).length == 0x110000);
    // no intersection with the inverse
    assert((set & set.inverted).empty);
}
2637
2638 /**
2639 Generates string with D source code of unary function with name of
2640 $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
2641 the code is adjusted to be a lambda function.
2642
2643 The function generated tests if the $(CODEPOINT) passed
2644 belongs to this set or not. The result is to be used with string mixin.
2645 The intended usage area is aggressive optimization via meta programming
2646 in parser generators and the like.
2647
2648 Note: Use with care for relatively small or regular sets. It
could end up being slower than just using multi-staged tables.
2650
2651 Example:
2652 ---
2653 import std.stdio;
2654
// construct set directly from [a, b$(RPAREN) intervals
2656 auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2657 writeln(set);
2658 writeln(set.toSourceCode("func"));
2659 ---
2660
2661 The above outputs something along the lines of:
2662 ---
2663 bool func(dchar ch) @safe pure nothrow @nogc
2664 {
2665 if (ch < 45)
2666 {
2667 if (ch == 10 || ch == 11) return true;
2668 return false;
2669 }
2670 else if (ch < 65) return true;
2671 else
2672 {
2673 if (ch < 100) return false;
2674 if (ch < 200) return true;
2675 return false;
2676 }
2677 }
2678 ---
2679 */
string toSourceCode(string funcName="")
{
    import std.algorithm.searching : countUntil;
    import std.array : array;
    import std.format : format;
    // fewer intervals than this are emitted as a flat sequence of tests
    // instead of being bisected further
    enum maxBinary = 3;

    // Emits a braced block that tests `ch` against each interval in order
    // and ends with an unconditional `return false`.
    static string linearScope(R)(R ivals, string indent)
    {
        string result = indent~"{\n";
        string deeper = indent~"    ";
        foreach (ival; ivals)
        {
            immutable span = ival[1] - ival[0];
            assert(span != 0);
            if (span == 1)
            {
                // single code point: plain equality test
                result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
            }
            else if (span == 2)
            {
                // two code points: two equality tests
                result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                    deeper, ival[0], ival[0]+1);
            }
            else
            {
                // wider interval: reject below the lower bound, accept below the upper one
                if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
                    result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
            }
        }
        result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
        return result;
    }

    // Chooses between the flat scan and a further bisection at the middle interval.
    static string binaryScope(R)(R ivals, string indent)
    {
        // time to do unrolled comparisons?
        if (ivals.length < maxBinary)
            return linearScope(ivals, indent);
        else
            return bisect(ivals, ivals.length/2, indent);
    }

    // not used yet; if/else binary search is far better with DMD as of 2.061
    // and GDC is doing fine job either way
    static string switchScope(R)(R ivals, string indent)
    {
        string result = indent~"switch (ch){\n";
        string deeper = indent~"    ";
        foreach (ival; ivals)
        {
            if (ival[0]+1 == ival[1])
            {
                result ~= format("%scase %s: return true;\n",
                    deeper, ival[0]);
            }
            else
            {
                // case range covering the inclusive bounds [a, b-1]
                result ~= format("%scase %s: .. case %s: return true;\n",
                     deeper, ival[0], ival[1]-1);
            }
        }
        result ~= deeper~"default: return false;\n"~indent~"}\n";
        return result;
    }

    // Emits an if / else if / else block split on the interval at `idx`;
    // the intervals before and after it are handled by binaryScope.
    static string bisect(R)(R range, size_t idx, string indent)
    {
        string deeper = indent ~ "    ";
        // bisect on one [a, b) interval at idx
        string result = indent~"{\n";
        // less branch, < a
        result ~= format("%sif (ch < %s)\n%s",
            deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
        // middle point,  >= a && < b
        result ~= format("%selse if (ch < %s) return true;\n",
            deeper, range[idx][1]);
        // greater or equal branch,  >= b
        result ~= format("%selse\n%s",
            deeper, binaryScope(range[idx+1..$], deeper));
        return result~indent~"}\n";
    }

    // signature line; with an empty funcName the word "function" takes the place of the name
    string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
        funcName.empty ? "function" : funcName);
    // materialize the intervals so they can be indexed and sliced as an array
    auto range = byInterval.array();
    // special case first bisection to be on ASCII vs beyond
    auto tillAscii = countUntil!"a[0] > 0x80"(range);
    if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
        code ~= binaryScope(range, "");
    else
        code ~= bisect(range, tillAscii, "");
    return code;
}
2774
2775 /**
2776 True if this set doesn't contain any $(CODEPOINTS).
2777 */
@property bool empty() const
{
    // no stored bounds means no intervals, hence no code points
    return !data.length;
}
2782
2783 ///
@safe unittest
{
    // a default-initialized set contains nothing
    CodepointSet emptySet;
    assert(emptySet.length == 0);
    assert(emptySet.empty);
}
2790
2791 private:
2792 alias This = typeof(this);
2793 alias Marker = size_t;
2794
2795 // a random-access range of integral pairs
// A random-access range of CodepointInterval values layered over a flat
// storage `slice` that holds the bounds as a0, b0, a1, b1, ...
// `start` and `end` are indices into `slice` delimiting the part that is
// still in the range; both advance in steps of two.
static struct Intervals(Range)
{
    // Spans the whole of `sp`.
    this(Range sp)
    {
        slice = sp;
        start = 0;
        end = sp.length;
    }

    // Spans only the bounds sp[s .. e].
    this(Range sp, size_t s, size_t e)
    {
        slice = sp;
        start = s;
        end = e;
    }

    @property auto front()const
    {
        immutable a = slice[start];
        immutable b = slice[start+1];
        return CodepointInterval(a, b);
    }

    //may break sorted property - but we need std.sort to access it
    //hence package protection attribute
    package @property void front(CodepointInterval val)
    {
        slice[start] = val.a;
        slice[start+1] = val.b;
    }

    @property auto back()const
    {
        immutable a = slice[end-2];
        immutable b = slice[end-1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    package @property void back(CodepointInterval val)
    {
        slice[end-2] = val.a;
        slice[end-1] = val.b;
    }

    void popFront()
    {
        start += 2;
    }

    void popBack()
    {
        end -= 2;
    }

    // idx counts intervals relative to the current front
    auto opIndex(size_t idx) const
    {
        immutable a = slice[start+idx*2];
        immutable b = slice[start+idx*2+1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    package void opIndexAssign(CodepointInterval val, size_t idx)
    {
        slice[start+idx*2] = val.a;
        slice[start+idx*2+1] = val.b;
    }

    // s and e count intervals relative to the current front
    auto opSlice(size_t s, size_t e)
    {
        return Intervals(slice, s*2+start, e*2+start);
    }

    // Number of intervals left in the range. It is derived from start/end
    // rather than from the full storage, so that it stays correct after
    // popFront, popBack and opSlice.
    @property size_t length()const {  return (end - start)/2; }

    @property bool empty()const { return start == end; }

    @property auto save(){ return this; }
private:
    size_t start, end;
    Range slice;
}
2879
2880 // called after construction from intervals
2881 // to make sure invariants hold
// Sorts the stored intervals by lower bound and merges intervals that
// overlap or touch, then shrinks the storage to the merged result.
void sanitize()
{
    import std.algorithm.comparison : max;
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    if (data.length == 0)
        return;
    alias Ival = CodepointInterval;
    //intervals wrapper for a _range_ over packed array
    auto ivals = Intervals!(typeof(data[]))(data[]);
    //@@@BUG@@@ can't use "a.a < b.a" see issue 12265
    sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
    // what follows is a variation on stable remove
    // differences:
    // - predicate is binary, and is tested against
    //   the last kept element (at 'i').
    // - predicate mutates lhs (merges rhs into lhs)
    size_t len = ivals.length;
    size_t i = 0; // index of the last kept (possibly merged) interval
    size_t j = 1; // index of the next candidate interval
    while (j < len)
    {
        if (ivals[i].b >= ivals[j].a)
        {
            // candidate overlaps or touches the kept interval: extend the kept one
            ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
            j++;
        }
        else //unmergable
        {
            // check if there is a hole after merges
            // (in the best case we do 0 writes to ivals)
            if (j != i+1)
                ivals[i+1] = ivals[j]; //copy over
            i++;
            j++;
        }
    }
    // number of intervals that survived the merge
    len = i + 1;
    // sanity check: kept intervals are non-empty and strictly separated
    for (size_t k=0; k + 1 < len; k++)
    {
        assert(ivals[k].a < ivals[k].b);
        assert(ivals[k].b < ivals[k+1].a);
    }
    // two bounds per interval
    data.length = len * 2;
}
2927
2928 // special case for normal InversionList
ref subChar(dchar ch)
{
    immutable mark = skipUpTo(ch);
    // only when the bounds on both sides of the marker equal ch has the
    // skip split an interval at ch, meaning ch is inside one of the intervals
    if (mark == data.length || data[mark] != ch || data[mark-1] != ch)
        return this;
    // move the start of the right-hand part past ch, which removes ch
    data[mark] = data[mark]+1;
    return this;
}
2940
2941 //
// Merges the interval [a, b) into the set.
// `hint` is an index into the bounds array from which the search for `a`
// starts; callers in this file pass the marker returned by the previous call.
// Returns a marker that can be passed as `hint` to a following call.
// In the comments below "positive" means inside an interval and
// "negative" means between intervals; an odd index is positive.
Marker addInterval(int a, int b, Marker hint=Marker.init)
in
{
    assert(a <= b);
}
body
{
    import std.range : assumeSorted, SearchPolicy;
    auto range = assumeSorted(data[]);
    size_t pos;
    // index of the first bound (at or after hint) that is not less than a
    size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
    if (a_idx == range.length)
    {
        //  [---+++----++++----++++++]
        //  [                         a  b]
        // every stored bound is below a: simply append the new interval
        data.append(a, b);
        return data.length-1;
    }
    // index of the first bound (at or after a_idx) that is not less than b
    size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
    // replacement bounds for the affected part of the array
    uint[3] buf = void;
    uint to_insert;
    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
    }
    if (b_idx == range.length)
    {
        //  [-------++++++++----++++++-]
        //  [      s     a                 b]
        // every bound from a_idx on is below b and gets replaced
        if (a_idx & 1)// a in positive
        {
            buf[0] = b;
            to_insert = 1;
        }
        else// a in negative
        {
            buf[0] = a;
            buf[1] = b;
            to_insert = 2;
        }
        pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
        return pos - 1;
    }

    // first stored bound that is not less than b
    uint top = data[b_idx];

    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        writefln("a=%s; b=%s; top=%s;", a, b, top);
    }
    if (a_idx & 1)
    {// a in positive
        if (b_idx & 1)// b in positive
        {
            //  [-------++++++++----++++++-]
            //  [       s    a        b    ]
            buf[0] = top;
            to_insert = 1;
        }
        else // b in negative
        {
            //  [-------++++++++----++++++-]
            //  [       s    a   b         ]
            if (top == b)
            {
                // b coincides with the start of the next interval:
                // fuse with it by taking over its end bound
                assert(b_idx+1 < data.length);
                buf[0] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
                return pos - 1;
            }
            buf[0] = b;
            buf[1] = top;
            to_insert = 2;
        }
    }
    else
    { // a in negative
        if (b_idx & 1) // b in positive
        {
            //  [----------+++++----++++++-]
            //  [     a     b              ]
            buf[0] = a;
            buf[1] = top;
            to_insert = 2;
        }
        else// b in negative
        {
            //  [----------+++++----++++++-]
            //  [  a       s      b        ]
            if (top == b)
            {
                // b coincides with the start of the next interval:
                // fuse with it by taking over its end bound
                assert(b_idx+1 < data.length);
                buf[0] = a;
                buf[1] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
                return pos - 1;
            }
            buf[0] = a;
            buf[1] = b;
            buf[2] = top;
            to_insert = 3;
        }
    }
    pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
    debug(std_uni)
    {
        // exactly one argument per format specifier; data[pos] is not
        // printed because pos may be equal to data.length
        writefln("marker idx: %d; length=%d", pos, data.length);
        writeln("inserting ", buf[0 .. to_insert]);
    }
    return pos - 1;
}
3054
3055 //
/*
    Removes from the set everything that lies between the marker `pos`
    and the code point `a`.

    `pos` must sit on an interval start (even index). When `a` falls inside
    an interval, that interval is kept but now starts at `a`; when it falls
    into a gap, all boundaries before it (from `pos` on) are simply removed.

    Returns: `pos`, or the result of genericReplace when everything from
    `pos` to the end of the list was removed.
*/
Marker dropUpTo(uint a, Marker pos=Marker.init)
in
{
    assert(pos % 2 == 0); // the marker must sit on an interval start
}
body
{
    auto tail = assumeSorted!"a <= b"(data[pos .. data.length]);
    if (tail.empty)
        return pos;
    // index of the first boundary past `pos` that is greater than `a`
    size_t cut = pos + tail.lowerBound(a).length;

    debug(std_uni)
    {
        writeln("dropUpTo full length=", data.length);
        writeln(pos,"~~~", cut);
    }
    // every remaining boundary is <= a: nothing after `pos` survives
    if (cut == data.length)
        return genericReplace(data, pos, cut, cast(uint[])[]);
    if (cut % 2 == 0)
    {
        // `a` is in a gap
        //[--+++----++++++----+++++++-------+++...]
        //      |<---si              s  a  t
        genericReplace(data, pos, cut, cast(uint[])[]);
    }
    else
    {
        // `a` is inside an interval, which from now on starts at `a`
        //[--+++----++++++----+++++++------...]
        //      |<---si       s  a  t
        genericReplace(data, pos, cut, [a]);
    }
    return pos;
}
3090
3091 //
/*
    Moves a marker forward from `pos` to the code point `a` without removing
    anything.

    When `a` lies strictly inside an interval [start, top), the interval is
    split into [start, a) and [a, top) and the marker is placed on the second
    part, so the result is always the index of an interval start
    (or data.length).
*/
Marker skipUpTo(uint a, Marker pos=Marker.init)
out(result)
{
    // always the start of an interval
    // (which may be a 0-width one right after a split)
    assert(result % 2 == 0);
}
body
{
    assert(data.length % 2 == 0);
    auto tail = assumeSorted!"a <= b"(data[pos .. data.length]);
    immutable size_t at = pos + tail.lowerBound(a).length;

    // the marker could point to recently removed stuff
    if (at >= data.length)
        return data.length;

    // even index: `a` is not inside an interval, nothing to split
    if ((at & 1) == 0)
        return at;

    // odd index: inside of an interval, check whether a split is needed
    immutable upper = data[at];
    if (upper == a) // `a` is the end of the interval, no split
        return at + 1;
    immutable lower = data[at-1];
    if (lower == a) // `a` is the start of the interval, no split
        return at - 1;
    // split it up into [lower, a) [a, upper)
    genericReplace(data, at, at+1, [a, a, upper]);
    return at + 1; // step over the inserted end to stay on an even index
}
3122
3123 CowArray!SP data;
3124 }
3125
// The textual form of a set lists its intervals as half-open ranges.
@system unittest
{
    import std.conv : to;
    immutable repr = to!string(unicode.ASCII);
    assert(repr == "[0..128)");
}
3131
// Reads the idx-th 24-bit value from ptr one byte at a time; the pedantic
// version for CTFE and for architectures that only allow aligned access.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3143
// Stores the low 24 bits of val as the idx-th 24-bit value at ptr, one byte
// at a time; the pedantic counterpart of safeRead24.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte middle = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}
3161
// Reads the idx-th 24-bit value with a single unaligned 32-bit load
// (x86-like targets). Note that the load touches the byte that follows
// the 3-byte value.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(uint*)(ptr + idx*3);
    version (LittleEndian)
        return word & 0xFF_FFFF; // drop the neighbouring high byte
    else
        return word >> 8;        // drop the neighbouring low byte
}
3171
// Stores val as the idx-th 24-bit value with a single unaligned 32-bit
// read-modify-write (x86-like targets). The byte following the 3-byte slot
// is read and written back unchanged; val is expected to fit in 24 bits.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    auto slot = cast(uint*)(ptr + idx*3);
    immutable uint previous = *slot;
    version (LittleEndian)
        *slot = (previous & 0xFF00_0000) | val;
    else
        *slot = (previous & 0xFF) | (val << 8);
}
3181
// Reads the idx-th 24-bit value at ptr: the unaligned fast path is taken at
// run time on targets with hasUnalignedReads, the byte-wise version is used
// in CTFE and everywhere else.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3189
// Stores val as the idx-th 24-bit value at ptr: the unaligned fast path is
// taken at run time on targets with hasUnalignedReads, the byte-wise version
// is used in CTFE and everywhere else.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3197
/*
    Copy-on-write array of uint whose memory is managed by the policy SP
    (SP.alloc, SP.realloc, SP.append and SP.destroy are used).

    The reference count is kept in one extra slot right after the payload,
    so the backing slice `data` is always one element longer than `length`
    reports. An empty array owns no memory and therefore has no counter.
    Copies share the backing slice; any mutating access through a shared
    instance first duplicates the payload (see dupThisReference).
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` as the payload of a new array; the ref-count slot
    // (initialized to 1) is appended through SP.append.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds an array holding a copy of the elements of `range`.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // an empty source leaves `data` empty, and data[0..$-1] would then
        // be a range violation ($-1 wraps around)
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Same as above for forward ranges that have to be walked to learn
    // their length.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see the other constructor: nothing to copy into an empty array
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // A copy shares the memory, so it only bumps the shared counter.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // Drops this reference; the last one standing releases the memory.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size (the last slot is the ref-count)
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    // Resizes the payload to `len` elements (+ an extra slot for ref-count).
    // A shared array is detached first: the common prefix is copied to
    // freshly allocated memory and the shared counter is decremented.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access never detaches the array.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access detaches a shared array before storing.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // A mutable slice can be written through, so a shared array is
    // detached before the slice is handed out.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice of the shared memory.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like in the other members, so that this template
        // does not depend on what the module happens to import
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        length = length + val.length;
        // with nothing to append `data` may still be empty, in which case
        // $-val.length-1 would wrap around
        if (val.length != 0)
            data[$-val.length-1 .. $-1] = val[];
    }

    // Two arrays are equal when their payloads are; counters are ignored.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this instance's reference and leaves it empty.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches this instance from shared memory: `count` is the current
    // (shared) ref-count, which must be greater than one.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    body
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // payload followed by the ref-count slot; empty when nothing is stored
    uint[] data;
}
3410
// Exercises CowArray with both allocation policies: resizing, appending,
// slicing, equality and the copy-on-write behaviour of shared instances.
3411 @safe unittest// Uint24 tests
3412 {
3413 import std.algorithm.comparison : equal;
3414 import std.algorithm.mutation : copy;
3415 import std.conv : text;
3416 import std.range : iota, chain;
3417 import std.range.primitives : isBidirectionalRange, isOutputRange;
// mutates the argument in place while a copy (u24_c) keeps sharing the old
// contents; the copy must not observe any of the later changes
funcRef(T)3418 void funcRef(T)(ref T u24)
3419 {
3420 u24.length = 2;
3421 u24[1] = 1024;
3422 T u24_c = u24;
3423 assert(u24[1] == 1024);
3424 u24.length = 0;
3425 assert(u24.empty);
3426 u24.append([1, 2]);
3427 assert(equal(u24[], [1, 2]));
3428 u24.append(111);
3429 assert(equal(u24[], [1, 2, 111]));
3430 assert(!u24_c.empty && u24_c[1] == 1024);
3431 u24.length = 3;
3432 copy(iota(0, 3), u24[]);
3433 assert(equal(u24[], iota(0, 3)));
3434 assert(u24_c[1] == 1024);
3435 }
3436
// takes the array by value and checks that copies, assignments and resets
// of those copies behave as independent values
func2(T)3437 void func2(T)(T u24)
3438 {
3439 T u24_2 = u24;
3440 T u24_3;
3441 u24_3 = u24_2;
3442 assert(u24_2 == u24_3);
3443 assert(equal(u24[], u24_2[]));
3444 assert(equal(u24_2[], u24_3[]));
3445 funcRef(u24_3);
3446
3447 assert(equal(u24_3[], iota(0, 3)));
3448 assert(!equal(u24_2[], u24_3[]));
3449 assert(equal(u24_2[], u24[]));
3450 u24_2 = u24_3;
3451 assert(equal(u24_2[], iota(0, 3)));
3452 // to test that passed arg is intact outside
3453 // plus try out opEquals
3454 u24 = u24_3;
3455 u24 = T.init;
3456 u24_3 = T.init;
3457 assert(u24.empty);
3458 assert(u24 == u24_3);
3459 assert(u24 != u24_2);
3460 }
3461
3462 foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
3463 {
3464 alias Range = typeof(CowArray!Policy.init[]);
3465 alias U24A = CowArray!Policy;
// the slice type has to satisfy the usual range concepts
3466 static assert(isForwardRange!Range);
3467 static assert(isBidirectionalRange!Range);
3468 static assert(isOutputRange!(Range, uint));
3469 static assert(isRandomAccessRange!(Range));
3470
3471 auto arr = U24A([42u, 36, 100]);
3472 assert(arr[0] == 42);
3473 assert(arr[1] == 36);
3474 arr[0] = 72;
3475 arr[1] = 0xFE_FEFE;
3476 assert(arr[0] == 72);
3477 assert(arr[1] == 0xFE_FEFE);
3478 assert(arr[2] == 100);
3479 U24A arr2 = arr;
3480 assert(arr2[0] == 72);
3481 arr2[0] = 11;
3482 // test COW-ness
3483 assert(arr[0] == 72);
3484 assert(arr2[0] == 11);
3485 // set this to about 100M to stress-test COW memory management
3486 foreach (v; 0 .. 10_000)
3487 func2(arr);
3488 assert(equal(arr[], [72, 0xFE_FEFE, 100]));
3489
// construction from a range and writing through a partial slice
3490 auto r2 = U24A(iota(0, 100));
3491 assert(equal(r2[], iota(0, 100)), text(r2[]));
3492 copy(iota(10, 170, 2), r2[10 .. 90]);
3493 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
3494 , text(r2[]));
3495 }
3496 }
3497
// Test-only list of the InversionList instantiations (one per allocation
// policy) that the set unittests iterate over.
version(unittest)3498 version (unittest)
3499 {
3500 private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
3501 }
3502
// Checks interval insertion (add) around the edges of existing intervals
// and the skipUpTo/dropUpTo helpers, for every set type in AllSets.
3503 @safe unittest// core set primitives test
3504 {
3505 import std.conv : text;
foreach(CodeList;AllSets)3506 foreach (CodeList; AllSets)
3507 {
3508 CodeList a;
3509 //"plug a hole" test
3510 a.add(10, 20).add(25, 30).add(15, 27);
3511 assert(a == CodeList(10, 30), text(a));
3512
3513 auto x = CodeList.init;
3514 x.add(10, 20).add(30, 40).add(50, 60);
3515
3516 a = x;
3517 a.add(20, 49);//[10, 49) [50, 60)
3518 assert(a == CodeList(10, 49, 50 ,60));
3519
3520 a = x;
3521 a.add(20, 50);
3522 assert(a == CodeList(10, 60), text(a));
3523
3524 // simple unions, mostly edge effects
3525 x = CodeList.init;
3526 x.add(10, 20).add(40, 60);
3527
3528 a = x;
3529 a.add(10, 25); //[10, 25) [40, 60)
3530 assert(a == CodeList(10, 25, 40, 60));
3531
3532 a = x;
3533 a.add(5, 15); //[5, 20) [40, 60)
3534 assert(a == CodeList(5, 20, 40, 60));
3535
3536 a = x;
3537 a.add(0, 10); // [0, 20) [40, 60)
3538 assert(a == CodeList(0, 20, 40, 60));
3539
3540 a = x;
3541 a.add(0, 5); // prepend
3542 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
3543
3544 a = x;
3545 a.add(5, 20);
3546 assert(a == CodeList(5, 20, 40, 60));
3547
3548 a = x;
3549 a.add(3, 37);
3550 assert(a == CodeList(3, 37, 40, 60));
3551
3552 a = x;
3553 a.add(37, 65);
3554 assert(a == CodeList(10, 20, 37, 65));
3555
3556 // some tests on helpers for set intersection
3557 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
3558 a = x;
3559
// skip to the end of [40, 60), then drop everything up to 110
3560 auto m = a.skipUpTo(60);
3561 a.dropUpTo(110, m);
3562 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
3563
3564 a = x;
3565 a.dropUpTo(100);
3566 assert(a == CodeList(100, 120), text(a.data[]));
3567
3568 a = x;
// 50 is inside [40, 60): the skip splits it, the drop removes the rest
3569 m = a.skipUpTo(50);
3570 a.dropUpTo(140, m);
3571 assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
3572 a = x;
3573 a.dropUpTo(60);
3574 assert(a == CodeList(100, 120), text(a.data[]));
3575 }
3576 }
3577
3578
3579 //test constructor to work with any order of intervals
// The CodepointSet constructor takes boundary pairs in any order and
// merges touching or overlapping intervals.
3580 @safe unittest
3581 {
3582 import std.algorithm.comparison : equal;
3583 import std.conv : text, to;
3584 import std.range : chain, iota;
3585 import std.typecons : tuple;
3586 //ensure constructor handles bad ordering and overlap
3587 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
3588 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
3589 assert(ch in c1, to!string(ch));
3590
3591 //contiguous
3592 assert(CodepointSet(1000, 1006, 1006, 1009)
3593 .byInterval.equal([tuple(1000, 1009)]));
3594 //contains
3595 assert(CodepointSet(900, 1200, 1000, 1100)
3596 .byInterval.equal([tuple(900, 1200)]));
3597 //intersect left
3598 assert(CodepointSet(900, 1100, 1000, 1200)
3599 .byInterval.equal([tuple(900, 1200)]));
3600 //intersect right
3601 assert(CodepointSet(1000, 1200, 900, 1100)
3602 .byInterval.equal([tuple(900, 1200)]));
3603
3604 //ditto with extra items at end
3605 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
3606 .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3607 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
3608 .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3609
3610 //"plug a hole" test
3611 auto c2 = CodepointSet(20, 40,
3612 60, 80, 100, 140, 150, 200,
3613 40, 60, 80, 100, 140, 150
3614 );
3615 assert(c2.byInterval.equal([tuple(20, 200)]));
3616
3617 auto c3 = CodepointSet(
3618 20, 40, 60, 80, 100, 140, 150, 200,
3619 0, 10, 15, 100, 10, 20, 200, 220);
3620 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
3621 }
3622
3623
// Union (|), intersection (&), difference (-) and symmetric difference (~)
// of whole sets, each checked in both operand orders where that applies.
3624 @safe unittest
3625 { // full set operations
3626 import std.conv : text;
foreach(CodeList;AllSets)3627 foreach (CodeList; AllSets)
3628 {
3629 CodeList a, b, c, d;
3630
3631 //"plug a hole"
3632 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3633 b.add(40, 60).add(80, 100).add(140, 150);
3634 c = a | b;
3635 d = b | a;
3636 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
3637 assert(c == d, text(c," vs ", d));
3638
3639 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
3640 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
3641 d = b | a;
3642 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
3643 assert(c == d, text(c," vs ", d));
3644
3645 b = CodeList.init.add(10, 20).add(30,100).add(145,200);
3646 c = a | b;//[10, 140) [145, 200)
3647 d = b | a;
3648 assert(c == CodeList(10, 140, 145, 200));
3649 assert(c == d, text(c," vs ", d));
3650
3651 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
3652 c = a | b;//[0, 140) [150, 220)
3653 d = b | a;
3654 assert(c == CodeList(0, 140, 150, 220));
3655 assert(c == d, text(c," vs ", d));
3656
3657
// intersections
3658 a = CodeList.init.add(20, 40).add(60, 80);
3659 b = CodeList.init.add(25, 35).add(65, 75);
3660 c = a & b;
3661 d = b & a;
3662 assert(c == CodeList(25, 35, 65, 75), text(c));
3663 assert(c == d, text(c," vs ", d));
3664
3665 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3666 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
3667 c = a & b;
3668 d = b & a;
3669 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
3670 assert(c == d, text(c," vs ", d));
3671
3672 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3673 b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
3674 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
3675 d = b & a;
3676
3677 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
3678 assert(c == d, text(c, " vs ",d));
3679 assert((c & a) == c);
3680 assert((d & b) == d);
3681 assert((c & d) == d);
3682
3683 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
3684 c = a & b;
3685 d = b & a;
3686 assert(c == CodeList(150, 200), text(c));
3687 assert(c == d, text(c, " vs ",d));
3688 assert((c & a) == c);
3689 assert((d & b) == d);
3690 assert((c & d) == d);
3691
3692 assert((a & a) == a);
3693 assert((b & b) == b);
3694
// differences
3695 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3696 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
3697 c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
3698 d = b - a;// [40, 60) [80, 100) [200, 300)
3699 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
3700 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
3701 assert(c - d == c, text(c-d, " vs ", c));
3702 assert(d - c == d, text(d-c, " vs ", d));
3703 assert(c - c == CodeList.init);
3704 assert(d - d == CodeList.init);
3705
3706 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
3707 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
3708 c = a - b;// [160, 190)
3709 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
3710 assert(c == CodeList(160, 190), text(c));
3711 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
3712 assert(c - d == c, text(c-d, " vs ", c));
3713 assert(d - c == d, text(d-c, " vs ", d));
3714 assert(c - c == CodeList.init);
3715 assert(d - d == CodeList.init);
3716
// symmetric difference
3717 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3718 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
3719 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
3720 d = b ~ a;
3721 assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
3722 text(c));
3723 assert(c == d, text(c, " vs ", d));
3724 }
3725 }
3726
3727 }
3728
// Set operations with a single code point as the right-hand operand.
3729 @safe unittest// vs single dchar
3730 {
3731 import std.conv : text;
3732 CodepointSet a = CodepointSet(10, 100, 120, 200);
// 'A' is 65 and lies strictly inside [10, 100), so that interval is split
3733 assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
3734 assert((a & 'B') == CodepointSet(66, 67));
3735 }
3736
// Iteration over a set by interval and by code point, and membership tests
// through opIndex.
3737 @safe unittest// iteration & opIndex
3738 {
3739 import std.algorithm.comparison : equal;
3740 import std.conv : text;
3741 import std.typecons : tuple, Tuple;
3742
3743 foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
3744 {
3745 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
3746 auto a = CodeList('A','N','a', 'n');
3747 assert(equal(a.byInterval,
3748 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
3749 ), text(a.byInterval));
3750
3751 // same @@@BUG as in issue 8949 ?
// (only compiled when the version identifier bug8949 is set)
version(bug8949)3752 version (bug8949)
3753 {
3754 import std.range : retro;
3755 assert(equal(retro(a.byInterval),
3756 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
3757 ), text(retro(a.byInterval)));
3758 }
3759 auto achr = a.byCodepoint;
3760 assert(equal(achr, arr), text(a.byCodepoint));
3761 foreach (ch; a.byCodepoint)
3762 assert(a[ch]);
3763 auto x = CodeList(100, 500, 600, 900, 1200, 1500);
3764 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
3765 foreach (ch; x.byCodepoint)
3766 assert(x[ch]);
// round trip through the interval range is only checked for CodepointSet
3767 static if (is(CodeList == CodepointSet))
3768 {
3769 auto y = CodeList(x.byInterval);
3770 assert(equal(x.byInterval, y.byInterval));
3771 }
// an empty set yields empty ranges
3772 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
3773 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
3774 }
3775 }
3776
3777 //============================================================================
3778 // Generic Trie template and various ways to build it
3779 //============================================================================
3780
// Debug helper: formats x with std.conv.text, but for anything longer than
// 32 elements only the first and the last 16 are shown, joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum edge = 16;
    if (x.length <= 2 * edge)
        return text(x);
    return text(x[0 .. edge],"~...~", x[x.length-edge .. x.length]);
}
3792
/**
    Maps $(D Key) to a suitable integer index within the range of $(D size_t).
    The index is built by applying the predicates from $(D Prefix) left to
    right and concatenating the bits they produce: before the result of each
    predicate after the first is merged in, the bits gathered so far are
    shifted left by that predicate's $(D bitSize).

    The first (leftmost) predicate thus defines the most significant bits of
    the resulting index.
*/
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t packed = 0;
        foreach (i, Unused; Prefix)
        {
            // make room for the bits of every predicate but the first
            static if (i != 0)
                packed <<= Prefix[i].bitSize;
            packed |= Prefix[i](key);
        }
        return packed;
    }
}
3817
3818 /*
3819 $(D TrieBuilder) is a type used for incremental construction
3820 of $(LREF Trie)s.
3821
3822 See $(LREF buildTrie) for generic helpers built on top of it.
3823 */
// Incremental builder for a multi-level lookup table (Trie). Keys have to be
// supplied in the order of growing index (see errMsg below); slots that are
// skipped are filled with the filler value passed to the constructor. Pages
// that are identical to an earlier page of the same level are stored once.
3824 @trusted private struct TrieBuilder(Value, Key, Args...)
3825 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
3826 {
3827 import std.exception : enforce;
3828
3829 private:
3830 // last index is not stored in table, it is used as an offset to values in a block.
3831 static if (is(Value == bool))// always pack bool
3832 alias V = BitPacked!(Value, 1);
3833 else
3834 alias V = Value;
// number of distinct indices the predicates can produce: the product of
// 2^^bitSize over all of them
deduceMaxIndex(Preds...)3835 static auto deduceMaxIndex(Preds...)()
3836 {
3837 size_t idx = 1;
3838 foreach (v; Preds)
3839 idx *= 2^^v.bitSize;
3840 return idx;
3841 }
3842
3843 static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
3844 {
3845 alias Prefix = Args[1..$];
3846 enum lastPageSize = 2^^Prefix[$-1].bitSize;
3847 enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
// the upper bound rounded up to a whole number of last-level pages
3848 enum roughedMaxIndex =
3849 (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
3850 // check wrap around - if wrapped, use the default deduction rule
3851 enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
3852 deduceMaxIndex!(Prefix)() : roughedMaxIndex;
3853 }
3854 else
3855 {
3856 alias Prefix = Args;
3857 enum maxIndex = deduceMaxIndex!(Prefix)();
3858 }
3859
3860 alias getIndex = mapTrieIndex!(Prefix);
3861
3862 enum lastLevel = Prefix.length-1;
3863 struct ConstructState
3864 {
3865 size_t idx_zeros, idx_ones;
3866 }
3867 // iteration over levels of Trie, each indexes its own level and thus a shortened domain
3868 size_t[Prefix.length] indices;
3869 // default filler value to use
3870 Value defValue;
3871 // this is a full-width index of next item
3872 size_t curIndex;
3873 // all-zeros page index, all-ones page index (+ indicator if there is such a page)
3874 ConstructState[Prefix.length] state;
3875 // the table being constructed
3876 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
3877
3878 @disable this();
3879
3880 //shortcut for index variable at level 'level'
idx(size_t level)3881 @property ref idx(size_t level)(){ return indices[level]; }
3882
3883 // this function assumes no holes in the input so
3884 // indices are going one by one
// writes `numVals` copies of `val` at the current position of `level`,
// spilling to the next page whenever a page boundary is reached
addValue(size_t level,T)3885 void addValue(size_t level, T)(T val, size_t numVals)
3886 {
3887 alias j = idx!level;
3888 enum pageSize = 1 << Prefix[level].bitSize;
3889 if (numVals == 0)
3890 return;
3891 auto ptr = table.slice!(level);
3892 if (numVals == 1)
3893 {
3894 static if (level == Prefix.length-1)
3895 ptr[j] = val;
3896 else
3897 {// can incur narrowing conversion
3898 assert(j < ptr.length);
3899 ptr[j] = force!(typeof(ptr[j]))(val);
3900 }
3901 j++;
3902 if (j % pageSize == 0)
3903 spillToNextPage!level(ptr);
3904 return;
3905 }
3906 // longer row of values
3907 // get to the next page boundary
3908 immutable nextPB = (j + pageSize) & ~(pageSize-1);
3909 immutable n = nextPB - j;// can fill right in this page
3910 if (numVals < n) //fits in current page
3911 {
3912 ptr[j .. j+numVals] = val;
3913 j += numVals;
3914 return;
3915 }
3916 static if (level != 0)//on the first level it always fits
3917 {
3918 numVals -= n;
3919 //write till the end of current page
3920 ptr[j .. j+n] = val;
3921 j += n;
3922 //spill to the next page
3923 spillToNextPage!level(ptr);
3924 // page at once loop
// when a page of default values is already known for this level, the whole
// pages of the run are expressed as repeated references to it on the level
// above instead of being written out
3925 if (state[level].idx_zeros != size_t.max && val == T.init)
3926 {
3927 alias NextIdx = typeof(table.slice!(level-1)[0]);
3928 addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
3929 numVals/pageSize);
3930 ptr = table.slice!level; //table structure might have changed
3931 numVals %= pageSize;
3932 }
3933 else
3934 {
3935 while (numVals >= pageSize)
3936 {
3937 numVals -= pageSize;
3938 ptr[j .. j+pageSize] = val;
3939 j += pageSize;
3940 spillToNextPage!level(ptr);
3941 }
3942 }
3943 if (numVals)
3944 {
3945 // the leftovers, an incomplete page
3946 ptr[j .. j+numVals] = val;
3947 j += numVals;
3948 }
3949 }
3950 }
3951
spillToNextPage(size_t level,Slice)3952 void spillToNextPage(size_t level, Slice)(ref Slice ptr)
3953 {
3954 // last level (i.e. topmost) has 1 "page"
3955 // thus it need not to add a new page on upper level
3956 static if (level != 0)
3957 spillToNextPageImpl!(level)(ptr);
3958 }
3959
3960 // this can re-use the current page if duplicate or allocate a new one
3961 // it also makes sure that previous levels point to the correct page in this level
spillToNextPageImpl(size_t level,Slice)3962 void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
3963 {
3964 alias NextIdx = typeof(table.slice!(level-1)[0]);
3965 NextIdx next_lvl_index;
3966 enum pageSize = 1 << Prefix[level].bitSize;
3967 assert(idx!level % pageSize == 0);
3968 immutable last = idx!level-pageSize;
// the page that has just been completed
3969 const slice = ptr[idx!level - pageSize .. idx!level];
3970 size_t j;
// look for an earlier page with the same contents
3971 for (j=0; j<last; j+=pageSize)
3972 {
3973 if (ptr[j .. j+pageSize] == slice)
3974 {
3975 // get index to it, reuse ptr space for the next block
3976 next_lvl_index = force!NextIdx(j/pageSize);
3977 version (none)
3978 {
3979 import std.stdio : writefln, writeln;
3980 writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]"
3981 ,level
3982 ,indices[level-1], pageSize, j, j+pageSize);
3983 writeln("LEVEL(", level
3984 , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
3985 writeln("LEVEL(", level
3986 , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
3987 }
3988 idx!level -= pageSize; // reuse this page, it is duplicate
3989 break;
3990 }
3991 }
// the loop ran to the end without a match: the completed page is kept
3992 if (j == last)
3993 {
3994 L_allocate_page:
3995 next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
// remember the first page for which ptr.zeros reports true
3996 if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
3997 {
3998 state[level].idx_zeros = next_lvl_index;
3999 }
4000 // allocate next page
4001 version (none)
4002 {
4003 import std.stdio : writefln;
4004 writefln("LEVEL(%s) page allocated: %s"
4005 , level, arrayRepr(slice[0 .. pageSize]));
4006 writefln("LEVEL(%s) index: %s ; page at this index %s"
4007 , level
4008 , next_lvl_index
4009 , arrayRepr(
4010 table.slice!(level)
4011 [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
4012 ));
4013 }
4014 table.length!level = table.length!level + pageSize;
4015 }
4016 L_know_index:
4017 // for the previous level, values are indices to the pages in the current level
4018 addValue!(level-1)(next_lvl_index, 1);
4019 ptr = table.slice!level; //re-load the slice after moves
4020 }
4021
4022 // idx - full-width index to fill with v (full-width index != key)
4023 // fills everything in the range of [curIndex, idx) with filler
putAt(size_t idx,Value v)4024 void putAt(size_t idx, Value v)
4025 {
4026 assert(idx >= curIndex);
4027 immutable numFillers = idx - curIndex;
4028 addValue!lastLevel(defValue, numFillers);
4029 addValue!lastLevel(v, 1);
4030 curIndex = idx + 1;
4031 }
4032
4033 // ditto, but sets the range of [idxA, idxB) to v
putRangeAt(size_t idxA,size_t idxB,Value v)4034 void putRangeAt(size_t idxA, size_t idxB, Value v)
4035 {
4036 assert(idxA >= curIndex);
4037 assert(idxB >= idxA);
4038 size_t numFillers = idxA - curIndex;
4039 addValue!lastLevel(defValue, numFillers);
4040 addValue!lastLevel(v, idxB - idxA);
4041 curIndex = idxB; // open-right
4042 }
4043
// message of the exception thrown by putRange/putValue when an index does
// not grow
4044 enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
4045 "duplicate key->value mapping";
4046
4047 public:
4048 /**
4049 Construct a builder, where $(D filler) is a value
4050 to indicate empty slots (or "not found" condition).
4051 */
this(Value filler)4052 this(Value filler)
4053 {
4054 curIndex = 0;
4055 defValue = filler;
4056 // zeros-page index, ones-page index
// size_t.max marks "no such page seen yet"
4057 foreach (ref v; state)
4058 v = ConstructState(size_t.max, size_t.max);
4059 table = typeof(table)(indices);
4060 // one page per level is a bootstrap minimum
4061 foreach (i, Pred; Prefix)
4062 table.length!i = (1 << Pred.bitSize);
4063 }
4064
4065 /**
4066 Put a value $(D v) into interval as
4067 mapped by keys from $(D a) to $(D b).
4068 All slots prior to $(D a) are filled with
4069 the default filler.
4070 */
putRange(Key a,Key b,Value v)4071 void putRange(Key a, Key b, Value v)
4072 {
4073 auto idxA = getIndex(a), idxB = getIndex(b);
4074 // indexes of key should always grow
4075 enforce(idxB >= idxA && idxA >= curIndex, errMsg);
4076 putRangeAt(idxA, idxB, v);
4077 }
4078
4079 /**
4080 Put a value $(D v) into slot mapped by $(D key).
4081 All slots prior to $(D key) are filled with the
4082 default filler.
4083 */
putValue(Key key,Value v)4084 void putValue(Key key, Value v)
4085 {
4086 import std.conv : text;
4087 auto idx = getIndex(key);
4088 enforce(idx >= curIndex, text(errMsg, " ", idx));
4089 putAt(idx, v);
4090 }
4091
4092 /// Finishes construction of Trie, yielding an immutable Trie instance.
build()4093 auto build()
4094 {
4095 static if (maxIndex != 0) // doesn't cover full range of size_t
4096 {
// pad the remainder of the domain with the filler
4097 assert(curIndex <= maxIndex);
4098 addValue!lastLevel(defValue, maxIndex - curIndex);
4099 }
4100 else
4101 {
4102 if (curIndex != 0 // couldn't wrap around
4103 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
4104 {
// size_t.max - curIndex fillers plus one more cover the rest of the range
// without the count itself overflowing
4105 addValue!lastLevel(defValue, size_t.max - curIndex);
4106 addValue!lastLevel(defValue, 1);
4107 }
4108 // else curIndex already completed the full range of size_t by wrapping around
4109 }
4110 return Trie!(V, Key, maxIndex, Prefix)(table);
4111 }
4112 }
4113
/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
        && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // Args is either `maxIndex, predicates...` or just `predicates...`
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps an already built multi-level table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the $(D key) in this $(D Trie). )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of $(D Trie) keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        // the first predicate indexes the first level directly
        idx = cast(size_t) p[0](key);
        // every level stores the page number for the next one; the next
        // predicate supplies the offset inside that page
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    /// Forwards to the `bytes!n` property of the underlying multi-level table.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by `2^^bitSize` of the n-th predicate, rounded to nearest.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
            /2^^Prefix[n].bitSize;
    }

    /// Forwards to the `store` method of the underlying table, passing it $(D sink).
    void store(OutRange)(scope OutRange sink) const
    if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4204
// Produces a tuple of 'sliceBits' predicates that cut the 'top' bits into
// consecutive pieces of widths 'sizes', left-to-right,
// the most significant bits first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
    {
        // the first piece takes bits [low, top), the rest is cut below `low`
        enum size_t low = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(low, top),
                      GetBitSlicing!(low, sizes[1..$]));
    }
}
4216
// callableWith!T yields a predicate template: it is true for Pred when
// Pred(T.init) compiles and its result type (after unwrapping BitPacked)
// is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false;
    }
}
4230
/*
    Check if $(D Prefix) is a valid set of predicates
    for $(D Trie) template having $(D Key) as the type of keys.
    Every predicate has to pass $(D callableWith!Key), i.e. be callable
    with a single argument of type $(D Key) and return a bit-packable value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    alias acceptsKey = callableWith!Key;
    // TODO: tighten the screws
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix);
}
4242
/*
    Check if $(D Args) is a set of maximum key value followed by valid predicates
    for $(D Trie) template having $(D Key) as the type of keys.

    $(D Args) may also consist of predicates only. An empty $(D Args)
    is never valid.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // A lone argument can only be a predicate. Key has to be passed along
        // explicitly, otherwise the predicate itself is taken for the key type
        // and nothing is actually checked.
        enum isValidArgsForTrie = Args.length == 1
            && isValidPrefixForTrie!(Key, Args);
    }
}
4257
// Adds up all members of the compile-time tuple $(D ints); 0 for an empty tuple.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    foreach (i, _; ints)
    {
        total += ints[i];
    }
    return total;
}
4265
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a $(D CodepointSet). $(D sizes) are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of $(D sizes) must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
        // false is the filler: code points outside of the set map to it
        auto builder = Builder(false);
        foreach (interval; set.byInterval)
        {
            builder.putRange(interval[0], interval[1], true);
        }
        return builder.build();
    }
}
4305
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // same builder type as the one codepointSetTrie instantiates
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4313
/**
    A slightly more general tool for building fixed $(D Trie)
    for the Unicode data.

    Specifically unlike $(D codepointSetTrie) it allows creating mappings
    of $(D dchar) to an arbitrary type $(D T).

    Note: Overload taking $(D CodepointSet)s will naturally convert
    only to bool mapping $(D Trie)s.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        // set of code points ==> bool Trie
        auto codepointTrie(Set)(in Set set)
        if (isCodepointSet!Set)
        {
            // the level sizes must be forwarded explicitly: they cannot be
            // deduced from the argument, and an empty tuple never satisfies
            // the "sum is 21" constraint of codepointSetTrie
            return codepointSetTrie!sizes(set);
        }
    }

    // associative array of code point ==> value,
    // code points absent from map get defValue
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs, value first and code point second
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4354
// Builds a codepointTrie over a user-defined property of Greek code points
// and checks lookups against the direct computation.
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4406
/// Type of Trie as generated by codepointTrie function.
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    // only the type of the result matters here, nothing is built at run time
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4414
// "Less than" for (value, key) tuples: orders them by Pred applied to the key,
// which is the second member of the tuple.
package template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto leftRank = Pred(a[1]);
        auto rightRank = Pred(b[1]);
        return leftRank < rightRank;
    }
}
4424
/**
    The most general utility for construction of $(D Trie)s
    short of using $(D TrieBuilder) directly.

    Provides a number of convenience overloads.
    $(D Args) is tuple of maximum key value followed by
    predicates to construct index from key.

    Alternatively if the first argument is not a value convertible to $(D Key)
    then the whole tuple of $(D Args) is treated as predicates
    and the maximum Key is deduced from predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
    {
        alias Prefix = Args[1..$];
    }
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // for multi-sort: one comparator per predicate, in the order of predicates
    template GetComparators(size_t n)
    {
        static if (n > 0)
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
        else
            alias GetComparators = AliasSeq!();
    }

    /*
        Build $(D Trie) from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps $(D Key) to an integer.

        Note that each element of the range has the value at index 0
        and the key at index 1.

        See_Also: $(REF sort, std,_algorithm),
            $(REF SortedRange, std,_range),
            $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v[1], v[0]);
        return builder.build();
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) from a range of open-right intervals of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !$(D filler) i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value == bool)
        && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
        && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (ival; range)
            builder.putRange(ival[0], ival[1], !filler);
        return builder.build();
    }

    /*
        Same as the Key-Value range overload, except that when $(D unsorted)
        is true the range is first sorted in place with one comparator
        per predicate of $(D Prefix).
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
    if (isInputRange!Range
        && is(typeof(Range.init.front[0]) : Value)
        && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
            multiSort!(Comps)(range);
        return buildTrie(range, filler);
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) simply from an input range of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !$(D filler) i.e. the opposite of filler.
        If no filler provided keys map to true, and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
    if (is(TypeOfBitPacked!Value == bool)
        && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v, !filler);
        return builder.build();
    }

    /*
        If $(D Key) is unsigned integer $(D Trie) could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
    if (isUnsigned!Key)
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (idx, v; array)
        {
            // the array index is a size_t which doesn't implicitly convert
            // to a narrower Key, hence the explicit cast
            builder.putValue(cast(Key) idx, v);
        }
        return builder.build();
    }

    /*
        Builds $(D Trie) from associative array.
        The pairs are collected into an array and sorted before building.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto range = array(zip(map.values, map.keys));
        return buildTrie(range, filler, true); // sort it
    }
}
4560
// Helper used in place of assumeSize to reduce mangled name
// & help DMD inline Trie functors.
// Passes the argument through unchanged (no masking is done) and
// declares that it occupies `bits` bits.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4568
// Same as clamp, but for indexable keys: yields element `idx` of the argument
// unchanged and declares that it occupies `bits` bits.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4574
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform a semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of $(D inp) and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on $(D inp) depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range $(D inp)
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on a
        set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any $(D test)/$(D match).

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4676
/**
    Test if $(D M) is an UTF Matcher for ranges of $(D C).
*/
// NOTE(review): the checks inside the lambda are run-time `assert`s, so an
// `is(...)` expression that evaluates to false does not make the lambda fail
// to compile. As written the trait appears to only require that `s.decoder`
// and `M m;` compile - confirm whether `static assert` was intended.
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    assert(is(typeof(m.match(d)) == bool));
    assert(is(typeof(m.test(d)) == bool));
    // skip is optional, sub-matchers do not provide it
    static if (is(typeof(m.skip(d))))
    {
        assert(is(typeof(m.skip(d)) == bool));
        assert(is(typeof(m.skip(s)) == bool));
    }
    assert(is(typeof(m.match(s)) == bool));
    assert(is(typeof(m.test(s)) == bool));
});
4694
// isUtfMatcher accepts the matcher types produced by utfMatcher,
// for both mutable and immutable code units
@safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}
4704
// How a matcher lookup treats its input range.
enum Mode
{
    alwaysSkip,  // advance past the code point whatever the outcome (skip)
    neverSkip,   // leave the input untouched (test)
    skipOnMatch  // advance only if the code point matched (match)
}
4710
// Mixed into the matchers to route the array overloads of
// match/skip/test to the corresponding range overloads.
mixin template ForwardStrings()
{
    // Calls the member named `fn` with `str` reinterpreted in place as the
    // type that byCodeUnit returns for it; being passed by reference, any
    // advancement done by `fn` is visible through `str`.
    // NOTE(review): relies on that type having the same layout as C[] - confirm.
    private bool fwdStr(string fn, C)(ref C[] str) const pure
    {
        import std.utf : byCodeUnit;
        alias type = typeof(byCodeUnit(str));
        return mixin(fn~"(*cast(type*)&str)");
    }
}
4720
Utf8Matcher()4721 template Utf8Matcher()
4722 {
// an UTF-8 sequence is 1 to 4 code units long
enum validSize(int sz) = sz >= 1 && sz <= 4;

// Reports a malformed sequence; always throws UTFException.
void badEncoding() pure @safe
{
    import std.utf : UTFException;
    throw new UTFException("Invalid UTF-8 sequence");
}
4730
// Each spec is (Value, Key, predicates...): the key is the sequence of
// code units and each predicate picks one unit together with its bit width.
//for 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
//for 2-stage lookup of 2 byte UTF-8 sequences
alias Utf8Spec2 = AliasSeq!(bool, char[2],
    clampIdx!(0, 5), clampIdx!(1, 6));
//ditto for 3 byte
alias Utf8Spec3 = AliasSeq!(bool, char[3],
    clampIdx!(0, 4),
    clampIdx!(1, 6),
    clampIdx!(2, 6)
);
//ditto for 4 byte
alias Utf8Spec4 = AliasSeq!(bool, char[4],
    clampIdx!(0, 3), clampIdx!(1, 6),
    clampIdx!(2, 6), clampIdx!(3, 6)
);
// Trie types for sequences of 1, 2, 3 and 4 code units, in that order
alias Tables = AliasSeq!(
    typeof(TrieBuilder!(AsciiSpec)(false).build()),
    typeof(TrieBuilder!(Utf8Spec2)(false).build()),
    typeof(TrieBuilder!(Utf8Spec3)(false).build()),
    typeof(TrieBuilder!(Utf8Spec4)(false).build())
);
// Trie type for sequences of `size` code units (1-based)
alias Table(int size) = Tables[size-1];

// the low (7 - size) bits set
enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
// the top `size` bits of a byte set
enum encMask(size_t size) = ((1 << size)-1)<<(8-size);
4757
// Strips the 0x80 marker off a continuation code unit and returns the
// remaining 6 bits; anything outside of 0x80 .. 0xBF is reported
// through badEncoding.
char truncate()(char ch) pure @safe
{
    // char arithmetic wraps around, so units below 0x80 end up above 0x40 too
    immutable char payload = cast(char)(ch - 0x80);
    if (payload >= 0x40)
    {
        badEncoding();
        return cast(char) 0;
    }
    return payload;
}
4771
// Encodes ch as UTF-8 and returns the first sz code units with the
// encoding markers removed: the lead unit is masked with leadMask!sz,
// the rest keep their 6 lower bits.
static auto encode(size_t sz)(dchar ch)
if (sz > 1)
{
    import std.utf : encodeUTF = encode;
    char[4] units;
    encodeUTF(units, ch);
    units[0] &= leadMask!sz;
    foreach (i; 1 .. sz)
    {
        units[i] &= 0x3f; //keep 6 lower bits
    }
    char[sz] stripped;
    stripped[] = units[0 .. sz];
    return stripped;
}
4785
// Creates a full UTF-8 matcher for the given set: the set is cut into
// 4 parts by the code point ranges below and a separate trie is built
// for each of them.
auto build(Set)(Set set)
{
    import std.algorithm.iteration : map;
    auto part1 = set & unicode.ASCII;
    auto part2 = set & CodepointSet(0x80, 0x800);
    auto part3 = set & CodepointSet(0x800, 0x1_0000);
    auto part4 = set & CodepointSet(0x1_0000, lastDchar+1);
    // keys are the code units with the encoding markers stripped
    auto trie1 = part1.byCodepoint.map!(cp => cast(char) cp).buildTrie!(AsciiSpec);
    auto trie2 = part2.byCodepoint.map!(cp => encode!2(cp)).buildTrie!(Utf8Spec2);
    auto trie3 = part3.byCodepoint.map!(cp => encode!3(cp)).buildTrie!(Utf8Spec3);
    auto trie4 = part4.byCodepoint.map!(cp => encode!4(cp)).buildTrie!(Utf8Spec4);
    return Impl!(1,2,3,4)(trie1, trie2, trie3, trie4);
}
4800
// Bootstrap UTF-8 static matcher interface
// from 3 primitives: tab!(size), lookup and Sizes
mixin template DefMatcher()
{
    import std.format : format;
    import std.meta : Erase, staticIndexOf;
    // is the 1 code unit (ASCII) case among the handled sizes?
    enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
    // the handled sizes other than 1
    alias UniSizes = Erase!(1, Sizes);

    //generate dispatch code sequence for unicode parts
    // The result is a chain of `if (...) return lookup!(size, mode)(inp); else`
    // statements, one per size in UniSizes, closed by a fallback statement.
    // The code refers to `ch`, `mode` and `inp` of the function it is mixed into.
    static auto genDispatch()
    {
        string code;
        foreach (size; UniSizes)
            code ~= format(q{
                if ((ch & ~leadMask!%d) == encMask!(%d))
                    return lookup!(%d, mode)(inp);
                else
            }, size, size, size);
        static if (Sizes.length == 4) //covers all code unit cases
            code ~= "{ badEncoding(); return false; }";
        else
            code ~= "return false;"; //may be just fine but not covered
        return code;
    }
    enum dispatch = genDispatch();

    // Tests the code point at the front of inp, advancing inp only on a match.
    public bool match(Range)(ref Range inp) const pure
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        enum mode = Mode.skipOnMatch;
        assert(!inp.empty);
        immutable ch = inp[0];
        static if (hasASCII)
        {
            if (ch < 0x80)
            {
                immutable r = tab!1[ch];
                if (r)
                    inp.popFront();
                return r;
            }
            else
                mixin(dispatch);
        }
        else
            mixin(dispatch);
    }

    static if (Sizes.length == 4) // can skip iff can detect all encodings
    {
        // Tests the code point at the front of inp and advances past it
        // regardless of the result.
        public bool skip(Range)(ref Range inp) const pure @trusted
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
        {
            enum mode = Mode.alwaysSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    inp.popFront();
                    return tab!1[ch];
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }
    }

    // Tests the code point at the front of inp without advancing inp.
    public bool test(Range)(ref Range inp) const pure @trusted
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        enum mode = Mode.neverSkip;
        assert(!inp.empty);
        auto ch = inp[0];
        static if (hasASCII)
        {
            if (ch < 0x80)
                return tab!1[ch];
            else
                mixin(dispatch);
        }
        else
            mixin(dispatch);
    }

    // array overloads, forwarded to the range versions above
    bool match(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"match"(str);
    }

    bool skip(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"skip"(str);
    }

    bool test(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"test"(str);
    }

    mixin ForwardStrings;
}
4910
// UTF-8 matcher that owns one lookup table per encoded length listed in Sizes.
struct Impl(Sizes...)
{
    import std.meta : allSatisfy, staticMap;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
    //pick tables for chosen sizes
    alias OurTabs = staticMap!(Table, Sizes);
    OurTabs tables;
    mixin DefMatcher;
    //static dispatch helper UTF size ==> table
    alias tab(int i) = tables[i - 1];

    // A matcher restricted to SizesToPick that keeps a pointer to this object.
    package @property auto subMatcher(SizesToPick...)() @trusted
    {
        return CherryPick!(Impl, SizesToPick)(&this);
    }

    // Looks up the `size` code unit long sequence at the front of inp in the
    // table for that size; inp is advanced according to `mode`.
    // Calls badEncoding on a truncated input, on a bad continuation unit
    // (via truncate) and on an overlong form.
    // NOTE(review): no other well-formedness checks are visible here
    // (e.g. surrogate code points in 3 unit form) - confirm whether intended.
    bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted
    {
        import std.typecons : staticIota;
        if (inp.length < size)
        {
            badEncoding();
            return false;
        }
        // the key for the table: code units with encoding markers stripped
        char[size] needle = void;
        needle[0] = leadMask!size & inp[0];
        foreach (i; staticIota!(1, size))
        {
            needle[i] = truncate(inp[i]);
        }
        //overlong encoding checks
        static if (size == 2)
        {
            //0x80-0x7FF
            //got 6 bits in needle[1], must use at least 8 bits
            //must use at least 2 bits in needle[1]
            if (needle[0] < 2) badEncoding();
        }
        else static if (size == 3)
        {
            //0x800-0xFFFF
            //got 6 bits in needle[2], must use at least 12bits
            //must use 6 bits in needle[1] or anything in needle[0]
            if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
        }
        else static if (size == 4)
        {
            //0x10000-0x10FFFF
            //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
            //must use 5 bits (or above) in needle[1] or anything in needle[0]
            if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
        }
        static if (mode == Mode.alwaysSkip)
        {
            inp.popFrontN(size);
            return tab!size[needle];
        }
        else static if (mode == Mode.neverSkip)
        {
            return tab!size[needle];
        }
        else
        {
            static assert(mode == Mode.skipOnMatch);
            if (tab!size[needle])
            {
                inp.popFrontN(size);
                return true;
            }
            else
                return false;
        }
    }
}
4987
// Sub-matcher for a subset of encoded lengths (Sizes). It owns no tables:
// both the tables and the lookup are reached through a pointer to the parent
// matcher of type I.
struct CherryPick(I, Sizes...)
{
    import std.meta : allSatisfy;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
    // the parent matcher
    I* m;
    // the parent's table for sequences of i code units
    @property ref tab(int i)() const pure { return m.tables[i - 1]; }
    // forwards to the parent's lookup
    bool lookup(int size, Mode mode, Range)(ref Range inp) const pure
    {
        return m.lookup!(size, mode)(inp);
    }
    mixin DefMatcher;
}
5002 }
5003
Utf16Matcher()5004 template Utf16Matcher()
5005 {
// an UTF-16 sequence is 1 or 2 code units long
enum validSize(int sz) = sz >= 1 && sz <= 2;

// Reports a malformed sequence; always throws UTFException.
// Marked @safe like its UTF-8 counterpart.
void badEncoding() pure @safe
{
    import std.utf : UTFException;
    throw new UTFException("Invalid UTF-16 sequence");
}
5013
// Each spec is (Value, Key, predicates...) as expected by TrieBuilder/buildTrie.
// 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
//2-stage BMP
alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
//4-stage - full Unicode
//assume that 0xD800 & 0xDC00 bits are cleared
//thus leaving 10 bit per wchar to worry about
alias UniSpec = AliasSeq!(bool, wchar[2],
    assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
    assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
);
// Trie types that correspond to the specs above
alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5028
// Splits a code point from outside of the BMP (0x1_0000 and above) into
// the payloads of its surrogate pair: ret[0] holds the upper 10 bits and
// ret[1] the lower 10 bits of (ch - 0x1_0000).
auto encode2(dchar ch)
{
    ch -= 0x1_0000;
    assert(ch <= 0xF_FFFF);
    wchar[2] ret;
    //do not put surrogate bits, they are sliced off
    ret[0] = cast(wchar)(ch >> 10);
    // exactly 10 bits per code unit, the same width the lookup side
    // gets with `& 0x3ff`
    ret[1] = cast(wchar)(ch & 0x3FF);
    return ret;
}
5039
// Creates a full UTF-16 matcher for the given set. The set is cut into
// the ASCII part, the rest of the BMP (without 0xD800 .. 0xDFFF) and
// everything else; a separate trie is built for each part.
auto build(Set)(Set set)
{
    import std.algorithm.iteration : map;
    auto asciiPart = set & unicode.ASCII;
    auto bmpPart = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
        - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
    auto astralPart = set - (bmpPart | asciiPart);
    auto asciiTrie = asciiPart.byCodepoint.map!(cp => cast(char) cp).buildTrie!(AsciiSpec);
    auto bmpTrie = bmpPart.byCodepoint.map!(cp => cast(wchar) cp).buildTrie!(BmpSpec);
    // keyed by the surrogate pair payloads, see encode2
    auto astralTrie = astralPart.byCodepoint.map!(cp => encode2(cp)).buildTrie!(UniSpec);
    return Impl!(1,2)(asciiTrie, bmpTrie, astralTrie);
}
5053
//bootstrap full UTF-16 matcher interface from
//sizeFlags, lookupUni and ascii
mixin template DefMatcher()
{
    // Tests the code point at the front of inp, advancing inp only on a match.
    public bool match(Range)(ref Range inp) const pure @trusted
    if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
    {
        enum mode = Mode.skipOnMatch;
        assert(!inp.empty);
        immutable ch = inp[0];
        static if (sizeFlags & 1)
        {
            // ASCII is served by its own table
            if (ch < 0x80)
            {
                if (ascii[ch])
                {
                    inp.popFront();
                    return true;
                }
                else
                    return false;
            }
            return lookupUni!mode(inp);
        }
        else
            return lookupUni!mode(inp);
    }

    // skip is only provided when both sizes are handled
    static if (Sizes.length == 2)
    {
        // Tests the code point at the front of inp and advances past it
        // regardless of the result.
        public bool skip(Range)(ref Range inp) const pure @trusted
        if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
        {
            enum mode = Mode.alwaysSkip;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                if (ch < 0x80)
                {
                    inp.popFront();
                    return ascii[ch];
                }
                else
                    return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }
    }

    // Tests the code point at the front of inp without advancing inp.
    public bool test(Range)(ref Range inp) const pure @trusted
    if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
    {
        enum mode = Mode.neverSkip;
        assert(!inp.empty);
        auto ch = inp[0];
        static if (sizeFlags & 1)
            return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
        else
            return lookupUni!mode(inp);
    }

    // array overloads, forwarded to the range versions above
    bool match(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"match"(str);
    }

    bool skip(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"skip"(str);
    }

    bool test(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"test"(str);
    }

    mixin ForwardStrings; //dispatch strings to range versions
}
5137
// UTF-16 matcher that owns the lookup tables for the encoded lengths
// listed in Sizes (1 and/or 2 code units).
struct Impl(Sizes...)
if (Sizes.length >= 1 && Sizes.length <= 2)
{
private:
    import std.meta : allSatisfy;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1 and 2 code units are possible in UTF-16");
    // bit 0 is set if size 1 is handled, bit 1 if size 2 is
    static if (Sizes.length > 1)
        enum sizeFlags = Sizes[0] | Sizes[1];
    else
        enum sizeFlags = Sizes[0];

    static if (sizeFlags & 1)
    {
        Ascii ascii;
        Bmp bmp;
    }
    static if (sizeFlags & 2)
    {
        Uni uni;
    }
    mixin DefMatcher;

    // A matcher restricted to SizesToPick that keeps a pointer to this object.
    package @property auto subMatcher(SizesToPick...)() @trusted
    {
        return CherryPick!(Impl, SizesToPick)(&this);
    }

    // Looks up the code point at the front of inp: a single code unit goes
    // to the bmp table, a surrogate pair to the uni table.
    // inp is advanced according to `mode`.
    // Calls badEncoding on a lone low surrogate, on a truncated pair and
    // on a high surrogate that is not followed by a low one.
    bool lookupUni(Mode mode, Range)(ref Range inp) const pure
    {
        // wraps around for code units below 0xD800, which thus compare above 0x3FF
        wchar x = cast(wchar)(inp[0] - 0xD800);
        //not a high surrogate
        if (x > 0x3FF)
        {
            //low surrogate
            if (x <= 0x7FF) badEncoding();
            static if (sizeFlags & 1)
            {
                auto ch = inp[0];
                static if (mode == Mode.alwaysSkip)
                    inp.popFront();
                static if (mode == Mode.skipOnMatch)
                {
                    if (bmp[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return bmp[ch];
            }
            else //skip is not available for sub-matchers, so just false
                return false;
        }
        else
        {
            static if (sizeFlags & 2)
            {
                if (inp.length < 2)
                    badEncoding();
                wchar y = cast(wchar)(inp[1] - 0xDC00);
                //not a low surrogate
                if (y > 0x3FF)
                    badEncoding();
                // the key for the table: 10 payload bits of each code unit
                wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                static if (mode == Mode.alwaysSkip)
                    inp.popFrontN(2);
                static if (mode == Mode.skipOnMatch)
                {
                    if (uni[needle])
                    {
                        inp.popFrontN(2);
                        return true;
                    }
                    else
                        return false;
                }
                else
                    return uni[needle];
            }
            else //ditto
                return false;
        }
    }
}
5226
    // Lightweight view over a matcher of type I: it stores only a pointer
    // and forwards table access and lookups to the pointed-to matcher.
    struct CherryPick(I, Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m; // the matcher this view delegates to
        // NOTE(review): sizeFlags is taken from I rather than computed from
        // Sizes, so the view uses the same flags as the full matcher;
        // confirm this is intended.
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            // exposes the ascii table of the underlying matcher to DefMatcher
            @property ref ascii()() const pure{ return m.ascii; }
        }

        // forwards to the underlying matcher's lookupUni with the same mode
        bool lookupUni(Mode mode, Range)(ref Range inp) const pure
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
5248 }
5249
// Builds a matcher for set by calling Utf8Matcher!().build.
private auto utf8Matcher(Set)(Set set) @trusted
{
    return Utf8Matcher!().build(set);
}
5254
// Builds a matcher for set by calling Utf16Matcher!().build.
private auto utf16Matcher(Set)(Set set) @trusted
{
    return Utf16Matcher!().build(set);
}
5259
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the $(D set) for encoding
    that has $(D Char) as code unit.

    Only $(D char) (UTF-8) and $(D wchar) (UTF-16) code units are accepted;
    any other $(D Char) is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set) @trusted
if (isCodepointSet!Set)
{
    // char and wchar must be tested before dchar, because both of them
    // implicitly convert to dchar as well
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // the message is joined with ~ so that no line break or source
        // indentation ends up inside the compiler diagnostic
        static assert(false, "UTF-32 needs no decoding, "
            ~ "and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5280
5281
//a range of code units, packed with index to speed up forward iteration
// s is the array to iterate; offset is the index of the first code unit
// that counts as the front of the resulting range.
package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // the underlying array, never sliced from the front
        size_t idx; // position of the current front inside str
        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        // advancing only bumps the index
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // indexing and length are relative to the current front
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        // the slice keeps str's start and expresses [a, b) through idx and
        // a shortened str
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5307
// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher and two
// of its sub-matchers, checking both the results and how far codec.idx moves.
@safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 = utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
    // 'h'
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // 'i'
    assert(!uni.match(codec));
    assert(asc.test(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
    assert(!asc.match(codec));

    // '!' is not a letter; skip still advances
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    // ' ' is not a letter either
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(utf8.test(codec));
    // seven non-ASCII letters in a row
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match leaves the index where it was
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5362
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers and of their
// sub-matchers on every third letter code point.
@safe unittest
{
    import std.range : stride;
    // Runs test, match and (where the matcher offers it for this range type)
    // skip on the same position, restoring r.idx in between, and checks that
    // all of them agree. Returns the common verdict.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r)
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // the 2-code-unit sub-matcher; it used to be created with !1 as well,
    // which made it a duplicate of bmp and left subMatcher!2 untested
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5411
// cover decode fail cases of Matcher
@system unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): "\0x00" is the escape \0 followed by the literal text
    // "x00", which may not be what was meant; the strings are left untouched
    // because the expected failures depend on these exact bytes.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // every malformed sequence must make test throw; the assert message
        // is a hex dump of the offending bytes
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(ubyte[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5442
/++
    Convenience function to construct optimal configurations for
    packed Trie from any $(D set) of $(CODEPOINTS).

    The parameter $(D level) indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the $(D set) itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // each configuration splits the 21 bits of a code point between levels
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // also reached for level == 0, so the message names the valid range
        static assert(false,
            "Sorry, toTrie supports only levels 1 to 4, use codepointSetTrie directly");
}
5477
/**
    $(P Builds a $(D Trie) with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto t = toTrie!3(set);
    // the returned lambda captures the trie and indexes it with ch
    return (dchar ch) => t[ch];
}
5495
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter $(D sz) indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T)
    but not vice versa. Users have to ensure the value fits in
    the range required and use the $(D cast)
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    // number of bits the stored value is promised to fit in
    enum bitSize = sz;
    T _value;
    // provides the implicit conversion to T
    alias _value this;
}
5517
/*
    Depending on the form of the passed argument $(D bitSizeOf) returns
    the amount of bits required to represent a given type
    or a return type of a given functor.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    // 1) the argument itself advertises a bitSize member
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    // 2) the argument is callable and returns a BitPacked: use that size
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    // 3) otherwise fall back to the full storage size in bits
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5541
/**
    Evaluates to $(D true) when $(D T) is an instantiation of
    $(LREF BitPacked)!(U, x) for some $(D U) and $(D x),
    i.e. when it is suitable for packing, and to $(D false) otherwise.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5553
/**
    Gives the type $(D U) from $(LREF BitPacked)!(U, x)
    or $(D T) itself for every other type.
*/
template TypeOfBitPacked(T)
{
    // the is-expression binds U to the wrapped type when T matches
    static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
        alias TypeOfBitPacked = U;
    else
        alias TypeOfBitPacked = T;
}
5565
/*
    Wrapper, used in definition of custom data structures from $(D Trie) template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within $(D bits) of bits.
*/
struct assumeSize(alias Fn, size_t bits)
{
    // picked up by bitSizeOf
    enum bitSize = bits;
    // calling the type itself simply forwards the argument to Fn
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
5579
/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with $(D Trie) template.

    The slice covers bits [from, to$(RPAREN) of the argument, moved down so
    that bit $(D from) becomes bit 0.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // the result must fit in to-from bits
        assert(result < (1 << to-from));
    }
    body
    {
        static assert(from < to);
        // with from == 0 no shift is needed, only the mask
        static if (from == 0)
            return x & ((1 << to)-1);
        else
            return (x >> from) & ((1<<(to-from))-1);
    }
}
5604
// Lowest 8 bits of x (bits 0..7).
@safe pure nothrow @nogc uint low_8(uint x) { return cast(ubyte) x; }
// Bits 8..15 of x, moved down to bits 0..7.
@safe pure nothrow @nogc uint midlow_8(uint x){ return (x >> 8) & 0xFF; }
5607 alias lo8 = assumeSize!(low_8, 8);
5608 alias mlo8 = assumeSize!(midlow_8, 8);
5609
5610 static assert(bitSizeOf!lo8 == 8);
5611 static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
5612 static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
5613
// Compile-time sequence of the values start, start+1, ... up to but not
// including end; empty when start is not less than end.
template Sequence(size_t start, size_t end)
{
    static if (start < end)
        // head element followed by the sequence for the rest
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    else
        alias Sequence = AliasSeq!();
}
5621
//---- TRIE TESTS ----
// Builds tries of one to four levels over small sets and checks that lookups
// agree with the set each trie was built from.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // prints footprint statistics; compiled to nothing unless the
    // std_uni_stats version is set
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            foreach (i; staticIota!(0, t.table.dim) )
            {
                writefln("lvl%s = %s bytes; %s pages"
                    , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                foreach (i; staticIota!(0, t.table.dim-1) )
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    // the loops below stop before 'z' and 'Z', the end points given to Set
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two-level trie over a set that repeats with a period of 512
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    // only membership is checked here, not absence
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

    // trie keyed by strings, indexed by the first character only
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] redundantS = ["tea", "start", "orange"];
    redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
    // using first char only
    assert(redundantS == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5726
// Trie index function that maps an array to its element at position idx;
// the result is declared to fit in 8*T.sizeof bits.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(in T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5733
// Trie index function that maps an array to its last element;
// the result is declared to fit in 8*T.sizeof bits.
template useLastItem(T)
{
    size_t impl(in T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5739
// Sum of bitSizeOf over all elements of Prefix (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length > 0)
        // size of the head plus the size of the remaining elements
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    else
        enum fullBitSize = 0;
}
5747
// Computes the BitPacked types used for the index levels of a trie:
// one type per element of Prefix except the last one.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5768
5769 //============================================================================
5770
// Three-way comparison of two property names that ignores white space,
// '-' and '_' and applies std.ascii.toLower to every character first.
// Returns the result of std.algorithm.comparison.cmp on the normalized
// names, so 0 means the names are considered equal.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // keeps only the characters that take part in the comparison
    static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';}
    return cmp(
        a.map!toLower.filter!pred,
        b.map!toLower.filter!pred);
}
5782
@safe pure unittest
{
    // '-' is ignored and letter case does not matter, so these compare equal
    assert(!comparePropertyName("foo-bar", "fooBar"));
}
5787
// "Less than" predicate built on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    return comparePropertyName(a, b) < 0;
}
5793
5794 //============================================================================
5795 // Utilities for compression of Unicode code point sets
5796 //============================================================================
5797
// Appends val to arr in a variable-length form of 1, 2 or 3 bytes:
//   val < 128      -> the value itself;
//   val < (1<<13)  -> 0b100 in the top three bits of the first byte, then
//                     the upper 5 bits of val, then its low byte;
//   otherwise      -> 0b101 in the top three bits, then the upper 5 bits of
//                     val followed by its two lower bytes, high byte first.
// Values of (1 << 21) and above are rejected by an assert.
@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 0x80)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < 0x2000)
    {
        arr ~= cast(ubyte)(0x80 | (val >> 8));
        arr ~= cast(ubyte) val;
        return;
    }
    assert(val < 0x20_0000);
    arr ~= cast(ubyte)(0xA0 | (val >> 16));
    arr ~= cast(ubyte)(val >> 8);
    arr ~= cast(ubyte) val;
}
5816
// Reads one variable-length value from arr starting at idx and moves idx
// past it. A first byte without the top bit is the value itself; otherwise
// its low 5 bits are the most significant part and bit 5 tells whether one
// or two more bytes follow. Throws (via enforce) when those bytes are missing.
@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if (lead < 0x80) // no top bit -> [0 .. 127]
        return lead;
    immutable tail = (lead & 0x20) ? 2 : 1; // number of bytes that follow
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint result = lead & 0x1F;
    immutable stop = idx + tail;
    while (idx < stop)
        result = (result << 8) | arr[idx++];
    return result;
}
5831
5832
// Encodes a range of intervals as a byte array. Every boundary is written
// with compressTo as the difference from the boundary written before it
// (starting from 0). An upper boundary equal to lastDchar+1 is not written.
package ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint prev = 0;
    foreach (pair; intervals)
    {
        compressTo(pair[0] - prev, encoded);
        prev = pair[0];
        // till the end of the domain so don't store it
        if (pair[1] == lastDchar+1)
            continue;
        compressTo(pair[1] - prev, encoded);
        prev = pair[1];
    }
    return encoded;
}
5851
// Checks the exact bytes produced by compressIntervals, the values read back
// by decompressFrom, and the round trip through decompressIntervals.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the second interval ends at lastDchar+1, so its end is not stored
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5874
// Creates a range of $(D CodepointInterval) that lazily decodes compressed data.
// data is wrapped in a DecompressedIntervals without being copied.
@safe package auto decompressIntervals(const(ubyte)[] data) pure
{
    return DecompressedIntervals(data);
}
5880
// Forward range of CodepointInterval that decodes a compressed byte stream
// one interval at a time.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // the encoded bytes
    size_t _idx;              // read position in _stream; size_t.max once exhausted
    CodepointInterval _front; // the interval decoded last

    // Stores the stream and decodes the first interval right away.
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // nothing left to decode: mark the range as empty
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // every stored value is relative to the boundary decoded before it
        uint base = _front[1];
        _front[0] = base + decompressFrom(_stream, _idx);
        if (_idx == _stream.length)// odd length ---> till the end
            _front[1] = lastDchar+1;
        else
        {
            base = _front[0];
            _front[1] = base + decompressFrom(_stream, _idx);
        }
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() { return this; }
}
5925
5926 static assert(isInputRange!DecompressedIntervals);
5927 static assert(isForwardRange!DecompressedIntervals);
5928 //============================================================================
5929
version(std_uni_bootstrap)5930 version (std_uni_bootstrap){}
5931 else
5932 {
5933
// helper for looking up code point sets
// Searches table (treated as sorted by name under propertyNameLess) for an
// entry whose name equals `name` under comparePropertyName.
// Returns the index of that entry, or -1 when there is none.
@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto range = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered before name == index of the candidate
    size_t idx = range.lowerBound(name).length;
    if (idx < range.length && comparePropertyName(range[idx], name) == 0)
        return idx;
    return -1;
}
5946
// Looks `name` up in table with findUnicodeSet and, on success, stores the
// set built from the entry's compressed data in dest.
// Returns true when an entry was found; dest is left untouched otherwise.
@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure
{
    auto pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5958
// Loads the set for the property called `name` into target.
// A number of cumulative names ("L", "M", "graphical", "any", ...) are
// assembled here from individual tables; every other name is looked up in
// uniProps.tab. Names are compared with comparePropertyName.
// Returns false only when that final table lookup fails.
@trusted bool loadProperty(Set=CodepointSet, C)
    (in C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    // NOTE(review): this branch unions Co, Lo, No, So and Po, i.e. the
    // "...o" sub-categories of several groups; confirm that this is the
    // intended meaning of "C"/"Other".
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Co);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.No);
        target |= asSet(uniProps.So);
        target |= asSet(uniProps.Po);
    }
    // "graphical": Alphabetic plus marks, numbers, punctuation,
    // space separators (Zs) and symbols
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-made name: defer to the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6057
// CTFE-only helper for checking property names at compile-time
// Returns true when `name` equals, under comparePropertyName, one of the
// names listed below.
// NOTE(review): loadProperty also handles "C"/"Other", which are not listed
// here; confirm whether that omission is intentional.
@safe bool isPrettyPropertyName(C)(in C[] name)
{
    import std.algorithm.searching : find;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    // linear search is enough for a list this short
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}
6077
// ditto, CTFE-only, not optimized
// True when findUnicodeSet locates `name` in table.
@safe private static bool findSetName(alias table, C)(in C[] name)
{
    return findUnicodeSet!table(name) >= 0;
}
6083
// Mixin that provides by-name lookup of code point sets in table, both
// through a run-time call and through compile-time checked member access.
// kind is only used to word the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(in C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet set;
        if (loadUnicodeSet!table(name, set))
            return set;
        // an unknown name is reported with an exception
        throw new Exception("No unicode set for "~kind~" by name "
            ~name.to!string()~" was found.");
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        // the name is validated during compilation, so the load below is
        // only compiled for names known to be in the table
        static if (findSetName!table(name))
        {
            CodepointSet set;
            loadUnicodeSet!table(name, set);
            return set;
        }
        else
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
    }
}
6111
6112 /**
6113 A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6114 a block, script or general category.
6115
6116 It uses well defined standard rules of property name lookup.
6117 This includes fuzzy matching of names, so that
6118 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6119 and yield the same set of white space $(CHARACTERS).
6120 */
6121 @safe public struct unicode
6122 {
6123 /**
6124 Performs the lookup of set of $(CODEPOINTS)
6125 with compile-time correctness checking.
6126 This short-cut version combines 3 searches:
6127 across blocks, scripts, and common binary properties.
6128
6129 Note that since scripts and blocks overlap the
6130 usual trick to disambiguate is used - to get a block use
6131 $(D unicode.InBlockName), to search a script
6132 use $(D unicode.ScriptName).
6133
6134 See_Also: $(LREF block), $(LREF script)
6135 and (not included in this search) $(LREF hangulSyllableType).
6136 */
6137
    static @property auto opDispatch(string name)() pure
    {
        // findAny is evaluated during compilation; an unknown name is a
        // compile error rather than a run-time failure
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }
6145
    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        // (the name below is misspelled on purpose)
        assert(collectException(unicode("InCyrilliac")));
    }
6166
6167 /**
6168 The same lookup across blocks, scripts, or binary properties,
6169 but performed at run-time.
6170 This version is provided for cases where $(D name)
6171 is not known beforehand; otherwise compile-time
6172 checked $(LREF opDispatch) is typically a better choice.
6173
6174 See the $(S_LINK Unicode properties, table of properties) for available
6175 sets.
6176 */
    static auto opCall(C)(in C[] name)
        if (is(C : dchar))
    {
        // the whole run-time lookup is delegated to loadAny
        return loadAny(name);
    }
6182
6183 /**
6184 Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6185
6186 Note:
6187 Here block names are unambiguous as no scripts are searched
6188 and thus to search use simply $(D unicode.block.BlockName) notation.
6189
6190 See $(S_LINK Unicode properties, table of properties) for available sets.
6191 See_Also: $(S_LINK Unicode properties, table of properties).
6192 */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        // supplies opCall and opDispatch bound to the table of blocks
        mixin SetSearcher!(blocks.tab, "block");
    }
6198
6199 ///
6200 @safe unittest
6201 {
6202 // use .block for explicitness
6203 assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
6204 }
6205
/**
    Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

    Use the $(D unicode.script.ScriptName) notation.

    See the $(S_LINK Unicode properties, table of properties) for available
    sets.
*/
struct script
{
    import std.internal.unicode_tables : scripts; // generated file
    // the lookup members come from the SetSearcher mixin over the script table
    mixin SetSearcher!(scripts.tab, "script");
}
6217
///
@safe unittest
{
    auto arabicScript = unicode.script.arabic;
    auto arabicBlock = unicode.block.arabic;
    // there is an intersection between script and block
    // (U+0601 lies in the Arabic block and has the Arabic script property;
    // written as an escape so the literal survives encoding-unaware tools)
    assert(arabicBlock['\u0601']);
    assert(arabicScript['\u0601']);
    // but they are different
    assert(arabicBlock != arabicScript);
    assert(arabicBlock == unicode.inArabic);
    assert(arabicScript == unicode.arabic);
}
6231
/**
    Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

    Other non-binary properties (once supported) follow the same
    notation - $(D unicode.propertyName.propertyValue) for compile-time
    checked access and $(D unicode.propertyName(propertyValue))
    for run-time checked access.

    See the $(S_LINK Unicode properties, table of properties) for available
    sets.
*/
struct hangulSyllableType
{
    import std.internal.unicode_tables : hangul; // generated file
    // the lookup members come from the SetSearcher mixin over the hangul table
    mixin SetSearcher!(hangul.tab, "hangul syllable type");
}
6248
///
@safe unittest
{
    // L here is syllable type not Letter as in unicode.L short-cut;
    // it denotes leading jamo, i.e. the initial consonants of a syllable
    auto leadingJamo = unicode.hangulSyllableType("L");
    // check that some leading jamo are present
    foreach (jamo; '\u1110'..'\u115F')
        assert(leadingJamo[jamo]);
    assert(leadingJamo == unicode.hangulSyllableType.L);
}
6259
6260 private:
6261 alias ucmp = comparePropertyName;
6262
// Compile-time counterpart of loadAny: reports whether `name` denotes any
// known set. Searched in order: pretty property names, the property table,
// scripts, and finally blocks (which require the "In" prefix).
static bool findAny(string name)
{
    import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
    return isPrettyPropertyName(name)
        || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
        // Check the length before slicing, exactly as loadAny does: a name
        // shorter than the "In" prefix plus one character cannot be a block,
        // and slicing it would fail with a range violation instead of
        // letting opDispatch report "No unicode set by name ...".
        || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
            && findSetName!(blocks.tab)(name[2..$]));
}
6270
// Run-time lookup of a set by name. The sources are tried in a fixed order
// and the first one that succeeds wins; an Exception is thrown when none does.
static auto loadAny(Set=CodepointSet, C)(in C[] name) pure
{
    import std.conv : to;
    import std.internal.unicode_tables : blocks, scripts; // generated file

    Set result;

    // 1. properties
    if (loadProperty(name, result))
        return result;

    // 2. scripts
    if (loadUnicodeSet!(scripts.tab)(name, result))
        return result;

    // 3. blocks, which must be spelled with the "In" prefix
    immutable hasBlockPrefix = name.length > 2 && ucmp(name[0 .. 2],"In") == 0;
    if (hasBlockPrefix && loadUnicodeSet!(blocks.tab)(name[2..$], result))
        return result;

    throw new Exception("No unicode set by name "~name.to!string()~" was found.");
}
6283
6284 // FIXME: re-disable once the compiler is fixed
6285 // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6286 //@disable ~this();
6287 }
6288
6289 @safe unittest
6290 {
6291 import std.internal.unicode_tables : blocks, uniProps; // generated file
6292 assert(unicode("InHebrew") == asSet(blocks.Hebrew));
6293 assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
6294 assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
6295 }
6296
6297 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6298
6299 // control - '\r'
6300 enum controlSwitch = `
6301 case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
6302 case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
6303 `;
// TODO: redo most of the Hangul handling algorithmically (for Graphemes too)
// and get rid of the unrolled switches
6306
// True if `ch` lies in the inclusive range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar first = '\U0001F1E6';
    enum dchar last = '\U0001F1FF';
    return first <= ch && ch <= last;
}
6311
/*
    Shared implementation behind graphemeStride, decodeGrapheme and
    Grapheme.valid: consumes exactly one grapheme cluster from the front
    of `range`.

    Params:
        getValue = when true the consumed code points are collected and
                   returned as a Grapheme; when false nothing is collected
                   and the function returns void (the caller only observes
                   how far `range` has advanced)
        range    = range of code points, taken by reference and advanced
                   past the decoded cluster; must not be empty

    The cluster is recognized by a small state machine. The first code point
    is always consumed; what may follow depends on the state it selects
    (CR, regional indicators, Hangul L / V / LVT sequences). Afterwards any
    run of grapheme-extend and spacing-mark code points is appended, except
    after control characters and CR / CR LF, which end the cluster at once.
*/
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // Consume the current code point: record it (only when a value is
        // requested) and advance the range.
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    mixin(controlSwitch);
                        // control characters stand alone: no extension
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                if (ch == '\n')
                    mixin(eat);
                // UAX #29 GB3-GB5: CR (optionally followed by LF) is never
                // extended, just like the other control characters above.
                // Jumping to L_End_Extend here would wrongly glue a
                // following combining mark onto the line break.
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
6435
6436 public: // Public API continues
6437
/++
    Computes the length of the grapheme cluster starting at $(D index).
    Both the resulting length and the $(D index) are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = code unit type, implicitly convertible to $(D dchar)
        input = array of code units containing the text
        index = starting index into $(D input[]); the slice
                $(D input[index .. $]) must not be empty

    Returns:
        length of the grapheme cluster, in code units
+/
size_t graphemeStride(C)(in C[] input, size_t index)
if (is(C : dchar))
{
    auto tail = input[index..$];
    immutable before = tail.length;
    // only the amount consumed matters, so no Grapheme value is built
    genericDecodeGrapheme!(false)(tail);
    immutable after = tail.length;
    return before - after;
}
6459
///
@safe unittest
{
    // two spaces: the one at index 1 is a cluster of its own
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
6471
/++
    Reads one full grapheme cluster from an input range of dchar $(D inp).

    For examples see the $(LREF Grapheme) below.

    Params:
        inp = input range of $(D dchar); it is advanced past the decoded
              cluster and must not be empty

    Returns:
        The decoded cluster as a $(LREF Grapheme).

    Note:
    This function modifies $(D inp) and thus $(D inp)
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
    if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar))
{
    // `true` selects the variant that collects the code points into a Grapheme
    return genericDecodeGrapheme!true(inp);
}
6486
6487 @system unittest
6488 {
6489 import std.algorithm.comparison : equal;
6490
6491 Grapheme gr;
6492 string s = " \u0020\u0308 ";
6493 gr = decodeGrapheme(s);
6494 assert(gr.length == 1 && gr[0] == ' ');
6495 gr = decodeGrapheme(s);
6496 assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
6497 s = "\u0300\u0308\u1100";
6498 assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
6499 assert(equal(decodeGrapheme(s)[], "\u1100"));
6500 s = "\u11A8\u0308\uAC01";
6501 assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
6502 assert(equal(decodeGrapheme(s)[], "\uAC01"));
6503 }
6504
/++
    $(P Iterate a string by grapheme.)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    Params:
        range = input range of $(D dchar) to split into grapheme clusters

    Returns:
        An input range of $(LREF Grapheme); it is also a forward range
        when $(D range) is one.

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        // the cluster currently at the front; zero length means "exhausted"
        private Grapheme _front;

        @property bool empty()
        {
            return _front.length == 0;
        }

        @property Grapheme front()
        {
            return _front;
        }

        void popFront()
        {
            // decode the next cluster, or fall back to the empty Grapheme
            // once the underlying range has run dry
            _front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    // prime the range so that `front` is valid right away
    auto graphemes = Result!(Range)(range);
    graphemes.popFront();
    return graphemes;
}
6551
6552 ///
6553 @safe unittest
6554 {
6555 import std.algorithm.comparison : equal;
6556 import std.range.primitives : walkLength;
6557 import std.range : take, drop;
6558 auto text = "noe\u0308l"; // noël using e + combining diaeresis
6559 assert(text.walkLength == 5); // 5 code points
6560
6561 auto gText = text.byGrapheme;
6562 assert(gText.walkLength == 4); // 4 graphemes
6563
6564 assert(gText.take(3).equal("noe\u0308".byGrapheme));
6565 assert(gText.drop(3).equal("l".byGrapheme));
6566 }
6567
// For testing non-forward-range input ranges:
// wraps a string but exposes only empty/front/popFront (no `save`),
// so the wrapper is an input range and nothing more.
version (unittest)
private static struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}
6578
6579 @system unittest
6580 {
6581 import std.algorithm.comparison : equal;
6582 import std.array : array;
6583 import std.range : retro;
6584 import std.range.primitives : walkLength;
6585 assert("".byGrapheme.walkLength == 0);
6586
6587 auto reverse = "le\u0308on";
6588 assert(reverse.walkLength == 5);
6589
6590 auto gReverse = reverse.byGrapheme;
6591 assert(gReverse.walkLength == 4);
6592
6593 foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
6594 {
6595 assert(text.walkLength == 5);
6596 static assert(isForwardRange!(typeof(text)));
6597
6598 auto gText = text.byGrapheme;
6599 static assert(isForwardRange!(typeof(gText)));
6600 assert(gText.walkLength == 4);
6601 assert(gText.array.retro.equal(gReverse));
6602 }
6603
6604 auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
6605 static assert(!isForwardRange!(typeof(nonForwardRange)));
6606 assert(nonForwardRange.walkLength == 4);
6607 }
6608
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P Acts as the identity function when given a range of code points.)

    Params:
        range = input range of $(LREF Grapheme)s (or of code points)

    Returns:
        A range of $(D dchar) that walks every grapheme of $(D range) in
        order; a forward range when $(D range) is one.
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // position inside the grapheme currently at the front of _range
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // still inside the current grapheme?
            if (++i < _range.front.length)
                return;

            // current grapheme exhausted: move on to the next one
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
6658
/// Ditto
Range byCodePoint(Range)(Range range)
    if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // already a range of code points: hand it back unchanged
    return range;
}
6665
6666 ///
6667 @safe unittest
6668 {
6669 import std.array : array;
6670 import std.conv : text;
6671 import std.range : retro;
6672
6673 string s = "noe\u0308l"; // noël
6674
6675 // reverse it and convert the result to a string
6676 string reverse = s.byGrapheme
6677 .array
6678 .retro
6679 .byCodePoint
6680 .text;
6681
6682 assert(reverse == "le\u0308on"); // lëon
6683 }
6684
6685 @system unittest
6686 {
6687 import std.algorithm.comparison : equal;
6688 import std.range.primitives : walkLength;
6689 assert("".byGrapheme.byCodePoint.equal(""));
6690
6691 string text = "noe\u0308l";
6692 static assert(is(typeof(text.byCodePoint) == string));
6693
6694 auto gText = InputRangeString(text).byGrapheme;
6695 static assert(!isForwardRange!(typeof(gText)));
6696
6697 auto cpText = gText.byCodePoint;
6698 static assert(!isForwardRange!(typeof(cpText)));
6699
6700 assert(cpText.walkLength == text.walkLength);
6701 }
6702
6703 @trusted:
6704
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme)
    always refer to distinct objects. In most actual scenarios a $(D Grapheme)
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@trusted struct Grapheme
{
    import std.traits : isDynamicArray;

public:
    /// Constructs a Grapheme by appending the given $(CHARACTERS) in order.
    this(C)(in C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    /// ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const pure nothrow @nogc
    {
        assert(index < length);
        // pick the heap buffer or the inline buffer depending on the mode
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) $(D ch) at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) pure nothrow @nogc
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    @system SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) pure nothrow @nogc
    {
        // the slice keeps a pointer to this object, hence @system
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    @system SliceOverIndexed!Grapheme opSlice() pure nothrow @nogc
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const pure nothrow @nogc
    {
        // 0x7F is the same value as small_mask: the low 7 bits of slen_
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) $(D ch) to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(D valid).

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch)
    {
        static if (op == "~")
        {
            if (!isBig)
            {
                // inline buffer full: switch to the heap representation
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // grow the heap buffer by `grow` code points, with
                // overflow-checked size arithmetic
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                // 3 bytes per code point; one slot more than cap_ is
                // requested, as in convertToBig and the postblit
                // NOTE(review): presumably slack for the 24-bit accessors - confirm
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) pureRealloc(ptr_, nelems);
                if (ptr_ is null) onOutOfMemoryError();
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @system unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme.
    ref opOpAssign(string op, Input)(Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            // decode as dchar and reuse the single code point append
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid $(D Grapheme).

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid iff decoding a single cluster consumes every code point
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy in heap mode gets its own buffer, which is what
    // gives Grapheme its value semantics. Inline mode needs no work.
    this(this) pure @nogc nothrow
    {
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) pureMalloc(raw_cap);
            if (p is null) onOutOfMemoryError();
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if there is one.
    ~this() pure @nogc nothrow
    {
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // bytes available for inline storage: the whole struct minus the
    // trailing slen_ byte
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // inline capacity in code points, at 3 bytes each
    enum small_cap = small_bytes/3;
    // slen_ layout: high bit set = heap ("big") mode, low 7 bits = inline length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // inline ("small") representation; slen_ is declared after small_
        // and carries the mode flag for both representations
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline contents into a freshly allocated heap buffer with
    // capacity `grow` and flips the representation to "big".
    void convertToBig() pure @nogc nothrow
    {
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) pureMalloc(nbytes);
        if (p is null) onOutOfMemoryError();
        // copy first: assigning ptr_/len_/cap_ below overwrites small_
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    // Marks this object as using the heap representation.
    void setBig() pure nothrow @nogc { slen_ |= small_flag; }

    // Length stored in the low 7 bits of slen_.
    @property size_t smallLength() const pure nothrow @nogc
    {
        return slen_ & small_mask;
    }
    // Non-zero when the heap representation is in use.
    @property ubyte isBig() const pure nothrow @nogc
    {
        return slen_ & small_flag;
    }
}
6957
6958 static assert(Grapheme.sizeof == size_t.sizeof*4);
6959
6960
6961 @system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
6962 {
6963 import std.algorithm.comparison : equal;
6964 Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
6965 assert(byGrapheme("ЮУЗ").equal(data[]));
6966 }
6967
6968 ///
6969 @system unittest
6970 {
6971 import std.algorithm.comparison : equal;
6972 import std.algorithm.iteration : filter;
6973 import std.range : isRandomAccessRange;
6974
6975 string bold = "ku\u0308hn";
6976
6977 // note that decodeGrapheme takes parameter by ref
6978 auto first = decodeGrapheme(bold);
6979
6980 assert(first.length == 1);
6981 assert(first[0] == 'k');
6982
6983 // the next grapheme is 2 characters long
6984 auto wideOne = decodeGrapheme(bold);
6985 // slicing a grapheme yields a random-access range of dchar
6986 assert(wideOne[].equal("u\u0308"));
6987 assert(wideOne.length == 2);
6988 static assert(isRandomAccessRange!(typeof(wideOne[])));
6989
6990 // all of the usual range manipulation is possible
6991 assert(wideOne[].filter!isMark().equal("\u0308"));
6992
6993 auto g = Grapheme("A");
6994 assert(g.valid);
6995 g ~= '\u0301';
6996 assert(g[].equal("A\u0301"));
6997 assert(g.valid);
6998 g ~= "B";
6999 // not a valid grapheme cluster anymore
7000 assert(!g.valid);
7001 // still could be useful though
7002 assert(g[].equal("A\u0301B"));
7003 }
7004
7005 @safe unittest
7006 {
7007 auto g = Grapheme("A\u0302");
7008 assert(g[0] == 'A');
7009 assert(g.valid);
7010 g[1] = '~'; // ASCII tilda is not a combining mark
7011 assert(g[1] == '~');
7012 assert(!g.valid);
7013 }
7014
7015 @system unittest
7016 {
7017 import std.algorithm.comparison : equal;
7018 import std.algorithm.iteration : map;
7019 import std.conv : text;
7020 import std.range : iota;
7021
7022 // not valid clusters (but it just a test)
7023 auto g = Grapheme('a', 'b', 'c', 'd', 'e');
7024 assert(g[0] == 'a');
7025 assert(g[1] == 'b');
7026 assert(g[2] == 'c');
7027 assert(g[3] == 'd');
7028 assert(g[4] == 'e');
7029 g[3] = 'Й';
7030 assert(g[2] == 'c');
7031 assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
7032 assert(g[4] == 'e');
7033 assert(!g.valid);
7034
7035 g ~= 'ц';
7036 g ~= '~';
7037 assert(g[0] == 'a');
7038 assert(g[1] == 'b');
7039 assert(g[2] == 'c');
7040 assert(g[3] == 'Й');
7041 assert(g[4] == 'e');
7042 assert(g[5] == 'ц');
7043 assert(g[6] == '~');
7044 assert(!g.valid);
7045
7046 Grapheme copy = g;
7047 copy[0] = 'X';
7048 copy[1] = '-';
7049 assert(g[0] == 'a' && copy[0] == 'X');
7050 assert(g[1] == 'b' && copy[1] == '-');
7051 assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
7052 copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
7053 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
7054 copy ~= "xyz";
7055 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
7056 assert(!copy.valid);
7057
7058 Grapheme h;
7059 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
7060 h ~= v;
7061 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
7062 }
7063
/++
    $(P Does basic case-insensitive comparison of $(D r1) and $(D r2).
    This function uses simpler comparison rule thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an input range of characters
        r2 = an input range of characters

    Returns:
        An $(D int) that is 0 if the strings match,
        &lt;0 if $(D r1) is lexicographically "less" than $(D r2),
        &gt;0 if $(D r1) is lexicographically "greater" than $(D r2)

    Warning:
        This function only handles 1:1 $(CODEPOINT) mapping
        and thus is not sufficient for certain alphabets
        like German, Greek and few others.

    See_Also:
        $(LREF icmp)
        $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(S1 r1, S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.utf : byDchar;

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    for (; !left.empty; left.popFront())
    {
        immutable lhs = left.front;
        // the right side ran out first: left is "greater"
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        right.popFront();

        // identical code points need no table lookup
        if (lhs == rhs)
            continue;

        // simpleCaseTrie is packed index table
        immutable size_t lIdx = simpleCaseTrie[lhs];
        immutable size_t rIdx = simpleCaseTrie[rhs];
        immutable lhsCased = lIdx != EMPTY_CASE_TRIE;
        immutable rhsCased = rIdx != EMPTY_CASE_TRIE;

        int diff;
        if (lhsCased && rhsCased)
        {
            // step back from each entry to the start of its bucket
            immutable size_t lBucket = lIdx - sTable[lIdx].n;
            immutable size_t rBucket = rIdx - sTable[rIdx].n;
            // one bucket, equivalent chars
            if (lBucket == rBucket)
                continue;
            // different buckets: order by their first entries
            diff = sTable[lBucket].ch - sTable[rBucket].ch;
        }
        else if (lhsCased)
        {
            // only the left char is cased: remap it before comparing
            diff = sTable[lIdx - sTable[lIdx].n].ch - rhs;
        }
        else if (rhsCased)
        {
            // only the right char is cased: remap it before comparing
            diff = lhs - sTable[rIdx - sTable[rIdx].n].ch;
        }
        else
        {
            // neither char is cased: plain code point difference
            diff = lhs - rhs;
        }
        return diff;
    }
    // left is exhausted; equal only if right is exhausted too
    return right.empty ? 0 : -1;
}
7133
7134 ///
7135 @safe @nogc pure nothrow unittest
7136 {
7137 assert(sicmp("Август", "авгусТ") == 0);
7138 // Greek also works as long as there is no 1:M mapping in sight
7139 assert(sicmp("ΌΎ", "όύ") == 0);
7140 // things like the following won't get matched as equal
7141 // Greek small letter iota with dialytika and tonos
7142 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
7143
7144 // while icmp has no problem with that
7145 assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
7146 assert(icmp("ΌΎ", "όύ") == 0);
7147 }
7148
// overloads for the most common cases to reduce compile time:
// plain (non-template) functions, one per string type, each forwarding
// to the matching instantiation of the sicmp template above
@safe @nogc pure nothrow
{
    int sicmp(const(char)[] str1, const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }
    int sicmp(const(wchar)[] str1, const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int sicmp(const(dchar)[] str1, const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
7159
/*
    Helper of icmp: tries to match `lhs` against `rhs` followed by the
    front of `rtail`, using the full case folding table.

    Returns 0 on a match. `rtail` is advanced only when a multi-code-point
    entry matched and its remaining code points were consumed from it.
    Otherwise returns a non-zero code point to be used for ordering:
    `lhs` itself when it has no entry in the table, or the first entry of
    its bucket when it has one.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // the bucket of entries that fold together with lhs
    immutable start = hit - fTable[hit].n;
    immutable end = fTable[hit].size + start;
    assert(fTable[start].entry_len == 1);

    for (size_t i = start; i < end; ++i)
    {
        auto entryLen = fTable[i].entry_len;
        if (entryLen == 1)
        {
            // plain 1:1 entry
            if (fTable[i].seq[0] == rhs)
                return 0;
            continue;
        }

        // OK it's a long chunk, like 'ss' for German
        dstring seq = fTable[i].seq[0 .. entryLen];
        if (rhs != seq[0])
            continue;
        // note that this path modifies rtail
        // iff we managed to get there
        if (rtail.skipOver(seq[1..$]))
            return 0;
    }
    return fTable[start].seq[0]; // new remapped character for accurate diffs
}
7196
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An $(D int) that is 0 if the strings match,
        &lt;0 if $(D r1) is lexicographically "less" than $(D r2),
        &gt;0 if $(D r1) is lexicographically "greater" than $(D r2)

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.utf : byDchar;

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    while (!left.empty)
    {
        immutable lhs = left.front;
        // the right side ran out first: left is "greater"
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        left.popFront();
        right.popFront();

        // identical code points need no table lookup
        if (lhs == rhs)
            continue;

        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, right);
        if (cmpLR == 0)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, left);
        if (cmpRL == 0)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
    // left is exhausted; equal only if right is exhausted too
    return right.empty ? 0 : -1;
}
7252
7253 ///
7254 @safe @nogc pure nothrow unittest
7255 {
7256 assert(icmp("Rußland", "Russland") == 0);
7257 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
7258 }
7259
7260 /**
7261 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
7262 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
7263 */
7264 @safe @nogc nothrow pure unittest
7265 {
7266 import std.utf : byDchar;
7267
7268 assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
7269 assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
7270 }
7271
7272 // test different character types
7273 @safe unittest
7274 {
7275 assert(icmp("Rußland", "Russland") == 0);
7276 assert(icmp("Rußland"w, "Russland") == 0);
7277 assert(icmp("Rußland", "Russland"w) == 0);
7278 assert(icmp("Rußland"w, "Russland"w) == 0);
7279 assert(icmp("Rußland"d, "Russland"w) == 0);
7280 assert(icmp("Rußland"w, "Russland"d) == 0);
7281 }
7282
// overloads for the most common cases to reduce compile time:
// plain (non-template) functions, one per string type, each forwarding
// to the matching instantiation of the icmp template above
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
7293
// Exercises icmp and sicmp for every pairing of string/wstring/dstring,
// both at run time and in CTFE (via assertCTFEable).
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    foreach (cfunc; AliasSeq!(icmp, sicmp))
    {
        foreach (S1; AliasSeq!(string, wstring, dstring))
        foreach (S2; AliasSeq!(string, wstring, dstring))
        (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396
            // empty strings and prefix ordering
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            // case differences must compare equal
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }();
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
    }
    // the remaining checks tell icmp and sicmp apart on multi-code-point foldings
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    //bugzilla 11057
    assert( icmp("K", "L") < 0 );
    });
}
7333
// issue 17372
// Compile-time regression check: icmp must accept the non-array ranges
// produced by joiner and be usable as a sort predicate in @safe pure code.
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
7342
// This is package for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this $(D ch).

    The returned input range (with `length`) is either the single code point
    `ch` itself, when the trie has no entry for it, or all entries of the
    bucket of the simple case table that `ch` belongs to.
*/
package auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias foldTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // uint.max selects single-code-point mode (value is in c)
        union
        {
            dchar c; // single-code-point mode: the code point, 0 == empty range
            uint len; // table mode: number of table entries still to visit
        }
        @property bool isSmall() const { return idx == uint.max; }

        // single-code-point mode
        this(dchar ch)
        {
            c = ch;
            idx = uint.max;
        }

        // table mode: `size` entries starting at table index `start`
        this(uint start, uint size)
        {
            len = size;
            idx = start;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
            {
                auto tableChar = foldTable[idx].ch;
                return tableChar;
            }
            return c;
        }

        @property bool empty() const
        {
            if (!isSmall)
                return len == 0;
            return c == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (isSmall)
            {
                // consuming the only element leaves the empty marker
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }
    immutable slot = simpleCaseTrie[ch];
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = foldTable[slot];
    // entry.n is the position of this entry inside its bucket,
    // so stepping back by it yields the first entry of the bucket
    immutable bucketStart = slot - entry.n;
    return Range(bucketStart, entry.size);
}
7423
// Checks simpleCaseFoldings for a two-member bucket, a character without
// foldings and a three-member bucket, both at run time and in CTFE.
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // a character with no case variants yields just itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
7442
/++
    $(P Returns the $(S_LINK Combining class, combining class) of $(D ch).)

    Params:
        ch = the code point to look up

    Returns:
        The value stored for $(D ch) in the combining class trie.
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}
7450
///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
7464
// Spot checks of combiningClass: every ASCII code point has class 0,
// followed by a few known non-zero classes.
@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
7474
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
7486
/**
    Shorthand aliases for the character decomposition types, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
7495
/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that $(D first) comes before $(D second) in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See $(D composeJamo) below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;
    // the trie maps `first` to a packed (offset, count) slice of compositionTable
    immutable jump = compositionJumpTrie[first];
    if (jump == ushort.max)
        return dchar.init; // `first` never starts a composition
    // unpack offset and length
    immutable base = jump & composeIdxMask, count = jump >> composeCntShift;
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto candidates = compositionTable[base .. base+count].map!"a.rhs"().assumeSorted();
    immutable pos = candidates.lowerBound(second).length;
    if (pos != count)
    {
        // lowerBound only gives the insertion point; confirm an exact match
        immutable hit = compositionTable[base+pos];
        if (hit.rhs == second)
            return hit.composed;
    }
    return dchar.init;
}
7526
///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    // a pair that does not compose yields dchar.init
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
7537
/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) $(D ch).
    If no decomposition is available returns a $(LREF Grapheme)
    with the $(D ch) itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
    // select the mapping trie and the decomposition table for the requested type
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }
    immutable offset = mapping[ch];
    if (offset == 0)
    {
        // not found, check hangul arithmetic decomposition
        return decomposeHangul(ch);
    }
    // each decomposition in the table is terminated by a 0 entry
    auto pieces = table[offset..$].until(0);
    return Grapheme(pieces);
}
7573
///
@system unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    // a character without a decomposition comes back unchanged
    assert(decompose('D')[].equal("D"));
    // hangul syllables are decomposed too
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}
7591
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// Base code points and counts used by the arithmetic in the jamo helpers
// below: S = composed syllables, L = leading consonants, V = vowels,
// T = trailing consonants.
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
// number of syllables sharing one leading consonant
enum jamoNCount = jamoVCount * jamoTCount;
// total number of syllables covered by the arithmetic
enum jamoSCount = jamoLCount * jamoNCount;
7601
// Tests if $(D ch) is a Hangul leading consonant jamo,
// i.e. lies in [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // checked first: rejects ~ 1M code points above leading jamo range
    if (ch >= jamoLBase+jamoLCount)
        return false;
    return ch >= jamoLBase;
}
7608
// Tests if $(D ch) is a Hangul trailing consonant jamo,
// i.e. lies in (jamoTBase, jamoTBase + jamoTCount).
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // checked first: rejects ~ 1M code points above trailing jamo range
    if (ch >= jamoTBase+jamoTCount)
        return false;
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch > jamoTBase;
}
7616
// Tests if $(D ch) is a Hangul vowel jamo,
// i.e. lies in [jamoVBase, jamoVBase + jamoVCount).
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // checked first: rejects ~ 1M code points above vowel range
    if (ch >= jamoVBase+jamoVCount)
        return false;
    return ch >= jamoVBase;
}
7623
// Returns the offset of $(D ch) from jamoSBase if it lies within
// [0, jamoSCount), and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
7629
// internal helper: compose hangul syllables leaving dchar.init in holes
// Scans seq for <L, V> or <L, V, T> jamo runs, writes the composed syllable
// into the slot of L and overwrites the consumed V (and T) with dchar.init.
void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            // no <L, V> pair starts here
            pos++;
            continue;
        }
        immutable int lIndex = seq[pos] - jamoLBase;
        immutable int vIndex = seq[pos+1] - jamoVBase;
        immutable int lvIndex = lIndex * jamoNCount + vIndex * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if (hasTrailing)
        {
            // <L, V, T> run
            seq[pos] = jamoSBase + lvIndex + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V> run
            seq[pos] = jamoSBase + lvIndex;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
7658
7659 //----------------------------------------------------------------------------
7660 public:
7661
/**
    Decomposes a Hangul syllable. If $(D ch) is not a composed syllable
    then this function returns $(LREF Grapheme) containing only $(D ch) as is.
*/
Grapheme decomposeHangul(dchar ch) @safe
{
    immutable sIndex = cast(int) ch - jamoSBase;
    immutable bool isSyllable = sIndex >= 0 && sIndex < jamoSCount;
    if (!isSyllable)
        return Grapheme(ch);

    // split the syllable index into its L, V and T components
    immutable partL = jamoLBase + sIndex / jamoNCount;
    immutable partV = jamoVBase + (sIndex % jamoNCount) / jamoTCount;
    immutable tIndex = sIndex % jamoTCount;

    if (tIndex == 0) // no trailing consonant; <L, V> decomposition
        return Grapheme(partL, partV);
    // there is a trailing consonant (T); <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + tIndex);
}
7681
///
@system unittest
{
    import std.algorithm.comparison : equal;
    // a syllable with a trailing consonant decomposes into three jamos
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}
7688
/++
    Try to compose hangul syllable out of a leading consonant ($(D lead)),
    a $(D vowel) and optional $(D trailing) consonant jamos.

    On success returns the composed LV or LVT hangul syllable.

    If any of $(D lead) and $(D vowel) are not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;
    immutable lIndex = lead - jamoLBase;
    immutable vIndex = vowel - jamoVBase;
    immutable dchar lvSyllable = jamoSBase + lIndex * jamoNCount + vIndex * jamoTCount;
    // anything that is not a trailing consonant is ignored, giving the LV syllable
    if (isJamoT(trailing))
        return lvSyllable + (trailing - jamoTBase);
    return lvSyllable;
}
7710
///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant, or passing any codepoint
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
7722
// Additional decompose checks for both decomposition types, plus a
// re-run of the documented decomposeHangul/composeJamo examples.
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // asserts that decompose!T(ch) yields exactly the code points of r
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
7746
/**
    Enumeration type for normalization forms,
    passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm {
    NFC,
    NFD,
    NFKC,
    NFKD
}
7757
7758
enum {
    /**
        Shorthand aliases for the values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
7771
/++
    Returns $(D input) string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] is the first piece of input that needs work;
    // both equal to input.length means nothing needs normalization
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers, reused for every piece: decomposed code points
    // and their combining classes (kept in lock step)
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // copy the already normalized prefix verbatim
        app.put(input[0 .. anchors[0]]);
        // fully decompose the piece that needs normalization
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // reorder: stable-sort every run of non-zero combining classes
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the trailing run, if any (empty slice when the piece ended on a stable code point)
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // recompose returns the index where it stopped;
                // continue from there until the end of the buffer
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // recomposition leaves dchar.init in the slots of merged code points; drop them
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        decomposed.assumeSafeAppend();
        ccc.length = 0;
        ccc.assumeSafeAppend();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    }while (anchors[0] != input.length);
    // the remaining tail is already normalized
    app.put(input[0 .. anchors[0]]);
    return cast(inout(C)[])app.data;
}
7873
///
@safe unittest
{
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
7888
// Further normalize checks where the affected character sits inside
// otherwise unchanged text, followed by a re-run of the documented example.
@safe unittest
{
    import std.conv : text;

    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // check example

    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
7910
// canonically recompose given slice of code points, works in-place and mutates data
// Tries to merge the code points following input[start] into input[start].
// Merged code points are overwritten with dchar.init. Returns the index at
// which scanning stopped: either input.length or the index of a code point
// with combining class 0 that did not compose.
private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    int accumCC = -1;// so that it's out of 0 .. 255 range
    // writefln("recomposing %( %04x %)", input);
    // input[start] is expected to be a starter thus we start at i == start + 1
    size_t i = start+1;
    for (; ; )
    {
        if (i == input.length)
            break;
        immutable curCC = ccc[i];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        //------------------------
        // Applying to our case:
        // S is input[start]
        // accumCC is the maximum CCC of characters between C and S,
        //     as ccc are sorted
        // C is input[i]

        if (curCC > accumCC)
        {
            // C is not blocked, try to merge it into S
            immutable comp = compose(input[start], input[i]);
            if (comp != dchar.init)
            {
                input[start] = comp;
                input[i] = dchar.init;// put a sentinel
                // current was merged so its CCC shouldn't affect
                // composing with the next one
            }
            else
            {
                // if it was a starter then accumCC is now 0, end of loop
                accumCC = curCC;
                if (accumCC == 0)
                    break;
            }
        }
        else
        {
            // ditto here
            accumCC = curCC;
            if (accumCC == 0)
                break;
        }
        i++;
    }
    return i;
}
7964
// returns tuple of 2 indexes that delimit:
// normalized text, piece that needs normalization and
// the rest of input starting with stable code point
// When no code point needs work both indexes are input.length.
private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
{
    import std.typecons : tuple;
    ubyte lastCC = 0;

    foreach (idx, dchar ch; input)
    {
        // for NFC, code points below U+0300 are skipped without any lookup
        static if (norm == NFC)
            if (ch < 0x0300)
            {
                lastCC = 0;
                continue;
            }
        immutable ubyte CC = combiningClass(ch);
        // combining classes out of order: reordering is required
        if (lastCC > CC && CC != 0)
        {
            return seekStable!norm(idx, input);
        }

        // the code point itself fails the quick check for this form
        if (notAllowedIn!norm(ch))
        {
           return seekStable!norm(idx, input);
        }
        lastCC = CC;
    }
    return tuple(input.length, input.length);
}
7995
// Given the index `idx` of a code point that needs normalization, returns a
// tuple (region_start, region_end) delimiting the piece of `input` to
// normalize. A code point is treated as stable when its combining class is 0
// and it is allowed in the form `norm`.
//   region_start - index of the nearest stable code point before idx,
//                  or 0 if there is none
//   region_end   - index of the first stable code point at or after idx,
//                  or input.length if there is none
private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br is a prefix of input, so its length minus the size of the
            // last code point is that code point's index in input
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // walk backwards: drop the code point that was just inspected
        // (popFront here would re-inspect the same br.back until br is empty)
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8028
/**
    Tests if dchar $(D ch) is always allowed (Quick_Check=YES) in normalization
    form $(D norm).

    Params:
        ch = the code point to test

    Returns:
        The negation of the quick-check trie lookup for $(D norm).
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    return !notAllowedIn!norm(ch);
}
8037
///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
8048
// not user friendly name but more direct
// Looks `ch` up in the quick-check trie that belongs to the form `norm`
// and returns the stored flag.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // The condition must be `false`: a string literal used as the
        // condition is always true, so the assert would never fire.
        static assert(false, "Unknown normalization form " ~ norm.stringof);
    }
    return qcTrie[ch];
}
8064
// Re-runs the documented allowedIn example.
@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
8073
8074 }
8075
version(std_uni_bootstrap)8076 version (std_uni_bootstrap)
8077 {
// old version used for bootstrapping of gen_uni.d that generates
// up to date optimal versions of all of isXXX functions
@safe pure nothrow @nogc public bool isWhite(dchar c)
{
    import std.ascii : isWhite;
    // ASCII whitespace plus the line and paragraph separators
    if (isWhite(c) || c == lineSep || c == paraSep)
        return true;
    // the contiguous block U+2000 .. U+200A
    if (c >= '\u2000' && c <= '\u200A')
        return true;
    // remaining individual whitespace code points
    switch (c)
    {
        case '\u0085':
        case '\u00A0':
        case '\u1680':
        case '\u180E':
        case '\u202F':
        case '\u205F':
        case '\u3000':
            return true;
        default:
            return false;
    }
}
8089 }
8090 else
8091 {
8092
// trusted -> avoid bounds check
// Thin function wrappers around the case-mapping tries and tables:
// xxxIndex / xxxSimpleIndex look a code point up in the corresponding trie,
// xxxTab reads one entry of the corresponding table.
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions (Bugzilla 13232)
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
8111
8112 public:
8113
/++
    Whether or not $(D c) is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    import std.internal.unicode_tables : isWhiteGen; // generated file
    return isWhiteGen(c); // call pregenerated binary search
}
8125
/++
    Return whether $(D c) is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;
    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return lowerCaseTrie[c];
    // ASCII is answered by std.ascii (the local import shadows this function)
    return isLower(c);
}
8137
// isLower must agree with std.ascii.isLower on ASCII, classify a few known
// letters, and never overlap with isUpper on the lowerCase set.
@safe unittest
{
    // inside this block plain isLower is std.ascii's, .isLower is this module's
    import std.ascii : isLower;
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8157
8158
/++
    Return whether $(D c) is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return upperCaseTrie[c];
    // ASCII is answered by std.ascii (the local import shadows this function)
    return isUpper(c);
}
8170
// isUpper must agree with std.ascii.isUpper on ASCII, classify a few known
// letters, and never overlap with isLower on the upperCase set.
@safe unittest
{
    // static import keeps plain `isUpper` bound to this module's function;
    // the ASCII loop previously re-tested isLower by mistake
    static import std.ascii;
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
8189
8190
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps c to its simple title-case counterpart, or returns c unchanged
// when the title-case index has no entry for it.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // optimize ASCII case: below 0xAA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        immutable bool asciiLower = c >= 'a' && c <= 'z';
        if (asciiLower)
            return c - 32;
        return c;
    }
    immutable size_t slot = toTitleSimpleIndex(c);
    // ushort.max marks "no mapping"
    if (slot == ushort.max)
        return c;
    return toTitleTab(slot);
}
8212
// (index function, maxIdx limit, table function) bundles that are passed
// as the leading template arguments of toCase / toCaser
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
8215
// generic toUpper/toLower on whole string, creates new or returns as is
// indexFn      - maps a code point to a table index, ushort.max == no mapping
// maxIdx       - indexes below it denote single code point mappings
// tableFn      - reads the table entry at an index
// asciiConvert - conversion applied to ASCII code points while copying
// Returns s itself when no code point of s has a mapping.
private S toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) @trusted pure
if (isSomeString!S)
{
    import std.array : appender;
    import std.ascii : isASCII;

    // scan for the first code point that has a mapping
    foreach (i, dchar cOuter; s)
    {
        ushort idx = indexFn(cOuter);
        if (idx == ushort.max)
            continue;
        // found one: keep the prefix as is and convert the rest
        auto result = appender!S(s[0 .. i]);
        result.reserve(s.length);
        foreach (dchar c; s[i .. $])
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
            }
            else
            {
                idx = indexFn(c);
                if (idx == ushort.max)
                    result.put(c);
                else if (idx < maxIdx)
                {
                    c = tableFn(idx);
                    result.put(c);
                }
                else
                {
                    // multi code point mapping: the first entry packs the
                    // length into its top 8 bits, the rest follow in the table
                    auto val = tableFn(idx);
                    // unpack length + codepoint
                    immutable uint len = val >> 24;
                    result.put(cast(dchar)(val & 0xFF_FFFF));
                    foreach (j; idx+1 .. idx+len)
                        result.put(tableFn(j));
                }
            }
        }
        return result.data;
    }
    return s;
}
8261
// Regression test: toUpper on a slice must leave the source data untouched.
@safe unittest //12428
{
    import std.array : replicate;
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    toUpper(s);

    assert(s == "abcdefghij");
}
8272
8273
// generic toUpper/toLower on whole range, returns range
// Template arguments have the same meaning as for toCase. The result
// converts lazily, one source code point at a time.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // nLeft == 0 means buf holds nothing for the current source code point yet
            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping, pass through
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // single code point mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // buf is filled back to front so that buf[nLeft - 1]
                        // is always the next code point to hand out
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure buf is filled even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only once its whole mapping has been consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r; // source range
        uint nLeft; // number of buffered code points not yet consumed
        dchar[3] buf = void; // converted code points, stored in reverse order
    }

    return ToCaserImpl(str);
}
8358
/*********************
 * Convert input range or string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asLowerCase(str.byDchar);
    }
    else
    {
        // already a range of dchar: wrap it in the lazy lower-casing range
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
}
8393
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asUpperCase(str.byDchar);
    }
    else
    {
        // already a range of dchar: wrap it in the lazy upper-casing range
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
}
8412
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // this example is shared by asLowerCase and asUpperCase, so show both
    assert("hEllo".asUpperCase.equal("HELLO"));
    assert("hEllo".asLowerCase.equal("hello"));
}
8420
// explicitly undocumented
// Overload for arguments satisfying isConvertibleToString: forwards to the
// range overload instantiated with the argument's string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
8428
// explicitly undocumented
// Overload for arguments satisfying isConvertibleToString: forwards to the
// range overload instantiated with the argument's string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
8436
@safe unittest
{
    // run both lazy converters through the testAliasedString helper
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asLowerCase("hEllo"));
}
8442
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must iterate independently of the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    // previously these comparisons were evaluated but their results discarded
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
8482
// generic capitalizer on whole range, returns range
// The first character of `str` is mapped through the upper-case tables
// (indexFnUpper / maxIdxUpper / tableFnUpper); everything after it is
// delegated to asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        // While the first character is being emitted the range is empty only
        // if nothing is buffered and the source is exhausted; afterwards
        // emptiness is that of the lower-casing range.
        @property bool empty()
        {
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // nothing buffered yet: map the first source character
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    // no mapping: the character is emitted unchanged
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    // mapping to a single code point
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    // mapping to several code points: the table entry packs
                    // the count in the top 8 bits and the first code point in
                    // the low 24 bits
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    // buf is filled back to front so that buf[nLeft - 1] is
                    // always the next character to emit as nLeft counts down
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                // front was never evaluated: do it now so nLeft is valid
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // first character fully emitted: advance the source and
                    // hand the remainder over to the lower-casing range
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            // copies the state and saves both wrapped ranges
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

    private:
        Range r; // source range; consulted only until the first character is done
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void; // upper-cased expansion of the first character, stored reversed
        uint nLeft = 0; // number of characters of buf still to be emitted
    }

    return ToCapitalizerImpl(str);
}
8573
/*********************
 * Lazily capitalize an input range or string: the first character is
 * converted to upper case and all subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of dchar
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrow code units: decode to dchar first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
8609
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    assert(equal("hEllo".asCapitalized, "Hello"));
}
8617
// Overload for arguments satisfying isConvertibleToString: forwards to the
// range overload instantiated with the argument's string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
8624
@safe unittest
{
    // run asCapitalized through the testAliasedString helper
    immutable ok = testAliasedString!asCapitalized("hEllo");
    assert(ok);
}
8629
@safe pure nothrow @nogc unittest
{
    // constructing the range and reading its front must work under
    // pure nothrow @nogc
    auto capitalized = "hEllo".asCapitalized();
    assert(capitalized.front == 'H');
}
8635
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must iterate independently of the original
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected capitalization]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    // previously these comparisons were evaluated but their results discarded
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
8683
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    if (c <= 0x7F)
    {
        // one code unit
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    if (c <= 0x7FF)
    {
        // two code units
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx + 1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    if (c <= 0xFFFF)
    {
        // three code units
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    if (c <= 0x10FFFF)
    {
        // four code units
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx + 1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx + 2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx + 3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // values above 0x10FFFF are not accepted
    assert(0);
}
8717
@safe unittest
{
    // UTF-8: also check the returned index, not just the buffer contents
    char[] s = "abcd".dup;
    size_t i = 0;
    i = encodeTo(s, i, 'X');
    assert(i == 1);
    assert(s == "Xbcd");

    i = encodeTo(s, i, cast(dchar)'\u00A9');
    assert(i == 3);
    assert(s == "X\xC2\xA9d");

    // UTF-16: one unit for a BMP character, a surrogate pair above U+FFFF
    wchar[] w = new wchar[3];
    size_t wi = encodeTo(w, 0, cast(dchar) 'A');
    assert(wi == 1 && w[0] == 'A');
    wi = encodeTo(w, wi, cast(dchar) 0x1F600);
    assert(wi == 3 && w[1] == 0xD83D && w[2] == 0xDE00);

    // UTF-32: always exactly one unit
    dchar[] d = new dchar[1];
    assert(encodeTo(d, 0, cast(dchar) 0x1F600) == 1);
    assert(d[0] == 0x1F600);
}
8728
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
// Throws UTFException when `c` is a surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // values above 0x10FFFF are not accepted
    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // above U+FFFF: emit a high/low surrogate pair
        immutable offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx + 1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    if (c >= 0xD800 && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);

    // a single code unit is enough
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
8750
// UTF-32 variant: stores `c` at `buf[idx]` and returns the following index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
8757
// Case-converts `s` inside its own buffer for as long as each converted
// character fits into the space already consumed; otherwise hands over to
// toCaseInPlaceAlloc, which allocates a new array. On return `s` refers to
// the converted text (same buffer, possibly shorter, or the new array).
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0; // read position (start of the next code point to decode)
    size_t destIdx = 0; // write position for converted output
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0; // start of the pending run of unchanged code units
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx;
        immutable ch = decode(s, curIdx); // advances curIdx past the code point
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        // output must never overtake the read position
        assert(destIdx <= curIdx);
    }
    // flush the trailing run of unchanged code units
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    s = s[0 .. destIdx];
}
8823
// helper to precalculate size of case-converted string
// Returns the number of code units of type C that the case-converted form of
// `str` occupies, without building the converted string.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(in C[] str)
    {
        import std.utf : decode, codeLength;
        size_t codeLen = 0; // accumulated output length in code units
        size_t lastNonTrivial = 0; // index just past the last character that had a mapping
        size_t curIdx = 0;
        while (curIdx != str.length)
        {
            immutable startIdx = curIdx;
            immutable ch = decode(str, curIdx);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged; counted later as part of a run
            else if (caseIndex < maxIdx)
            {
                // 1:1 mapping: count the unchanged run before this character,
                // then the single replacement code point
                codeLen += startIdx - lastNonTrivial;
                lastNonTrivial = curIdx;
                immutable cased = tableFn(caseIndex);
                codeLen += codeLength!C(cased);
            }
            else
            {
                // 1:m mapping: the table entry packs the count in the top
                // 8 bits and the first code point in the low 24 bits; the
                // remaining code points follow at the next table indices
                codeLen += startIdx - lastNonTrivial;
                lastNonTrivial = curIdx;
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar cased = val & 0xFF_FFFF;
                codeLen += codeLength!C(cased);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    codeLen += codeLength!C(tableFn(j));
            }
        }
        // trailing run of unchanged code units
        if (lastNonTrivial != str.length)
            codeLen += str.length - lastNonTrivial;
        return codeLen;
    }
}
8864
@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);
    // ASCII only: one code unit per character
    assert(lowerLen("abcd") == 4);
    // five Cyrillic letters (10 UTF-8 code units) followed by three digits
    assert(lowerLen("аБВгд456") == 10 + 3);
}
8871
// slower code path that preallocates and then copies
// case-converted stuff to the new string
// `s[0 .. destIdx]` is output that is already converted; conversion resumes
// with the code point starting at `s[curIdx]`. On return `s` refers to the
// newly allocated array.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        // exact final size: finished prefix plus converted length of the rest
        auto trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] ns = new C[trueLength];
        ns[0 .. destIdx] = s[0 .. destIdx];
        size_t lastUnchanged = curIdx; // start of the pending run of unchanged code units
        while (curIdx != s.length)
        {
            immutable startIdx = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // skip over
            {
                continue;
            }
            else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
            {
                immutable cased = tableFn(caseIndex);
                // copy the pending unchanged run, then the replacement
                auto toCopy = startIdx - lastUnchanged;
                ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
                lastUnchanged = curIdx;
                destIdx += toCopy;
                destIdx = encodeTo(ns, destIdx, cased);
            }
            else  // 1:m codepoint mapping, slow codepath
            {
                auto toCopy = startIdx - lastUnchanged;
                ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
                lastUnchanged = curIdx;
                destIdx += toCopy;
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
                // remaining code points follow at the next table indices
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(ns, destIdx, tableFn(j));
            }
        }
        // trailing run of unchanged code units
        if (lastUnchanged != s.length)
        {
            auto toCopy = s.length - lastUnchanged;
            ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$];
            destIdx += toCopy;
        }
        // the precalculated length must match what was written
        assert(ns.length == destIdx);
        s = ns;
    }
}
8928
/++
    Converts $(D s) to lowercase in place, using the Unicode lowercase mapping.
    For a few characters the string length may increase after the transformation;
    in such a case the function reallocates exactly once.
    If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!LowerTriple(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
8950
/++
    Converts $(D s) to uppercase in place, using the Unicode uppercase mapping.
    For a few characters the string length may increase after the transformation;
    in such a case the function reallocates exactly once.
    If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!UpperTriple(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
8972
/++
    Returns the lowercase equivalent of $(D c) if $(D c) is a Unicode
    uppercase $(CHARACTER); otherwise returns $(D c) itself.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // table lookup; ushort.max means there is no simple mapping
        immutable size_t tabIdx = toLowerSimpleIndex(c);
        if (tabIdx == ushort.max)
            return c;
        return toLowerTab(tabIdx);
    }

    // fast path below U+00AA: only 'A' .. 'Z' are changed
    if ('A' <= c && c <= 'Z')
        return cast(dchar)(c + 32);
    return c;
}
8999
/++
    Returns a string which is identical to $(D s) except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
S toLower(S)(S s) @trusted pure
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiLower)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(string s)
    {
        return toLower!string(s);
    }

    wstring toLower(wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: foo is never called, the call inside it only
        // has to resolve for a struct that aliases itself to string

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto lowered = toLower(String(""));
        }
    }
}
9037
9038
@system unittest //@@@BUG std.format is not @safe
{
    import std.format : format;
    static import std.ascii;

    // must agree with std.ascii over the whole ASCII range
    foreach (code; 0 .. 0x80)
        assert(toLower(code) == std.ascii.toLower(code));
    assert(toLower('Δ') == 'δ');
    assert(toLower('Я') == 'я');
    // every uppercase code point either stays as is or maps to a lowercase one
    foreach (cp; unicode.upperCase.byCodepoint)
    {
        dchar low = toLower(cp);
        assert(low == cp || isLower(low), format("%s -> %s", cp, low));
    }
    assert(toLower("АЯ") == "ая");

    assert("\u00df".toUpper == "SS");
    assert("\u1E9E".toLower == "\u00df");
}
9057
//bugzilla 9629
@safe unittest
{
    // converting a slice in place must write through to the parent array
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6 .. 7];
    toUpperInPlace(thorn);
    assert(whole == "hello Þ world");
}
9066
9067
// toLower on strings must agree with toLowerInPlace; s1/s2/s3 are reused
// for each scenario below
@safe unittest
{
    import std.algorithm.comparison : cmp;
    // plain ASCII
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    // ASCII mixed with U+0100 / U+0101
    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // ASCII mixed with U+0460 / U+0461
    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // U+0130 lowercases to two code points
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // bugzilla 12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extend on 12455 report - check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
9117
9118
/++
    Returns the uppercase equivalent of $(D c) if $(D c) is a Unicode
    lowercase $(CHARACTER); otherwise returns $(D c) itself.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // table lookup; ushort.max means there is no simple mapping
        immutable size_t tabIdx = toUpperSimpleIndex(c);
        if (tabIdx == ushort.max)
            return c;
        return toUpperTab(tabIdx);
    }

    // fast path below U+00AA: only 'a' .. 'z' are changed
    if ('a' <= c && c <= 'z')
        return cast(dchar)(c - 32);
    return c;
}
9152
///
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // map each character through toUpper and collect the result in an appender
    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(&sink);
    assert(sink.data == "HELLO");
}
9164
@safe unittest
{
    import std.format : format;
    static import std.ascii;

    // must agree with std.ascii over the whole ASCII range
    foreach (code; 0 .. 0x80)
        assert(toUpper(code) == std.ascii.toUpper(code));
    assert(toUpper('δ') == 'Δ');
    assert(toUpper('я') == 'Я');
    // every lowercase code point stays as is or maps to an uppercase or
    // titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (cp; unicode.lowerCase.byCodepoint)
    {
        dchar up = toUpper(cp);
        assert(up == cp || isUpper(up) || titlecase[up],
            format("%x -> %x", cp, up));
    }
}
9181
/++
    Returns a string which is identical to $(D s) except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
S toUpper(S)(S s) @trusted pure
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiUpper)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // compile-only check: foo is never called, the call inside it only
        // has to resolve for a struct that aliases itself to string

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto uppered = toUpper(String(""));
        }
    }
}
9219
// toUpper on strings must agree with toUpperInPlace; s1/s2/s3 are reused
// for each scenario below
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    // plain ASCII
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    // ASCII mixed with U+0100 / U+0101
    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    // ASCII mixed with U+0460 / U+0461
    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}
9248
@system unittest
{
    // Checks all four conversions of `s` (toLower, toUpper and both in-place
    // variants) against the expected upper/lower case strings.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three sequences are printed: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // all three arguments are supplied; with only two the failure message
        // itself could not be formatted
        assert(low == trueLow,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) low, cast(ubyte[]) trueLow));
        assert(up == trueUp,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) up, cast(ubyte[]) trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp));
    }
    foreach (S; AliasSeq!(dstring, wstring, string))
    {

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // these inputs must be converted without replacing the array
        foreach (val; AliasSeq!(easy, good))
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }
}
9305
9306
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is answered by the trie
    if (c >= 0xAA)
        return alphaTrie[c];

    // fast path: only 'A' .. 'Z' and 'a' .. 'z' qualify below U+00AA.
    // The subtraction wraps around for characters below the range start,
    // which makes the single comparison fail for them as well.
    immutable size_t fromUpperA = c - 'A';
    if (fromUpperA <= 'Z' - 'A')
        return true;
    immutable size_t fromLowerA = c - 'a';
    return fromLowerA <= 'z' - 'a';
}
9331
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");
    // every member of the set is accepted ...
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlpha(cp));
    // ... and the first 0x4000 code points agree exactly with the set
    foreach (cp; 0 .. 0x4000)
        assert(isAlpha(cp) == (cp in alphabetic));
}
9340
9341
/++
    Returns whether $(D c) is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // answered by indexing markTrie
    immutable bool inMarkSet = markTrie[c];
    return inMarkSet;
}
9351
@safe unittest
{
    auto marks = unicode("Mark");
    // every member of the set is accepted ...
    foreach (cp; marks.byCodepoint)
        assert(isMark(cp));
    // ... and the first 0x4000 code points agree exactly with the set
    foreach (cp; 0 .. 0x4000)
        assert(isMark(cp) == (cp in marks));
}
9360
/++
    Returns whether $(D c) is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII characters are answered by the trie
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII fast path: only the decimal digits qualify
    return '0' <= c && c <= '9';
}
9378
@safe unittest
{
    auto numbers = unicode("N");
    // every member of the set is accepted ...
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));
    // ... and the first 0x4000 code points agree exactly with the set
    foreach (cp; 0 .. 0x4000)
        assert(isNumber(cp) == (cp in numbers));
}
9387
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // non-ASCII: combine the two Unicode classifiers
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII fast path
    return std.ascii.isAlphaNum(c);
}
9413
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");
    auto numbers = unicode("N");

    // members of either set are accepted
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));

    // the first 0x4000 code points agree exactly with the union of both sets
    foreach (cp; 0 .. 0x4000)
    {
        assert(((cp in numbers) || (cp in alphabetic)) == isAlphaNum(cp));
    }
}
9430
/++
    Returns whether $(D c) is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).

    ASCII characters that Unicode classifies as symbols
    ($(D $ + < = > ^ ` | ~)) are not punctuation; see $(LREF isSymbol).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // optimization for ascii case
    if (c <= 0x7F)
    {
        switch (c)
        {
            // std.ascii.isPunctuation accepts these nine characters too, but
            // their Unicode general category is Sc, Sm or Sk (symbol)
            case '$', '+', '<', '=', '>', '^', '`', '|', '~':
                return false;
            default:
                return std.ascii.isPunctuation(c);
        }
    }
    return punctuationTrie[c];
}

@safe unittest
{
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    auto punct = unicode("P");
    foreach (ch; punct.byCodepoint)
        assert(isPunctuation(ch));
    // ASCII symbols must not be reported as punctuation
    foreach (dchar ch; "$+<=>^`|~")
        assert(!isPunctuation(ch));
    // the ASCII fast path must agree exactly with the Unicode set
    foreach (ch; 0 .. 0x80)
        assert((ch in punct) == isPunctuation(ch));
}
9463
/++
    Returns whether $(D c) is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    // answered by indexing symbolTrie
    immutable bool inSymbolSet = symbolTrie[c];
    return inSymbolSet;
}
9473
@safe unittest
{
    import std.format : format;

    // a few hand-picked symbols
    foreach (dchar sym; ['\u0024', '\u002B', '\u005E', '\u00A6'])
        assert(isSymbol(sym));
    // every member of the "S" set is accepted
    foreach (cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}
9484
/++
    Returns whether $(D c) is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file

    // delegate to the generated predicate
    immutable bool result = isSpaceGen(c);
    return result;
}
9497
@safe unittest
{
    assert(isSpace('\u0020'));
    auto zs = unicode.Zs;
    // every member of Zs is accepted ...
    foreach (cp; zs.byCodepoint)
        assert(isSpace(cp));
    // ... and the first 0x1000 code points agree exactly with the set
    foreach (cp; 0 .. 0x1000)
        assert(zs[cp] == isSpace(cp));
}
9507
9508
/++
    Returns whether $(D c) is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // answered by indexing graphicalTrie
    immutable bool inGraphicalSet = graphicalTrie[c];
    return inGraphicalSet;
}
9519
9520
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");
    // every member of the set is accepted ...
    foreach (cp; graphical.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));
    // ... and the first 0x4000 code points agree exactly with the set
    foreach (cp; 0 .. 0x4000)
        assert(isGraphical(cp) == (cp in graphical));
}
9530
9531
/++
    Returns whether $(D c) is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file

    // delegate to the generated predicate
    immutable bool result = isControlGen(c);
    return result;
}
9542
@safe unittest
{
    // spot checks
    assert(!isControl('\u0100'));
    assert(isControl('\u0081'));
    assert(isControl('\u0000'));
    auto controls = unicode.Cc;
    // every member of Cc is accepted ...
    foreach (cp; controls.byCodepoint)
        assert(isControl(cp));
    // ... and the first 0x1000 code points agree exactly with the set
    foreach (cp; 0 .. 0x1000)
        assert(controls[cp] == isControl(cp));
}
9554
9555
/++
    Returns whether $(D c) is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file

    // delegate to the generated predicate
    immutable bool result = isFormatGen(c);
    return result;
}
9566
9567
@safe unittest
{
    // every member of the "Format" set is accepted
    auto formatSet = unicode("Format");
    foreach (cp; formatSet.byCodepoint)
        assert(isFormat(cp));
    // spot check
    assert(isFormat('\u00AD'));
}
9574
// code points for private use and surrogates are not likely to change in the near future;
// if need be they can be generated from unicode data as well
9577
/++
    Tests whether $(D c) is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).

    Three hard-coded ranges are recognized: U+E000 .. U+F8FF,
    U+F0000 .. U+FFFFD and U+100000 .. U+10FFFD (all bounds inclusive).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        $(D true) if $(D c) lies in one of the three ranges above.
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // Range inside the Basic Multilingual Plane.
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;

    // Range in plane 15; the last two code points of the plane are excluded.
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;

    // Range in plane 16; the last two code points of the plane are excluded.
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
9589
/++
    Tests whether $(D c) is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        $(D true) if $(D c) lies in U+D800 .. U+DFFF (inclusive).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    enum dchar firstSurrogate = 0xD800;
    enum dchar lastSurrogate = 0xDFFF;

    return c >= firstSurrogate && c <= lastSurrogate;
}
9599
/++
    Tests whether $(D c) is a Unicode high surrogate (lead surrogate).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        $(D true) if $(D c) lies in U+D800 .. U+DBFF (inclusive).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    enum dchar firstLead = 0xD800;
    enum dchar lastLead = 0xDBFF;

    return c >= firstLead && c <= lastLead;
}
9608
/++
    Tests whether $(D c) is a Unicode low surrogate (trail surrogate).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        $(D true) if $(D c) lies in U+DC00 .. U+DFFF (inclusive).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    enum dchar firstTrail = 0xDC00;
    enum dchar lastTrail = 0xDFFF;

    return c >= firstTrail && c <= lastTrail;
}
9617
/++
    Tests whether $(D c) is a Unicode non-character, i.e.
    a $(CODEPOINT) with no assigned abstract character
    (general Unicode category: Cn).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        $(D true) if the non-character lookup table contains $(D c).
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // Membership is answered by a lookup in the non-character trie.
    const bool found = nonCharacterTrie[c];
    return found;
}
9628
@safe unittest
{
    // Every member of the Cn set has to be reported as a non-character.
    foreach (cp; unicode("Cn").byCodepoint)
    {
        assert(isNonCharacter(cp));
    }
}
9635
9636 private:
9637 // load static data from pre-generated tables into usable datastructures
9638
9639
// Builds a CodepointSet from compressed interval data: the bytes are passed
// through decompressIntervals and the result is handed to
// CodepointSet.fromIntervals.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
9644
// Wraps the offsets, sizes and data of a TrieEntry in a const CodepointTrie
// with the same template arguments.
@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e)
{
    auto trie = const(CodepointTrie!T)(e.offsets, e.sizes, e.data);
    return trie;
}
9649
@safe pure nothrow @nogc @property
{
    import std.internal.unicode_tables; // generated file

    // Every accessor in this scope converts one pre-generated table with
    // asTrie, keeps the result in a function-local `static immutable`
    // and returns it.
    //
    // The `auto` return type matters: the compiler then only runs semantic
    // on the return type when the function is actually used. The accessors
    // are plain functions rather than templates so that they do not
    // increase the object size of the caller.

    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;

        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;

        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;

        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;

        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme : mcTrieEntries;

        static immutable trie = asTrie(mcTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;

        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;

        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;

        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;

        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;

        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;

        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;

        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
9767
9768 }// version (!std_uni_bootstrap)
9769