1 // Written in the D programming language.
2 
3 /++
4     $(P The $(D std.uni) module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(BOOKTABLE,
12 $(TR $(TH Category) $(TH Functions))
13 $(TR $(TD Decode) $(TD
14     $(LREF byCodePoint)
15     $(LREF byGrapheme)
16     $(LREF decodeGrapheme)
17     $(LREF graphemeStride)
18 ))
19 $(TR $(TD Comparison) $(TD
20     $(LREF icmp)
21     $(LREF sicmp)
22 ))
23 $(TR $(TD Classification) $(TD
24     $(LREF isAlpha)
25     $(LREF isAlphaNum)
26     $(LREF isCodepointSet)
27     $(LREF isControl)
28     $(LREF isFormat)
29     $(LREF isGraphical)
30     $(LREF isIntegralPair)
31     $(LREF isMark)
32     $(LREF isNonCharacter)
33     $(LREF isNumber)
34     $(LREF isPrivateUse)
35     $(LREF isPunctuation)
36     $(LREF isSpace)
37     $(LREF isSurrogate)
38     $(LREF isSurrogateHi)
39     $(LREF isSurrogateLo)
40     $(LREF isSymbol)
41     $(LREF isWhite)
42 ))
43 $(TR $(TD Normalization) $(TD
44     $(LREF NFC)
45     $(LREF NFD)
46     $(LREF NFKD)
47     $(LREF NormalizationForm)
48     $(LREF normalize)
49 ))
50 $(TR $(TD Decompose) $(TD
51     $(LREF decompose)
52     $(LREF decomposeHangul)
53     $(LREF UnicodeDecomposition)
54 ))
55 $(TR $(TD Compose) $(TD
56     $(LREF compose)
57     $(LREF composeJamo)
58 ))
59 $(TR $(TD Sets) $(TD
60     $(LREF CodepointInterval)
61     $(LREF CodepointSet)
62     $(LREF InversionList)
63     $(LREF unicode)
64 ))
65 $(TR $(TD Trie) $(TD
66     $(LREF codepointSetTrie)
67     $(LREF CodepointSetTrie)
68     $(LREF codepointTrie)
69     $(LREF CodepointTrie)
70     $(LREF toTrie)
71     $(LREF toDelegate)
72 ))
73 $(TR $(TD Casing) $(TD
74     $(LREF asCapitalized)
75     $(LREF asLowerCase)
76     $(LREF asUpperCase)
77     $(LREF isLower)
78     $(LREF isUpper)
79     $(LREF toLower)
80     $(LREF toLowerInPlace)
81     $(LREF toUpper)
82     $(LREF toUpperInPlace)
83 ))
84 $(TR $(TD Utf8Matcher) $(TD
85     $(LREF isUtfMatcher)
86     $(LREF MatcherConcept)
87     $(LREF utfMatcher)
88 ))
89 $(TR $(TD Separators) $(TD
90     $(LREF lineSep)
91     $(LREF nelSep)
92     $(LREF paraSep)
93 ))
94 $(TR $(TD Building blocks) $(TD
95     $(LREF allowedIn)
96     $(LREF combiningClass)
97     $(LREF Grapheme)
98 ))
99 )
100 
101     $(P All primitives listed operate on Unicode characters and
102         sets of characters. For functions which operate on ASCII characters
103         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
104         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
105         used throughout this module see the $(S_LINK Terminology, terminology) section
106         below.
107     )
108     $(P The focus of this module is the core needs of developing Unicode-aware
109         applications. To that effect it provides the following optimized primitives:
110     )
111     $(UL
112         $(LI Character classification by category and common properties:
113             $(LREF isAlpha), $(LREF isWhite) and others.
114         )
115         $(LI
116             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
117         )
118         $(LI
119             Converting text to any of the four normalization forms via $(LREF normalize).
120         )
121         $(LI
122             Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
123             by user-perceived characters, that is by $(LREF Grapheme) clusters.
124         )
125         $(LI
126             Decomposing and composing of individual character(s) according to canonical
127             or compatibility rules, see $(LREF compose) and $(LREF decompose),
128             including the specific version for Hangul syllables $(LREF composeJamo)
129             and $(LREF decomposeHangul).
130         )
131     )
132     $(P It's recognized that an application may need further enhancements
133         and extensions, such as less commonly known algorithms,
134         or tailoring existing ones for region specific needs. To help users
135         with building any extra functionality beyond the core primitives,
136         the module provides:
137     )
138     $(UL
139         $(LI
140             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
141             Besides the typical set algebra it provides an unusual feature:
142             a D source code generator for detection of $(CODEPOINTS) in this set.
143             This is a boon for meta-programming parser frameworks,
144             and is used internally to power classification in small
145             sets like $(LREF isWhite).
146         )
147         $(LI
148             A way to construct optimal packed multi-stage tables also known as a
149             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
150             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
151             construct custom tries that map dchar to value.
152             The end result is a fast and predictable $(BIGOH 1) lookup that powers
153             functions like $(LREF isAlpha) and $(LREF combiningClass),
154             but for user-defined data sets.
155         )
156         $(LI
157             A useful technique for Unicode-aware parsers that perform
158             character classification of encoded $(CODEPOINTS)
159             is to avoid unnecessary decoding at all costs.
160             $(LREF utfMatcher) provides an improvement over the usual workflow
161             of decode-classify-process, combining the decoding and classification
162             steps. By extracting necessary bits directly from encoded
163             $(S_LINK Code unit, code units) matchers achieve
164             significant performance improvements. See $(LREF MatcherConcept) for
165             the common interface of UTF matchers.
166         )
167         $(LI
168             Generally useful building blocks for customized normalization:
169             $(LREF combiningClass) for querying combining class
170             and $(LREF allowedIn) for testing the Quick_Check
171             property of a given normalization form.
172         )
173         $(LI
174             Access to a large selection of commonly used sets of $(CODEPOINTS).
175             $(S_LINK Unicode properties, Supported sets) include Script,
176             Block and General Category. The exact contents of a set can be
177             observed in the CLDR utility, on the
178             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
179             of the Unicode website.
180             See $(LREF unicode) for easy and (optionally) compile-time checked set
181             queries.
182         )
183     )
184     $(SECTION Synopsis)
185     ---
186     import std.uni;
187     void main()
188     {
189         // initialize code point sets using script/block or property name
190         // now 'set' contains code points from both scripts.
191         auto set = unicode("Cyrillic") | unicode("Armenian");
192         // same thing but simpler and checked at compile-time
193         auto ascii = unicode.ASCII;
194         auto currency = unicode.Currency_Symbol;
195 
196         // easy set ops
197         auto a = set & ascii;
198         assert(a.empty); // as it has no intersection with ascii
199         a = set | ascii;
200         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
201 
202         // some properties of code point sets
203         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
204         // testing presence of a code point in a set
205         // is just fine, it is O(logN)
206         assert(!b['$']);
207         assert(!b['\u058F']); // Armenian dram sign
208         assert(b['¥']);
209 
210         // building fast lookup tables, these guarantee O(1) complexity
211         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
212         auto oneTrie = toTrie!1(b);
213         // 2-level far more compact but typically slightly slower
214         auto twoTrie = toTrie!2(b);
215         // 3-level even smaller, and a bit slower yet
216         auto threeTrie = toTrie!3(b);
217         assert(oneTrie['£']);
218         assert(twoTrie['£']);
219         assert(threeTrie['£']);
220 
221         // build the trie with the most sensible trie level
222         // and bind it as a functor
223         auto cyrillicOrArmenian = toDelegate(set);
224         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
225         assert(balance == "ընկեր!");
226         // compatible with bool delegate(dchar)
227         bool delegate(dchar) bindIt = cyrillicOrArmenian;
228 
229         // Normalization
230         string s = "Plain ascii (and not only), is always normalized!";
231         assert(s is normalize(s));// is the same string
232 
233         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis
234         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
235         assert(nS == "Äffin");
236         assert(nS != nonS);
237         string composed = "Äffin";
238 
239         assert(normalize!NFD(composed) == "A\u0308ffin");
240         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
241         assert(normalize!NFKD("2¹⁰") == "210");
242     }
243     ---
244     $(SECTION Terminology
245     )
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a "character"
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 ($(D char)),
309         16-bit code units in the UTF-16 ($(D wchar)),
310         and 32-bit code units in the UTF-32 ($(D dchar)).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D $(D dchar) type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark (M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization
387     )
388     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
389         or $(S_LINK Compatibility equivalent, compatibility equivalent)
390         characters in the Unicode Standard make it necessary to have a full, formal
391         definition of equivalence for Unicode strings.
392         String equivalence is determined by a process called normalization,
393         whereby strings are converted into forms which are compared
394         directly for identity. This is the primary goal of the normalization process,
395         see the function $(LREF normalize) to convert into any of
396         the four defined forms.
397     )
398     $(P A very important attribute of the Unicode Normalization Forms
399         is that they must remain stable between versions of the Unicode Standard.
400         A Unicode string normalized to a particular Unicode Normalization Form
401         in one version of the standard is guaranteed to remain in that Normalization
402         Form for implementations of future versions of the standard.
403     )
404     $(P The Unicode Standard specifies four normalization forms.
405         Informally, two of these forms are defined by maximal decomposition
406         of equivalent sequences, and two of these forms are defined
407         by maximal $(I composition) of equivalent sequences.
408             $(UL
409             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
410                 canonical decomposition) of a character sequence.)
411             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
412                 compatibility decomposition) of a character sequence.)
413             $(LI Normalization Form C (NFC): The canonical composition of the
414                 $(S_LINK Canonical decomposition, canonical decomposition)
415                 of a coded character sequence.)
416             $(LI Normalization Form KC (NFKC): The canonical composition
417             of the $(S_LINK Compatibility decomposition,
418                 compatibility decomposition) of a character sequence.)
419             )
420     )
421     $(P The choice of the normalization form depends on the particular use case.
422         NFC is the best form for general text, since it's more compatible with
423         strings converted from legacy encodings. NFKC is the preferred form for
424         identifiers, especially where there are security concerns. NFD and NFKD
425         are the most useful for internal processing.
426     )
427     $(SECTION Construction of lookup tables
428     )
429     $(P The Unicode standard describes a set of algorithms that
430         depend on having the ability to quickly look up various properties
431         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
432         it is not a trivial task to provide a space-efficient solution for
433         the multitude of properties.
434     )
435     $(P Common approaches such as hash-tables or binary search over
436         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
437         Hash-tables have an enormous memory footprint and binary search
438         over intervals is not fast enough for some heavy-duty algorithms.
439     )
440     $(P The recommended solution (see Unicode Implementation Guidelines)
441         is using multi-stage tables that are an implementation of the
442         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
443         keys and a fixed number of stages. For the remainder of the section
444         this will be called a fixed trie. The following describes a particular
445         implementation that is aimed for the speed of access at the expense
446         of ideal size savings.
447     )
448     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
449         Split the number of bits in a key (code point, 21 bits) into 2 components
450         (e.g. 13 and 8). The first is the number of bits in the index of the trie
451         and the other is the number of bits in each page of the trie.
452         The layout of the trie is then an array of size 2^^bits-of-index followed by
453         an array of memory chunks of size 2^^bits-of-page/bits-per-element.
454     )
455     $(P The number of pages is variable (but not less than 1)
456         unlike the number of entries in the index. The slots of the index
457         all have to contain a number of a page that is present. The lookup is then
458         just a couple of operations - slice the upper bits,
459         lookup an index for these, take a page at this index and use
460         the lower bits as an offset within this page.
461 
462         Assuming that pages are laid out consecutively
463         in one array at $(D pages), the pseudo-code is:
464     )
465     ---
466     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
467     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
468     ---
469     $(P Where if $(D elemsPerPage) is a power of 2 the whole process is
470         a handful of simple instructions and 2 array reads. Subsequent levels
471         of the trie are introduced by recursing on this notion - the index array
472         is treated as values. The number of bits in index is then again
473         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
474     )
475 
476     $(P For completeness a level 1 trie is simply an array.
477         The current implementation takes advantage of bit-packing values
478         when the range is known to be limited in advance (such as $(D bool)).
479         See also $(LREF BitPacked) for enforcing it manually.
480         The major size advantage however comes from the fact
481         that multiple $(B identical pages on every level are merged) by construction.
482     )
483     $(P The process of constructing a trie is more involved and is hidden from
484         the user in a form of the convenience functions $(LREF codepointTrie),
485         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
486         In general a set or built-in AA with $(D dchar) type
487         can be turned into a trie. The trie object in this module
488         is read-only (immutable); it's effectively frozen after construction.
489     )
490     $(SECTION Unicode properties
491     )
492     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
493         with specific helpers per category nested within. Consult the
494         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
495         when in doubt about the contents of a particular set.
496     )
497     $(P General category sets listed below are only accessible with the
498         $(LREF unicode) shorthand accessor.)
499         $(BOOKTABLE $(B General category ),
500              $(TR $(TH Abb.) $(TH Long form)
501                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
502             $(TR $(TD L) $(TD Letter)
503                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
504             $(TR $(TD Ll) $(TD Lowercase_Letter)
505                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
506             $(TR $(TD Lm) $(TD Modifier_Letter)
507                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
508             $(TR $(TD Lo) $(TD Other_Letter)
509                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
510             $(TR $(TD Lt) $(TD Titlecase_Letter)
511               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
512             $(TR $(TD Lu) $(TD Uppercase_Letter)
513               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
514             $(TR $(TD M) $(TD Mark)
515               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
516             $(TR $(TD Mc) $(TD Spacing_Mark)
517               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
518             $(TR $(TD Me) $(TD Enclosing_Mark)
519               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
520             $(TR $(TD Mn) $(TD Nonspacing_Mark)
521               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
522             $(TR $(TD C) $(TD Other)
523               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
524             $(TR $(TD Cc) $(TD Control) $(TD Pf)
525               $(TD Final_Punctuation)   $(TD -) $(TD Any))
526             $(TR $(TD Cf) $(TD Format)
527               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
528     )
529     $(P Sets for other commonly useful properties that are
530         accessible with $(LREF unicode):)
531         $(BOOKTABLE $(B Common binary properties),
532             $(TR $(TH Name) $(TH Name) $(TH Name))
533             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
534             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
535             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
536             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
537             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
538             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
539             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
540             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
541             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
542             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
543             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
544             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
545             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
546             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
547             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
548             $(TR $(TD ID_Continue) $(TD Other_Math)  )
549     )
550     $(P Below is the table with block names accepted by $(LREF unicode.block).
551         Note that the shorthand version $(LREF unicode) requires "In"
552         to be prepended to the names of blocks so as to disambiguate
553         scripts and blocks.
554     )
555     $(BOOKTABLE $(B Blocks),
556         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
557         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
558         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
559         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
560         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
561         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
562         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
563         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
564         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
565         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
566         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
567         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
568         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
569         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
570         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
571         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
572         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
573         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
574         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
575         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
576         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
577         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
578         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
579         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
580         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
581         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
582         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
583         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
584         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
585         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
586         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
587         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
588         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
589         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
590         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
591         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
592         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
593         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
594         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
595         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
596         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
597         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
598         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
599         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
600         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
601         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
602         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
603         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
604         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
605         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
606         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
607         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
608         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
609         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
610         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
611         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
612         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
613         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
614         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
615         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
616         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
617         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
618         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
619         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
620         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
621         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
622         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
623         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
624         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
625         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
626         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
627         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
628         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
629         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
630     )
631     $(P Below is the table with script names accepted by $(LREF unicode.script)
632         and by the shorthand version $(LREF unicode):)
633         $(BOOKTABLE $(B Scripts),
634             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
635             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
636             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
637             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
638             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
639             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
640             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
641             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
642             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
643             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
644             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
645             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
646             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
647             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
648             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
649             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
650             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
651             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
652             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
653             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
654             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
655             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
656             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
657             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
658             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
659             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
660             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
661             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
662             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
663             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
664             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
665             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
666             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
667             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
668     )
669     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
670         $(BOOKTABLE $(B Hangul syllable type),
671             $(TR $(TH Abb.) $(TH Long form))
672             $(TR $(TD L)   $(TD Leading_Jamo))
673             $(TR $(TD LV)  $(TD LV_Syllable))
674             $(TR $(TD LVT) $(TD LVT_Syllable) )
675             $(TR $(TD T)   $(TD Trailing_Jamo))
676             $(TR $(TD V)   $(TD Vowel_Jamo))
677     )
678     References:
679         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
680         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
681         $(HTTP www.unicode.org, The Unicode Consortium),
682         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
686         $(HTTP www.unicode.org/uni2book/ch03.pdf,
687             Unicode Conformance)
688     Trademarks:
689         Unicode(tm) is a trademark of Unicode, Inc.
690 
691     Copyright: Copyright 2013 -
692     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
693     Authors:   Dmitry Olshansky
694     Source:    $(PHOBOSSRC std/_uni.d)
695     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
696 
697 Macros:
698 
699 SECTION = <h3><a id="$1">$0</a></h3>
700 DEF = <div><a id="$1"><i>$0</i></a></div>
701 S_LINK = <a href="#$1">$+</a>
702 CODEPOINT = $(S_LINK Code point, code point)
703 CODEPOINTS = $(S_LINK Code point, code points)
704 CHARACTER = $(S_LINK Character, character)
705 CHARACTERS = $(S_LINK Character, characters)
706 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
707 +/
708 module std.uni;
709 
710 import std.meta; // AliasSeq
711 import std.range.primitives; // back, ElementEncodingType, ElementType, empty,
712     // front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put,
713     // save
714 import std.traits; // isConvertibleToString, isIntegral, isSomeChar,
715     // isSomeString, Unqual
716 import std.exception;// : enforce;
717 import core.memory; //: pureMalloc, pureRealloc, pureFree;
718 import core.exception; // : onOutOfMemoryError;
719 static import std.ascii;
720 // debug = std_uni;
721 
722 debug(std_uni) import std.stdio; // writefln, writeln
723 
724 private:
725 
version (unittest)
{
private:
    // Non-copyable wrapper that only turns into a string through `alias this`;
    // used to check that functions accept such types.
    struct TestAliasedString
    {
        string _s;
        @disable this(this);
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
    }

    // Calls `func` once with the wrapper and once with the plain string and
    // reports whether both calls produced the same result.
    bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto viaWrapper = func(TestAliasedString(s), args);
        auto viaString = func(s, args);
        static if (!is(typeof(equal(viaWrapper, viaString))))
            return viaWrapper == viaString;
        else
            return equal(viaWrapper, viaString); // ranges: compare contents, not identity
    }
}
753 
// Element-wise copy of src into dest, visiting indices from the last one down
// to the first, so an overlapping dest that starts after src is not clobbered
// before it is read.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
760 
// Element-wise copy of src into dest, visiting indices in ascending order,
// so an overlapping dest that starts before src is not clobbered before it
// is read.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
767 
// Compile-time flag consulted by PackedPtrImpl before it accesses packed
// items through ushort/uint pointers instead of shifting and masking.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
777 
// Marked `public` explicitly because the declarations around them sit under a `private:` label.
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
781 
// test the intro example
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (U+0308)
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
840 
enum lastDchar = 0x10FFFF; // same value as dchar.max, the largest code point
842 
// Checked conversion of `from` to the built-in integral type T:
// asserts that the value lies within T's range, then casts.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(T.min <= from && from <= T.max);
    return cast(T) from;
}

// Checked conversion of `from` to a BitPacked type T:
// asserts that the value fits into bitSizeOf!T bits, then wraps it.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    alias Raw = TypeOfBitPacked!T;
    return T(cast(Raw) from);
}

// Source and target types coincide: nothing to check or convert.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
862 
// Repeat `times` times the bit pattern in `val`, assuming its length is `bits`.
// The copies are laid out back to back starting from the least significant bit;
// e.g. replicateBits!(3, 2)(0b01) == 0b010101.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit repeated: either all zeros or `times` ones
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0; // shifting by the full word width is not allowed
        else
            // shift a size_t, not an int literal: `1 << times` is an int shift
            // and is out of range once times reaches 32 on 64-bit targets
            return val ? (cast(size_t) 1 << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: peel off one copy, replicate the remaining even count
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the repetitions
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
880 
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form 3*i set bits, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
893 
// multiple arrays squashed into one memory block
//
// Array number i holds items of type Types[i]; its item count is sz[i] and
// its data starts offsets[i] words into `storage`. Items are bit-packed, the
// number of words needed for a given item count is given by spaceFor.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for arrays with the given item
    // counts, one count per entry of Types.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and raw words) without copying.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-carrying view of array n (sz[n] items).
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer to the first item of array n.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of items in array n.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes array n to new_size items, moving the arrays stored after
        // it so that they keep their contents.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                // may reserve up to one word more than strictly needed
                // because of rounding; the surplus stays with this array
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Release only the words that become unused. Because items
                // are packed several per word this is the difference of the
                // rounded-up word counts, which may be less than
                // spaceFor(number of removed items).
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array, forward direction
                static if (n != dim-1)
                {
                    // everything from the next array up to the end of storage
                    // slides `delta` words towards the front; ascending order
                    // keeps the overlapping move intact
                    immutable tail = offsets[n+1];
                    copyForward(storage[tail .. $],
                        storage[tail - delta .. $ - delta]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words between
    // the start of array n and the start of the next one (end of storage for
    // the last array).
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and raw storage to sink as three
    // comma-separated lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of array n inside storage.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
1020 
// Resizes and refills the levels of a MultiArray in various orders and checks
// that previously filled levels keep their contents; runs both at compile
// time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // level k must hold 1, 2, .. n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // level k must hold n, n-1, .. 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // store 1, 2, .. n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // store n, n-1, .. 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink level 0, grow level 1, then refill both
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // evaluated at compile time
    auto rt = dg(); // and again at run time
}
1093 
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // each fnN picks the bit field of an index that belongs to one level
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // every item of level lvl must equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // store the index of each item of level lvl into that item
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // grow the levels starting from the last one
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // filling level 0 must not have disturbed the other levels
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1158 
// Number of machine words needed to hold new_len items of _bits bits each.
// The item width is first rounded up to a power of two (see PackedArrayView).
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math : nextPow2;
    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);
    enum wordBits = 8*size_t.sizeof;
    static if (bits <= wordBits)
    {
        // several (or exactly one) items per word
        enum perWord = wordBits/bits;
        return (new_len + perWord - 1)/perWord; // rounded up
    }
    else
    {
        // one item spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1174 
// True for types that can be stored bit-packed: BitPacked wrappers,
// integral types, bool and character types.
enum isBitPackableType(T) =
    isBitPacked!T || isIntegral!T || is(T == bool) || isSomeChar!T;
1180 
1181 //============================================================================
// Selects the PackedArrayViewImpl instance for T: the item width is
// bitSizeOf!T rounded up to a power of two (1 stays 1).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1190 
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Selects the PackedPtrImpl instance for T: the item width is
// bitSizeOf!T rounded up to a power of two (1 stays 1).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1200 
// Unchecked indexed access to items of `bits` bits each, packed `factor`
// per machine word starting at `origin`. No bounds are tracked here.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: select the word, then shift and mask the item out of it.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the item's bits in its word, then OR the new value in.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    body
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Item widths that coincide with a built-in unsigned type U can be
    // accessed through a U pointer instead of shifting and masking.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // The typed access is used only on little-endian targets at run
        // time; CTFE and other targets go through simpleIndex.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Same split as opIndex: typed store on little-endian at run time,
        // simpleWrite otherwise.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Remaining widths always use the shift-and-mask helpers.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1307 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items starting `ofs` items past `origin`;
// all indices taken by the public operations are relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true if every item in the view-relative range [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    body
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // the range does not reach past the first word boundary:
        // there is no whole word to test, check item by item
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // leading items up to the first word boundary
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // trailing items after the last whole word
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads item idx of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    body
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes item idx of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    body
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every item in the view-relative range [start, end) to val.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    body
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over the view-relative range [from, to); shares the memory.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` is relative to this view, just like the index checked in
        // opIndex, so it is compared against limit without adding ofs
        assert(to <= limit);
    }
    body
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Item-wise comparison with another view; compares whole words when
    // both views start on a word boundary and span whole words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an item index up/down to a multiple of factor (a word boundary)
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1465 
1466 
// Random-access range over the elements [from, to) of an indexable container
// that is referred to through a pointer. The range never owns the container:
// popping only moves the bounds, and writes go straight through to it.
private struct SliceOverIndexed(T)
{
    // true if a single element of T can be assigned through indexing
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    // true if a slice of T can be filled with a single Item
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element at position idx, counted from the start of this slice.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < length);
    }
    body
    {
        return (*arr)[from + idx];
    }

    // Store val at position idx (only if the container allows it).
    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < length);
    }
    body
    {
        (*arr)[from + idx] = val;
    }

    // Sub-slice [a, b), with both bounds relative to this slice.
    auto opSlice(size_t a, size_t b)
    {
        immutable lo = from + a;
        immutable hi = from + b;
        return typeof(this)(lo, hi, arr);
    }

    // Assign val to the elements [start, end) of this slice.
    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[from + start .. from + end] = val;
    }

    // Slice covering the whole range.
    auto opSlice()
    {
        return this;
    }

    @property size_t length()const
    {
        return to - from;
    }

    auto opDollar()const
    {
        return to - from;
    }

    @property bool empty()const
    {
        return to == from;
    }

    @property auto front()const
    {
        return (*arr)[from];
    }

    static if (assignableIndex)
    @property void front(Item val)
    {
        (*arr)[from] = val;
    }

    @property auto back()const
    {
        return (*arr)[to - 1];
    }

    static if (assignableIndex)
    @property void back(Item val)
    {
        (*arr)[to - 1] = val;
    }

    @property auto save() inout
    {
        return this;
    }

    void popFront()
    {
        ++from;
    }

    void popBack()
    {
        --to;
    }

    // Element-wise comparison with anything that has length and indexing.
    bool opEquals(T)(auto ref T arr) const
    {
        immutable count = length;
        if (arr.length != count)
            return false;
        foreach (i; 0 .. count)
        {
            if (this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;    // bounds of the slice inside *arr
    T* arr;             // the container being sliced (not owned)
}
1544 
// Compile-time check: SliceOverIndexed over a plain array must satisfy the
// whole random-access range interface.
static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
1546 
// Creates a read-only SliceOverIndexed over elements [a, b) of the container
// pointed to by x.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = SliceOverIndexed!(const(T));
    return View(a, b, x);
}
1552 
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Creates a mutable SliceOverIndexed over elements [a, b) of the container
// pointed to by x; a separate overload because inout cannot be used here.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = SliceOverIndexed!T;
    return View(a, b, x);
}
1560 
// Exercises the range primitives, indexing, slicing, slice assignment and
// equality of SliceOverIndexed.
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // An empty slice over a null array: previously `nullArr` was declared but
    // the slice was taken over `idxArray`, so the null case was never tested.
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
}
1596 
// Wraps `items` packed values stored in the words at `ptr` into a
// PackedArrayView that starts at offset 0.
private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow
{
    enum size_t startOffset = 0;
    return inout(PackedArrayView!T)(ptr, startOffset, items);
}
1601 
1602 
1603 //============================================================================
1604 // Partially unrolled binary search using Shar's method
1605 //============================================================================
1606 
// Generates D source (to be mixed in) for the unrolled tail of a uniform
// binary search.
//
// The mixin site must have these names in scope: `range`, `needle`, `pred`,
// a mutable `idx` and the current step `m` (a power of two below `size`, or 0).
// The generated code is a switch on the bit position of `m` whose cases fall
// through from the largest remaining step down to the final single-element
// check, updating `idx` on the way.
//
// Params:
//     size = exclusive upper bound on the step handled by the switch;
//            must be a non-zero power of two
// Returns: the generated code as a string
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    // bsr(0) is undefined, so zero is rejected along with non-powers of two
    assert(size != 0 && isPow2OrZero(size));
    // A step of zero (range of length 1) must go straight to `case 0`;
    // bsr(m) may not be evaluated for it because its result is undefined.
    string code = `
    import core.bitop : bsr;
    auto power = m == 0 ? 0 : bsr(m)+1;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        // one case per step size; "m" and "pow" are placeholders that get
        // substituted with the literal step and the case label
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1641 
// True if sz is zero or has exactly one bit set.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    // clearing the lowest set bit leaves nothing only for 0 and powers of two
    immutable withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1647 
// Uniform binary search: returns the number of leading elements of `range`
// for which pred(element, needle) holds. The length of `range` is asserted to
// be zero or a power of two; note that the final probe still reads range[0].
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t pos = 0;
    // halve the step each round, advancing whenever the probe satisfies pred
    for (size_t step = range.length/2; step != 0; step /= 2)
    {
        if (pred(range[pos+step], needle))
            pos += step;
    }
    // the element at `pos` itself has not been tested yet
    return pred(range[pos], needle) ? pos + 1 : pos;
}
1663 
// Same contract as uniformLowerBound, but once the step drops below `max`
// the remaining probes are done by the switch generated by
// genUnrolledSwitchSearch instead of a loop.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    enum max = 1 << 10;
    // `idx` and `m` are referenced by name from the mixed-in code below
    size_t idx = 0;
    size_t m = range.length/2;
    for (; m >= max; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1679 
// Extends a lower-bound search that only handles power-of-two lengths
// (`uniLowerBound`) to ranges of any length.
//
// The range is split at n, the largest power of two not exceeding its length.
// If range[n-1] does not satisfy the predicate the answer lies in the first n
// elements; otherwise it lies in the tail, which is searched through a
// power-of-two window aligned to the end of the range.
//
// Returns: the number of leading elements satisfying _pred(element, needle).
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            // nextPow2 is strictly greater than its argument, so for a length
            // of 2n-1 it yields 2n, which is larger than the range itself.
            // The tail is always shorter than n, so a window of n suffices.
            if (k > n)
                k = n;
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
1702 
// Lower-bound search for any length, built on the loop-based uniform search.
alias sharLowerBound = sharMethod!uniformLowerBound;
// Same, built on the variant that finishes with the generated switch.
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1705 
// Checks both Shar-method searches against std.range's lowerBound.
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference result computed by the standard library
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);

    // both implementations must agree with the reference for `needle`
    void verify(int needle)
    {
        immutable expected = stdLowerBound(arr, needle);
        assert(expected == sharLowerBound(arr, needle));
        assert(expected == sharSwitchLowerBound(arr, needle));
    }

    foreach (i; 0 .. MAX+5)
        verify(i);
    // the empty range is a special case
    arr = [];
    verify(33);
}
1729 //============================================================================
1730 
1731 @safe
1732 {
// hope to see similar stuff in public interface... once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only

// Replaces dest[from .. to] with the contents of `stuff`, growing or shrinking
// `dest` as needed.
//
// Resizing is done by assigning to dest.length when Policy is void, otherwise
// through Policy.realloc. The order of the copy steps matters because source
// and destination regions overlap inside `dest`.
// NOTE(review): copyBackwards/copyForward are defined elsewhere in the file;
// presumably they copy in an order that is safe for such overlap - confirm.
//
// Returns: from + stuff.length, i.e. the index just past the inserted items.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;   // number of items being replaced
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old  by delta
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // move the old tail up by delta first, then fill the gap
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same size: overwrite in place, no resizing
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        copy(stuff, dest[from .. stuff_end]);
        // pull the tail down over the freed space before shrinking
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}
1770 
1771 
// Storage manipulation policy that keeps arrays on the GC heap
@trusted private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Fresh mutable copy of arr.
    static T[] dup(T)(const T[] arr)
    {
        T[] copy = arr.dup;
        return copy;
    }

    // New array of `size` default-initialized elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[](size);
    }

    // Array resized to sz elements; uses the built-in length assignment.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends one value, converted to T via force.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        auto item = force!T(value);
        arr ~= item;
    }

    // Appends every element of an input range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        immutable end = arr.length;
        insertInPlace(arr, end, value);
    }

    // Drops the reference to a mutable array; debug builds overwrite the
    // contents with a marker value first.
    static void destroy(T)(ref T arr)
        if (isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            alias Elem = typeof(T.init[0]);
            arr[] = cast(Elem)(0xdead_beef);
        }
        arr = null;
    }

    // Drops the reference to an array whose contents may not be written.
    static void destroy(T)(ref T arr)
        if (isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}
1826 
// Storage manipulation policy that keeps arrays on the C heap
// (malloc / realloc / free). Arrays obtained from it must be released with
// ReallocPolicy.destroy.
@trusted struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // C-heap copy of arr.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates `size` uninitialized elements on the C heap.
    // Returns null for size == 0. Throws if malloc fails; a size whose byte
    // count overflows size_t halts the program.
    static T[] alloc(T)(size_t size)
    {
        import core.stdc.stdlib : malloc;
        import std.exception : enforce;

        // malloc(0) is allowed to return null, which enforce would report
        // as an out-of-memory condition; an empty array needs no storage.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap");
        return ptr[0 .. size];
    }

    // Resizes arr to `size` elements; size == 0 frees it and returns null.
    static T[] realloc(T)(T[] arr, size_t size)
    {
        import core.stdc.stdlib : realloc;
        import std.exception : enforce;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap");
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends one value, converted to T via force.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all elements of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the storage of arr (if any) and resets it to null.
    static void destroy(T)(ref T[] arr)
    {
        import core.stdc.stdlib : free;
        if (arr.ptr)
            free(arr.ptr);
        arr = null;
    }
}
1925 
//build hack
// NOTE(review): presumably this alias exists to force CowArray to be
// instantiated with ReallocPolicy during the build - confirm.
alias _RealArray = CowArray!ReallocPolicy;
1928 
// Checks ReallocPolicy.replaceImpl for growing, shrinking and same-size
// replacements.
@safe unittest
{
    import std.algorithm.comparison : equal;

    // Applies the replacement to `orig`, compares against `result`, and
    // releases the C-heap storage afterwards.
    bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
               string file = __FILE__, size_t line = __LINE__)
    {
        ReallocPolicy.replaceImpl(orig, from, to, toReplace);
        scope(exit) ReallocPolicy.destroy(orig);
        return equal(orig, result);
    }
    // C-heap copy of the arguments, as replaceImpl will realloc it
    static T[] arr(T)(T[] args... )
    {
        return ReallocPolicy.dup(args);
    }

    assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
    assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
    assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
    assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
    assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
}
1958 
/**
    Tests if $(D T) is some kind of a set of code points, that is an
    instance of $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T == InversionList!(Args), Args...);
}
1969 
/**
    Tests if $(D T) is a pair of integers that implicitly convert to $(D V).
    The following code must compile for any pair $(D T):
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
     ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    // both of the first two elements convert to V
    private enum hasTwoElements = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    // a third element is accessible as well
    private enum hasThirdElement = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasTwoElements && !hasThirdElement;
}
1986 
1987 
/**
    The recommended default type for a set of $(CODEPOINTS).
    For details, see the current implementation: $(LREF InversionList).
*/
public alias CodepointSet = InversionList!GcPolicy;
1993 
1994 
1995 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1996 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1997 // hence below doesn't seem to work
1998 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1999 
/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // the two bounds; exposed through alias this so the interval can be indexed
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    // Creates the interval [low, high).
    this(uint low, uint high)
    {
        _tuple[1] = high;
        _tuple[0] = low;
    }

    // Equal if both bounds match; val may be anything indexable with 0 and 1.
    bool opEquals(T)(T val) const
    {
        if (_tuple[0] != val[0])
            return false;
        return _tuple[1] == val[1];
    }

    // Lower (inclusive) bound.
    @property ref inout(uint) a() inout
    {
        return _tuple[0];
    }

    // Upper (exclusive) bound.
    @property ref inout(uint) b() inout
    {
        return _tuple[1];
    }
}
2025 
2026 /**
2027     $(P
2028     $(D InversionList) is a set of $(CODEPOINTS)
2029     represented as an array of open-right [a, b$(RPAREN)
2030     intervals (see $(LREF CodepointInterval) above).
2031     The name comes from the way the representation reads left to right.
2032     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2033     plus a singular value 60 looks like this:
2034     )
2035     ---
2036     10, 50, 60, 61, 80, 90
2037     ---
2038     $(P
2039     The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive
2041     - the contrary). Then switch positive/negative after each
2042     number passed from left to right.
2043     )
2044     $(P This way negative spans until 10, then positive until 50,
2045     then negative until 60, then positive until 61, and so on.
2046     As seen this provides a space-efficient storage of highly redundant data
2047     that comes in long runs. A description which Unicode $(CHARACTER)
2048     properties fit nicely. The technique itself could be seen as a variation
2049     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2050     )
2051 
2052     $(P Sets are value types (just like $(D int) is) thus they
2053         are never aliased.
2054     )
2055         Example:
2056         ---
2057         auto a = CodepointSet('a', 'z'+1);
2058         auto b = CodepointSet('A', 'Z'+1);
2059         auto c = a;
2060         a = a | b;
2061         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2062         assert(a != c);
2063         ---
2064     $(P See also $(LREF unicode) for simpler construction of sets
2065         from predefined ones.
2066     )
2067 
2068     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2069     The value semantics are achieved by using the
2070     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2071     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2072     )
2073 
2074     Note:
2075     $(P It's not recommended to rely on the template parameters
2076     or the exact type of a current $(CODEPOINT) set in $(D std.uni).
2077     The type and parameters may change when the standard
2078     allocators design is finalized.
2079     Use $(LREF isCodepointSet) with templates or just stick with the default
2080     alias $(LREF CodepointSet) throughout the whole code base.
2081     )
2082 */
2083 @trusted public struct InversionList(SP=GcPolicy)
2084 {
2085     import std.range : assumeSorted;
2086 
2087     /**
2088         Construct from another code point set of any type.
2089     */
2090     this(Set)(Set set) pure
2091         if (isCodepointSet!Set)
2092     {
2093         uint[] arr;
2094         foreach (v; set.byInterval)
2095         {
2096             arr ~= v.a;
2097             arr ~= v.b;
2098         }
2099         data = CowArray!(SP).reuse(arr);
2100     }
2101 
2102     /**
2103         Construct a set from a forward range of code point intervals.
2104     */
2105     this(Range)(Range intervals) pure
2106         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2107     {
2108         uint[] arr;
foreach(v;intervals)2109         foreach (v; intervals)
2110         {
2111             SP.append(arr, v.a);
2112             SP.append(arr, v.b);
2113         }
2114         data = CowArray!(SP).reuse(arr);
2115         sanitize(); //enforce invariant: sort intervals etc.
2116     }
2117 
2118     //helper function that avoids sanity check to be CTFE-friendly
fromIntervals(Range)2119     private static fromIntervals(Range)(Range intervals) pure
2120     {
2121         import std.algorithm.iteration : map;
2122         import std.range : roundRobin;
2123         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2124             intervals.save.map!"a[1]"());
2125         InversionList set;
2126         set.data = CowArray!(SP)(flattened);
2127         return set;
2128     }
2129     //ditto untill sort is CTFE-able
fromIntervals()2130     private static fromIntervals()(uint[] intervals...) pure
2131     in
2132     {
2133         import std.conv : text;
2134         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2135         for (uint i = 0; i < intervals.length; i += 2)
2136         {
2137             auto a = intervals[i], b = intervals[i+1];
2138             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2139         }
2140     }
2141     body
2142     {
2143         InversionList set;
2144         set.data = CowArray!(SP)(intervals);
2145         return set;
2146     }
2147 
2148     /**
2149         Construct a set from plain values of code point intervals.
2150     */
this()2151     this()(uint[] intervals...)
2152     in
2153     {
2154         import std.conv : text;
2155         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2156         for (uint i = 0; i < intervals.length; i += 2)
2157         {
2158             auto a = intervals[i], b = intervals[i+1];
2159             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2160         }
2161     }
2162     body
2163     {
2164         data = CowArray!(SP)(intervals);
2165         sanitize(); //enforce invariant: sort intervals etc.
2166     }
2167 
    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;

        // Latin and Cyrillic lowercase letters, given as [a, b) bounds
        auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
        foreach (v; 'a'..'z'+1)
            assert(set[v]);
        // Cyrillic lowercase interval
        foreach (v; 'а'..'я'+1)
            assert(set[v]);
        //specific order is not required, intervals may intersect
        auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
        //the same end result
        assert(set2.byInterval.equal(set.byInterval));
    }
2184 
2185     /**
2186         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2187 
2188         Example:
2189         -----------
2190         import std.algorithm.comparison : equal;
2191         import std.typecons : tuple;
2192 
2193         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2194 
2195         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2196         -----------
2197     */
byInterval()2198     @property auto byInterval()
2199     {
2200         return Intervals!(typeof(data))(data);
2201     }
2202 
2203     /**
2204         Tests the presence of code point $(D val) in this set.
2205     */
opIndex(uint val)2206     bool opIndex(uint val) const
2207     {
2208         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2209         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2210         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2211     }
2212 
    ///
    @safe unittest
    {
        // membership test through the index operator
        auto gothic = unicode.Gothic;
        // Gothic letter ahsa
        assert(gothic['\U00010330']);
        // no ascii in Gothic obviously
        assert(!gothic['$']);
    }
2222 
2223 
2224     // Linear scan for $(D ch). Useful only for small sets.
2225     // TODO:
2226     // used internally in std.regex
2227     // should be properly exposed in a public API ?
scanFor()2228     package auto scanFor()(dchar ch) const
2229     {
2230         immutable len = data.length;
2231         for (size_t i = 0; i < len; i++)
2232             if (ch < data[i])
2233                 return i & 1;
2234         return 0;
2235     }
2236 
2237     /// Number of $(CODEPOINTS) in this set
length()2238     @property size_t length()
2239     {
2240         size_t sum = 0;
2241         foreach (iv; byInterval)
2242         {
2243             sum += iv.b - iv.a;
2244         }
2245         return sum;
2246     }
2247 
2248 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2249 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2250 //============================================================================
2251 public:
2252     /**
2253         $(P Sets support natural syntax for set algebra, namely: )
2254         $(BOOKTABLE ,
2255             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2256             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2257             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2258             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2259             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2260         )
2261     */
2262     This opBinary(string op, U)(U rhs)
2263         if (isCodepointSet!U || is(U:dchar))
2264     {
2265         static if (op == "&" || op == "|" || op == "~")
2266         {// symmetric ops thus can swap arguments to reuse r-value
2267             static if (is(U:dchar))
2268             {
2269                 auto tmp = this;
2270                 mixin("tmp "~op~"= rhs; ");
2271                 return tmp;
2272             }
2273             else
2274             {
2275                 static if (is(Unqual!U == U))
2276                 {
2277                     // try hard to reuse r-value
2278                     mixin("rhs "~op~"= this;");
2279                     return rhs;
2280                 }
2281                 else
2282                 {
2283                     auto tmp = this;
2284                     mixin("tmp "~op~"= rhs;");
2285                     return tmp;
2286                 }
2287             }
2288         }
2289         else static if (op == "-") // anti-symmetric
2290         {
2291             auto tmp = this;
2292             tmp -= rhs;
2293             return tmp;
2294         }
2295         else
2296             static assert(0, "no operator "~op~" defined for Set");
2297     }
2298 
    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.range : iota;

        auto lower = unicode.LowerCase;
        auto upper = unicode.UpperCase;
        auto ascii = unicode.ASCII;

        assert((lower & upper).empty); // no intersection
        auto lowerASCII = lower & ascii;
        assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
        // throw away all of the lowercase ASCII
        assert((ascii - lower).length == 128 - 26);

        // symmetric difference: in exactly one of the two sets
        auto onlyOneOf = lower ~ ascii;
        assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
        assert(onlyOneOf['$']); // ASCII and not lowercase
        assert(!onlyOneOf['a']); // ASCII and lowercase
        assert(onlyOneOf['я']); // not ASCII but lowercase

        // throw away all cased letters from ASCII
        auto noLetters = ascii - (lower | upper);
        assert(noLetters.length == 128 - 26*2);
    }
2325 
2326     /// The 'op=' versions of the above overloaded operators.
2327     ref This opOpAssign(string op, U)(U rhs)
2328         if (isCodepointSet!U || is(U:dchar))
2329     {
2330         static if (op == "|")    // union
2331         {
2332             static if (is(U:dchar))
2333             {
2334                 this.addInterval(rhs, rhs+1);
2335                 return this;
2336             }
2337             else
2338                 return this.add(rhs);
2339         }
2340         else static if (op == "&")   // intersection
2341                 return this.intersect(rhs);// overloaded
2342         else static if (op == "-")   // set difference
2343                 return this.sub(rhs);// overloaded
2344         else static if (op == "~")   // symmetric set difference
2345         {
2346             auto copy = this & rhs;
2347             this |= rhs;
2348             this -= copy;
2349             return this;
2350         }
2351         else
2352             static assert(0, "no operator "~op~" defined for Set");
2353     }
2354 
2355     /**
2356         Tests the presence of codepoint $(D ch) in this set,
2357         the same as $(LREF opIndex).
2358     */
2359     bool opBinaryRight(string op: "in", U)(U ch) const
2360         if (is(U : dchar))
2361     {
2362         return this[ch];
2363     }
2364 
    ///
    @safe unittest
    {
        // `in` is the same membership test as indexing
        assert('я' in unicode.Cyrillic);
        assert(!('z' in unicode.Cyrillic));
    }
2371 
2372 
2373 
2374     /**
2375      * Obtains a set that is the inversion of this set.
2376      *
2377      * See_Also: $(LREF inverted)
2378      */
2379     auto opUnary(string op: "!")()
2380     {
2381         return this.inverted;
2382     }
2383 
2384     /**
2385         A range that spans each $(CODEPOINT) in this set.
2386     */
byCodepoint()2387     @property auto byCodepoint()
2388     {
2389         @trusted static struct CodepointRange
2390         {
2391             this(This set)
2392             {
2393                 r = set.byInterval;
2394                 if (!r.empty)
2395                     cur = r.front.a;
2396             }
2397 
2398             @property dchar front() const
2399             {
2400                 return cast(dchar) cur;
2401             }
2402 
2403             @property bool empty() const
2404             {
2405                 return r.empty;
2406             }
2407 
2408             void popFront()
2409             {
2410                 cur++;
2411                 while (cur >= r.front.b)
2412                 {
2413                     r.popFront();
2414                     if (r.empty)
2415                         break;
2416                     cur = r.front.a;
2417                 }
2418             }
2419         private:
2420             uint cur;
2421             typeof(This.init.byInterval) r;
2422         }
2423 
2424         return CodepointRange(this);
2425     }
2426 
2427     ///
2428     @safe unittest
2429     {
2430         import std.algorithm.comparison : equal;
2431         import std.range : iota;
2432 
2433         auto set = unicode.ASCII;
2434         set.byCodepoint.equal(iota(0, 0x80));
2435     }
2436 
2437     /**
2438         $(P Obtain textual representation of this set in form of
2439         open-right intervals and feed it to $(D sink).
2440         )
2441         $(P Used by various standard formatting facilities such as
2442          $(REF formattedWrite, std,_format), $(REF write, std,_stdio),
2443          $(REF writef, std,_stdio), $(REF to, std,_conv) and others.
2444         )
2445         Example:
2446         ---
2447         import std.conv;
2448         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2449         ---
2450     */
2451 
2452     private import std.format : FormatSpec;
2453 
2454     /***************************************
2455      * Obtain a textual representation of this InversionList
2456      * in form of open-right intervals.
2457      *
2458      * The formatting flag is applied individually to each value, for example:
2459      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2460      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2461      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2462      */
toString(Writer)2463     void toString(Writer)(scope Writer sink,
2464                   FormatSpec!char fmt) /* const */
2465     {
2466         import std.format : formatValue;
2467         auto range = byInterval;
2468         if (range.empty)
2469             return;
2470 
2471         while (1)
2472         {
2473             auto i = range.front;
2474             range.popFront();
2475 
2476             put(sink, "[");
2477             formatValue(sink, i.a, fmt);
2478             put(sink, "..");
2479             formatValue(sink, i.b, fmt);
2480             put(sink, ")");
2481             if (range.empty) return;
2482             put(sink, " ");
2483         }
2484     }
2485 
2486     ///
2487     @safe unittest
2488     {
2489         import std.conv : to;
2490         import std.format : format;
2491         import std.uni : unicode;
2492 
2493         assert(unicode.Cyrillic.to!string ==
2494             "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
2495 
2496         // The specs '%s' and '%d' are equivalent to the to!string call above.
2497         assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
2498 
2499         assert(format("%#x", unicode.Cyrillic) ==
2500             "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
2501             ~"[0xa640..0xa698) [0xa69f..0xa6a0)");
2502 
2503         assert(format("%#X", unicode.Cyrillic) ==
2504             "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
2505             ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
2506     }
2507 
2508     @safe unittest
2509     {
2510         import std.exception : assertThrown;
2511         import std.format : format, FormatException;
2512         assertThrown!FormatException(format("%a", unicode.ASCII));
2513     }
2514 
2515 
2516     /**
2517         Add an interval [a, b$(RPAREN) to this set.
2518     */
add()2519     ref add()(uint a, uint b)
2520     {
2521         addInterval(a, b);
2522         return this;
2523     }
2524 
2525     ///
2526     @safe unittest
2527     {
2528         CodepointSet someSet;
2529         someSet.add('0', '5').add('A','Z'+1);
2530         someSet.add('5', '9'+1);
2531         assert(someSet['0']);
2532         assert(someSet['5']);
2533         assert(someSet['9']);
2534         assert(someSet['Z']);
2535     }
2536 
2537 private:
2538 
2539   package(std)  // used from: std.regex.internal.parser
2540     ref intersect(U)(U rhs)
2541         if (isCodepointSet!U)
2542     {
2543         Marker mark;
2544         foreach ( i; rhs.byInterval)
2545         {
2546             mark = this.dropUpTo(i.a, mark);
2547             mark = this.skipUpTo(i.b, mark);
2548         }
2549         this.dropUpTo(uint.max, mark);
2550         return this;
2551     }
2552 
intersect()2553     ref intersect()(dchar ch)
2554     {
2555         foreach (i; byInterval)
2556             if (i.a <= ch && ch < i.b)
2557                 return this = This.init.add(ch, ch+1);
2558         this = This.init;
2559         return this;
2560     }
2561 
2562     @safe unittest
2563     {
2564         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2565     }
2566 
sub()2567     ref sub()(dchar ch)
2568     {
2569         return subChar(ch);
2570     }
2571 
2572     // same as the above except that skip & drop parts are swapped
2573   package(std)  // used from: std.regex.internal.parser
2574     ref sub(U)(U rhs)
2575         if (isCodepointSet!U)
2576     {
2577         Marker mark;
2578         foreach (i; rhs.byInterval)
2579         {
2580             mark = this.skipUpTo(i.a, mark);
2581             mark = this.dropUpTo(i.b, mark);
2582         }
2583         return this;
2584     }
2585 
2586   package(std)  // used from: std.regex.internal.parse
2587     ref add(U)(U rhs)
2588         if (isCodepointSet!U)
2589     {
2590         Marker start;
2591         foreach (i; rhs.byInterval)
2592         {
2593             start = addInterval(i.a, i.b, start);
2594         }
2595         return this;
2596     }
2597 
2598 // end of mixin-able part
2599 //============================================================================
2600 public:
2601     /**
2602         Obtains a set that is the inversion of this set.
2603 
2604         See the '!' $(LREF opUnary) for the same but using operators.
2605     */
inverted()2606     @property auto inverted()
2607     {
2608         InversionList inversion = this;
2609         if (inversion.data.length == 0)
2610         {
2611             inversion.addInterval(0, lastDchar+1);
2612             return inversion;
2613         }
2614         if (inversion.data[0] != 0)
2615             genericReplace(inversion.data, 0, 0, [0]);
2616         else
2617             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2618         if (data[data.length-1] != lastDchar+1)
2619             genericReplace(inversion.data,
2620                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2621         else
2622             genericReplace(inversion.data,
2623                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2624 
2625         return inversion;
2626     }
2627 
2628     ///
2629     @safe unittest
2630     {
2631         auto set = unicode.ASCII;
2632         // union with the inverse gets all of the code points in the Unicode
2633         assert((set | set.inverted).length == 0x110000);
2634         // no intersection with the inverse
2635         assert((set & set.inverted).empty);
2636     }
2637 
2638     /**
2639         Generates string with D source code of unary function with name of
2640         $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
2641         the code is adjusted to be a lambda function.
2642 
2643         The function generated tests if the $(CODEPOINT) passed
2644         belongs to this set or not. The result is to be used with string mixin.
2645         The intended usage area is aggressive optimization via meta programming
2646         in parser generators and the like.
2647 
2648         Note: Use with care for relatively small or regular sets. It
2649         could end up being slower than just using multi-staged tables.
2650 
2651         Example:
2652         ---
2653         import std.stdio;
2654 
2655         // construct set directly from [a, b$RPAREN intervals
2656         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2657         writeln(set);
2658         writeln(set.toSourceCode("func"));
2659         ---
2660 
2661         The above outputs something along the lines of:
2662         ---
2663         bool func(dchar ch)  @safe pure nothrow @nogc
2664         {
2665             if (ch < 45)
2666             {
2667                 if (ch == 10 || ch == 11) return true;
2668                 return false;
2669             }
2670             else if (ch < 65) return true;
2671             else
2672             {
2673                 if (ch < 100) return false;
2674                 if (ch < 200) return true;
2675                 return false;
2676             }
2677         }
2678         ---
2679     */
2680     string toSourceCode(string funcName="")
2681     {
2682         import std.algorithm.searching : countUntil;
2683         import std.array : array;
2684         import std.format : format;
2685         enum maxBinary = 3;
linearScope(R)2686         static string linearScope(R)(R ivals, string indent)
2687         {
2688             string result = indent~"{\n";
2689             string deeper = indent~"    ";
2690             foreach (ival; ivals)
2691             {
2692                 immutable span = ival[1] - ival[0];
2693                 assert(span != 0);
2694                 if (span == 1)
2695                 {
2696                     result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
2697                 }
2698                 else if (span == 2)
2699                 {
2700                     result ~= format("%sif (ch == %s || ch == %s) return true;\n",
2701                         deeper, ival[0], ival[0]+1);
2702                 }
2703                 else
2704                 {
2705                     if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
2706                         result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
2707                     result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
2708                 }
2709             }
2710             result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
2711             return result;
2712         }
2713 
binaryScope(R)2714         static string binaryScope(R)(R ivals, string indent)
2715         {
2716             // time to do unrolled comparisons?
2717             if (ivals.length < maxBinary)
2718                 return linearScope(ivals, indent);
2719             else
2720                 return bisect(ivals, ivals.length/2, indent);
2721         }
2722 
2723         // not used yet if/elsebinary search is far better with DMD  as of 2.061
2724         // and GDC is doing fine job either way
switchScope(R)2725         static string switchScope(R)(R ivals, string indent)
2726         {
2727             string result = indent~"switch (ch){\n";
2728             string deeper = indent~"    ";
2729             foreach (ival; ivals)
2730             {
2731                 if (ival[0]+1 == ival[1])
2732                 {
2733                     result ~= format("%scase %s: return true;\n",
2734                         deeper, ival[0]);
2735                 }
2736                 else
2737                 {
2738                     result ~= format("%scase %s: .. case %s: return true;\n",
2739                          deeper, ival[0], ival[1]-1);
2740                 }
2741             }
2742             result ~= deeper~"default: return false;\n"~indent~"}\n";
2743             return result;
2744         }
2745 
bisect(R)2746         static string bisect(R)(R range, size_t idx, string indent)
2747         {
2748             string deeper = indent ~ "    ";
2749             // bisect on one [a, b) interval at idx
2750             string result = indent~"{\n";
2751             // less branch, < a
2752             result ~= format("%sif (ch < %s)\n%s",
2753                 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
2754             // middle point,  >= a && < b
2755             result ~= format("%selse if (ch < %s) return true;\n",
2756                 deeper, range[idx][1]);
2757             // greater or equal branch,  >= b
2758             result ~= format("%selse\n%s",
2759                 deeper, binaryScope(range[idx+1..$], deeper));
2760             return result~indent~"}\n";
2761         }
2762 
2763         string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
2764             funcName.empty ? "function" : funcName);
2765         auto range = byInterval.array();
2766         // special case first bisection to be on ASCII vs beyond
2767         auto tillAscii = countUntil!"a[0] > 0x80"(range);
2768         if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
2769             code ~= binaryScope(range, "");
2770         else
2771             code ~= bisect(range, tillAscii, "");
2772         return code;
2773     }
2774 
2775     /**
2776         True if this set doesn't contain any $(CODEPOINTS).
2777     */
empty()2778     @property bool empty() const
2779     {
2780         return data.length == 0;
2781     }
2782 
2783     ///
2784     @safe unittest
2785     {
2786         CodepointSet emptySet;
2787         assert(emptySet.length == 0);
2788         assert(emptySet.empty);
2789     }
2790 
2791 private:
2792     alias This = typeof(this);
2793     alias Marker = size_t;
2794 
2795     // a random-access range of integral pairs
Intervals(Range)2796     static struct Intervals(Range)
2797     {
2798         this(Range sp)
2799         {
2800             slice = sp;
2801             start = 0;
2802             end = sp.length;
2803         }
2804 
2805         this(Range sp, size_t s, size_t e)
2806         {
2807             slice = sp;
2808             start = s;
2809             end = e;
2810         }
2811 
2812         @property auto front()const
2813         {
2814             immutable a = slice[start];
2815             immutable b = slice[start+1];
2816             return CodepointInterval(a, b);
2817         }
2818 
2819         //may break sorted property - but we need std.sort to access it
2820         //hence package protection attribute
2821         package @property void front(CodepointInterval val)
2822         {
2823             slice[start] = val.a;
2824             slice[start+1] = val.b;
2825         }
2826 
2827         @property auto back()const
2828         {
2829             immutable a = slice[end-2];
2830             immutable b = slice[end-1];
2831             return CodepointInterval(a, b);
2832         }
2833 
2834         //ditto about package
2835         package @property void back(CodepointInterval val)
2836         {
2837             slice[end-2] = val.a;
2838             slice[end-1] = val.b;
2839         }
2840 
2841         void popFront()
2842         {
2843             start += 2;
2844         }
2845 
2846         void popBack()
2847         {
2848             end -= 2;
2849         }
2850 
2851         auto opIndex(size_t idx) const
2852         {
2853             immutable a = slice[start+idx*2];
2854             immutable b = slice[start+idx*2+1];
2855             return CodepointInterval(a, b);
2856         }
2857 
2858         //ditto about package
2859         package void opIndexAssign(CodepointInterval val, size_t idx)
2860         {
2861             slice[start+idx*2] = val.a;
2862             slice[start+idx*2+1] = val.b;
2863         }
2864 
2865         auto opSlice(size_t s, size_t e)
2866         {
2867             return Intervals(slice, s*2+start, e*2+start);
2868         }
2869 
2870         @property size_t length()const {  return slice.length/2; }
2871 
2872         @property bool empty()const { return start == end; }
2873 
2874         @property auto save(){ return this; }
2875     private:
2876         size_t start, end;
2877         Range slice;
2878     }
2879 
2880     // called after construction from intervals
2881     // to make sure invariants hold
sanitize()2882     void sanitize()
2883     {
2884         import std.algorithm.comparison : max;
2885         import std.algorithm.mutation : SwapStrategy;
2886         import std.algorithm.sorting : sort;
2887         if (data.length == 0)
2888             return;
2889         alias Ival = CodepointInterval;
2890         //intervals wrapper for a _range_ over packed array
2891         auto ivals = Intervals!(typeof(data[]))(data[]);
2892         //@@@BUG@@@ can't use "a.a < b.a" see issue 12265
2893         sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
2894         // what follows is a variation on stable remove
2895         // differences:
2896         // - predicate is binary, and is tested against
2897         //   the last kept element (at 'i').
2898         // - predicate mutates lhs (merges rhs into lhs)
2899         size_t len = ivals.length;
2900         size_t i = 0;
2901         size_t j = 1;
2902         while (j < len)
2903         {
2904             if (ivals[i].b >= ivals[j].a)
2905             {
2906                 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
2907                 j++;
2908             }
2909             else //unmergable
2910             {
2911                 // check if there is a hole after merges
2912                 // (in the best case we do 0 writes to ivals)
2913                 if (j != i+1)
2914                     ivals[i+1] = ivals[j]; //copy over
2915                 i++;
2916                 j++;
2917             }
2918         }
2919         len = i + 1;
2920         for (size_t k=0; k + 1 < len; k++)
2921         {
2922             assert(ivals[k].a < ivals[k].b);
2923             assert(ivals[k].b < ivals[k+1].a);
2924         }
2925         data.length = len * 2;
2926     }
2927 
2928     // special case for normal InversionList
subChar(dchar ch)2929     ref subChar(dchar ch)
2930     {
2931         auto mark = skipUpTo(ch);
2932         if (mark != data.length
2933             && data[mark] == ch && data[mark-1] == ch)
2934         {
2935             // it has split, meaning that ch happens to be in one of intervals
2936             data[mark] = data[mark]+1;
2937         }
2938         return this;
2939     }
2940 
2941     //
2942     Marker addInterval(int a, int b, Marker hint=Marker.init)
2943     in
2944     {
2945         assert(a <= b);
2946     }
2947     body
2948     {
2949         import std.range : assumeSorted, SearchPolicy;
2950         auto range = assumeSorted(data[]);
2951         size_t pos;
2952         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2953         if (a_idx == range.length)
2954         {
2955             //  [---+++----++++----++++++]
2956             //  [                         a  b]
2957             data.append(a, b);
2958             return data.length-1;
2959         }
2960         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2961         uint[3] buf = void;
2962         uint to_insert;
debug(std_uni)2963         debug(std_uni)
2964         {
2965             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2966         }
2967         if (b_idx == range.length)
2968         {
2969             //  [-------++++++++----++++++-]
2970             //  [      s     a                 b]
2971             if (a_idx & 1)// a in positive
2972             {
2973                 buf[0] = b;
2974                 to_insert = 1;
2975             }
2976             else// a in negative
2977             {
2978                 buf[0] = a;
2979                 buf[1] = b;
2980                 to_insert = 2;
2981             }
2982             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2983             return pos - 1;
2984         }
2985 
2986         uint top = data[b_idx];
2987 
debug(std_uni)2988         debug(std_uni)
2989         {
2990             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2991             writefln("a=%s; b=%s; top=%s;", a, b, top);
2992         }
2993         if (a_idx & 1)
2994         {// a in positive
2995             if (b_idx & 1)// b in positive
2996             {
2997                 //  [-------++++++++----++++++-]
2998                 //  [       s    a        b    ]
2999                 buf[0] = top;
3000                 to_insert = 1;
3001             }
3002             else // b in negative
3003             {
3004                 //  [-------++++++++----++++++-]
3005                 //  [       s    a   b         ]
3006                 if (top == b)
3007                 {
3008                     assert(b_idx+1 < data.length);
3009                     buf[0] = data[b_idx+1];
3010                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3011                     return pos - 1;
3012                 }
3013                 buf[0] = b;
3014                 buf[1] = top;
3015                 to_insert = 2;
3016             }
3017         }
3018         else
3019         { // a in negative
3020             if (b_idx & 1) // b in positive
3021             {
3022                 //  [----------+++++----++++++-]
3023                 //  [     a     b              ]
3024                 buf[0] = a;
3025                 buf[1] = top;
3026                 to_insert = 2;
3027             }
3028             else// b in negative
3029             {
3030                 //  [----------+++++----++++++-]
3031                 //  [  a       s      b        ]
3032                 if (top == b)
3033                 {
3034                     assert(b_idx+1 < data.length);
3035                     buf[0] = a;
3036                     buf[1] = data[b_idx+1];
3037                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3038                     return pos - 1;
3039                 }
3040                 buf[0] = a;
3041                 buf[1] = b;
3042                 buf[2] = top;
3043                 to_insert = 3;
3044             }
3045         }
3046         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
debug(std_uni)3047         debug(std_uni)
3048         {
3049             writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3050             writeln("inserting ", buf[0 .. to_insert]);
3051         }
3052         return pos - 1;
3053     }
3054 
3055     //
3056     Marker dropUpTo(uint a, Marker pos=Marker.init)
3057     in
3058     {
3059         assert(pos % 2 == 0); // at start of interval
3060     }
3061     body
3062     {
3063         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3064         if (range.empty)
3065             return pos;
3066         size_t idx = pos;
3067         idx += range.lowerBound(a).length;
3068 
debug(std_uni)3069         debug(std_uni)
3070         {
3071             writeln("dropUpTo full length=", data.length);
3072             writeln(pos,"~~~", idx);
3073         }
3074         if (idx == data.length)
3075             return genericReplace(data, pos, idx, cast(uint[])[]);
3076         if (idx & 1)
3077         {   // a in positive
3078             //[--+++----++++++----+++++++------...]
3079             //      |<---si       s  a  t
3080             genericReplace(data, pos, idx, [a]);
3081         }
3082         else
3083         {   // a in negative
3084             //[--+++----++++++----+++++++-------+++...]
3085             //      |<---si              s  a  t
3086             genericReplace(data, pos, idx, cast(uint[])[]);
3087         }
3088         return pos;
3089     }
3090 
3091     //
3092     Marker skipUpTo(uint a, Marker pos=Marker.init)
out(result)3093     out(result)
3094     {
3095         assert(result % 2 == 0);// always start of interval
3096         //(may be  0-width after-split)
3097     }
3098     body
3099     {
3100         assert(data.length % 2 == 0);
3101         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3102         size_t idx = pos+range.lowerBound(a).length;
3103 
3104         if (idx >= data.length) // could have Marker point to recently removed stuff
3105             return data.length;
3106 
3107         if (idx & 1)// inside of interval, check for split
3108         {
3109 
3110             immutable top = data[idx];
3111             if (top == a)// no need to split, it's end
3112                 return idx+1;
3113             immutable start = data[idx-1];
3114             if (a == start)
3115                 return idx-1;
3116             // split it up
3117             genericReplace(data, idx, idx+1, [a, a, top]);
3118             return idx+1;        // avoid odd index
3119         }
3120         return idx;
3121     }
3122 
3123     CowArray!SP data;
3124 }
3125 
@system unittest
{
    import std.conv : to;
    // ASCII is the single interval [0, 128)
    auto ascii = unicode.ASCII;
    assert(ascii.to!string() == "[0..128)");
}
3131 
3132 // pedantic version for ctfe, and aligned-access only architectures
// Reads the idx-th 24-bit value from a packed byte buffer one byte at a
// time, so no access wider than a byte is ever made.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable size_t offset = idx * 3;
    immutable uint first = ptr[offset];
    immutable uint second = ptr[offset+1];
    immutable uint third = ptr[offset+2];
    version (LittleEndian)
        return first + (second << 8) + (third << 16);
    else
        return (first << 16) + (second << 8) + third;
}
3143 
3144 // ditto
// Stores the low 24 bits of `val` as the idx-th value of a packed byte
// buffer, one byte at a time.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable size_t offset = idx * 3;
    immutable ubyte low = cast(ubyte) val;
    immutable ubyte mid = cast(ubyte)(val >> 8);
    immutable ubyte high = cast(ubyte)(val >> 16);
    version (LittleEndian)
    {
        ptr[offset] = low;
        ptr[offset+1] = mid;
        ptr[offset+2] = high;
    }
    else
    {
        ptr[offset] = high;
        ptr[offset+1] = mid;
        ptr[offset+2] = low;
    }
}
3161 
3162 // unaligned x86-like read/write functions
// Reads the idx-th 24-bit value with a single (possibly unaligned)
// 32-bit load and discards the extra byte.
// Note: the load touches 4 bytes starting at ptr + 3*idx.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable uint word = *cast(uint*)(ptr + idx*3);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3171 
3172 // ditto
// Stores the low 24 bits of `val` as the idx-th value of a packed byte
// buffer using one (possibly unaligned) 32-bit read-modify-write.
// The 4th byte covered by the access belongs to the next element and is
// written back unchanged.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask val so that bits above 24 cannot leak into the neighbouring
        // byte; this matches safeWrite24 and the big-endian branch, which
        // both discard those bits
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
        *dest = (val << 8) | (*dest & 0xFF);
}
3181 
// Reads the idx-th packed 24-bit value: the unaligned 32-bit path is taken
// only at run time when hasUnalignedReads is set, the byte-wise one otherwise.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3189 
// Writes the idx-th packed 24-bit value: the unaligned 32-bit path is taken
// only at run time when hasUnalignedReads is set, the byte-wise one otherwise.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3197 
// Array of uint whose storage carries one extra trailing slot used as a
// reference count (see refCount below). Copies bump that count in the
// postblit and mutating accessors duplicate the storage first whenever
// the count is not 1. All allocation goes through the policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` without copying its elements; SP.append adds the
    // ref-count slot, which the asserts below expect to read as 1.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range of known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        immutable len = range.length;
        length = len;
        // slice by len rather than $-1: for an empty range no storage is
        // allocated at all and $-1 would underflow
        copy(range, data[0 .. len]);
    }

    // Builds the array from a forward range by measuring it first.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // same reasoning as above for the empty range
        copy(range, data[0 .. len]);
    }

    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size (the last slot is the ref-count)
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Writes duplicate the storage first when the ref-count is not 1.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // A mutable slice can be written through, so the storage is duplicated
    // first when the ref-count is not 1.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    //
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // only hasLength is imported at struct scope, so copy has to be
        // brought in here like every other user of it in this struct does
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // nothing to add; also keeps the slice below from underflowing
        // when the array has no storage yet
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Detaches this instance from its storage, destroying the storage
    // when the ref-count is 1.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Gives this instance its own copy of storage that has other references.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    body
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // elements followed by one trailing ref-count slot; empty when unallocated
    uint[] data;
}
3410 
@safe unittest// CowArray tests (the u24 names below refer to CowArray instances)
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates an array received by reference while a copy (u24_c) is alive,
    // checking that the copy keeps its own contents throughout.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Exercises copy construction, assignment and opEquals on an array
    // received by value; funcRef rewrites one of the copies in between.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    // run the same checks for both memory policies
    foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 takes its argument by value, so arr must be unchanged
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // construction from a range, then overwriting a sub-slice
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }
}
3497 
// Only declared when unit tests are compiled: the InversionList
// instantiations (one per memory policy) that the set tests iterate over.
version (unittest)
{
    private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
}
3502 
@safe unittest// core set primitives test
{
    import std.conv : text;
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // skipUpTo returns a marker that dropUpTo takes as its second argument
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3577 
3578 
//test constructor to work with any order of intervals
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // unordered, overlapping and adjacent intervals mixed together
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3622 
3623 
@safe unittest
{   // full set operations
    import std.conv : text;
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        // union (|), checked in both operand orders
        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // intersection (&), checked in both operand orders
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // difference (-)
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // the ~ operator: elements that are in exactly one of the operands
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3726 
3727 }
3728 
@safe unittest// vs single dchar
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    // 'A' is code point 65: subtracting it punches a one-element hole
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    // 'B' is code point 66: intersecting leaves just that element
    assert((a & 'B') == CodepointSet(66, 67));
}
3736 
@safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in issue 8949 ?
        // (this part is only compiled when the bug8949 version identifier is set)
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        // byCodepoint must enumerate exactly the characters of arr, in order
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through byInterval, only when CodeList is CodepointSet itself
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }
}
3776 
3777 //============================================================================
3778 // Generic Trie template and various ways to build it
3779 //============================================================================
3780 
// Debug helper: renders x as text. Anything longer than 32 elements is
// abbreviated to its first 16 and last 16 elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    immutable total = x.length;
    if (total <= 32)
        return text(x);
    return text(x[0 .. 16],"~...~", x[total-16 .. total]);
}
3792 
/**
    Computes a flat $(D size_t) index for a $(D Key) by running every predicate
    of $(D Prefix) on the key, left to right, and packing the results
    one after another.

    Bits produced by the first (leftmost) predicate end up as the most
    significant bits of the index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t packed;
        foreach (stage, unused; Prefix[0 .. $-1])
        {
            // merge in this stage's bits, then open room for the following stage
            packed |= Prefix[stage](key);
            packed <<= Prefix[stage+1].bitSize;
        }
        packed |= Prefix[$-1](key);
        return packed;
    }
}
3817 
/*
    $(D TrieBuilder) is a type used for incremental construction
    of $(LREF Trie)s.

    Values are fed in order of growing index (see putValue / putRange);
    gaps are filled with the filler value given to the constructor and
    build() pads the rest of the domain and returns the Trie.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // product of 2^^bitSize over all predicates, i.e. the number of
    // distinct indices the predicates can produce together
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // round the bound up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap-around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes numVals copies of val at the current position of level 'level'
    // and advances that level's index. Whenever a page of this level is
    // completed, spillToNextPage registers it with level-1.
    //
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        // A run that stops short of the page boundary is written in place.
        // Level 0 has nothing above it to spill into (spillToNextPage is a
        // no-op there and the code below is compiled out), so on level 0 a
        // run ending exactly on the boundary is written here as well;
        // otherwise it would be dropped without a trace.
        if (numVals < n || (level == 0 && numVals == n))
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                // an all-zero page is already known for this level: point
                // level-1 at it for every whole page instead of writing pages
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an earlier page with identical contents
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        if (j == last)
        {
            // no duplicate found: the completed page stays and gets its own index
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zero page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where $(D filler) is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        // (size_t.max marks "no such page seen yet")
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value $(D v) into interval as
        mapped by keys from $(D a) to $(D b).
        The slot mapped by $(D b) itself is not written.
        All slots prior to $(D a) are filled with
        the default filler.

        Throws an exception (via $(D enforce)) if the mapped indices
        do not grow monotonically.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value $(D v) into slot mapped by $(D key).
        All slots prior to $(D key) are filled with the
        default filler.

        Throws an exception (via $(D enforce)) if the mapped index
        lies before the slots that were already written.
    */
    void putValue(Key key, Value v)
    {
        import std.conv : text;
        auto idx = getIndex(key);
        enforce(idx >= curIndex, text(errMsg, " ", idx));
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the remainder of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max + 1 slots cannot be expressed as one count,
                // hence the filler is added in two steps
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4113 
/**
    $(P A generic multi-stage lookup table (Trie) whose number of stages
    is fixed at compile time. It is designed for fast lookups
    with a small memory footprint.
    )
    $(P Instances are deliberately read-only and offer no public constructors.
     They are produced by a dedicated builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // A leading argument convertible to size_t is the upper bound on the
    // index; all remaining arguments are the per-stage predicates.
    static if (!is(typeof(Args[0]) : size_t))
    {
        private alias Prefix = Args;
        private enum hasBoundsCheck = false;
    }
    else
    {
        private alias Prefix = Args[1..$];
        private enum hasBoundsCheck = true;
        private enum maxIndex = Args[0];
    }

    // wraps a table that was already filled (this is what the builder uses)
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Looks up $(D key) in this $(D Trie). )

        $(P A lookup always succeeds for a key inside the domain
        given at construction time. The whole domain is populated,
        so a "not found" condition is expressed through
        the sentinel (filler) value. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of $(D Trie) keys and the sentinel value. )

        Note:
        The domain range check is an $(D assert), so it is only active
        in builds that keep assertions, where a key outside the domain
        results in an assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        // stage 0 is indexed by the first predicate directly
        size_t pos = cast(size_t) Prefix[0](key);
        foreach (stage, unused; Prefix[0..$-1])
        {
            // the entry found at this stage picks a page of the next one,
            // the next predicate picks the slot inside that page
            pos = cast(size_t)((_table.ptr!stage[pos]<<Prefix[stage+1].bitSize)
                + Prefix[stage+1](key));
        }
        return _table.ptr!(Prefix.length-1)[pos];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    @property size_t pages(size_t n)() const
    {
        enum halfPage = 2^^(Prefix[n].bitSize-1);
        enum wholePage = 2^^Prefix[n].bitSize;
        return (bytes!n+halfPage)/wholePage;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4204 
// Builds a tuple of 'sliceBits' predicates that cut the bits below 'top'
// into consecutive pieces whose widths are given by 'sizes',
// left-to-right, the most significant bits first.
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
    {
        alias GetBitSlicing = AliasSeq!();
    }
    else
    {
        // lower edge of the first (most significant) piece
        enum size_t bottom = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(bottom, top), GetBitSlicing!(bottom, sizes[1 .. $]));
    }
}
4216 
// callableWith!T yields a one-argument template that tests whether a predicate
// can be called with a value of type T and returns something bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        // 'Result' is only declared when the call expression type-checks
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false;
    }
}
4230 
/*
    Check if $(D Prefix) is a valid set of predicates
    for $(D Trie) template having $(D Key) as the type of keys.
    Every predicate has to be callable with a single argument
    of type $(D Key) and return a bit-packable value
    (see $(D callableWith)).
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    alias acceptsKey = callableWith!Key;
    // TODO: tighten the screws
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix);
}
4242 
/*
    Check if $(D Args) is a set of maximum key value followed by valid predicates
    for $(D Trie) template having $(D Key) as the type of keys.

    Accepted shapes:
    $(UL
        $(LI every element of $(D Args) is a valid predicate for $(D Key);)
        $(LI $(D Args[0]) is a value convertible to $(D Key) and the rest
             are valid predicates for $(D Key).)
    )
    An empty $(D Args) is never valid.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // Key must be passed explicitly: without it the lone predicate would be
        // bound as `Key`, leaving an empty prefix that is vacuously "valid".
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false;
}
4257 
// Adds up all elements of the compile-time integer sequence `ints`
// (the sum of an empty sequence is 0).
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total;
    foreach (i, _; ints)
        total += ints[i];
    return total;
}
4265 
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a $(D CodepointSet). $(D sizes) are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of $(D sizes) must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        alias Levels = GetBitSlicing!(21, sizes);
        // everything is `false` until an interval of the set marks it `true`
        auto trieBuilder = TrieBuilder!(bool, dchar, lastDchar+1, Levels)(false);
        foreach (interval; set.byInterval)
        {
            trieBuilder.putRange(interval[0], interval[1], true);
        }
        return trieBuilder.build();
    }
}
4305 
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
4313 
/**
    A slightly more general tool for building fixed $(D Trie)
    for the Unicode data.

    Specifically, unlike $(D codepointSetTrie) it allows creating mappings
    of $(D dchar) to an arbitrary type $(D T).

    Note: Overload taking $(D CodepointSet)s will naturally convert
    only to bool mapping $(D Trie)s.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        // Set overload: only available for boolean-valued tries.
        auto codepointTrie(Set)(in Set set)
            if (isCodepointSet!Set)
        {
            // codepointSetTrie is parameterized on the level sizes, which cannot
            // be deduced from the function argument - forward them explicitly.
            // NOTE(review): `set` is const here while codepointSetTrie takes its
            // argument by value; confirm its constraint accepts the const type.
            return codepointSetTrie!sizes(set);
        }
    }

    // Builds the trie from an associative array; keys that are absent
    // from the map yield defValue.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4354 
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // the sample domain: all code points of the Greek script
    auto greek = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // a character is considered lucky when many of the six hex digits
        // of its code point are identical; the result is the highest
        // number of occurrences of any single digit
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint rest = ch;
        foreach (ref nib; nibbles)
        {
            nib = rest & 0xF;
            rest >>= 4;
        }
        uint best;
        foreach (nib; nibbles)
            best = cast(uint) max(best, count(nibbles[], nib));
        return best;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // collect the values in a temporary associative array (AA)
    LuckFactor[dchar] luckByChar;
    foreach (ch; greek.byCodepoint)
        luckByChar[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(luckByChar);

    // from now on the AA is not needed
    foreach (ch; greek.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);
}
4406 
/// Type of Trie as generated by codepointTrie function.
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
4414 
// "Less than" for (value, key) tuples: compares only the keys (field 1),
// after mapping each of them through Pred.
package template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)(Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
4424 
/**
    The most general utility for construction of $(D Trie)s
    short of using $(D TrieBuilder) directly.

    Provides a number of convenience overloads.
    $(D Args) is tuple of maximum key value followed by
    predicates to construct index from key.

    Alternatively if the first argument is not a value convertible to $(D Key)
    then the whole tuple of $(D Args) is treated as predicates
    and the maximum Key is deduced from predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    // a leading argument convertible to Key is the upper bound on keys
    // and is not part of the predicates
    static if (is(typeof(Args[0]) : Key))
        alias Prefix = Args[1 .. $];
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // for multi-sort: comparators for the first n predicates, in predicate order
    template GetComparators(size_t n)
    {
        static if (n == 0)
            alias GetComparators = AliasSeq!();
        else
            alias GetComparators =
                AliasSeq!(GetComparators!(n - 1), cmpK0!(Prefix[n - 1]));
    }

    /*
        Build $(D Trie) from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps $(D Key) to an integer.

        Each element carries the value at index 0 and the key at index 1.

        See_Also: $(REF sort, std,_algorithm),
        $(REF SortedRange, std,_range),
        $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (pair; range)
        {
            trieBuilder.putValue(pair[1], pair[0]);
        }
        return trieBuilder.build();
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) from a range of open-right intervals of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !$(D filler) i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (interval; range)
        {
            trieBuilder.putRange(interval[0], interval[1], !filler);
        }
        return trieBuilder.build();
    }

    /*
        Same as the Key-Value range overload, except that the range is
        first multi-sorted with the per-predicate comparators
        when $(D unsorted) is true.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
        {
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) simply from an input range of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !$(D filler) i.e. the opposite of filler.
        If no filler provided keys map to true, and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key; range)
        {
            trieBuilder.putValue(key, !filler);
        }
        return trieBuilder.build();
    }

    /*
        If $(D Key) is unsigned integer $(D Trie) could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (position, item; array)
        {
            trieBuilder.putValue(position, item);
        }
        return trieBuilder.build();
    }

    /*
        Builds $(D Trie) from associative array:
        the entries are copied into an array of (value, key) pairs
        which is then handed to the sorting overload.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto pairs = array(zip(map.values, map.keys));
        return buildTrie(pairs, filler, true); // sort it
    }
}
4560 
// helper in place of assumeSize to
// reduce mangled name & help DMD inline Trie functors:
// hands the argument back as size_t and advertises a width of `bits` bits
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4568 
// Like clamp, but the functor picks element `idx` of an indexable argument.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4574 
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Performs the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at the front of $(D inp) and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on $(D inp) depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range $(D inp)
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }
    ///
    @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provides direct access to a subset of the matcher
        based on a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any $(D test)/$(D match).

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) whose encoded length doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using a sub-matcher is that skip is not available,
        precisely because the sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(false);
        return this;
    }

    @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII is not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep a reference, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in the basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4676 
/**
    Test if $(D M) is an UTF Matcher for ranges of $(D C).
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] units;
    auto dec = units.decoder;
    M matcher;
    // match and test must work on both the decoder and the plain array
    assert(is(typeof(matcher.match(dec)) == bool));
    assert(is(typeof(matcher.test(dec)) == bool));
    assert(is(typeof(matcher.match(units)) == bool));
    assert(is(typeof(matcher.test(units)) == bool));
    // skip is optional
    static if (is(typeof(matcher.skip(dec))))
    {
        assert(is(typeof(matcher.skip(dec)) == bool));
        assert(is(typeof(matcher.skip(units)) == bool));
    }
});
4694 
@safe unittest
{
    // matcher types for UTF-8 and UTF-16 built from an empty set
    alias Matcher8 = typeof(utfMatcher!char(CodepointSet.init));
    alias Matcher16 = typeof(utfMatcher!wchar(CodepointSet.init));

    // both mutable and immutable code units have to be accepted
    static assert(isUtfMatcher!(Matcher8, char));
    static assert(isUtfMatcher!(Matcher8, immutable(char)));
    static assert(isUtfMatcher!(Matcher16, wchar));
    static assert(isUtfMatcher!(Matcher16, immutable(wchar)));
}
4704 
// How a matcher lookup treats its input range.
enum Mode
{
    alwaysSkip,  // used by `skip`
    neverSkip,   // used by `test`
    skipOnMatch  // used by `match`
}
4710 
// Mixes in `fwdStr`, which calls the member named by `fn` with the array
// reinterpreted in place as the type returned by byCodeUnit.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const pure
    {
        import std.utf : byCodeUnit;
        alias Wrapped = typeof(byCodeUnit(str));
        // view the very same storage through the wrapper type, so whatever
        // the callee does to its argument is applied to `str` itself
        auto view = cast(Wrapped*) &str;
        return mixin(fn ~ "(*view)");
    }
}
4720 
Utf8Matcher()4721 template Utf8Matcher()
4722 {
// a UTF-8 sequence is 1 to 4 code units long
enum validSize(int sz) = 1 <= sz && sz <= 4;
4724 
// Always throws a UTFException that reports a malformed UTF-8 sequence.
void badEncoding() pure @safe
{
    import std.utf : UTFException;
    enum message = "Invalid UTF-8 sequence";
    throw new UTFException(message);
}
4730 
// Trie specifications (value type, key type, per-level predicates),
// one for every possible length of a UTF-8 sequence.
//for 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
//for 2-stage lookup of 2 byte UTF-8 sequences
alias Utf8Spec2 = AliasSeq!(bool, char[2],
    clampIdx!(0, 5),
    clampIdx!(1, 6)
);
//ditto for 3 byte
alias Utf8Spec3 = AliasSeq!(bool, char[3],
    clampIdx!(0, 4),
    clampIdx!(1, 6),
    clampIdx!(2, 6)
);
//ditto for 4 byte
alias Utf8Spec4 = AliasSeq!(bool, char[4],
    clampIdx!(0, 3),
    clampIdx!(1, 6),
    clampIdx!(2, 6),
    clampIdx!(3, 6)
);
// the Trie types built from the specifications above, ordered by length
alias Tables = AliasSeq!(
    typeof(TrieBuilder!(AsciiSpec)(false).build()),
    typeof(TrieBuilder!(Utf8Spec2)(false).build()),
    typeof(TrieBuilder!(Utf8Spec3)(false).build()),
    typeof(TrieBuilder!(Utf8Spec4)(false).build())
);
// maps a sequence length (1 .. 4) to its table type
alias Table(int size) = Tables[size-1];
4754 
// payload bits of the lead byte of a `size`-unit sequence:
// 0x1F, 0x0F and 0x07 for sizes 2, 3 and 4
enum leadMask(size_t size) = (cast(size_t) 1 << (7 - size)) - 1;
// `size` leading one-bits in the top of a byte:
// 0xC0, 0xE0 and 0xF0 for sizes 2, 3 and 4
enum encMask(size_t size) = ((1 << size) - 1) << (8 - size);
4757 
// Returns `ch - 0x80` for code units in 0x80 .. 0xBF;
// any other code unit is reported through badEncoding.
char truncate()(char ch) pure @safe
{
    ch -= 0x80;
    if (ch >= 0x40)
    {
        badEncoding();
        return cast(char) 0;
    }
    return ch;
}
4771 
// Encodes `ch` as UTF-8 and strips the marker bits: the first unit is
// masked with leadMask!sz, the others keep their 6 lower bits.
// Returns the first `sz` units.
static auto encode(size_t sz)(dchar ch)
if (sz > 1)
{
    import std.utf : encodeUTF = encode;
    char[4] units;
    encodeUTF(units, ch);
    units[0] &= leadMask!sz;
    foreach (i; 1 .. sz)
        units[i] &= 0x3f; //keep 6 lower bits
    char[sz] payload;
    payload[] = units[0 .. sz];
    return payload;
}
4785 
// Splits `set` by code point range (one per UTF-8 sequence length),
// builds one Trie for each part and bundles them into a full matcher.
auto build(Set)(Set set)
{
    import std.algorithm.iteration : map;
    // partition by code point range
    auto part1 = set & unicode.ASCII;
    auto part2 = set & CodepointSet(0x80, 0x800);
    auto part3 = set & CodepointSet(0x800, 0x1_0000);
    auto part4 = set & CodepointSet(0x1_0000, lastDchar+1);
    // keys of the multi-unit tries are the stripped code units, see encode
    auto trie1 = part1.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
    auto trie2 = part2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
    auto trie3 = part3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
    auto trie4 = part4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
    return Impl!(1,2,3,4)(trie1, trie2, trie3, trie4);
}
4800 
// Bootstrap UTF-8 static matcher interface
// from 3 primitives: tab!(size), lookup and Sizes
mixin template DefMatcher()
{
    import std.format : format;
    import std.meta : Erase, staticIndexOf;

    // whether single-unit (ASCII) sequences are among the handled lengths
    enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
    // the handled multi-unit lengths
    alias UniSizes = Erase!(1, Sizes);

    // generate dispatch code sequence for unicode parts:
    // an if/else chain on the lead byte with one branch per multi-unit length
    static auto genDispatch()
    {
        string src;
        foreach (len; UniSizes)
        {
            src ~= format(q{
                if ((ch & ~leadMask!%d) == encMask!(%d))
                    return lookup!(%d, mode)(inp);
                else
            }, len, len, len);
        }
        static if (Sizes.length == 4) //covers all code unit cases
            src ~= "{ badEncoding(); return false; }";
        else
            src ~= "return false;"; //may be just fine but not covered
        return src;
    }
    enum dispatch = genDispatch();

    // Tests the sequence at the front of `inp`, consuming it only on success.
    public bool match(Range)(ref Range inp) const pure
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        enum mode = Mode.skipOnMatch;
        assert(!inp.empty);
        immutable ch = inp[0];
        static if (hasASCII)
        {
            if (ch < 0x80)
            {
                immutable found = tab!1[ch];
                if (found)
                    inp.popFront();
                return found;
            }
        }
        // not handled above: decide by the lead byte
        mixin(dispatch);
    }

    static if (Sizes.length == 4) // can skip iff can detect all encodings
    {
        // Tests the sequence at the front of `inp` and consumes it regardless.
        public bool skip(Range)(ref Range inp) const pure @trusted
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
        {
            enum mode = Mode.alwaysSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    inp.popFront();
                    return tab!1[ch];
                }
            }
            mixin(dispatch);
        }
    }

    // Tests the sequence at the front of `inp` without consuming anything.
    public bool test(Range)(ref Range inp) const pure @trusted
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        enum mode = Mode.neverSkip;
        assert(!inp.empty);
        auto ch = inp[0];
        static if (hasASCII)
        {
            if (ch < 0x80)
                return tab!1[ch];
        }
        mixin(dispatch);
    }

    // Array overloads: forwarded to the range versions through fwdStr.
    bool match(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"match"(str);
    }

    bool skip(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"skip"(str);
    }

    bool test(C)(ref C[] str) const pure @trusted
    if (isSomeChar!C)
    {
        return fwdStr!"test"(str);
    }

    mixin ForwardStrings;
}
4910 
// Matcher over the tables for the sequence lengths listed in Sizes.
struct Impl(Sizes...)
{
    import std.meta : allSatisfy, staticMap;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
    //pick tables for chosen sizes
    alias OurTabs = staticMap!(Table, Sizes);
    OurTabs tables;
    mixin DefMatcher;
    //static dispatch helper UTF size ==> table
    alias tab(int i) = tables[i - 1];

    // A view of this matcher limited to SizesToPick; it holds a pointer to this object.
    package @property auto subMatcher(SizesToPick...)() @trusted
    {
        return CherryPick!(Impl, SizesToPick)(&this);
    }

    // Validates the `size`-unit sequence at the front of `inp`, looks it up
    // in the matching table and advances `inp` as dictated by `mode`.
    bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted
    {
        import std.typecons : staticIota;
        if (inp.length < size)
        {
            badEncoding();
            return false;
        }
        // the key: code units with their marker bits removed
        char[size] needle = void;
        needle[0] = leadMask!size & inp[0];
        foreach (i; staticIota!(1, size))
        {
            needle[i] = truncate(inp[i]);
        }
        //overlong encoding checks
        static if (size == 2)
        {
            //0x80-0x7FF
            //6 bits come from needle[1] and at least 8 bits are required,
            //so at least 2 bits of needle[0] must be in use
            if (needle[0] < 2)
                badEncoding();
        }
        else static if (size == 3)
        {
            //0x800-0xFFFF
            //6 bits come from needle[2] and at least 12 bits are required:
            //all 6 bits of needle[1] or anything in needle[0]
            if (needle[0] == 0 && needle[1] < 0x20)
                badEncoding();
        }
        else static if (size == 4)
        {
            //0x1_0000 and above
            //2x6=12 bits come from needle[2 .. 4] and at least 17 bits are required:
            //5 bits (or more) of needle[1] or anything in needle[0]
            if (needle[0] == 0 && needle[1] < 0x10)
                badEncoding();
        }
        static if (mode == Mode.alwaysSkip)
        {
            inp.popFrontN(size);
            return tab!size[needle];
        }
        else static if (mode == Mode.neverSkip)
        {
            return tab!size[needle];
        }
        else
        {
            static assert(mode == Mode.skipOnMatch);
            if (!tab!size[needle])
                return false;
            inp.popFrontN(size);
            return true;
        }
    }
}
4987 
// Sub-matcher restricted to Sizes; tables and lookup are taken
// from the matcher of type I that `m` points to.
struct CherryPick(I, Sizes...)
{
    import std.meta : allSatisfy;
    static assert(allSatisfy!(validSize, Sizes),
        "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
    I* m;

    @property ref tab(int i)() const pure
    {
        return m.tables[i - 1];
    }

    bool lookup(int size, Mode mode, Range)(ref Range inp) const pure
    {
        return m.lookup!(size, mode)(inp);
    }

    mixin DefMatcher;
}
5002 }
5003 
Utf16Matcher()5004 template Utf16Matcher()
5005 {
// a UTF-16 sequence is 1 or 2 code units long
enum validSize(int sz) = 1 <= sz && sz <= 2;
5007 
// Always throws a UTFException that reports a malformed UTF-16 sequence.
void badEncoding() pure
{
    import std.utf : UTFException;
    enum message = "Invalid UTF-16 sequence";
    throw new UTFException(message);
}
5013 
// Trie specifications (value type, key type, per-level predicates)
// and the Trie types built from them.
// 1-stage ASCII
alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
//2-stage BMP
alias BmpSpec = AliasSeq!(bool, wchar,
    sliceBits!(7, 16),
    sliceBits!(0, 7)
);
//4-stage - full Unicode
//assume that 0xD800 & 0xDC00 bits are cleared
//thus leaving 10 bit per wchar to worry about
alias UniSpec = AliasSeq!(bool, wchar[2],
    assumeSize!(x=>x[0]>>4, 6),
    assumeSize!(x=>x[0]&0xf, 4),
    assumeSize!(x=>x[1]>>6, 4),
    assumeSize!(x=>x[1]&0x3f, 6),
);
alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5028 
// Splits a supplementary code point (0x1_0000 and above) into the two 10-bit
// payloads of its UTF-16 surrogate pair: element 0 holds the upper 10 bits of
// `ch - 0x1_0000`, element 1 the lower 10 bits.
// The surrogate marker bits (0xD800 / 0xDC00) are deliberately not added.
auto encode2(dchar ch)
{
    ch -= 0x1_0000;
    assert(ch <= 0xF_FFFF);
    wchar[2] ret;
    //do not put surrogate bits, they are sliced off
    ret[0] = cast(wchar)(ch >> 10);
    // exactly 10 bits per code unit: a wider mask would leak bits 10 and 11,
    // which already live in ret[0], into the low unit
    ret[1] = (ch & 0x3FF);
    return ret;
}
5039 
// Splits `set` into ASCII, the rest of the BMP (without 0xD800 .. 0xDFFF)
// and everything else, builds one Trie per part and bundles them into a full matcher.
auto build(Set)(Set set)
{
    import std.algorithm.iteration : map;
    auto asciiPart = set & unicode.ASCII;
    auto bmpPart = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
        - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
    auto rest = set - (bmpPart | asciiPart);
    auto asciiTrie = asciiPart.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
    auto bmpTrie = bmpPart.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
    // keys of the last trie are pairs of code units, see encode2
    auto restTrie = rest.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
    return Impl!(1,2)(asciiTrie, bmpTrie, restTrie);
}
5053 
    //bootstrap full UTF-16 matcher interface from
    //sizeFlags, lookupUni and ascii
    //(the type mixing this in must provide these; ascii only if sizeFlags & 1)
    mixin template DefMatcher()
    {
        // Tests the code point at the front of inp; the input is advanced
        // only when the code point matches (Mode.skipOnMatch).
        public bool match(Range)(ref Range inp) const pure @trusted
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                // ASCII fast path: single table lookup, no surrogate handling
                if (ch < 0x80)
                {
                  if (ascii[ch])
                  {
                      inp.popFront();
                      return true;
                  }
                  else
                      return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        // skip is only generated when both sequence lengths were requested
        static if (Sizes.length == 2)
        {
            // Tests the code point at the front of inp using Mode.alwaysSkip:
            // the ASCII path pops the code unit before returning the result.
            public bool skip(Range)(ref Range inp) const pure @trusted
                if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                immutable ch = inp[0];
                static if (sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        // Tests the code point at the front of inp without advancing
        // the input (Mode.neverSkip).
        public bool test(Range)(ref Range inp) const pure @trusted
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar))
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        // Overloads for built-in character arrays, forwarded through fwdStr
        bool match(C)(ref C[] str) const pure @trusted
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const pure @trusted
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const pure @trusted
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }
5137 
    // Matcher implementation parameterized by the UTF-16 sequence lengths
    // (in code units) it has to handle; owns the lookup tries.
    struct Impl(Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit 0 set - 1-unit sequences handled, bit 1 set - 2-unit sequences handled
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Wraps a pointer to this matcher into a CherryPick for the given sizes.
        package @property auto subMatcher(SizesToPick...)() @trusted
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up the (non-ASCII) code point at the front of inp.
        // Throws via badEncoding on a lone low surrogate, on a high surrogate
        // at the end of input or one not followed by a low surrogate.
        // `mode` controls whether the input is advanced.
        bool lookupUni(Mode mode, Range)(ref Range inp) const pure
        {
            // wraps around for units below 0xD800, so only 0xD800 .. 0xDBFF map to [0, 0x3FF]
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                static if (sizeFlags & 2)
                {
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // only the 10 payload bits of each unit form the trie key
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }
5226 
    // Lightweight view over an existing matcher of type I: holds a pointer to it
    // and forwards ascii / lookupUni to the pointed-to instance.
    // NOTE(review): sizeFlags is taken from I, not computed from Sizes; Sizes is
    // only validated here and used by DefMatcher to decide whether skip exists.
    // Confirm that sub-matchers are meant to share the parent's size flags.
    struct CherryPick(I, Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m;
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property ref ascii()() const pure{ return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const pure
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
5248 }
5249 
// Builds a UTF-8 matcher for `set` (thin wrapper over Utf8Matcher!().build).
private auto utf8Matcher(Set)(Set set) @trusted
{
    return Utf8Matcher!().build(set);
}
5254 
// Builds a UTF-16 matcher for `set` (thin wrapper over Utf16Matcher!().build).
private auto utf16Matcher(Set)(Set set) @trusted
{
    return Utf16Matcher!().build(set);
}
5259 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the $(D set) for encoding
    that has $(D Char) as code unit.

    Params:
        Char = code unit type; a type convertible to $(D char) selects the
               UTF-8 matcher, otherwise a type convertible to $(D wchar)
               selects the UTF-16 matcher. Anything else is rejected
               at compile time.
        set = the set of $(CODEPOINTS) to recognize

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set) @trusted
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // message is concatenated so that it contains no embedded
        // newline/indentation from the source layout
        static assert(false, "UTF-32 needs no decoding, "
            ~ "and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5280 
5281 
//a range of code units, packed with index to speed up forward iteration
//Params:
//  s = the array of code units to iterate over
//  offset = index in `s` of the first element of the range
//Returns: a random-access range whose front is s[offset]
package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units; popBack trims it from the end
        size_t idx; // position of the current front within str
        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // indexing is relative to the current front
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        // slice bounds are relative to the current front: the result keeps the start
        // of str, ends at idx+b and has its front at idx+a
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5307 
// Walks a mixed ASCII / non-ASCII string with the UTF-8 letter matcher and its
// sub-matchers, checking results of test/match/skip and the decoder position.
@safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
    // 'h'
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // 'i'
    assert(!uni.match(codec));
    assert(asc.test(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
    assert(!asc.match(codec));

    // '!' is not a letter; skip still advances
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    // ' ' is not a letter either
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    // seven multi-byte letters follow
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // failed match must leave the position untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5362 
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers and their
// sub-matchers against each other over a sample of letter code points.
@safe unittest
{
    import std.range : stride;
    // Runs test, match and (when the matcher provides it) skip on the same
    // position of r, asserting that they agree; r.idx is restored after match
    // and after skip. Returns the result of test.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r)
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // sub-matcher for 2-code-unit sequences (was a duplicate of `bmp`)
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5411 
// cover decode fail cases of Matcher
@system unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below parse as a NUL character followed by
    // the literal text "x00" / "x80"; `\x00` / `\x80` were presumably intended - confirm.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // every sample must make test throw; the message shows the bytes in hex
        assert(collectException((){
            auto s = msg;
            size_t idx = 0; // NOTE(review): unused local
            utf8.test(s);
        }()), format("%( %2x %)", cast(ubyte[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5442 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any $(D set) of $(CODEPOINTS).

    The parameter $(D level) indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the $(D set) itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // every configuration covers 21 bits in total
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // reached for level 0 as well as for levels above 4
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
5477 
/**
    $(P Builds a $(D Trie) with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P The result is a 'tester' lambda that can be handed to
    algorithms taking unary predicates, such as std.algorithm.find. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // a 3-level trie is very small and is almost as fast
    // as a 2-level one (due to CPU caches?)
    auto lookup = toTrie!3(set);
    return (dchar c) => lookup[c];
}
5495 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter $(D sz) indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T)
    but not vice versa. Users have to ensure the value fits in
    the range required and use the $(D cast)
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    enum bitSize = sz;
    T _value;
    alias _value this;
}
5517 
/*
    Depending on the form of the passed argument $(D bitSizeOf) returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    Resolution order: an explicit `bitSize` member of the argument,
    then the bit size of a BitPacked return type, then the full
    size of the type in bits.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5541 
/**
    Tests if $(D T) is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    // pattern-match T against BitPacked with any underlying type and bit count
    static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
        enum isBitPacked = true;
    else
        enum isBitPacked = false;
}
5553 
/**
    Gives the type $(D U) from $(LREF BitPacked)!(U, x)
    or $(D T) itself for every other type.
*/
template TypeOfBitPacked(T)
{
    // U is bound by the pattern match when T is a BitPacked instantiation
    static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
        alias TypeOfBitPacked = U;
    else
        alias TypeOfBitPacked = T;
}
5565 
/*
    Wrapper, used in definition of custom data structures from $(D Trie) template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within $(D bits) of bits.
    The wrapper does not check or enforce this: opCall just forwards to $(D Fn).
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
5579 
/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with $(D Trie) template.

    sliceBits!(from, to)(x) yields bits [from, to) of x, moved down to bit 0.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // the result must fit in (to - from) bits
        assert(result < (1 << to-from));
    }
    body
    {
        static assert(from < to);
        // no shift needed when the slice starts at bit 0
        static if (from == 0)
            return x & ((1 << to)-1);
        else
        return (x >> from) & ((1<<(to-from))-1);
    }
}
5604 
// Yields the least significant byte of x.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return cast(ubyte) x;
}
// Yields the second least significant byte of x (bits 8 .. 15).
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// 8-bit wide trie level selectors built from the helpers above
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);

// sanity checks: bitSizeOf picks up the declared width of each kind of argument
static assert(bitSizeOf!lo8 == 8);
static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
5613 
// Compile-time sequence of the values start, start+1, ..., end-1;
// empty when start >= end.
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
        alias Sequence = AliasSeq!();
    else
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
}
5621 
//---- TRIE TESTS ----
// Builds tries of 1 to 4 levels over small sets and checks lookups against the sets.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // prints the memory footprint of a trie; a no-op unless
    // compiled with version std_uni_stats
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            foreach (i; staticIota!(0, t.table.dim) )
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                foreach (i; staticIota!(0, t.table.dim-1) )
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // 2-level trie over a set whose 256-wide pages repeat
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // 3-level trie
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // 4-level trie; only membership of set elements is checked here
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // trie keyed by strings, indexed by their first character
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5726 
// Trie level selector for array keys: uses the element at index `idx`
// of the key, declared to be as wide as T (8*T.sizeof bits).
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(in T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5733 
// Trie level selector for array keys: uses the last element of the key,
// declared to be as wide as T (8*T.sizeof bits).
template useLastItem(T)
{
    size_t impl(in T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5739 
// Sum of bitSizeOf over all elements of Prefix; 0 for an empty list.
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
}
5747 
// Computes the sequence of BitPacked index types for all trie levels
// except the last (value) level.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5768 
5769 //============================================================================
5770 
// Three-way comparison of two property names that ignores ASCII case,
// white space, '-' and '_'. The result follows std.algorithm.comparison.cmp:
// negative, zero or positive.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5782 
@safe pure unittest
{
    // names differing only in case and '-' compare equal (result 0)
    assert(!comparePropertyName("foo-bar", "fooBar"));
}
5787 
// "Less than" predicate over property names, based on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    return comparePropertyName(a, b) < 0;
}
5793 
5794 //============================================================================
5795 // Utilities for compression of Unicode code point sets
5796 //============================================================================
5797 
// Appends `val` to `arr` in a variable-length form of 1, 2 or 3 bytes:
//   val < 2^^7  : one byte, top bit clear
//   val < 2^^13 : 0b100 tag in the top 3 bits + 5 high bits, then 1 more byte
//   val < 2^^21 : 0b101 tag in the top 3 bits + 5 high bits, then 2 more bytes
@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 0x80)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < 0x2000)
    {
        arr ~= cast(ubyte)(0x80 | (val >> 8));
        arr ~= cast(ubyte)(val & 0xFF);
        return;
    }
    assert(val < 0x20_0000);
    arr ~= cast(ubyte)(0xA0 | (val >> 16));
    arr ~= cast(ubyte)((val >> 8) & 0xFF);
    arr ~= cast(ubyte)(val & 0xFF);
}
5816 
// Decodes one value written by compressTo, starting at arr[idx],
// and advances idx past the bytes consumed.
// Throws: Exception (via enforce) when the continuation bytes are missing.
@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if (lead < 0x80) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte tells whether 1 or 2 more bytes follow
    immutable size_t tail = (lead & 0x20) ? 2 : 1;
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint result = lead & 0x1F;
    foreach (b; arr[idx .. idx + tail])
        result = (result << 8) | b;
    idx += tail;
    return result;
}
5831 
5832 
// Packs a range of intervals into bytes: each boundary is stored with compressTo
// as the difference from the previously stored boundary. The upper bound of an
// interval that extends to lastDchar+1 is not stored.
package ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint prev = 0;
    foreach (ival; intervals)
    {
        compressTo(ival[0]-prev, packed);
        prev = ival[0];
        // till the end of the domain so don't store it
        if (ival[1] == lastDchar+1)
            continue;
        compressTo(ival[1]-prev, packed);
        prev = ival[1];
    }
    return packed;
}
5851 
// Checks the byte-level encoding of intervals and the compress/decompress round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    // deltas: 80, 47, 1 (one byte each) and 1024 (two bytes)
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    // the last upper bound equals lastDchar+1 and thus is not stored
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5874 
// Creates a range of $(D CodepointInterval) that lazily decodes compressed data.
// `data` is expected to be in the format produced by compressIntervals.
@safe package auto decompressIntervals(const(ubyte)[] data) pure
{
    return DecompressedIntervals(data);
}
5880 
// Forward range that decodes intervals from a compressed byte stream
// one at a time. Emptiness is signalled by _idx == size_t.max.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream; // compressed data
    size_t _idx;            // read position in _stream, size_t.max once exhausted
    CodepointInterval _front; // the most recently decoded interval

    // Stores the stream and decodes the first interval (if there is one).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    // Decodes the next interval; boundaries are stored as differences
    // from the previously decoded boundary.
    void popFront()
    {
        // nothing left to decode: mark the range as empty
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        uint prevEnd = _front[1];
        _front[0] = prevEnd + decompressFrom(_stream, _idx);
        if (_idx != _stream.length)
        {
            uint start = _front[0];
            _front[1] = start + decompressFrom(_stream, _idx);
        }
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save()
    {
        return this;
    }
}
5925 
5926 static assert(isInputRange!DecompressedIntervals);
5927 static assert(isForwardRange!DecompressedIntervals);
5928 //============================================================================
5929 
version(std_uni_bootstrap)5930 version (std_uni_bootstrap){}
5931 else
5932 {
5933 
// helper for looking up code point sets:
// finds the entry of `table` whose name equals `name` under comparePropertyName.
// `table` is treated as sorted by propertyNameLess.
// Returns: index of the entry, or -1 when there is none
@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of names ordered before `name` == position of the candidate
    size_t pos = names.lowerBound(name).length;
    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5946 
// Companion of findUnicodeSet that also loads the set: on a successful
// lookup the entry's compressed data is expanded into `dest` and true is
// returned; otherwise `dest` is left untouched and false is returned.
@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure
{
    immutable found = findUnicodeSet!table(name);
    if (found < 0)
        return false;
    dest = Set(asSet(table[found].compressed));
    return true;
}
5958 
// Loads the set of code points for the property `name` into `target`.
//
// Cumulative general categories (L, LC, M, N, P, S, Z, C), "graphical",
// "any" and "ascii" are assembled by hand; every other name is looked up in
// the generated `uniProps` table.
//
// Returns: true if `name` was recognized and `target` was assigned,
// false otherwise.
@trusted bool loadProperty(Set=CodepointSet, C)
    (in C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        // General_Category C (Other) is the union Cc | Cf | Cs | Co | Cn,
        // not the union of the "*o" (other-letter, other-number, ...) values.
        // NOTE(review): relies on the generated tables exposing Cc/Cf/Cs/Cn.
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6057 
// CTFE-only helper for checking property names at compile-time.
// Returns true if `name` fuzzy-matches one of the hand-assembled property
// names; the list mirrors the names special-cased by the run-time loader
// (including "C"/"Other", so that compile-time and run-time lookups accept
// the same names).
@safe bool isPrettyPropertyName(C)(in C[] name)
{
    import std.algorithm.searching : find;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}
6077 
// ditto, CTFE-only, not optimized:
// tells whether `table` has an entry whose name matches `name`.
@safe private static bool findSetName(alias table, C)(in C[] name)
{
    immutable position = findUnicodeSet!table(name);
    return position >= 0;
}
6083 
// Mixin template providing run-time (opCall) and compile-time (opDispatch)
// lookup of the sets stored in `table`; `kind` is only used to word the
// error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(in C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet set;
        if (!loadUnicodeSet!table(name, set))
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        return set;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        else
        {
            CodepointSet set;
            loadUnicodeSet!table(name, set);
            return set;
        }
    }
}
6111 
/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
    This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        $(D unicode.InBlockName), to search a script
        use $(D unicode.ScriptName).

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where $(D name)
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(in C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply $(D unicode.block.BlockName) notation.

        See_Also: $(S_LINK Unicode properties, table of properties)
        for available sets.
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['؁']);
        assert(arabicScript['؁']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - $(D unicode.propertyName.propertyValue) for compile-time
        checked access and $(D unicode.propertyName(propertyValue))
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time check used by opDispatch: true if `name` is a hand-made
    // property name, a binary property, a script, or "In" + a block name.
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // The length guard mirrors loadAny: without it a name shorter than
        // two characters would be sliced out of bounds instead of simply
        // being reported as not found.
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Loads the set for `name`, trying properties, then scripts, then
    // ("In"-prefixed) blocks. Throws an Exception if nothing matches.
    static auto loadAny(Set=CodepointSet, C)(in C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}
6288 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    // blocks are reached through the "In" prefix; separators are ignored
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
    // "separator" is the union of the three Z* general categories
    auto separators = asSet(uniProps.Zs);
    separators |= asSet(uniProps.Zl);
    separators |= asSet(uniProps.Zp);
    assert(unicode("separator") == separators);
}
6296 
// from what gen_uni uses internally
enum ushort EMPTY_CASE_TRIE = ushort.max;

// control - '\r'
// (case labels meant to be mixed into a switch over a code point)
enum string controlSwitch = `
    case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
    case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
6304 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6305 // kill unrolled switches
6306 
// True if `ch` lies in the inclusive range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar first = '\U0001F1E6';
    enum dchar last = '\U0001F1FF';
    return first <= ch && ch <= last;
}
6311 
// Shared implementation of grapheme cluster decoding.
//
// With getValue == true the decoded code points are collected and returned
// as a Grapheme; with getValue == false the cluster is only skipped and
// nothing is returned. In both cases `range` is advanced past the cluster.
// `range` must not be empty.
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
        enum GraphemeState {
            Start,
            CR,
            RI,
            L,
            V,
            LVT
        }
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // consume the current code point (recording it when a value is wanted)
        enum eat = q{
            static if (getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while (!range.empty)
        {
            ch = range.front;
            final switch (state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if (ch == '\r')
                    state = CR;
                else if (isRegionalIndicator(ch))
                    state = RI;
                else if (isHangL(ch))
                    state = L;
                else if (hangLV[ch] || isHangV(ch))
                    state = V;
                else if (hangLVT[ch])
                    state = LVT;
                else if (isHangT(ch))
                    state = LVT;
                else
                {
                    switch (ch)
                    {
                    mixin(controlSwitch);
                        // controls are never extended
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
            break;
            case CR:
                if (ch == '\n')
                    mixin(eat);
                // CR and CR LF behave like the other controls: the cluster
                // ends here and following extend/spacing marks are not
                // attached to it (UAX #29, rule GB4).
                goto L_End;
            case RI:
                if (isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
            break;
            case L:
                if (isHangL(ch))
                    mixin(eat);
                else if (isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if (hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case V:
                if (isHangV(ch))
                    mixin(eat);
                else if (isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            case LVT:
                if (isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
            break;
            }
        }
    L_End_Extend:
        while (!range.empty)
        {
            ch = range.front;
            // extend & spacing marks
            if (!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if (getValue)
            return grapheme;
    }

}
6435 
6436 public: // Public API continues
6437 
/++
    Computes the length of grapheme cluster starting at $(D index).
    Both the resulting length and the $(D index) are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to $(D dchar)
        input = array of code units to look for a grapheme cluster in
        index = starting index into $(D input[])

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(in C[] input, size_t index)
if (is(C : dchar))
{
    auto rest = input[index .. $];
    immutable before = rest.length;
    // advances `rest` past exactly one cluster without building a Grapheme
    genericDecodeGrapheme!(false)(rest);
    return before - rest.length;
}

///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
6471 
/++
    Reads one full grapheme cluster from an input range of dchar $(D inp).

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies $(D inp) and thus $(D inp)
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(Unqual!(ElementType!Input) == dchar))
{
    auto cluster = genericDecodeGrapheme!(true)(inp);
    return cluster;
}
6486 
@system unittest
{
    import std.algorithm.comparison : equal;

    // a lone space, then a space carrying a combining diaeresis
    string input = " \u0020\u0308 ";
    Grapheme cluster = decodeGrapheme(input);
    assert(cluster.length == 1 && cluster[0] == ' ');
    cluster = decodeGrapheme(input);
    assert(cluster.length == 2 && equal(cluster[0 .. 2], " \u0308"));

    // combining marks first, followed by a Hangul code point
    input = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(input)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(input)[], "\u1100"));

    input = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(input)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(input)[], "\uAC01"));
}
6504 
/++
    $(P Iterate a string by grapheme.)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        private Grapheme _front;

        // an empty cached grapheme doubles as the end-of-range marker
        bool empty() @property
        {
            return _front.length == 0;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto graphemes = Result!(Range)(range);
    graphemes.popFront(); // decode the first cluster up front
    return graphemes;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}
6567 
// For testing non-forward-range input ranges:
// a string wrapper that exposes only the input range primitives.
version (unittest)
private static struct InputRangeString
{
    private string s;

    bool empty() @property
    {
        return s.empty;
    }

    dchar front() @property
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
6578 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reversed = "le\u0308on";
    assert(reversed.walkLength == 5);

    auto reversedClusters = reversed.byGrapheme;
    assert(reversedClusters.walkLength == 4);

    // same text in all three UTF encodings
    foreach (sample; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {
        assert(sample.walkLength == 5);
        static assert(isForwardRange!(typeof(sample)));

        auto clusters = sample.byGrapheme;
        static assert(isForwardRange!(typeof(clusters)));
        assert(clusters.walkLength == 4);
        assert(clusters.array.retro.equal(reversedClusters));
    }

    // a pure input range must yield a pure input range
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}
6608 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P Acts as the identity function when given a range of code points.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        private size_t i = 0; // position inside the current grapheme

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            // stay within the current grapheme while it has code points left
            if (++i < _range.front.length)
                return;
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
6658 
/// Ditto
Range byCodePoint(Range)(Range range)
if (isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // already a range of code points: hand it back unchanged
    return range;
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it grapheme by grapheme and convert the result to a string
    auto clusters = s.byGrapheme.array;
    string reverse = clusters.retro.byCodePoint.text;

    assert(reverse == "le\u0308on"); // lëon
}
6684 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.byCodePoint.equal(""));

    // a range of code points is passed through as is
    string sample = "noe\u0308l";
    static assert(is(typeof(sample.byCodePoint) == string));

    // input-range-ness is preserved through both adaptors
    auto clusters = InputRangeString(sample).byGrapheme;
    static assert(!isForwardRange!(typeof(clusters)));

    auto codePoints = clusters.byCodePoint;
    static assert(!isForwardRange!(typeof(codePoints)));

    assert(codePoints.walkLength == sample.walkLength);
}
6702 
6703 @trusted:
6704 
6705 /++
6706     $(P A structure designed to effectively pack $(CHARACTERS)
6707     of a $(CLUSTER).
6708     )
6709 
6710     $(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme)
6711     always refer to distinct objects. In most actual scenarios a $(D Grapheme)
6712     fits on the stack and avoids memory allocation overhead for all but quite
6713     long clusters.
6714     )
6715 
6716     See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
6717 +/
6718 @trusted struct Grapheme
6719 {
6720     import std.traits : isDynamicArray;
6721 
6722 public:
6723     /// Ctor
6724     this(C)(in C[] chars...)
6725         if (is(C : dchar))
6726     {
6727         this ~= chars;
6728     }
6729 
6730     ///ditto
6731     this(Input)(Input seq)
6732         if (!isDynamicArray!Input
6733             && isInputRange!Input && is(ElementType!Input : dchar))
6734     {
6735         this ~= seq;
6736     }
6737 
6738     /// Gets a $(CODEPOINT) at the given index in this cluster.
opIndexGrapheme6739     dchar opIndex(size_t index) const pure nothrow @nogc
6740     {
6741         assert(index < length);
6742         return read24(isBig ? ptr_ : small_.ptr, index);
6743     }
6744 
6745     /++
6746         Writes a $(CODEPOINT) $(D ch) at given index in this cluster.
6747 
6748         Warning:
6749         Use of this facility may invalidate grapheme cluster,
6750         see also $(LREF Grapheme.valid).
6751     +/
opIndexAssignGrapheme6752     void opIndexAssign(dchar ch, size_t index) pure nothrow @nogc
6753     {
6754         assert(index < length);
6755         write24(isBig ? ptr_ : small_.ptr, ch, index);
6756     }
6757 
6758     ///
6759     @safe unittest
6760     {
6761         auto g = Grapheme("A\u0302");
6762         assert(g[0] == 'A');
6763         assert(g.valid);
6764         g[1] = '~'; // ASCII tilda is not a combining mark
6765         assert(g[1] == '~');
6766         assert(!g.valid);
6767     }
6768 
6769     /++
6770         Random-access range over Grapheme's $(CHARACTERS).
6771 
6772         Warning: Invalidates when this Grapheme leaves the scope,
6773         attempts to use it then would lead to memory corruption.
6774     +/
6775     @system SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) pure nothrow @nogc
6776     {
6777         return sliceOverIndexed(a, b, &this);
6778     }
6779 
6780     /// ditto
6781     @system SliceOverIndexed!Grapheme opSlice() pure nothrow @nogc
6782     {
6783         return sliceOverIndexed(0, length, &this);
6784     }
6785 
6786     /// Grapheme cluster length in $(CODEPOINTS).
lengthGrapheme6787     @property size_t length() const pure nothrow @nogc
6788     {
6789         return isBig ? len_ : slen_ & 0x7F;
6790     }
6791 
6792     /++
6793         Append $(CHARACTER) $(D ch) to this grapheme.
6794         Warning:
6795         Use of this facility may invalidate grapheme cluster,
6796         see also $(D valid).
6797 
6798         See_Also: $(LREF Grapheme.valid)
6799     +/
6800     ref opOpAssign(string op)(dchar ch)
6801     {
6802         static if (op == "~")
6803         {
6804             if (!isBig)
6805             {
6806                 if (slen_ == small_cap)
6807                     convertToBig();// & fallthrough to "big" branch
6808                 else
6809                 {
6810                     write24(small_.ptr, ch, smallLength);
6811                     slen_++;
6812                     return this;
6813                 }
6814             }
6815 
6816             assert(isBig);
6817             if (len_ == cap_)
6818             {
6819                 import core.checkedint : addu, mulu;
6820                 bool overflow;
6821                 cap_ = addu(cap_, grow, overflow);
6822                 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
6823                 if (overflow) assert(0);
6824                 ptr_ = cast(ubyte*) pureRealloc(ptr_, nelems);
6825                 if (ptr_ is null) onOutOfMemoryError();
6826             }
6827             write24(ptr_, ch, len_++);
6828             return this;
6829         }
6830         else
6831             static assert(false, "No operation "~op~" defined for Grapheme");
6832     }
6833 
6834     ///
6835     @system unittest
6836     {
6837         import std.algorithm.comparison : equal;
6838         auto g = Grapheme("A");
6839         assert(g.valid);
6840         g ~= '\u0301';
6841         assert(g[].equal("A\u0301"));
6842         assert(g.valid);
6843         g ~= "B";
6844         // not a valid grapheme cluster anymore
6845         assert(!g.valid);
6846         // still could be useful though
6847         assert(g[].equal("A\u0301B"));
6848     }
6849 
6850     /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme.
6851     ref opOpAssign(string op, Input)(Input inp)
6852         if (isInputRange!Input && is(ElementType!Input : dchar))
6853     {
6854         static if (op == "~")
6855         {
6856             foreach (dchar ch; inp)
6857                 this ~= ch;
6858             return this;
6859         }
6860         else
6861             static assert(false, "No operation "~op~" defined for Grapheme");
6862     }
6863 
6864     /++
6865         True if this object contains valid extended grapheme cluster.
6866         Decoding primitives of this module always return a valid $(D Grapheme).
6867 
6868         Appending to and direct manipulation of grapheme's $(CHARACTERS) may
6869         render it no longer valid. Certain applications may chose to use
6870         Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
6871         entirely.
6872     +/
6873     @property bool valid()() /*const*/
6874     {
6875         auto r = this[];
6876         genericDecodeGrapheme!false(r);
6877         return r.length == 0;
6878     }
6879 
thisGrapheme6880     this(this) pure @nogc nothrow
6881     {
6882         if (isBig)
6883         {// dup it
6884             import core.checkedint : addu, mulu;
6885             bool overflow;
6886             auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
6887             if (overflow) assert(0);
6888 
6889             auto p = cast(ubyte*) pureMalloc(raw_cap);
6890             if (p is null) onOutOfMemoryError();
6891             p[0 .. raw_cap] = ptr_[0 .. raw_cap];
6892             ptr_ = p;
6893         }
6894     }
6895 
~thisGrapheme6896     ~this() pure @nogc nothrow
6897     {
6898         if (isBig)
6899         {
6900             pureFree(ptr_);
6901         }
6902     }
6903 
6904 
private:
    // Size of the inline buffer: four machine words minus the final byte,
    // which is kept for slen_.
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // Number of 3-byte code point slots that fit in the inline buffer.
    enum small_cap = small_bytes/3;
    // small_flag: bit of slen_ that marks the heap-backed ("big") representation.
    // small_mask: bits of slen_ that hold the length of the inline representation.
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap-backed ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // inline ("small") representation; slen_ follows the buffer as the last byte
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }
6928 
convertToBigGrapheme6929     void convertToBig() pure @nogc nothrow
6930     {
6931         static assert(grow.max / 3 - 1 >= grow);
6932         enum nbytes = 3 * (grow + 1);
6933         size_t k = smallLength;
6934         ubyte* p = cast(ubyte*) pureMalloc(nbytes);
6935         if (p is null) onOutOfMemoryError();
6936         for (int i=0; i<k; i++)
6937             write24(p, read24(small_.ptr, i), i);
6938         // now we can overwrite small array data
6939         ptr_ = p;
6940         len_ = slen_;
6941         assert(grow > len_);
6942         cap_ = grow;
6943         setBig();
6944     }
6945 
setBigGrapheme6946     void setBig() pure nothrow @nogc { slen_ |= small_flag; }
6947 
smallLengthGrapheme6948     @property size_t smallLength() const pure nothrow @nogc
6949     {
6950         return slen_ & small_mask;
6951     }
isBigGrapheme6952     @property ubyte isBig() const pure nothrow @nogc
6953     {
6954         return slen_ & small_flag;
6955     }
6956 }
6957 
6958 static assert(Grapheme.sizeof == size_t.sizeof*4);
6959 
6960 
@system pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;

    // each Cyrillic letter is its own grapheme
    Grapheme[3] expected = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(equal(byGrapheme("ЮУЗ"), expected[]));
}
6967 
///
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string word = "ku\u0308hn";

    // decodeGrapheme takes its argument by ref and consumes what it decodes
    auto first = decodeGrapheme(word);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme consists of 2 code points
    auto wideOne = decodeGrapheme(word);
    // a slice of a grapheme is a random-access range of dchar
    static assert(isRandomAccessRange!(typeof(wideOne[])));
    assert(wideOne.length == 2);
    assert(equal(wideOne[], "u\u0308"));

    // so the usual range algorithms apply
    assert(equal(wideOne[].filter!isMark(), "\u0308"));

    auto cluster = Grapheme("A");
    assert(cluster.valid);
    cluster ~= '\u0301';
    assert(equal(cluster[], "A\u0301"));
    assert(cluster.valid);
    cluster ~= "B";
    // no longer a valid grapheme cluster
    assert(!cluster.valid);
    // but it can still serve as a small string
    assert(equal(cluster[], "A\u0301B"));
}
7004 
@safe unittest
{
    auto cluster = Grapheme("A\u0302");
    assert(cluster.valid);
    assert(cluster[0] == 'A');

    // the ASCII tilde is not a combining mark, so overwriting the mark breaks validity
    cluster[1] = '~';
    assert(cluster[1] == '~');
    assert(!cluster.valid);
}
7014 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (this only exercises the storage)
    auto g = Grapheme('a', 'b', 'c', 'd', 'e');
    foreach (i, dchar expected; "abcde"d)
        assert(g[i] == expected);

    // overwriting one slot must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    g ~= 'ц';
    g ~= '~';
    foreach (i, dchar expected; "abcЙeц~"d)
        assert(g[i] == expected);
    assert(!g.valid);

    // a copy must not share storage with the original
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));

    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // append the 26 code points 'A' .. 'Z' one at a time
    Grapheme alphabet;
    foreach (int code; cast(int)'A' .. cast(int)'Z'+1)
        alphabet ~= cast(dchar) code;
    assert(equal(alphabet[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7063 
/++
    $(P Compares $(D r1) and $(D r2) case-insensitively using simple
    (one-to-one) case folding. The simpler rule makes this function faster
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an input range of characters
        r2 = an input range of characters

    Returns:
        An $(D int) that is 0 if the strings match,
        &lt;0 if $(D r1) is lexicographically "less" than $(D r2),
        &gt;0 if $(D r1) is lexicographically "greater" than $(D r2)

    Warning:
    This function only handles 1:1 $(CODEPOINT) mapping
    and thus is not sufficient for certain alphabets
    like German, Greek and few others.

    See_Also:
        $(LREF icmp)
        $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(S1 r1, S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.utf : byDchar;

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    foreach (immutable lhs; left)
    {
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        right.popFront();
        if (lhs == rhs)
            continue;

        // simpleCaseTrie is a packed index table into sTable
        immutable size_t lhsSlot = simpleCaseTrie[lhs];
        immutable size_t rhsSlot = simpleCaseTrie[rhs];
        immutable bool lhsCased = lhsSlot != EMPTY_CASE_TRIE;
        immutable bool rhsCased = rhsSlot != EMPTY_CASE_TRIE;

        // neither character is cased: plain code point difference
        if (!lhsCased && !rhsCased)
            return lhs - rhs;

        if (lhsCased && rhsCased)
        {
            // step back to the first entry of each bucket
            immutable lhsBucket = lhsSlot - sTable[lhsSlot].n;
            immutable rhsBucket = rhsSlot - sTable[rhsSlot].n;
            if (lhsBucket == rhsBucket) // same bucket, equivalent characters
                continue;
            return sTable[lhsBucket].ch - sTable[rhsBucket].ch;
        }

        // exactly one side is cased: compare its bucket's first entry to the other side
        if (lhsCased)
            return sTable[lhsSlot - sTable[lhsSlot].n].ch - rhs;
        return lhs - sTable[rhsSlot - sTable[rhsSlot].n].ch;
    }
    return right.empty ? 0 : -1;
}
7133 
///
@safe @nogc pure nothrow unittest
{
    // Greek small letter iota with dialytika and tonos: precomposed vs. spelled out
    enum precomposed = "ΐ";
    enum spelledOut = "\u03B9\u0308\u0301";

    assert(sicmp("Август", "авгусТ") == 0);
    // Greek is fine too while no 1:M mapping is involved
    assert(sicmp("ΌΎ", "όύ") == 0);
    // but a 1:M pair like this one is not matched as equal
    assert(sicmp(precomposed, spelledOut) != 0);

    // whereas icmp has no problem with that
    assert(icmp(precomposed, spelledOut) == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
7148 
// non-template overloads for the most common cases, to reduce compile time;
// each one forwards to the template instantiated with its own parameter types
@safe @nogc pure nothrow
{
    int sicmp(const(char)[] str1, const(char)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }
    int sicmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }
    int sicmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }
}
7159 
// Tries to match lhs against <rhs, rtail> under full case folding.
// Returns 0 on a match (a multi-code-point match also consumes the matched
// part of rtail); otherwise returns lhs itself if it has no case folding,
// or the first entry of its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;
    // fullCaseTrie is a packed index table into fTable
    immutable size_t slot = fullCaseTrie[lhs];
    if (slot == EMPTY_CASE_TRIE)
        return lhs;
    immutable first = slot - fTable[slot].n;
    immutable past = fTable[slot].size + first;
    assert(fTable[first].entry_len == 1);
    foreach (size_t i; first .. past)
    {
        immutable len = fTable[i].entry_len;
        if (len == 1)
        {
            if (fTable[i].seq[0] == rhs)
                return 0;
            continue;
        }
        // a long chunk, like 'ss' for German
        dstring folded = fTable[i].seq[0 .. len];
        if (folded[0] != rhs)
            continue;
        // skipOver only advances rtail when the whole remainder matches
        if (rtail.skipOver(folded[1 .. $]))
            return 0;
    }
    return fTable[first].seq[0]; // new remapped character for accurate diffs
}
7196 
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An $(D int) that is 0 if the strings match,
        &lt;0 if $(D r1) is lexicographically "less" than $(D r2),
        &gt;0 if $(D r1) is lexicographically "greater" than $(D r2)

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.utf : byDchar;

    auto left = r1.byDchar;
    auto right = r2.byDchar;

    while (!left.empty)
    {
        immutable lhs = left.front;
        if (right.empty)
            return 1;
        immutable rhs = right.front;
        left.popFront();
        right.popFront();
        if (lhs == rhs)
            continue;
        // first try lhs against <rhs, rest of right>;
        // a successful 1:M match consumes from `right`
        immutable cmpLR = fullCasedCmp(lhs, rhs, right);
        if (cmpLR == 0)
            continue;
        // then rhs against <lhs, rest of left>
        immutable cmpRL = fullCasedCmp(rhs, lhs, left);
        if (cmpRL == 0)
            continue;
        // both results are remapped code points, which keeps the ordering stable
        return cmpLR - cmpRL;
    }
    return right.empty ? 0 : -1;
}
7252 
///
@safe @nogc pure nothrow unittest
{
    // ß matches "ss"
    immutable german = icmp("Rußland", "Russland");
    assert(german == 0);
    // 1:M mappings are matched whichever side they appear on
    immutable greek = icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ");
    assert(greek == 0);
}
7259 
/**
 * Passing ranges built with $(REF byUTF, std,utf) or one of its aliases avoids
 * the GC allocations and thrown exceptions of auto-decoding,
 * making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    auto german = icmp("Rußland".byDchar, "Russland".byDchar);
    assert(german == 0);
    auto greek = icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar);
    assert(greek == 0);
}
7271 
// icmp must accept any mix of character widths
@safe unittest
{
    // same width on both sides
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    // char against wchar
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    // wchar against dchar
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
7282 
// non-template overloads for the most common cases, to reduce compile time;
// each one forwards to the template instantiated with its own parameter types
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }
}
7293 
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    alias StringTypes = AliasSeq!(string, wstring, dstring);
    foreach (cfunc; AliasSeq!(icmp, sicmp))
    {
        foreach (S1; StringTypes)
        foreach (S2; StringTypes)
        (){ // avoid slow optimizations for large functions @@@BUG@@@ 2396
            // empty and prefix cases
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            // strings that differ only in case
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // the documentation examples
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }();
        // the resulting order must not depend on letter case
        auto words = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(words);
        assert(words == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }
    assert(icmp("ßb", "ssa") > 0);
    // the documentation examples
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    //bugzilla 11057
    assert( icmp("K", "L") < 0 );
    });
}
7333 
// issue 17372: icmp must be callable on joiner results
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;

    auto joined = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array;
    auto ordered = joined.sort!((lhs, rhs) => icmp(lhs, rhs) < 0);
}
7342 
// This is package for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Returns a range over every $(CODEPOINT) stored in the simple case-folding
    bucket of $(D ch). If $(D ch) has no entry in the table the range holds
    just $(D ch) itself (and is empty for $(D ch) == 0, because 0 doubles as
    the "exhausted" marker).
*/
package auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        uint idx; // uint.max selects the single-character form, which reads `c`
        union
        {
            dchar c; // single-character form; 0 means exhausted
            uint len; // table form: number of entries left, starting at sTable[idx]
        }

        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property bool isSmall() const { return idx == uint.max; }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
                return sTable[idx].ch;
            return c;
        }

        void popFront()
        {
            if (isSmall)
            {
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }
    immutable slot = simpleCaseTrie[ch];
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[slot];
    // entry.n is this entry's position inside its bucket
    immutable start = slot - entry.n;
    return Range(start, entry.size);
}
7423 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto cyrillic = simpleCaseFoldings('Э').array;
        assert(cyrillic.length == 2);
        assert(cyrillic.canFind('э'));
        assert(cyrillic.canFind('Э'));

        // an uncased character folds only to itself
        auto tilde = simpleCaseFoldings('~');
        assert(tilde.equal("~"));

        // A with ring above shares its bucket with the Angstrom sign
        auto ring = simpleCaseFoldings('Å');
        assert(ring.length == 3);
        assert(ring.canFind('å'));
        assert(ring.canFind('Å'));
        assert(ring.canFind('\u212B'));
    });
}
7442 
/++
    $(P Looks up the $(S_LINK Combining class, combining class) of $(D ch).)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    immutable clazz = combiningClassTrie[ch];
    return clazz;
}
7450 
///
@safe unittest
{
    // a short name for the examples below
    alias CC = combiningClass;

    // combining ring below
    assert(CC('\u0325') == 220);
    // combining tilde
    assert(CC('\u0303') == 230);
    // the simple consequence is that the "tilde" should be
    // placed after a "ring below" in a sequence
}
7464 
@safe pure nothrow @nogc unittest
{
    // every ASCII code point has combining class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);

    // spot checks for a few non-zero classes
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u1939') == 222);
}
7474 
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
7486 
/**
    Shorthand aliases for the character decomposition types, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
7495 
/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that $(D first) comes before $(D second) in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See $(D composeJamo) below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init;
    // the trie value packs an offset into compositionTable together with an entry count
    immutable offset = packed & composeIdxMask;
    immutable count = packed >> composeCntShift;
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto candidates = compositionTable[offset .. offset+count].map!"a.rhs"().assumeSorted();
    immutable hit = candidates.lowerBound(second).length;
    if (hit == count)
        return dchar.init;
    // lowerBound only finds the insertion point, so confirm an exact match
    immutable entry = compositionTable[offset+hit];
    if (entry.rhs == second)
        return entry.composed;
    return dchar.init;
}
7526 
///
@safe unittest
{
    // a base letter followed by a combining mark composes
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    // two base letters do not
    assert(compose('A', 'B') == dchar.init);
    // the starter has to come first,
    // so the reversed pair doesn't compose either
    assert(compose('\u0308', 'A') == dchar.init);
}
7537 
/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) $(D ch).
    If no decomposition is available returns a $(LREF Grapheme)
    with the $(D ch) itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
    static if (decompType == Compatibility)
    {
        alias table = decompCompatTable;
        alias mapping = compatMappingTrie;
    }
    else static if (decompType == Canonical)
    {
        alias table = decompCanonTable;
        alias mapping = canonMappingTrie;
    }
    immutable offset = mapping[ch];
    if (offset == 0) // no table entry, fall back to hangul arithmetic decomposition
        return decomposeHangul(ch);
    // the decomposition starts at `offset` and runs up to the first 0
    auto pieces = table[offset .. $].until(0);
    return Grapheme(pieces);
}
7573 
///
@system unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    assert(compose('A', 'B') == dchar.init);
    // the starter has to come first,
    // so the reversed pair doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    // canonical decomposition (the default)
    assert(equal(decompose('Ĉ')[], "C\u0302"));
    // a character without a decomposition comes back unchanged
    assert(equal(decompose('D')[], "D"));
    // hangul syllables are decomposed as well
    assert(equal(decompose('\uD4DC')[], "\u1111\u1171\u11B7"));
    // compatibility decomposition
    assert(equal(decompose!Compatibility('¹')[], "1"));
}
7591 
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// first code point of the precomposed hangul syllable block
enum jamoSBase = 0xAC00;
// first leading consonant (L) jamo
enum jamoLBase = 0x1100;
// first vowel (V) jamo
enum jamoVBase = 0x1161;
// one below the first trailing consonant (T) jamo; a T index of 0 means "no trailing consonant"
enum jamoTBase = 0x11A7;
// sizes of the L, V and T index ranges
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
// number of syllables per leading consonant
enum jamoNCount = jamoVCount * jamoTCount;
// total number of precomposed syllables
enum jamoSCount = jamoLCount * jamoNCount;
7601 
// Tests if $(D ch) is a Hangul leading consonant jamo.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoLBase + jamoLCount;
    // the upper-bound test goes first: it rejects the ~1M code points above the range
    return ch < pastLast && ch >= jamoLBase;
}
7608 
// Tests if $(D ch) is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoTBase + jamoTCount;
    // the upper-bound test goes first: it rejects the ~1M code points above the range
    // Note: the lower bound is exclusive, jamoTBase itself is not a trailing jamo
    // (TIndex must be > 0)
    return ch < pastLast && ch > jamoTBase;
}
7616 
// Tests if $(D ch) is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    enum pastLast = jamoVBase + jamoVCount;
    // the upper-bound test goes first: it rejects the ~1M code points above the range
    return ch < pastLast && ch >= jamoVBase;
}
7623 
// Returns the zero-based index of $(D ch) within the precomposed hangul
// syllable block, or -1 if $(D ch) lies outside of it.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
7629 
// internal helper: composes <L,V> and <L,V,T> jamo runs of seq in place,
// storing the syllable in the first slot and dchar.init in the slots it absorbed
void hangulRecompose(dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a syllable can only start at a leading consonant followed by a vowel
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }
        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if (hasTrailing)
        {
            // <L,V,T>
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L,V>
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
7658 
7659 //----------------------------------------------------------------------------
7660 public:
7661 
/**
    Decomposes a Hangul syllable into its jamos. If $(D ch) is not a composed
    syllable then this function returns a $(LREF Grapheme) containing only
    $(D ch) as is.
*/
Grapheme decomposeHangul(dchar ch) @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    immutable bool isSyllable = idxS >= 0 && idxS < jamoSCount;
    if (!isSyllable)
        return Grapheme(ch);

    immutable partL = jamoLBase + idxS / jamoNCount;
    immutable partV = jamoVBase + (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    if (idxT == 0) // no trailing consonant: <L, V> decomposition
        return Grapheme(partL, partV);
    // there is a trailing consonant (T): <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
7681 
///
@system unittest
{
    import std.algorithm.comparison : equal;

    // an LVT syllable splits into its three jamos
    auto jamos = decomposeHangul('\uD4DB');
    assert(equal(jamos[], "\u1111\u1171\u11B6"));
}
7688 
/++
    Try to compose a hangul syllable out of a leading consonant ($(D lead)),
    a $(D vowel) and an optional $(D trailing) consonant jamo.

    On success returns the composed LV or LVT hangul syllable.
    A $(D trailing) that is not a trailing consonant jamo is ignored,
    yielding the LV syllable.

    If either $(D lead) or $(D vowel) is not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;
    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;
    if (isJamoT(trailing))
        return syllable + (trailing - jamoTBase);
    return syllable;
}
7710 
///
@safe unittest
{
    // leading consonant + vowel + trailing consonant
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant, or passing any code point
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('A', '\u1171') == dchar.init);
    assert(composeJamo('\u1111', 'A') == dchar.init);
}
7722 
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes ch with the given form and compares against the expected string
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme decomposed = decompose!T(ch);
        assert(equal(decomposed[], r), text(decomposed[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // re-check the documentation examples
    assert(equal(decomposeHangul('\uD4DB')[], "\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // no trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
7746 
/**
    Enumeration type for normalization forms,
    passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm {
    NFC,
    NFD,
    NFKC,
    NFKD
}
7757 
7758 
enum {
    /**
        Shorthand aliases for values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
7771 
/++
    Returns $(D input) string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Params:
        norm = the $(LREF NormalizationForm) to produce
        input = array of code units to normalize

    Returns:
        $(D input) itself when no part of it needs normalization,
        otherwise a newly built array with the normalized text.

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] is the first piece of input that needs work;
    // both equal to input.length means nothing has to be changed
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every piece: decomposed code points and,
    // in parallel, their combining classes
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // everything before the piece is copied through unchanged
        app.put(input[0 .. anchors[0]]);
        // step 1: decompose every code point of the piece
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // step 2: stable-sort every run of non-zero combining classes by class,
        // keeping ccc and decomposed in lock-step through zip
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // park the marker past the end until the next run begins
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the run that reaches the end of the buffer (may be empty)
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        // step 3 (composing forms only): recompose starting from each starter
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                for (;;)
                {
                    // recompose returns where it stopped; continue from there
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // drop the dchar.init sentinels left behind by recompose
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        decomposed.assumeSafeAppend();
        ccc.length = 0;
        ccc.assumeSafeAppend();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    }while (anchors[0] != input.length);
    // the remaining tail needs no normalization
    app.put(input[0 .. anchors[0]]);
    return cast(inout(C)[])app.data;
}
7873 
///
@safe unittest
{
    // any encoding works
    wstring greet = "Hello world";
    // already normalized text is handed back without copying
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
7888 
@safe unittest
{
    import std.conv : text;

    // the second assert argument is only the failure message
    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // check example

    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
7910 
// Canonically recomposes code points onto the starter at input[start].
// Works in-place: a code point merged into the starter is overwritten with a
// dchar.init sentinel. ccc holds the combining class of every element of input.
// Returns the index where scanning stopped: the position of the next code
// point with combining class 0 that did not merge, or input.length.
private size_t recompose(size_t start, dchar[] input, ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Combining class of the last unmerged code point after the starter.
    // -1 lies outside 0 .. 255, so the first candidate is never blocked.
    int blockingCC = -1;
    // input[start] is the starter itself, candidates begin right after it
    size_t pos = start + 1;
    for (; pos != input.length; ++pos)
    {
        immutable candidateCC = ccc[pos];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[pos]; the classes in between are
        // sorted, so comparing with blockingCC covers every B.
        if (candidateCC > blockingCC)
        {
            immutable merged = compose(input[start], input[pos]);
            if (merged != dchar.init)
            {
                input[start] = merged;
                input[pos] = dchar.init; // put a sentinel
                // a merged code point must not block the ones after it,
                // so blockingCC stays untouched
                continue;
            }
        }
        // not merged: from now on this code point blocks later ones
        blockingCC = candidateCC;
        // an unmerged starter ends the sequence that belongs to input[start]
        if (blockingCC == 0)
            break;
    }
    return pos;
}
7964 
// Looks for the first place in input that breaks normalization form norm.
// Returns a tuple of 2 indexes that delimit: the already normalized text,
// the piece that needs normalization, and the rest of input starting with a
// stable code point. Both indexes equal input.length when nothing is found.
private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (pos, dchar cp; input)
    {
        static if (norm == NFC)
        {
            // shortcut for NFC: code points below U+0300 are accepted
            // without consulting any table
            if (cp < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(cp);
        // a non-zero class smaller than the previous one means wrong ordering
        immutable bool misordered = curCC != 0 && prevCC > curCC;
        if (misordered || notAllowedIn!norm(cp))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
7995 
// Given idx, the index of a code point that breaks normalization form norm,
// finds the surrounding region that has to be re-normalized.
// Returns tuple(region_start, region_end) where
//   region_start is the index of the closest code point before idx that is
//       stable (combining class 0 and allowed in norm), or 0 if there is none;
//   region_end is the index of the first stable code point at or after idx,
//       or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input)
{
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // Walk backwards one code point at a time. The range must shrink from
    // the back: br.back is what gets inspected, and br.length is then the
    // index just past that code point.
    while (!br.empty)
    {
        immutable dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_start = br.length - codeLength!C(ch);
            break;
        }
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // i is relative to the slice, convert back to an index into input
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8028 
/**
    Tests if dchar $(D ch) is always allowed (Quick_Check=YES) in normalization
    form $(D norm).

    Params:
        norm = the normalization form to test against
        ch = the code point to test

    Returns:
        $(D true) if $(D ch) is always allowed in form $(D norm).
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // simply the negation of the private quick-check lookup
    immutable bool rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8037 
///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    // an ASCII letter
    assert(allowedIn!NFC('Z'));
}
8048 
// not user friendly name but more direct
// Looks ch up in the quick-check trie that belongs to form norm and returns
// the stored flag: true means ch is not always allowed in that form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // The condition has to be false for this to ever fire; a lone string
        // argument would be taken as the (always true) condition.
        static assert(false, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}
8064 
// Same asserts as the documented allowedIn example.
@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
8073 
8074 }
8075 
version(std_uni_bootstrap)8076 version (std_uni_bootstrap)
8077 {
8078     // old version used for bootstrapping of gen_uni.d that generates
8079     // up to date optimal versions of all of isXXX functions
8080     @safe pure nothrow @nogc public bool isWhite(dchar c)
8081     {
8082         import std.ascii : isWhite;
8083         return isWhite(c) ||
8084                c == lineSep || c == paraSep ||
8085                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
8086                (c >= '\u2000' && c <= '\u200A') ||
8087                c == '\u202F' || c == '\u205F' || c == '\u3000';
8088     }
8089 }
8090 else
8091 {
8092 
// trusted -> avoid bounds check
// Thin accessors over the generated case-mapping tries and tables.
// The *Index functions return the trie entry for a code point; callers in
// this module treat ushort.max as "no mapping". The *Tab functions read the
// mapping table at a given index.
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions (Bugzilla 13232)
    // lower case
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    // title case
    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    // upper case
    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
8111 
8112 public:
8113 
/++
    Whether or not $(D c) is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // the actual test is pregenerated code living in the tables module
    import std.internal.unicode_tables : isWhiteGen; // generated file
    immutable bool white = isWhiteGen(c);
    return white;
}
8125 
/++
    Return whether $(D c) is a Unicode lowercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;
    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return lowerCaseTrie[c];
    // inside this function plain isLower refers to std.ascii.isLower
    return isLower(c);
}
8137 
@safe unittest
{
    // within this block plain isLower is std.ascii's,
    // .isLower is the one declared at module scope
    import std.ascii : isLower;
    // both must agree on every ASCII code point
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    // every member of the lowerCase set is lower and not upper
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
8157 
8158 
/++
    Return whether $(D c) is a Unicode uppercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // only non-ASCII code points need the trie lookup
    if (!isASCII(c))
        return upperCaseTrie[c];
    // inside this function plain isUpper refers to std.ascii.isUpper
    return isUpper(c);
}
8170 
@safe unittest
{
    // static import keeps plain isUpper bound to this module's function
    static import std.ascii;
    // this module's isUpper must agree with std.ascii on every ASCII code point
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
8189 
8190 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps c to title case using the simple (one code point) title-case table;
// code points without an entry are returned unchanged.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below U+00AA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        if ('a' <= c && c <= 'z')
            return c - 32;
        return c;
    }
    // ushort.max marks "no entry"
    immutable size_t slot = toTitleSimpleIndex(c);
    return slot != ushort.max ? toTitleTab(slot) : c;
}
8212 
// (index function, limit below which an index is a one-to-one mapping,
// table accessor) bundles; they expand into the leading template arguments
// of the case-conversion helpers in this module.
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
8215 
// generic toUpper/toLower on whole string, creates new or returns as is
// indexFn maps a code point to a table index (ushort.max: no mapping),
// indexes below maxIdx are one-to-one mappings read through tableFn, larger
// ones start a packed multi code point expansion. asciiConvert handles ASCII
// once a copy is being built.
private S toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) @trusted pure
if (isSomeString!S)
{
    import std.array : appender;
    import std.ascii : isASCII;

    foreach (pos, dchar probe; s)
    {
        // skip ahead to the first code point that has a mapping
        immutable ushort firstHit = indexFn(probe);
        if (firstHit == ushort.max)
            continue;
        // the untouched prefix is taken over, the rest is converted
        auto sink = appender!S(s[0 .. pos]);
        sink.reserve(s.length);
        foreach (dchar ch; s[pos .. $])
        {
            if (ch.isASCII)
            {
                sink.put(asciiConvert(ch));
                continue;
            }
            immutable ushort entry = indexFn(ch);
            if (entry == ushort.max)
                sink.put(ch);
            else if (entry < maxIdx)
            {
                immutable dchar mapped = tableFn(entry);
                sink.put(mapped);
            }
            else
            {
                auto packed = tableFn(entry);
                // unpack length + codepoint
                immutable uint count = packed >> 24;
                sink.put(cast(dchar)(packed & 0xFF_FFFF));
                // the remaining code points follow in the table
                foreach (j; entry+1 .. entry+count)
                    sink.put(tableFn(j));
            }
        }
        return sink.data;
    }
    // nothing had a mapping: hand back the original
    return s;
}
8261 
@safe unittest //12428
{
    import std.array : replicate;
    // s is a short slice at the start of a much larger buffer
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    // result deliberately ignored: only the effect on s matters here
    toUpper(s);

    // the source slice must not have been modified
    assert(s == "abcdefghij");
}
8272 
8273 
// generic toUpper/toLower on whole range, returns range
// indexFn maps a code point to a table index (ushort.max: no mapping),
// indexes below maxIdx are one-to-one mappings read through tableFn, larger
// ones start a packed multi code point expansion. asciiConvert is applied to
// ASCII code points instead of the table lookup.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        // done when nothing is buffered and the source is exhausted
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        // Converts the current source element on first access and buffers the
        // result; the source itself is only advanced by popFront.
        @property auto front()
        {
            import std.ascii : isASCII;

            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping: pass through
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // one-to-one mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // stored back to front so that buf[nLeft - 1]
                        // is always the next code point to hand out
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the buffer is filled even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // buffer drained: move on to the next source element
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r;            // source range
        uint nLeft;         // number of buffered code points not yet consumed
        dchar[3] buf = void; // buffered conversion result, filled by front
    }

    return ToCaserImpl(str);
}
8358 
/*********************
 * Convert input range or string to upper or lower case.
 *
 * The conversion is lazy and does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    alias Unit = ElementEncodingType!Range;
    static if (Unit.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        // narrow code units: decode first, then convert the decoded range
        import std.utf : byDchar;
        return asLowerCase(str.byDchar);
    }
}
8393 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    alias Unit = ElementEncodingType!Range;
    static if (Unit.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        // narrow code units: decode first, then convert the decoded range
        import std.utf : byDchar;
        return asUpperCase(str.byDchar);
    }
}
8412 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // the result is a range, so compare with equal rather than ==
    assert("hEllo".asUpperCase.equal("HELLO"));
}
8420 
// explicitly undocumented
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
8428 
// explicitly undocumented
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
8436 
// exercises the overloads taking string-convertible types
@safe unittest
{
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
}
8442 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    // wide and dchar input; the comparison results have to be asserted,
    // a bare call to equal checks nothing
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
8482 
// generic capitalizer on whole range, returns range
// The first element of str is converted through the upper case triple
// (indexFnUpper / maxIdxUpper / tableFnUpper); everything after it is
// delegated to asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            // once in lower mode only the lower-casing range matters
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // first element: convert on first access and buffer the result
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    // no mapping: pass through
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    // one-to-one mapping
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    // stored back to front so that buf[nLeft - 1]
                    // is always the next code point to hand out
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                // make sure the buffer is filled even if front was never called
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // first element fully emitted: switch to lower-casing
                    // whatever is left of the source
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

      private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;    // buffered conversion of the first element
        uint nLeft = 0;         // number of buffered code points not yet consumed
    }

    return ToCapitalizerImpl(str);
}
8573 
/*********************
 * Capitalize input range or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * The conversion is lazy and does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    alias Unit = ElementEncodingType!Range;
    static if (Unit.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: wrap it directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        // narrow code units: decode first
        import std.utf : byDchar;
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
8609 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // first character upper case, the rest lower case
    assert("hEllo".asCapitalized.equal("Hello"));
}
8617 
// Overload for types that convert to a string: forwards to the range
// overload instantiated with the underlying string type.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
8624 
// exercises the overload taking string-convertible types
@safe unittest
{
    assert(testAliasedString!asCapitalized("hEllo"));
}
8629 
// the attributes on this unittest require that creating the range and
// reading front is pure, nothrow and @nogc
@safe pure nothrow @nogc unittest
{
    auto r = "hEllo".asCapitalized();
    assert(r.front == 'H');
}
8635 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must replay the same sequence
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected] pairs
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    // wide and dchar input; the comparison results have to be asserted,
    // a bare call to equal checks nothing
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
8683 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes c as UTF-8 into buf starting at idx and returns the index just
// past the last code unit written. Values above 0x10FFFF hit assert(0).
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    immutable uint cp = c;
    if (cp <= 0x7F)
    {
        // one code unit
        buf[idx] = cast(char) cp;
        return idx + 1;
    }
    if (cp <= 0x7FF)
    {
        // two code units
        buf[idx] = cast(char)(0xC0 | (cp >> 6));
        buf[idx+1] = cast(char)(0x80 | (cp & 0x3F));
        return idx + 2;
    }
    if (cp <= 0xFFFF)
    {
        // three code units
        buf[idx] = cast(char)(0xE0 | (cp >> 12));
        buf[idx+1] = cast(char)(0x80 | ((cp >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (cp & 0x3F));
        return idx + 3;
    }
    if (cp <= 0x10FFFF)
    {
        // four code units
        buf[idx] = cast(char)(0xF0 | (cp >> 18));
        buf[idx+1] = cast(char)(0x80 | ((cp >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((cp >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (cp & 0x3F));
        return idx + 4;
    }
    assert(0);
}
8717 
@safe unittest
{
    char[] s = "abcd".dup;
    size_t i = 0;
    // ASCII takes a single code unit
    i = encodeTo(s, i, 'X');
    assert(s == "Xbcd");

    // U+00A9 takes two code units, written at the returned index
    i = encodeTo(s, i, cast(dchar)'\u00A9');
    assert(s == "X\xC2\xA9d");
}
8728 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes c as UTF-16 into buf starting at idx and returns the index just
// past the last code unit written.
// Throws UTFException for a surrogate code point; values above 0x10FFFF
// hit assert(0).
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    if (c > 0x10FFFF)
        assert(0);
    if (c > 0xFFFF)
    {
        // outside the BMP: high surrogate followed by low surrogate
        immutable uint offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }
    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
8750 
/*
    UTF-32 variant: a code point is exactly one code unit, so `c` is stored
    at `buf[idx]` as is. Returns the index following the stored unit.
*/
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
8757 
/*
    Case-converts `s` inside its own buffer for as long as the converted text
    fits, and hands over to toCaseInPlaceAlloc as soon as it does not.

    indexFn maps a code point to a table index (ushort.max means "leave as is"),
    indices below maxIdx are 1:1 mappings looked up through tableFn, and any
    other index is a 1:m mapping that always takes the allocating path.
    On return `s` is the converted text.
*/
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;

    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    // Relocates str[from .. to] so that it starts at dest and returns the
    // index just past the relocated data. When the data is already in the
    // right place (dest == from) nothing is copied at all. Copying starts
    // once a converted character came out shorter than its source, and may
    // stop again later if a longer one closes the gap.
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        if (dest != from)
        {
            foreach (C unit; str[from .. to])
                str[dest++] = unit;
            return dest;
        }
        return to;
    }

    size_t readPos = 0;      // next code unit to decode
    size_t writePos = 0;     // end of the converted output so far
    size_t pendingFrom = 0;  // start of the unchanged run not yet moved

    while (readPos != s.length)
    {
        immutable cpStart = readPos;
        immutable ch = decode(s, readPos);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, stays part of the pending run
            continue;

        // bring the unchanged run preceding this code point into place
        writePos = moveTo(s, writePos, pendingFrom, cpStart);
        pendingFrom = readPos;

        if (caseIndex >= maxIdx) // 1:m mapping, always allocates
            return slowToCase(s, cpStart, writePos);

        // 1:1 mapping
        immutable cased = tableFn(caseIndex);
        immutable casedLen = codeLength!C(cased);
        if (casedLen + writePos > readPos) // would overwrite unread input
            return slowToCase(s, cpStart, writePos);

        writePos = encodeTo(s, writePos, cased);
        assert(writePos <= readPos);
    }

    // trailing unchanged run, if any
    if (pendingFrom != s.length)
        writePos = moveTo(s, writePos, pendingFrom, s.length);
    s = s[0 .. writePos];
}
8823 
// Helper that precalculates the size, in code units of the string's own
// character type, of the case-converted form of a string.
// indexFn/maxIdx/tableFn have the same meaning as in toCaseInPlace.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(in C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;
        size_t runStart = 0; // start of the unchanged run not yet counted
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // unchanged, counted with its run
                continue;

            // count the unchanged units in front of this code point
            total += cpStart - runStart;
            runStart = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                total += codeLength!C(tableFn(caseIndex));
                continue;
            }

            // 1:m mapping: the first entry packs the sequence length into
            // the top byte and the first code point into the low 24 bits
            immutable packed = tableFn(caseIndex);
            immutable count = packed >> 24;
            immutable dchar first = packed & 0xFF_FFFF;
            total += codeLength!C(first);
            foreach (j; caseIndex+1 .. caseIndex+count)
                total += codeLength!C(tableFn(j));
        }
        if (runStart != str.length)
            total += str.length - runStart;
        return total;
    }
}
8864 
@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);

    // nothing to convert: the length is just the number of code units
    assert(lowerLen("abcd") == 4);
    // five two-unit Cyrillic letters followed by three ASCII digits
    assert(lowerLen("аБВгд456") == 10+3);
}
8871 
// Slower code path of toCaseInPlace: preallocates a buffer of the exact
// final size and then copies the case-converted stuff into the new string.
// s[0 .. destIdx] is the part already converted in place,
// s[curIdx .. $] is the part still to be converted.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);

        // single allocation, sized for the complete result
        immutable total = destIdx + caseLength(s[curIdx..$]);
        C[] result = new C[total];
        result[0 .. destIdx] = s[0 .. destIdx];

        size_t runStart = curIdx; // start of the unchanged run not yet copied
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // unchanged, copied with its run
                continue;

            // copy the unchanged run in front of this code point
            immutable pending = cpStart - runStart;
            result[destIdx .. destIdx+pending] = s[runStart .. cpStart];
            destIdx += pending;
            runStart = curIdx;

            if (caseIndex < maxIdx)  // 1:1 codepoint mapping
            {
                destIdx = encodeTo(result, destIdx, tableFn(caseIndex));
            }
            else  // 1:m codepoint mapping
            {
                auto packed = tableFn(caseIndex);
                // unpack length + first codepoint
                immutable uint count = packed >> 24;
                destIdx = encodeTo(result, destIdx, cast(dchar)(packed & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+count)
                    destIdx = encodeTo(result, destIdx, tableFn(j));
            }
        }

        // trailing unchanged run, if any
        if (runStart != s.length)
        {
            immutable tail = s.length - runStart;
            result[destIdx .. destIdx+tail] = s[runStart..$];
            destIdx += tail;
        }
        assert(result.length == destIdx);
        s = result;
    }
}
8928 
/++
    Converts $(D s) to lowercase in place, using the Unicode lowercase mapping.

    A few characters make the string longer when converted; in that case
    the function reallocates exactly once.
    If $(D s) contains no uppercase characters, it is left unaltered.

    Params:
        s = the character array to convert; rebound to the result
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    s.toCaseInPlace!(LowerTriple)();
}
// Non-template overloads for the most common cases, forwarding to the
// template above; they exist to reduce compile time.
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
8950 
/++
    Converts $(D s) to uppercase in place, using the Unicode uppercase mapping.

    A few characters make the string longer when converted; in that case
    the function reallocates exactly once.
    If $(D s) contains no lowercase characters, it is left unaltered.

    Params:
        s = the character array to convert; rebound to the result
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    s.toCaseInPlace!(UpperTriple)();
}
// Non-template overloads for the most common cases, forwarding to the
// template above; they exist to reduce compile time/code size.
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
8972 
/++
    Returns the lowercase equivalent of $(D c) if $(D c) is a Unicode
    uppercase $(CHARACTER); any other $(D c) is returned as is.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toLower which takes a full
    string instead.

    Params:
        c = the code point to convert
    Returns:
        the simple (1:1) lowercase mapping of $(D c), or $(D c) itself
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // Fast path: everything below U+00AA is answered without the tables,
    // and only 'A' .. 'Z' change there.
    if (c < 0xAA)
    {
        if (c >= 'A' && c <= 'Z')
            return c + 32;
        return c;
    }

    // ushort.max marks "no simple mapping"
    immutable size_t slot = toLowerSimpleIndex(c);
    if (slot == ushort.max)
        return c;
    return toLowerTab(slot);
}
8999 
/++
    Returns a string identical to $(D s) except that all of its characters
    are converted to lowercase (by performing Unicode lowercase mapping).
    If none of the characters of $(D s) were affected, $(D s) itself is
    returned.

    Params:
        s = the string to convert
+/
S toLower(S)(S s) @trusted pure
if (isSomeString!S)
{
    static import std.ascii;
    return s.toCase!(LowerTriple, std.ascii.toLower)();
}
// Non-template overloads for the most common cases, forwarding to the
// template above; they exist to reduce compile time.
@safe pure /*TODO nothrow*/
{
    string toLower(string s)
    {
        return toLower!string(s);
    }

    wstring toLower(wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // Compile-only check: a struct that `alias this`es to a string
        // must be accepted by toLower.

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
9037 
9038 
// Checks toLower(dchar) against std.ascii and the uppercase set, plus two
// string-level special cases. The matching toUpper test performs the same
// kind of std.format.format call under @safe, so this one is @safe as well.
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // every uppercase code point maps to itself or to a lowercase one
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("АЯ") == "ая");

    // U+1E9E lowercases to U+00DF, which in turn uppercases to two letters
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
9057 
// bugzilla 9629: converting a slice in place must write through to the
// array it was taken from
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6 .. 7];
    toUpperInPlace(thorn);
    assert(whole == "hello Þ world");
}
9066 
9067 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // ASCII only
    string source = "FoL";
    string lowered = toLower(source);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != source);

    char[] inPlace = source.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // U+0100 / U+0101 mixed with ASCII
    source = "A\u0100B\u0101d";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // U+0460 / U+0461 mixed with ASCII
    source = "A\u0460B\u0461d";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // U+0130 lowercases to two code points
    source = "\u0130";
    lowered = toLower(source);
    inPlace = source.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is source);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // bugzilla 12455
    dchar c = 'İ'; // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extending on the 12455 report - check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
9117 
9118 
/++
    Returns the uppercase equivalent of $(D c) if $(D c) is a Unicode
    lowercase $(CHARACTER); any other $(D c) is returned as is.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toUpper which takes a full
    string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).

    Params:
        c = the code point to convert
    Returns:
        the simple (1:1) uppercase mapping of $(D c), or $(D c) itself
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Fast path: everything below U+00AA is answered without the tables,
    // and only 'a' .. 'z' change there.
    if (c < 0xAA)
    {
        if (c >= 'a' && c <= 'z')
            return c - 32;
        return c;
    }

    // ushort.max marks "no simple mapping"
    immutable size_t slot = toUpperSimpleIndex(c);
    if (slot == ushort.max)
        return c;
    return toUpperTab(slot);
}
9152 
///
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // upper-case lazily, one code point at a time, and collect the
    // result in an appender
    auto sink = appender!(char[])();
    "hello".map!toUpper.copy(&sink);
    assert(sink.data == "HELLO");
}
9164 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));

    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // every lowercase code point maps to itself, to an uppercase code
    // point or to a titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar upper = toUpper(ch);
        assert(upper == ch || isUpper(upper) || titlecase[upper],
            format("%x -> %x", ch, upper));
    }
}
9181 
/++
    Returns a string identical to $(D s) except that all of its characters
    are converted to uppercase (by performing Unicode uppercase mapping).
    If none of the characters of $(D s) were affected, $(D s) itself is
    returned.

    Params:
        s = the string to convert
+/
S toUpper(S)(S s) @trusted pure
if (isSomeString!S)
{
    static import std.ascii;
    return s.toCase!(UpperTriple, std.ascii.toUpper)();
}
// Non-template overloads for the most common cases, forwarding to the
// template above; they exist to reduce compile time.
@safe pure /*TODO nothrow*/
{
    string toUpper(string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // Compile-only check: a struct that `alias this`es to a string
        // must be accepted by toUpper.

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}
9219 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string source;
    string raised;
    char[] inPlace;

    // ASCII only
    source = "FoL";
    raised = toUpper(source);
    inPlace = source.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised, inPlace);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is source);

    // U+0100 / U+0101 mixed with ASCII
    source = "a\u0100B\u0101d";
    raised = toUpper(source);
    inPlace = source.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is source);

    // U+0460 / U+0461 mixed with ASCII
    source = "a\u0460B\u0461d";
    raised = toUpper(source);
    inPlace = source.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is source);
}
9248 
// Cross-checks toLower/toUpper against toLowerInPlace/toUpperInPlace for
// all three string types, on single samples and on concatenations of them.
@system unittest
{
    // Converts `s` with all four functions and compares each result with
    // the expected upper (`trueUp`) and lower (`trueLow`) forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // takes three ranges: the source, the actual result, the expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` has three specifiers, so the source has to be passed as well;
        // with only two arguments a failing check would raise a format
        // error instead of printing the diagnostic
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) lowInp, cast(ubyte[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(ubyte[]) s, cast(ubyte[]) upInp, cast(ubyte[]) trueUp));
    }
    foreach (S; AliasSeq!(dstring, wstring, string))
    {

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these two samples the in-place functions must keep working
        // in the original buffer
        foreach (val; AliasSeq!(easy, good))
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }
}
9305 
9306 
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is answered by the table
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below that only the ASCII letters count
    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
}
9331 
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the set must be reported as alphabetic
    foreach (ch; alphabetic.byCodepoint)
        assert(isAlpha(ch));

    // set membership and isAlpha must agree on the first 0x4000 code points
    foreach (ch; 0 .. 0x4000)
        assert((ch in alphabetic) == isAlpha(ch));
}
9340 
9341 
/++
    Returns whether $(D c) is a Unicode mark
    (general Unicode category: Mn, Me, Mc).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // plain table lookup, there is no fast path for this one
    return markTrie[c];
}
9351 
@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the set must be reported as a mark
    foreach (ch; marks.byCodepoint)
        assert(isMark(ch));

    // set membership and isMark must agree on the first 0x4000 code points
    foreach (ch; 0 .. 0x4000)
        assert((ch in marks) == isMark(ch));
}
9360 
/++
    Returns whether $(D c) is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // everything above ASCII is answered by the table
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits
    return '0' <= c && c <= '9';
}
9378 
@safe unittest
{
    auto numbers = unicode("N");

    // every member of the set must be reported as a number
    foreach (ch; numbers.byCodepoint)
        assert(isNumber(ch));

    // set membership and isNumber must agree on the first 0x4000 code points
    foreach (ch; 0 .. 0x4000)
        assert((ch in numbers) == isNumber(ch));
}
9387 
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // outside ASCII combine the two Unicode predicates
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // optimization for ascii case
    return std.ascii.isAlphaNum(c);
}
9413 
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set must be accepted
    foreach (ch; numbers.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; alphabetic.byCodepoint)
        assert(isAlphaNum(ch));

    // the union of both sets and isAlphaNum must agree on the first
    // 0x4000 code points
    foreach (ch; 0 .. 0x4000)
    {
        assert(((ch in numbers) || (ch in alphabetic)) == isAlphaNum(ch));
    }
}
9430 
/++
    Returns whether $(D c) is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // everything above ASCII is answered by the table
    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}
9450 
@safe unittest
{
    // a handful of fixed samples: five from ASCII, two from Latin-1
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));

    // every member of the P set must be accepted
    auto punctuation = unicode("P");
    foreach (ch; punctuation.byCodepoint)
        assert(isPunctuation(ch));
}
9463 
/++
    Returns whether $(D c) is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    // plain table lookup, there is no fast path for this one
    return symbolTrie[c];
}
9473 
@safe unittest
{
    import std.format : format;

    // a handful of fixed samples
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));

    // every member of the S set must be accepted
    auto symbols = unicode("S");
    foreach (ch; symbols.byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}
9484 
/++
    Returns whether $(D c) is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isSpaceGen; // generated file

    return isSpaceGen(c);
}
9497 
@safe unittest
{
    assert(isSpace('\u0020'));

    // every member of Zs must be accepted
    auto separators = unicode.Zs;
    foreach (ch; separators.byCodepoint)
        assert(isSpace(ch));

    // Zs membership and isSpace must agree on the first 0x1000 code points
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == separators[ch]);
}
9507 
9508 
/++
    Returns whether $(D c) is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // plain table lookup, there is no fast path for this one
    return graphicalTrie[c];
}
9519 
9520 
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // every member of the set must be accepted
    foreach (ch; graphical.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));

    // set membership and isGraphical must agree on the first 0x4000 code points
    foreach (ch; 0 .. 0x4000)
        assert((ch in graphical) == isGraphical(ch));
}
9530 
9531 
/++
    Returns whether $(D c) is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isControlGen; // generated file

    return isControlGen(c);
}
9542 
@safe unittest
{
    // fixed samples: one ASCII control, one above ASCII, one non-control
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    // every member of Cc must be accepted
    auto controls = unicode.Cc;
    foreach (ch; controls.byCodepoint)
        assert(isControl(ch));

    // Cc membership and isControl must agree on the first 0x1000 code points
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == controls[ch]);
}
9554 
9555 
/++
    Returns whether $(D c) is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isFormatGen; // generated file

    return isFormatGen(c);
}
9566 
9567 
@safe unittest
{
    // fixed sample
    assert(isFormat('\u00AD'));

    // every member of the Format set must be accepted
    auto formatting = unicode("Format");
    foreach (ch; formatting.byCodepoint)
        assert(isFormat(ch));
}
9574 
// Code points for private use and surrogates are not likely to change in the near future;
// if need be, they can be generated from Unicode data as well.
9577 
/++
    Returns whether $(D c) is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;

    // U+F0000 .. U+FFFFD
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;

    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
9589 
/++
    Returns whether $(D c) is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // U+D800 .. U+DFFF, both halves of a pair
    immutable belowRange = c < 0xD800;
    immutable aboveRange = c > 0xDFFF;
    return !(belowRange || aboveRange);
}
9599 
/++
    Returns whether $(D c) is a Unicode high surrogate (lead surrogate).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // U+D800 .. U+DBFF
    immutable belowRange = c < 0xD800;
    immutable aboveRange = c > 0xDBFF;
    return !(belowRange || aboveRange);
}
9608 
/++
    Returns whether $(D c) is a Unicode low surrogate (trail surrogate).

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // U+DC00 .. U+DFFF
    immutable belowRange = c < 0xDC00;
    immutable aboveRange = c > 0xDFFF;
    return !(belowRange || aboveRange);
}
9617 
/++
    Returns whether $(D c) is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)

    Params:
        c = the code point to test
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // plain table lookup, there is no fast path for this one
    return nonCharacterTrie[c];
}
9628 
@safe unittest
{
    // every member of Cn must be reported as a non-character
    auto unassigned = unicode("Cn");
    foreach (ch; unassigned.byCodepoint)
        assert(isNonCharacter(ch));
}
9635 
9636 private:
9637 // load static data from pre-generated tables into usable datastructures
9638 
9639 
// Builds a CodepointSet from interval data in the compressed form
// accepted by decompressIntervals.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
9644 
// Wraps the offsets, sizes and data of a pre-generated TrieEntry
// in a const CodepointTrie with the same parameters.
@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
9649 
9650 @safe pure nothrow @nogc @property
9651 {
9652     import std.internal.unicode_tables; // generated file
9653 
9654     // It's important to use auto return here, so that the compiler
9655     // only runs semantic on the return type if the function gets
9656     // used. Also these are functions rather than templates to not
9657     // increase the object size of the caller.
lowerCaseTrie()9658     auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; }
upperCaseTrie()9659     auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; }
simpleCaseTrie()9660     auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; }
fullCaseTrie()9661     auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; }
alphaTrie()9662     auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; }
markTrie()9663     auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; }
numberTrie()9664     auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; }
punctuationTrie()9665     auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; }
symbolTrie()9666     auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; }
graphicalTrie()9667     auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; }
nonCharacterTrie()9668     auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; }
9669 
9670     //normalization quick-check tables
nfcQCTrie()9671     auto nfcQCTrie()
9672     {
9673         import std.internal.unicode_norm : nfcQCTrieEntries;
9674         static immutable res = asTrie(nfcQCTrieEntries);
9675         return res;
9676     }
9677 
nfdQCTrie()9678     auto nfdQCTrie()
9679     {
9680         import std.internal.unicode_norm : nfdQCTrieEntries;
9681         static immutable res = asTrie(nfdQCTrieEntries);
9682         return res;
9683     }
9684 
nfkcQCTrie()9685     auto nfkcQCTrie()
9686     {
9687         import std.internal.unicode_norm : nfkcQCTrieEntries;
9688         static immutable res = asTrie(nfkcQCTrieEntries);
9689         return res;
9690     }
9691 
nfkdQCTrie()9692     auto nfkdQCTrie()
9693     {
9694         import std.internal.unicode_norm : nfkdQCTrieEntries;
9695         static immutable res = asTrie(nfkdQCTrieEntries);
9696         return res;
9697     }
9698 
9699     //grapheme breaking algorithm tables
mcTrie()9700     auto mcTrie()
9701     {
9702         import std.internal.unicode_grapheme : mcTrieEntries;
9703         static immutable res = asTrie(mcTrieEntries);
9704         return res;
9705     }
9706 
graphemeExtendTrie()9707     auto graphemeExtendTrie()
9708     {
9709         import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
9710         static immutable res = asTrie(graphemeExtendTrieEntries);
9711         return res;
9712     }
9713 
hangLV()9714     auto hangLV()
9715     {
9716         import std.internal.unicode_grapheme : hangulLVTrieEntries;
9717         static immutable res = asTrie(hangulLVTrieEntries);
9718         return res;
9719     }
9720 
hangLVT()9721     auto hangLVT()
9722     {
9723         import std.internal.unicode_grapheme : hangulLVTTrieEntries;
9724         static immutable res = asTrie(hangulLVTTrieEntries);
9725         return res;
9726     }
9727 
9728     // tables below are used for composition/decomposition
combiningClassTrie()9729     auto combiningClassTrie()
9730     {
9731         import std.internal.unicode_comp : combiningClassTrieEntries;
9732         static immutable res = asTrie(combiningClassTrieEntries);
9733         return res;
9734     }
9735 
compatMappingTrie()9736     auto compatMappingTrie()
9737     {
9738         import std.internal.unicode_decomp : compatMappingTrieEntries;
9739         static immutable res = asTrie(compatMappingTrieEntries);
9740         return res;
9741     }
9742 
canonMappingTrie()9743     auto canonMappingTrie()
9744     {
9745         import std.internal.unicode_decomp : canonMappingTrieEntries;
9746         static immutable res = asTrie(canonMappingTrieEntries);
9747         return res;
9748     }
9749 
compositionJumpTrie()9750     auto compositionJumpTrie()
9751     {
9752         import std.internal.unicode_comp : compositionJumpTrieEntries;
9753         static immutable res = asTrie(compositionJumpTrieEntries);
9754         return res;
9755     }
9756 
9757     //case conversion tables
toUpperIndexTrie()9758     auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; }
toLowerIndexTrie()9759     auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; }
toTitleIndexTrie()9760     auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; }
9761     //simple case conversion tables
toUpperSimpleIndexTrie()9762     auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; }
toLowerSimpleIndexTrie()9763     auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; }
toTitleSimpleIndexTrie()9764     auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; }
9765 
9766 }
9767 
9768 }// version (!std_uni_bootstrap)
9769