1#
2#
3#            Nim's Runtime Library
4#        (c) Copyright 2012 Andreas Rumpf
5#
6#    See the file "copying.txt", included in this
7#    distribution, for details about the copyright.
8#
9
10## This module provides support to handle the Unicode UTF-8 encoding.
11##
12## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
13## procedures for ``seq[Rune]`` in this module because the generic variants
14## of these procedures in the system module already work with it.
15##
16## The current version is compatible with Unicode v12.0.0.
17##
18## **See also:**
19## * `strutils module <strutils.html>`_
20## * `unidecode module <unidecode.html>`_
21## * `encodings module <encodings.html>`_
22
23include "system/inclrtl"
24
type
  RuneImpl = int32 # underlying type of Rune; private to this module
  Rune* = distinct RuneImpl ## \
    ## Type that can hold a single Unicode code point.
    ##
    ## A Rune may be composed with other Runes to a character on the screen.
    ## `RuneImpl` is the underlying type used to store Runes, currently `int32`.
    ##
    ## Because the type is `distinct`, integers are not implicitly accepted
    ## where a Rune is expected; convert explicitly, e.g. ``Rune(0x20AC)``.
32
template ones(n: untyped): untyped =
  ## Expands to an integer with the lowest ``n`` bits set, e.g. ``ones(6)``
  ## is ``0b11_1111``. Used as a bit mask by the UTF-8 codecs below.
  ((1 shl n) - 1)
34
proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
  ## Counts the runes in ``s``.
  ##
  ## Only lead bytes are inspected: each one announces how many bytes its
  ## sequence occupies and that many bytes are skipped without validation.
  ## A byte that is not a recognised lead byte counts as one rune.
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeLen == 6
    ## note: a.len == 8

  var pos = 0
  while pos < s.len:
    let lead = uint(s[pos])
    let width =
      if lead <= 127: 1
      elif lead shr 5 == 0b110: 2
      elif lead shr 4 == 0b1110: 3
      elif lead shr 3 == 0b11110: 4
      elif lead shr 2 == 0b111110: 5
      elif lead shr 1 == 0b1111110: 6
      else: 1
    pos += width
    inc result
53
proc runeLenAt*(s: string, i: Natural): int =
  ## Returns how many bytes the rune whose lead byte is ``s[i]`` occupies,
  ## judged from that lead byte alone (1 to 6). A byte that is not a valid
  ## lead byte (for example a continuation byte) yields 1.
  ##
  ## See also:
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeLenAt(0) == 1
    doAssert a.runeLenAt(1) == 2

  let lead = uint(s[i])
  if lead <= 127:
    return 1
  # An n-byte sequence starts with n one-bits followed by a zero bit, so the
  # top n+1 bits of the lead byte must equal 2^(n+1) - 2.
  for n in 2 .. 6:
    if lead shr (7 - n) == (1'u shl (n + 1)) - 2:
      return n
  result = 1
71
# U+FFFD REPLACEMENT CHARACTER: what `fastRuneAt` yields when a multi-byte
# sequence would run past the end of the string.
const replRune = Rune(0xFFFD)

template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
  ## Returns the rune ``s[i]`` in ``result``.
  ##
  ## If ``doInc == true`` (default), ``i`` is incremented by the number
  ## of bytes that have been processed.
  ##
  ## The lead byte ``s[i]`` selects a sequence length of 1 to 6 bytes. When
  ## the string is too short to hold the whole sequence, ``result`` is set to
  ## U+FFFD and only one byte is consumed. Continuation bytes are not
  ## validated: their low six bits are used as they are. A byte that matches
  ## no lead-byte pattern is returned as a rune holding the byte's value.
  ##
  ## This is a template, so the expressions passed for ``s`` and ``i`` occur
  ## several times in the expansion and are evaluated each time. ``doInc`` is
  ## tested with ``when`` and must therefore be a compile-time constant; with
  ## ``doInc == true`` the argument for ``i`` has to be a mutable variable.
  bind ones
  if uint(s[i]) <= 127:
    # 0xxxxxxx: plain ASCII
    result = Rune(uint(s[i]))
    when doInc: inc(i)
  elif uint(s[i]) shr 5 == 0b110:
    # 110xxxxx 10xxxxxx
    # assert(uint(s[i+1]) shr 6 == 0b10)
    if i <= s.len - 2:
      result = Rune((uint(s[i]) and (ones(5))) shl 6 or
                    (uint(s[i+1]) and ones(6)))
      when doInc: inc(i, 2)
    else:
      # sequence cut off by the end of the string
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 4 == 0b1110:
    # 1110xxxx + 2 continuation bytes
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    if i <= s.len - 3:
      result = Rune((uint(s[i]) and ones(4)) shl 12 or
                    (uint(s[i+1]) and ones(6)) shl 6 or
                    (uint(s[i+2]) and ones(6)))
      when doInc: inc(i, 3)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 3 == 0b11110:
    # 11110xxx + 3 continuation bytes
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    if i <= s.len - 4:
      result = Rune((uint(s[i]) and ones(3)) shl 18 or
                    (uint(s[i+1]) and ones(6)) shl 12 or
                    (uint(s[i+2]) and ones(6)) shl 6 or
                    (uint(s[i+3]) and ones(6)))
      when doInc: inc(i, 4)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 2 == 0b111110:
    # 111110xx + 4 continuation bytes (legacy 5-byte form)
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    # assert(uint(s[i+4]) shr 6 == 0b10)
    if i <= s.len - 5:
      result = Rune((uint(s[i]) and ones(2)) shl 24 or
                (uint(s[i+1]) and ones(6)) shl 18 or
                (uint(s[i+2]) and ones(6)) shl 12 or
                (uint(s[i+3]) and ones(6)) shl 6 or
                (uint(s[i+4]) and ones(6)))
      when doInc: inc(i, 5)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 1 == 0b1111110:
    # 1111110x + 5 continuation bytes (legacy 6-byte form)
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    # assert(uint(s[i+4]) shr 6 == 0b10)
    # assert(uint(s[i+5]) shr 6 == 0b10)
    if i <= s.len - 6:
      result = Rune((uint(s[i]) and ones(1)) shl 30 or
                    (uint(s[i+1]) and ones(6)) shl 24 or
                    (uint(s[i+2]) and ones(6)) shl 18 or
                    (uint(s[i+3]) and ones(6)) shl 12 or
                    (uint(s[i+4]) and ones(6)) shl 6 or
                    (uint(s[i+5]) and ones(6)))
      when doInc: inc(i, 6)
    else:
      result = replRune
      when doInc: inc(i)
  else:
    # Not a lead byte (e.g. a stray continuation byte): pass the raw byte
    # value through and consume exactly one byte.
    result = Rune(uint(s[i]))
    when doInc: inc(i)
151
proc runeAt*(s: string, i: Natural): Rune =
  ## Decodes and returns the rune that starts at **byte index** ``i`` of
  ## ``s``. ``i`` is a byte offset, not a rune position.
  ##
  ## See also:
  ## * `runeAtPos proc <#runeAtPos,string,int>`_
  ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeAt(1) == "ñ".runeAt(0)
    doAssert a.runeAt(2) == "ñ".runeAt(1)
    doAssert a.runeAt(3) == "y".runeAt(0)
  var decoded: Rune
  # doInc = false: `i` is an immutable parameter and must not be advanced.
  fastRuneAt(s, i, decoded, false)
  result = decoded
165
proc validateUtf8*(s: string): int =
  ## Checks whether ``s`` holds valid UTF-8 data. Returns ``-1`` if it does,
  ## otherwise the byte position where the first invalid sequence starts.
  ##
  ## Sequences of one to four bytes are accepted. Rejected are: two-byte
  ## lead bytes below ``0xC2`` (overlong ASCII), sequences cut off by the end
  ## of the string, continuation bytes without the ``10xxxxxx`` pattern, and
  ## any other lead byte.
  ##
  ## See also:
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  let total = s.len
  var pos = 0
  while pos < total:
    let lead = uint(s[pos])
    var width = 0
    if lead <= 127:
      width = 1
    elif lead shr 5 == 0b110:
      if lead < 0xc2: return pos # Catch overlong ascii representations.
      width = 2
    elif lead shr 4 == 0b1110:
      width = 3
    elif lead shr 3 == 0b11110:
      width = 4
    else:
      return pos
    # The whole sequence must fit into the string ...
    if pos + width > total:
      return pos
    # ... and every trailing byte must be a continuation byte.
    for k in 1 ..< width:
      if uint(s[pos + k]) shr 6 != 0b10:
        return pos
    pos += width
  result = -1
196
template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  ## starting at position ``pos``.
  ##
  ## If ``doInc == true`` (default), ``pos`` is incremented
  ## by the number of bytes that have been processed.
  ##
  ## To be the most efficient, make sure ``s`` is preallocated
  ## with an additional amount equal to the byte length of ``c``.
  ##
  ## Note that the length of ``s`` is always set to ``pos`` plus the number
  ## of bytes written, so ``s`` grows if needed and anything that was stored
  ## behind the written bytes is cut off.
  ##
  ## This is a template: the expression given for ``pos`` is evaluated more
  ## than once, including after ``s`` has been resized, so pass a variable
  ## or constant rather than an expression depending on ``s.len``.
  ## ``doInc`` is tested with ``when`` and must be a compile-time constant.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  var i = RuneImpl(c)
  # `<=%` compares as unsigned, so a negative value never matches a branch
  # below and ends up in the final `else`.
  if i <=% 127:
    # 1 byte: 0xxxxxxx
    s.setLen(pos+1)
    s[pos+0] = chr(i)
    when doInc: inc(pos)
  elif i <=% 0x07FF:
    # 2 bytes: 110xxxxx 10xxxxxx
    s.setLen(pos+2)
    s[pos+0] = chr((i shr 6) or 0b110_00000)
    s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
    when doInc: inc(pos, 2)
  elif i <=% 0xFFFF:
    # 3 bytes: 1110xxxx + 2 continuation bytes
    s.setLen(pos+3)
    s[pos+0] = chr(i shr 12 or 0b1110_0000)
    s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 3)
  elif i <=% 0x001FFFFF:
    # 4 bytes: 11110xxx + 3 continuation bytes
    s.setLen(pos+4)
    s[pos+0] = chr(i shr 18 or 0b1111_0000)
    s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 4)
  elif i <=% 0x03FFFFFF:
    # 5 bytes (legacy form): 111110xx + 4 continuation bytes
    s.setLen(pos+5)
    s[pos+0] = chr(i shr 24 or 0b111110_00)
    s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 5)
  elif i <=% 0x7FFFFFFF:
    # 6 bytes (legacy form): 1111110x + 5 continuation bytes
    s.setLen(pos+6)
    s[pos+0] = chr(i shr 30 or 0b1111110_0)
    s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 6)
  else:
    # Not encodable: nothing is written, `s` and `pos` stay untouched.
    discard # error, exception?
253
proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  ## Returns the UTF-8 encoding of the rune ``c`` as a new string.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `utf8 iterator <#utf8.i,string>`_
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeAt(1).toUTF8 == "ñ"

  var encoded = ""
  # Write at byte 0; the position is a literal, so it must not be advanced.
  fastToUTF8Copy(c, encoded, 0, false)
  result = encoded
268
proc add*(s: var string; c: Rune) =
  ## Appends the UTF-8 encoding of the rune ``c`` to the string ``s``.
  runnableExamples:
    var s = "abc"
    let c = "ä".runeAt(0)
    s.add(c)
    doAssert s == "abcä"

  # The end position has to be captured before the call: the template
  # resizes `s` and then reads its position argument again.
  let tail = s.len
  fastToUTF8Copy(c, s, tail, false)
279
proc `$`*(rune: Rune): string =
  ## Stringifies a rune as its UTF-8 encoding; an alias for
  ## `toUTF8 <#toUTF8,Rune>`_.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  result = toUTF8(rune)
287
proc `$`*(runes: seq[Rune]): string =
  ## Concatenates the UTF-8 encodings of all ``runes`` into one string.
  ##
  ## See also:
  ## * `toRunes <#toRunes,string>`_ for a reverse operation
  runnableExamples:
    let
      someString = "öÑ"
      someRunes = toRunes(someString)
    doAssert $someRunes == someString

  result = ""
  for idx in 0 ..< runes.len:
    add(result, runes[idx])
302
proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int =
  ## Returns the byte position of rune
  ## at position ``pos`` in ``s`` with an optional start byte position.
  ## Returns the special value -1 if it runs out of the string.
  ##
  ## ``pos`` is counted in runes relative to the byte offset ``start``.
  ## With ``pos == 0`` the result is ``start`` itself, unchecked. For any
  ## ``pos > 0``, ``-1`` is returned as soon as the walk reaches or starts
  ## at the end of ``s``; this includes the empty string.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeOffset(1) == 1
    doAssert a.runeOffset(3) == 4
    doAssert a.runeOffset(4) == 6
    doAssert a.runeOffset(6) == -1
    doAssert "".runeOffset(1) == -1

  var
    i = 0
    o = start
  while i < pos:
    # Guard the lead-byte read: without it an empty string, or a `start`
    # at/after the end, would index out of bounds instead of yielding -1.
    if o >= s.len:
      return -1
    o += runeLenAt(s, o)
    if o >= s.len:
      return -1
    inc i
  return o
329
proc runeReverseOffset*(s: string, rev: Positive): (int, int) =
  ## Returns a tuple with the byte offset of the
  ## rune at position ``rev`` in ``s``, counting
  ## from the end (starting with 1) and the total
  ## number of runes in the string.
  ##
  ## Returns a negative value for offset if there are too few runes in
  ## the string to satisfy the request. The value is then
  ## ``runeLen(s) - rev``, i.e. minus the number of missing runes.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  let
    wanted = rev.int
    total = s.runeLen
  # Compare directly instead of computing `2*rev - total`, which overflows
  # for very large `rev`.
  if wanted > total:
    return (total - wanted, total)
  # The rune asked for is preceded by `total - wanted` runes; add up their
  # byte lengths to get its offset.
  var
    byteOffset = 0
    skipped = 0
  while skipped < total - wanted:
    byteOffset += runeLenAt(s, byteOffset)
    inc skipped
  result = (byteOffset, total)
357
proc runeAtPos*(s: string, pos: int): Rune =
  ## Returns the rune at position ``pos``, where ``pos`` counts runes (not
  ## bytes) from the start of ``s``.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeAt proc <#runeAt,string,Natural>`_
  ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  # Resolve the byte offset once. `fastRuneAt` is a template that repeats its
  # index argument for every byte it reads, so passing the call expression
  # directly would redo the linear scan each time.
  let byteOffset = runeOffset(s, pos)
  fastRuneAt(s, byteOffset, result, false)
370
proc runeStrAtPos*(s: string, pos: Natural): string =
  ## Returns the rune at rune position ``pos`` as a UTF-8 encoded string,
  ## i.e. the slice of ``s`` holding that rune's bytes.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeAt proc <#runeAt,string,Natural>`_
  ## * `runeAtPos proc <#runeAtPos,string,int>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  let first = runeOffset(s, pos)
  let last = first + runeLenAt(s, first) - 1
  result = s[first .. last]
384
proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
  ## Returns the UTF-8 substring starting at code point ``pos``
  ## with ``len`` code points.
  ##
  ## If ``pos`` or ``len`` is negative they count from
  ## the end of the string. If ``len`` is not given it means the longest
  ## possible string.
  ##
  ## A negative ``pos`` that reaches before the first rune is clamped to the
  ## start of the string. A ``len`` larger than the number of runes available
  ## after ``pos`` yields everything up to the end, and a range that comes
  ## out empty or inverted yields ``""``.
  runnableExamples:
    let s = "Hänsel  ««: 10,00€"
    doAssert(runeSubStr(s, 0, 2) == "Hä")
    doAssert(runeSubStr(s, 10, 1) == ":")
    doAssert(runeSubStr(s, -6) == "10,00€")
    doAssert(runeSubStr(s, 10) == ": 10,00€")
    doAssert(runeSubStr(s, 12, 5) == "10,00")
    doAssert(runeSubStr(s, -6, 3) == "10,")
    doAssert(runeSubStr(s, -6, 6) == "10,00€")
    doAssert(runeSubStr(s, -6, -7) == "")

  if s.len == 0:
    return ""
  if pos < 0:
    let (rawStart, total) = runeReverseOffset(s, -pos)
    # A negative offset signals that `pos` lies before the first rune; start
    # at byte 0 so every branch below works with a valid offset.
    let first = max(rawStart, 0)
    if len >= total:
      result = s.substr(first, s.len-1)
    else:
      # Number of runes to take. For a negative `len` this is the distance
      # between the end rune index (`total + len`) and the clamped start
      # rune index.
      let count =
        if len < 0: (total + len) - max(total + pos, 0)
        else: len
      if count <= 0:
        result = ""
      else:
        var stop = runeOffset(s, count, first)
        # -1 means the request reaches the end of the string; treat it like
        # the non-negative `pos` branch does instead of slicing up to -2.
        if stop < 0:
          stop = s.len
        result = s.substr(first, stop-1)
  else:
    let o = runeOffset(s, pos)
    if o < 0:
      # `pos` is at or beyond the last rune.
      result = ""
    elif len == int.high:
      result = s.substr(o, s.len-1)
    elif len < 0:
      # End is given relative to the end of the string.
      let (e, rl) = runeReverseOffset(s, -len)
      discard rl
      if e <= 0:
        result = ""
      else:
        result = s.substr(o, e-1)
    else:
      var e = runeOffset(s, len, o)
      if e < 0:
        e = s.len
      result = s.substr(o, e-1)
431
proc `<=%`*(a, b: Rune): bool =
  ## Returns true if the code point of `a` is less than or equal to the code
  ## point of `b`. Both are compared as unsigned integers.
  runnableExamples:
    let
      a = "ú".runeAt(0)
      b = "ü".runeAt(0)
    doAssert a <=% b
  let
    lhs = int(a)
    rhs = int(b)
  result = lhs <=% rhs
440
proc `<%`*(a, b: Rune): bool =
  ## Returns true if the code point of `a` is strictly less than the code
  ## point of `b`. Both are compared as unsigned integers.
  runnableExamples:
    let
      a = "ú".runeAt(0)
      b = "ü".runeAt(0)
    doAssert a <% b
  let
    lhs = int(a)
    rhs = int(b)
  result = lhs <% rhs
449
proc `==`*(a, b: Rune): bool =
  ## Returns true if `a` and `b` hold the same code point.
  result = int(a) == int(b)
453
454
455include "includes/unicode_ranges"
456
proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
  ## Searches ``tab``, viewed as ``len`` records of ``stride`` integers each,
  ## by the first integer of every record. Returns the index into ``tab`` of
  ## the last probed record whose key is ``<= c``, or ``-1`` if ``len`` is 0
  ## or ``c`` is smaller than the key that remains.
  ##
  ## Callers still have to check whether ``c`` really belongs to the record.
  ## NOTE(review): presumably the tables are sorted ascending by key, which
  ## the halving below relies on; confirm against `unicode_ranges`.
  var
    remaining = len
    base = 0
  while remaining > 1:
    let half = remaining div 2
    let probe = base + half*stride
    if c < tab[probe]:
      # target lies in the lower half
      remaining = half
    else:
      base = probe
      remaining -= half
  result =
    if remaining == 0 or c < tab[base]: -1
    else: base
471
proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Returns the lower-case counterpart of ``c``, or ``c`` itself if no
  ## mapping is found. This works for any rune.
  ##
  ## If possible, prefer ``toLower`` over ``toUpper``.
  ##
  ## See also:
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  let code = RuneImpl(c)
  # Range table: records of (first, last, delta + 500).
  let rangeIdx = binarySearch(code, toLowerRanges,
                              len(toLowerRanges) div 3, 3)
  if rangeIdx >= 0 and code >= toLowerRanges[rangeIdx] and
      code <= toLowerRanges[rangeIdx+1]:
    return Rune(code + toLowerRanges[rangeIdx+2] - 500)
  # Singlet table: records of (code point, delta + 500).
  let singleIdx = binarySearch(code, toLowerSinglets,
                               len(toLowerSinglets) div 2, 2)
  if singleIdx >= 0 and code == toLowerSinglets[singleIdx]:
    return Rune(code + toLowerSinglets[singleIdx+1] - 500)
  result = c
489
proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Returns the upper-case counterpart of ``c``, or ``c`` itself if no
  ## mapping is found. This works for any rune.
  ##
  ## If possible, prefer ``toLower`` over ``toUpper``.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  let code = RuneImpl(c)
  # Range table: records of (first, last, delta + 500).
  let rangeIdx = binarySearch(code, toUpperRanges,
                              len(toUpperRanges) div 3, 3)
  if rangeIdx >= 0 and code >= toUpperRanges[rangeIdx] and
      code <= toUpperRanges[rangeIdx+1]:
    return Rune(code + toUpperRanges[rangeIdx+2] - 500)
  # Singlet table: records of (code point, delta + 500).
  let singleIdx = binarySearch(code, toUpperSinglets,
                               len(toUpperSinglets) div 2, 2)
  if singleIdx >= 0 and code == toUpperSinglets[singleIdx]:
    return Rune(code + toUpperSinglets[singleIdx+1] - 500)
  result = c
507
proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Returns the title-case counterpart of ``c``, or ``c`` itself if the
  ## title-case table has no entry for it.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  let code = RuneImpl(c)
  # Singlet table: records of (code point, delta + 500).
  let idx = binarySearch(code, toTitleSinglets,
                         len(toTitleSinglets) div 2, 2)
  result =
    if idx >= 0 and code == toTitleSinglets[idx]:
      Rune(code + toTitleSinglets[idx+1] - 500)
    else:
      c
520
proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a lower case rune, that is, if the to-upper
  ## tables contain a mapping for it.
  ##
  ## If possible, prefer ``isLower`` over ``isUpper``.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  let code = RuneImpl(c)
  # Note: toUpperRanges is correct here! A rune is lower case exactly when
  # it can be converted to upper case.
  let rangeIdx = binarySearch(code, toUpperRanges,
                              len(toUpperRanges) div 3, 3)
  if rangeIdx >= 0 and code >= toUpperRanges[rangeIdx] and
      code <= toUpperRanges[rangeIdx+1]:
    return true
  let singleIdx = binarySearch(code, toUpperSinglets,
                               len(toUpperSinglets) div 2, 2)
  result = singleIdx >= 0 and code == toUpperSinglets[singleIdx]
538
proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is an upper case rune, that is, if the to-lower
  ## tables contain a mapping for it.
  ##
  ## If possible, prefer ``isLower`` over ``isUpper``.
  ##
  ## See also:
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  let code = RuneImpl(c)
  # Note: toLowerRanges is correct here! A rune is upper case exactly when
  # it can be converted to lower case.
  let rangeIdx = binarySearch(code, toLowerRanges,
                              len(toLowerRanges) div 3, 3)
  if rangeIdx >= 0 and code >= toLowerRanges[rangeIdx] and
      code <= toLowerRanges[rangeIdx+1]:
    return true
  let singleIdx = binarySearch(code, toLowerSinglets,
                               len(toLowerSinglets) div 2, 2)
  result = singleIdx >= 0 and code == toLowerSinglets[singleIdx]
558
proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is an *alpha* rune (i.e., a letter): either a
  ## cased rune or one listed in the alphabetic tables.
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  ## * `isCombining proc <#isCombining,Rune>`_
  if isUpper(c) or isLower(c):
    return true
  let code = RuneImpl(c)
  # Range table: records of (first, last).
  let rangeIdx = binarySearch(code, alphaRanges, len(alphaRanges) div 2, 2)
  if rangeIdx >= 0 and code >= alphaRanges[rangeIdx] and
      code <= alphaRanges[rangeIdx+1]:
    return true
  # Singlet table: one code point per record.
  let singleIdx = binarySearch(code, alphaSinglets, len(alphaSinglets), 1)
  result = singleIdx >= 0 and code == alphaSinglets[singleIdx]
577
proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode titlecase code point, which this
  ## module detects as a rune that counts as both upper and lower case.
  ##
  ## See also:
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  result = isUpper(c) and isLower(c)
588
proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode whitespace code point, i.e. if it
  ## falls into one of the ranges of the whitespace table.
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  let code = RuneImpl(c)
  # Range table: records of (first, last).
  let idx = binarySearch(code, spaceRanges, len(spaceRanges) div 2, 2)
  result = idx >= 0 and code >= spaceRanges[idx] and
    code <= spaceRanges[idx+1]
601
proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode combining code unit, i.e. lies in one
  ## of the ranges U+0300..U+036F, U+1AB0..U+1AFF, U+1DC0..U+1DFF,
  ## U+20D0..U+20FF or U+FE20..U+FE2F.
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  let code = RuneImpl(c)

  # Fast exit: nothing below U+0300 (in particular no ASCII) is combining.
  if code < 0x0300:
    return false
  result = code <= 0x036f or
    (code >= 0x1ab0 and code <= 0x1aff) or
    (code >= 0x1dc0 and code <= 0x1dff) or
    (code >= 0x20d0 and code <= 0x20ff) or
    (code >= 0xfe20 and code <= 0xfe2f)
618
template runeCheck(s, runeProc) =
  ## Shared body of the string predicates (`isAlpha`, `isSpace`): sets the
  ## caller's ``result`` to true if ``s`` is non-empty and ``runeProc``
  ## holds for every rune of ``s``. Stops at the first rune that fails.
  result = len(s) != 0
  var
    pos = 0
    current: Rune
  while result and pos < len(s):
    fastRuneAt(s, pos, current, doInc = true)
    result = runeProc(current)
628
proc isAlpha*(s: string): bool {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns true if ``s`` is non-empty and consists of alphabetic runes
  ## only. An empty string yields false.
  runnableExamples:
    let a = "añyóng"
    doAssert a.isAlpha
  runeCheck(s, isAlpha)
636
proc isSpace*(s: string): bool {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns true if ``s`` is non-empty and consists of whitespace runes
  ## only. An empty string yields false.
  runnableExamples:
    let a = "\t\l \v\r\f"
    doAssert a.isSpace
  runeCheck(s, isWhiteSpace)
644
645
template convertRune(s, runeProc) =
  ## Shared body of the string case converters: decodes ``s`` rune by rune,
  ## maps each rune through ``runeProc`` and re-encodes it into the caller's
  ## ``result``.
  result = newString(len(s))
  var
    srcPos = 0
    dstPos = 0
    current: Rune
  while srcPos < len(s):
    fastRuneAt(s, srcPos, current, doInc = true)
    current = runeProc(current)
    # The encoder resizes `result` to end right after the bytes it writes.
    fastToUTF8Copy(current, result, dstPos, doInc = true)
657
proc toUpper*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns a copy of ``s`` in which every rune has been passed through
  ## the rune-level `toUpper <#toUpper,Rune>`_.
  runnableExamples:
    doAssert toUpper("abγ") == "ABΓ"
  convertRune(s, toUpper)
664
proc toLower*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns a copy of ``s`` in which every rune has been passed through
  ## the rune-level `toLower <#toLower,Rune>`_.
  runnableExamples:
    doAssert toLower("ABΓ") == "abγ"
  convertRune(s, toLower)
671
proc swapCase*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Returns a copy of ``s`` with the case of every rune flipped: upper-case
  ## runes become lower case and vice versa; all other runes are kept.
  runnableExamples:
    doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"

  result = newString(len(s))
  var
    srcPos = 0
    dstPos = 0
    current: Rune
  while srcPos < len(s):
    fastRuneAt(s, srcPos, current)
    let swapped =
      if current.isUpper(): current.toLower()
      elif current.isLower(): current.toUpper()
      else: current
    fastToUTF8Copy(swapped, result, dstPos, doInc = true)
693
proc capitalize*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Returns ``s`` with its first rune converted to upper case; the rest of
  ## the string is copied unchanged. An empty string yields ``""``.
  runnableExamples:
    doAssert capitalize("βeta") == "Βeta"

  if len(s) == 0:
    return ""
  var
    restStart = 0
    first: Rune
  # Decoding the first rune leaves `restStart` at the byte following it.
  fastRuneAt(s, restStart, first, doInc = true)
  result = toUTF8(toUpper(first))
  result.add substr(s, restStart)
707
# Compilers without the `effectsOf` pragma get a no-op user pragma of the
# same name so the annotation on `translate` below still compiles.
when not defined(nimHasEffectsOf):
  {.pragma: effectsOf.}

proc translate*(s: string, replacements: proc(key: string): string): string {.
  rtl, extern: "nuc$1", effectsOf: replacements.} =
  ## Translates the words of ``s``: every maximal run of non-whitespace
  ## runes is passed to ``replacements`` and substituted by the returned
  ## string. Whitespace runes are copied to the output unchanged.
  ##
  ## ``replacements`` is any proc that takes a word and returns
  ## a new word to fill its place.
  runnableExamples:
    proc wordToNumber(s: string): string =
      case s
      of "one": "1"
      of "two": "2"
      else: s
    let a = "one two three four"
    doAssert a.translate(wordToNumber) == "1 2 three four"

  # Reserve the input's length up front: shorter or equally long output then
  # needs no further allocation.
  result = newStringOfCap(s.len)
  var
    pos = 0
    wordBegin = 0
    insideWord = false
    current: Rune

  while pos < len(s):
    let runeStart = pos
    fastRuneAt(s, pos, current)
    if current.isWhiteSpace():
      if insideWord:
        # The whitespace ends the word collected so far.
        result.add(replacements(s[wordBegin ..< runeStart]))
        insideWord = false
      result.add($current)
    elif not insideWord:
      # First rune of a new word: remember where it starts.
      insideWord = true
      wordBegin = runeStart

  if insideWord and wordBegin < len(s):
    # The string ended inside a word; flush it.
    result.add(replacements(s[wordBegin .. ^1]))
763
proc title*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Returns a copy of ``s`` in which the first rune of every word has been
  ## converted to upper case. Words are separated by whitespace runes.
  runnableExamples:
    doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"

  result = newString(len(s))
  var
    srcPos = 0
    dstPos = 0
    atWordStart = true
    current: Rune

  while srcPos < len(s):
    fastRuneAt(s, srcPos, current)
    if current.isWhiteSpace():
      # The next non-whitespace rune opens a new word.
      atWordStart = true
    elif atWordStart:
      current = current.toUpper()
      atWordStart = false
    fastToUTF8Copy(current, result, dstPos, doInc = true)
788
789
iterator runes*(s: string): Rune =
  ## Yields the runes of ``s`` one after another, in order.
  var
    pos = 0
    current: Rune
  while pos < len(s):
    fastRuneAt(s, pos, current, true)
    yield current
798
iterator utf8*(s: string): string =
  ## Yields each rune of ``s`` as its own UTF-8 encoded string, i.e. the
  ## string is cut at rune boundaries.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  var start = 0
  while start < s.len:
    let stop = start + runeLenAt(s, start)
    yield s[start ..< stop]
    start = stop
812
proc toRunes*(s: string): seq[Rune] =
  ## Decodes ``s`` and returns its runes as a sequence.
  ##
  ## See also:
  ## * `$ proc <#$,Rune>`_ for a reverse operation
  runnableExamples:
    let a = toRunes("aáä")
    doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]

  result = @[]
  var
    pos = 0
    current: Rune
  while pos < len(s):
    fastRuneAt(s, pos, current, true)
    result.add(current)
825
proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1".} =
  ## Compares two UTF-8 strings and ignores the case. Returns:
  ##
  ## | 0 if a == b
  ## | < 0 if a < b
  ## | > 0 if a > b
  ##
  ## The strings are compared rune by rune after lower-casing each rune.
  ## If one string is a case-insensitive prefix of the other, the shorter
  ## one sorts first.
  var i = 0
  var j = 0
  var ar, br: Rune
  while i < a.len and j < b.len:
    # slow path:
    fastRuneAt(a, i, ar)
    fastRuneAt(b, j, br)
    result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
    if result != 0: return
  # All compared runes matched. Decide by what is left undecoded rather than
  # by the total byte lengths: runes that are equal ignoring case may have
  # UTF-8 encodings of different lengths.
  result = (a.len - i) - (b.len - j)
842
proc reversed*(s: string): string =
  ## Returns the reverse of ``s``, interpreting it as runes.
  ##
  ## Unicode combining characters are correctly interpreted as well: a base
  ## rune and the combining runes following it are moved as one block whose
  ## internal byte order is kept. Combining runes at the very start of ``s``
  ## form a block of their own.
  runnableExamples:
    assert reversed("Reverse this!") == "!siht esreveR"
    assert reversed("先秦兩漢") == "漢兩秦先"
    assert reversed("as⃝df̅") == "f̅ds⃝a"
    assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"

  var
    i = 0
    lastI = 0
    newPos = len(s) - 1 # next byte of `result` to fill, moving backwards
    blockPos = 0        # first byte of the block not yet copied
    r: Rune

  template reverseUntil(pos) =
    # Copies the pending block s[blockPos ..< pos] to the free tail of
    # `result`. The bounds are inclusive of `blockPos`, so byte 0 is copied
    # even when the string starts with a combining rune.
    var j = pos - 1
    while j >= blockPos:
      result[newPos] = s[j]
      dec j
      dec newPos
    blockPos = pos

  result = newString(len(s))

  while i < len(s):
    lastI = i
    fastRuneAt(s, i, r, true)
    if not isCombining(r):
      # A new base rune starts here: flush the block that ended before it.
      reverseUntil(lastI)

  # Flush the final block.
  reverseUntil(len(s))
877
proc graphemeLen*(s: string; i: Natural): Natural =
  ## Returns the number of bytes taken by the rune starting at byte index
  ## ``i`` plus all combining runes that directly follow it. Returns 0 if
  ## ``i`` is not inside ``s``.
  runnableExamples:
    let a = "añyóng"
    doAssert a.graphemeLen(1) == 2 ## ñ
    doAssert a.graphemeLen(2) == 1
    doAssert a.graphemeLen(4) == 2 ## ó

  var pos = i.int
  if pos >= s.len:
    return
  var current: Rune
  # The rune at `i` always belongs to the grapheme.
  fastRuneAt(s, pos, current, true)
  result = pos-i
  # Extend over every combining rune that follows.
  while pos < s.len:
    fastRuneAt(s, pos, current, true)
    if not isCombining(current): break
    result = pos-i
896
proc lastRune*(s: string; last: int): (Rune, int) =
  ## Returns the last rune of ``s[0..last]`` together with its length in
  ## bytes.
  ##
  ## For a non-ASCII byte at ``last`` the continuation bytes are counted
  ## backwards and the rune is decoded from the byte in front of them.
  ## NOTE(review): this appears to assume that ``s[0..last]`` ends in a
  ## complete sequence with a lead byte; confirm how callers treat
  ## malformed input.
  if s[last] <= chr(127):
    return (Rune(s[last]), 1)
  var back = 0
  while last-back >= 0 and uint(s[last-back]) shr 6 == 0b10:
    inc(back)
  var decoded: Rune
  fastRuneAt(s, last-back, decoded, false)
  result = (decoded, back+1)
908
proc size*(r: Rune): int {.noSideEffect.} =
  ## Returns the number of bytes the UTF-8 encoding of the rune ``r`` takes
  ## (1 to 6). Values above ``0x7FFFFFFF`` are reported as 1.
  runnableExamples:
    let a = toRunes "aá"
    doAssert size(a[0]) == 1
    doAssert size(a[1]) == 2

  # Largest code point that fits into 1, 2, ... 6 bytes.
  const limits = [0x007F'u32, 0x07FF'u32, 0xFFFF'u32,
                  0x1FFFFF'u32, 0x3FFFFFF'u32, 0x7FFFFFFF'u32]
  let v = r.uint32
  for idx, limit in limits:
    if v <= limit:
      return idx + 1
  result = 1
924
# --------- Private helpers for the different split separators -----------
proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool =
  ## Returns true if the rune starting at byte ``index`` of ``s`` is one of
  ## ``seps``.
  var current: Rune
  fastRuneAt(s, index, current, false)
  result = current in seps
930
proc stringHasSep(s: string, index: int, sep: Rune): bool =
  ## Returns true if the rune starting at byte ``index`` of ``s`` equals
  ## ``sep``.
  var current: Rune
  fastRuneAt(s, index, current, false)
  result = sep == current
935
template splitCommon(s, sep, maxsplit: untyped) =
  ## Shared body of the split iterators: yields the pieces of ``s`` found
  ## between separator runes (as decided by ``stringHasSep``).
  ##
  ## At most ``maxsplit`` splits are made; once that budget is used up the
  ## rest of the string is yielded as the final piece. A negative
  ## ``maxsplit`` never reaches zero and so does not limit anything.
  ## An empty ``s`` yields nothing.
  let total = len(s)
  var
    cursor = 0
    remaining = maxsplit
  if total > 0:
    while cursor <= total:
      let pieceStart = cursor
      # Advance rune by rune up to the next separator or the end.
      while cursor < total and not stringHasSep(s, cursor, sep):
        inc(cursor, runeLenAt(s, cursor))
      let finalPiece = remaining == 0
      if finalPiece: cursor = total
      yield s[pieceStart .. (cursor - 1)]
      if finalPiece: break
      dec(remaining)
      # Step over the separator; at the end of the string step one past it
      # so the outer loop terminates.
      inc(cursor, if cursor < total: runeLenAt(s, cursor) else: 1)
953
iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
  maxsplit: int = -1): string =
  ## Splits the unicode string ``s`` into substrings using a group of separators.
  ##
  ## Every rune of ``s`` that is contained in ``seps`` acts as a separator on
  ## its own, so two adjacent separators yield an empty substring between
  ## them, and a trailing separator yields a final empty substring.
  ## An empty ``s`` yields nothing.
  ##
  ## If ``maxsplit`` is non-negative, at most ``maxsplit`` splits are made and
  ## the unsplit remainder of ``s`` is yielded as the last substring. A
  ## negative ``maxsplit`` (the default) does not limit the number of splits.
  runnableExamples:
    import std/sequtils

    assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
      @["hÃllo", "this", "is", "an", "example", "是"]

    # And the following code splits the same string using a sequence of Runes.
    assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
      @["añyóng", "hÃllo", "是", "example"]

    # example with a `Rune` separator and unused one `;`:
    assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]

    # Another example that splits a string containing a date.
    let date = "2012-11-20T22:08:08.398990"

    assert toSeq(split(date, " -:T".toRunes)) ==
      @["2012", "11", "20", "22", "08", "08.398990"]

  splitCommon(s, seps, maxsplit)
979
iterator splitWhitespace*(s: string): string =
  ## Splits a unicode string at whitespace runes.
  ##
  ## Each rune of ``s`` found in ``unicodeSpaces`` is treated as a separator
  ## on its own and the number of splits is not limited, so consecutive
  ## whitespace runes yield empty substrings. An empty ``s`` yields nothing.
  splitCommon(s, unicodeSpaces, -1)
983
template accResult(iter: untyped) =
  ## Resets the enclosing routine's ``result`` to an empty sequence and then
  ## appends every item produced by ``iter`` to it.
  result = @[]
  for item in iter:
    result.add item
987
proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
  rtl, extern: "ncuSplitWhitespace".} =
  ## Proc counterpart of the `splitWhitespace <#splitWhitespace.i,string>`_
  ## iterator: collects every substring it yields into a sequence.
  result = @[]
  for word in splitWhitespace(s):
    result.add word
993
iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
  ## Splits the unicode string ``s`` into substrings using a single separator.
  ## Substrings are separated by the rune ``sep``.
  ##
  ## Every occurrence of ``sep`` splits, so adjacent occurrences yield empty
  ## substrings (see the example below). An empty ``s`` yields nothing.
  ##
  ## If ``maxsplit`` is non-negative, at most ``maxsplit`` splits are made and
  ## the unsplit remainder of ``s`` is yielded as the last substring. A
  ## negative ``maxsplit`` (the default) does not limit the number of splits.
  runnableExamples:
    import std/sequtils

    assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
      @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]

  splitCommon(s, sep, maxsplit)
1004
proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
    seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
  ## Proc counterpart of the
  ## `split iterator <#split.i,string,openArray[Rune],int>`_: collects every
  ## substring the iterator yields into a sequence.
  result = @[]
  for piece in split(s, seps, maxsplit):
    result.add piece
1010
proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
  rtl, extern: "nucSplitRune".} =
  ## Proc counterpart of the `split iterator <#split.i,string,Rune,int>`_:
  ## collects every substring the iterator yields into a sequence.
  result = @[]
  for piece in split(s, sep, maxsplit):
    result.add piece
1016
proc strip*(s: string, leading = true, trailing = true,
            runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
            rtl, extern: "nucStrip".} =
  ## Strips leading or trailing ``runes`` from ``s`` and returns
  ## the resulting string.
  ##
  ## If ``leading`` is true (default), leading ``runes`` are stripped.
  ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
  ## If both are false, the string is returned unchanged.
  ##
  ## A string that consists only of ``runes`` is stripped to ``""`` as soon
  ## as at least one of the two flags is set.
  runnableExamples:
    let a = "\táñyóng   "
    doAssert a.strip == "áñyóng"
    doAssert a.strip(leading = false) == "\táñyóng"
    doAssert a.strip(trailing = false) == "áñyóng   "
    doAssert "   ".strip(leading = false) == ""
    doAssert "   ".strip(trailing = false) == ""

  var
    sI = 0          ## index of the first byte of ``s`` that is kept
    eI = len(s) - 1 ## index of the last byte of ``s`` that is kept
  if leading:
    var
      i = 0
      xI: int ## byte index at which the rune of the current iteration starts
      rune: Rune
    while i < len(s):
      xI = i
      fastRuneAt(s, i, rune)
      sI = i # Assume to start from next rune
      if not runes.contains(rune):
        sI = xI # Go back to where the current rune starts
        break
  if trailing:
    var
      i = eI
      xI: int
      rune: Rune
    while i >= 0:
      # Decode at byte ``i``; afterwards ``xI`` is one past the decoded bytes.
      xI = i
      fastRuneAt(s, xI, rune)
      # ``i`` may point into the middle of a multi-byte rune. Step backwards
      # for as long as a rune decoded at the earlier position still reaches
      # ``xI``; the earliest such position is where the rune really starts.
      var yI = i - 1
      while yI >= 0:
        var
          yIend = yI
          pRune: Rune
        fastRuneAt(s, yIend, pRune)
        if yIend < xI: break
        i = yI
        rune = pRune
        dec(yI)
      if not runes.contains(rune):
        # First rune from the end that must be kept; it ends at ``xI - 1``.
        eI = xI - 1
        break
      dec(i)
    if i < 0:
      # The scan reached the front without meeting a rune to keep (or ``s``
      # is empty): nothing is left, so make the kept range empty. Without
      # this, ``eI`` would still point at the last byte and an all-``runes``
      # string would be returned unchanged when ``leading`` is false.
      eI = sI - 1
  let newLen = eI - sI + 1
  if newLen > 0:
    result = s[sI .. eI]
  else:
    result = ""
1073
proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
  rtl, extern: "nucRepeatRune".} =
  ## Builds a string made of the rune ``c`` repeated ``count`` times.
  ##
  ## The rune-length of the returned string is ``count``.
  runnableExamples:
    let a = "ñ".runeAt(0)
    doAssert a.repeat(5) == "ñññññ"

  let encoded = $c
  result = newStringOfCap(count * encoded.len)
  for _ in 1 .. count:
    result.add encoded
1087
proc align*(s: string, count: Natural, padding = ' '.Rune): string {.
  noSideEffect, rtl, extern: "nucAlignString".} =
  ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
  ## of ``count``.
  ##
  ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
  ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  ## returned unchanged. If you need to left align a string use the `alignLeft
  ## proc <#alignLeft,string,Natural>`_.
  runnableExamples:
    assert align("abc", 4) == " abc"
    assert align("a", 0) == "a"
    assert align("1232", 6) == "  1232"
    assert align("1232", 6, '#'.Rune) == "##1232"
    assert align("Åge", 5) == "  Åge"
    assert align("×", 4, '_'.Rune) == "___×"

  let sLen = s.runeLen
  if sLen < count:
    let
      padStr = $padding
      spaces = count - sLen
    # Reserve exactly the bytes needed: the padding runes plus the bytes of
    # ``s`` itself (which can exceed its rune count for non-ASCII text).
    result = newStringOfCap(spaces * padStr.len + s.len)
    for i in 0 ..< spaces: result.add padStr
    result.add s
  else:
    result = s
1114
proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.
    noSideEffect.} =
  ## Pads the unicode string ``s`` on the right with ``padding`` until its
  ## rune-length is ``count``, which left-aligns ``s``.
  ##
  ## ``padding`` defaults to a space. If ``s.runelen >= count``, nothing is
  ## added and ``s`` is returned unchanged. If you need to right align a
  ## string use the `align proc <#align,string,Natural>`_.
  runnableExamples:
    assert alignLeft("abc", 4) == "abc "
    assert alignLeft("a", 0) == "a"
    assert alignLeft("1232", 6) == "1232  "
    assert alignLeft("1232", 6, '#'.Rune) == "1232##"
    assert alignLeft("Åge", 5) == "Åge  "
    assert alignLeft("×", 4, '_'.Rune) == "×___"
  let runeCount = runeLen(s)
  if runeCount >= count:
    result = s
  else:
    let
      filler = $padding
      missing = count - runeCount
    result = newStringOfCap(s.len + missing * filler.len)
    result.add s
    for _ in 1 .. missing:
      result.add filler
1140