#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module provides support to handle the Unicode UTF-8 encoding.
##
## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
## procedures for ``seq[Rune]`` in this module because the generic variants
## of these procedures in the system module already work with it.
##
## The current version is compatible with Unicode v12.0.0.
##
## **See also:**
## * `strutils module <strutils.html>`_
## * `unidecode module <unidecode.html>`_
## * `encodings module <encodings.html>`_

include "system/inclrtl"

type
  RuneImpl = int32 # underlying type of Rune
  Rune* = distinct RuneImpl ## \
    ## Type that can hold a single Unicode code point.
    ##
    ## A Rune may be composed with other Runes to a character on the screen.
    ## `RuneImpl` is the underlying type used to store Runes, currently `int32`.

# Bit mask with the lowest ``n`` bits set.
template ones(n: untyped): untyped = ((1 shl n)-1)

proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
  ## Returns the number of runes of the string ``s``.
  ##
  ## Only the lead byte of every sequence is inspected; a byte that is not a
  ## recognised lead byte is counted as one rune of length 1.
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeLen == 6
    ## note: a.len == 8

  result = 0
  var i = 0
  while i < len(s):
    if uint(s[i]) <= 127: inc(i)
    elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
    elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
    elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
    elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
    elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
    else: inc i
    inc(result)

proc runeLenAt*(s: string, i: Natural): int =
  ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  ##
  ## The length is derived from the lead byte ``s[i]`` alone; ``i`` must be a
  ## valid index into ``s``.
  ##
  ## See also:
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeLenAt(0) == 1
    doAssert a.runeLenAt(1) == 2

  if uint(s[i]) <= 127: result = 1
  elif uint(s[i]) shr 5 == 0b110: result = 2
  elif uint(s[i]) shr 4 == 0b1110: result = 3
  elif uint(s[i]) shr 3 == 0b11110: result = 4
  elif uint(s[i]) shr 2 == 0b111110: result = 5
  elif uint(s[i]) shr 1 == 0b1111110: result = 6
  else: result = 1

# Substituted by `fastRuneAt` when a multi-byte sequence is cut off by the end
# of the string.
const replRune = Rune(0xFFFD)

template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
  ## Returns the rune ``s[i]`` in ``result``.
  ##
  ## If ``doInc == true`` (default), ``i`` is incremented by the number
  ## of bytes that have been processed.
  ##
  ## ``i`` is substituted textually and is used several times, so pass a
  ## variable rather than an expensive expression.
  bind ones
  if uint(s[i]) <= 127:
    result = Rune(uint(s[i]))
    when doInc: inc(i)
  elif uint(s[i]) shr 5 == 0b110:
    # assert(uint(s[i+1]) shr 6 == 0b10)
    if i <= s.len - 2:
      result = Rune((uint(s[i]) and (ones(5))) shl 6 or
                    (uint(s[i+1]) and ones(6)))
      when doInc: inc(i, 2)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 4 == 0b1110:
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    if i <= s.len - 3:
      result = Rune((uint(s[i]) and ones(4)) shl 12 or
                    (uint(s[i+1]) and ones(6)) shl 6 or
                    (uint(s[i+2]) and ones(6)))
      when doInc: inc(i, 3)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 3 == 0b11110:
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    if i <= s.len - 4:
      result = Rune((uint(s[i]) and ones(3)) shl 18 or
                    (uint(s[i+1]) and ones(6)) shl 12 or
                    (uint(s[i+2]) and ones(6)) shl 6 or
                    (uint(s[i+3]) and ones(6)))
      when doInc: inc(i, 4)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 2 == 0b111110:
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    # assert(uint(s[i+4]) shr 6 == 0b10)
    if i <= s.len - 5:
      result = Rune((uint(s[i]) and ones(2)) shl 24 or
                    (uint(s[i+1]) and ones(6)) shl 18 or
                    (uint(s[i+2]) and ones(6)) shl 12 or
                    (uint(s[i+3]) and ones(6)) shl 6 or
                    (uint(s[i+4]) and ones(6)))
      when doInc: inc(i, 5)
    else:
      result = replRune
      when doInc: inc(i)
  elif uint(s[i]) shr 1 == 0b1111110:
    # assert(uint(s[i+1]) shr 6 == 0b10)
    # assert(uint(s[i+2]) shr 6 == 0b10)
    # assert(uint(s[i+3]) shr 6 == 0b10)
    # assert(uint(s[i+4]) shr 6 == 0b10)
    # assert(uint(s[i+5]) shr 6 == 0b10)
    if i <= s.len - 6:
      result = Rune((uint(s[i]) and ones(1)) shl 30 or
                    (uint(s[i+1]) and ones(6)) shl 24 or
                    (uint(s[i+2]) and ones(6)) shl 18 or
                    (uint(s[i+3]) and ones(6)) shl 12 or
                    (uint(s[i+4]) and ones(6)) shl 6 or
                    (uint(s[i+5]) and ones(6)))
      when doInc: inc(i, 6)
    else:
      result = replRune
      when doInc: inc(i)
  else:
    # Not a valid lead byte: hand the raw byte back as a rune.
    result = Rune(uint(s[i]))
    when doInc: inc(i)

proc runeAt*(s: string, i: Natural): Rune =
  ## Returns the rune in ``s`` at **byte index** ``i``.
  ##
  ## See also:
  ## * `runeAtPos proc <#runeAtPos,string,int>`_
  ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeAt(1) == "ñ".runeAt(0)
    doAssert a.runeAt(2) == "ñ".runeAt(1)
    doAssert a.runeAt(3) == "y".runeAt(0)
  fastRuneAt(s, i, result, false)

proc validateUtf8*(s: string): int =
  ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  ##
  ## See also:
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  # NOTE(review): only the lead/continuation bit patterns and overlong 2-byte
  # forms are checked; overlong 3/4-byte forms and surrogates are accepted.
  var i = 0
  let L = s.len
  while i < L:
    if uint(s[i]) <= 127:
      inc(i)
    elif uint(s[i]) shr 5 == 0b110:
      if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations.
      if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2)
      else: return i
    elif uint(s[i]) shr 4 == 0b1110:
      if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10:
        inc i, 3
      else: return i
    elif uint(s[i]) shr 3 == 0b11110:
      if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and
                     uint(s[i+2]) shr 6 == 0b10 and
                     uint(s[i+3]) shr 6 == 0b10:
        inc i, 4
      else: return i
    else:
      return i
  return -1

template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  ## starting at position ``pos``.
  ##
  ## If ``doInc == true`` (default), ``pos`` is incremented
  ## by the number of bytes that have been processed.
  ##
  ## To be the most efficient, make sure ``s`` is preallocated
  ## with an additional amount equal to the byte length of ``c``.
  ##
  ## Note that ``s`` is resized so that it ends right after the written bytes.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  var i = RuneImpl(c)
  if i <=% 127:
    s.setLen(pos+1)
    s[pos+0] = chr(i)
    when doInc: inc(pos)
  elif i <=% 0x07FF:
    s.setLen(pos+2)
    s[pos+0] = chr((i shr 6) or 0b110_00000)
    s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
    when doInc: inc(pos, 2)
  elif i <=% 0xFFFF:
    s.setLen(pos+3)
    s[pos+0] = chr(i shr 12 or 0b1110_0000)
    s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 3)
  elif i <=% 0x001FFFFF:
    s.setLen(pos+4)
    s[pos+0] = chr(i shr 18 or 0b1111_0000)
    s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 4)
  elif i <=% 0x03FFFFFF:
    s.setLen(pos+5)
    s[pos+0] = chr(i shr 24 or 0b111110_00)
    s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 5)
  elif i <=% 0x7FFFFFFF:
    s.setLen(pos+6)
    s[pos+0] = chr(i shr 30 or 0b1111110_0)
    s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
    when doInc: inc(pos, 6)
  else:
    discard # error, exception?

proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  ## Converts a rune into its UTF-8 representation.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `utf8 iterator <#utf8.i,string>`_
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeAt(1).toUTF8 == "ñ"

  result = ""
  fastToUTF8Copy(c, result, 0, false)

proc add*(s: var string; c: Rune) =
  ## Adds a rune ``c`` to a string ``s``.
  runnableExamples:
    var s = "abc"
    let c = "ä".runeAt(0)
    s.add(c)
    doAssert s == "abcä"

  let pos = s.len
  fastToUTF8Copy(c, s, pos, false)

proc `$`*(rune: Rune): string =
  ## An alias for `toUTF8 <#toUTF8,Rune>`_.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  rune.toUTF8

proc `$`*(runes: seq[Rune]): string =
  ## Converts a sequence of Runes to a string.
  ##
  ## See also:
  ## * `toRunes <#toRunes,string>`_ for a reverse operation
  runnableExamples:
    let
      someString = "öÑ"
      someRunes = toRunes(someString)
    doAssert $someRunes == someString

  result = ""
  for rune in runes:
    result.add rune

proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int =
  ## Returns the byte position of rune
  ## at position ``pos`` in ``s`` with an optional start byte position.
  ## Returns the special value -1 if it runs out of the string.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeOffset(1) == 1
    doAssert a.runeOffset(3) == 4
    doAssert a.runeOffset(4) == 6
    doAssert "".runeOffset(1) == -1

  var
    i = 0
    o = start
  # Nothing can be skipped when the scan starts at or past the end (e.g. an
  # empty string); report "ran out" instead of indexing out of bounds.
  if pos > 0 and o >= s.len:
    return -1
  while i < pos:
    o += runeLenAt(s, o)
    if o >= s.len:
      return -1
    inc i
  return o

proc runeReverseOffset*(s: string, rev: Positive): (int, int) =
  ## Returns a tuple with the byte offset of the
  ## rune at position ``rev`` in ``s``, counting
  ## from the end (starting with 1) and the total
  ## number of runes in the string.
  ##
  ## Returns a negative value for offset if there are too few runes in
  ## the string to satisfy the request.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  var
    a = rev.int
    o = 0
    x = 0
  let times = 2*rev.int-s.runeLen # transformed from rev.int - a < s.runeLen - rev.int
  while o < s.len:
    let r = runeLenAt(s, o)
    o += r
    if a > times:
      x += r
    dec a
  result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int)

proc runeAtPos*(s: string, pos: int): Rune =
  ## Returns the rune at position ``pos``.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeAt proc <#runeAt,string,Natural>`_
  ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  runnableExamples:
    let a = "añyóng"
    doAssert a.runeAtPos(1) == "ñ".runeAt(0)

  # `fastRuneAt` is a template that mentions its index argument many times;
  # compute the offset once so the linear scan is not repeated per use.
  let o = runeOffset(s, pos)
  fastRuneAt(s, o, result, false)

proc runeStrAtPos*(s: string, pos: Natural): string =
  ## Returns the rune at position ``pos`` as UTF8 String.
  ##
  ## **Beware:** This can lead to unoptimized code and slow execution!
  ## Most problems can be solved more efficiently by using an iterator
  ## or conversion to a seq of Rune.
  ##
  ## See also:
  ## * `runeAt proc <#runeAt,string,Natural>`_
  ## * `runeAtPos proc <#runeAtPos,string,int>`_
  ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  let o = runeOffset(s, pos)
  s[o .. (o+runeLenAt(s, o)-1)]

proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
  ## Returns the UTF-8 substring starting at code point ``pos``
  ## with ``len`` code points.
  ##
  ## If ``pos`` or ``len`` is negative they count from
  ## the end of the string. If ``len`` is not given it means the longest
  ## possible string. A ``len`` that reaches past the end of the string is
  ## clamped to the end.
  runnableExamples:
    let s = "Hänsel  ««: 10,00€"
    doAssert(runeSubStr(s, 0, 2) == "Hä")
    doAssert(runeSubStr(s, 10, 1) == ":")
    doAssert(runeSubStr(s, -6) == "10,00€")
    doAssert(runeSubStr(s, 10) == ": 10,00€")
    doAssert(runeSubStr(s, 12, 5) == "10,00")
    doAssert(runeSubStr(s, -6, 3) == "10,")
    doAssert(runeSubStr(s, -2, 5) == "0€")

  if pos < 0:
    # NOTE(review): when -pos exceeds the rune count `o` is negative (see
    # `runeReverseOffset`); handling of that case is unchanged.
    let (o, rl) = runeReverseOffset(s, -pos)
    if len >= rl:
      result = s.substr(o, s.len-1)
    elif len < 0:
      let e = rl + len     # rune index of the (exclusive) end
      let count = len - pos # runes between start and end; == e-(rl+pos)
      if e < 0 or count <= 0:
        # The end lies before the start: nothing to return.
        result = ""
      else:
        result = s.substr(o, runeOffset(s, count, o)-1)
    else:
      # `runeOffset` yields -1 when ``len`` runs past the end of the string;
      # clamp to the end exactly like the non-negative ``pos`` branch does.
      var e = runeOffset(s, len, o)
      if e < 0:
        e = s.len
      result = s.substr(o, e-1)
  else:
    let o = runeOffset(s, pos)
    if o < 0:
      result = ""
    elif len == int.high:
      result = s.substr(o, s.len-1)
    elif len < 0:
      let (e, rl) = runeReverseOffset(s, -len)
      discard rl
      if e <= 0:
        result = ""
      else:
        result = s.substr(o, e-1)
    else:
      var e = runeOffset(s, len, o)
      if e < 0:
        e = s.len
      result = s.substr(o, e-1)

proc `<=%`*(a, b: Rune): bool =
  ## Checks if code point of `a` is smaller or equal to code point of `b`.
  runnableExamples:
    let
      a = "ú".runeAt(0)
      b = "ü".runeAt(0)
    doAssert a <=% b
  return int(a) <=% int(b)

proc `<%`*(a, b: Rune): bool =
  ## Checks if code point of `a` is smaller than code point of `b`.
  runnableExamples:
    let
      a = "ú".runeAt(0)
      b = "ü".runeAt(0)
    doAssert a <% b
  return int(a) <% int(b)

proc `==`*(a, b: Rune): bool =
  ## Checks if two runes are equal.
  return int(a) == int(b)


include "includes/unicode_ranges"

proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
  # Searches ``tab``, read as ``len`` records of ``stride`` ints each, for the
  # last record whose first int is <= ``c``. Returns the index of that record's
  # first int, or -1 if there is none.
  var n = len
  var t = 0
  while n > 1:
    var m = n div 2
    var p = t + m*stride
    if c >= tab[p]:
      t = p
      n = n-m
    else:
      n = m
  if n != 0 and c >= tab[t]:
    return t
  return -1

proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Converts ``c`` into lower case. This works for any rune.
  ##
  ## If possible, prefer ``toLower`` over ``toUpper``.
  ##
  ## See also:
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  var c = RuneImpl(c)
  var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
    # The table entry is the delta plus 500.
    return Rune(c + toLowerRanges[p+2] - 500)
  p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  if p >= 0 and c == toLowerSinglets[p]:
    return Rune(c + toLowerSinglets[p+1] - 500)
  return Rune(c)

proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Converts ``c`` into upper case. This works for any rune.
  ##
  ## If possible, prefer ``toLower`` over ``toUpper``.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  var c = RuneImpl(c)
  var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
    # The table entry is the delta plus 500.
    return Rune(c + toUpperRanges[p+2] - 500)
  p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  if p >= 0 and c == toUpperSinglets[p]:
    return Rune(c + toUpperSinglets[p+1] - 500)
  return Rune(c)

proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  ## Converts ``c`` to title case.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  var c = RuneImpl(c)
  var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2)
  if p >= 0 and c == toTitleSinglets[p]:
    return Rune(c + toTitleSinglets[p+1] - 500)
  return Rune(c)

proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a lower case rune.
  ##
  ## If possible, prefer ``isLower`` over ``isUpper``.
  ##
  ## See also:
  ## * `toLower proc <#toLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  var c = RuneImpl(c)
  # Note: toUpperRanges is correct here!
  # (a rune is lower case when the to-upper tables have an entry for it)
  var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
    return true
  p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  if p >= 0 and c == toUpperSinglets[p]:
    return true

proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is an upper case rune.
  ##
  ## If possible, prefer ``isLower`` over ``isUpper``.
  ##
  ## See also:
  ## * `toUpper proc <#toUpper,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  var c = RuneImpl(c)
  # Note: toLowerRanges is correct here!
  # (a rune is upper case when the to-lower tables have an entry for it)
  var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
    return true
  p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  if p >= 0 and c == toLowerSinglets[p]:
    return true

proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is an *alpha* rune (i.e., a letter).
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  ## * `isCombining proc <#isCombining,Rune>`_
  if isUpper(c) or isLower(c):
    return true
  var c = RuneImpl(c)
  var p = binarySearch(c, alphaRanges, len(alphaRanges) div 2, 2)
  if p >= 0 and c >= alphaRanges[p] and c <= alphaRanges[p+1]:
    return true
  p = binarySearch(c, alphaSinglets, len(alphaSinglets), 1)
  if p >= 0 and c == alphaSinglets[p]:
    return true

proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode titlecase code point.
  ##
  ## See also:
  ## * `toTitle proc <#toTitle,Rune>`_
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  return isUpper(c) and isLower(c)

proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode whitespace code point.
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  var c = RuneImpl(c)
  var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2)
  if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]:
    return true

proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  ## Returns true if ``c`` is a Unicode combining code unit.
  ##
  ## See also:
  ## * `isLower proc <#isLower,Rune>`_
  ## * `isUpper proc <#isUpper,Rune>`_
  ## * `isTitle proc <#isTitle,Rune>`_
  ## * `isAlpha proc <#isAlpha,Rune>`_
  var c = RuneImpl(c)

  # Optimized to return false immediately for ASCII
  return c >= 0x0300 and (c <= 0x036f or
    (c >= 0x1ab0 and c <= 0x1aff) or
    (c >= 0x1dc0 and c <= 0x1dff) or
    (c >= 0x20d0 and c <= 0x20ff) or
    (c >= 0xfe20 and c <= 0xfe2f))

template runeCheck(s, runeProc) =
  ## Common code for isAlpha and isSpace.
  ##
  ## The result is false for an empty string and stops at the first rune
  ## for which ``runeProc`` is false.
  result = if len(s) == 0: false else: true
  var
    i = 0
    rune: Rune
  while i < len(s) and result:
    fastRuneAt(s, i, rune, doInc = true)
    result = runeProc(rune) and result

proc isAlpha*(s: string): bool {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns true if ``s`` contains all alphabetic runes.
  runnableExamples:
    let a = "añyóng"
    doAssert a.isAlpha
  runeCheck(s, isAlpha)

proc isSpace*(s: string): bool {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Returns true if ``s`` contains all whitespace runes.
  runnableExamples:
    let a = "\t\l \v\r\f"
    doAssert a.isSpace
  runeCheck(s, isWhiteSpace)


template convertRune(s, runeProc) =
  ## Convert runes in ``s`` using ``runeProc`` as the converter.
  # `fastToUTF8Copy` resizes `result` as it writes, so the initial length is
  # only a preallocation hint.
  result = newString(len(s))
  var
    i = 0
    resultIndex = 0
    rune: Rune
  while i < len(s):
    fastRuneAt(s, i, rune, doInc = true)
    rune = runeProc(rune)
    fastToUTF8Copy(rune, result, resultIndex, doInc = true)

proc toUpper*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Converts ``s`` into upper-case runes.
  runnableExamples:
    doAssert toUpper("abγ") == "ABΓ"
  convertRune(s, toUpper)

proc toLower*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1Str".} =
  ## Converts ``s`` into lower-case runes.
  runnableExamples:
    doAssert toLower("ABΓ") == "abγ"
  convertRune(s, toLower)

proc swapCase*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Swaps the case of runes in ``s``.
  ##
  ## Returns a new string such that the cases of all runes
  ## are swapped if possible.
  runnableExamples:
    doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"

  var
    i = 0
    resultIndex = 0
    rune: Rune
  result = newString(len(s))
  while i < len(s):
    fastRuneAt(s, i, rune)
    if rune.isUpper():
      rune = rune.toLower()
    elif rune.isLower():
      rune = rune.toUpper()
    fastToUTF8Copy(rune, result, resultIndex, doInc = true)

proc capitalize*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Converts the first character of ``s`` into an upper-case rune.
  runnableExamples:
    doAssert capitalize("βeta") == "Βeta"

  if len(s) == 0:
    return ""
  var
    rune: Rune
    i = 0
  fastRuneAt(s, i, rune, doInc = true)
  result = $toUpper(rune) & substr(s, i)

when not defined(nimHasEffectsOf):
  {.pragma: effectsOf.}

proc translate*(s: string, replacements: proc(key: string): string): string {.
  rtl, extern: "nuc$1", effectsOf: replacements.} =
  ## Translates words in a string using the ``replacements`` proc to substitute
  ## words inside ``s`` with their replacements.
  ##
  ## ``replacements`` is any proc that takes a word and returns
  ## a new word to fill its place.
  runnableExamples:
    proc wordToNumber(s: string): string =
      case s
      of "one": "1"
      of "two": "2"
      else: s
    let a = "one two three four"
    doAssert a.translate(wordToNumber) == "1 2 three four"

  # Allocate memory for the new string based on the old one.
  # If the new string length is less than the old, no allocations
  # will be needed. If the new string length is greater than the
  # old, then maybe only one allocation is needed
  result = newStringOfCap(s.len)
  var
    index = 0
    lastIndex = 0
    wordStart = 0
    inWord = false
    rune: Rune

  while index < len(s):
    lastIndex = index
    fastRuneAt(s, index, rune)
    let whiteSpace = rune.isWhiteSpace()

    if whiteSpace and inWord:
      # If we've reached the end of a word
      let word = s[wordStart ..< lastIndex]
      result.add(replacements(word))
      result.add($rune)
      inWord = false
    elif not whiteSpace and not inWord:
      # If we've hit a non space character and
      # are not currently in a word, track
      # the starting index of the word
      inWord = true
      wordStart = lastIndex
    elif whiteSpace:
      result.add($rune)

  if wordStart < len(s) and inWord:
    # Get the trailing word at the end
    let word = s[wordStart .. ^1]
    result.add(replacements(word))

proc title*(s: string): string {.noSideEffect,
  rtl, extern: "nuc$1".} =
  ## Converts ``s`` to a unicode title.
  ##
  ## Returns a new string such that the first character
  ## in each word inside ``s`` is capitalized.
  runnableExamples:
    doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"

  var
    i = 0
    resultIndex = 0
    rune: Rune
  result = newString(len(s))
  var firstRune = true

  while i < len(s):
    fastRuneAt(s, i, rune)
    if not rune.isWhiteSpace() and firstRune:
      rune = rune.toUpper()
      firstRune = false
    elif rune.isWhiteSpace():
      firstRune = true
    fastToUTF8Copy(rune, result, resultIndex, doInc = true)


iterator runes*(s: string): Rune =
  ## Iterates over any rune of the string ``s`` returning runes.
  var
    i = 0
    result: Rune
  while i < len(s):
    fastRuneAt(s, i, result, true)
    yield result

iterator utf8*(s: string): string =
  ## Iterates over any rune of the string ``s`` returning utf8 values.
  ##
  ## See also:
  ## * `validateUtf8 proc <#validateUtf8,string>`_
  ## * `toUTF8 proc <#toUTF8,Rune>`_
  ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  var o = 0
  while o < s.len:
    let n = runeLenAt(s, o)
    yield s[o .. (o+n-1)]
    o += n

proc toRunes*(s: string): seq[Rune] =
  ## Obtains a sequence containing the Runes in ``s``.
  ##
  ## See also:
  ## * `$ proc <#$,Rune>`_ for a reverse operation
  runnableExamples:
    let a = toRunes("aáä")
    doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]

  result = newSeq[Rune]()
  for r in s.runes:
    result.add(r)

proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1".} =
  ## Compares two UTF-8 strings and ignores the case. Returns:
  ##
  ## | 0 if a == b
  ## | < 0 if a < b
  ## | > 0 if a > b
  runnableExamples:
    doAssert cmpRunesIgnoreCase("ÄBC", "äbc") == 0
    doAssert cmpRunesIgnoreCase("ab", "abc") < 0

  var i = 0
  var j = 0
  var ar, br: Rune
  while i < a.len and j < b.len:
    # slow path:
    fastRuneAt(a, i, ar)
    fastRuneAt(b, j, br)
    result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
    if result != 0: return
  # All runes of the common prefix matched. Decide by what is left *unread*:
  # comparing the total byte lengths would report a difference for runes whose
  # upper and lower case forms are encoded with a different number of bytes.
  result = (a.len - i) - (b.len - j)

proc reversed*(s: string): string =
  ## Returns the reverse of ``s``, interpreting it as runes.
  ##
  ## Unicode combining characters are correctly interpreted as well.
  runnableExamples:
    assert reversed("Reverse this!") == "!siht esreveR"
    assert reversed("先秦兩漢") == "漢兩秦先"
    assert reversed("as⃝df̅") == "f̅ds⃝a"
    assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"

  var
    i = 0
    lastI = 0
    newPos = len(s) - 1
    # Index of the last byte already copied; -1 means "nothing copied yet", so
    # byte 0 is kept even when ``s`` starts with a combining rune.
    blockPos = -1
    r: Rune

  # Copies the bytes ``blockPos+1 .. pos-1`` (one base rune plus its combining
  # runes) in their original order to the tail of the unfilled part of `result`.
  template reverseUntil(pos) =
    var j = pos - 1
    while j > blockPos:
      result[newPos] = s[j]
      dec j
      dec newPos
    blockPos = pos - 1

  result = newString(len(s))

  while i < len(s):
    lastI = i
    fastRuneAt(s, i, r, true)
    if not isCombining(r):
      reverseUntil(lastI)

  reverseUntil(len(s))

proc graphemeLen*(s: string; i: Natural): Natural =
  ## The number of bytes belonging to byte index ``s[i]``,
  ## including following combining code unit.
  runnableExamples:
    let a = "añyóng"
    doAssert a.graphemeLen(1) == 2 ## ñ
    doAssert a.graphemeLen(2) == 1
    doAssert a.graphemeLen(4) == 2 ## ó

  var j = i.int
  var r, r2: Rune
  if j < s.len:
    fastRuneAt(s, j, r, true)
    result = j-i
    while j < s.len:
      fastRuneAt(s, j, r2, true)
      if not isCombining(r2): break
      result = j-i

proc lastRune*(s: string; last: int): (Rune, int) =
  ## Length of the last rune in ``s[0..last]``. Returns the rune and its length
  ## in bytes.
  if s[last] <= chr(127):
    result = (Rune(s[last]), 1)
  else:
    # Walk back over continuation bytes (0b10xxxxxx) to reach the lead byte.
    var L = 0
    while last-L >= 0 and uint(s[last-L]) shr 6 == 0b10: inc(L)
    var r: Rune
    fastRuneAt(s, last-L, r, false)
    result = (r, L+1)

proc size*(r: Rune): int {.noSideEffect.} =
  ## Returns the number of bytes the rune ``r`` takes.
  runnableExamples:
    let a = toRunes "aá"
    doAssert size(a[0]) == 1
    doAssert size(a[1]) == 2

  let v = r.uint32
  if v <= 0x007F'u32: result = 1
  elif v <= 0x07FF'u32: result = 2
  elif v <= 0xFFFF'u32: result = 3
  elif v <= 0x1FFFFF'u32: result = 4
  elif v <= 0x3FFFFFF'u32: result = 5
  elif v <= 0x7FFFFFFF'u32: result = 6
  else: result = 1

# --------- Private templates for different split separators -----------
proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool =
  # True if the rune starting at byte ``index`` is one of ``seps``.
  var rune: Rune
  fastRuneAt(s, index, rune, false)
  return seps.contains(rune)

proc stringHasSep(s: string, index: int, sep: Rune): bool =
  # True if the rune starting at byte ``index`` equals ``sep``.
  var rune: Rune
  fastRuneAt(s, index, rune, false)
  return sep == rune

template splitCommon(s, sep, maxsplit: untyped) =
  ## Common code for split procedures.
  let
    sLen = len(s)
  var
    last = 0
    splits = maxsplit
  if sLen > 0:
    while last <= sLen:
      var first = last
      while last < sLen and not stringHasSep(s, last, sep):
        inc(last, runeLenAt(s, last))
      # Out of splits: the remainder of the string is the final piece.
      if splits == 0: last = sLen
      yield s[first .. (last - 1)]
      if splits == 0: break
      dec(splits)
      # Step over the separator rune (or past the end to finish the loop).
      inc(last, if last < sLen: runeLenAt(s, last) else: 1)

iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
  maxsplit: int = -1): string =
  ## Splits the unicode string ``s`` into substrings using a group of separators.
  ##
  ## Substrings are separated by a substring containing only ``seps``.
  runnableExamples:
    import std/sequtils

    assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
      @["hÃllo", "this", "is", "an", "example", "是"]

    # And the following code splits the same string using a sequence of Runes.
    assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
      @["añyóng", "hÃllo", "是", "example"]

    # example with a `Rune` separator and unused one `;`:
    assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]

    # Another example that splits a string containing a date.
    let date = "2012-11-20T22:08:08.398990"

    assert toSeq(split(date, " -:T".toRunes)) ==
      @["2012", "11", "20", "22", "08", "08.398990"]

  splitCommon(s, seps, maxsplit)

iterator splitWhitespace*(s: string): string =
  ## Splits a unicode string at whitespace runes.
  splitCommon(s, unicodeSpaces, -1)

template accResult(iter: untyped) =
  # Collects everything ``iter`` yields into the enclosing proc's `result`.
  result = @[]
  for x in iter: add(result, x)

proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
  rtl, extern: "ncuSplitWhitespace".} =
  ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
  ## iterator, but is a proc that returns a sequence of substrings.
  accResult(splitWhitespace(s))

iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
  ## Splits the unicode string ``s`` into substrings using a single separator.
  ## Substrings are separated by the rune ``sep``.
  runnableExamples:
    import std/sequtils

    assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
      @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]

  splitCommon(s, sep, maxsplit)

proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
    seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
  ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_,
  ## but is a proc that returns a sequence of substrings.
  accResult(split(s, seps, maxsplit))

proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
  rtl, extern: "nucSplitRune".} =
  ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc
  ## that returns a sequence of substrings.
  accResult(split(s, sep, maxsplit))

proc strip*(s: string, leading = true, trailing = true,
            runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
            rtl, extern: "nucStrip".} =
  ## Strips leading or trailing ``runes`` from ``s`` and returns
  ## the resulting string.
  ##
  ## If ``leading`` is true (default), leading ``runes`` are stripped.
  ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
  ## If both are false, the string is returned unchanged.
  runnableExamples:
    let a = "\táñyóng  "
    doAssert a.strip == "áñyóng"
    doAssert a.strip(leading = false) == "\táñyóng"
    doAssert a.strip(trailing = false) == "áñyóng  "

  var
    sI = 0          ## starting index into string ``s``
    eI = len(s) - 1 ## ending index into ``s``, where the last ``Rune`` starts
  if leading:
    var
      i = 0
      xI: int ## value of ``sI`` at the beginning of the iteration
      rune: Rune
    while i < len(s):
      xI = i
      fastRuneAt(s, i, rune)
      sI = i # Assume to start from next rune
      if not runes.contains(rune):
        sI = xI # Go back to where the current rune starts
        break
  if trailing:
    var
      i = eI
      xI: int
      rune: Rune
    while i >= 0:
      xI = i
      fastRuneAt(s, xI, rune)
      # Probe earlier byte positions: if decoding from one of them reaches or
      # passes `xI`, the rune really starts there, so move `i` back to it.
      var yI = i - 1
      while yI >= 0:
        var
          yIend = yI
          pRune: Rune
        fastRuneAt(s, yIend, pRune)
        if yIend < xI: break
        i = yI
        rune = pRune
        dec(yI)
      if not runes.contains(rune):
        eI = xI - 1
        break
      dec(i)
  let newLen = eI - sI + 1
  result = newStringOfCap(newLen)
  if newLen > 0:
    result.add s[sI .. eI]

proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
  rtl, extern: "nucRepeatRune".} =
  ## Returns a string of ``count`` Runes ``c``.
  ##
  ## The returned string will have a rune-length of ``count``.
  runnableExamples:
    let a = "ñ".runeAt(0)
    doAssert a.repeat(5) == "ñññññ"

  let s = $c
  result = newStringOfCap(count * s.len)
  for i in 0 ..< count:
    result.add s

proc align*(s: string, count: Natural, padding = ' '.Rune): string {.
  noSideEffect, rtl, extern: "nucAlignString".} =
  ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
  ## of ``count``.
  ##
  ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
  ## right alignment.
If ``s.runelen >= count``, no spaces are added and ``s`` is 1095 ## returned unchanged. If you need to left align a string use the `alignLeft 1096 ## proc <#alignLeft,string,Natural>`_. 1097 runnableExamples: 1098 assert align("abc", 4) == " abc" 1099 assert align("a", 0) == "a" 1100 assert align("1232", 6) == " 1232" 1101 assert align("1232", 6, '#'.Rune) == "##1232" 1102 assert align("Åge", 5) == " Åge" 1103 assert align("×", 4, '_'.Rune) == "___×" 1104 1105 let sLen = s.runeLen 1106 if sLen < count: 1107 let padStr = $padding 1108 result = newStringOfCap(padStr.len * count) 1109 let spaces = count - sLen 1110 for i in 0 ..< spaces: result.add padStr 1111 result.add s 1112 else: 1113 result = s 1114 1115proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {. 1116 noSideEffect.} = 1117 ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a 1118 ## rune-length of ``count``. 1119 ## 1120 ## ``padding`` characters (by default spaces) are added after ``s`` resulting in 1121 ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is 1122 ## returned unchanged. If you need to right align a string use the `align 1123 ## proc <#align,string,Natural>`_. 1124 runnableExamples: 1125 assert alignLeft("abc", 4) == "abc " 1126 assert alignLeft("a", 0) == "a" 1127 assert alignLeft("1232", 6) == "1232 " 1128 assert alignLeft("1232", 6, '#'.Rune) == "1232##" 1129 assert alignLeft("Åge", 5) == "Åge " 1130 assert alignLeft("×", 4, '_'.Rune) == "×___" 1131 let sLen = s.runeLen 1132 if sLen < count: 1133 let padStr = $padding 1134 result = newStringOfCap(s.len + (count - sLen) * padStr.len) 1135 result.add s 1136 for i in sLen ..< count: 1137 result.add padStr 1138 else: 1139 result = s 1140