#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# Nim support for C/C++'s `wide strings`:idx:. This is part of the system
# module! Do not import it directly!

#when not declared(ThisIsSystem):
#  {.error: "You must not import this module explicitly".}

type
  Utf16Char* = distinct int16 ## A single 16-bit UTF-16 code unit.

when defined(nimv2):

  type
    WideCString* = ptr UncheckedArray[Utf16Char]
      ## Raw pointer to a zero-terminated sequence of UTF-16 code units.

    WideCStringObj* = object
      ## Owner of a manually allocated wide string buffer. The buffer is
      ## released by the `=destroy` hook; copying is a compile-time error.
      bytes: int        # size of the allocation in bytes
      data: WideCString # the owned buffer, or nil

  proc `=destroy`(a: var WideCStringObj) =
    # Free the owned buffer with the deallocator that matches the
    # allocator `createWide` picks for the current threading mode.
    if a.data != nil:
      when compileOption("threads"):
        deallocShared(a.data)
      else:
        dealloc(a.data)

  proc `=copy`(a: var WideCStringObj; b: WideCStringObj) {.error.}

  proc `=sink`(a: var WideCStringObj; b: WideCStringObj) =
    # Release the buffer `a` currently owns before taking over `b`'s;
    # otherwise overwriting the pointer would leak it. The inequality guard
    # keeps us from freeing the very buffer that is being moved in.
    if a.data != b.data:
      `=destroy`(a)
    a.bytes = b.bytes
    a.data = b.data

  proc createWide(a: var WideCStringObj; bytes: int) =
    # Allocate a zero-filled buffer of `bytes` bytes and store it in `a`.
    # Any buffer previously stored in `a` is not released here.
    a.bytes = bytes
    when compileOption("threads"):
      a.data = cast[typeof(a.data)](allocShared0(bytes))
    else:
      a.data = cast[typeof(a.data)](alloc0(bytes))

  # Unchecked access to the code unit at position `idx`.
  template `[]`*(a: WideCStringObj; idx: int): Utf16Char = a.data[idx]
  template `[]=`*(a: WideCStringObj; idx: int; val: Utf16Char) = a.data[idx] = val

  # The "no string" value: an object that owns no buffer.
  template nullWide(): untyped = WideCStringObj(bytes: 0, data: nil)

  converter toWideCString*(x: WideCStringObj): WideCString {.inline.} =
    ## Exposes the raw buffer of `x`; ownership stays with `x`.
    result = x.data

else:
  # The "no string" value for the `ref` based representation.
  template nullWide(): untyped = nil

  type
    WideCString* = ref UncheckedArray[Utf16Char]
      ## Reference to a zero-terminated sequence of UTF-16 code units.
    WideCStringObj* = WideCString

  template createWide(a; L) =
    # Allocate `L` bytes for `a` via `unsafeNew`.
    unsafeNew(a, L)

proc ord(arg: Utf16Char): int = int(cast[uint16](arg))
  # Value of the code unit as an unsigned number in 0 .. 0xFFFF.

proc len*(w: WideCString): int =
  ## returns the length of a widestring. This traverses the whole string to
  ## find the binary zero end marker! The length is counted in UTF-16 code
  ## units and does not include the end marker. A `nil` string has length 0.
  result = 0
  if w == nil: return
  while int16(w[result]) != 0'i16: inc result

const
  UNI_REPLACEMENT_CHAR = Utf16Char(0xFFFD'i16)
  UNI_MAX_BMP = 0x0000FFFF
  UNI_MAX_UTF16 = 0x0010FFFF
  # UNI_MAX_UTF32 = 0x7FFFFFFF
  # UNI_MAX_LEGAL_UTF32 = 0x0010FFFF

  # Constants for splitting a code point into / joining it from a
  # surrogate pair.
  halfShift = 10
  halfBase = 0x0010000
  halfMask = 0x3FF

  UNI_SUR_HIGH_START = 0xD800
  UNI_SUR_HIGH_END = 0xDBFF
  UNI_SUR_LOW_START = 0xDC00
  UNI_SUR_LOW_END = 0xDFFF
  UNI_REPL = 0xFFFD

# Bit mask with the lowest `n` bits set.
template ones(n: untyped): untyped = ((1 shl n)-1)

template fastRuneAt(s: cstring, i, L: int, result: untyped, doInc = true) =
  ## Returns the unicode character `s[i]` in `result`. If `doInc == true`
  ## `i` is incremented by the number of bytes that have been processed.
  ## `L` is the number of bytes in `s`; a multi-byte sequence that would run
  ## past `L`, or an unrecognised lead byte, yields `UNI_REPL` and consumes
  ## one byte.
  bind ones

  if ord(s[i]) <= 127:
    result = ord(s[i])
    when doInc: inc(i)
  elif ord(s[i]) shr 5 == 0b110:
    #assert(ord(s[i+1]) shr 6 == 0b10)
    if i <= L - 2:
      result = (ord(s[i]) and (ones(5))) shl 6 or (ord(s[i+1]) and ones(6))
      when doInc: inc(i, 2)
    else:
      result = UNI_REPL
      when doInc: inc(i)
  elif ord(s[i]) shr 4 == 0b1110:
    if i <= L - 3:
      #assert(ord(s[i+1]) shr 6 == 0b10)
      #assert(ord(s[i+2]) shr 6 == 0b10)
      result = (ord(s[i]) and ones(4)) shl 12 or
               (ord(s[i+1]) and ones(6)) shl 6 or
               (ord(s[i+2]) and ones(6))
      when doInc: inc(i, 3)
    else:
      result = UNI_REPL
      when doInc: inc(i)
  elif ord(s[i]) shr 3 == 0b11110:
    if i <= L - 4:
      #assert(ord(s[i+1]) shr 6 == 0b10)
      #assert(ord(s[i+2]) shr 6 == 0b10)
      #assert(ord(s[i+3]) shr 6 == 0b10)
      result = (ord(s[i]) and ones(3)) shl 18 or
               (ord(s[i+1]) and ones(6)) shl 12 or
               (ord(s[i+2]) and ones(6)) shl 6 or
               (ord(s[i+3]) and ones(6))
      when doInc: inc(i, 4)
    else:
      result = UNI_REPL
      when doInc: inc(i)
  else:
    # Not a valid lead byte: same replacement value as the truncated cases.
    result = UNI_REPL
    when doInc: inc(i)

iterator runes(s: cstring, L: int): int =
  # Yields the code points decoded from the first `L` bytes of `s`.
  var
    i = 0
    result: int
  while i < L:
    fastRuneAt(s, i, L, result, true)
    yield result

proc newWideCString*(size: int): WideCStringObj =
  ## Creates a zero-filled wide string with room for `size` code units plus
  ## the end marker.
  createWide(result, size * 2 + 2)

proc newWideCString*(source: cstring, L: int): WideCStringObj =
  ## Converts the first `L` bytes of the UTF-8 string `source` to a
  ## zero-terminated UTF-16 string. Code points in the surrogate range or
  ## above `UNI_MAX_UTF16` are stored as `UNI_REPLACEMENT_CHAR`.
  # Two bytes per input byte plus the end marker: a 1-3 byte sequence
  # produces one code unit and a 4 byte sequence at most two.
  createWide(result, L * 2 + 2)
  var d = 0
  for ch in runes(source, L):

    if ch <= UNI_MAX_BMP:
      if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_LOW_END:
        # A lone surrogate is not a valid code point.
        result[d] = UNI_REPLACEMENT_CHAR
      else:
        result[d] = cast[Utf16Char](uint16(ch))
    elif ch > UNI_MAX_UTF16:
      result[d] = UNI_REPLACEMENT_CHAR
    else:
      # Outside the BMP: emit a high/low surrogate pair.
      let ch = ch - halfBase
      result[d] = cast[Utf16Char](uint16((ch shr halfShift) + UNI_SUR_HIGH_START))
      inc d
      result[d] = cast[Utf16Char](uint16((ch and halfMask) + UNI_SUR_LOW_START))
    inc d
  result[d] = Utf16Char(0)

proc newWideCString*(s: cstring): WideCStringObj =
  ## Converts the zero-terminated UTF-8 string `s` to UTF-16. A `nil`
  ## input yields the nil wide string.
  if s.isNil: return nullWide

  result = newWideCString(s, s.len)

proc newWideCString*(s: string): WideCStringObj =
  ## Converts the UTF-8 string `s` to UTF-16.
  result = newWideCString(cstring s, s.len)

proc `$`*(w: WideCString, estimate: int, replacement: int = 0xFFFD): string =
  ## Converts the zero-terminated UTF-16 string `w` to UTF-8. `estimate` is
  ## a hint for the number of code units used to pre-size the result.
  ## Unpaired surrogates are replaced by the code point `replacement`.
  ## A `nil` string converts to `""`.
  if w == nil: return ""
  result = newStringOfCap(estimate + estimate shr 2)

  var i = 0
  while w[i].int16 != 0'i16:
    var ch = ord(w[i])
    inc i
    if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END:
      # If the 16 bits following the high surrogate are in the source buffer...
      # (at the end of the string this reads the end marker, which is not a
      # low surrogate)
      let ch2 = ord(w[i])

      # If it's a low surrogate, convert to UTF32:
      if ch2 >= UNI_SUR_LOW_START and ch2 <= UNI_SUR_LOW_END:
        ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
        inc i
      else:
        #invalid UTF-16
        ch = replacement
    elif ch >= UNI_SUR_LOW_START and ch <= UNI_SUR_LOW_END:
      #invalid UTF-16
      ch = replacement

    if ch < 0x80:
      result.add chr(ch)
    elif ch < 0x800:
      result.add chr((ch shr 6) or 0xc0)
      result.add chr((ch and 0x3f) or 0x80)
    elif ch < 0x10000:
      result.add chr((ch shr 12) or 0xe0)
      result.add chr(((ch shr 6) and 0x3f) or 0x80)
      result.add chr((ch and 0x3f) or 0x80)
    elif ch <= 0x10FFFF:
      result.add chr((ch shr 18) or 0xf0)
      result.add chr(((ch shr 12) and 0x3f) or 0x80)
      result.add chr(((ch shr 6) and 0x3f) or 0x80)
      result.add chr((ch and 0x3f) or 0x80)
    else:
      # replacement char(in case user give very large number):
      result.add chr(0xFFFD shr 12 or 0b1110_0000)
      result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
      result.add chr(0xFFFD and ones(6) or 0b10_0000_00)

proc `$`*(s: WideCString): string =
  ## Converts `s` to UTF-8 using a default size estimate of 80 code units.
  result = s $ 80

when defined(nimv2):
  proc `$`*(s: WideCStringObj, estimate: int, replacement: int = 0xFFFD): string =
    ## Converts the buffer owned by `s` to UTF-8; see the `WideCString`
    ## overload for the meaning of `estimate` and `replacement`.
    `$`(s.data, estimate, replacement)

  proc `$`*(s: WideCStringObj): string =
    ## Converts the buffer owned by `s` to UTF-8.
    $(s.data)