1#
2#
3#            Nim's Runtime Library
4#        (c) Copyright 2012 Andreas Rumpf
5#
6#    See the file "copying.txt", included in this
7#    distribution, for details about the copyright.
8#
9
10# Nim support for C/C++'s `wide strings`:idx:. This is part of the system
11# module! Do not import it directly!
12
13#when not declared(ThisIsSystem):
14#  {.error: "You must not import this module explicitly".}
15
type
  Utf16Char* = distinct int16
    ## A single UTF-16 code unit. It is `distinct` from `int16`, so the two
    ## are never converted into each other implicitly.
18
19when defined(nimv2):
20
21  type
22    WideCString* = ptr UncheckedArray[Utf16Char]
23
24    WideCStringObj* = object
25      bytes: int
26      data: WideCString
27
28  proc `=destroy`(a: var WideCStringObj) =
29    if a.data != nil:
30      when compileOption("threads"):
31        deallocShared(a.data)
32      else:
33        dealloc(a.data)
34
35  proc `=copy`(a: var WideCStringObj; b: WideCStringObj) {.error.}
36
37  proc `=sink`(a: var WideCStringObj; b: WideCStringObj) =
38    a.bytes = b.bytes
39    a.data = b.data
40
41  proc createWide(a: var WideCStringObj; bytes: int) =
42    a.bytes = bytes
43    when compileOption("threads"):
44      a.data = cast[typeof(a.data)](allocShared0(bytes))
45    else:
46      a.data = cast[typeof(a.data)](alloc0(bytes))
47
48  template `[]`*(a: WideCStringObj; idx: int): Utf16Char = a.data[idx]
49  template `[]=`*(a: WideCStringObj; idx: int; val: Utf16Char) = a.data[idx] = val
50
51  template nullWide(): untyped = WideCStringObj(bytes: 0, data: nil)
52
53  converter toWideCString*(x: WideCStringObj): WideCString {.inline.} =
54    result = x.data
55
56else:
57  template nullWide(): untyped = nil
58
59  type
60    WideCString* = ref UncheckedArray[Utf16Char]
61    WideCStringObj* = WideCString
62
63  template createWide(a; L) =
64    unsafeNew(a, L)
65
proc ord(arg: Utf16Char): int =
  ## Returns the code unit `arg` as an integer in `0..0xFFFF`. The bits are
  ## reinterpreted as unsigned first, so units >= 0x8000 do not come out
  ## negative.
  result = int(cast[uint16](arg))
67
proc len*(w: WideCString): int =
  ## Returns the length of a widestring in UTF-16 code units, not counting
  ## the terminator. This traverses the whole string to find the binary
  ## zero end marker! A `nil` widestring has length 0.
  result = 0
  # `newWideCString(cstring(nil))` hands out a nil wide string; report it as
  # empty instead of reading through the nil pointer.
  if w == nil: return
  while int16(w[result]) != 0'i16: inc result
73
const
  # U+FFFD REPLACEMENT CHARACTER, as a UTF-16 code unit and as a code point.
  UNI_REPLACEMENT_CHAR = Utf16Char(0xFFFD'i16)
  UNI_REPL = 0xFFFD

  # Largest code point that is stored in a single UTF-16 code unit, and the
  # largest code point that is encoded at all.
  UNI_MAX_BMP = 0xFFFF
  UNI_MAX_UTF16 = 0x10FFFF
  # UNI_MAX_UTF32 = 0x7FFFFFFF
  # UNI_MAX_LEGAL_UTF32 = 0x0010FFFF

  # Surrogate pair arithmetic: `halfBase` is subtracted from the code point,
  # then each surrogate carries `halfShift` bits of the remainder.
  halfShift = 10
  halfBase = 0x10000
  halfMask = 0x3FF

  # Code unit ranges of the high (leading) and low (trailing) surrogates.
  UNI_SUR_HIGH_START = 0xD800
  UNI_SUR_HIGH_END = 0xDBFF
  UNI_SUR_LOW_START = 0xDC00
  UNI_SUR_LOW_END = 0xDFFF
90
template ones(n: untyped): untyped =
  # Bit mask that has the lowest `n` bits set.
  (1 shl n) - 1
92
template fastRuneAt(s: cstring, i, L: int, result: untyped, doInc = true) =
  ## Decodes the UTF-8 sequence starting at `s[i]` and stores the code point
  ## in `result`; `L` is the number of bytes available in `s`. If
  ## `doInc == true`, `i` is advanced by the number of bytes consumed.
  ##
  ## An invalid lead byte, or a sequence that would extend past `L`, yields
  ## `UNI_REPL` and consumes one byte. Continuation bytes are not validated.
  bind ones

  let lead = ord(s[i])
  if lead <= 127:
    # Single byte (ASCII).
    result = lead
    when doInc: inc(i)
  elif lead shr 5 == 0b110:
    # Lead byte of a two-byte sequence.
    if i > L - 2:
      result = UNI_REPL
      when doInc: inc(i)
    else:
      #assert(ord(s[i+1]) shr 6 == 0b10)
      result = ((lead and ones(5)) shl 6) or (ord(s[i+1]) and ones(6))
      when doInc: inc(i, 2)
  elif lead shr 4 == 0b1110:
    # Lead byte of a three-byte sequence.
    if i > L - 3:
      result = UNI_REPL
      when doInc: inc(i)
    else:
      #assert(ord(s[i+1]) shr 6 == 0b10)
      #assert(ord(s[i+2]) shr 6 == 0b10)
      result = ((lead and ones(4)) shl 12) or
               ((ord(s[i+1]) and ones(6)) shl 6) or
               (ord(s[i+2]) and ones(6))
      when doInc: inc(i, 3)
  elif lead shr 3 == 0b11110:
    # Lead byte of a four-byte sequence.
    if i > L - 4:
      result = UNI_REPL
      when doInc: inc(i)
    else:
      #assert(ord(s[i+1]) shr 6 == 0b10)
      #assert(ord(s[i+2]) shr 6 == 0b10)
      #assert(ord(s[i+3]) shr 6 == 0b10)
      result = ((lead and ones(3)) shl 18) or
               ((ord(s[i+1]) and ones(6)) shl 12) or
               ((ord(s[i+2]) and ones(6)) shl 6) or
               (ord(s[i+3]) and ones(6))
      when doInc: inc(i, 4)
  else:
    # Neither ASCII nor a valid lead byte.
    result = UNI_REPL
    when doInc: inc(i)
136
iterator runes(s: cstring, L: int): int =
  ## Yields, in order, the code points decoded from the first `L` bytes of
  ## the UTF-8 text `s`.
  var
    pos = 0
    rune: int
  while pos < L:
    # Decodes one code point and moves `pos` past the bytes it consumed.
    fastRuneAt(s, pos, L, rune, true)
    yield rune
144
proc newWideCString*(size: int): WideCStringObj =
  ## Allocates a wide string with room for `size` UTF-16 code units plus the
  ## terminating zero.
  let byteCount = size * 2 + 2 # two bytes per code unit, plus terminator
  createWide(result, byteCount)
147
proc newWideCString*(source: cstring, L: int): WideCStringObj =
  ## Converts the first `L` bytes of the UTF-8 text `source` into a
  ## zero-terminated UTF-16 string.
  ##
  ## Code points above the BMP become surrogate pairs. Code points in the
  ## surrogate range or above U+10FFFF are replaced by U+FFFD.
  # One code unit per input byte is always enough, plus the terminator.
  createWide(result, L * 2 + 2)
  var dst = 0
  for rune in runes(source, L):
    if rune > UNI_MAX_UTF16:
      # Not encodable in UTF-16.
      result[dst] = UNI_REPLACEMENT_CHAR
    elif rune > UNI_MAX_BMP:
      # Supplementary plane: emit a high/low surrogate pair.
      let offset = rune - halfBase
      result[dst] = cast[Utf16Char](
        uint16((offset shr halfShift) + UNI_SUR_HIGH_START))
      inc dst
      result[dst] = cast[Utf16Char](
        uint16((offset and halfMask) + UNI_SUR_LOW_START))
    elif rune >= UNI_SUR_HIGH_START and rune <= UNI_SUR_LOW_END:
      # A surrogate is not a valid code point on its own.
      result[dst] = UNI_REPLACEMENT_CHAR
    else:
      result[dst] = cast[Utf16Char](uint16(rune))
    inc dst
  result[dst] = Utf16Char(0)
167
proc newWideCString*(s: cstring): WideCStringObj =
  ## Converts the zero-terminated UTF-8 text `s` to a wide string. A `nil`
  ## input gives the null wide string.
  if s.isNil:
    result = nullWide
  else:
    result = newWideCString(s, s.len)
172
proc newWideCString*(s: string): WideCStringObj =
  ## Converts the UTF-8 encoded string `s` to a wide string.
  newWideCString(cstring(s), len(s))
175
proc `$`*(w: WideCString, estimate: int, replacement: int = 0xFFFD): string =
  ## Converts the zero-terminated UTF-16 string `w` to a UTF-8 encoded
  ## `string`.
  ##
  ## * `estimate`: expected number of code units in `w`; it is only used to
  ##   pre-size the result.
  ## * `replacement`: code point emitted in place of an unpaired surrogate.
  ##   A value above U+10FFFF is written as U+FFFD.
  ##
  ## A `nil` input gives the empty string.
  result = newStringOfCap(estimate + estimate shr 2)
  # `newWideCString(cstring(nil))` hands out a nil wide string; treat it as
  # empty instead of reading through the nil pointer.
  if w == nil: return

  var i = 0
  while w[i].int16 != 0'i16:
    var ch = ord(w[i])
    inc i
    if ch >= UNI_SUR_HIGH_START and ch <= UNI_SUR_HIGH_END:
      # Look at the unit after the high surrogate. At the end of the string
      # this is the terminator, which is not a low surrogate.
      let ch2 = ord(w[i])

      # If it's a low surrogate, convert to UTF32:
      if ch2 >= UNI_SUR_LOW_START and ch2 <= UNI_SUR_LOW_END:
        ch = (((ch and halfMask) shl halfShift) + (ch2 and halfMask)) + halfBase
        inc i
      else:
        # invalid UTF-16; `i` is not advanced, so `ch2` is decoded next
        ch = replacement
    elif ch >= UNI_SUR_LOW_START and ch <= UNI_SUR_LOW_END:
      # invalid UTF-16: low surrogate without a preceding high surrogate
      ch = replacement

    # Encode the code point as one to four UTF-8 bytes.
    if ch < 0x80:
      result.add chr(ch)
    elif ch < 0x800:
      result.add chr((ch shr 6) or 0xc0)
      result.add chr((ch and 0x3f) or 0x80)
    elif ch < 0x10000:
      result.add chr((ch shr 12) or 0xe0)
      result.add chr(((ch shr 6) and 0x3f) or 0x80)
      result.add chr((ch and 0x3f) or 0x80)
    elif ch <= 0x10FFFF:
      result.add chr((ch shr 18) or 0xf0)
      result.add chr(((ch shr 12) and 0x3f) or 0x80)
      result.add chr(((ch shr 6) and 0x3f) or 0x80)
      result.add chr((ch and 0x3f) or 0x80)
    else:
      # replacement char (in case the user gave a very large number):
      result.add chr(0xFFFD shr 12 or 0b1110_0000)
      result.add chr(0xFFFD shr 6 and ones(6) or 0b10_0000_00)
      result.add chr(0xFFFD and ones(6) or 0b10_0000_00)
217
proc `$`*(s: WideCString): string =
  ## Converts the wide string `s` to a UTF-8 encoded `string`, using a
  ## capacity estimate of 80 code units for the result.
  `$`(s, 80)
220
when defined(nimv2):
  proc `$`*(s: WideCStringObj, estimate: int, replacement: int = 0xFFFD): string =
    ## Same as the `WideCString` overload with these parameters, applied to
    ## the buffer owned by `s`.
    result = `$`(s.data, estimate, replacement)

  proc `$`*(s: WideCStringObj): string =
    ## Converts the buffer owned by `s` to a UTF-8 encoded `string`.
    result = `$`(s.data)
227