1#
2#
3#            Nim's Runtime Library
4#        (c) Copyright 2016 Andreas Rumpf
5#
6#    See the file "copying.txt", included in this
7#    distribution, for details about the copyright.
8#
9
10## Implements a representation of Unicode with the limited
11## ASCII character subset.
12
13import strutils
14import unicode
15
16# issue #3045
17
18const
19  Base = 36
20  TMin = 1
21  TMax = 26
22  Skew = 38
23  Damp = 700
24  InitialBias = 72
25  InitialN = 128
26  Delimiter = '-'
27
28type
29  PunyError* = object of ValueError
30
31proc decodeDigit(x: char): int {.raises: [PunyError].} =
32  if '0' <= x and x <= '9':
33    result = ord(x) - (ord('0') - 26)
34  elif 'A' <= x and x <= 'Z':
35    result = ord(x) - ord('A')
36  elif 'a' <= x and x <= 'z':
37    result = ord(x) - ord('a')
38  else:
39    raise newException(PunyError, "Bad input")
40
41proc encodeDigit(digit: int): Rune {.raises: [PunyError].} =
42  if 0 <= digit and digit < 26:
43    result = Rune(digit + ord('a'))
44  elif 26 <= digit and digit < 36:
45    result = Rune(digit + (ord('0') - 26))
46  else:
47    raise newException(PunyError, "internal error in punycode encoding")
48
49proc isBasic(c: char): bool = ord(c) < 0x80
50proc isBasic(r: Rune): bool = int(r) < 0x80
51
52proc adapt(delta, numPoints: int, first: bool): int =
53  var d = if first: delta div Damp else: delta div 2
54  d += d div numPoints
55  var k = 0
56  while d > ((Base-TMin)*TMax) div 2:
57    d = d div (Base - TMin)
58    k += Base
59  result = k + (Base - TMin + 1) * d div (d + Skew)
60
61proc encode*(prefix, s: string): string {.raises: [PunyError].} =
62  ## Encode a string that may contain Unicode.
63  ## Prepend `prefix` to the result
64  result = prefix
65  var (d, n, bias) = (0, InitialN, InitialBias)
66  var (b, remaining) = (0, 0)
67  for r in s.runes:
68    if r.isBasic:
69      # basic Ascii character
70      inc b
71      result.add($r)
72    else:
73      # special character
74      inc remaining
75
76  var h = b
77  if b > 0:
78    result.add(Delimiter) # we have some Ascii chars
79  while remaining != 0:
80    var m: int = high(int32)
81    for r in s.runes:
82      if m > int(r) and int(r) >= n:
83        m = int(r)
84    d += (m - n) * (h + 1)
85    if d < 0:
86      raise newException(PunyError, "invalid label " & s)
87    n = m
88    for r in s.runes:
89      if int(r) < n:
90        inc d
91        if d < 0:
92          raise newException(PunyError, "invalid label " & s)
93        continue
94      if int(r) > n:
95        continue
96      var q = d
97      var k = Base
98      while true:
99        var t = k - bias
100        if t < TMin:
101          t = TMin
102        elif t > TMax:
103          t = TMax
104        if q < t:
105          break
106        result.add($encodeDigit(t + (q - t) mod (Base - t)))
107        q = (q - t) div (Base - t)
108        k += Base
109      result.add($encodeDigit(q))
110      bias = adapt(d, h + 1, h == b)
111      d = 0
112      inc h
113      dec remaining
114    inc d
115    inc n
116
117proc encode*(s: string): string {.raises: [PunyError].} =
118  ## Encode a string that may contain Unicode. Prefix is empty.
119  result = encode("", s)
120
121proc decode*(encoded: string): string {.raises: [PunyError].} =
122  ## Decode a Punycode-encoded string
123  var
124    n = InitialN
125    i = 0
126    bias = InitialBias
127  var d = rfind(encoded, Delimiter)
128  result = ""
129
130  if d > 0:
131    # found Delimiter
132    for j in 0..<d:
133      var c = encoded[j] # char
134      if not c.isBasic:
135        raise newException(PunyError, "Encoded contains a non-basic char")
136      result.add(c) # add the character
137    inc d
138  else:
139    d = 0 # set to first index
140
141  while (d < len(encoded)):
142    var oldi = i
143    var w = 1
144    var k = Base
145    while true:
146      if d == len(encoded):
147        raise newException(PunyError, "Bad input: " & encoded)
148      var c = encoded[d]; inc d
149      var digit = int(decodeDigit(c))
150      if digit > (high(int32) - i) div w:
151        raise newException(PunyError, "Too large a value: " & $digit)
152      i += digit * w
153      var t: int
154      if k <= bias:
155        t = TMin
156      elif k >= bias + TMax:
157        t = TMax
158      else:
159        t = k - bias
160      if digit < t:
161        break
162      w *= Base - t
163      k += Base
164    bias = adapt(i - oldi, runeLen(result) + 1, oldi == 0)
165
166    if i div (runeLen(result) + 1) > high(int32) - n:
167      raise newException(PunyError, "Value too large")
168
169    n += i div (runeLen(result) + 1)
170    i = i mod (runeLen(result) + 1)
171    insert(result, $Rune(n), i)
172    inc i
173
174
175runnableExamples:
176  static:
177    block:
178      doAssert encode("") == ""
179      doAssert encode("a") == "a-"
180      doAssert encode("A") == "A-"
181      doAssert encode("3") == "3-"
182      doAssert encode("-") == "--"
183      doAssert encode("--") == "---"
184      doAssert encode("abc") == "abc-"
185      doAssert encode("London") == "London-"
186      doAssert encode("Lloyd-Atkinson") == "Lloyd-Atkinson-"
187      doAssert encode("This has spaces") == "This has spaces-"
188      doAssert encode("ü") == "tda"
189      doAssert encode("München") == "Mnchen-3ya"
190      doAssert encode("Mnchen-3ya") == "Mnchen-3ya-"
191      doAssert encode("München-Ost") == "Mnchen-Ost-9db"
192      doAssert encode("Bahnhof München-Ost") == "Bahnhof Mnchen-Ost-u6b"
193    block:
194      doAssert decode("") == ""
195      doAssert decode("a-") ==  "a"
196      doAssert decode("A-") == "A"
197      doAssert decode("3-") == "3"
198      doAssert decode("--") == "-"
199      doAssert decode("---") == "--"
200      doAssert decode("abc-") == "abc"
201      doAssert decode("London-") == "London"
202      doAssert decode("Lloyd-Atkinson-") == "Lloyd-Atkinson"
203      doAssert decode("This has spaces-") == "This has spaces"
204      doAssert decode("tda") == "ü"
205      doAssert decode("Mnchen-3ya") == "München"
206      doAssert decode("Mnchen-3ya-") == "Mnchen-3ya"
207      doAssert decode("Mnchen-Ost-9db") == "München-Ost"
208      doAssert decode("Bahnhof Mnchen-Ost-u6b") == "Bahnhof München-Ost"
209