1import sys
2from unittest import TestCase
3
4import simplejson as json
5import simplejson.decoder
6from simplejson.compat import b, PY3
7
class TestScanString(TestCase):
    # Most of these tests deliberately avoid the bytes type under Python 3:
    # the decoder coerces its input to str before calling scanstring, and
    # handling bytes there would require an entirely separate parsing code
    # path. Under Python 2 the code paths for both unicode and str are
    # tested.

    def test_py_scanstring(self):
        # Run the shared assertions against the pure-Python implementation.
        self._test_scanstring(simplejson.decoder.py_scanstring)

    def test_c_scanstring(self):
        # Run the shared assertions against the C implementation.
        c_scanstring = simplejson.decoder.c_scanstring
        if not c_scanstring:
            # Nothing to exercise when c_scanstring is falsy (presumably the
            # C speedups were not built). A plain return is kept instead of
            # skipTest() because this file still caters for Python 2.5.
            return
        self._test_scanstring(c_scanstring)

        # The scanned value must be a native str.
        self.assertTrue(isinstance(c_scanstring('""', 0)[0], str))

    def _test_scanstring(self, scanstring):
        """Check one scanstring implementation against the shared cases.

        ``scanstring`` is called as ``scanstring(s, end, encoding, strict)``
        and is expected to return a ``(decoded_string, end_index)`` tuple or
        to raise ``ValueError`` for malformed input.
        """
        # A non-BMP character occupies two code units on narrow builds, so
        # the index just past the closing quote differs by one.
        if sys.maxunicode == 65535:
            astral_end = 6
        else:
            astral_end = 5
        self.assertEqual(
            scanstring(u'"z\U0001d120x"', 1, None, True),
            (u'z\U0001d120x', astral_end))

        # (document, index just past the opening quote, expected result).
        # Only the first string of each document is scanned, so the
        # surrounding JSON does not need to be valid.
        well_formed = [
            ('"\\u007b"', 1,
             (u'{', 8)),
            ('"A JSON payload should be an object or array, not a string."', 1,
             (u'A JSON payload should be an object or array, not a string.', 60)),
            ('["Unclosed array"', 2,
             (u'Unclosed array', 17)),
            ('["extra comma",]', 2,
             (u'extra comma', 14)),
            ('["double extra comma",,]', 2,
             (u'double extra comma', 21)),
            ('["Comma after the close"],', 2,
             (u'Comma after the close', 24)),
            ('["Extra close"]]', 2,
             (u'Extra close', 14)),
            ('{"Extra comma": true,}', 2,
             (u'Extra comma', 14)),
            ('{"Extra value after close": true} "misplaced quoted value"', 2,
             (u'Extra value after close', 26)),
            ('{"Illegal expression": 1 + 2}', 2,
             (u'Illegal expression', 21)),
            ('{"Illegal invocation": alert()}', 2,
             (u'Illegal invocation', 21)),
            ('{"Numbers cannot have leading zeroes": 013}', 2,
             (u'Numbers cannot have leading zeroes', 37)),
            ('{"Numbers cannot be hex": 0x14}', 2,
             (u'Numbers cannot be hex', 24)),
            ('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21,
             (u'Too deep', 30)),
            ('{"Missing colon" null}', 2,
             (u'Missing colon', 16)),
            ('{"Double colon":: null}', 2,
             (u'Double colon', 15)),
            ('{"Comma instead of colon", null}', 2,
             (u'Comma instead of colon', 25)),
            ('["Colon instead of comma": false]', 2,
             (u'Colon instead of comma', 25)),
            ('["Bad value", truth]', 2,
             (u'Bad value', 12)),
        ]
        for document, start, expected in well_formed:
            self.assertEqual(
                scanstring(document, start, None, True),
                expected)

        # Raw control characters U+0000 through U+001F are passed through in
        # non-strict mode and rejected in strict mode. The upper bound of
        # range() is exclusive, so it must be 0x20 for U+001F to be covered.
        for c in map(chr, range(0x00, 0x20)):
            self.assertEqual(
                scanstring(c + '"', 0, None, False),
                (c, 2))
            self.assertRaises(
                ValueError,
                scanstring, c + '"', 0, None, True)

        # Inputs that end before a closing quote, or in the middle of an
        # escape sequence, must be rejected.
        unterminated = [
            '',
            'a',
            '\\',
            '\\u',
            '\\u0',
            '\\u01',
            '\\u012',
            '\\u0123',
        ]
        for fragment in unterminated:
            self.assertRaises(ValueError, scanstring, fragment, 0, None, True)

        if sys.maxunicode > 65535:
            # A high surrogate escape followed by a malformed escape.
            self.assertRaises(ValueError,
                              scanstring, '\\ud834\\u"', 0, None, True)
            self.assertRaises(ValueError,
                              scanstring, '\\ud834\\x0123"', 0, None, True)

    def test_issue3623(self):
        # An unterminated string must raise ValueError, and encoding a byte
        # string containing the byte 0xff must raise UnicodeDecodeError.
        self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,
                          "xxx")
        self.assertRaises(UnicodeDecodeError,
                          json.encoder.encode_basestring_ascii, b("xx\xff"))

    def test_overflow(self):
        # Python 2.5 does not have maxsize, Python 3 does not have maxint
        maxsize = getattr(sys, 'maxsize', getattr(sys, 'maxint', None))
        # A unittest assertion is used rather than a bare assert so the
        # check is not stripped when Python runs with -O.
        self.assertTrue(maxsize is not None)
        self.assertRaises(OverflowError, json.decoder.scanstring, "xxx",
                          maxsize + 1)

    def test_surrogates(self):
        scanstring = json.decoder.scanstring

        def assertScan(given, expect, test_utf8=True):
            """Scan ``given`` from index 1 and compare against ``expect``.

            The whole input must be consumed. Under Python 2 the UTF-8
            encoded form of ``given`` is checked as well, unless
            ``test_utf8`` is false.
            """
            candidates = [given]
            if not PY3 and test_utf8:
                candidates.append(given.encode('utf8'))
            for candidate in candidates:
                (res, count) = scanstring(candidate, 1, None, True)
                self.assertEqual(len(candidate), count)
                self.assertEqual(res, expect)

        # Escaped surrogates: only a high surrogate immediately followed by
        # a low surrogate is expected to combine into one code point.
        assertScan(
            u'"z\\ud834\\u0079x"',
            u'z\ud834yx')
        assertScan(
            u'"z\\ud834\\udd20x"',
            u'z\U0001d120x')
        assertScan(
            u'"z\\ud834\\ud834\\udd20x"',
            u'z\ud834\U0001d120x')
        assertScan(
            u'"z\\ud834x"',
            u'z\ud834x')
        assertScan(
            u'"z\\udd20x"',
            u'z\udd20x')
        # A literal (unescaped) lone surrogate is passed through unchanged.
        assertScan(
            u'"z\ud834x"',
            u'z\ud834x')
        # The expected values below are built with join() instead of a
        # single literal; the gist describes the interpreter behaviour that
        # makes this necessary.
        # https://gist.github.com/etrepum/5538443
        assertScan(
            u'"z\\ud834\udd20x12345"',
            u''.join([u'z\ud834', u'\udd20x12345']))
        assertScan(
            u'"z\ud834\\udd20x"',
            u''.join([u'z\ud834', u'\udd20x']))
        # these have different behavior given UTF8 input, because the surrogate
        # pair may be joined (in maxunicode > 65535 builds)
        assertScan(
            u''.join([u'"z\ud834', u'\udd20x"']),
            u''.join([u'z\ud834', u'\udd20x']),
            test_utf8=False)

        # Escapes with a non-hexadecimal digit must be rejected.
        self.assertRaises(ValueError,
                          scanstring, u'"z\\ud83x"', 1, None, True)
        self.assertRaises(ValueError,
                          scanstring, u'"z\\ud834\\udd2x"', 1, None, True)
197