1--echo # Start of ctype_unescape.inc
2
3SET sql_mode = '';
4
5#
6# Testing how string literals with backslash and quote-quote are unescaped.
7# The tests assume that single quote (') is used as a delimiter.
8#
9
10#
11# Make sure that the parser really works using the character set we need.
12# We use binary strings to compose strings, to be able to test get malformed
13# sequences, which are possible as a result of mysql_real_escape_string().
14# The important thing for this test is to make the parser unescape using
15# the client character set, rather than binary. Currently it works exactly
16# that way by default, so the query below should return @@character_set_client
17#
18SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
19PREPARE stmt FROM @query;
20EXECUTE stmt;
21DEALLOCATE PREPARE stmt;
22
23let $CHARSET=`SELECT @@character_set_connection`;
24
25CREATE TABLE allbytes (a VARBINARY(10));
26
27#
28# Create various byte sequences to test. Testing the full banch of
29# possible combinations takes about 2 minutes. So this test provides
30# variants to run with:
31# - the full set of possible combinations
32# - a reduced test of combinations for selected bytes only
33#
34
35# Create selected byte combinations
36if ($ctype_unescape_combinations == 'selected')
37{
38--echo # Using selected bytes combinations
39--source include/bytes.inc
40#
41# Populate "selected_bytes" with bytes that have a special meaning.
42# We'll use "selected_bytes" to generate byte seqeunces,
43# instead of the full possible byte combinations, to reduce test time.
44#
45CREATE TABLE selected_bytes (a VARBINARY(10));
46
47# Bytes that have a special meaning in all character sets:
48# 0x00 - mysql_real_escape_string() quotes this to '\0'
49# 0x0D - mysql_real_escape_string() quotes this to '\r'
50# 0x0A - mysql_real_escape_string() quotes this to '\n'
51# 0x1A - mysql_real_escape_string() quotes this to '\Z'
52# 0x08 - mysql_real_escape_string() does not quote this,
53#        but '\b' is unescaped to 0x08.
54# 0x09 - mysql_real_escape_string() does not quote this,
55#        but '\t' is unescaped to 0x09.
56# 0x30 - '0', as in '\0'
57# 0x5A - 'Z', as in '\Z'
58# 0x62 - 'b', as in '\b'
59# 0x6E - 'n', as in '\n'
60# 0x72 - 't', as in '\r'
61# 0x74 - 't', as in '\t'
62
63INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
64INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
65
66# 0x22 - double quote
67# 0x25 - percent sign, '\%' is preserved as is for LIKE.
68# 0x27 - single quote
69# 0x5C - backslash
70# 0x5F - underscore, '\_' is preserved as is for LIKE.
71INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
72
73# Some bytes do not have any special meaning, for example basic Latin letters.
74# Let's add, one should be enough for a good enough coverage.
75INSERT INTO selected_bytes (a) VALUES ('a');
76
77#
78# This maps summarizes bytes that have a special
79# meaning in various character sets:
80#
81#          MBHEAD           MBTAIL            NONASCII-8BIT   BAD
82#          ------           ------           --------------   ----------
83#  big5:   [A1..F9]         [40..7E,A1..FE]   N/A             [80..A0,FA..FF]
84#  cp932:  [81..9F,E0..FC]  [40..7E,80..FC]   [A1..DF]        [FD..FF]
85#  gbk:    [81..FE]         [40..7E,80..FE]   N/A             [FF]
86#  sjis:   [81..9F,E0..FC]  [40..7E,80..FC]   [A1..DF]        [FD..FF]
87#  swe7:   N/A              N/A               [5B..5E,7B..7E] [80..FF]
88#
89
90INSERT INTO selected_bytes (a) VALUES
91(0x3F), # 7bit
92(0x40), # 7bit          mbtail
93(0x7E), # 7bit          mbtail  nonascii-8bit
94(0x7F), # 7bit                  nonascii-8bit
95(0x80), #               mbtail                 bad-mb
96(0x81), #       mbhead  mbtail
97(0x9F), #       mbhead  mbtail                 bad-mb
98(0xA0), #       mbhead  mbtail                 bad-mb
99(0xA1), #       mbhead  mbtail  nonascii-8bit
100(0xE0), #       mbhead  mbtai
101(0xEF), #       mbhead  mbtail
102(0xF9), #       mbhead  mbtail
103(0xFA), #       mbhead  mbtail                 bad-mb
104(0xFC), #       mbhead  mbtail                 bad-mb
105(0xFD), #       mbhead  mbtail                 bad-mb
106(0xFE), #       mbhead  mbtial                 bad-mb
107(0xFF); #                                      bad-mb
108
109#
110# Now populate the test table
111#
112
113# Use all single bytes, this is cheap, there are only 256 values.
114INSERT INTO allbytes (a) SELECT a FROM bytes;
115
116# Add selected bytes combinations
117INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
118INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
119INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
120DROP TABLE selected_bytes;
121
122# Delete all non-single byte sequences that do not have
123# backslashes or quotes at all. There is nothing special with these strings.
124DELETE FROM allbytes WHERE
125  OCTET_LENGTH(a)>1 AND
126  LOCATE(0x5C,a)=0 AND
127  a NOT LIKE '%\'%' AND
128  a NOT LIKE '%"%';
129
130}
131
132if ($ctype_unescape_combinations=='')
133{
134--echo # Using full byte combinations
135--source include/bytes2.inc
136INSERT INTO allbytes (a) SELECT a FROM bytes;
137INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
138INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
139INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
140}
141
142
143DELIMITER //;
144
145#
146# A procedure that make an SQL query using 'val' as a string literal.
147# The result of the query execution is written into the table 't1'.
148# NULL in t1.b means that query failed due to syntax error,
149# typically because of mis-interpreted closing quote delimiter.
150#
151CREATE PROCEDURE p1(val VARBINARY(10))
152BEGIN
153  DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
154  SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
155  PREPARE stmt FROM @query;
156  EXECUTE stmt;
157  DEALLOCATE PREPARE stmt;
158END//
159
160#
161# A procedure that iterates through all records in "allbytes".
162# And runs p1() for every record.
163#
164CREATE PROCEDURE p2()
165BEGIN
166  DECLARE val VARBINARY(10);
167  DECLARE done INT DEFAULT FALSE;
168  DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
169  DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
170  OPEN stmt;
171read_loop1: LOOP
172  FETCH stmt INTO val;
173  IF done THEN
174    LEAVE read_loop1;
175  END IF;
176  CALL p1(val);
177END LOOP;
178  CLOSE stmt;
179END//
180
181
182# A function that converts the value from binary to $CHARSET
183# and check if it has changed. CONVERT() fixes malformed strings.
184# So if the string changes in CONVERT(), it means it was not wellformed.
185--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
186
187#
188# A function that approximately reproduces how the SQL parser
189# would unescape a binary string.
190#
191CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
192BEGIN
193  # We need to do it in a way to avoid producing new escape sequences
194  # First, enclose all known escsape sequences to '{{xx}}'
195  # - Backslash not followed by a LIKE pattern characters _ and %
196  # - Double escapes
197  # This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
198  # So '\\1' in the last argument always means the match, no matter
199  # which alternative it came from.
200  SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
201  # Now unescape all enclosed standard escape sequences
202  SET a=REPLACE(a,'{{\\0}}', '\0');
203  SET a=REPLACE(a,'{{\\b}}', '\b');
204  SET a=REPLACE(a,'{{\\t}}', '\t');
205  SET a=REPLACE(a,'{{\\r}}', '\r');
206  SET a=REPLACE(a,'{{\\n}}', '\n');
207  SET a=REPLACE(a,'{{\\Z}}', '\Z');
208  SET a=REPLACE(a,'{{\\\'}}', '\'');
209  # Unescape double quotes
210  SET a=REPLACE(a,'{{\'\'}}', '\'');
211  # Unescape the rest: all other \x sequences mean just 'x'
212  SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
213  RETURN a;
214END//
215
216
217#
218# A function that checks what happened during unescaping.
219#
220# @param a - the value before unescaping
221# @param b - the value after unescaping
222#
223# The following return values are possible:
224# - SyntErr - b IS NULL, which means syntax error happened in p1().
225# - Preserv - the value was not modified during unescaping.
226#             This is possible if 0x5C was treated as mbtail.
227#             Or only LIKE escape sequences were found: '\_' and '\%'.
228# - Trivial - only 0x5C were removed.
229# - Regular - the value was unescaped like a binary string.
230#             Some standard escape sequences were found.
231#             No special multi-byte handling happened.
232# - Special - Something else happened. Should not happen.
233#
234CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
235BEGIN
236  RETURN CASE
237  WHEN b IS NULL THEN '[SyntErr]'
238  WHEN a=b THEN CASE
239    WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
240    WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
241    WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
242    ELSE '[Preserv][MB]' END
243  WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
244  WHEN UNESCAPE(a)=b THEN '[Regular]'
245  ELSE '[Special]' END;
246END//
247
248
249#
250# Check what happened with wellformedness during unescaping
251# @param a  - the value before unescaping
252# @param b  - the value after unescaping
253#
254# Returned values:
255# [FIXED] - the value was malformed and become wellformed after unescaping
256# [BROKE] - the value was wellformed and become malformed after unescaping
257# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
258# ''      - both values are wellformed
259#
260CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
261                               RETURNS VARBINARY(256)
262BEGIN
263  RETURN CASE
264  WHEN b IS NULL THEN ''
265  WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
266  WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
267  WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
268  ELSE ''
269  END;
270END//
271
272
273#
274# Check if the value could be generated by mysql_real_escape_string(),
275# or can only come from a direct user input.
276#
277# @param a - the value before unescaping
278#
279# Returns:
280# [USER] - if the value could not be generated by mysql_real_escape_string()
281# ''     - if the value was possibly generated by mysql_real_escape_string()
282#
283#
284CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
285                                                   RETURNS VARBINARY(256)
286BEGIN
287  DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
288  DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
289  DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
290  DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
291  DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
292  RETURN CASE
293  WHEN (a1=0x5C) AND
294       (a3=0x5C) AND
295       (a2>0x7F) AND
296       (a4 NOT IN ('_','%','0','t','r','n','Z')) AND
297       iswellformed(a2a4) THEN '[USER]'
298  ELSE ''
299  END;
300END//
301
302DELIMITER ;//
303
304
305CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
306CALL p2();
307# Avoid "Invalid XXX character string" warnings
308# We mark malformed strings in the output anyway
309--disable_warnings
310# All records marked with '[BAD]' mean that the string was unescaped
311# in a unexpected way, that means there is a bug in UNESCAPE() above.
312SELECT HEX(a),HEX(b),
313  CONCAT(unescape_type(a,b),
314         wellformedness(a,b),
315         mysql_real_escape_string_generated(a),
316         IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
317FROM t1 ORDER BY LENGTH(a),a;
318--enable_warnings
319DROP TABLE t1;
320DROP PROCEDURE p1;
321DROP PROCEDURE p2;
322DROP FUNCTION unescape;
323DROP FUNCTION unescape_type;
324DROP FUNCTION wellformedness;
325DROP FUNCTION mysql_real_escape_string_generated;
326DROP FUNCTION iswellformed;
327DROP TABLE allbytes;
328SET sql_mode = DEFAULT;
329
330--echo # End of ctype_backslash.inc
331