1 /*
2  * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
3  * Use of this file is governed by the BSD 3-clause license that
4  * can be found in the LICENSE.txt file in the project root.
5  */
6 
7 package org.antlr.v4.test.tool;
8 
9 import org.antlr.v4.gui.Trees;
10 import org.antlr.v4.runtime.ANTLRInputStream;
11 import org.antlr.v4.runtime.CharStream;
12 import org.antlr.v4.runtime.CharStreams;
13 import org.antlr.v4.runtime.CommonTokenStream;
14 import org.antlr.v4.runtime.LexerInterpreter;
15 import org.antlr.v4.runtime.tree.ParseTree;
16 import org.antlr.v4.tool.Grammar;
17 import org.antlr.v4.tool.GrammarParserInterpreter;
18 import org.junit.Test;
19 
20 import java.io.ByteArrayInputStream;
21 import java.io.InputStreamReader;
22 import java.nio.charset.StandardCharsets;
23 
24 import static org.junit.Assert.assertEquals;
25 
26 public class TestUnicodeGrammar extends BaseJavaToolTest {
27 	@Test
unicodeBMPLiteralInGrammar()28 	public void unicodeBMPLiteralInGrammar() throws Exception {
29 		String grammarText =
30 			"grammar Unicode;\n" +
31 			"r : 'hello' WORLD;\n" +
32 			"WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" +
33 			"WS : [ \\t\\r\\n]+ -> skip;\n";
34 		String inputText = "hello \u4E16\u754C";
35 		assertEquals(
36 				"(r:1 " + inputText + ")",
37 				parseTreeForGrammarWithInput(
38 						grammarText,
39 						"r",
40 						inputText));
41 	}
42 
43 	// TODO: This test cannot pass unless we change either the grammar
44 	// parser to decode surrogate pair literals to code points (which
45 	// would break existing clients) or to treat them as an
46 	// alternative:
47 	//
48 	// '\\uD83C\\uDF0D' -> ('\\u{1F30E}' | '\\uD83C\\uDF0D')
49 	//
50 	// but I worry that might cause parse ambiguity if we're not careful.
51 	//@Test
unicodeSurrogatePairLiteralInGrammar()52 	public void unicodeSurrogatePairLiteralInGrammar() throws Exception {
53 		String grammarText =
54 			"grammar Unicode;\n" +
55 			"r : 'hello' WORLD;\n" +
56 			"WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" +
57 			"WS : [ \\t\\r\\n]+ -> skip;\n";
58 		String inputText = new StringBuilder("hello ")
59 				.appendCodePoint(0x1F30E)
60 				.toString();
61 		assertEquals(
62 				"(r:1 " + inputText + ")",
63 				parseTreeForGrammarWithInput(
64 						grammarText,
65 						"r",
66 						inputText));
67 	}
68 
69 	@Test
unicodeSMPLiteralInGrammar()70 	public void unicodeSMPLiteralInGrammar() throws Exception {
71 		String grammarText =
72 			"grammar Unicode;\n" +
73 			"r : 'hello' WORLD;\n" +
74 			"WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" +
75 			"WS : [ \\t\\r\\n]+ -> skip;\n";
76 		String inputText = new StringBuilder("hello ")
77 				.appendCodePoint(0x1F30E)
78 				.toString();
79 		assertEquals(
80 				"(r:1 " + inputText + ")",
81 				parseTreeForGrammarWithInput(
82 						grammarText,
83 						"r",
84 						inputText));
85 	}
86 
87 	@Test
unicodeSMPRangeInGrammar()88 	public void unicodeSMPRangeInGrammar() throws Exception {
89 		String grammarText =
90 			"grammar Unicode;\n" +
91 			"r : 'hello' WORLD;\n" +
92 			"WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" +
93 			"WS : [ \\t\\r\\n]+ -> skip;\n";
94 		String inputText = new StringBuilder("hello ")
95 				.appendCodePoint(0x1F30E)
96 				.toString();
97 		assertEquals(
98 				"(r:1 " + inputText + ")",
99 				parseTreeForGrammarWithInput(
100 						grammarText,
101 						"r",
102 						inputText));
103 	}
104 
105 	@Test
matchingDanglingSurrogateInInput()106 	public void matchingDanglingSurrogateInInput() throws Exception {
107 		String grammarText =
108 			"grammar Unicode;\n" +
109 			"r : 'hello' WORLD;\n" +
110 			"WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" +
111 			"WS : [ \\t\\r\\n]+ -> skip;\n";
112 		String inputText = "hello \uD83C";
113 		assertEquals(
114 				"(r:1 " + inputText + ")",
115 				parseTreeForGrammarWithInput(
116 						grammarText,
117 						"r",
118 						inputText));
119 	}
120 
121 	@Test
binaryGrammar()122 	public void binaryGrammar() throws Exception {
123 		String grammarText =
124 			"grammar Binary;\n" +
125 			"r : HEADER PACKET+ FOOTER;\n" +
126 			"HEADER : '\\u0002\\u0000\\u0001\\u0007';\n" +
127 			"PACKET : '\\u00D0' ('\\u00D1' | '\\u00D2' | '\\u00D3') +;\n" +
128 			"FOOTER : '\\u00FF';\n";
129 		byte[] toParse = new byte[] {
130 				(byte)0x02, (byte)0x00, (byte)0x01, (byte)0x07,
131 				(byte)0xD0, (byte)0xD2, (byte)0xD2, (byte)0xD3, (byte)0xD3, (byte)0xD3,
132 				(byte)0xD0, (byte)0xD3, (byte)0xD3, (byte)0xD1,
133 				(byte)0xFF
134 		};
135 		CharStream charStream;
136 		try (ByteArrayInputStream is = new ByteArrayInputStream(toParse);
137 		     // Note we use ISO_8859_1 to treat all byte values as Unicode "characters" from
138 		     // U+0000 to U+00FF.
139 		     InputStreamReader isr = new InputStreamReader(is, StandardCharsets.ISO_8859_1)) {
140 			charStream = new ANTLRInputStream(isr);
141 		}
142 		Grammar grammar = new Grammar(grammarText);
143 		LexerInterpreter lexEngine = grammar.createLexerInterpreter(charStream);
144 		CommonTokenStream tokens = new CommonTokenStream(lexEngine);
145 		GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
146 		ParseTree parseTree = parser.parse(grammar.rules.get("r").index);
147 		InterpreterTreeTextProvider nodeTextProvider =
148 				new InterpreterTreeTextProvider(grammar.getRuleNames());
149 		String result = Trees.toStringTree(parseTree, nodeTextProvider);
150 
151 		assertEquals(
152 				"(r:1 \u0002\u0000\u0001\u0007 \u00D0\u00D2\u00D2\u00D3\u00D3\u00D3 \u00D0\u00D3\u00D3\u00D1 \u00FF)",
153 				result);
154 	}
155 
parseTreeForGrammarWithInput( String grammarText, String rootRule, String inputText)156 	private static String parseTreeForGrammarWithInput(
157 			String grammarText,
158 			String rootRule,
159 			String inputText) throws Exception {
160 		Grammar grammar = new Grammar(grammarText);
161 		LexerInterpreter lexEngine = grammar.createLexerInterpreter(
162 				CharStreams.fromString(inputText));
163 		CommonTokenStream tokens = new CommonTokenStream(lexEngine);
164 		GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
165 		ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
166 		InterpreterTreeTextProvider nodeTextProvider =
167 				new InterpreterTreeTextProvider(grammar.getRuleNames());
168 		return Trees.toStringTree(parseTree, nodeTextProvider);
169 	}
170 }
171