/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

package org.antlr.v4.test.tool;

import org.antlr.v4.gui.Trees;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CharStreams;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.LexerInterpreter;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.tool.Grammar;
import org.antlr.v4.tool.GrammarParserInterpreter;
import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import static org.junit.Assert.assertEquals;

/**
 * Tests that grammars containing Unicode escapes in lexer literals and ranges
 * are interpreted correctly against matching input.
 *
 * <p>Every test builds a {@link Grammar} from inline grammar text, runs the
 * lexer and parser interpreters over an input, and compares the resulting
 * parse tree (rendered as a string) with the expected text.
 */
public class TestUnicodeGrammar extends BaseJavaToolTest {
    /**
     * A lexer literal written with four-digit Unicode escapes for BMP
     * characters must match the same characters in the input.
     */
    @Test
    public void unicodeBMPLiteralInGrammar() throws Exception {
        String grammarText =
            "grammar Unicode;\n" +
            "r : 'hello' WORLD;\n" +
            "WORLD : ('world' | '\\u4E16\\u754C' | '\\u1000\\u1019\\u1039\\u1018\\u102C' );\n" +
            "WS : [ \\t\\r\\n]+ -> skip;\n";
        String inputText = "hello \u4E16\u754C";
        assertEquals(
            "(r:1 " + inputText + ")",
            parseTreeForGrammarWithInput(grammarText, "r", inputText));
    }

    // TODO: This test cannot pass unless we change either the grammar
    // parser to decode surrogate pair literals to code points (which
    // would break existing clients) or to treat them as an
    // alternative:
    //
    // '\\uD83C\\uDF0D' -> ('\\u{1F30D}' | '\\uD83C\\uDF0D')
    //
    // but I worry that might cause parse ambiguity if we're not careful.
    //@Test
    public void unicodeSurrogatePairLiteralInGrammar() throws Exception {
        String grammarText =
            "grammar Unicode;\n" +
            "r : 'hello' WORLD;\n" +
            "WORLD : ('\\uD83C\\uDF0D' | '\\uD83C\\uDF0E' | '\\uD83C\\uDF0F' );\n" +
            "WS : [ \\t\\r\\n]+ -> skip;\n";
        String inputText = new StringBuilder("hello ")
            .appendCodePoint(0x1F30E)
            .toString();
        assertEquals(
            "(r:1 " + inputText + ")",
            parseTreeForGrammarWithInput(grammarText, "r", inputText));
    }

    /**
     * A lexer literal written with the brace-delimited code point escape for
     * a supplementary-plane character must match that code point in the input.
     */
    @Test
    public void unicodeSMPLiteralInGrammar() throws Exception {
        String grammarText =
            "grammar Unicode;\n" +
            "r : 'hello' WORLD;\n" +
            "WORLD : ('\\u{1F30D}' | '\\u{1F30E}' | '\\u{1F30F}' );\n" +
            "WS : [ \\t\\r\\n]+ -> skip;\n";
        String inputText = new StringBuilder("hello ")
            .appendCodePoint(0x1F30E)
            .toString();
        assertEquals(
            "(r:1 " + inputText + ")",
            parseTreeForGrammarWithInput(grammarText, "r", inputText));
    }

    /**
     * A lexer range whose endpoints are supplementary-plane code points must
     * match a code point that lies inside the range.
     */
    @Test
    public void unicodeSMPRangeInGrammar() throws Exception {
        String grammarText =
            "grammar Unicode;\n" +
            "r : 'hello' WORLD;\n" +
            "WORLD : ('\\u{1F30D}'..'\\u{1F30F}' );\n" +
            "WS : [ \\t\\r\\n]+ -> skip;\n";
        String inputText = new StringBuilder("hello ")
            .appendCodePoint(0x1F30E)
            .toString();
        assertEquals(
            "(r:1 " + inputText + ")",
            parseTreeForGrammarWithInput(grammarText, "r", inputText));
    }

    /**
     * A lone (unpaired) high surrogate in the input must be matched by a
     * lexer literal that names the same surrogate value.
     */
    @Test
    public void matchingDanglingSurrogateInInput() throws Exception {
        String grammarText =
            "grammar Unicode;\n" +
            "r : 'hello' WORLD;\n" +
            "WORLD : ('\\uD83C' | '\\uD83D' | '\\uD83E' );\n" +
            "WS : [ \\t\\r\\n]+ -> skip;\n";
        String inputText = "hello \uD83C";
        assertEquals(
            "(r:1 " + inputText + ")",
            parseTreeForGrammarWithInput(grammarText, "r", inputText));
    }

    /**
     * Parses a byte sequence with a grammar whose tokens are defined over the
     * values U+0000 to U+00FF, treating each input byte as one character.
     */
    @Test
    public void binaryGrammar() throws Exception {
        String grammarText =
            "grammar Binary;\n" +
            "r : HEADER PACKET+ FOOTER;\n" +
            "HEADER : '\\u0002\\u0000\\u0001\\u0007';\n" +
            "PACKET : '\\u00D0' ('\\u00D1' | '\\u00D2' | '\\u00D3') +;\n" +
            "FOOTER : '\\u00FF';\n";
        byte[] toParse = new byte[] {
            (byte)0x02, (byte)0x00, (byte)0x01, (byte)0x07,
            (byte)0xD0, (byte)0xD2, (byte)0xD2, (byte)0xD3, (byte)0xD3, (byte)0xD3,
            (byte)0xD0, (byte)0xD3, (byte)0xD3, (byte)0xD1,
            (byte)0xFF
        };
        CharStream charStream;
        try (ByteArrayInputStream is = new ByteArrayInputStream(toParse);
             // Note we use ISO_8859_1 to treat all byte values as Unicode "characters" from
             // U+0000 to U+00FF.
             InputStreamReader isr = new InputStreamReader(is, StandardCharsets.ISO_8859_1)) {
            // CharStreams.fromReader replaces the deprecated ANTLRInputStream(Reader)
            // constructor and keeps this test on the same stream factory as the others.
            charStream = CharStreams.fromReader(isr);
        }

        assertEquals(
            "(r:1 \u0002\u0000\u0001\u0007 \u00D0\u00D2\u00D2\u00D3\u00D3\u00D3 \u00D0\u00D3\u00D3\u00D1 \u00FF)",
            parseTreeForGrammarWithInput(grammarText, "r", charStream));
    }

    /**
     * Interprets {@code inputText} with the given grammar and renders the
     * resulting parse tree as a string.
     *
     * @param grammarText source text of a combined grammar
     * @param rootRule name of the parser rule to start parsing from
     * @param inputText text to lex and parse
     * @return the parse tree rendered by {@link Trees#toStringTree}
     * @throws Exception if the grammar cannot be built or interpreted
     */
    private static String parseTreeForGrammarWithInput(
            String grammarText,
            String rootRule,
            String inputText) throws Exception {
        return parseTreeForGrammarWithInput(
            grammarText, rootRule, CharStreams.fromString(inputText));
    }

    /**
     * Interprets an already-constructed character stream with the given
     * grammar and renders the resulting parse tree as a string.
     *
     * @param grammarText source text of a combined grammar
     * @param rootRule name of the parser rule to start parsing from
     * @param input character stream to lex and parse
     * @return the parse tree rendered by {@link Trees#toStringTree}
     * @throws Exception if the grammar cannot be built or interpreted
     */
    private static String parseTreeForGrammarWithInput(
            String grammarText,
            String rootRule,
            CharStream input) throws Exception {
        Grammar grammar = new Grammar(grammarText);
        LexerInterpreter lexEngine = grammar.createLexerInterpreter(input);
        CommonTokenStream tokens = new CommonTokenStream(lexEngine);
        GrammarParserInterpreter parser = grammar.createGrammarParserInterpreter(tokens);
        ParseTree parseTree = parser.parse(grammar.rules.get(rootRule).index);
        InterpreterTreeTextProvider nodeTextProvider =
            new InterpreterTreeTextProvider(grammar.getRuleNames());
        return Trees.toStringTree(parseTree, nodeTextProvider);
    }
}