1 /* 2 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. 3 * Use of this file is governed by the BSD 3-clause license that 4 * can be found in the LICENSE.txt file in the project root. 5 */ 6 7 package org.antlr.v4.runtime.atn; 8 9 import org.antlr.v4.runtime.Token; 10 import org.antlr.v4.runtime.misc.IntegerList; 11 import org.antlr.v4.runtime.misc.Interval; 12 import org.antlr.v4.runtime.misc.IntervalSet; 13 import org.antlr.v4.runtime.misc.Utils; 14 15 import java.io.InvalidClassException; 16 import java.util.ArrayList; 17 import java.util.Collection; 18 import java.util.HashMap; 19 import java.util.LinkedHashMap; 20 import java.util.List; 21 import java.util.Locale; 22 import java.util.Map; 23 import java.util.UUID; 24 25 public class ATNSerializer { 26 public ATN atn; 27 private List<String> tokenNames; 28 29 private interface CodePointSerializer { serializeCodePoint(IntegerList data, int cp)30 void serializeCodePoint(IntegerList data, int cp); 31 } 32 ATNSerializer(ATN atn)33 public ATNSerializer(ATN atn) { 34 assert atn.grammarType != null; 35 this.atn = atn; 36 } 37 ATNSerializer(ATN atn, List<String> tokenNames)38 public ATNSerializer(ATN atn, List<String> tokenNames) { 39 assert atn.grammarType != null; 40 this.atn = atn; 41 this.tokenNames = tokenNames; 42 } 43 44 /** Serialize state descriptors, edge descriptors, and decision→state map 45 * into list of ints: 46 * 47 * grammar-type, (ANTLRParser.LEXER, ...) 48 * max token type, 49 * num states, 50 * state-0-type ruleIndex, state-1-type ruleIndex, ... state-i-type ruleIndex optional-arg ... 51 * num rules, 52 * rule-1-start-state rule-1-args, rule-2-start-state rule-2-args, ... 53 * (args are token type,actionIndex in lexer else 0,0) 54 * num modes, 55 * mode-0-start-state, mode-1-start-state, ... (parser has 0 modes) 56 * num unicode-bmp-sets 57 * bmp-set-0-interval-count intervals, bmp-set-1-interval-count intervals, ... 58 * num unicode-smp-sets 59 * smp-set-0-interval-count intervals, smp-set-1-interval-count intervals, ... 60 * num total edges, 61 * src, trg, edge-type, edge arg1, optional edge arg2 (present always), ... 62 * num decisions, 63 * decision-0-start-state, decision-1-start-state, ... 64 * 65 * Convenient to pack into unsigned shorts to make as Java string. 66 */ serialize()67 public IntegerList serialize() { 68 IntegerList data = new IntegerList(); 69 data.add(ATNDeserializer.SERIALIZED_VERSION); 70 serializeUUID(data, ATNDeserializer.SERIALIZED_UUID); 71 72 // convert grammar type to ATN const to avoid dependence on ANTLRParser 73 data.add(atn.grammarType.ordinal()); 74 data.add(atn.maxTokenType); 75 int nedges = 0; 76 77 // Note that we use a LinkedHashMap as a set to 78 // maintain insertion order while deduplicating 79 // entries with the same key. 80 Map<IntervalSet, Boolean> sets = new LinkedHashMap<>(); 81 82 // dump states, count edges and collect sets while doing so 83 IntegerList nonGreedyStates = new IntegerList(); 84 IntegerList precedenceStates = new IntegerList(); 85 data.add(atn.states.size()); 86 for (ATNState s : atn.states) { 87 if ( s==null ) { // might be optimized away 88 data.add(ATNState.INVALID_TYPE); 89 continue; 90 } 91 92 int stateType = s.getStateType(); 93 if (s instanceof DecisionState && ((DecisionState)s).nonGreedy) { 94 nonGreedyStates.add(s.stateNumber); 95 } 96 97 if (s instanceof RuleStartState && ((RuleStartState)s).isLeftRecursiveRule) { 98 precedenceStates.add(s.stateNumber); 99 } 100 101 data.add(stateType); 102 103 if (s.ruleIndex == -1) { 104 data.add(Character.MAX_VALUE); 105 } 106 else { 107 data.add(s.ruleIndex); 108 } 109 110 if ( s.getStateType() == ATNState.LOOP_END ) { 111 data.add(((LoopEndState)s).loopBackState.stateNumber); 112 } 113 else if ( s instanceof BlockStartState ) { 114 data.add(((BlockStartState)s).endState.stateNumber); 115 } 116 117 if (s.getStateType() != ATNState.RULE_STOP) { 118 // the deserializer can trivially derive these edges, so there's no need to serialize them 119 nedges += s.getNumberOfTransitions(); 120 } 121 122 for (int i=0; i<s.getNumberOfTransitions(); i++) { 123 Transition t = s.transition(i); 124 int edgeType = Transition.serializationTypes.get(t.getClass()); 125 if ( edgeType == Transition.SET || edgeType == Transition.NOT_SET ) { 126 SetTransition st = (SetTransition)t; 127 sets.put(st.set, true); 128 } 129 } 130 } 131 132 // non-greedy states 133 data.add(nonGreedyStates.size()); 134 for (int i = 0; i < nonGreedyStates.size(); i++) { 135 data.add(nonGreedyStates.get(i)); 136 } 137 138 // precedence states 139 data.add(precedenceStates.size()); 140 for (int i = 0; i < precedenceStates.size(); i++) { 141 data.add(precedenceStates.get(i)); 142 } 143 144 int nrules = atn.ruleToStartState.length; 145 data.add(nrules); 146 for (int r=0; r<nrules; r++) { 147 ATNState ruleStartState = atn.ruleToStartState[r]; 148 data.add(ruleStartState.stateNumber); 149 if (atn.grammarType == ATNType.LEXER) { 150 if (atn.ruleToTokenType[r] == Token.EOF) { 151 data.add(Character.MAX_VALUE); 152 } 153 else { 154 data.add(atn.ruleToTokenType[r]); 155 } 156 } 157 } 158 159 int nmodes = atn.modeToStartState.size(); 160 data.add(nmodes); 161 if ( nmodes>0 ) { 162 for (ATNState modeStartState : atn.modeToStartState) { 163 data.add(modeStartState.stateNumber); 164 } 165 } 166 List<IntervalSet> bmpSets = new ArrayList<>(); 167 List<IntervalSet> smpSets = new ArrayList<>(); 168 for (IntervalSet set : sets.keySet()) { 169 if (set.getMaxElement() <= Character.MAX_VALUE) { 170 bmpSets.add(set); 171 } 172 else { 173 smpSets.add(set); 174 } 175 } 176 serializeSets( 177 data, 178 bmpSets, 179 new CodePointSerializer() { 180 @Override 181 public void serializeCodePoint(IntegerList data, int cp) { 182 data.add(cp); 183 } 184 }); 185 serializeSets( 186 data, 187 smpSets, 188 new CodePointSerializer() { 189 @Override 190 public void serializeCodePoint(IntegerList data, int cp) { 191 serializeInt(data, cp); 192 } 193 }); 194 Map<IntervalSet, Integer> setIndices = new HashMap<>(); 195 int setIndex = 0; 196 for (IntervalSet bmpSet : bmpSets) { 197 setIndices.put(bmpSet, setIndex++); 198 } 199 for (IntervalSet smpSet : smpSets) { 200 setIndices.put(smpSet, setIndex++); 201 } 202 203 data.add(nedges); 204 for (ATNState s : atn.states) { 205 if ( s==null ) { 206 // might be optimized away 207 continue; 208 } 209 210 if (s.getStateType() == ATNState.RULE_STOP) { 211 continue; 212 } 213 214 for (int i=0; i<s.getNumberOfTransitions(); i++) { 215 Transition t = s.transition(i); 216 217 if (atn.states.get(t.target.stateNumber) == null) { 218 throw new IllegalStateException("Cannot serialize a transition to a removed state."); 219 } 220 221 int src = s.stateNumber; 222 int trg = t.target.stateNumber; 223 int edgeType = Transition.serializationTypes.get(t.getClass()); 224 int arg1 = 0; 225 int arg2 = 0; 226 int arg3 = 0; 227 switch ( edgeType ) { 228 case Transition.RULE : 229 trg = ((RuleTransition)t).followState.stateNumber; 230 arg1 = ((RuleTransition)t).target.stateNumber; 231 arg2 = ((RuleTransition)t).ruleIndex; 232 arg3 = ((RuleTransition)t).precedence; 233 break; 234 case Transition.PRECEDENCE: 235 PrecedencePredicateTransition ppt = (PrecedencePredicateTransition)t; 236 arg1 = ppt.precedence; 237 break; 238 case Transition.PREDICATE : 239 PredicateTransition pt = (PredicateTransition)t; 240 arg1 = pt.ruleIndex; 241 arg2 = pt.predIndex; 242 arg3 = pt.isCtxDependent ? 1 : 0 ; 243 break; 244 case Transition.RANGE : 245 arg1 = ((RangeTransition)t).from; 246 arg2 = ((RangeTransition)t).to; 247 if (arg1 == Token.EOF) { 248 arg1 = 0; 249 arg3 = 1; 250 } 251 252 break; 253 case Transition.ATOM : 254 arg1 = ((AtomTransition)t).label; 255 if (arg1 == Token.EOF) { 256 arg1 = 0; 257 arg3 = 1; 258 } 259 260 break; 261 case Transition.ACTION : 262 ActionTransition at = (ActionTransition)t; 263 arg1 = at.ruleIndex; 264 arg2 = at.actionIndex; 265 if (arg2 == -1) { 266 arg2 = 0xFFFF; 267 } 268 269 arg3 = at.isCtxDependent ? 1 : 0 ; 270 break; 271 case Transition.SET : 272 arg1 = setIndices.get(((SetTransition)t).set); 273 break; 274 case Transition.NOT_SET : 275 arg1 = setIndices.get(((SetTransition)t).set); 276 break; 277 case Transition.WILDCARD : 278 break; 279 } 280 281 data.add(src); 282 data.add(trg); 283 data.add(edgeType); 284 data.add(arg1); 285 data.add(arg2); 286 data.add(arg3); 287 } 288 } 289 290 int ndecisions = atn.decisionToState.size(); 291 data.add(ndecisions); 292 for (DecisionState decStartState : atn.decisionToState) { 293 data.add(decStartState.stateNumber); 294 } 295 296 // 297 // LEXER ACTIONS 298 // 299 if (atn.grammarType == ATNType.LEXER) { 300 data.add(atn.lexerActions.length); 301 for (LexerAction action : atn.lexerActions) { 302 data.add(action.getActionType().ordinal()); 303 switch (action.getActionType()) { 304 case CHANNEL: 305 int channel = ((LexerChannelAction)action).getChannel(); 306 data.add(channel != -1 ? channel : 0xFFFF); 307 data.add(0); 308 break; 309 310 case CUSTOM: 311 int ruleIndex = ((LexerCustomAction)action).getRuleIndex(); 312 int actionIndex = ((LexerCustomAction)action).getActionIndex(); 313 data.add(ruleIndex != -1 ? ruleIndex : 0xFFFF); 314 data.add(actionIndex != -1 ? actionIndex : 0xFFFF); 315 break; 316 317 case MODE: 318 int mode = ((LexerModeAction)action).getMode(); 319 data.add(mode != -1 ? mode : 0xFFFF); 320 data.add(0); 321 break; 322 323 case MORE: 324 data.add(0); 325 data.add(0); 326 break; 327 328 case POP_MODE: 329 data.add(0); 330 data.add(0); 331 break; 332 333 case PUSH_MODE: 334 mode = ((LexerPushModeAction)action).getMode(); 335 data.add(mode != -1 ? mode : 0xFFFF); 336 data.add(0); 337 break; 338 339 case SKIP: 340 data.add(0); 341 data.add(0); 342 break; 343 344 case TYPE: 345 int type = ((LexerTypeAction)action).getType(); 346 data.add(type != -1 ? type : 0xFFFF); 347 data.add(0); 348 break; 349 350 default: 351 String message = String.format(Locale.getDefault(), "The specified lexer action type %s is not valid.", action.getActionType()); 352 throw new IllegalArgumentException(message); 353 } 354 } 355 } 356 357 // Note: This value shifting loop is documented in ATNDeserializer. 358 // don't adjust the first value since that's the version number 359 for (int i = 1; i < data.size(); i++) { 360 if (data.get(i) < Character.MIN_VALUE || data.get(i) > Character.MAX_VALUE) { 361 throw new UnsupportedOperationException("Serialized ATN data element "+ 362 data.get(i)+ 363 " element "+i+" out of range "+ 364 (int)Character.MIN_VALUE+ 365 ".."+ 366 (int)Character.MAX_VALUE); 367 } 368 369 int value = (data.get(i) + 2) & 0xFFFF; 370 data.set(i, value); 371 } 372 373 return data; 374 } 375 serializeSets( IntegerList data, Collection<IntervalSet> sets, CodePointSerializer codePointSerializer)376 private static void serializeSets( 377 IntegerList data, 378 Collection<IntervalSet> sets, 379 CodePointSerializer codePointSerializer) 380 { 381 int nSets = sets.size(); 382 data.add(nSets); 383 384 for (IntervalSet set : sets) { 385 boolean containsEof = set.contains(Token.EOF); 386 if (containsEof && set.getIntervals().get(0).b == Token.EOF) { 387 data.add(set.getIntervals().size() - 1); 388 } 389 else { 390 data.add(set.getIntervals().size()); 391 } 392 393 data.add(containsEof ? 1 : 0); 394 for (Interval I : set.getIntervals()) { 395 if (I.a == Token.EOF) { 396 if (I.b == Token.EOF) { 397 continue; 398 } 399 else { 400 codePointSerializer.serializeCodePoint(data, 0); 401 } 402 } 403 else { 404 codePointSerializer.serializeCodePoint(data, I.a); 405 } 406 407 codePointSerializer.serializeCodePoint(data, I.b); 408 } 409 } 410 } 411 decode(char[] data)412 public String decode(char[] data) { 413 data = data.clone(); 414 // don't adjust the first value since that's the version number 415 for (int i = 1; i < data.length; i++) { 416 data[i] = (char)(data[i] - 2); 417 } 418 419 StringBuilder buf = new StringBuilder(); 420 int p = 0; 421 int version = ATNDeserializer.toInt(data[p++]); 422 if (version != ATNDeserializer.SERIALIZED_VERSION) { 423 String reason = String.format("Could not deserialize ATN with version %d (expected %d).", version, ATNDeserializer.SERIALIZED_VERSION); 424 throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); 425 } 426 427 UUID uuid = ATNDeserializer.toUUID(data, p); 428 p += 8; 429 if (!uuid.equals(ATNDeserializer.SERIALIZED_UUID)) { 430 String reason = String.format(Locale.getDefault(), "Could not deserialize ATN with UUID %s (expected %s).", uuid, ATNDeserializer.SERIALIZED_UUID); 431 throw new UnsupportedOperationException(new InvalidClassException(ATN.class.getName(), reason)); 432 } 433 434 p++; // skip grammarType 435 int maxType = ATNDeserializer.toInt(data[p++]); 436 buf.append("max type ").append(maxType).append("\n"); 437 int nstates = ATNDeserializer.toInt(data[p++]); 438 for (int i=0; i<nstates; i++) { 439 int stype = ATNDeserializer.toInt(data[p++]); 440 if ( stype==ATNState.INVALID_TYPE ) continue; // ignore bad type of states 441 int ruleIndex = ATNDeserializer.toInt(data[p++]); 442 if (ruleIndex == Character.MAX_VALUE) { 443 ruleIndex = -1; 444 } 445 446 String arg = ""; 447 if ( stype == ATNState.LOOP_END ) { 448 int loopBackStateNumber = ATNDeserializer.toInt(data[p++]); 449 arg = " "+loopBackStateNumber; 450 } 451 else if ( stype == ATNState.PLUS_BLOCK_START || stype == ATNState.STAR_BLOCK_START || stype == ATNState.BLOCK_START ) { 452 int endStateNumber = ATNDeserializer.toInt(data[p++]); 453 arg = " "+endStateNumber; 454 } 455 buf.append(i).append(":") 456 .append(ATNState.serializationNames.get(stype)).append(" ") 457 .append(ruleIndex).append(arg).append("\n"); 458 } 459 // this code is meant to model the form of ATNDeserializer.deserialize, 460 // since both need to be updated together whenever a change is made to 461 // the serialization format. The "dead" code is only used in debugging 462 // and testing scenarios, so the form you see here was kept for 463 // improved maintainability. 464 // start 465 int numNonGreedyStates = ATNDeserializer.toInt(data[p++]); 466 for (int i = 0; i < numNonGreedyStates; i++) { 467 int stateNumber = ATNDeserializer.toInt(data[p++]); 468 } 469 int numPrecedenceStates = ATNDeserializer.toInt(data[p++]); 470 for (int i = 0; i < numPrecedenceStates; i++) { 471 int stateNumber = ATNDeserializer.toInt(data[p++]); 472 } 473 // finish 474 int nrules = ATNDeserializer.toInt(data[p++]); 475 for (int i=0; i<nrules; i++) { 476 int s = ATNDeserializer.toInt(data[p++]); 477 if (atn.grammarType == ATNType.LEXER) { 478 int arg1 = ATNDeserializer.toInt(data[p++]); 479 buf.append("rule ").append(i).append(":").append(s).append(" ").append(arg1).append('\n'); 480 } 481 else { 482 buf.append("rule ").append(i).append(":").append(s).append('\n'); 483 } 484 } 485 int nmodes = ATNDeserializer.toInt(data[p++]); 486 for (int i=0; i<nmodes; i++) { 487 int s = ATNDeserializer.toInt(data[p++]); 488 buf.append("mode ").append(i).append(":").append(s).append('\n'); 489 } 490 int numBMPSets = ATNDeserializer.toInt(data[p++]); 491 p = appendSets(buf, data, p, numBMPSets, 0, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_BMP)); 492 int numSMPSets = ATNDeserializer.toInt(data[p++]); 493 p = appendSets(buf, data, p, numSMPSets, numBMPSets, ATNDeserializer.getUnicodeDeserializer(ATNDeserializer.UnicodeDeserializingMode.UNICODE_SMP)); 494 int nedges = ATNDeserializer.toInt(data[p++]); 495 for (int i=0; i<nedges; i++) { 496 int src = ATNDeserializer.toInt(data[p]); 497 int trg = ATNDeserializer.toInt(data[p + 1]); 498 int ttype = ATNDeserializer.toInt(data[p + 2]); 499 int arg1 = ATNDeserializer.toInt(data[p + 3]); 500 int arg2 = ATNDeserializer.toInt(data[p + 4]); 501 int arg3 = ATNDeserializer.toInt(data[p + 5]); 502 buf.append(src).append("->").append(trg) 503 .append(" ").append(Transition.serializationNames.get(ttype)) 504 .append(" ").append(arg1).append(",").append(arg2).append(",").append(arg3) 505 .append("\n"); 506 p += 6; 507 } 508 int ndecisions = ATNDeserializer.toInt(data[p++]); 509 for (int i=0; i<ndecisions; i++) { 510 int s = ATNDeserializer.toInt(data[p++]); 511 buf.append(i).append(":").append(s).append("\n"); 512 } 513 if (atn.grammarType == ATNType.LEXER) { 514 // this code is meant to model the form of ATNDeserializer.deserialize, 515 // since both need to be updated together whenever a change is made to 516 // the serialization format. The "dead" code is only used in debugging 517 // and testing scenarios, so the form you see here was kept for 518 // improved maintainability. 519 int lexerActionCount = ATNDeserializer.toInt(data[p++]); 520 for (int i = 0; i < lexerActionCount; i++) { 521 LexerActionType actionType = LexerActionType.values()[ATNDeserializer.toInt(data[p++])]; 522 int data1 = ATNDeserializer.toInt(data[p++]); 523 int data2 = ATNDeserializer.toInt(data[p++]); 524 } 525 } 526 return buf.toString(); 527 } 528 appendSets(StringBuilder buf, char[] data, int p, int nsets, int setIndexOffset, ATNDeserializer.UnicodeDeserializer unicodeDeserializer)529 private int appendSets(StringBuilder buf, char[] data, int p, int nsets, int setIndexOffset, ATNDeserializer.UnicodeDeserializer unicodeDeserializer) { 530 for (int i=0; i<nsets; i++) { 531 int nintervals = ATNDeserializer.toInt(data[p++]); 532 buf.append(i+setIndexOffset).append(":"); 533 boolean containsEof = data[p++] != 0; 534 if (containsEof) { 535 buf.append(getTokenName(Token.EOF)); 536 } 537 538 for (int j=0; j<nintervals; j++) { 539 if ( containsEof || j>0 ) { 540 buf.append(", "); 541 } 542 543 int a = unicodeDeserializer.readUnicode(data, p); 544 p += unicodeDeserializer.size(); 545 int b = unicodeDeserializer.readUnicode(data, p); 546 p += unicodeDeserializer.size(); 547 buf.append(getTokenName(a)).append("..").append(getTokenName(b)); 548 } 549 buf.append("\n"); 550 } 551 return p; 552 } 553 getTokenName(int t)554 public String getTokenName(int t) { 555 if ( t==-1 ) return "EOF"; 556 557 if ( atn.grammarType == ATNType.LEXER && 558 t >= Character.MIN_VALUE && t <= Character.MAX_VALUE ) 559 { 560 switch (t) { 561 case '\n': 562 return "'\\n'"; 563 case '\r': 564 return "'\\r'"; 565 case '\t': 566 return "'\\t'"; 567 case '\b': 568 return "'\\b'"; 569 case '\f': 570 return "'\\f'"; 571 case '\\': 572 return "'\\\\'"; 573 case '\'': 574 return "'\\''"; 575 default: 576 if ( Character.UnicodeBlock.of((char)t)==Character.UnicodeBlock.BASIC_LATIN && 577 !Character.isISOControl((char)t) ) { 578 return '\''+Character.toString((char)t)+'\''; 579 } 580 // turn on the bit above max "\uFFFF" value so that we pad with zeros 581 // then only take last 4 digits 582 String hex = Integer.toHexString(t|0x10000).toUpperCase().substring(1,5); 583 String unicodeStr = "'\\u"+hex+"'"; 584 return unicodeStr; 585 } 586 } 587 588 if (tokenNames != null && t >= 0 && t < tokenNames.size()) { 589 return tokenNames.get(t); 590 } 591 592 return String.valueOf(t); 593 } 594 595 /** Used by Java target to encode short/int array as chars in string. */ getSerializedAsString(ATN atn)596 public static String getSerializedAsString(ATN atn) { 597 return new String(getSerializedAsChars(atn)); 598 } 599 getSerialized(ATN atn)600 public static IntegerList getSerialized(ATN atn) { 601 return new ATNSerializer(atn).serialize(); 602 } 603 getSerializedAsChars(ATN atn)604 public static char[] getSerializedAsChars(ATN atn) { 605 return Utils.toCharArray(getSerialized(atn)); 606 } 607 getDecoded(ATN atn, List<String> tokenNames)608 public static String getDecoded(ATN atn, List<String> tokenNames) { 609 IntegerList serialized = getSerialized(atn); 610 char[] data = Utils.toCharArray(serialized); 611 return new ATNSerializer(atn, tokenNames).decode(data); 612 } 613 serializeUUID(IntegerList data, UUID uuid)614 private void serializeUUID(IntegerList data, UUID uuid) { 615 serializeLong(data, uuid.getLeastSignificantBits()); 616 serializeLong(data, uuid.getMostSignificantBits()); 617 } 618 serializeLong(IntegerList data, long value)619 private void serializeLong(IntegerList data, long value) { 620 serializeInt(data, (int)value); 621 serializeInt(data, (int)(value >> 32)); 622 } 623 serializeInt(IntegerList data, int value)624 private void serializeInt(IntegerList data, int value) { 625 data.add((char)value); 626 data.add((char)(value >> 16)); 627 } 628 } 629