options {
    STATIC=false;
    FORCE_LA_CHECK=true;
}

PARSER_BEGIN(NomParser)
/* Copyright (C) 2003-2007  University of Manchester
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 * (or see http://www.gnu.org/copyleft/lesser.html)
 */
package org.openscience.cdk.iupac.parser;

import java.io.StringReader;
import org.openscience.cdk.DefaultChemObjectBuilder;
import org.openscience.cdk.interfaces.IAtomContainer;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.exception.*;
import java.util.*;

/**
 * A class partly generated by <a href="http://javacc.dev.java.net" target="_top">JavaCC</a> which breaks down the chemical name
 * into computable subparts and passes these parts to the MoleculeBuilder.
 *
 * <p>The parse state below is held in static fields that are reset at the start of
 * every call to {@link #generate(String, IChemObjectBuilder)}; concurrent calls
 * therefore share that state.
 *
 * @author David Robinson (University of Manchester)
 * @author Bhupinder Sandhu
 * @author Stephen Tomkinson
 *
 * @cdk.keyword IUPAC name
 * @deprecated The OPSIN (<a href="http://opsin.ch.cam.ac.uk/">http://opsin.ch.cam.ac.uk/</a>) tool
 * offers a more comprehensive alternative to this parser's functionality.
 */
public class NomParser {
    //private variables needed throughout the program

    /** Length of the main chain, as computed from its chain prefix. */
    private static int mainChainPrefix;

    /** Whether a "cyclo" token was seen in front of the main chain. */
    private static boolean isMainCyclic;

    /** The temporary list of locations the current group/substituent is attached to */
    private static List<Token> tempLocation;

    /** The list of attached functional groups, with an instance of AttachedGroup for each
     * functional group.
     */
    private static List<AttachedGroup> attachedGroups;

    /** The list of attached substituents, with an instance of AttachedGroup for each
     * substituent.
     */
    private static List<AttachedGroup> attachedSubstituents;

    /**
     * Used in the build up of ancient Greek style prefixes
     */
    private static int currentNumber;

    /**
     * Parses the chemical name and returns the built molecule. It uses
     * the {@link DefaultChemObjectBuilder} to create a data model.
     *
     * @param stringToParse A case-insensitive name of the chemical to build.
     * @return A molecule which represents the interpretation of the name.
     * @throws ParseException Any error which occur in the parsing get wrapped
     *                        up in a ParseException and thrown.
     * @throws CDKException Propagated from the underlying
     *                      {@link #generate(String, IChemObjectBuilder)} call.
     */
    public static IAtomContainer generate (String stringToParse) throws ParseException, CDKException {
        return generate(stringToParse, DefaultChemObjectBuilder.getInstance());
    }

    /**
     * Parses the chemical name and returns the built molecule. It uses the
     * given {@link IChemObjectBuilder} to create a data model.
     *
     * @param stringToParse A case-insensitive name of the chemical to build.
     * @param builder       The builder handed to the MoleculeBuilder that assembles the result.
     * @return A molecule which represents the interpretation of the name.
     * @throws ParseException Any error which occur in the parsing get wrapped
     *                        up in a ParseException and thrown.
     * @throws CDKException Declared for the molecule building step.
     */
    public static IAtomContainer generate (String stringToParse, IChemObjectBuilder builder) throws ParseException, CDKException
    {
        // Reset all shared parse state so nothing leaks in from a previous call.
        mainChainPrefix = 0;
        isMainCyclic = false;
        tempLocation = new ArrayList<Token>();
        attachedSubstituents = new ArrayList<AttachedGroup>();
        attachedGroups = new ArrayList<AttachedGroup>();
        currentNumber = 0;

        // All tokens are lower case ASCII literals, so lower-case with a fixed locale:
        // the default-locale variant would map 'I' to a dotless i under e.g. a Turkish
        // locale and make names such as "IODO..." unparsable.
        // The trailing newline provides the <EOL> token that terminates the name.
        StringReader stringReader = new StringReader (stringToParse.toLowerCase(Locale.ENGLISH) + "\n");
        NomParser parser = new NomParser (stringReader);
        parser.completeChemicalName();

        //Scan substituents for a too high connection point
        checkConnections (attachedSubstituents.iterator());

        //Scan functional groups for a too high connection point
        checkConnections (attachedGroups.iterator());

        MoleculeBuilder moleculeBuilder = new MoleculeBuilder(builder);
        IAtomContainer returnedMolecule = moleculeBuilder.buildMolecule(mainChainPrefix, attachedSubstituents, attachedGroups, isMainCyclic, stringToParse);

        return returnedMolecule;
    }

    /**
     * Checks to ensure that all groups/substituents attached to the main chain
     * are connected to a valid atom which occurs on the main chain.
     *
     * @param vectorIterator An iterator which provides instances of AttachedGroup to check
     * @throws ParseException A tailored instance of ParseException so Nomen can display
     *                        the error to the user. Raised when a location is larger than the
     *                        main chain length or is not representable as an int.
     */
    private static void checkConnections (Iterator<AttachedGroup> vectorIterator) throws ParseException
    {
        while (vectorIterator.hasNext())
        {
            AttachedGroup ag = vectorIterator.next();

            for (Token tok : ag.getLocations())
            {
                int position;
                try
                {
                    position = Integer.parseInt(tok.image);
                }
                catch (NumberFormatException nfe)
                {
                    // Location tokens are either digit runs (<CONSTANT>) or the "-1" marker,
                    // so this only happens for a digit run that overflows an int. Such a
                    // location is certainly beyond the main chain: reject it instead of
                    // silently letting it through.
                    String mesg = "invalid attachment point " + tok.image + " must be below " + (mainChainPrefix + 1);
                    throw new ParseException(mesg);
                }

                if (position > mainChainPrefix)
                {
                    String mesg = "invalid attachment point " + position + " must be below " + (mainChainPrefix + 1);
                    throw new ParseException(mesg);
                }
            }
        }
    }
}

PARSER_END(NomParser)

JAVACODE
/**
 * Stores "head tokens", the substituent prefix, in the list of substituent
 * AttachedGroup objects and starts a fresh location list.
 */
void AddHeadToken() {
    attachedSubstituents.add (new AttachedGroup (tempLocation, currentNumber) );
    tempLocation = new ArrayList<Token>();
}

JAVACODE
/**
 * Records the chain parsed so far as a substituent (using the pending locations)
 * and clears the main chain length and the number accumulator.
 */
void MakeMainChainIntoSubstituent() {
    attachedSubstituents.add (new AttachedGroup (tempLocation, mainChainPrefix) );
    currentNumber = 0;
    mainChainPrefix = 0;
    tempLocation = new ArrayList<Token>();
}

JAVACODE
/**
 * Stores the functional group named by the most recently consumed token in the
 * list of functional group AttachedGroup objects and starts a fresh location list.
 */
void AddFunGroup() {
    Token tok;
    tok = getToken(-1);
    attachedGroups.add (new AttachedGroup (tempLocation, tok.image) );
    tempLocation = new ArrayList<Token>();
}

JAVACODE
/**
 * Stores the functional group position, the number of the atom it
 * connects to (the most recently consumed token), in the pending location list.
 */
void AddFunGroupPos() {
    Token tok;
    tok = getToken(-1);
    tempLocation.add(tok);
}

JAVACODE
/**
 * Adds to the pending location list a location of -1 to indicate no location was
 * specified.
 */
void AddUnknownFunGroupPos() {
    Token tok = new Token();
    tok.image = "-1";
    tempLocation.add(tok);
}

JAVACODE
/**
 * Store the mainChainPrefix token, the chain prefix of the longest carbon chain,
 * and clear the number accumulator.
 */
void AddMainChain() {
    mainChainPrefix = currentNumber;
    currentNumber = 0;
}

JAVACODE
/**
 * Sets the main chain to be cyclic.
 */
void SetMainCyclic() {
    isMainCyclic = true;
}

TOKEN :
{
    < EOL: "\n" | "\r" >
}

TOKEN : /*NUMBERS*/
{
    < CONSTANT: ( <DIGIT> )+ >
|   < #DIGIT: ["0" - "9"] >
}

TOKEN : /*NUMBER CONNECTORS*/
{
    < DASH: "-" >
|   < COMMA: "," >
}

/**
 * Initial small numbers.
 */
TOKEN :
{
    < METH: "meth" >
|   < ETH: "eth" >
|   < PROP: "prop" >
|   < BUT: "but" >
}

/**
 * Other special cases.
 */
TOKEN :
{
    < UNDEC: "undec" >
|   < EICOS: "eicos" | "icos" >
|   < HENICOS: "henicos" >
}

/**
 * Usual numbers for base 10 numbering.
 */
TOKEN :
{
    < HEN: "hen" >
|   < DO: "do" >
|   < TRI: "tri" >
|   < TETR: "tetra" >
|   < PENT: "pent" >
|   < HEX: "hex" >
|   < HEPT: "hept" >
|   < OCT: "oct" >
|   < NON: "non" >
}

/**
 * Positional aides which give the magnitude of the base numbers.
 * Equivalent to "...ty" and "...hundred" in English
 */
TOKEN :
{
    < DEC: "dec" >
|   < COS: "cos" >
|   < CONT: "cont" >
}

/* Skip the "a" letter for Greek numbers, and spaces */
SKIP :
{
    < A : "a" > | < SPACE : " " >
}

TOKEN : /*BOND MODIFIERS*/
{
    < AN: "an" >
|   < EN: "en" >
|   < YN: "yn" >
}


TOKEN : /*CONNECTOR*/
{
    < YL: "yl" >
|   < DI: "di" >
|   < CYCLO: "cyclo" >
}

/* NOTE(review): ALUMINO and LITHO are declared here but are not referenced by
 * prefixFunctionalGroups() -- confirm whether that is intentional. */
TOKEN : /*PREFIXES*/
{
    < CHLORO: "chloro" >
|   < FLUORO: "fluoro" >
|   < BROMO: "bromo" >
|   < IODO: "iodo" >
|   < NITRO: "nitro" >
|   < OXO: "oxo" >
|   < PHENYL: "phenyl" >
|   < AMINO: "amino" >
|   < ALUMINO: "alumino" >
|   < LITHO: "litho" >
|   < HYDROXY: "hydroxy" >
}

TOKEN : /*FUNCTIONAL GROUP SUFFIXES*/
{
    < E: "e">
|   < OL: "ol" >
|   < OICACID: "oic acid" >
|   < OYLCHLORIDE: "oyl chloride" >
|   < NITRILE: "nitrile" >
|   < AL: "al" >
|   < AMIDE: "amide" >
|   < AMINE: "amine" >
|   < ONE: "one" >
|   < OATE: "oate" >
}

/* NOTE(review): the literals "meitmerium", "germainium", "sanarium" and "amercium"
 * look like misspellings of meitnerium, germanium, samarium and americium. They are
 * left untouched because the matched image is handed on unchanged (via AddFunGroup()
 * and AttachedGroup) to MoleculeBuilder, which is not visible here -- confirm what
 * MoleculeBuilder expects before correcting them. */
TOKEN : /* METALS */
{
    < LITHIUM: "lithium" >
|   < SODIUM: "sodium" >
|   < POTASSIUM: "potassium" >
|   < RUBIDIUM: "rubidium" >
|   < CESIUM: "cesium" >
|   < FRANCIUM: "francium" >
|   < BERYLLIUM: "beryllium" >
|   < MAGNESIUM: "magnesium" >
|   < CALCIUM: "calcium" >
|   < STRONTIUM: "strontium" >
|   < BARIUM: "barium" >
|   < RADIUM: "radium">
|   < SCANDIUM: "scandium" >
|   < YTTRIUM: "yttrium" >
|   < LANTHANUM: "lanthanum" >
|   < ACTINIUM: "actinium" >
|   < TITANIUM: "titanium" >
|   < ZIRCONIUM: "zirconium" >
|   < HAFNIUM: "hafnium" >
|   < RUTHERFORDIUM: "rutherfordium" >
|   < VANADIUM: "vanadium" >
|   < NIOBIUM: "niobium" >
|   < TANTALUM: "tantalum" >
|   < DUBNIUM: "dubnium" >
|   < CHROMIUM: "chromium" >
|   < MOLYBDENUM: "molybdenum" >
|   < TUNGSTEN: "tungsten" >
|   < SEABORGIUM: "seaborgium" >
|   < MANGANESE: "manganese" >
|   < TECHNETIUM: "technetium" >
|   < RHENIUM: "rhenium" >
|   < BOHRIUM: "bohrium" >
|   < IRON: "iron" >
|   < RUTHENIUM: "ruthenium" >
|   < OSMIUM: "osmium" >
|   < HASSIUM: "hassium" >
|   < COBALT: "cobalt">
|   < RHODIUM: "rhodium">
|   < IRIDIUM: "iridium" >
|   < MEITMERIUM: "meitmerium" >
|   < NICKEL: "nickel" >
|   < PALLADIUM: "palladium" >
|   < PLATINUM: "platinum" >
|   < COPPER: "copper" >
|   < SILVER: "silver" >
|   < GOLD: "gold" >
|   < ZINC: "zinc" >
|   < CADMIUM: "cadmium" >
|   < MECURY: "mercury" >
|   < ALUMINIUM: "aluminium" >
|   < GALLIUM: "gallium" >
|   < INDIUM: "indium" >
|   < THALLIUM: "thallium" >
|   < GERMAINIUM: "germainium" >
|   < TIN: "tin" >
|   < LEAD: "lead" >
|   < ARSENIC: "arsenic" >
|   < ANTIMONY: "antimony" >
|   < BISMUTH: "bismuth" >
|   < SELENIUM: "selenium" >
|   < TELLURIUM: "tellurium" >
|   < POLONIUM: "polonium" >
|   < CERIUM: "cerium" >
|   < PRASEODYMIUM: "praseodymium" >
|   < NEODYMIUM: "neodymium" >
|   < PROMETHIUM: "promethium" >
|   < SANARIUM: "sanarium" >
|   < EUROPIUM: "europium" >
|   < GADOLINIUM: "gadolinium" >
|   < TERBIUM: "terbium" >
|   < DYSPROSIUM: "dysprosium" >
|   < HOLMIUM: "holmium" >
|   < ERBIUM: "erbium" >
|   < THULIUM: "thulium" >
|   < YTTERBIUM: "ytterbium" >
|   < LUTETIUM: "lutetium" >
|   < THORIUM: "thorium" >
|   < PROTACTINIUM: "protactinium" >
|   < URANIUM: "uranium" >
|   < NEPTUNIUM: "neptunium" >
|   < PLUTONIUM: "plutonium" >
|   < AMERCIUM: "amercium" >
|   < CURIUM: "curium" >
|   < BERKELIUM: "berkelium" >
|   < CALIFORNIUM: "californium" >
|   < EINSTEINIUM: "einsteinium" >
|   < FERMIUM: "fermium" >
|   < MENDELEVIUM: "mendelevium" >
|   < NOBELIUM: "nobelium" >
|   < LAWRENCIUM: "lawrencium" >
}

/**
 * The general form all chemical names must follow: either a bare main chain or
 * one or more prefixes followed by a main chain, in both cases terminated by
 * the end of line.
 */
void completeChemicalName() :
{}
{
    // The choice is parenthesised so that <EOL> is required after BOTH alternatives.
    // A sequence binds tighter than "|", so without the parentheses the <EOL> only
    // belonged to the prefixed alternative and a bare main chain was accepted even
    // when further tokens followed it.
    (
        LOOKAHEAD (2) mainChainConstruct()
    |   prefixConstruct() mainChainConstruct()
    )
    <EOL>
}

/**
 * Allows 1 or more prefixes
 */
void prefixConstruct() :
{}
{
    //Dash not needed in the first case
    prefixType() (<DASH> prefixType())*
}

/**
 * A single prefix: an optional attach location followed by either a
 * substituent chain or a functional group prefix.
 */
void prefixType() :
{}
{
    ( attachLocationSpecified()
    | AddUnknownFunGroupPos() )

    ( subChain()
    | functionalGroupPrefix() )
}

/** The substituent part of the prefix */
void subChain() :
{}
{
    //Only allow *specified location* substituents dealt with in prefixType()
    chainPrefix() AddHeadToken() <YL>
}

/**
 * An attach position has been specified using a
 * comma separated list followed by a dash
 */
void attachLocationSpecified() :
{}
{
    //Must be at least one constant, so add that to begin with.

    <CONSTANT> AddFunGroupPos()

    ( oneAttachLocation()
    | twoOrThreeAttachLocations() )
}

/**
 * Only one attach location specified, should be followed by a dash.
 */
void oneAttachLocation() :
{}
{
    <DASH>
}

/**
 * Two or three attach locations specified, handle the second and if needed, the third one here.
 */
void twoOrThreeAttachLocations() :
{}
{
    <COMMA> <CONSTANT> AddFunGroupPos()
    ( (<DASH> <DI>) | (<COMMA> <CONSTANT> AddFunGroupPos() <DASH> <TRI>) )
}

/**
 * A list of known tokens denoting a chain's length.
 */
void chainPrefix() :
{}
{
    specialCase() | allBaseNumbers() [tensWithUnits() | tensNoUnits()]
}

/** Deal with special cases where the rules don't apply. */
void specialCase() :
{}
{
    < METH > {currentNumber = 1;}
|   < ETH > {currentNumber = 2;}
|   < PROP > {currentNumber = 3;}
|   < BUT > {currentNumber = 4;}
|   < DEC > {currentNumber = 10;}
|   < UNDEC > {currentNumber = 11;}
|   < EICOS > {currentNumber = 20;}
|   < HENICOS > {currentNumber = 21;}
}

/** The usual numbers. */
void allBaseNumbers() :
{}
{
    < HEN > {currentNumber = 1;}
|   < DO > {currentNumber = 2;}
|   < TRI > {currentNumber = 3;}
|   < TETR > {currentNumber = 4;}
|   < PENT > {currentNumber = 5;}
|   < HEX > {currentNumber = 6;}
|   < HEPT > {currentNumber = 7;}
|   < OCT > {currentNumber = 8;}
|   < NON > {currentNumber = 9;}
}

/** Deal with fragments referring to the positioning of the base numbers (denoting their magnitude) */
void tensNoUnits() :
{}
{
    <DEC> { currentNumber += 10; }
|   <COS> { currentNumber += 20; }
|   <CONT> { currentNumber *= 10; }
}

/**
 * Deals with numbers above 30 where the base numbers set appear twice.
 * For example, in the tens and the units.
 */
void tensWithUnits() :
{
    int tempBackup;
}
{
    // The number parsed so far is the units digit; keep it while the tens are read.
    { tempBackup = currentNumber; }
    allBaseNumbers() <CONT>
    { currentNumber *= 10; currentNumber += tempBackup; }
}

/** The functional group part of the prefix */
void functionalGroupPrefix() :
{}
{
    prefixFunctionalGroups()
    AddFunGroup()
}

/**
 * Main chains are compulsory and consist of an optional "cyclo", a length prefix and
 * a postfix denoting functional groups.
 */
void mainChainConstruct() :
{}
{
    [cycle()] mainChainPrefix()
    (
        // "...yl" form: the chain just parsed becomes a substituent of what follows.
        <YL> AddUnknownFunGroupPos() MakeMainChainIntoSubstituent()
        ( prioritySubstituents()
        | mainChainPrefix() <AN> prioritySubstituentsFunGroups() )
        AddUnknownFunGroupPos() AddFunGroup()
    |
        bondType() mainChainSuffix()
    )
}

/**
 * Deals with cyclic main chains.
 */
void cycle() :
{}
{
    <CYCLO> SetMainCyclic()
}

/**
 * Deal with the main chain's length.
 */
void mainChainPrefix() :
{}
{
    chainPrefix() AddMainChain()
}


/**
 * Tokens which affect the bond order of the first bond.
 */
void bondType() :
{}
{
    ( <AN> AddUnknownFunGroupPos()
    | ( <DASH> attachLocationSpecified() | AddUnknownFunGroupPos() )
      ( <EN> | <YN> ) )
    AddFunGroup()
}

/**
 * Figure out the functional group by the main chain's suffix.
 */
void mainChainSuffix() :
{}
{
    endFunctionalGroups() | connectingFunctionalGroupsConstruct()
}

/**
 * Functional groups which occur at the end of the main chain and need
 * a connecting "an".
 */
void endFunctionalGroups() :
{}
{
    ( <E>[<NITRILE>]
    | <AMIDE>
    | <AMINE>
    | <OATE>
    | <ONE>
    | <OICACID>
    | <OYLCHLORIDE>
    | <AL> ) AddUnknownFunGroupPos() AddFunGroup()
}

/**
 * The layout of a functional group(s) which can connect anywhere.
 * No number specified, or a number list specified.
 */
void connectingFunctionalGroupsConstruct() :
{}
{
    ( connectingFunctionalGroupSuffix() AddUnknownFunGroupPos()
    | <DASH> attachLocationSpecified() connectingFunctionalGroupSuffix() )

    AddFunGroup()
}

/**
 * Functional group suffixes for groups which can be connected anywhere
 * along the main chain.
 */
void connectingFunctionalGroupSuffix() :
{}
{
    <OL>
}

/**
 * Functional groups which can appear as a prefix.
 */
void prefixFunctionalGroups() :
{}
{
    <CHLORO>
|   <BROMO>
|   <IODO>
|   <FLUORO>
|   <NITRO>
|   <OXO>
|   <PHENYL>
|   <AMINO>
|   <HYDROXY>
}

/**
 * Things which have sub chains branching off them and the molecule does have a main chain.
 */
void prioritySubstituentsFunGroups() :
{}
{
    <AMINE>
|   <AMIDE>
|   <OATE>
|   <ONE>
}

/**
 * Things which have sub chains branching off them but the molecule has no main chain.
 */
void prioritySubstituents () :
{}
{
    groupOneMetals()
|   groupTwoMetals()
|   dBlockMetals()
|   pBlockMetals()
|   fBlockMetals()
}

void groupOneMetals() :
{}
{
    <LITHIUM>
|   <SODIUM>
|   <POTASSIUM>
|   <RUBIDIUM>
|   <CESIUM>
|   <FRANCIUM>
}

void groupTwoMetals() :
{}
{
    <BERYLLIUM>
|   <MAGNESIUM>
|   <CALCIUM>
|   <STRONTIUM>
|   <BARIUM>
|   <RADIUM>
}

void dBlockMetals() :
{}
{
    <SCANDIUM>
|   <YTTRIUM>
|   <LANTHANUM>
|   <ACTINIUM>
|   <TITANIUM>
|   <ZIRCONIUM>
|   <HAFNIUM>
|   <RUTHERFORDIUM>
|   <VANADIUM>
|   <NIOBIUM>
|   <TANTALUM>
|   <DUBNIUM>
|   <CHROMIUM>
|   <MOLYBDENUM>
|   <TUNGSTEN>
|   <SEABORGIUM>
|   <MANGANESE>
|   <TECHNETIUM>
|   <RHENIUM>
|   <BOHRIUM>
|   <IRON>
|   <RUTHENIUM>
|   <OSMIUM>
|   <HASSIUM>
|   <COBALT>
|   <RHODIUM>
|   <IRIDIUM>
|   <MEITMERIUM>
|   <NICKEL>
|   <PALLADIUM>
|   <PLATINUM>
|   <COPPER>
|   <SILVER>
|   <GOLD>
|   <ZINC>
|   <CADMIUM>
|   <MECURY>
}

void pBlockMetals() :
{}
{
    <ALUMINIUM>
|   <GALLIUM>
|   <INDIUM>
|   <THALLIUM>
|   <GERMAINIUM>
|   <TIN>
|   <LEAD>
|   <ARSENIC>
|   <ANTIMONY>
|   <BISMUTH>
|   <SELENIUM>
|   <TELLURIUM>
|   <POLONIUM>
}

void fBlockMetals() :
{}
{
    <CERIUM>
|   <PRASEODYMIUM>
|   <NEODYMIUM>
|   <PROMETHIUM>
|   <SANARIUM>
|   <EUROPIUM>
|   <GADOLINIUM>
|   <TERBIUM>
|   <DYSPROSIUM>
|   <HOLMIUM>
|   <ERBIUM>
|   <THULIUM>
|   <YTTERBIUM>
|   <LUTETIUM>
|   <THORIUM>
|   <PROTACTINIUM>
|   <URANIUM>
|   <NEPTUNIUM>
|   <PLUTONIUM>
|   <AMERCIUM>
|   <CURIUM>
|   <BERKELIUM>
|   <CALIFORNIUM>
|   <EINSTEINIUM>
|   <FERMIUM>
|   <MENDELEVIUM>
|   <NOBELIUM>
|   <LAWRENCIUM>
}
