1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 20 import edu.stanford.nlp.process.WordTokenFactory; 21 import edu.stanford.nlp.ling.HasWord; 22 import edu.stanford.nlp.ling.Word; 23 import edu.stanford.nlp.ling.CoreLabel; 24 import edu.stanford.nlp.process.PTBTokenizer; 25 import edu.stanford.nlp.util.StringUtils; 26 import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 27 import edu.stanford.nlp.parser.lexparser.TreeBinarizer; 28 import edu.stanford.nlp.trees.GrammaticalStructure; 29 import edu.stanford.nlp.trees.GrammaticalStructureFactory; 30 import edu.stanford.nlp.trees.PennTreebankLanguagePack; 31 import edu.stanford.nlp.trees.Tree; 32 import edu.stanford.nlp.trees.Trees; 33 import edu.stanford.nlp.trees.TreebankLanguagePack; 34 import edu.stanford.nlp.trees.TypedDependency; 35 36 import java.io.BufferedWriter; 37 import java.io.FileWriter; 38 import java.io.StringReader; 39 import java.io.IOException; 40 import java.util.ArrayList; 41 import java.util.Collection; 42 import java.util.List; 43 import java.util.HashMap; 44 import java.util.Properties; 45 import java.util.Scanner; 46 47 public class ConstituencyParse { 48 49 private boolean tokenize; 50 private BufferedWriter tokWriter, parentWriter; 51 private LexicalizedParser parser; 52 private TreeBinarizer binarizer; 53 private CollapseUnaryTransformer transformer; 54 private GrammaticalStructureFactory gsf; 55 56 private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"; 57 ConstituencyParse(String tokPath, String parentPath, boolean tokenize)58 public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException { 59 this.tokenize = tokenize; 60 if (tokPath != null) { 61 tokWriter = new BufferedWriter(new FileWriter(tokPath)); 62 } 63 parentWriter = new BufferedWriter(new FileWriter(parentPath)); 64 parser = LexicalizedParser.loadModel(PCFG_PATH); 65 binarizer = TreeBinarizer.simpleTreeBinarizer( 66 parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); 67 transformer = new CollapseUnaryTransformer(); 68 69 // set up to produce dependency representations from constituency trees 70 TreebankLanguagePack tlp = new PennTreebankLanguagePack(); 71 gsf = tlp.grammaticalStructureFactory(); 72 } 73 sentenceToTokens(String line)74 public List<HasWord> sentenceToTokens(String line) { 75 List<HasWord> tokens = new ArrayList<>(); 76 if (tokenize) { 77 PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), ""); 78 for (Word label; tokenizer.hasNext(); ) { 79 tokens.add(tokenizer.next()); 80 } 81 } else { 82 for (String word : line.split(" ")) { 83 tokens.add(new Word(word)); 84 } 85 } 86 87 return tokens; 88 } 89 parse(List<HasWord> tokens)90 public Tree parse(List<HasWord> tokens) { 91 Tree tree = parser.apply(tokens); 92 return tree; 93 } 94 constTreeParents(Tree tree)95 public int[] constTreeParents(Tree tree) { 96 Tree binarized = binarizer.transformTree(tree); 97 Tree collapsedUnary = transformer.transformTree(binarized); 98 Trees.convertToCoreLabels(collapsedUnary); 99 collapsedUnary.indexSpans(); 100 List<Tree> leaves = collapsedUnary.getLeaves(); 101 int size = collapsedUnary.size() - leaves.size(); 102 int[] parents = new int[size]; 103 HashMap<Integer, Integer> index = new HashMap<Integer, Integer>(); 104 105 int idx = leaves.size(); 106 int leafIdx = 0; 107 for (Tree leaf : leaves) { 108 Tree cur = leaf.parent(collapsedUnary); // go to preterminal 109 int curIdx = leafIdx++; 110 boolean done = false; 111 while (!done) { 112 Tree parent = cur.parent(collapsedUnary); 113 if (parent == null) { 114 parents[curIdx] = 0; 115 break; 116 } 117 118 int parentIdx; 119 int parentNumber = parent.nodeNumber(collapsedUnary); 120 if (!index.containsKey(parentNumber)) { 121 parentIdx = idx++; 122 index.put(parentNumber, parentIdx); 123 } else { 124 parentIdx = index.get(parentNumber); 125 done = true; 126 } 127 128 parents[curIdx] = parentIdx + 1; 129 cur = parent; 130 curIdx = parentIdx; 131 } 132 } 133 134 return parents; 135 } 136 137 // convert constituency parse to a dependency representation and return the 138 // parent pointer representation of the tree depTreeParents(Tree tree, List<HasWord> tokens)139 public int[] depTreeParents(Tree tree, List<HasWord> tokens) { 140 GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); 141 Collection<TypedDependency> tdl = gs.typedDependencies(); 142 int len = tokens.size(); 143 int[] parents = new int[len]; 144 for (int i = 0; i < len; i++) { 145 // if a node has a parent of -1 at the end of parsing, then the node 146 // has no parent. 147 parents[i] = -1; 148 } 149 150 for (TypedDependency td : tdl) { 151 // let root have index 0 152 int child = td.dep().index(); 153 int parent = td.gov().index(); 154 parents[child - 1] = parent; 155 } 156 157 return parents; 158 } 159 printTokens(List<HasWord> tokens)160 public void printTokens(List<HasWord> tokens) throws IOException { 161 int len = tokens.size(); 162 StringBuilder sb = new StringBuilder(); 163 for (int i = 0; i < len - 1; i++) { 164 if (tokenize) { 165 sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word())); 166 } else { 167 sb.append(tokens.get(i).word()); 168 } 169 sb.append(' '); 170 } 171 172 if (tokenize) { 173 sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word())); 174 } else { 175 sb.append(tokens.get(len - 1).word()); 176 } 177 178 sb.append('\n'); 179 tokWriter.write(sb.toString()); 180 } 181 printParents(int[] parents)182 public void printParents(int[] parents) throws IOException { 183 StringBuilder sb = new StringBuilder(); 184 int size = parents.length; 185 for (int i = 0; i < size - 1; i++) { 186 sb.append(parents[i]); 187 sb.append(' '); 188 } 189 sb.append(parents[size - 1]); 190 sb.append('\n'); 191 parentWriter.write(sb.toString()); 192 } 193 close()194 public void close() throws IOException { 195 if (tokWriter != null) tokWriter.close(); 196 parentWriter.close(); 197 } 198 main(String[] args)199 public static void main(String[] args) throws Exception { 200 Properties props = StringUtils.argsToProperties(args); 201 if (!props.containsKey("parentpath")) { 202 System.err.println( 203 "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>"); 204 System.exit(1); 205 } 206 207 // whether to tokenize input sentences 208 boolean tokenize = false; 209 if (props.containsKey("tokenize")) { 210 tokenize = true; 211 } 212 213 // whether to produce dependency trees from the constituency parse 214 boolean deps = false; 215 if (props.containsKey("deps")) { 216 deps = true; 217 } 218 219 String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null; 220 String parentPath = props.getProperty("parentpath"); 221 ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize); 222 223 Scanner stdin = new Scanner(System.in); 224 int count = 0; 225 long start = System.currentTimeMillis(); 226 while (stdin.hasNextLine()) { 227 String line = stdin.nextLine(); 228 List<HasWord> tokens = processor.sentenceToTokens(line); 229 Tree parse = processor.parse(tokens); 230 231 // produce parent pointer representation 232 int[] parents = deps ? processor.depTreeParents(parse, tokens) 233 : processor.constTreeParents(parse); 234 235 // print 236 if (tokPath != null) { 237 processor.printTokens(tokens); 238 } 239 processor.printParents(parents); 240 241 count++; 242 if (count % 1000 == 0) { 243 double elapsed = (System.currentTimeMillis() - start) / 1000.0; 244 System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed); 245 } 246 } 247 248 long totalTimeMillis = System.currentTimeMillis() - start; 249 System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n", 250 count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count); 251 processor.close(); 252 } 253 } 254