1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  *   http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 
20 import edu.stanford.nlp.process.WordTokenFactory;
21 import edu.stanford.nlp.ling.HasWord;
22 import edu.stanford.nlp.ling.Word;
23 import edu.stanford.nlp.ling.CoreLabel;
24 import edu.stanford.nlp.process.PTBTokenizer;
25 import edu.stanford.nlp.util.StringUtils;
26 import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
27 import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
28 import edu.stanford.nlp.trees.GrammaticalStructure;
29 import edu.stanford.nlp.trees.GrammaticalStructureFactory;
30 import edu.stanford.nlp.trees.PennTreebankLanguagePack;
31 import edu.stanford.nlp.trees.Tree;
32 import edu.stanford.nlp.trees.Trees;
33 import edu.stanford.nlp.trees.TreebankLanguagePack;
34 import edu.stanford.nlp.trees.TypedDependency;
35 
36 import java.io.BufferedWriter;
37 import java.io.FileWriter;
38 import java.io.StringReader;
39 import java.io.IOException;
40 import java.util.ArrayList;
41 import java.util.Collection;
42 import java.util.List;
43 import java.util.HashMap;
44 import java.util.Properties;
45 import java.util.Scanner;
46 
47 public class ConstituencyParse {
48 
49   private boolean tokenize;
50   private BufferedWriter tokWriter, parentWriter;
51   private LexicalizedParser parser;
52   private TreeBinarizer binarizer;
53   private CollapseUnaryTransformer transformer;
54   private GrammaticalStructureFactory gsf;
55 
56   private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
57 
ConstituencyParse(String tokPath, String parentPath, boolean tokenize)58   public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
59     this.tokenize = tokenize;
60     if (tokPath != null) {
61       tokWriter = new BufferedWriter(new FileWriter(tokPath));
62     }
63     parentWriter = new BufferedWriter(new FileWriter(parentPath));
64     parser = LexicalizedParser.loadModel(PCFG_PATH);
65     binarizer = TreeBinarizer.simpleTreeBinarizer(
66       parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
67     transformer = new CollapseUnaryTransformer();
68 
69     // set up to produce dependency representations from constituency trees
70     TreebankLanguagePack tlp = new PennTreebankLanguagePack();
71     gsf = tlp.grammaticalStructureFactory();
72   }
73 
sentenceToTokens(String line)74   public List<HasWord> sentenceToTokens(String line) {
75     List<HasWord> tokens = new ArrayList<>();
76     if (tokenize) {
77       PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), "");
78       for (Word label; tokenizer.hasNext(); ) {
79         tokens.add(tokenizer.next());
80       }
81     } else {
82       for (String word : line.split(" ")) {
83         tokens.add(new Word(word));
84       }
85     }
86 
87     return tokens;
88   }
89 
parse(List<HasWord> tokens)90   public Tree parse(List<HasWord> tokens) {
91     Tree tree = parser.apply(tokens);
92     return tree;
93   }
94 
constTreeParents(Tree tree)95   public int[] constTreeParents(Tree tree) {
96     Tree binarized = binarizer.transformTree(tree);
97     Tree collapsedUnary = transformer.transformTree(binarized);
98     Trees.convertToCoreLabels(collapsedUnary);
99     collapsedUnary.indexSpans();
100     List<Tree> leaves = collapsedUnary.getLeaves();
101     int size = collapsedUnary.size() - leaves.size();
102     int[] parents = new int[size];
103     HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
104 
105     int idx = leaves.size();
106     int leafIdx = 0;
107     for (Tree leaf : leaves) {
108       Tree cur = leaf.parent(collapsedUnary); // go to preterminal
109       int curIdx = leafIdx++;
110       boolean done = false;
111       while (!done) {
112         Tree parent = cur.parent(collapsedUnary);
113         if (parent == null) {
114           parents[curIdx] = 0;
115           break;
116         }
117 
118         int parentIdx;
119         int parentNumber = parent.nodeNumber(collapsedUnary);
120         if (!index.containsKey(parentNumber)) {
121           parentIdx = idx++;
122           index.put(parentNumber, parentIdx);
123         } else {
124           parentIdx = index.get(parentNumber);
125           done = true;
126         }
127 
128         parents[curIdx] = parentIdx + 1;
129         cur = parent;
130         curIdx = parentIdx;
131       }
132     }
133 
134     return parents;
135   }
136 
137   // convert constituency parse to a dependency representation and return the
138   // parent pointer representation of the tree
depTreeParents(Tree tree, List<HasWord> tokens)139   public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
140     GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
141     Collection<TypedDependency> tdl = gs.typedDependencies();
142     int len = tokens.size();
143     int[] parents = new int[len];
144     for (int i = 0; i < len; i++) {
145       // if a node has a parent of -1 at the end of parsing, then the node
146       // has no parent.
147       parents[i] = -1;
148     }
149 
150     for (TypedDependency td : tdl) {
151       // let root have index 0
152       int child = td.dep().index();
153       int parent = td.gov().index();
154       parents[child - 1] = parent;
155     }
156 
157     return parents;
158   }
159 
printTokens(List<HasWord> tokens)160   public void printTokens(List<HasWord> tokens) throws IOException {
161     int len = tokens.size();
162     StringBuilder sb = new StringBuilder();
163     for (int i = 0; i < len - 1; i++) {
164       if (tokenize) {
165         sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
166       } else {
167         sb.append(tokens.get(i).word());
168       }
169       sb.append(' ');
170     }
171 
172     if (tokenize) {
173       sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
174     } else {
175       sb.append(tokens.get(len - 1).word());
176     }
177 
178     sb.append('\n');
179     tokWriter.write(sb.toString());
180   }
181 
printParents(int[] parents)182   public void printParents(int[] parents) throws IOException {
183     StringBuilder sb = new StringBuilder();
184     int size = parents.length;
185     for (int i = 0; i < size - 1; i++) {
186       sb.append(parents[i]);
187       sb.append(' ');
188     }
189     sb.append(parents[size - 1]);
190     sb.append('\n');
191     parentWriter.write(sb.toString());
192   }
193 
close()194   public void close() throws IOException {
195     if (tokWriter != null) tokWriter.close();
196     parentWriter.close();
197   }
198 
main(String[] args)199   public static void main(String[] args) throws Exception {
200     Properties props = StringUtils.argsToProperties(args);
201     if (!props.containsKey("parentpath")) {
202       System.err.println(
203         "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
204       System.exit(1);
205     }
206 
207     // whether to tokenize input sentences
208     boolean tokenize = false;
209     if (props.containsKey("tokenize")) {
210       tokenize = true;
211     }
212 
213     // whether to produce dependency trees from the constituency parse
214     boolean deps = false;
215     if (props.containsKey("deps")) {
216       deps = true;
217     }
218 
219     String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
220     String parentPath = props.getProperty("parentpath");
221     ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);
222 
223     Scanner stdin = new Scanner(System.in);
224     int count = 0;
225     long start = System.currentTimeMillis();
226     while (stdin.hasNextLine()) {
227       String line = stdin.nextLine();
228       List<HasWord> tokens = processor.sentenceToTokens(line);
229       Tree parse = processor.parse(tokens);
230 
231       // produce parent pointer representation
232       int[] parents = deps ? processor.depTreeParents(parse, tokens)
233                            : processor.constTreeParents(parse);
234 
235       // print
236       if (tokPath != null) {
237         processor.printTokens(tokens);
238       }
239       processor.printParents(parents);
240 
241       count++;
242       if (count % 1000 == 0) {
243         double elapsed = (System.currentTimeMillis() - start) / 1000.0;
244         System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
245       }
246     }
247 
248     long totalTimeMillis = System.currentTimeMillis() - start;
249     System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
250       count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
251     processor.close();
252   }
253 }
254