1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.google.i18n.phonenumbers.metadata.table;
17 
18 import static com.google.common.base.CharMatcher.isNot;
19 import static com.google.common.base.CharMatcher.javaIsoControl;
20 import static com.google.common.base.CharMatcher.whitespace;
21 import static com.google.common.base.Preconditions.checkArgument;
22 import static com.google.common.collect.ImmutableList.toImmutableList;
23 
24 import com.google.common.base.CharMatcher;
25 import com.google.common.collect.ImmutableList;
26 import com.google.common.collect.ImmutableMap;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.util.List;
30 import java.util.function.Consumer;
31 import java.util.stream.Stream;
32 import javax.annotation.Nullable;
33 
34 /**
35  * An efficient, fluent CSV parser which operates on a {@link Stream} of lines. It handles quoting
36  * of values, whitespace trimming and mapping values via a "schema" row.
37  *
38  * <p>This class is sadly necessary since the one in {@code com.google.common.text} doesn't support
39  * ignoring whitespace (and making it do so would take longer than writing this).
40  *
41  * <p>This class is immutable and thread-safe.
42  */
43 // TODO: Investigate other "standard" CSV parsers such as org.apache.commons.csv.
44 public final class CsvParser {
45   /**
46    * A consumer for CSV rows which can automatically map values according to a header row.
47    *
48    * <p>This class is immutable and thread-safe.
49    */
50   public static final class RowMapper {
51     @Nullable private final Consumer<ImmutableList<String>> headerHandler;
52 
RowMapper(Consumer<ImmutableList<String>> headerHandler)53     private RowMapper(Consumer<ImmutableList<String>> headerHandler) {
54       this.headerHandler = headerHandler;
55     }
56 
mapTo(Consumer<ImmutableMap<String, String>> handler)57     public Consumer<Stream<String>> mapTo(Consumer<ImmutableMap<String, String>> handler) {
58       return new Consumer<Stream<String>>() {
59         private ImmutableList<String> header = null;
60 
61         @Override
62         public void accept(Stream<String> row) {
63           if (header == null) {
64             // Can contain duplicates (but that's bad for mapping).
65             header = row.collect(toImmutableList());
66             checkArgument(
67                 header.size() == header.stream().distinct().count(),
68                 "duplicate values in CSV header: %s",
69                 header);
70             if (headerHandler != null) {
71               headerHandler.accept(header);
72             }
73           } else {
74             ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
75             // Not a pure lambda due to the need to index columns.
76             row.forEach(new Consumer<String>() {
77               private int i = 0;
78 
79               @Override
80               public void accept(String v) {
81                 checkArgument(i < header.size(),
82                     "too many columns (expected %s): %s", header.size(), map);
83                 if (!v.isEmpty()) {
84                   map.put(header.get(i++), v);
85                 }
86               }
87             });
88             handler.accept(map.build());
89           }
90         }
91       };
92     }
93   }
94 
95   private static final CharMatcher NON_WHITESPACE = CharMatcher.whitespace().negate();
96   private static final char QUOTE = '"';
97   private static final CharMatcher VALID_DELIMITER_CHAR =
98       NON_WHITESPACE.and(javaIsoControl().negate()).and(isNot(QUOTE)).or(CharMatcher.anyOf(" \t"));
99 
withSeparator(char delimiter)100   public static CsvParser withSeparator(char delimiter) {
101     return new CsvParser(delimiter, false, false);
102   }
103 
commaSeparated()104   public static CsvParser commaSeparated() {
105     return withSeparator(',');
106   }
107 
tabSeparated()108   public static CsvParser tabSeparated() {
109     return withSeparator('\t');
110   }
111 
rowMapper()112   public static RowMapper rowMapper() {
113     return new RowMapper(null);
114   }
115 
rowMapper(Consumer<ImmutableList<String>> headerHandler)116   public static RowMapper rowMapper(Consumer<ImmutableList<String>> headerHandler) {
117     return new RowMapper(headerHandler);
118   }
119 
120   private final char delimiter;
121   private final boolean trimWhitespace;
122   private final boolean allowMultiline;
123 
CsvParser(char delimiter, boolean trimWhitespace, boolean allowMultiline)124   private CsvParser(char delimiter, boolean trimWhitespace, boolean allowMultiline) {
125     checkArgument(VALID_DELIMITER_CHAR.matches(delimiter),
126         "invalid delimiter: %s", delimiter);
127     this.delimiter = delimiter;
128     this.trimWhitespace = trimWhitespace;
129     this.allowMultiline = allowMultiline;
130   }
131 
trimWhitespace()132   public CsvParser trimWhitespace() {
133     checkArgument(NON_WHITESPACE.matches(delimiter),
134         "cannot trim whitespace if delimiter is whitespace");
135     return new CsvParser(delimiter, true, allowMultiline);
136   }
137 
allowMultiline()138   public CsvParser allowMultiline() {
139     return new CsvParser(delimiter, trimWhitespace, true);
140   }
141 
parse(Stream<String> lines, Consumer<Stream<String>> rowCallback)142   public void parse(Stream<String> lines, Consumer<Stream<String>> rowCallback) {
143     // Allow whitespace delimiter if we aren't also trimming whitespace.
144     List<String> row = new ArrayList<>();
145     StringBuilder buffer = new StringBuilder();
146     Iterator<String> it = lines.iterator();
147     while (parseRow(it, row, buffer)) {
148       rowCallback.accept(row.stream());
149       row.clear();
150     }
151   }
152 
parseRow(Iterator<String> lines, List<String> row, StringBuilder buffer)153   private boolean parseRow(Iterator<String> lines, List<String> row, StringBuilder buffer) {
154     if (!lines.hasNext()) {
155       return false;
156     }
157     // First line of potentially several which make up this row.
158     String line = lines.next();
159     int start = maybeTrimWhitespace(line, 0);
160     while (start < line.length()) {
161       // "start" is the start of the next part and must be a valid index into current "line".
162       // Could be high or low surrogate if badly formed string, or just point at the delimiter.
163       char c = line.charAt(start);
164       int pos;
165       if (c == QUOTE) {
166         // Quoted value, maybe parse and unescape multiple lines here.
167         pos = ++start;
168         while (true) {
169           if (pos == line.length()) {
170             buffer.append(line, start, pos);
171             checkArgument(allowMultiline && lines.hasNext(),
172                 "unterminated quoted value: %s", buffer);
173             buffer.append('\n');
174             line = lines.next();
175             start = 0;
176             pos = 0;
177           }
178           c = line.charAt(pos);
179           if (c == QUOTE) {
180             buffer.append(line, start, pos++);
181             if (pos == line.length()) {
182               break;
183             }
184             if (line.charAt(pos) != QUOTE) {
185               pos = maybeTrimWhitespace(line, pos);
186               checkArgument(pos == line.length() || line.codePointAt(pos) == delimiter,
187                   "unexpected character (expected delimiter) in: %s", line);
188               break;
189             }
190             // "Double double quotes, what does it mean?" (oh yeah, a single double quote).
191             buffer.append(QUOTE);
192             start = pos + 1;
193           }
194           pos++;
195         }
196         row.add(buffer.toString());
197         buffer.setLength(0);
198       } else if (c == delimiter) {
199         // Empty unquoted empty value (e.g. "foo,,bar").
200         row.add("");
201         pos = start;
202       } else {
203         // Non-empty unquoted value.
204         pos = line.indexOf(delimiter, start + 1);
205         if (pos == -1) {
206           pos = line.length();
207         }
208         String value = line.substring(start, maybeTrimTrailingWhitespace(line, pos));
209         checkArgument(value.indexOf(QUOTE) == -1,
210             "quotes cannot appear in unquoted values: %s", value);
211         row.add(value);
212       }
213       if (pos == line.length()) {
214         // We hit end-of-line at the end of a value, so just return (no trailing empty value).
215         return true;
216       }
217       // If not end-of-line, "pos" points at the last delimiter, so we can find the next start.
218       start = maybeTrimWhitespace(line, pos + 1);
219     }
220     // We hit end-of-line either immediately, or after a delimiter. Either way we always need to
221     // add a trailing empty value for consistency.
222     row.add("");
223     return true;
224   }
225 
maybeTrimWhitespace(String s, int i)226   private int maybeTrimWhitespace(String s, int i) {
227     if (trimWhitespace) {
228       i = NON_WHITESPACE.indexIn(s, i);
229       if (i == -1) {
230         i = s.length();
231       }
232     }
233     return i;
234   }
235 
maybeTrimTrailingWhitespace(String s, int i)236   private int maybeTrimTrailingWhitespace(String s, int i) {
237     if (trimWhitespace) {
238       // There is no "lastIndexIn(String, int)" sadly.
239       while (i > 0 && whitespace().matches(s.charAt(i - 1))) {
240         i--;
241       }
242     }
243     return i;
244   }
245 }
246