1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.i18n.phonenumbers.metadata.table; 17 18 import static com.google.common.base.CharMatcher.isNot; 19 import static com.google.common.base.CharMatcher.javaIsoControl; 20 import static com.google.common.base.CharMatcher.whitespace; 21 import static com.google.common.base.Preconditions.checkArgument; 22 import static com.google.common.collect.ImmutableList.toImmutableList; 23 24 import com.google.common.base.CharMatcher; 25 import com.google.common.collect.ImmutableList; 26 import com.google.common.collect.ImmutableMap; 27 import java.util.ArrayList; 28 import java.util.Iterator; 29 import java.util.List; 30 import java.util.function.Consumer; 31 import java.util.stream.Stream; 32 import javax.annotation.Nullable; 33 34 /** 35 * An efficient, fluent CSV parser which operates on a {@link Stream} of lines. It handles quoting 36 * of values, whitespace trimming and mapping values via a "schema" row. 37 * 38 * <p>This class is sadly necessary since the one in {@code com.google.common.text} doesn't support 39 * ignoring whitespace (and making it do so would take longer than writing this). 40 * 41 * <p>This class is immutable and thread-safe. 42 */ 43 // TODO: Investigate other "standard" CSV parsers such as org.apache.commons.csv. 44 public final class CsvParser { 45 /** 46 * A consumer for CSV rows which can automatically map values according to a header row. 47 * 48 * <p>This class is immutable and thread-safe. 49 */ 50 public static final class RowMapper { 51 @Nullable private final Consumer<ImmutableList<String>> headerHandler; 52 RowMapper(Consumer<ImmutableList<String>> headerHandler)53 private RowMapper(Consumer<ImmutableList<String>> headerHandler) { 54 this.headerHandler = headerHandler; 55 } 56 mapTo(Consumer<ImmutableMap<String, String>> handler)57 public Consumer<Stream<String>> mapTo(Consumer<ImmutableMap<String, String>> handler) { 58 return new Consumer<Stream<String>>() { 59 private ImmutableList<String> header = null; 60 61 @Override 62 public void accept(Stream<String> row) { 63 if (header == null) { 64 // Can contain duplicates (but that's bad for mapping). 65 header = row.collect(toImmutableList()); 66 checkArgument( 67 header.size() == header.stream().distinct().count(), 68 "duplicate values in CSV header: %s", 69 header); 70 if (headerHandler != null) { 71 headerHandler.accept(header); 72 } 73 } else { 74 ImmutableMap.Builder<String, String> map = ImmutableMap.builder(); 75 // Not a pure lambda due to the need to index columns. 76 row.forEach(new Consumer<String>() { 77 private int i = 0; 78 79 @Override 80 public void accept(String v) { 81 checkArgument(i < header.size(), 82 "too many columns (expected %s): %s", header.size(), map); 83 if (!v.isEmpty()) { 84 map.put(header.get(i++), v); 85 } 86 } 87 }); 88 handler.accept(map.build()); 89 } 90 } 91 }; 92 } 93 } 94 95 private static final CharMatcher NON_WHITESPACE = CharMatcher.whitespace().negate(); 96 private static final char QUOTE = '"'; 97 private static final CharMatcher VALID_DELIMITER_CHAR = 98 NON_WHITESPACE.and(javaIsoControl().negate()).and(isNot(QUOTE)).or(CharMatcher.anyOf(" \t")); 99 withSeparator(char delimiter)100 public static CsvParser withSeparator(char delimiter) { 101 return new CsvParser(delimiter, false, false); 102 } 103 commaSeparated()104 public static CsvParser commaSeparated() { 105 return withSeparator(','); 106 } 107 tabSeparated()108 public static CsvParser tabSeparated() { 109 return withSeparator('\t'); 110 } 111 rowMapper()112 public static RowMapper rowMapper() { 113 return new RowMapper(null); 114 } 115 rowMapper(Consumer<ImmutableList<String>> headerHandler)116 public static RowMapper rowMapper(Consumer<ImmutableList<String>> headerHandler) { 117 return new RowMapper(headerHandler); 118 } 119 120 private final char delimiter; 121 private final boolean trimWhitespace; 122 private final boolean allowMultiline; 123 CsvParser(char delimiter, boolean trimWhitespace, boolean allowMultiline)124 private CsvParser(char delimiter, boolean trimWhitespace, boolean allowMultiline) { 125 checkArgument(VALID_DELIMITER_CHAR.matches(delimiter), 126 "invalid delimiter: %s", delimiter); 127 this.delimiter = delimiter; 128 this.trimWhitespace = trimWhitespace; 129 this.allowMultiline = allowMultiline; 130 } 131 trimWhitespace()132 public CsvParser trimWhitespace() { 133 checkArgument(NON_WHITESPACE.matches(delimiter), 134 "cannot trim whitespace if delimiter is whitespace"); 135 return new CsvParser(delimiter, true, allowMultiline); 136 } 137 allowMultiline()138 public CsvParser allowMultiline() { 139 return new CsvParser(delimiter, trimWhitespace, true); 140 } 141 parse(Stream<String> lines, Consumer<Stream<String>> rowCallback)142 public void parse(Stream<String> lines, Consumer<Stream<String>> rowCallback) { 143 // Allow whitespace delimiter if we aren't also trimming whitespace. 144 List<String> row = new ArrayList<>(); 145 StringBuilder buffer = new StringBuilder(); 146 Iterator<String> it = lines.iterator(); 147 while (parseRow(it, row, buffer)) { 148 rowCallback.accept(row.stream()); 149 row.clear(); 150 } 151 } 152 parseRow(Iterator<String> lines, List<String> row, StringBuilder buffer)153 private boolean parseRow(Iterator<String> lines, List<String> row, StringBuilder buffer) { 154 if (!lines.hasNext()) { 155 return false; 156 } 157 // First line of potentially several which make up this row. 158 String line = lines.next(); 159 int start = maybeTrimWhitespace(line, 0); 160 while (start < line.length()) { 161 // "start" is the start of the next part and must be a valid index into current "line". 162 // Could be high or low surrogate if badly formed string, or just point at the delimiter. 163 char c = line.charAt(start); 164 int pos; 165 if (c == QUOTE) { 166 // Quoted value, maybe parse and unescape multiple lines here. 167 pos = ++start; 168 while (true) { 169 if (pos == line.length()) { 170 buffer.append(line, start, pos); 171 checkArgument(allowMultiline && lines.hasNext(), 172 "unterminated quoted value: %s", buffer); 173 buffer.append('\n'); 174 line = lines.next(); 175 start = 0; 176 pos = 0; 177 } 178 c = line.charAt(pos); 179 if (c == QUOTE) { 180 buffer.append(line, start, pos++); 181 if (pos == line.length()) { 182 break; 183 } 184 if (line.charAt(pos) != QUOTE) { 185 pos = maybeTrimWhitespace(line, pos); 186 checkArgument(pos == line.length() || line.codePointAt(pos) == delimiter, 187 "unexpected character (expected delimiter) in: %s", line); 188 break; 189 } 190 // "Double double quotes, what does it mean?" (oh yeah, a single double quote). 191 buffer.append(QUOTE); 192 start = pos + 1; 193 } 194 pos++; 195 } 196 row.add(buffer.toString()); 197 buffer.setLength(0); 198 } else if (c == delimiter) { 199 // Empty unquoted empty value (e.g. "foo,,bar"). 200 row.add(""); 201 pos = start; 202 } else { 203 // Non-empty unquoted value. 204 pos = line.indexOf(delimiter, start + 1); 205 if (pos == -1) { 206 pos = line.length(); 207 } 208 String value = line.substring(start, maybeTrimTrailingWhitespace(line, pos)); 209 checkArgument(value.indexOf(QUOTE) == -1, 210 "quotes cannot appear in unquoted values: %s", value); 211 row.add(value); 212 } 213 if (pos == line.length()) { 214 // We hit end-of-line at the end of a value, so just return (no trailing empty value). 215 return true; 216 } 217 // If not end-of-line, "pos" points at the last delimiter, so we can find the next start. 218 start = maybeTrimWhitespace(line, pos + 1); 219 } 220 // We hit end-of-line either immediately, or after a delimiter. Either way we always need to 221 // add a trailing empty value for consistency. 222 row.add(""); 223 return true; 224 } 225 maybeTrimWhitespace(String s, int i)226 private int maybeTrimWhitespace(String s, int i) { 227 if (trimWhitespace) { 228 i = NON_WHITESPACE.indexIn(s, i); 229 if (i == -1) { 230 i = s.length(); 231 } 232 } 233 return i; 234 } 235 maybeTrimTrailingWhitespace(String s, int i)236 private int maybeTrimTrailingWhitespace(String s, int i) { 237 if (trimWhitespace) { 238 // There is no "lastIndexIn(String, int)" sadly. 239 while (i > 0 && whitespace().matches(s.charAt(i - 1))) { 240 i--; 241 } 242 } 243 return i; 244 } 245 } 246