/*
 * This file is part of ELKI:
 * Environment for Developing KDD-Applications Supported by Index-Structures
 *
 * Copyright (C) 2018
 * ELKI Development Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.lmu.ifi.dbs.elki.utilities.io;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reader that will tokenize the input data as desired.
 *
 * Lines are read via the {@link BufferedLineReader} superclass; every line that
 * is not matched by the (optional) comment pattern is handed to the
 * {@link Tokenizer}.
 *
 * @author Erich Schubert
 * @since 0.7.0
 *
 * @composed - - - Tokenizer
 */
public class TokenizedReader extends BufferedLineReader {
  /**
   * Matcher for comment lines, or {@code null} when no comment pattern was
   * given (then no line is skipped).
   */
  private final Matcher comment;

  /**
   * String tokenizer.
   */
  protected Tokenizer tokenizer;

  /**
   * Constructor.
   *
   * @param colSep Column separator
   * @param quoteChars Quote character
   * @param comment Comment pattern, may be {@code null} to disable comment
   *        skipping
   */
  public TokenizedReader(Pattern colSep, String quoteChars, Pattern comment) {
    super();
    this.tokenizer = new Tokenizer(colSep, quoteChars);
    // The other methods treat a null matcher as "no comments"; honor that here
    // instead of failing with a NullPointerException on a null pattern.
    this.comment = comment != null ? comment.matcher("") : null;
  }

  /**
   * Advance to the next line that is not a comment, and load it into the
   * tokenizer.
   *
   * A line counts as a comment only if the comment pattern matches the
   * <em>entire</em> line ({@link Matcher#matches()}).
   *
   * @return {@code true} if a non-comment line was loaded into the tokenizer,
   *         {@code false} once {@link #nextLine()} reports no further line.
   * @throws IOException on errors reading the next line
   */
  public boolean nextLineExceptComments() throws IOException {
    while(nextLine()) {
      if(comment == null || !comment.reset(buf).matches()) {
        tokenizer.initialize(buf, 0, buf.length());
        return true;
      }
    }
    return false;
  }

  /**
   * Cleanup the internal state of the tokenized reader.
   *
   * Resets the superclass state, detaches the comment matcher from the line
   * buffer, and cleans up the tokenizer. Handling of the input stream is left
   * to the superclass {@code reset()}; the TokenizedReader can still be applied
   * to a new stream using any of the {@link #reset} methods.
   */
  @Override
  public void reset() {
    super.reset();
    if(comment != null) {
      comment.reset("");
    }
    tokenizer.cleanup();
  }

  /**
   * Reset the internal state, then close the reader via the superclass.
   *
   * @throws IOException if closing in the superclass fails
   */
  @Override
  public void close() throws IOException {
    reset();
    super.close();
  }

  /**
   * Get the tokenizer of the reader.
   *
   * @return Tokenizer
   */
  public Tokenizer getTokenizer() {
    return tokenizer;
  }
}