1 
2 #include "FileRecordTypeChecker.h"
3 #include "api/BamReader.h"
4 #include "ParseTools.h"
5 
FileRecordTypeChecker()6 FileRecordTypeChecker::FileRecordTypeChecker()
7 : _eofHit(false),
8   _inheader(false)
9 {
10 	_fileType = UNKNOWN_FILE_TYPE;
11 	_recordType = UNKNOWN_RECORD_TYPE;
12 	_numFields = 0;
13 	_isBinary = false;
14 	_isText = false;
15 	_isBed = false;
16 	_isDelimited = false;
17 	_delimChar = '\t'; //tab by default
18 	_firstValidDataLineIdx = -1;
19 	_isVCF = false;
20 	_isBAM = false;
21 	_isCRAM = false;
22 	_isGFF = false;
23 	_isGFFplus = false;
24 	_isGzipped = false;
25 	_isCompressed = false;
26 	_insufficientData = false;
27 	_fourthFieldNumeric = false;
28 	_givenEmptyBuffer = false;
29 	_isGroupBy = false;
30 	// TO DO: Bed4, Bed5, and BedGraph are missing from all of these.
31 
32 	_hasName[UNKNOWN_RECORD_TYPE] = false;
33 	_hasName[EMPTY_RECORD_TYPE] = false;
34 	_hasName[BED3_RECORD_TYPE] = false;
35 	_hasName[BED6_RECORD_TYPE] = true;
36 	_hasName[BED12_RECORD_TYPE] = true;
37 	_hasName[BED_PLUS_RECORD_TYPE] = true;
38 	_hasName[BED6_PLUS_RECORD_TYPE] = true;
39 	_hasName[BAM_RECORD_TYPE] = true;
40 	_hasName[VCF_RECORD_TYPE] = true;
41 	_hasName[GFF_RECORD_TYPE] = true;
42 	_hasName[GFF_PLUS_RECORD_TYPE] = true;
43 	_hasName[NO_POS_PLUS_RECORD_TYPE] = true;
44 
45 	_hasScore[UNKNOWN_RECORD_TYPE] = false;
46 	_hasScore[EMPTY_RECORD_TYPE] = false;
47 	_hasScore[BED3_RECORD_TYPE] = false;
48 	_hasScore[BED6_RECORD_TYPE] = true;
49 	_hasScore[BED12_RECORD_TYPE] = true;
50 	_hasScore[BED_PLUS_RECORD_TYPE] = true;
51 	_hasScore[BED6_PLUS_RECORD_TYPE] = true;
52 	_hasScore[BAM_RECORD_TYPE] = true;
53 	_hasScore[VCF_RECORD_TYPE] = true;
54 	_hasScore[GFF_RECORD_TYPE] = true;
55 	_hasScore[GFF_PLUS_RECORD_TYPE] = true;
56 	_hasScore[NO_POS_PLUS_RECORD_TYPE] = true;
57 
58 
59 	_hasStrand[UNKNOWN_RECORD_TYPE] = false;
60 	_hasStrand[EMPTY_RECORD_TYPE] = false;
61 	_hasStrand[BED3_RECORD_TYPE] = false;
62 	_hasStrand[BED6_RECORD_TYPE] = true;
63 	_hasStrand[BED12_RECORD_TYPE] = true;
64 	_hasStrand[BED_PLUS_RECORD_TYPE] = true; //actually, unknown. Give benefit of doubt.
65 	_hasStrand[BED6_PLUS_RECORD_TYPE] = true;
66 	_hasStrand[BAM_RECORD_TYPE] = true;
67 	_hasStrand[VCF_RECORD_TYPE] = true;
68 	_hasStrand[GFF_RECORD_TYPE] = true;
69 	_hasStrand[GFF_PLUS_RECORD_TYPE] = true;
70 	_hasStrand[NO_POS_PLUS_RECORD_TYPE] = true;
71 
72 
73 	_recordTypeNames[UNKNOWN_RECORD_TYPE] = "Unknown record type";
74 	_recordTypeNames[EMPTY_RECORD_TYPE] = "Empty record type";
75 	_recordTypeNames[BED3_RECORD_TYPE] = "Bed3 record type";
76 	_recordTypeNames[BED6_RECORD_TYPE] = "Bed6 record type";
77 	_recordTypeNames[BED12_RECORD_TYPE] = "Bed12 record type";
78 	_recordTypeNames[BED_PLUS_RECORD_TYPE] = "BedPlus record type";
79 	_recordTypeNames[BAM_RECORD_TYPE] = "BAM record type";
80 	_recordTypeNames[VCF_RECORD_TYPE] = "VCF record type";
81 	_recordTypeNames[GFF_RECORD_TYPE] = "Gff record type";
82 	_recordTypeNames[GFF_PLUS_RECORD_TYPE] = "GffPlus record type";
83 	_recordTypeNames[NO_POS_PLUS_RECORD_TYPE] = "NoPosPlus record type";
84 
85 
86 	_fileTypeNames[UNKNOWN_FILE_TYPE] = "Unknown file type";
87 	_fileTypeNames[EMPTY_FILE_TYPE] = "Empty file type";
88 	_fileTypeNames[SINGLE_LINE_DELIM_TEXT_FILE_TYPE] = "Delimited text file type";
89 	_fileTypeNames[GZIP_FILE_TYPE] = "Gzip file type";
90 	_fileTypeNames[BAM_FILE_TYPE] = "BAM file type";
91 	_fileTypeNames[VCF_FILE_TYPE] = "VCF file type";
92 }
93 
94 
scanBuffer(const char * buffer,size_t len,bool eofHit,bool isCompressed)95 bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit, bool isCompressed)
96 {
97 	_eofHit = eofHit;
98 	_isCompressed = isCompressed;
99 	_numBytesInBuffer = len;
100 	if (_numBytesInBuffer == 0) {
101 		_fileType = EMPTY_FILE_TYPE;
102 		_recordType = EMPTY_RECORD_TYPE;
103 		return true;
104 	}
105 
106 	//special: the first thing we do is look for a gzipped file.
107 	if (!_isGzipped && ((unsigned char)(buffer[0]) == 0x1f)) {
108 		_isGzipped = true;
109 		return true;
110 	}
111 	//scan the first 8K block of the streamBuf.
112 
113 	//now we have a buffer from the file.
114 	//first, test to see if it's binary or text.
115 	if (isBinaryBuffer(buffer, len)) {
116 		_isText = false;
117 		_isBinary = true;
118 		return true;
119 	} else {
120 		_isText = true;
121 		_isBinary = false;
122 		return handleTextFormat(buffer, len);
123 	}
124 }
125 
isBinaryBuffer(const char * buffer,size_t len)126 bool FileRecordTypeChecker::isBinaryBuffer(const char *buffer, size_t len)
127 {
128 	if (isBAM(buffer)) {
129 		return true;
130 	}
131 
132 	//Let's say that in a text file, at least 90% of the characters
133 	//should be alphanumeric, whitespace, or punctuation.
134 	static const float PERCENTAGE_PRINTABLE = .9;
135 
136 	int alphaNumCount = 0;
137 	int whiteSpaceCount = 0;
138 	int punctuationCount = 0;
139 
140 	for (int i=0; i < (int)len; i++) {
141 		char currChar = buffer[i];
142 		if (isalnum(currChar)) {
143 			alphaNumCount++;
144 		} else if (isspace(currChar)) {
145 			whiteSpaceCount++;
146 		} else if (ispunct(currChar)) {
147 			punctuationCount++;
148 		}
149 	}
150 
151 	if ((float)(alphaNumCount + whiteSpaceCount + punctuationCount) / (float)(_numBytesInBuffer) < PERCENTAGE_PRINTABLE) {
152 		return true;
153 	}
154 	return false;
155 }
156 
157 
isBAM(const char * buffer)158 bool FileRecordTypeChecker::isBAM(const char *buffer)
159 {
160 	//check for BAM. The Bam Magic String is "BAM\1", and should be the first 4 characters of the file.
161 
162 	if (strncmp(buffer, "BAM\1", 4) == 0) {
163 		_isBAM = true;
164 		_fileType = BAM_FILE_TYPE;
165 		_recordType = BAM_RECORD_TYPE;
166 		return true;
167 	}
168 
169 	//TBD: Handle other binary formats
170 	return false;
171 }
172 
handleTextFormat(const char * buffer,size_t len)173 bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
174 {
175 	if (isVCFformat(buffer)) {
176 		return isTextDelimtedFormat(buffer, len);
177 	} else if (isTextDelimtedFormat(buffer, len)) {
178 		//At this point, _isText and _isDelimited are set. _numFields and _delimChar are
179 		//set.
180 		_fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE;
181 		if (_isGroupBy) {
182 			_recordType = NO_POS_PLUS_RECORD_TYPE;
183 			return true;
184 		}
185 
186 		//Tokenize the first line of valid data into fields.
187 		//Need to make a copy so next call to tokenizer doesn't overwrite the line.
188 
189 		string line(_tokenizer.getElem(_firstValidDataLineIdx));
190 
191 		// ditch \r for Windows if necessary.
192 		if (line.size() && line[line.size()-1] == '\r') {
193 			line.resize(line.size()-1);
194 		}
195 
196 		_tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW);
197 		_tokenizer.setNumExpectedItems(_numFields);
198 		_tokenizer.tokenize(line, _delimChar);
199 		if (_tokenizer.getNumFields(line, _delimChar) != _numFields) {
200 			cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl;
201 			cerr << "Perhaps you have extra TAB at the end of your line? Check with \"cat -t\""<< endl;
202 			exit(1);
203 		}
204 
205 		if (isBedFormat()) {
206 			_isBed = true;
207 			if (_numFields == 3) {
208 				_recordType = BED3_RECORD_TYPE;
209 			} else if (_numFields == 4) {
210 				if (isNumeric(_tokenizer.getElem(3))) {
211 					_recordType = BEDGRAPH_RECORD_TYPE;
212 					_fourthFieldNumeric = true;
213 				} else {
214 					_fourthFieldNumeric = false;
215 					_recordType = BED4_RECORD_TYPE;
216 					_hasStrand[BED4_RECORD_TYPE] = isStrandField(3);
217 				}
218 			} else if (_numFields == 5 && passesBed5()) {
219 				_recordType = BED5_RECORD_TYPE;
220 			} else if (_numFields == 6 && passesBed6()) {
221 				_recordType = BED6_RECORD_TYPE;
222 			} else if (_numFields == 12 && passesBed12()) {
223 				_recordType = BED12_RECORD_TYPE;
224 			} else if (_numFields >3) {
225 				if (_numFields >= 6 && isStrandField(5)) {
226 					_recordType = BED6_PLUS_RECORD_TYPE;
227 				} else {
228 					_recordType = BED_PLUS_RECORD_TYPE;
229 				}
230 
231 			}
232 			return true;
233 		}
234 		if (isGFFformat()) {
235 			if (_isGFFplus) {
236 				_recordType = GFF_PLUS_RECORD_TYPE;
237 				return true;
238 			}
239 			_isGFF = true;
240 			_recordType = GFF_RECORD_TYPE;
241 			return true;
242 		}
243 		//Here the Record must not have positions, so it is the NoPosPlus Type.
244 		_recordType = NO_POS_PLUS_RECORD_TYPE;
245 		return false;
246 	}
247 	return false;
248 }
249 
isVCFformat(const char * buffer)250 bool FileRecordTypeChecker::isVCFformat(const char *buffer)
251 {
252 	if (_isVCF) {
253 		return true; //previous pass through this method has determined file is VCF.
254 	}
255 	if (memcmp(buffer, "##fileformat=VCF", 16) == 0) {
256 		_isVCF = true;
257 		_fileType = VCF_FILE_TYPE;
258 		_recordType = VCF_RECORD_TYPE;
259 		return true;
260 	}
261 	return false;
262 }
263 
isBedFormat()264 bool FileRecordTypeChecker::isBedFormat() {
265 
266 	//test that the file has at least three fields.
267 	//2nd and 3rd fields of first valid data line must be integers. 3rd must not be less than 2nd.
268 	if (_numFields < 3) {
269 		return false;
270 	}
271 	//the 2nd and 3rd fields must be numeric.
272 	if (!isInteger(_tokenizer.getElem(1)) || !isInteger(_tokenizer.getElem(2))) {
273 		return false;
274 	}
275 
276 	CHRPOS start = str2chrPos(_tokenizer.getElem(1));
277 	CHRPOS end = str2chrPos(_tokenizer.getElem(2));
278 	if (end < start) {
279 		return false;
280 	}
281 	return true;
282 }
283 
isGFFformat()284 bool FileRecordTypeChecker::isGFFformat()
285 {
286 	//a GFF file may have 8 or 9 fields. More than thats is GFFplus
287 	if (_numFields < 7 ) {
288 		return false;
289 	}
290 	//the 4th and 5th fields must be numeric.
291 	if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) {
292 		return false;
293 	}
294 	CHRPOS start = str2chrPos(_tokenizer.getElem(3));
295 	CHRPOS end = str2chrPos(_tokenizer.getElem(4));
296 	if (end < start) {
297 		return false;
298 	}
299 	if (_numFields > 8) {
300 		_isGFFplus = true;
301 	}
302 	return true;
303 }
304 
isTextDelimtedFormat(const char * buffer,size_t len)305 bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
306 {
307 	//Break single string buffer into vector of strings. Delimiter is newline.
308 	_tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
309 	int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit, _isCompressed);
310 
311 	//anticipated delimiter characters are tab, comma, and semi-colon.
312 	//If we need new ones, they must be added in this method.
313 	//search each line for delimiter characters.
314 
315 	vector<int> tabCounts;
316 	vector<int> commaCounts;
317 	vector<int> semicolonCounts;
318 
319 	tabCounts.reserve(numLines);
320 	commaCounts.reserve(numLines);
321 	semicolonCounts.reserve(numLines);
322 
323 	//loop through the lines, ignoring headers. Count potential delimiters,
324 	//see if we can find a few lines with the same number of a given delimiter.
325 	//delims are tested in hierarchical order, starting with tab,then comma, then semi-colon.
326 
327 	int validLinesFound=0;
328 	int headerCount = 0;
329 	int emptyLines = 0;
330 	for (int i=0; i < numLines; i++ ) {
331 
332 
333 		if (validLinesFound >=4) {
334 			break; //really only need to look at like 4 lines of data, max.
335 		}
336 
337 		const string line = _tokenizer.getElem(i);
338 
339 		//skip over any empty line
340 		if (line.size() == 0) {
341 			emptyLines++;
342 			continue;
343 		}
344 		//
345 		//skip over any header line
346 		//
347 
348 		if (_inheader) {
349 			headerCount++;
350 			_inheader = false; //inheaders can only apply to first line
351 			continue;
352 		}
353 		if (isHeaderLine(line)) {
354 			//clear any previously found supposedly valid data lines, because valid lines can only come after header lines.
355 			if (_firstValidDataLineIdx > -1 && _firstValidDataLineIdx < i) {
356 				_firstValidDataLineIdx = -1;
357 				validLinesFound--;
358 				headerCount++;
359 			}
360 			headerCount++;
361 			continue;
362 		}
363 
364 		//a line must have some alphanumeric characters in order to be valid.
365 		bool hasAlphaNum = false;
366 		for (int j=0; j < len; j++) {
367 			if(isalnum(line[j])) {
368 				hasAlphaNum = true;
369 				break;
370 			}
371 		}
372 		if (!hasAlphaNum) {
373 			continue;
374 		}
375 
376 		validLinesFound++;
377 
378 		if (_firstValidDataLineIdx == -1) {
379 			_firstValidDataLineIdx = i;
380 		}
381 
382 		int tab_count = std::count(line.begin(), line.end(), '\t');
383 		int comma_count = std::count(line.begin(), line.end(), ',');
384 		int semicolon_count = std::count(line.begin(), line.end(), ';');
385 
386 		tabCounts.push_back(tab_count);
387 		commaCounts.push_back(comma_count);
388 		semicolonCounts.push_back(semicolon_count);
389 	}
390 
391 
392 	if (headerCount + emptyLines == numLines) {
393 		_insufficientData = true;
394 	}
395 	if (validLinesFound == 0) {
396 		return false;
397 	}
398 	_insufficientData = false;
399 
400 	if (delimiterTesting(tabCounts, '\t')) {
401 
402 		return true;
403 	}
404 	else if (validLinesFound) {
405 		return true;
406 	}
407 
408 	if (delimiterTesting(commaCounts, ',')) {
409 		return true;
410 	}
411 	if (delimiterTesting(semicolonCounts, ';')) {
412 		return true;
413 	}
414 
415 	return false; //unable to detect delimited file.
416 }
417 
delimiterTesting(vector<int> & counts,char suspectChar)418 bool FileRecordTypeChecker::delimiterTesting(vector<int> &counts, char suspectChar)
419 {
420 	//check to see if we found the same number of tabs in every line.
421 	int numDelims = counts[0];
422 	if (numDelims != 0) {
423 		bool countsMatch = true;
424 		for (int i=1;  i < (int)counts.size(); i++) {
425 			if (counts[i] != numDelims) {
426 				countsMatch = false;
427 			}
428 		}
429 		if (countsMatch) {
430 			//Hurray!! We have successfully found a delimited file.
431 			_isDelimited = true;
432 			_delimChar = suspectChar;
433 			_numFields = numDelims + 1;
434 			return true;
435 		} else {
436 			return false;
437 		}
438 	}
439 	else { // there is just a single column with no delimiter.
440 		_numFields = 1;
441 		return false;
442 	}
443 }
444 
445 
setBam()446 void FileRecordTypeChecker::setBam()
447 {
448 	_fileType = BAM_FILE_TYPE;
449 	_recordType = BAM_RECORD_TYPE;
450 	_isBinary = true;
451 	_isBAM = true;
452 }
453 
setCram()454 void FileRecordTypeChecker::setCram()
455 {
456 	_fileType = BAM_FILE_TYPE;
457 	_recordType = BAM_RECORD_TYPE;
458 	_isBinary = true;
459 	_isBAM = true;
460 	_isCRAM = true;
461 }
462 
passesBed5()463 bool FileRecordTypeChecker::passesBed5() {
464 	return _isBed && _numFields == 5 && isNumeric(_tokenizer.getElem(4));
465 }
466 
passesBed6()467 bool FileRecordTypeChecker::passesBed6() {
468 	return (_isBed && _numFields == 6 && isStrandField(5));
469 }
470 
passesBed12()471 bool FileRecordTypeChecker::passesBed12() {
472 
473 	return (isStrandField(5) && isNumeric(_tokenizer.getElem(6)) &&
474 			isNumeric(_tokenizer.getElem(7)) && isNumeric(_tokenizer.getElem(9)));
475 }
476 
isStrandField(int field)477 bool FileRecordTypeChecker::isStrandField(int field) {
478 	const string &strandChar = _tokenizer.getElem(field);
479 	return (strandChar == "+" || strandChar == "-" || strandChar == "." || strandChar == "*");
480 }
481