1
2 #include "FileRecordTypeChecker.h"
3 #include "api/BamReader.h"
4 #include "ParseTools.h"
5
FileRecordTypeChecker()6 FileRecordTypeChecker::FileRecordTypeChecker()
7 : _eofHit(false),
8 _inheader(false)
9 {
10 _fileType = UNKNOWN_FILE_TYPE;
11 _recordType = UNKNOWN_RECORD_TYPE;
12 _numFields = 0;
13 _isBinary = false;
14 _isText = false;
15 _isBed = false;
16 _isDelimited = false;
17 _delimChar = '\t'; //tab by default
18 _firstValidDataLineIdx = -1;
19 _isVCF = false;
20 _isBAM = false;
21 _isCRAM = false;
22 _isGFF = false;
23 _isGFFplus = false;
24 _isGzipped = false;
25 _isCompressed = false;
26 _insufficientData = false;
27 _fourthFieldNumeric = false;
28 _givenEmptyBuffer = false;
29 _isGroupBy = false;
30 // TO DO: Bed4, Bed5, and BedGraph are missing from all of these.
31
32 _hasName[UNKNOWN_RECORD_TYPE] = false;
33 _hasName[EMPTY_RECORD_TYPE] = false;
34 _hasName[BED3_RECORD_TYPE] = false;
35 _hasName[BED6_RECORD_TYPE] = true;
36 _hasName[BED12_RECORD_TYPE] = true;
37 _hasName[BED_PLUS_RECORD_TYPE] = true;
38 _hasName[BED6_PLUS_RECORD_TYPE] = true;
39 _hasName[BAM_RECORD_TYPE] = true;
40 _hasName[VCF_RECORD_TYPE] = true;
41 _hasName[GFF_RECORD_TYPE] = true;
42 _hasName[GFF_PLUS_RECORD_TYPE] = true;
43 _hasName[NO_POS_PLUS_RECORD_TYPE] = true;
44
45 _hasScore[UNKNOWN_RECORD_TYPE] = false;
46 _hasScore[EMPTY_RECORD_TYPE] = false;
47 _hasScore[BED3_RECORD_TYPE] = false;
48 _hasScore[BED6_RECORD_TYPE] = true;
49 _hasScore[BED12_RECORD_TYPE] = true;
50 _hasScore[BED_PLUS_RECORD_TYPE] = true;
51 _hasScore[BED6_PLUS_RECORD_TYPE] = true;
52 _hasScore[BAM_RECORD_TYPE] = true;
53 _hasScore[VCF_RECORD_TYPE] = true;
54 _hasScore[GFF_RECORD_TYPE] = true;
55 _hasScore[GFF_PLUS_RECORD_TYPE] = true;
56 _hasScore[NO_POS_PLUS_RECORD_TYPE] = true;
57
58
59 _hasStrand[UNKNOWN_RECORD_TYPE] = false;
60 _hasStrand[EMPTY_RECORD_TYPE] = false;
61 _hasStrand[BED3_RECORD_TYPE] = false;
62 _hasStrand[BED6_RECORD_TYPE] = true;
63 _hasStrand[BED12_RECORD_TYPE] = true;
64 _hasStrand[BED_PLUS_RECORD_TYPE] = true; //actually, unknown. Give benefit of doubt.
65 _hasStrand[BED6_PLUS_RECORD_TYPE] = true;
66 _hasStrand[BAM_RECORD_TYPE] = true;
67 _hasStrand[VCF_RECORD_TYPE] = true;
68 _hasStrand[GFF_RECORD_TYPE] = true;
69 _hasStrand[GFF_PLUS_RECORD_TYPE] = true;
70 _hasStrand[NO_POS_PLUS_RECORD_TYPE] = true;
71
72
73 _recordTypeNames[UNKNOWN_RECORD_TYPE] = "Unknown record type";
74 _recordTypeNames[EMPTY_RECORD_TYPE] = "Empty record type";
75 _recordTypeNames[BED3_RECORD_TYPE] = "Bed3 record type";
76 _recordTypeNames[BED6_RECORD_TYPE] = "Bed6 record type";
77 _recordTypeNames[BED12_RECORD_TYPE] = "Bed12 record type";
78 _recordTypeNames[BED_PLUS_RECORD_TYPE] = "BedPlus record type";
79 _recordTypeNames[BAM_RECORD_TYPE] = "BAM record type";
80 _recordTypeNames[VCF_RECORD_TYPE] = "VCF record type";
81 _recordTypeNames[GFF_RECORD_TYPE] = "Gff record type";
82 _recordTypeNames[GFF_PLUS_RECORD_TYPE] = "GffPlus record type";
83 _recordTypeNames[NO_POS_PLUS_RECORD_TYPE] = "NoPosPlus record type";
84
85
86 _fileTypeNames[UNKNOWN_FILE_TYPE] = "Unknown file type";
87 _fileTypeNames[EMPTY_FILE_TYPE] = "Empty file type";
88 _fileTypeNames[SINGLE_LINE_DELIM_TEXT_FILE_TYPE] = "Delimited text file type";
89 _fileTypeNames[GZIP_FILE_TYPE] = "Gzip file type";
90 _fileTypeNames[BAM_FILE_TYPE] = "BAM file type";
91 _fileTypeNames[VCF_FILE_TYPE] = "VCF file type";
92 }
93
94
scanBuffer(const char * buffer,size_t len,bool eofHit,bool isCompressed)95 bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit, bool isCompressed)
96 {
97 _eofHit = eofHit;
98 _isCompressed = isCompressed;
99 _numBytesInBuffer = len;
100 if (_numBytesInBuffer == 0) {
101 _fileType = EMPTY_FILE_TYPE;
102 _recordType = EMPTY_RECORD_TYPE;
103 return true;
104 }
105
106 //special: the first thing we do is look for a gzipped file.
107 if (!_isGzipped && ((unsigned char)(buffer[0]) == 0x1f)) {
108 _isGzipped = true;
109 return true;
110 }
111 //scan the first 8K block of the streamBuf.
112
113 //now we have a buffer from the file.
114 //first, test to see if it's binary or text.
115 if (isBinaryBuffer(buffer, len)) {
116 _isText = false;
117 _isBinary = true;
118 return true;
119 } else {
120 _isText = true;
121 _isBinary = false;
122 return handleTextFormat(buffer, len);
123 }
124 }
125
isBinaryBuffer(const char * buffer,size_t len)126 bool FileRecordTypeChecker::isBinaryBuffer(const char *buffer, size_t len)
127 {
128 if (isBAM(buffer)) {
129 return true;
130 }
131
132 //Let's say that in a text file, at least 90% of the characters
133 //should be alphanumeric, whitespace, or punctuation.
134 static const float PERCENTAGE_PRINTABLE = .9;
135
136 int alphaNumCount = 0;
137 int whiteSpaceCount = 0;
138 int punctuationCount = 0;
139
140 for (int i=0; i < (int)len; i++) {
141 char currChar = buffer[i];
142 if (isalnum(currChar)) {
143 alphaNumCount++;
144 } else if (isspace(currChar)) {
145 whiteSpaceCount++;
146 } else if (ispunct(currChar)) {
147 punctuationCount++;
148 }
149 }
150
151 if ((float)(alphaNumCount + whiteSpaceCount + punctuationCount) / (float)(_numBytesInBuffer) < PERCENTAGE_PRINTABLE) {
152 return true;
153 }
154 return false;
155 }
156
157
isBAM(const char * buffer)158 bool FileRecordTypeChecker::isBAM(const char *buffer)
159 {
160 //check for BAM. The Bam Magic String is "BAM\1", and should be the first 4 characters of the file.
161
162 if (strncmp(buffer, "BAM\1", 4) == 0) {
163 _isBAM = true;
164 _fileType = BAM_FILE_TYPE;
165 _recordType = BAM_RECORD_TYPE;
166 return true;
167 }
168
169 //TBD: Handle other binary formats
170 return false;
171 }
172
handleTextFormat(const char * buffer,size_t len)173 bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
174 {
175 if (isVCFformat(buffer)) {
176 return isTextDelimtedFormat(buffer, len);
177 } else if (isTextDelimtedFormat(buffer, len)) {
178 //At this point, _isText and _isDelimited are set. _numFields and _delimChar are
179 //set.
180 _fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE;
181 if (_isGroupBy) {
182 _recordType = NO_POS_PLUS_RECORD_TYPE;
183 return true;
184 }
185
186 //Tokenize the first line of valid data into fields.
187 //Need to make a copy so next call to tokenizer doesn't overwrite the line.
188
189 string line(_tokenizer.getElem(_firstValidDataLineIdx));
190
191 // ditch \r for Windows if necessary.
192 if (line.size() && line[line.size()-1] == '\r') {
193 line.resize(line.size()-1);
194 }
195
196 _tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW);
197 _tokenizer.setNumExpectedItems(_numFields);
198 _tokenizer.tokenize(line, _delimChar);
199 if (_tokenizer.getNumFields(line, _delimChar) != _numFields) {
200 cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl;
201 cerr << "Perhaps you have extra TAB at the end of your line? Check with \"cat -t\""<< endl;
202 exit(1);
203 }
204
205 if (isBedFormat()) {
206 _isBed = true;
207 if (_numFields == 3) {
208 _recordType = BED3_RECORD_TYPE;
209 } else if (_numFields == 4) {
210 if (isNumeric(_tokenizer.getElem(3))) {
211 _recordType = BEDGRAPH_RECORD_TYPE;
212 _fourthFieldNumeric = true;
213 } else {
214 _fourthFieldNumeric = false;
215 _recordType = BED4_RECORD_TYPE;
216 _hasStrand[BED4_RECORD_TYPE] = isStrandField(3);
217 }
218 } else if (_numFields == 5 && passesBed5()) {
219 _recordType = BED5_RECORD_TYPE;
220 } else if (_numFields == 6 && passesBed6()) {
221 _recordType = BED6_RECORD_TYPE;
222 } else if (_numFields == 12 && passesBed12()) {
223 _recordType = BED12_RECORD_TYPE;
224 } else if (_numFields >3) {
225 if (_numFields >= 6 && isStrandField(5)) {
226 _recordType = BED6_PLUS_RECORD_TYPE;
227 } else {
228 _recordType = BED_PLUS_RECORD_TYPE;
229 }
230
231 }
232 return true;
233 }
234 if (isGFFformat()) {
235 if (_isGFFplus) {
236 _recordType = GFF_PLUS_RECORD_TYPE;
237 return true;
238 }
239 _isGFF = true;
240 _recordType = GFF_RECORD_TYPE;
241 return true;
242 }
243 //Here the Record must not have positions, so it is the NoPosPlus Type.
244 _recordType = NO_POS_PLUS_RECORD_TYPE;
245 return false;
246 }
247 return false;
248 }
249
isVCFformat(const char * buffer)250 bool FileRecordTypeChecker::isVCFformat(const char *buffer)
251 {
252 if (_isVCF) {
253 return true; //previous pass through this method has determined file is VCF.
254 }
255 if (memcmp(buffer, "##fileformat=VCF", 16) == 0) {
256 _isVCF = true;
257 _fileType = VCF_FILE_TYPE;
258 _recordType = VCF_RECORD_TYPE;
259 return true;
260 }
261 return false;
262 }
263
isBedFormat()264 bool FileRecordTypeChecker::isBedFormat() {
265
266 //test that the file has at least three fields.
267 //2nd and 3rd fields of first valid data line must be integers. 3rd must not be less than 2nd.
268 if (_numFields < 3) {
269 return false;
270 }
271 //the 2nd and 3rd fields must be numeric.
272 if (!isInteger(_tokenizer.getElem(1)) || !isInteger(_tokenizer.getElem(2))) {
273 return false;
274 }
275
276 CHRPOS start = str2chrPos(_tokenizer.getElem(1));
277 CHRPOS end = str2chrPos(_tokenizer.getElem(2));
278 if (end < start) {
279 return false;
280 }
281 return true;
282 }
283
isGFFformat()284 bool FileRecordTypeChecker::isGFFformat()
285 {
286 //a GFF file may have 8 or 9 fields. More than thats is GFFplus
287 if (_numFields < 7 ) {
288 return false;
289 }
290 //the 4th and 5th fields must be numeric.
291 if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) {
292 return false;
293 }
294 CHRPOS start = str2chrPos(_tokenizer.getElem(3));
295 CHRPOS end = str2chrPos(_tokenizer.getElem(4));
296 if (end < start) {
297 return false;
298 }
299 if (_numFields > 8) {
300 _isGFFplus = true;
301 }
302 return true;
303 }
304
isTextDelimtedFormat(const char * buffer,size_t len)305 bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
306 {
307 //Break single string buffer into vector of strings. Delimiter is newline.
308 _tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
309 int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit, _isCompressed);
310
311 //anticipated delimiter characters are tab, comma, and semi-colon.
312 //If we need new ones, they must be added in this method.
313 //search each line for delimiter characters.
314
315 vector<int> tabCounts;
316 vector<int> commaCounts;
317 vector<int> semicolonCounts;
318
319 tabCounts.reserve(numLines);
320 commaCounts.reserve(numLines);
321 semicolonCounts.reserve(numLines);
322
323 //loop through the lines, ignoring headers. Count potential delimiters,
324 //see if we can find a few lines with the same number of a given delimiter.
325 //delims are tested in hierarchical order, starting with tab,then comma, then semi-colon.
326
327 int validLinesFound=0;
328 int headerCount = 0;
329 int emptyLines = 0;
330 for (int i=0; i < numLines; i++ ) {
331
332
333 if (validLinesFound >=4) {
334 break; //really only need to look at like 4 lines of data, max.
335 }
336
337 const string line = _tokenizer.getElem(i);
338
339 //skip over any empty line
340 if (line.size() == 0) {
341 emptyLines++;
342 continue;
343 }
344 //
345 //skip over any header line
346 //
347
348 if (_inheader) {
349 headerCount++;
350 _inheader = false; //inheaders can only apply to first line
351 continue;
352 }
353 if (isHeaderLine(line)) {
354 //clear any previously found supposedly valid data lines, because valid lines can only come after header lines.
355 if (_firstValidDataLineIdx > -1 && _firstValidDataLineIdx < i) {
356 _firstValidDataLineIdx = -1;
357 validLinesFound--;
358 headerCount++;
359 }
360 headerCount++;
361 continue;
362 }
363
364 //a line must have some alphanumeric characters in order to be valid.
365 bool hasAlphaNum = false;
366 for (int j=0; j < len; j++) {
367 if(isalnum(line[j])) {
368 hasAlphaNum = true;
369 break;
370 }
371 }
372 if (!hasAlphaNum) {
373 continue;
374 }
375
376 validLinesFound++;
377
378 if (_firstValidDataLineIdx == -1) {
379 _firstValidDataLineIdx = i;
380 }
381
382 int tab_count = std::count(line.begin(), line.end(), '\t');
383 int comma_count = std::count(line.begin(), line.end(), ',');
384 int semicolon_count = std::count(line.begin(), line.end(), ';');
385
386 tabCounts.push_back(tab_count);
387 commaCounts.push_back(comma_count);
388 semicolonCounts.push_back(semicolon_count);
389 }
390
391
392 if (headerCount + emptyLines == numLines) {
393 _insufficientData = true;
394 }
395 if (validLinesFound == 0) {
396 return false;
397 }
398 _insufficientData = false;
399
400 if (delimiterTesting(tabCounts, '\t')) {
401
402 return true;
403 }
404 else if (validLinesFound) {
405 return true;
406 }
407
408 if (delimiterTesting(commaCounts, ',')) {
409 return true;
410 }
411 if (delimiterTesting(semicolonCounts, ';')) {
412 return true;
413 }
414
415 return false; //unable to detect delimited file.
416 }
417
delimiterTesting(vector<int> & counts,char suspectChar)418 bool FileRecordTypeChecker::delimiterTesting(vector<int> &counts, char suspectChar)
419 {
420 //check to see if we found the same number of tabs in every line.
421 int numDelims = counts[0];
422 if (numDelims != 0) {
423 bool countsMatch = true;
424 for (int i=1; i < (int)counts.size(); i++) {
425 if (counts[i] != numDelims) {
426 countsMatch = false;
427 }
428 }
429 if (countsMatch) {
430 //Hurray!! We have successfully found a delimited file.
431 _isDelimited = true;
432 _delimChar = suspectChar;
433 _numFields = numDelims + 1;
434 return true;
435 } else {
436 return false;
437 }
438 }
439 else { // there is just a single column with no delimiter.
440 _numFields = 1;
441 return false;
442 }
443 }
444
445
setBam()446 void FileRecordTypeChecker::setBam()
447 {
448 _fileType = BAM_FILE_TYPE;
449 _recordType = BAM_RECORD_TYPE;
450 _isBinary = true;
451 _isBAM = true;
452 }
453
setCram()454 void FileRecordTypeChecker::setCram()
455 {
456 _fileType = BAM_FILE_TYPE;
457 _recordType = BAM_RECORD_TYPE;
458 _isBinary = true;
459 _isBAM = true;
460 _isCRAM = true;
461 }
462
passesBed5()463 bool FileRecordTypeChecker::passesBed5() {
464 return _isBed && _numFields == 5 && isNumeric(_tokenizer.getElem(4));
465 }
466
passesBed6()467 bool FileRecordTypeChecker::passesBed6() {
468 return (_isBed && _numFields == 6 && isStrandField(5));
469 }
470
passesBed12()471 bool FileRecordTypeChecker::passesBed12() {
472
473 return (isStrandField(5) && isNumeric(_tokenizer.getElem(6)) &&
474 isNumeric(_tokenizer.getElem(7)) && isNumeric(_tokenizer.getElem(9)));
475 }
476
isStrandField(int field)477 bool FileRecordTypeChecker::isStrandField(int field) {
478 const string &strandChar = _tokenizer.getElem(field);
479 return (strandChar == "+" || strandChar == "-" || strandChar == "." || strandChar == "*");
480 }
481