1 /* $Id: bed_line_reader.cpp 585671 2019-05-02 15:04:11Z ludwigf $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description: Iterate through file names matching a given glob pattern
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34
35 #include <objtools/import/import_error.hpp>
36 #include "bed_line_reader.hpp"
37 #include "bed_import_data.hpp"
38
39 #include <assert.h>
40
41 USING_NCBI_SCOPE;
42 USING_SCOPE(objects);
43
44 // ****************************************************************************
45 // Implementation notes:
46 // - This implementation follows the UCSC version of BED as described at
47 // https://genome.ucsc.edu/FAQ/FAQformat.html#format1.
48 // - Per spec, arbitrary whitespace (i.e. any combination of tabs and spaces)
49 // is permissible as column separators.
50 // - Per spec, there can only be a single instance of browser/track lines,
51 // right at the beginning of the file. There is no provision for multiple
52 // tracks in a single file.
53 // ****************************************************************************
54
55 // ============================================================================
CBedLineReader(CImportMessageHandler & errorReporter)56 CBedLineReader::CBedLineReader(
57 CImportMessageHandler& errorReporter):
58 // ============================================================================
59 CFeatLineReader(errorReporter),
60 mColumnCount(0),
61 mColumnDelimiter(" \t"),
62 mSplitFlags(NStr::fSplit_MergeDelimiters),
63 mUseScore(false),
64 mItemRgb(false),
65 mColorByStrand(false)
66 {
67 }
68
69 // ============================================================================
70 bool
GetNextRecord(CStreamLineReader & lineReader,CFeatImportData & record)71 CBedLineReader::GetNextRecord(
72 CStreamLineReader& lineReader,
73 CFeatImportData& record)
74 //
75 // Error disposition:
76 // Abort on:
77 // Bad column count.
78 // Terminate on:
79 // Track line out of order.
80 // Discard record on:
81 // Bad topology (block description off, text in numeric fields)
82 // Fix up data with default values on:
83 // Bad score values, bad RGB values.
84 // ============================================================================
85 {
86 xReportProgress();
87
88 string nextLine = "";
89 while (!lineReader.AtEOF()) {
90 nextLine = *(++lineReader);
91 ++mLineCount;
92 if (xIgnoreLine(nextLine)) {
93 continue;
94 }
95 if (xProcessTrackLine(nextLine)) {
96 continue;
97 }
98 vector<string> columns;
99 xSplitLine(nextLine, columns);
100 xInitializeRecord(columns, record);
101 ++mRecordCount;
102 return true;
103 }
104 return false;
105 }
106
107 // ============================================================================
108 void
SetInputStream(CNcbiIstream & istr,bool force)109 CBedLineReader::SetInputStream(
110 CNcbiIstream& istr,
111 bool force)
112 // ============================================================================
113 {
114 CFeatLineReader::SetInputStream(istr, force);
115 }
116
117 // ============================================================================
118 bool
xIgnoreLine(const string & line) const119 CBedLineReader::xIgnoreLine(
120 const string& line) const
121 // ============================================================================
122 {
123 if (CFeatLineReader::xIgnoreLine(line)) {
124 return true;
125 }
126 return NStr::StartsWith(line, "browser");
127 }
128
129 // ============================================================================
130 bool
xProcessTrackLine(const string & line)131 CBedLineReader::xProcessTrackLine(
132 const string& line)
133 // ============================================================================
134 {
135 CImportError errorInvalidTrackValue(
136 CImportError::WARNING, "Invalid track value",
137 LineCount());
138
139 CImportError errorTrackLineOutOfOrder(
140 CImportError::CRITICAL, "Track line out of order",
141 LineCount());
142
143 string track, values;
144 NStr::SplitInTwo(line, " \t", track, values);
145 if (track != "track") {
146 return false;
147 }
148 if (this->mRecordCount > 0) { //track line not before any data
149 throw errorTrackLineOutOfOrder;
150 }
151
152 mAnnotInfo.Clear();
153 if (values.empty()) {
154 return true;
155 }
156
157 vector<string> trackPieces;
158 NStr::Split(values, "=", trackPieces);
159 string key, value;
160 key = NStr::TruncateSpaces_Unsafe(trackPieces[0]);
161 for (int i = 1; i < trackPieces.size() - 1; ++i) {
162 vector<string> words;
163 NStr::Split(trackPieces[i], " \t", words);
164 auto pendingKey = words.back();
165 words.pop_back();
166 value = NStr::Join(words, " ");
167 mAnnotInfo.SetValue(key, NStr::Replace(value, "\"", ""));
168 key = pendingKey;
169 }
170 mAnnotInfo.SetValue(key, NStr::Replace(trackPieces.back(), "\"", ""));
171
172 //cache some often accessed values:
173 const string& useScore = mAnnotInfo.ValueOf("useScore");
174 mUseScore = (!useScore.empty() && useScore != "0" && useScore != "false");
175
176 const string& itemRgb = mAnnotInfo.ValueOf("itemRgb");
177 mItemRgb = (itemRgb == "on" || itemRgb == "On");
178 if (!itemRgb.empty() && !mItemRgb) {
179 errorInvalidTrackValue.AmendMessage("Bad itemRgb value --- ignored");
180 mErrorReporter.ReportError(errorInvalidTrackValue);
181 }
182
183 const string& colorByStrand = mAnnotInfo.ValueOf("colorByStrand");
184 if (!colorByStrand.empty()) {
185 try {
186 string colorStrandPlus, colorStrandMinus;
187 NStr::SplitInTwo(
188 colorByStrand, " ", colorStrandPlus, colorStrandMinus);
189 vector<string> rgbComponents;
190 NStr::Split(colorStrandPlus, ",", rgbComponents);
191 if (rgbComponents.size() != 3) {
192 throw std::exception();
193 }
194 mRgbStrandPlus.R = NStr::StringToInt(rgbComponents[0]);
195 mRgbStrandPlus.G = NStr::StringToInt(rgbComponents[1]);
196 mRgbStrandPlus.B = NStr::StringToInt(rgbComponents[2]);
197 rgbComponents.clear();
198 NStr::Split(colorStrandMinus, ",", rgbComponents);
199 if (rgbComponents.size() != 3) {
200 throw std::exception();
201 }
202 mRgbStrandMinus.R = NStr::StringToInt(rgbComponents[0]);
203 mRgbStrandMinus.G = NStr::StringToInt(rgbComponents[1]);
204 mRgbStrandMinus.B = NStr::StringToInt(rgbComponents[2]);
205 }
206 catch(std::exception&) {
207 errorInvalidTrackValue.AmendMessage(
208 "Bad colorByStrand value --- ignored");
209 mErrorReporter.ReportError(errorInvalidTrackValue);
210 }
211 mColorByStrand = true;
212 }
213 return true;
214 }
215
216 // ============================================================================
217 void
xSplitLine(const string & line,vector<string> & columns)218 CBedLineReader::xSplitLine(
219 const string& line,
220 vector<string>& columns)
221 //
222 // See implementation notes!
223 // ============================================================================
224 {
225 CImportError errorInvalidColumnCount(
226 CImportError::FATAL, "Invalid column count",
227 LineCount());
228
229 columns.clear();
230 NStr::Split(line, mColumnDelimiter, columns, mSplitFlags);
231 if (mColumnCount == 0) {
232 if (columns.size() < 3 || columns.size() > 12) {
233 throw errorInvalidColumnCount;
234 }
235 mColumnCount = columns.size();
236 return;
237 }
238 if (columns.size() != mColumnCount) {
239 throw errorInvalidColumnCount;
240 }
241 }
242
243 // ============================================================================
244 void
xInitializeChromInterval(const vector<string> & columns,string & chromId,TSeqPos & chromStart,TSeqPos & chromEnd,ENa_strand & chromStrand)245 CBedLineReader::xInitializeChromInterval(
246 const vector<string>& columns,
247 string& chromId,
248 TSeqPos& chromStart,
249 TSeqPos& chromEnd,
250 ENa_strand& chromStrand)
251 // ============================================================================
252 {
253 CImportError errorInvalidChromStartValue(
254 CImportError::ERROR, "Invalid chromStart value",
255 LineCount());
256 CImportError errorInvalidChromEndValue(
257 CImportError::ERROR, "Invalid chromEnd value",
258 LineCount());
259 CImportError errorInvalidStrandValue(
260 CImportError::ERROR, "Invalid strand value",
261 LineCount());
262
263 chromId = columns[0];
264
265 try {
266 chromStart = NStr::StringToInt(columns[1]);
267 }
268 catch(CException&) {
269 throw errorInvalidChromStartValue;
270 }
271
272 try {
273 chromEnd = NStr::StringToInt(columns[2]);
274 }
275 catch(CException&) {
276 throw errorInvalidChromEndValue;
277 }
278
279 chromStrand = eNa_strand_plus;
280 if (columns.size() > 5) {
281 const auto& strand = columns[5];
282 if (strand != "+" && strand != "-" && strand != ".") {
283 throw errorInvalidStrandValue;
284 }
285 if (strand == "-") {
286 chromStrand = eNa_strand_minus;
287 }
288 }
289 }
290
291 // ============================================================================
292 void
xInitializeScore(const vector<string> & columns,double & score)293 CBedLineReader::xInitializeScore(
294 const vector<string>& columns,
295 double& score)
296 // ============================================================================
297 {
298 CImportError errorInvalidScoreValue(
299 CImportError::WARNING, "Invalid score value- omitting from output.",
300 LineCount());
301
302 if (columns.size() < 5 || columns[4] == "." || mUseScore) {
303 score = -1.0;
304 return;
305 }
306 try {
307 score = NStr::StringToDouble(columns[4]);
308 }
309 catch(CException&) {
310 mErrorReporter.ReportError(errorInvalidScoreValue);
311 score = -1.0;
312 return;
313 }
314 }
315
316 // ============================================================================
317 void
xInitializeRgb(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)318 CBedLineReader::xInitializeRgb(
319 const vector<string>& columns,
320 CBedImportData::RgbValue& rgbValue)
321 // ============================================================================
322 {
323 if (mUseScore) {
324 xInitializeRgbFromScoreColumn(columns, rgbValue);
325 return;
326 }
327 if (mItemRgb) {
328 xInitializeRgbFromRgbColumn(columns, rgbValue);
329 return;
330 }
331 if (mColorByStrand) {
332 xInitializeRgbFromStrandColumn(columns, rgbValue);
333 return;
334 }
335 rgbValue.R = rgbValue.B = rgbValue.G = -1;
336 }
337
338 // ============================================================================
339 void
xInitializeRgbFromStrandColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)340 CBedLineReader::xInitializeRgbFromStrandColumn(
341 const vector<string>& columns,
342 CBedImportData::RgbValue& rgbValue)
343 // ============================================================================
344 {
345 CImportError errorInvalidStrandValue(
346 CImportError::WARNING,
347 "Invalid strand value- setting color to BLACK.",
348 LineCount());
349
350 if (columns.size() < 6 ||
351 (columns[5] != "+" && columns[5] != "-" && columns[5] != ".")) {
352 mErrorReporter.ReportError(errorInvalidStrandValue);
353 rgbValue.R = rgbValue.G = rgbValue.B = 0;
354 return;
355 }
356 if (columns[5] == "-") {
357 rgbValue.R = mRgbStrandMinus.R;
358 rgbValue.B = mRgbStrandMinus.B;
359 rgbValue.G = mRgbStrandMinus.G;
360 }
361 else {
362 rgbValue.R = mRgbStrandPlus.R;
363 rgbValue.B = mRgbStrandPlus.B;
364 rgbValue.G = mRgbStrandPlus.G;
365 }
366 }
367
368 // ============================================================================
369 void
xInitializeRgbFromScoreColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)370 CBedLineReader::xInitializeRgbFromScoreColumn(
371 const vector<string>& columns,
372 CBedImportData::RgbValue& rgbValue)
373 // ============================================================================
374 {
375 CImportError errorInvalidScoreValue(
376 CImportError::WARNING, "Invalid score value- setting color to BLACK.",
377 LineCount());
378 CImportError errorScoreValueTooLow(
379 CImportError::WARNING, "Invalid score value- clipping to 0.",
380 LineCount());
381 CImportError errorScoreValueTooHigh(
382 CImportError::WARNING, "Invalid score value- clipping to 1000.",
383 LineCount());
384
385 if (columns.size() < 5 || columns[4] == ".") {
386 mErrorReporter.ReportError(errorInvalidScoreValue);
387 rgbValue.R = rgbValue.G = rgbValue.B = 0;
388 return;
389 }
390 auto scoreValue = 0;
391 try {
392 scoreValue = static_cast<int>(NStr::StringToDouble(columns[4]));
393 }
394 catch(CException&) {
395 mErrorReporter.ReportError(errorInvalidScoreValue);
396 rgbValue.R = rgbValue.G = rgbValue.B = 0;
397 return;
398 }
399
400 if (scoreValue < 0) {
401 mErrorReporter.ReportError(errorScoreValueTooLow);
402 scoreValue = 0;
403 }
404 else if (scoreValue > 1000) {
405 mErrorReporter.ReportError(errorScoreValueTooHigh);
406 scoreValue = 1000;
407 }
408 if (scoreValue == 0) {
409 rgbValue.R = rgbValue.G = rgbValue.B = 0;
410 return;
411 }
412 if (scoreValue > 998) {
413 rgbValue.R = rgbValue.G = rgbValue.B = 255;
414 return;
415 }
416
417 scoreValue = static_cast<int>(scoreValue / 111);
418 rgbValue.R = rgbValue.G = rgbValue.B = (13 + 29*scoreValue);
419 return;
420 }
421
422 // ============================================================================
423 void
xInitializeRgbFromRgbColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)424 CBedLineReader::xInitializeRgbFromRgbColumn(
425 const vector<string>& columns,
426 CBedImportData::RgbValue& rgbValue)
427 // ============================================================================
428 {
429 CImportError errorInvalidRgbValue(
430 CImportError::WARNING, "Invalid RGB value- defaulting to BLACK",
431 LineCount());
432
433 rgbValue.R = rgbValue.G = rgbValue.B = 0;
434 if (columns.size() < 9 || columns[8] == ".") {
435 return;
436 }
437
438 string rgb = columns[8];
439 try {
440 vector<string > rgbSplits;
441 NStr::Split(rgb, ",", rgbSplits);
442 switch(rgbSplits.size()) {
443 default:
444 throw errorInvalidRgbValue;
445 case 1: {
446 unsigned long rgbInt = 0;
447 if (NStr::StartsWith(rgb, "0x")) {
448 auto hexDigits = rgb.substr(2, string::npos);
449 rgbInt = NStr::StringToULong(hexDigits, 0, 16);
450 }
451 else if (NStr::StartsWith(rgb, "#")) {
452 auto hexDigits = rgb.substr(1, string::npos);
453 rgbInt = NStr::StringToULong(hexDigits, 0, 16);
454 }
455 else {
456 rgbInt = NStr::StringToULong(rgbSplits[0]);
457 }
458 rgbInt &= 0xffffff;
459 rgbValue.R = (rgbInt & 0xff0000) >> 16;
460 rgbValue.G = (rgbInt & 0x00ff00) >> 8;
461 rgbValue.B = (rgbInt & 0x0000ff);
462 break;
463 }
464 case 3: {
465 rgbValue.R = NStr::StringToInt(rgbSplits[0]);
466 rgbValue.G = NStr::StringToInt(rgbSplits[1]);
467 rgbValue.B = NStr::StringToInt(rgbSplits[2]);
468 break;
469 }
470 }
471 }
472 catch(CException&) {
473 rgbValue.R = rgbValue.G = rgbValue.B = 0;
474 mErrorReporter.ReportError(errorInvalidRgbValue);
475 return;
476 }
477 if (rgbValue.R < 0 || 255 < rgbValue.R) {
478 rgbValue.R = rgbValue.G = rgbValue.B = 0;
479 mErrorReporter.ReportError(errorInvalidRgbValue);
480 return;
481 }
482 if (rgbValue.G < 0 || 255 < rgbValue.G) {
483 rgbValue.R = rgbValue.G = rgbValue.B = 0;
484 mErrorReporter.ReportError(errorInvalidRgbValue);
485 return;
486 }
487 if (rgbValue.B < 0 || 255 < rgbValue.B) {
488 rgbValue.R = rgbValue.G = rgbValue.B = 0;
489 mErrorReporter.ReportError(errorInvalidRgbValue);
490 return;
491 }
492 }
493
494 // ============================================================================
495 void
xInitializeThickInterval(const vector<string> & columns,TSeqPos & thickStart,TSeqPos & thickEnd)496 CBedLineReader::xInitializeThickInterval(
497 const vector<string>& columns,
498 TSeqPos& thickStart,
499 TSeqPos& thickEnd)
500 // ============================================================================
501 {
502 CImportError errorInvalidThickStartValue(
503 CImportError::ERROR, "Invalid thickStart value",
504 LineCount());
505 CImportError errorInvalidThickEndValue(
506 CImportError::ERROR, "Invalid thickEnd value",
507 LineCount());
508 if (columns.size() < 8) {
509 return;
510 }
511
512 try {
513 thickStart = NStr::StringToInt(columns[6]);
514 }
515 catch(CException&) {
516 throw errorInvalidThickStartValue;
517 }
518
519 try {
520 thickEnd = NStr::StringToInt(columns[7]);
521 }
522 catch(CException&) {
523 throw errorInvalidThickEndValue;
524 }
525 }
526
527 // ============================================================================
528 void
xInitializeChromName(const vector<string> & columns,string & chromName)529 CBedLineReader::xInitializeChromName(
530 const vector<string>& columns,
531 string& chromName)
532 // ============================================================================
533 {
534 if (columns.size() < 4) {
535 chromName.clear();
536 return;
537 }
538 chromName = columns[3];
539 }
540
541 // ============================================================================
542 void
xInitializeBlocks(const vector<string> & columns,unsigned int & blockCount,vector<int> & blockStarts,vector<int> & blockSizes)543 CBedLineReader::xInitializeBlocks(
544 const vector<string>& columns,
545 unsigned int& blockCount,
546 vector<int>& blockStarts,
547 vector<int>& blockSizes)
548 // ============================================================================
549 {
550 CImportError errorInvalidBlockCountValue(
551 CImportError::ERROR, "Invalid blockCount value",
552 LineCount());
553 CImportError errorInvalidBlockStartsValue(
554 CImportError::ERROR, "Invalid blockStarts value",
555 LineCount());
556 CImportError errorInvalidBlockSizesValue(
557 CImportError::ERROR, "Invalid blockSizes value",
558 LineCount());
559 CImportError errorInconsistentBlocksInformation(
560 CImportError::ERROR, "Inconsistent blocks information",
561 LineCount());
562
563 if (columns.size() < 12) {
564 blockCount = 0;
565 return;
566 }
567 try {
568 blockCount = NStr::StringToInt(columns[9]);
569 }
570 catch(std::exception&) {
571 throw errorInvalidBlockCountValue;
572 }
573
574 blockStarts.clear();
575 blockSizes.clear();
576 try {
577 vector<string> blockSizesSplits;
578 NStr::Split(columns[10], ",", blockSizesSplits);
579 if (blockSizesSplits.back().empty()) {
580 blockSizesSplits.pop_back();
581 }
582 for (auto blockSize: blockSizesSplits) {
583 blockSizes.push_back(NStr::StringToInt(blockSize));
584 }
585 }
586 catch(std::exception&) {
587 throw errorInvalidBlockSizesValue;
588 }
589 if (blockCount != blockSizes.size()) {
590 throw errorInconsistentBlocksInformation;
591 }
592
593 try {
594 vector<string> blockStartsSplits;
595 NStr::Split(columns[11], ",", blockStartsSplits);
596 if (blockStartsSplits.back().empty()) {
597 blockStartsSplits.pop_back();
598 }
599 for (auto blockStart: blockStartsSplits) {
600 blockStarts.push_back(NStr::StringToInt(blockStart));
601 }
602 }
603 catch(std::exception&) {
604 throw errorInvalidBlockStartsValue;
605 }
606 if (blockCount != blockStarts.size()) {
607 throw errorInconsistentBlocksInformation;
608 }
609 }
610
611 // ============================================================================
612 void
xInitializeRecord(const vector<string> & columns,CFeatImportData & record_)613 CBedLineReader::xInitializeRecord(
614 const vector<string>& columns,
615 CFeatImportData& record_)
616 // ============================================================================
617 {
618 CImportError errorInvalidThickInterval(
619 CImportError::ERROR, "thickInterval extending beyond chrom feature",
620 LineCount());
621
622 assert(dynamic_cast<CBedImportData*>(&record_));
623
624 CBedImportData& record = static_cast<CBedImportData&>(record_);
625 //record.InitializeFrom(columns, mLineNumber);
626
627 string chromId;
628 TSeqPos chromStart, chromEnd;
629 ENa_strand chromStrand;
630 xInitializeChromInterval(columns, chromId, chromStart, chromEnd, chromStrand);
631
632 string name;
633 xInitializeChromName(columns, name);
634
635 double score;
636 xInitializeScore(columns, score);
637
638 TSeqPos thickStart = chromStart, thickEnd = chromStart; //!!
639 xInitializeThickInterval(columns, thickStart, thickEnd);
640 if (thickStart < chromStart || thickEnd > chromEnd) {
641 throw errorInvalidThickInterval;
642 }
643
644 CBedImportData::RgbValue rgbValue;
645 xInitializeRgb(columns, rgbValue);
646
647 unsigned int blockCount;
648 vector<int> blockStarts, blockSizes;
649 xInitializeBlocks(columns, blockCount, blockStarts, blockSizes);
650
651 record.Initialize(chromId, chromStart, chromEnd, name, score, chromStrand,
652 thickStart, thickEnd, rgbValue, blockCount, blockStarts, blockSizes);
653 }
654
655