1 /*  $Id: bed_line_reader.cpp 585671 2019-05-02 15:04:11Z ludwigf $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:  Iterate through file names matching a given glob pattern
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 
35 #include <objtools/import/import_error.hpp>
36 #include "bed_line_reader.hpp"
37 #include "bed_import_data.hpp"
38 
39 #include <assert.h>
40 
41 USING_NCBI_SCOPE;
42 USING_SCOPE(objects);
43 
44 //  ****************************************************************************
45 //  Implementation notes:
46 //  - This implementation follows the UCSC version of BED as described at
47 //    https://genome.ucsc.edu/FAQ/FAQformat.html#format1.
48 //  - Per spec, arbitrary whitespace (i.e. any combination of tabs and spaces)
49 //    is permissible as column separators.
50 //  - Per spec, there can only be a single instance of browser/track lines,
51 //    right at the beginning of the file. There is no provision for multiple
52 //    tracks in a single file.
53 //  ****************************************************************************
54 
55 //  ============================================================================
CBedLineReader(CImportMessageHandler & errorReporter)56 CBedLineReader::CBedLineReader(
57     CImportMessageHandler& errorReporter):
58 //  ============================================================================
59     CFeatLineReader(errorReporter),
60     mColumnCount(0),
61     mColumnDelimiter(" \t"),
62     mSplitFlags(NStr::fSplit_MergeDelimiters),
63     mUseScore(false),
64     mItemRgb(false),
65     mColorByStrand(false)
66 {
67 }
68 
69 //  ============================================================================
70 bool
GetNextRecord(CStreamLineReader & lineReader,CFeatImportData & record)71 CBedLineReader::GetNextRecord(
72     CStreamLineReader& lineReader,
73     CFeatImportData& record)
74 //
75 //  Error disposition:
76 //  Abort on:
77 //    Bad column count.
78 //  Terminate on:
79 //    Track line out of order.
80 //  Discard record on:
81 //    Bad topology (block description off, text in numeric fields)
82 //  Fix up data with default values on:
83 //    Bad score values, bad RGB values.
84 //  ============================================================================
85 {
86     xReportProgress();
87 
88     string nextLine = "";
89     while (!lineReader.AtEOF()) {
90         nextLine = *(++lineReader);
91         ++mLineCount;
92         if (xIgnoreLine(nextLine)) {
93             continue;
94         }
95         if (xProcessTrackLine(nextLine)) {
96             continue;
97         }
98         vector<string> columns;
99         xSplitLine(nextLine, columns);
100         xInitializeRecord(columns, record);
101         ++mRecordCount;
102         return true;
103     }
104     return false;
105 }
106 
107 //  ============================================================================
108 void
SetInputStream(CNcbiIstream & istr,bool force)109 CBedLineReader::SetInputStream(
110     CNcbiIstream& istr,
111     bool force)
112 //  ============================================================================
113 {
114     CFeatLineReader::SetInputStream(istr, force);
115 }
116 
117 //  ============================================================================
118 bool
xIgnoreLine(const string & line) const119 CBedLineReader::xIgnoreLine(
120     const string& line) const
121 //  ============================================================================
122 {
123     if (CFeatLineReader::xIgnoreLine(line)) {
124         return true;
125     }
126     return NStr::StartsWith(line, "browser");
127 }
128 
129 //  ============================================================================
130 bool
xProcessTrackLine(const string & line)131 CBedLineReader::xProcessTrackLine(
132     const string& line)
133 //  ============================================================================
134 {
135     CImportError errorInvalidTrackValue(
136         CImportError::WARNING, "Invalid track value",
137         LineCount());
138 
139     CImportError errorTrackLineOutOfOrder(
140         CImportError::CRITICAL, "Track line out of order",
141         LineCount());
142 
143     string track, values;
144     NStr::SplitInTwo(line, " \t", track, values);
145     if (track != "track") {
146         return false;
147     }
148     if (this->mRecordCount > 0) { //track line not before any data
149         throw errorTrackLineOutOfOrder;
150     }
151 
152     mAnnotInfo.Clear();
153     if (values.empty()) {
154         return true;
155     }
156 
157     vector<string> trackPieces;
158     NStr::Split(values, "=", trackPieces);
159     string key, value;
160     key = NStr::TruncateSpaces_Unsafe(trackPieces[0]);
161     for (int i = 1; i < trackPieces.size() - 1; ++i) {
162         vector<string> words;
163         NStr::Split(trackPieces[i], " \t", words);
164         auto pendingKey = words.back();
165         words.pop_back();
166         value = NStr::Join(words, " ");
167         mAnnotInfo.SetValue(key, NStr::Replace(value, "\"", ""));
168         key = pendingKey;
169     }
170     mAnnotInfo.SetValue(key, NStr::Replace(trackPieces.back(), "\"", ""));
171 
172     //cache some often accessed values:
173     const string& useScore = mAnnotInfo.ValueOf("useScore");
174     mUseScore = (!useScore.empty()  &&  useScore != "0"  &&  useScore != "false");
175 
176     const string& itemRgb = mAnnotInfo.ValueOf("itemRgb");
177     mItemRgb = (itemRgb == "on"  ||  itemRgb == "On");
178     if (!itemRgb.empty()  &&  !mItemRgb) {
179         errorInvalidTrackValue.AmendMessage("Bad itemRgb value --- ignored");
180         mErrorReporter.ReportError(errorInvalidTrackValue);
181     }
182 
183     const string& colorByStrand = mAnnotInfo.ValueOf("colorByStrand");
184     if (!colorByStrand.empty()) {
185         try {
186             string colorStrandPlus, colorStrandMinus;
187             NStr::SplitInTwo(
188                 colorByStrand, " ", colorStrandPlus, colorStrandMinus);
189             vector<string> rgbComponents;
190             NStr::Split(colorStrandPlus, ",", rgbComponents);
191             if (rgbComponents.size() != 3) {
192                 throw std::exception();
193             }
194             mRgbStrandPlus.R = NStr::StringToInt(rgbComponents[0]);
195             mRgbStrandPlus.G = NStr::StringToInt(rgbComponents[1]);
196             mRgbStrandPlus.B = NStr::StringToInt(rgbComponents[2]);
197             rgbComponents.clear();
198             NStr::Split(colorStrandMinus, ",", rgbComponents);
199             if (rgbComponents.size() != 3) {
200                 throw std::exception();
201             }
202             mRgbStrandMinus.R = NStr::StringToInt(rgbComponents[0]);
203             mRgbStrandMinus.G = NStr::StringToInt(rgbComponents[1]);
204             mRgbStrandMinus.B = NStr::StringToInt(rgbComponents[2]);
205         }
206         catch(std::exception&) {
207             errorInvalidTrackValue.AmendMessage(
208                 "Bad colorByStrand value --- ignored");
209             mErrorReporter.ReportError(errorInvalidTrackValue);
210         }
211         mColorByStrand = true;
212     }
213     return true;
214 }
215 
216 //  ============================================================================
217 void
xSplitLine(const string & line,vector<string> & columns)218 CBedLineReader::xSplitLine(
219     const string& line,
220     vector<string>& columns)
221 //
222 //  See implementation notes!
223 //  ============================================================================
224 {
225     CImportError errorInvalidColumnCount(
226         CImportError::FATAL, "Invalid column count",
227         LineCount());
228 
229     columns.clear();
230     NStr::Split(line, mColumnDelimiter, columns, mSplitFlags);
231 	if (mColumnCount == 0) {
232 		if (columns.size() < 3  ||  columns.size() > 12) {
233 			throw errorInvalidColumnCount;
234 		}
235         mColumnCount = columns.size();
236         return;
237 	}
238     if (columns.size() != mColumnCount) {
239         throw errorInvalidColumnCount;
240     }
241 }
242 
243 //  ============================================================================
244 void
xInitializeChromInterval(const vector<string> & columns,string & chromId,TSeqPos & chromStart,TSeqPos & chromEnd,ENa_strand & chromStrand)245 CBedLineReader::xInitializeChromInterval(
246     const vector<string>& columns,
247     string& chromId,
248     TSeqPos& chromStart,
249     TSeqPos& chromEnd,
250     ENa_strand& chromStrand)
251 //  ============================================================================
252 {
253     CImportError errorInvalidChromStartValue(
254         CImportError::ERROR, "Invalid chromStart value",
255         LineCount());
256     CImportError errorInvalidChromEndValue(
257         CImportError::ERROR, "Invalid chromEnd value",
258         LineCount());
259     CImportError errorInvalidStrandValue(
260         CImportError::ERROR, "Invalid strand value",
261         LineCount());
262 
263     chromId = columns[0];
264 
265     try {
266         chromStart = NStr::StringToInt(columns[1]);
267     }
268     catch(CException&) {
269         throw errorInvalidChromStartValue;
270     }
271 
272     try {
273         chromEnd = NStr::StringToInt(columns[2]);
274     }
275     catch(CException&) {
276         throw errorInvalidChromEndValue;
277     }
278 
279     chromStrand = eNa_strand_plus;
280     if (columns.size() > 5) {
281         const auto& strand = columns[5];
282         if (strand != "+"  &&  strand != "-"  &&  strand != ".") {
283             throw errorInvalidStrandValue;
284         }
285         if (strand == "-") {
286             chromStrand = eNa_strand_minus;
287         }
288     }
289 }
290 
291 //  ============================================================================
292 void
xInitializeScore(const vector<string> & columns,double & score)293 CBedLineReader::xInitializeScore(
294     const vector<string>& columns,
295     double& score)
296 //  ============================================================================
297 {
298     CImportError errorInvalidScoreValue(
299         CImportError::WARNING, "Invalid score value- omitting from output.",
300         LineCount());
301 
302     if (columns.size() < 5  ||  columns[4] == "."  ||  mUseScore) {
303         score = -1.0;
304         return;
305     }
306     try {
307         score = NStr::StringToDouble(columns[4]);
308     }
309     catch(CException&) {
310         mErrorReporter.ReportError(errorInvalidScoreValue);
311         score = -1.0;
312         return;
313     }
314 }
315 
316 //  ============================================================================
317 void
xInitializeRgb(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)318 CBedLineReader::xInitializeRgb(
319     const vector<string>& columns,
320     CBedImportData::RgbValue& rgbValue)
321     //  ============================================================================
322 {
323     if (mUseScore) {
324         xInitializeRgbFromScoreColumn(columns, rgbValue);
325         return;
326     }
327     if (mItemRgb) {
328         xInitializeRgbFromRgbColumn(columns, rgbValue);
329         return;
330     }
331     if (mColorByStrand) {
332         xInitializeRgbFromStrandColumn(columns, rgbValue);
333         return;
334     }
335     rgbValue.R = rgbValue.B = rgbValue.G = -1;
336 }
337 
338 //  ============================================================================
339 void
xInitializeRgbFromStrandColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)340 CBedLineReader::xInitializeRgbFromStrandColumn(
341     const vector<string>& columns,
342     CBedImportData::RgbValue& rgbValue)
343 //  ============================================================================
344 {
345     CImportError errorInvalidStrandValue(
346         CImportError::WARNING,
347         "Invalid strand value- setting color to BLACK.",
348         LineCount());
349 
350     if (columns.size() < 6  ||
351             (columns[5] != "+"  &&  columns[5] != "-"  &&  columns[5] != ".")) {
352         mErrorReporter.ReportError(errorInvalidStrandValue);
353         rgbValue.R = rgbValue.G = rgbValue.B = 0;
354         return;
355     }
356     if (columns[5] == "-") {
357         rgbValue.R = mRgbStrandMinus.R;
358         rgbValue.B = mRgbStrandMinus.B;
359         rgbValue.G = mRgbStrandMinus.G;
360     }
361     else {
362         rgbValue.R = mRgbStrandPlus.R;
363         rgbValue.B = mRgbStrandPlus.B;
364         rgbValue.G = mRgbStrandPlus.G;
365     }
366 }
367 
368 //  ============================================================================
369 void
xInitializeRgbFromScoreColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)370 CBedLineReader::xInitializeRgbFromScoreColumn(
371     const vector<string>& columns,
372     CBedImportData::RgbValue& rgbValue)
373 //  ============================================================================
374 {
375     CImportError errorInvalidScoreValue(
376         CImportError::WARNING, "Invalid score value- setting color to BLACK.",
377         LineCount());
378     CImportError errorScoreValueTooLow(
379         CImportError::WARNING, "Invalid score value- clipping to 0.",
380         LineCount());
381     CImportError errorScoreValueTooHigh(
382         CImportError::WARNING, "Invalid score value- clipping to 1000.",
383         LineCount());
384 
385     if (columns.size() < 5 || columns[4] == ".") {
386         mErrorReporter.ReportError(errorInvalidScoreValue);
387         rgbValue.R = rgbValue.G = rgbValue.B = 0;
388         return;
389     }
390     auto scoreValue = 0;
391     try {
392         scoreValue = static_cast<int>(NStr::StringToDouble(columns[4]));
393     }
394     catch(CException&) {
395         mErrorReporter.ReportError(errorInvalidScoreValue);
396         rgbValue.R = rgbValue.G = rgbValue.B = 0;
397         return;
398     }
399 
400     if (scoreValue < 0) {
401         mErrorReporter.ReportError(errorScoreValueTooLow);
402         scoreValue = 0;
403     }
404     else if (scoreValue > 1000) {
405         mErrorReporter.ReportError(errorScoreValueTooHigh);
406         scoreValue = 1000;
407     }
408     if (scoreValue == 0) {
409         rgbValue.R = rgbValue.G = rgbValue.B = 0;
410         return;
411     }
412     if (scoreValue > 998) {
413         rgbValue.R = rgbValue.G = rgbValue.B = 255;
414         return;
415     }
416 
417     scoreValue = static_cast<int>(scoreValue / 111);
418     rgbValue.R = rgbValue.G = rgbValue.B = (13 + 29*scoreValue);
419     return;
420 }
421 
422 //  ============================================================================
423 void
xInitializeRgbFromRgbColumn(const vector<string> & columns,CBedImportData::RgbValue & rgbValue)424 CBedLineReader::xInitializeRgbFromRgbColumn(
425     const vector<string>& columns,
426     CBedImportData::RgbValue& rgbValue)
427 //  ============================================================================
428 {
429     CImportError errorInvalidRgbValue(
430         CImportError::WARNING, "Invalid RGB value- defaulting to BLACK",
431         LineCount());
432 
433     rgbValue.R = rgbValue.G = rgbValue.B = 0;
434     if (columns.size() < 9  ||  columns[8] == ".") {
435         return;
436     }
437 
438     string rgb = columns[8];
439     try {
440         vector<string > rgbSplits;
441         NStr::Split(rgb, ",", rgbSplits);
442         switch(rgbSplits.size()) {
443         default:
444             throw errorInvalidRgbValue;
445         case 1: {
446             unsigned long rgbInt = 0;
447             if (NStr::StartsWith(rgb, "0x")) {
448                 auto hexDigits = rgb.substr(2, string::npos);
449                 rgbInt = NStr::StringToULong(hexDigits, 0, 16);
450             }
451             else if (NStr::StartsWith(rgb, "#")) {
452                 auto hexDigits = rgb.substr(1, string::npos);
453                 rgbInt = NStr::StringToULong(hexDigits, 0, 16);
454             }
455             else {
456                 rgbInt = NStr::StringToULong(rgbSplits[0]);
457             }
458             rgbInt &= 0xffffff;
459             rgbValue.R = (rgbInt & 0xff0000) >> 16;
460             rgbValue.G = (rgbInt & 0x00ff00) >> 8;
461             rgbValue.B = (rgbInt & 0x0000ff);
462             break;
463         }
464         case 3: {
465             rgbValue.R = NStr::StringToInt(rgbSplits[0]);
466             rgbValue.G = NStr::StringToInt(rgbSplits[1]);
467             rgbValue.B = NStr::StringToInt(rgbSplits[2]);
468             break;
469         }
470         }
471     }
472     catch(CException&) {
473         rgbValue.R = rgbValue.G = rgbValue.B = 0;
474         mErrorReporter.ReportError(errorInvalidRgbValue);
475         return;
476     }
477     if (rgbValue.R < 0  ||  255 < rgbValue.R) {
478         rgbValue.R = rgbValue.G = rgbValue.B = 0;
479         mErrorReporter.ReportError(errorInvalidRgbValue);
480         return;
481     }
482     if (rgbValue.G < 0  ||  255 < rgbValue.G) {
483         rgbValue.R = rgbValue.G = rgbValue.B = 0;
484         mErrorReporter.ReportError(errorInvalidRgbValue);
485         return;
486     }
487     if (rgbValue.B < 0  ||  255 < rgbValue.B) {
488         rgbValue.R = rgbValue.G = rgbValue.B = 0;
489         mErrorReporter.ReportError(errorInvalidRgbValue);
490         return;
491     }
492 }
493 
494 //  ============================================================================
495 void
xInitializeThickInterval(const vector<string> & columns,TSeqPos & thickStart,TSeqPos & thickEnd)496 CBedLineReader::xInitializeThickInterval(
497     const vector<string>& columns,
498     TSeqPos& thickStart,
499     TSeqPos& thickEnd)
500 //  ============================================================================
501 {
502     CImportError errorInvalidThickStartValue(
503         CImportError::ERROR, "Invalid thickStart value",
504         LineCount());
505     CImportError errorInvalidThickEndValue(
506         CImportError::ERROR, "Invalid thickEnd value",
507         LineCount());
508     if (columns.size() < 8) {
509         return;
510     }
511 
512     try {
513         thickStart = NStr::StringToInt(columns[6]);
514     }
515     catch(CException&) {
516         throw errorInvalidThickStartValue;
517     }
518 
519     try {
520         thickEnd = NStr::StringToInt(columns[7]);
521     }
522     catch(CException&) {
523         throw errorInvalidThickEndValue;
524     }
525 }
526 
527 //  ============================================================================
528 void
xInitializeChromName(const vector<string> & columns,string & chromName)529 CBedLineReader::xInitializeChromName(
530     const vector<string>& columns,
531     string& chromName)
532 //  ============================================================================
533 {
534     if (columns.size() < 4) {
535         chromName.clear();
536         return;
537     }
538     chromName = columns[3];
539 }
540 
541 //  ============================================================================
542 void
xInitializeBlocks(const vector<string> & columns,unsigned int & blockCount,vector<int> & blockStarts,vector<int> & blockSizes)543 CBedLineReader::xInitializeBlocks(
544     const vector<string>& columns,
545     unsigned int& blockCount,
546     vector<int>& blockStarts,
547     vector<int>& blockSizes)
548 //  ============================================================================
549 {
550     CImportError errorInvalidBlockCountValue(
551         CImportError::ERROR, "Invalid blockCount value",
552         LineCount());
553     CImportError errorInvalidBlockStartsValue(
554         CImportError::ERROR, "Invalid blockStarts value",
555         LineCount());
556     CImportError errorInvalidBlockSizesValue(
557         CImportError::ERROR, "Invalid blockSizes value",
558         LineCount());
559     CImportError errorInconsistentBlocksInformation(
560         CImportError::ERROR, "Inconsistent blocks information",
561         LineCount());
562 
563     if (columns.size() < 12) {
564         blockCount = 0;
565         return;
566     }
567     try {
568         blockCount = NStr::StringToInt(columns[9]);
569     }
570     catch(std::exception&) {
571         throw errorInvalidBlockCountValue;
572     }
573 
574     blockStarts.clear();
575     blockSizes.clear();
576     try {
577         vector<string> blockSizesSplits;
578         NStr::Split(columns[10], ",", blockSizesSplits);
579         if (blockSizesSplits.back().empty()) {
580             blockSizesSplits.pop_back();
581         }
582         for (auto blockSize: blockSizesSplits) {
583             blockSizes.push_back(NStr::StringToInt(blockSize));
584         }
585     }
586     catch(std::exception&) {
587         throw errorInvalidBlockSizesValue;
588     }
589     if (blockCount != blockSizes.size()) {
590         throw errorInconsistentBlocksInformation;
591     }
592 
593     try {
594         vector<string> blockStartsSplits;
595         NStr::Split(columns[11], ",", blockStartsSplits);
596         if (blockStartsSplits.back().empty()) {
597             blockStartsSplits.pop_back();
598         }
599         for (auto blockStart: blockStartsSplits) {
600             blockStarts.push_back(NStr::StringToInt(blockStart));
601         }
602     }
603     catch(std::exception&) {
604         throw errorInvalidBlockStartsValue;
605     }
606     if (blockCount != blockStarts.size()) {
607         throw errorInconsistentBlocksInformation;
608     }
609 }
610 
611 //  ============================================================================
612 void
xInitializeRecord(const vector<string> & columns,CFeatImportData & record_)613 CBedLineReader::xInitializeRecord(
614     const vector<string>& columns,
615     CFeatImportData& record_)
616 //  ============================================================================
617 {
618     CImportError errorInvalidThickInterval(
619         CImportError::ERROR, "thickInterval extending beyond chrom feature",
620         LineCount());
621 
622     assert(dynamic_cast<CBedImportData*>(&record_));
623 
624     CBedImportData& record = static_cast<CBedImportData&>(record_);
625     //record.InitializeFrom(columns, mLineNumber);
626 
627     string chromId;
628     TSeqPos chromStart, chromEnd;
629     ENa_strand chromStrand;
630     xInitializeChromInterval(columns, chromId, chromStart, chromEnd, chromStrand);
631 
632     string name;
633     xInitializeChromName(columns, name);
634 
635     double score;
636     xInitializeScore(columns, score);
637 
638     TSeqPos thickStart = chromStart, thickEnd = chromStart; //!!
639     xInitializeThickInterval(columns, thickStart, thickEnd);
640     if (thickStart < chromStart  ||  thickEnd > chromEnd) {
641         throw errorInvalidThickInterval;
642     }
643 
644     CBedImportData::RgbValue rgbValue;
645     xInitializeRgb(columns, rgbValue);
646 
647     unsigned int blockCount;
648     vector<int> blockStarts, blockSizes;
649     xInitializeBlocks(columns, blockCount, blockStarts, blockSizes);
650 
651     record.Initialize(chromId, chromStart, chromEnd, name, score, chromStrand,
652         thickStart, thickEnd, rgbValue, blockCount, blockStarts, blockSizes);
653 }
654 
655