1 /* $Id: vcf_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description:
29 * VCF file reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35
36 #include <util/line_reader.hpp>
37
38 #include <objects/general/Object_id.hpp>
39 #include <objects/general/User_object.hpp>
40 #include <objects/general/User_field.hpp>
41 #include <objects/general/Dbtag.hpp>
42
43 #include <objects/seqloc/Seq_interval.hpp>
44 #include <objects/seqloc/Seq_point.hpp>
45
46 #include <objects/seq/Annot_descr.hpp>
47 #include <objects/seq/Seq_literal.hpp>
48 #include <objects/seqfeat/Variation_ref.hpp>
49 #include <objects/seqfeat/Variation_inst.hpp>
50 #include <objects/seqfeat/VariantProperties.hpp>
51 #include <objects/seqfeat/Delta_item.hpp>
52
53 #include <objtools/readers/vcf_reader.hpp>
54
55 #include <algorithm>
56
57 #include "reader_message_handler.hpp"
58
59 #define NCBI_USE_ERRCODE_X Objtools_Rd_RepMask
60
61 BEGIN_NCBI_SCOPE
62 BEGIN_objects_SCOPE // namespace ncbi::objects::
63
64 // ============================================================================
65 const double CVcfReader::mMaxSupportedVersion = 4.1;
66 // ============================================================================
67
68 // ============================================================================
69 class CVcfData
70 // ============================================================================
71 {
72 public:
73 typedef map<string,vector<string> > INFOS;
74 typedef map<string, vector<string> > GTDATA;
75
CVcfData()76 CVcfData() { m_pdQual = 0; };
~CVcfData()77 ~CVcfData() { delete m_pdQual; };
78
79 string m_strLine;
80 string m_strChrom;
81 int m_iPos;
82 vector<string> m_Ids;
83 string m_strRef;
84 vector<string> m_Alt;
85 double* m_pdQual;
86 string m_strFilter;
87 INFOS m_Info;
88 vector<string> m_FormatKeys;
89 // vector< vector<string> > m_GenotypeData;
90 GTDATA m_GenotypeData;
91 enum SetType_t {
92 ST_ALL_SNV,
93 ST_ALL_DEL,
94 ST_ALL_INS,
95 ST_ALL_MNV,
96 ST_MIXED
97 } m_SetType;
98 };
99
100 // ----------------------------------------------------------------------------
SpecType(const string & spectype)101 ESpecType SpecType(
102 const string& spectype )
103 // ----------------------------------------------------------------------------
104 {
105 static map<string, ESpecType> typemap;
106 if ( typemap.empty() ) {
107 typemap["Integer"] = eType_Integer;
108 typemap["Float"] = eType_Float;
109 typemap["Flag"] = eType_Flag;
110 typemap["Character"] = eType_Character;
111 typemap["String"] = eType_String;
112 }
113 try {
114 return typemap[spectype];
115 }
116 catch( ... ) {
117 AutoPtr<CObjReaderLineException> pErr(
118 CObjReaderLineException::Create(
119 eDiag_Warning,
120 0,
121 "CVcfReader::xProcessMetaLineInfo: Unrecognized line or record type.",
122 ILineError::eProblem_GeneralParsingError) );
123 pErr->Throw();
124 return eType_String;
125 }
126 };
127
128 // ----------------------------------------------------------------------------
SpecNumber(const string & specnumber)129 ESpecNumber SpecNumber(
130 const string& specnumber )
131 // ----------------------------------------------------------------------------
132 {
133 if ( specnumber == "R" ) {
134 return eNumber_CountAllAlleles;
135 }
136 if ( specnumber == "A" ) {
137 return eNumber_CountAlleles;
138 }
139 if ( specnumber == "G" ) {
140 return eNumber_CountGenotypes;
141 }
142 if ( specnumber == "." ) {
143 return eNumber_CountUnknown;
144 }
145 try {
146 return ESpecNumber( NStr::StringToInt( specnumber ) );
147 }
148 catch( ... ) {
149 AutoPtr<CObjReaderLineException> pErr(
150 CObjReaderLineException::Create(
151 eDiag_Warning,
152 0,
153 "CVcfReader::xProcessMetaLineInfo: Unrecognized SpecNumber type in FORAMT directive. "
154 "Recognized settings are \'A\', \'G\', \'R\', \'.\', or numeric.",
155 ILineError::eProblem_GeneralParsingError) );
156 pErr->Throw();
157 }
158 return ESpecNumber( 0 );
159 };
160
161 // ----------------------------------------------------------------------------
CVcfReader(int flags,CReaderListener * pRL)162 CVcfReader::CVcfReader(
163 int flags,
164 CReaderListener* pRL):
165 CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
166 mActualVersion(0.0),
167 m_MetaHandled(false)
168 // ----------------------------------------------------------------------------
169 {
170 }
171
172
173 // ----------------------------------------------------------------------------
~CVcfReader()174 CVcfReader::~CVcfReader()
175 // ----------------------------------------------------------------------------
176 {
177 }
178
179 // ----------------------------------------------------------------------------
180 CRef< CSeq_annot >
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)181 CVcfReader::ReadSeqAnnot(
182 ILineReader& lr,
183 ILineErrorListener* pEC )
184 // ----------------------------------------------------------------------------
185 {
186 if (!m_Meta) {
187 m_Meta.Reset( new CAnnotdesc );
188 m_Meta->SetUser().SetType().SetStr( "vcf-meta-info" );
189 }
190 CRef<CSeq_annot> pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
191 if (pAnnot) {
192 xAssignTrackData(*pAnnot);
193 xAssignVcfMeta(*pAnnot);
194 }
195 return pAnnot;
196 }
197
198 // ----------------------------------------------------------------------------
199 CRef<CSeq_annot>
xCreateSeqAnnot()200 CVcfReader::xCreateSeqAnnot()
201 // ----------------------------------------------------------------------------
202 {
203 CRef<CSeq_annot> pAnnot = CReaderBase::xCreateSeqAnnot();
204 pAnnot->SetData().SetFtable();
205 return pAnnot;
206 }
207
208 // ----------------------------------------------------------------------------
209 void
xGetData(ILineReader & lr,TReaderData & readerData)210 CVcfReader::xGetData(
211 ILineReader& lr,
212 TReaderData& readerData)
213 // ----------------------------------------------------------------------------
214 {
215 readerData.clear();
216 string line;
217 if (!xGetLine(lr, line)) {
218 return;
219 }
220 if (xIsTrackLine(line) && m_uDataCount) {
221 xUngetLine(lr);
222 return;
223 }
224 readerData.push_back(TReaderLine{m_uLineNumber, line});
225 }
226
227 // ----------------------------------------------------------------------------
228 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)229 CVcfReader::xProcessData(
230 const TReaderData& readerData,
231 CSeq_annot& annot)
232 // ----------------------------------------------------------------------------
233 {
234 for (auto lineInfo: readerData) {
235 const auto& line = lineInfo.mData;
236 if (mActualVersion == 0.0) {
237 bool lineContainsVersion(false);
238 xSetFileFormat(line, annot, lineContainsVersion);
239 // Note:
240 // Currently, the line format specifier is also processed as a
241 // meta line even though it really isn't, or at least it's a very
242 // different meta than all other VCF metas.
243 // Uncomment the following lines to no longer process the file format
244 // specifier as a meta.
245 //if (lineContainsVersion) {
246 //return;
247 //}
248 }
249
250 if (xParseBrowserLine(line, annot)) {
251 return;
252 }
253 if (xProcessTrackLine(line, annot)) {
254 return;
255 }
256 if (xProcessMetaLine(line, annot)) {
257 return;
258 }
259 if (xProcessHeaderLine(line, annot)) {
260 return;
261 }
262 if (xProcessDataLine(line, annot)) {
263 ++m_uDataCount;
264 return;
265 }
266 CReaderMessage warning(
267 eDiag_Warning,
268 m_uLineNumber,
269 "CVcfReader::ReadSeqAnnot: Unrecognized line or record type.");
270 m_pMessageHandler->Report(warning);
271 }
272 }
273
274 // ----------------------------------------------------------------------------
275 bool
xProcessMetaLine(const string & line,CSeq_annot & annot)276 CVcfReader::xProcessMetaLine(
277 const string& line,
278 CSeq_annot& annot)
279 // ----------------------------------------------------------------------------
280 {
281 if ( ! NStr::StartsWith( line, "##" ) ) {
282 if ( !m_MetaDirectives.empty() && !m_MetaHandled ) {
283 m_Meta->SetUser().AddField("meta-information", m_MetaDirectives);
284 }
285 m_MetaHandled = true;
286 return false;
287 }
288 m_MetaDirectives.push_back(line.substr(2));
289
290 if (xProcessMetaLineInfo(line, annot)) {
291 return true;
292 }
293 if (xProcessMetaLineFilter(line, annot)) {
294 return true;
295 }
296 if (xProcessMetaLineFormat(line, annot)) {
297 return true;
298 }
299 return true;
300 }
301
302 // ----------------------------------------------------------------------------
303 void
xSetFileFormat(const string & line,CSeq_annot & annot,bool & lineContainsVersion)304 CVcfReader::xSetFileFormat(
305 const string& line,
306 CSeq_annot& annot,
307 bool& lineContainsVersion)
308 // ----------------------------------------------------------------------------
309 {
310 const string prefix = "##fileformat=VCFv";
311
312 if (!NStr::StartsWith(line, prefix)) {
313 CReaderMessage warning(
314 eDiag_Warning,
315 m_uLineNumber,
316 string("CVcfReader::xProcessMetaLineFileFormat: ") +
317 "Missing VCF version string. Assuming VCFv" +
318 NStr::DoubleToString(mMaxSupportedVersion) +
319 ". Proceed with care!");
320 m_pMessageHandler->Report(warning);
321 mActualVersion = mMaxSupportedVersion;
322 lineContainsVersion = false;
323 return;
324 }
325
326 lineContainsVersion = true;
327
328 string versionStr = line.substr(prefix.length(), string::npos);
329 try {
330 mActualVersion = NStr::StringToDouble(versionStr);
331 }
332 catch (std::exception except) {
333 CReaderMessage warning(
334 eDiag_Warning,
335 m_uLineNumber,
336 string("CVcfReader::xProcessMetaLineFileFormat: ") +
337 "Data file contains an unrecognized version string \"" +
338 versionStr +
339 "\". Assuming VCFv" +
340 NStr::DoubleToString(mMaxSupportedVersion) +
341 ". Proceed with care!");
342 m_pMessageHandler->Report(warning);
343 mActualVersion = mMaxSupportedVersion;
344 return;
345 }
346
347 if (mActualVersion > mMaxSupportedVersion) {
348 CReaderMessage warning(
349 eDiag_Warning,
350 m_uLineNumber,
351 string("CVcfReader::xProcessMetaLineFileFormat: Data file format \"") +
352 versionStr +
353 "\" exceeds reader supported format \"" +
354 NStr::DoubleToString(mMaxSupportedVersion) +
355 "\". Proceed with care!");
356 m_pMessageHandler->Report(warning);
357 mActualVersion = mMaxSupportedVersion;
358 return;
359 }
360 }
361
362 // ----------------------------------------------------------------------------
363 bool
xProcessMetaLineInfo(const string & line,CSeq_annot & annot)364 CVcfReader::xProcessMetaLineInfo(
365 const string& line,
366 CSeq_annot& annot)
367 // ----------------------------------------------------------------------------
368 {
369 const string prefix = "##INFO=<";
370 const string postfix = ">";
371
372 if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
373 return false;
374 }
375
376 try {
377 vector<string> fields;
378 string key, id, numcount, type, description;
379 string info = line.substr(
380 prefix.length(), line.length() - prefix.length() - postfix.length() );
381 NStr::Split( info, ",", fields );
382 NStr::SplitInTwo( fields[0], "=", key, id );
383 if ( key != "ID" ) {
384 AutoPtr<CObjReaderLineException> pErr(
385 CObjReaderLineException::Create(
386 eDiag_Error,
387 0,
388 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"ID\".",
389 ILineError::eProblem_BadInfoLine) );
390 pErr->Throw();
391 }
392 NStr::SplitInTwo( fields[1], "=", key, numcount );
393 if ( key != "Number" ) {
394 AutoPtr<CObjReaderLineException> pErr(
395 CObjReaderLineException::Create(
396 eDiag_Error,
397 0,
398 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Number\".",
399 ILineError::eProblem_BadInfoLine) );
400 pErr->Throw();
401 }
402 NStr::SplitInTwo( fields[2], "=", key, type );
403 if ( key != "Type" ) {
404 AutoPtr<CObjReaderLineException> pErr(
405 CObjReaderLineException::Create(
406 eDiag_Error,
407 0,
408 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Type\".",
409 ILineError::eProblem_BadInfoLine) );
410 pErr->Throw();
411 }
412 NStr::SplitInTwo( fields[3], "=", key, description );
413 if ( key != "Description" ) {
414 AutoPtr<CObjReaderLineException> pErr(
415 CObjReaderLineException::Create(
416 eDiag_Error,
417 0,
418 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Description\".",
419 ILineError::eProblem_BadInfoLine) );
420 pErr->Throw();
421 }
422 m_InfoSpecs[id] = CVcfInfoSpec( id, numcount, type, description );
423 }
424 catch (CObjReaderLineException& err) {
425 ProcessError(err, nullptr);
426 }
427 return true;
428 }
429
430 // ----------------------------------------------------------------------------
431 bool
xProcessMetaLineFilter(const string & line,CSeq_annot & annot)432 CVcfReader::xProcessMetaLineFilter(
433 const string& line,
434 CSeq_annot& annot)
435 // ----------------------------------------------------------------------------
436 {
437 const string prefix = "##FILTER=<";
438 const string postfix = ">";
439
440 if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
441 return false;
442 }
443
444 try {
445 vector<string> fields;
446 string key, id, description;
447 string info = line.substr(
448 prefix.length(), line.length() - prefix.length() - postfix.length() );
449 NStr::Split( info, ",", fields );
450 NStr::SplitInTwo( fields[0], "=", key, id );
451 if ( key != "ID" ) {
452 AutoPtr<CObjReaderLineException> pErr(
453 CObjReaderLineException::Create(
454 eDiag_Error,
455 0,
456 "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"ID\".",
457 ILineError::eProblem_BadFilterLine) );
458 pErr->Throw();
459 }
460 NStr::SplitInTwo( fields[1], "=", key, description );
461 if ( key != "Description" ) {
462 AutoPtr<CObjReaderLineException> pErr(
463 CObjReaderLineException::Create(
464 eDiag_Error,
465 0,
466 "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"Description\".",
467 ILineError::eProblem_BadFilterLine) );
468 pErr->Throw();
469 }
470 m_FilterSpecs[id] = CVcfFilterSpec( id, description );
471 }
472 catch (CObjReaderLineException& err) {
473 ProcessError(err, nullptr);
474 }
475 return true;
476 }
477
478 // ----------------------------------------------------------------------------
479 bool
xProcessMetaLineFormat(const string & line,CSeq_annot & annot)480 CVcfReader::xProcessMetaLineFormat(
481 const string& line,
482 CSeq_annot& annot)
483 // ----------------------------------------------------------------------------
484 {
485 const string prefix = "##FORMAT=<";
486 const string postfix = ">";
487
488 if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
489 return false;
490 }
491
492 try {
493 vector<string> fields;
494 string key, id, numcount, type, description;
495 string info = line.substr(
496 prefix.length(), line.length() - prefix.length() - postfix.length() );
497 NStr::Split( info, ",", fields );
498 NStr::SplitInTwo( fields[0], "=", key, id );
499 if ( key != "ID" ) {
500 AutoPtr<CObjReaderLineException> pErr(
501 CObjReaderLineException::Create(
502 eDiag_Error,
503 0,
504 "CVcfReader::xProcessMetaLineInfo: ##FORMAT with bad or missing \"ID\".",
505 ILineError::eProblem_BadFormatLine) );
506 pErr->Throw();
507 }
508 NStr::SplitInTwo( fields[1], "=", key, numcount );
509 if ( key != "Number" ) {
510 AutoPtr<CObjReaderLineException> pErr(
511 CObjReaderLineException::Create(
512 eDiag_Error,
513 0,
514 "CVcfReader::xProcessMetaLineInfo: "
515 "##FORMAT with bad or missing \"Number\".",
516 ILineError::eProblem_BadFormatLine) );
517 pErr->Throw();
518 }
519 NStr::SplitInTwo( fields[2], "=", key, type );
520 if ( key != "Type" ) {
521 AutoPtr<CObjReaderLineException> pErr(
522 CObjReaderLineException::Create(
523 eDiag_Error,
524 0,
525 "CVcfReader::xProcessMetaLineInfo: "
526 "##FORMAT with bad or missing \"Type\".",
527 ILineError::eProblem_BadFormatLine) );
528 pErr->Throw();
529 }
530 NStr::SplitInTwo( fields[3], "=", key, description );
531 if ( key != "Description" ) {
532 AutoPtr<CObjReaderLineException> pErr(
533 CObjReaderLineException::Create(
534 eDiag_Error,
535 0,
536 "CVcfReader::xProcessMetaLineInfo: "
537 "##FORMAT with bad or missing \"Description\".",
538 ILineError::eProblem_BadFormatLine) );
539 pErr->Throw();
540 }
541 m_FormatSpecs[id] = CVcfFormatSpec( id, numcount, type, description );
542 }
543 catch (CObjReaderLineException& err) {
544 ProcessError(err, nullptr);
545 }
546 return true;
547 }
548
549 // ----------------------------------------------------------------------------
550 bool
xProcessHeaderLine(const string & line,CSeq_annot & annot)551 CVcfReader::xProcessHeaderLine(
552 const string& line,
553 CSeq_annot& annot )
554 // ----------------------------------------------------------------------------
555 {
556 if ( ! NStr::StartsWith( line, "#CHROM" ) ) {
557 return false;
558 }
559
560 //
561 // Per spec:
562 // The header line provides the column headers for the data records that follow.
563 // the first few are fixed and mandatory: CHROM .. FILTER.
564 // If genotype data is present this is followed by the FORMAT header.
565 // After that come the various headers for the genotype information, and these
566 // need to be preserved:
567 //
568 NStr::Split(line, " \t", m_GenotypeHeaders, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
569 vector<string>::iterator pos_format = find(
570 m_GenotypeHeaders.begin(), m_GenotypeHeaders.end(), "FORMAT");
571 if ( pos_format == m_GenotypeHeaders.end() ) {
572 m_GenotypeHeaders.clear();
573 }
574 else {
575 m_GenotypeHeaders.erase( m_GenotypeHeaders.begin(), pos_format+1 );
576 m_Meta->SetUser().AddField("genotype-headers", m_GenotypeHeaders);
577 }
578
579 //
580 // The header line signals the end of meta information, so migrate the
581 // accumulated meta information into the seq descriptor:
582 //
583 return true;
584 }
585
586 // ----------------------------------------------------------------------------
587 bool
xAssignVcfMeta(CSeq_annot & annot)588 CVcfReader::xAssignVcfMeta(
589 CSeq_annot& annot)
590 // ----------------------------------------------------------------------------
591 {
592 if (m_Meta && m_Meta->IsUser() && m_Meta->GetUser().IsSetData()) {
593 if (!annot.IsSetDesc()) {
594 CRef< CAnnot_descr > desc( new CAnnot_descr );
595 annot.SetDesc(*desc);
596 }
597 annot.SetDesc().Set().push_back( m_Meta );
598 }
599 //else { // VCF input ought to include a header
600 // CReaderMessage warning(
601 // eDiag_Warning,
602 // m_uLineNumber,
603 // "CVcfReader::xAssignVcfMeta: Missing VCF header data.");
604 // m_pMessageHandler->Report(warning);
605 //}
606 return true;
607 }
608
609 // ----------------------------------------------------------------------------
610 bool
xProcessDataLine(const string & line,CSeq_annot & annot)611 CVcfReader::xProcessDataLine(
612 const string& line,
613 CSeq_annot& annot)
614 // ----------------------------------------------------------------------------
615 {
616 if ( NStr::StartsWith( line, "#" ) ) {
617 return false;
618 }
619
620 CVcfData data;
621 if (!xParseData(line, data, nullptr)) {
622 return false;
623 }
624 CRef<CSeq_feat> pFeat( new CSeq_feat );
625 pFeat->SetData().SetVariation().SetData().SetSet().SetType(
626 CVariation_ref::C_Data::C_Set::eData_set_type_package );
627 pFeat->SetData().SetVariation().SetVariant_prop().SetVersion( 5 );
628 CSeq_feat::TExt& ext = pFeat->SetExt();
629 ext.SetType().SetStr( "VcfAttributes" );
630
631 if (!xAssignFeatureLocationSet(data, pFeat)) {
632 return false;
633 }
634 if (!xAssignVariationIds(data, pFeat)) {
635 return false;
636 }
637 if (!xAssignVariationAlleleSet(data, pFeat)) {
638 return false;
639 }
640 if (!xProcessScore(data, pFeat)) {
641 return false;
642 }
643 if (!xProcessFilter(data, pFeat)) {
644 return false;
645 }
646 if (!xProcessInfo( data, pFeat)) {
647 return false;
648 }
649 if (!xProcessFormat(data, pFeat)) {
650 return false;
651 }
652
653 if ( pFeat->GetExt().GetData().empty() ) {
654 pFeat->ResetExt();
655 }
656 annot.SetData().SetFtable().push_back(pFeat);
657 return true;
658 }
659
660 // ----------------------------------------------------------------------------
661 bool
xAssignVariationAlleleSet(const CVcfData & data,CRef<CSeq_feat> pFeature)662 CVcfReader::xAssignVariationAlleleSet(
663 const CVcfData& data,
664 CRef<CSeq_feat> pFeature )
665 // ----------------------------------------------------------------------------
666 {
667 CVariation_ref::TData::TSet::TVariations& variants =
668 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
669
670 //make one variation for the reference
671 CRef<CVariation_ref> pIdentity(new CVariation_ref);
672 vector<string> variant;
673
674 switch(data.m_SetType) {
675 case CVcfData::ST_ALL_INS:
676 pIdentity->SetDeletion();
677 break;
678 default:
679 variant.push_back(data.m_strRef);
680 pIdentity->SetSNV(variant, CVariation_ref::eSeqType_na);
681 break;
682 }
683 CVariation_inst& instance = pIdentity->SetData().SetInstance();
684 instance.SetType(CVariation_inst::eType_identity);
685 instance.SetObservation(CVariation_inst::eObservation_reference);
686 if (data.m_SetType != CVcfData::ST_ALL_INS) {
687 variants.push_back(pIdentity);
688 }
689
690 bool no_alt = true;
691 for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
692 if (!NStr::Equal(data.m_Alt[i],".")) {
693 no_alt = false;
694 }
695 }
696 if (no_alt) {
697 instance.SetObservation() |= CVariation_inst::eObservation_variant;
698 return true;
699 }
700
701
702 //add additional variations, one for each alternative
703 for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
704 if (NStr::Equal(data.m_Alt[i],".")) {
705 continue;
706 }
707 switch(data.m_SetType) {
708 default:
709 if (!xAssignVariantDelins(data, i, pFeature)) {
710 return false;
711 }
712 break;
713 case CVcfData::ST_ALL_SNV:
714 if (!xAssignVariantSnv(data, i, pFeature)) {
715 return false;
716 }
717 break;
718 case CVcfData::ST_ALL_MNV:
719 if (!xAssignVariantMnv(data, i, pFeature)) {
720 return false;
721 }
722 break;
723 case CVcfData::ST_ALL_INS:
724 if (!xAssignVariantIns(data, i, pFeature)) {
725 return false;
726 }
727 break;
728 case CVcfData::ST_ALL_DEL:
729 if (!xAssignVariantDel(data, i, pFeature)) {
730 return false;
731 }
732 break;
733 }
734 }
735 return true;
736 }
737
738 // ----------------------------------------------------------------------------
739 bool
xAssignVariantSnv(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)740 CVcfReader::xAssignVariantSnv(
741 const CVcfData& data,
742 unsigned int index,
743 CRef<CSeq_feat> pFeature )
744 // ----------------------------------------------------------------------------
745 {
746 CVariation_ref::TData::TSet::TVariations& variants =
747 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
748
749 CRef<CVariation_ref> pVariant(new CVariation_ref);
750 {{
751 vector<string> variant;
752 variant.push_back(data.m_Alt[index]);
753 pVariant->SetSNV(variant, CVariation_ref::eSeqType_na);
754 }}
755 variants.push_back(pVariant);
756 return true;
757 }
758
759 // ----------------------------------------------------------------------------
760 bool
xAssignVariantMnv(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)761 CVcfReader::xAssignVariantMnv(
762 const CVcfData& data,
763 unsigned int index,
764 CRef<CSeq_feat> pFeature )
765 // ----------------------------------------------------------------------------
766 {
767 CVariation_ref::TData::TSet::TVariations& variants =
768 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
769
770 CRef<CVariation_ref> pVariant(new CVariation_ref);
771 {{
772 vector<string> variant;
773 variant.push_back(data.m_Alt[index]);
774 pVariant->SetMNP(variant, CVariation_ref::eSeqType_na);
775 }}
776 variants.push_back(pVariant);
777 return true;
778 }
779
780 // ----------------------------------------------------------------------------
781 static void
s_AddDeleteDeltaItem(CVariation_inst & instance)782 s_AddDeleteDeltaItem(
783 CVariation_inst& instance )
784 {
785 CRef<CDelta_item> pItem(new CDelta_item);
786
787 pItem->SetSeq().SetThis();
788 instance.SetType(CVariation_inst::eType_del);
789 pItem->SetAction(CDelta_item::eAction_del_at);
790 instance.SetDelta().push_back(pItem);
791 }
792
793
794 // ----------------------------------------------------------------------------
795 bool
xAssignVariantDel(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)796 CVcfReader::xAssignVariantDel(
797 const CVcfData& data,
798 unsigned int index,
799 CRef<CSeq_feat> pFeature )
800 // ----------------------------------------------------------------------------
801 {
802 CVariation_ref::TData::TSet::TVariations& variants =
803 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
804
805 CRef<CVariation_ref> pVariant(new CVariation_ref);
806 {{
807 //pVariant->SetData().SetNote("DEL");
808 pVariant->SetDeletion();
809 CVariation_inst& instance = pVariant->SetData().SetInstance();
810 s_AddDeleteDeltaItem(instance);
811
812 }}
813 variants.push_back(pVariant);
814 return true;
815 }
816
817 // ----------------------------------------------------------------------------
818 bool
xAssignVariantIns(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)819 CVcfReader::xAssignVariantIns(
820 const CVcfData& data,
821 unsigned int index,
822 CRef<CSeq_feat> pFeature )
823 // ----------------------------------------------------------------------------
824 {
825 CVariation_ref::TData::TSet::TVariations& variants =
826 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
827
828 CRef<CVariation_ref> pVariant(new CVariation_ref);
829 {{
830 string insertion(data.m_Alt[index]);
831 CRef<CSeq_literal> pLiteral(new CSeq_literal);
832 pLiteral->SetSeq_data().SetIupacna().Set(insertion);
833 pLiteral->SetLength(
834 static_cast<TSeqPos>(insertion.size()));
835 CRef<CDelta_item> pItem(new CDelta_item);
836 pItem->SetAction(CDelta_item::eAction_ins_before);
837 pItem->SetSeq().SetLiteral(*pLiteral);
838 CVariation_inst& instance = pVariant->SetData().SetInstance();
839 instance.SetType(CVariation_inst::eType_ins);
840 instance.SetDelta().push_back(pItem);
841 }}
842 variants.push_back(pVariant);
843 return true;
844 }
845
846
847
848 // ----------------------------------------------------------------------------
849 bool
xAssignVariantDelins(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)850 CVcfReader::xAssignVariantDelins(
851 const CVcfData& data,
852 unsigned int index,
853 CRef<CSeq_feat> pFeature )
854 // ----------------------------------------------------------------------------
855 {
856 string insertion(data.m_Alt[index]);
857
858 CVariation_ref::TData::TSet::TVariations& variants =
859 pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
860
861 CRef<CVariation_ref> pVariant(new CVariation_ref);
862 CVariation_inst& instance = pVariant->SetData().SetInstance();
863
864 // If it is a deletion, add Deleteion Delta-item and be done.
865 if (insertion.size() == 0) {
866
867 s_AddDeleteDeltaItem(instance);
868 variants.push_back(pVariant);
869 return true;
870 }
871
872 // Must be a SNV or delins
873 CRef<CSeq_literal> pLiteral(new CSeq_literal);
874 pLiteral->SetSeq_data().SetIupacna().Set(insertion);
875 pLiteral->SetLength(static_cast<TSeqPos>(insertion.size()));
876
877 CRef<CDelta_item> pItem(new CDelta_item);
878 pItem->SetSeq().SetLiteral(*pLiteral);
879 instance.SetDelta().push_back(pItem);
880
881 //Let's try to smartly set the Type.
882 if (insertion.size() == 1 && data.m_strRef.size() == 1) {
883 instance.SetType(CVariation_inst::eType_snv);
884 } else {
885 instance.SetType(CVariation_inst::eType_delins);
886 }
887
888 variants.push_back(pVariant);
889 return true;
890
891 }
892
893 // ----------------------------------------------------------------------------
894 bool
xParseData(const string & line,CVcfData & data,ILineErrorListener * pEC)895 CVcfReader::xParseData(
896 const string& line,
897 CVcfData& data,
898 ILineErrorListener* pEC)
899 // ----------------------------------------------------------------------------
900 {
901 vector<string> columns;
902 NStr::Split(line, "\t", columns, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
903 if ( columns.size() < 8 ) {
904 return false;
905 }
906 try {
907 data.m_strLine = line;
908
909 data.m_strChrom = columns[0];
910 data.m_iPos = NStr::StringToInt( columns[1] );
911 NStr::Split( columns[2], ";", data.m_Ids );
912 if ( (data.m_Ids.size() == 1) && (data.m_Ids[0] == ".") ) {
913 data.m_Ids.clear();
914 }
915 data.m_strRef = columns[3];
916 NStr::Split( columns[4], ",", data.m_Alt );
917 if ( columns[5] != "." ) {
918 data.m_pdQual = new double( NStr::StringToDouble( columns[5] ) );
919 }
920 data.m_strFilter = columns[6];
921
922 vector<string> infos;
923 if ( columns[7] != "." ) {
924 NStr::Split( columns[7], ";", infos, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
925 for ( vector<string>::iterator it = infos.begin();
926 it != infos.end(); ++it )
927 {
928 string key, value;
929 NStr::SplitInTwo( *it, "=", key, value );
930 data.m_Info[key] = vector<string>();
931 NStr::Split( value, ",", data.m_Info[key] );
932 }
933 }
934 if ( columns.size() > 8 ) {
935 NStr::Split( columns[8], ":", data.m_FormatKeys, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
936
937 for ( size_t u=9; u < columns.size(); ++u ) {
938 vector<string> values;
939 NStr::Split( columns[u], ":", values, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
940 data.m_GenotypeData[ m_GenotypeHeaders[u-9] ] = values;
941 }
942 }
943 }
944 catch ( ... ) {
945 AutoPtr<CObjReaderLineException> pErr(
946 CObjReaderLineException::Create(
947 eDiag_Error,
948 0,
949 "Unable to parse given VCF data (syntax error).",
950 ILineError::eProblem_GeneralParsingError));
951 ProcessError(*pErr, pEC);
952 return false;
953 }
954
955 if (!xNormalizeData(data, pEC)) {
956 return false;
957 }
958
959 //assign set type:
960
961 //test for all SNVs
962 bool maybeAllSnv = (data.m_strRef.size() == 1);
963 if (maybeAllSnv) {
964 for (size_t u=0; u < data.m_Alt.size(); ++u) {
965 if (data.m_Alt[u].size() != 1) {
966 maybeAllSnv = false;
967 break;
968 }
969 }
970 if (maybeAllSnv) {
971 data.m_SetType = CVcfData::ST_ALL_SNV;
972 return true;
973 }
974 }
975
976 //test for all mnvs:
977 bool maybeAllMnv = true;
978 size_t refSize = data.m_strRef.size();
979 for (size_t u=0; u < data.m_Alt.size(); ++u) {
980 if (data.m_Alt[u].size() != refSize) {
981 maybeAllMnv = false;
982 break;
983 }
984 }
985 if (maybeAllMnv) {
986 data.m_SetType = CVcfData::ST_ALL_MNV;
987 return true;
988 }
989
990 //test for all insertions:
991 bool maybeAllIns = true;
992 for (size_t u=0; u < data.m_Alt.size(); ++u) {
993 if (! NStr::StartsWith(data.m_Alt[u], data.m_strRef)) {
994 maybeAllIns = false;
995 break;
996 }
997 }
998 if (maybeAllIns) {
999 data.m_SetType = CVcfData::ST_ALL_INS;
1000 return true;
1001 }
1002
1003 //test for all deletions:
1004 // note: even it is all deletions we are not able to process them
1005 // as such because those deletions would be at different ASN1
1006 // locations. Hence we punt to "indel" if there is more than one
1007 // alternative.
1008 bool maybeAllDel = false;
1009 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1010 if (data.m_Alt.size() == 1 && data.m_Alt[0].empty()) {
1011 maybeAllDel = true;
1012 }
1013 }
1014 if (maybeAllDel) {
1015 data.m_SetType = CVcfData::ST_ALL_DEL;
1016 return true;
1017 }
1018
1019 data.m_SetType = CVcfData::ST_MIXED;
1020 return true;
1021 }
1022
1023
1024 // ---------------------------------------------------------------------------
1025 bool
xNormalizeData(CVcfData & data,ILineErrorListener * pEC)1026 CVcfReader::xNormalizeData(
1027 CVcfData& data,
1028 ILineErrorListener* pEC)
1029 // ---------------------------------------------------------------------------
1030 {
1031 // make sure none of the alternatives is equal to the reference:
1032 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1033 if (data.m_Alt[u] == data.m_strRef) {
1034 AutoPtr<CObjReaderLineException> pErr(
1035 CObjReaderLineException::Create(
1036 eDiag_Error,
1037 0,
1038 "CVcfReader::xNormalizeData: Invalid alternative.",
1039 ILineError::eProblem_GeneralParsingError));
1040 ProcessError(*pErr, pEC);
1041 return false;
1042 }
1043 }
1044
1045 // normalize ref/alt by trimming common prefices and adjusting location
1046 bool trimComplete = false;
1047 while (!data.m_strRef.empty()) {
1048 char leadBase = data.m_strRef[0];
1049 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1050 if (!NStr::StartsWith(data.m_Alt[u], leadBase)) {
1051 trimComplete = true;
1052 break;
1053 }
1054 }
1055 if (trimComplete) {
1056 break;
1057 }
1058 data.m_strRef = data.m_strRef.substr(1);
1059 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1060 data.m_Alt[u] = data.m_Alt[u].substr(1);
1061 }
1062 data.m_iPos++;
1063 }
1064
1065 // normalize ref/alt by trimming common postfixes and adjusting location
1066 trimComplete = false;
1067 size_t refSize = data.m_strRef.size();
1068 size_t trimSize = 0;
1069 while (refSize > trimSize) {
1070 string postfix = data.m_strRef.substr(refSize-1-trimSize, trimSize+1);
1071 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1072 size_t altSize = data.m_Alt[u].size();
1073 if (altSize < trimSize+1) {
1074 trimComplete = true;
1075 break;
1076 }
1077 string postfixA = data.m_Alt[u].substr(altSize-1-trimSize, trimSize+1);
1078 if (postfix != postfixA) {
1079 trimComplete = true;
1080 break;
1081 }
1082 }
1083 if (trimComplete) {
1084 break;
1085 }
1086 trimSize++;
1087 }
1088 if (trimSize > 0) {
1089 data.m_strRef =
1090 data.m_strRef.substr(0, data.m_strRef.size()-trimSize);
1091 for (size_t u=0; u < data.m_Alt.size(); ++u) {
1092 data.m_Alt[u] =
1093 data.m_Alt[u].substr(0, data.m_Alt[u].size()-trimSize);
1094 }
1095 }
1096 return true;
1097 }
1098
1099 // ---------------------------------------------------------------------------
1100 bool
xAssignFeatureLocationSet(const CVcfData & data,CRef<CSeq_feat> pFeat)1101 CVcfReader::xAssignFeatureLocationSet(
1102 const CVcfData& data,
1103 CRef<CSeq_feat> pFeat )
1104 // ---------------------------------------------------------------------------
1105 {
1106 CRef<CSeq_id> pId(CReadUtil::AsSeqId(data.m_strChrom, m_iFlags));
1107
1108 //context:
1109 // we are trying to package all the allele of this feature into a single
1110 // variation_ref, hence, they all need a common location.
1111 // Referenced location differ between the different types of variations,
1112 // so we need to find the most specific variation type that describes them
1113 // all. Once the actual variation type has been found we can set the location
1114 // accordingly.
1115
1116 // in practice, we will choose the common variation type if it is indeed
1117 // common for all the alleles. Otherwise, we just make it a MNV.
1118
1119 if (data.m_SetType == CVcfData::ST_ALL_SNV) {
1120 //set location for SNVs
1121 pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1122 pFeat->SetLocation().SetPnt().SetId(*pId);
1123 return true;
1124 }
1125 if (data.m_SetType == CVcfData::ST_ALL_MNV) {
1126 //set location for MNV. This will be the location of the reference
1127 pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1128 pFeat->SetLocation().SetInt().SetTo(
1129 static_cast<TSeqPos>(data.m_iPos + data.m_strRef.size() - 2));
1130 pFeat->SetLocation().SetInt().SetId(*pId);
1131 return true;
1132 }
1133 if (data.m_SetType == CVcfData::ST_ALL_INS) {
1134 //set location for INSs. Will always be a point!
1135 //m_iPos points to the 1-based position of the first
1136 //nt that is unique between alt and ref
1137 pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1138 pFeat->SetLocation().SetPnt().SetId(*pId);
1139 return true;
1140 }
1141 if (data.m_SetType == CVcfData::ST_ALL_DEL) {
1142 if (data.m_strRef.size() == 1) {
1143 //deletion of a single base
1144 pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1145 pFeat->SetLocation().SetPnt().SetId(*pId);
1146 }
1147 else {
1148 pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1149 //-1 for 0-based,
1150 //another -1 for inclusive end-point ( i.e. [], not [) )
1151 pFeat->SetLocation().SetInt().SetTo(
1152 static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1153 pFeat->SetLocation().SetInt().SetId(*pId);
1154 }
1155 return true;
1156 }
1157
1158 //default: For MNV's we will use the single starting point
1159 //NB: For references of size >=2, this location will not
1160 //match the reference allele. Future Variation-ref
1161 //normalization code will address these issues,
1162 //and obviate the need for this code altogether.
1163 if (data.m_strRef.size() == 1) {
1164 //deletion of a single base
1165 pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1166 pFeat->SetLocation().SetPnt().SetId(*pId);
1167 }
1168 else {
1169 pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1170 pFeat->SetLocation().SetInt().SetTo(
1171 static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1172 pFeat->SetLocation().SetInt().SetId(*pId);
1173 }
1174 return true;
1175 }
1176
1177 // ----------------------------------------------------------------------------
1178 bool
xProcessScore(CVcfData & data,CRef<CSeq_feat> pFeature)1179 CVcfReader::xProcessScore(
1180 CVcfData& data,
1181 CRef<CSeq_feat> pFeature )
1182 // ----------------------------------------------------------------------------
1183 {
1184 CSeq_feat::TExt& ext = pFeature->SetExt();
1185 if ( data.m_pdQual ) {
1186 ext.AddField( "score", *data.m_pdQual );
1187 }
1188 return true;
1189 }
1190
1191 // ----------------------------------------------------------------------------
1192 bool
xProcessFilter(CVcfData & data,CRef<CSeq_feat> pFeature)1193 CVcfReader::xProcessFilter(
1194 CVcfData& data,
1195 CRef<CSeq_feat> pFeature )
1196 // ----------------------------------------------------------------------------
1197 {
1198 if(!NStr::Equal(data.m_strFilter,".")) {
1199 CSeq_feat::TExt& ext = pFeature->SetExt();
1200 ext.AddField( "filter", data.m_strFilter );
1201 }
1202 return true;
1203 }
1204
1205 // ----------------------------------------------------------------------------
1206 bool
xProcessInfo(CVcfData & data,CRef<CSeq_feat> pFeature)1207 CVcfReader::xProcessInfo(
1208 CVcfData& data,
1209 CRef<CSeq_feat> pFeature)
1210 // ----------------------------------------------------------------------------
1211 {
1212 if (!xAssignVariantProps(data, pFeature)) {
1213 return false;
1214 }
1215 CSeq_feat::TExt& ext = pFeature->SetExt();
1216 if (data.m_Info.empty()) {
1217 return true;
1218 }
1219 vector<string> infos;
1220 for ( map<string,vector<string> >::const_iterator cit = data.m_Info.begin();
1221 cit != data.m_Info.end(); cit++ )
1222 {
1223 const string& key = cit->first;
1224 vector<string> value = cit->second;
1225 if ( value.empty() ) {
1226 infos.push_back( key );
1227 }
1228 else {
1229 string joined = NStr::Join( list<string>( value.begin(), value.end() ), "," );
1230 infos.push_back( key + "=" + joined );
1231 }
1232 }
1233 ext.AddField( "info", NStr::Join( infos, ";" ) );
1234 return true;
1235 }
1236
1237 // ----------------------------------------------------------------------------
1238 bool
xProcessTrackLine(const string & strLine,CSeq_annot & current)1239 CVcfReader::xProcessTrackLine(
1240 const string& strLine,
1241 CSeq_annot& current)
1242 // ----------------------------------------------------------------------------
1243 {
1244 if (!xIsTrackLine(strLine)) {
1245 return false;
1246 }
1247 vector<string> parts;
1248 CReadUtil::Tokenize( strLine, " \t", parts );
1249 if (parts.size() >= 3) {
1250 const string digits("0123456789");
1251 bool col2_is_numeric =
1252 (string::npos == parts[1].find_first_not_of(digits));
1253 bool col3_is_numeric =
1254 (string::npos == parts[2].find_first_not_of(digits));
1255 if (col2_is_numeric && col3_is_numeric) {
1256 return false;
1257 }
1258 }
1259 if (!CReaderBase::xParseTrackLine(strLine)) {
1260 CReaderMessage warning(
1261 eDiag_Warning,
1262 m_uLineNumber,
1263 "Bad track line: Expected \"track key1=value1 key2=value2 ...\". Ignored.");
1264 m_pMessageHandler->Report(warning);
1265 }
1266 return true;
1267 }
1268
1269 // ----------------------------------------------------------------------------
1270 bool
xProcessFormat(CVcfData & data,CRef<CSeq_feat> pFeature)1271 CVcfReader::xProcessFormat(
1272 CVcfData& data,
1273 CRef<CSeq_feat> pFeature )
1274 // ----------------------------------------------------------------------------
1275 {
1276 if (data.m_FormatKeys.empty()) {
1277 return true;
1278 }
1279
1280 CSeq_feat::TExt& ext = pFeature->SetExt();
1281 ext.AddField("format", data.m_FormatKeys);
1282
1283 CRef<CUser_field> pGenotypeData( new CUser_field );
1284 pGenotypeData->SetLabel().SetStr("genotype-data");
1285
1286 for ( CVcfData::GTDATA::const_iterator cit = data.m_GenotypeData.begin();
1287 cit != data.m_GenotypeData.end(); ++cit) {
1288 pGenotypeData->AddField(cit->first,cit->second);
1289 }
1290 ext.SetData().push_back(pGenotypeData);
1291 return true;
1292 }
1293
1294
1295 // ----------------------------------------------------------------------------
xAssigndbSNPTag(const vector<string> & ids,CRef<CDbtag> pDbtag) const1296 bool CVcfReader::xAssigndbSNPTag(
1297 const vector<string>& ids,
1298 CRef<CDbtag> pDbtag) const
1299 // ----------------------------------------------------------------------------
1300 {
1301 for (const string& id : ids) {
1302 if (NStr::StartsWith(id, "rs") ||
1303 NStr::StartsWith(id, "ss") )
1304 {
1305 try {
1306 const int idval = NStr::StringToInt(id.substr(2));
1307 pDbtag->SetDb("dbSNP");
1308 pDbtag->SetTag().SetId(idval);
1309 }
1310 catch (...) {
1311 continue;
1312 }
1313 return true;
1314 }
1315 }
1316 return false;
1317 }
1318
1319
1320 // ----------------------------------------------------------------------------
1321 bool
xAssignVariationIds(CVcfData & data,CRef<CSeq_feat> pFeature)1322 CVcfReader::xAssignVariationIds(
1323 CVcfData& data,
1324 CRef<CSeq_feat> pFeature )
1325 // ----------------------------------------------------------------------------
1326 {
1327 if ( data.m_Ids.empty() ) {
1328 return true;
1329 }
1330 CVariation_ref& variation = pFeature->SetData().SetVariation();
1331 // CVariation_ref::TVariant_prop& var_prop = variation.SetVariant_prop();
1332 // var_prop.SetVersion( 5 );
1333
1334 auto it = data.m_Info.find("SOURCE");
1335
1336 if (data.m_Info.end() != it) {
1337 vector<string> sources = it->second;
1338 if (sources.size() > 0 &&
1339 NStr::Equal(sources.front(), "dbsnp"))
1340 {
1341 CRef<CDbtag> pDbtag = Ref(new CDbtag());
1342 if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1343 variation.SetId(pDbtag.GetNCObject());
1344 return true;
1345 }
1346 }
1347 }
1348
1349
1350 if ( data.m_Info.find( "DB" ) != data.m_Info.end() ) {
1351 string id = data.m_Ids[0];
1352 NStr::ToLower(id);
1353 if (NStr::StartsWith(id, "rs") || NStr::StartsWith(id, "ss")) {
1354 variation.SetId().SetDb("dbSNP");
1355 }
1356 else {
1357 variation.SetId().SetDb( "dbVar" );
1358 }
1359 }
1360 else if ( data.m_Info.find( "H2" ) != data.m_Info.end() ) {
1361 variation.SetId().SetDb( "HapMap2" );
1362 }
1363 else {
1364 variation.SetId().SetDb( "local" );
1365 }
1366 variation.SetId().SetTag().SetStr( data.m_Ids[0] );
1367
1368 for ( size_t i=1; i < data.m_Ids.size(); ++i ) {
1369 if ( data.m_Info.find( "DB" ) != data.m_Info.end()
1370 && data.m_Info.find( "H2" ) != data.m_Info.end() )
1371 {
1372 variation.SetId().SetDb( "HapMap2" );
1373 }
1374 else {
1375 variation.SetId().SetDb( "local" );
1376 }
1377 variation.SetId().SetTag().SetStr( data.m_Ids[i] );
1378 }
1379 return true;
1380 }
1381
1382
1383 // ----------------------------------------------------------------------------
1384 bool
xAssignVariantProps(CVcfData & data,CRef<CSeq_feat> pFeat)1385 CVcfReader::xAssignVariantProps(
1386 CVcfData& data,
1387 CRef<CSeq_feat> pFeat)
1388 // ----------------------------------------------------------------------------
1389 {
1390 typedef CVariantProperties VP;
1391
1392 CVcfData::INFOS& infos = data.m_Info;
1393 VP& props = pFeat->SetData().SetVariation().SetVariant_prop();
1394 CVcfData::INFOS::iterator it;
1395
1396 props.SetResource_link() = 0;
1397 props.SetGene_location() = 0;
1398 props.SetEffect() = 0;
1399 props.SetMapping() = 0;
1400 props.SetFrequency_based_validation() = 0;
1401 props.SetGenotype() = 0;
1402 props.SetQuality_check() = 0;
1403
1404 //byte F0
1405 props.SetVersion() = 5;
1406
1407 //superbyte F1
1408 it = infos.find("SLO");
1409 if (infos.end() != it) {
1410 props.SetResource_link() |= VP::eResource_link_submitterLinkout;
1411 infos.erase(it);
1412 }
1413 it = infos.find("S3D");
1414 if (infos.end() != it) {
1415 props.SetResource_link() |= VP::eResource_link_has3D;
1416 infos.erase(it);
1417 }
1418 it = infos.find("TPA");
1419 if (infos.end() != it) {
1420 props.SetResource_link() |= VP::eResource_link_provisional;
1421 infos.erase(it);
1422 }
1423 it = infos.find("PM");
1424 if (infos.end() != it) {
1425 props.SetResource_link() |= VP::eResource_link_preserved;
1426 infos.erase(it);
1427 }
1428 it = infos.find("CLN");
1429 if (infos.end() != it) {
1430 props.SetResource_link() |= VP::eResource_link_clinical;
1431 infos.erase(it);
1432 }
1433 //todo: INFO ID=PMC
1434 it = infos.find("PMC");
1435 if (infos.end() != it) {
1436 infos.erase(it);
1437 }
1438 it = infos.find("PMID");
1439 if (infos.end() != it) {
1440 vector<string> pmids = it->second;
1441 for (vector<string>::const_iterator cit = pmids.begin();
1442 cit != pmids.end(); ++cit)
1443 {
1444 try {
1445 string db, tag;
1446 NStr::SplitInTwo(*cit, ":", db, tag);
1447 if (db != "PM") {
1448 CReaderMessage warning(
1449 eDiag_Warning,
1450 m_uLineNumber,
1451 "CVcfReader::xAssignVariantProps: Invalid PMID database ID.");
1452 m_pMessageHandler->Report(warning);
1453 continue;
1454 }
1455 CRef<CDbtag> pDbtag(new CDbtag);
1456 pDbtag->SetDb(db);
1457 pDbtag->SetTag().SetId(
1458 NStr::StringToInt(tag));
1459 pFeat->SetDbxref().push_back(pDbtag);
1460 }
1461 catch(...) {}
1462 }
1463 infos.erase(it);
1464 }
1465
1466 xAssignVariantSource(data, pFeat);
1467
1468 //superbyte F2
1469 it = infos.find("R5");
1470 if (infos.end() != it) {
1471 props.SetGene_location() |= VP::eGene_location_near_gene_5;
1472 infos.erase(it);
1473 }
1474 it = infos.find("R3");
1475 if (infos.end() != it) {
1476 props.SetGene_location() |= VP::eGene_location_near_gene_3;
1477 infos.erase(it);
1478 }
1479 it = infos.find("INT");
1480 if (infos.end() != it) {
1481 props.SetGene_location() |= VP::eGene_location_intron;
1482 infos.erase(it);
1483 }
1484 it = infos.find("DSS");
1485 if (infos.end() != it) {
1486 props.SetGene_location() |= VP::eGene_location_donor;
1487 infos.erase(it);
1488 }
1489 it = infos.find("ASS");
1490 if (infos.end() != it) {
1491 props.SetGene_location() |= VP::eGene_location_acceptor;
1492 infos.erase(it);
1493 }
1494 it = infos.find("U5");
1495 if (infos.end() != it) {
1496 props.SetGene_location() |= VP::eGene_location_utr_5;
1497 infos.erase(it);
1498 }
1499 it = infos.find("U3");
1500 if (infos.end() != it) {
1501 props.SetGene_location() |= CVariantProperties::eGene_location_utr_3;
1502 infos.erase(it);
1503 }
1504
1505 it = infos.find("SYN");
1506 if (infos.end() != it) {
1507 props.SetGene_location() |= VP::eEffect_synonymous;
1508 infos.erase(it);
1509 }
1510 it = infos.find("NSN");
1511 if (infos.end() != it) {
1512 props.SetGene_location() |= VP::eEffect_stop_gain;
1513 infos.erase(it);
1514 }
1515 it = infos.find("NSM");
1516 if (infos.end() != it) {
1517 props.SetGene_location() |= VP::eEffect_missense;
1518 infos.erase(it);
1519 }
1520 it = infos.find("NSF");
1521 if (infos.end() != it) {
1522 props.SetGene_location() |= VP::eEffect_frameshift;
1523 infos.erase(it);
1524 }
1525
1526 //byte F3
1527 it = infos.find("WGT");
1528 if (infos.end() != it) {
1529 int weight = NStr::StringToInt( infos["WGT"][0] );
1530 switch(weight) {
1531 default:
1532 break;
1533 case 1:
1534 props.SetMap_weight() = VP::eMap_weight_is_uniquely_placed;
1535 infos.erase(it);
1536 break;
1537 case 2:
1538 props.SetMap_weight() = VP::eMap_weight_placed_twice_on_same_chrom;
1539 infos.erase(it);
1540 break;
1541 case 3:
1542 props.SetMap_weight() = VP::eMap_weight_placed_twice_on_diff_chrom;
1543 infos.erase(it);
1544 break;
1545 case 10:
1546 props.SetMap_weight() = VP::eMap_weight_many_placements;
1547 break;
1548 }
1549 }
1550
1551 it = infos.find("ASP");
1552 if (infos.end() != it) {
1553 props.SetMapping() |= VP::eMapping_is_assembly_specific;
1554 infos.erase(it);
1555 }
1556 it = infos.find("CFL");
1557 if (infos.end() != it) {
1558 props.SetMapping() |= VP::eMapping_has_assembly_conflict;
1559 infos.erase(it);
1560 }
1561 it = infos.find("OTH");
1562 if (infos.end() != it) {
1563 props.SetMapping() |= VP::eMapping_has_other_snp;
1564 infos.erase(it);
1565 }
1566
1567 //byte F4
1568 it = infos.find("OTH");
1569 if (infos.end() != it) {
1570 props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_all;
1571 infos.erase(it);
1572 }
1573 it = infos.find("G5A");
1574 if (infos.end() != it) {
1575 props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_1plus;
1576 infos.erase(it);
1577 }
1578 it = infos.find("VLD");
1579 if (infos.end() != it) {
1580 props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_validated;
1581 infos.erase(it);
1582 }
1583 it = infos.find("MUT");
1584 if (infos.end() != it) {
1585 props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_is_mutation;
1586 infos.erase(it);
1587 }
1588 it = infos.find("GMAF");
1589 if (infos.end() != it) {
1590 props.SetAllele_frequency() = NStr::StringToDouble(infos["GMAF"][0]);
1591 infos.erase(it);
1592 }
1593
1594 //byte F5
1595 it = infos.find("GNO");
1596 if (infos.end() != it) {
1597 props.SetGenotype() |= VP::eGenotype_has_genotypes;
1598 infos.erase(it);
1599 }
1600 it = infos.find("HD");
1601 if (infos.end() != it) {
1602 props.SetResource_link() |= VP::eResource_link_genotypeKit;
1603 infos.erase(it);
1604 }
1605
1606 //byte F6
1607 if (infos.end() != infos.find("PH3")) {
1608 CRef<CDbtag> pDbtag(new CDbtag);
1609 pDbtag->SetDb("BioProject");
1610 pDbtag->SetTag().SetId(60835);
1611 pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1612 }
1613 if (infos.end() != infos.find("KGPhase1")) {
1614 CRef<CDbtag> pDbtag(new CDbtag);
1615 pDbtag->SetDb("BioProject");
1616 pDbtag->SetTag().SetId(28889);
1617 pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1618 }
1619
1620 //byte F7
1621
1622 //byte F8
1623 //no relevant information found in VCF
1624
1625 //byte F9
1626 it = infos.find("GCF");
1627 if (infos.end() != it) {
1628 props.SetQuality_check() |= VP::eQuality_check_genotype_conflict;
1629 infos.erase(it);
1630 }
1631 it = infos.find("NOV");
1632 if (infos.end() != it) {
1633 props.SetQuality_check() |= VP::eQuality_check_non_overlapping_alleles;
1634 infos.erase(it);
1635 }
1636 it = infos.find("WTD");
1637 if (infos.end() != it) {
1638 props.SetQuality_check() |= VP::eQuality_check_withdrawn_by_submitter;
1639 infos.erase(it);
1640 }
1641 it = infos.find("NOC");
1642 if (infos.end() != it) {
1643 props.SetQuality_check() |= VP::eQuality_check_contig_allele_missing;
1644 infos.erase(it);
1645 }
1646 return true;
1647 }
1648
1649
1650 // ----------------------------------------------------------------------------
xAssignVariantSource(CVcfData & data,CRef<CSeq_feat> pFeat)1651 void CVcfReader::xAssignVariantSource(CVcfData& data,
1652 CRef<CSeq_feat> pFeat)
1653 // ----------------------------------------------------------------------------
1654 {
1655 CVcfData::INFOS& infos = data.m_Info;
1656 auto it = infos.find("SOURCE");
1657 if (infos.end() != it) {
1658 vector<string> sources = it->second;
1659 if (sources.size() > 0 &&
1660 NStr::Equal(sources.front(),"dbsnp"))
1661 {
1662 bool valid_id=false;
1663 CRef<CDbtag> pDbtag(new CDbtag());
1664 if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1665 pFeat->SetDbxref().push_back(pDbtag);
1666 valid_id = true;
1667 }
1668
1669 if (!valid_id) {
1670 CReaderMessage warning(
1671 eDiag_Warning,
1672 m_uLineNumber,
1673 "CVcfReader::xAssignVariantProps: No valid dbSNP identifier");
1674 m_pMessageHandler->Report(warning);
1675 }
1676 infos.erase(it);
1677 }
1678 }
1679 }
1680
1681
1682 // ----------------------------------------------------------------------------
xIsCommentLine(const CTempString & strLine)1683 bool CVcfReader::xIsCommentLine(
1684 const CTempString& strLine)
1685 // ----------------------------------------------------------------------------
1686 {
1687 if (NStr::StartsWith(strLine, "#CHROM")) {
1688 return false;
1689 }
1690 return CReaderBase::xIsCommentLine(strLine);
1691 }
1692
1693 END_objects_SCOPE
1694 END_NCBI_SCOPE
1695