1 /*  $Id: vcf_reader.cpp 632526 2021-06-02 17:25:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Frank Ludwig
27  *
28  * File Description:
29  *   VCF file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
38 #include <objects/general/Object_id.hpp>
39 #include <objects/general/User_object.hpp>
40 #include <objects/general/User_field.hpp>
41 #include <objects/general/Dbtag.hpp>
42 
43 #include <objects/seqloc/Seq_interval.hpp>
44 #include <objects/seqloc/Seq_point.hpp>
45 
46 #include <objects/seq/Annot_descr.hpp>
47 #include <objects/seq/Seq_literal.hpp>
48 #include <objects/seqfeat/Variation_ref.hpp>
49 #include <objects/seqfeat/Variation_inst.hpp>
50 #include <objects/seqfeat/VariantProperties.hpp>
51 #include <objects/seqfeat/Delta_item.hpp>
52 
53 #include <objtools/readers/vcf_reader.hpp>
54 
55 #include <algorithm>
56 
57 #include "reader_message_handler.hpp"
58 
59 #define NCBI_USE_ERRCODE_X   Objtools_Rd_RepMask
60 
61 BEGIN_NCBI_SCOPE
62 BEGIN_objects_SCOPE // namespace ncbi::objects::
63 
64 //  ============================================================================
65 const double CVcfReader::mMaxSupportedVersion = 4.1;
66 //  ============================================================================
67 
68 //  ============================================================================
69 class CVcfData
70 //  ============================================================================
71 {
72 public:
73     typedef map<string,vector<string> > INFOS;
74     typedef map<string, vector<string> > GTDATA;
75 
CVcfData()76     CVcfData() { m_pdQual = 0; };
~CVcfData()77     ~CVcfData() { delete m_pdQual; };
78 
79     string m_strLine;
80     string m_strChrom;
81     int m_iPos;
82     vector<string> m_Ids;
83     string m_strRef;
84     vector<string> m_Alt;
85     double* m_pdQual;
86     string m_strFilter;
87     INFOS m_Info;
88     vector<string> m_FormatKeys;
89 //    vector< vector<string> > m_GenotypeData;
90     GTDATA m_GenotypeData;
91     enum SetType_t {
92         ST_ALL_SNV,
93         ST_ALL_DEL,
94         ST_ALL_INS,
95         ST_ALL_MNV,
96         ST_MIXED
97     } m_SetType;
98 };
99 
100 //  ----------------------------------------------------------------------------
SpecType(const string & spectype)101 ESpecType SpecType(
102     const string& spectype )
103 //  ----------------------------------------------------------------------------
104 {
105     static map<string, ESpecType> typemap;
106     if ( typemap.empty() ) {
107         typemap["Integer"] = eType_Integer;
108         typemap["Float"] = eType_Float;
109         typemap["Flag"] = eType_Flag;
110         typemap["Character"] = eType_Character;
111         typemap["String"] = eType_String;
112     }
113     try {
114         return typemap[spectype];
115     }
116     catch( ... ) {
117         AutoPtr<CObjReaderLineException> pErr(
118             CObjReaderLineException::Create(
119             eDiag_Warning,
120             0,
121             "CVcfReader::xProcessMetaLineInfo: Unrecognized line or record type.",
122             ILineError::eProblem_GeneralParsingError) );
123         pErr->Throw();
124         return eType_String;
125     }
126 };
127 
128 //  ----------------------------------------------------------------------------
SpecNumber(const string & specnumber)129 ESpecNumber SpecNumber(
130     const string& specnumber )
131 //  ----------------------------------------------------------------------------
132 {
133     if ( specnumber == "R" ) {
134         return eNumber_CountAllAlleles;
135     }
136     if ( specnumber == "A" ) {
137         return eNumber_CountAlleles;
138     }
139     if ( specnumber == "G" ) {
140         return eNumber_CountGenotypes;
141     }
142     if ( specnumber == "." ) {
143         return eNumber_CountUnknown;
144     }
145     try {
146         return ESpecNumber( NStr::StringToInt( specnumber ) );
147     }
148     catch( ... ) {
149         AutoPtr<CObjReaderLineException> pErr(
150             CObjReaderLineException::Create(
151                 eDiag_Warning,
152                 0,
153                 "CVcfReader::xProcessMetaLineInfo: Unrecognized SpecNumber type in FORAMT directive. "
154                 "Recognized settings are \'A\', \'G\', \'R\', \'.\', or numeric.",
155                 ILineError::eProblem_GeneralParsingError) );
156         pErr->Throw();
157     }
158     return ESpecNumber( 0 );
159 };
160 
161 //  ----------------------------------------------------------------------------
CVcfReader(int flags,CReaderListener * pRL)162 CVcfReader::CVcfReader(
163     int flags,
164     CReaderListener* pRL):
165     CReaderBase(flags, "", "", CReadUtil::AsSeqId, pRL),
166     mActualVersion(0.0),
167     m_MetaHandled(false)
168 //  ----------------------------------------------------------------------------
169 {
170 }
171 
172 
173 //  ----------------------------------------------------------------------------
~CVcfReader()174 CVcfReader::~CVcfReader()
175 //  ----------------------------------------------------------------------------
176 {
177 }
178 
179 //  ----------------------------------------------------------------------------
180 CRef< CSeq_annot >
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)181 CVcfReader::ReadSeqAnnot(
182     ILineReader& lr,
183     ILineErrorListener* pEC )
184 //  ----------------------------------------------------------------------------
185 {
186     if (!m_Meta) {
187         m_Meta.Reset( new CAnnotdesc );
188         m_Meta->SetUser().SetType().SetStr( "vcf-meta-info" );
189     }
190     CRef<CSeq_annot> pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
191     if (pAnnot) {
192         xAssignTrackData(*pAnnot);
193         xAssignVcfMeta(*pAnnot);
194     }
195     return pAnnot;
196 }
197 
198 //  ----------------------------------------------------------------------------
199 CRef<CSeq_annot>
xCreateSeqAnnot()200 CVcfReader::xCreateSeqAnnot()
201 //  ----------------------------------------------------------------------------
202 {
203     CRef<CSeq_annot> pAnnot = CReaderBase::xCreateSeqAnnot();
204     pAnnot->SetData().SetFtable();
205     return pAnnot;
206 }
207 
208 //  ----------------------------------------------------------------------------
209 void
xGetData(ILineReader & lr,TReaderData & readerData)210 CVcfReader::xGetData(
211     ILineReader& lr,
212     TReaderData& readerData)
213 //  ----------------------------------------------------------------------------
214 {
215     readerData.clear();
216     string line;
217     if (!xGetLine(lr, line)) {
218         return;
219     }
220     if (xIsTrackLine(line)  &&  m_uDataCount) {
221         xUngetLine(lr);
222         return;
223     }
224     readerData.push_back(TReaderLine{m_uLineNumber, line});
225 }
226 
227 //  ----------------------------------------------------------------------------
228 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)229 CVcfReader::xProcessData(
230     const TReaderData& readerData,
231     CSeq_annot& annot)
232 //  ----------------------------------------------------------------------------
233 {
234     for (auto lineInfo: readerData) {
235         const auto& line = lineInfo.mData;
236         if (mActualVersion == 0.0) {
237             bool lineContainsVersion(false);
238             xSetFileFormat(line, annot, lineContainsVersion);
239             // Note:
240             // Currently, the line format specifier is also processed as a
241             // meta line even though it really isn't, or at least it's a very
242             // different meta than all other VCF metas.
243             // Uncomment the following lines to no longer process the file format
244             // specifier as a meta.
245             //if (lineContainsVersion) {
246                 //return;
247             //}
248         }
249 
250         if (xParseBrowserLine(line, annot)) {
251             return;
252         }
253         if (xProcessTrackLine(line, annot)) {
254             return;
255         }
256         if (xProcessMetaLine(line, annot)) {
257             return;
258         }
259         if (xProcessHeaderLine(line, annot)) {
260             return;
261         }
262         if (xProcessDataLine(line, annot)) {
263             ++m_uDataCount;
264             return;
265         }
266         CReaderMessage warning(
267             eDiag_Warning,
268             m_uLineNumber,
269             "CVcfReader::ReadSeqAnnot: Unrecognized line or record type.");
270         m_pMessageHandler->Report(warning);
271     }
272 }
273 
274 //  ----------------------------------------------------------------------------
275 bool
xProcessMetaLine(const string & line,CSeq_annot & annot)276 CVcfReader::xProcessMetaLine(
277     const string& line,
278     CSeq_annot& annot)
279 //  ----------------------------------------------------------------------------
280 {
281     if ( ! NStr::StartsWith( line, "##" ) ) {
282         if ( !m_MetaDirectives.empty() && !m_MetaHandled ) {
283             m_Meta->SetUser().AddField("meta-information", m_MetaDirectives);
284         }
285         m_MetaHandled = true;
286         return false;
287     }
288     m_MetaDirectives.push_back(line.substr(2));
289 
290     if (xProcessMetaLineInfo(line, annot)) {
291         return true;
292     }
293     if (xProcessMetaLineFilter(line, annot)) {
294         return true;
295     }
296     if (xProcessMetaLineFormat(line, annot)) {
297         return true;
298     }
299     return true;
300 }
301 
302 //  ----------------------------------------------------------------------------
303 void
xSetFileFormat(const string & line,CSeq_annot & annot,bool & lineContainsVersion)304 CVcfReader::xSetFileFormat(
305     const string& line,
306     CSeq_annot& annot,
307     bool& lineContainsVersion)
308 //  ----------------------------------------------------------------------------
309 {
310     const string prefix = "##fileformat=VCFv";
311 
312     if (!NStr::StartsWith(line, prefix)) {
313         CReaderMessage warning(
314             eDiag_Warning,
315             m_uLineNumber,
316             string("CVcfReader::xProcessMetaLineFileFormat: ") +
317                 "Missing VCF version string. Assuming VCFv" +
318                 NStr::DoubleToString(mMaxSupportedVersion) +
319                 ". Proceed with care!");
320         m_pMessageHandler->Report(warning);
321         mActualVersion = mMaxSupportedVersion;
322         lineContainsVersion = false;
323         return;
324     }
325 
326     lineContainsVersion = true;
327 
328     string versionStr = line.substr(prefix.length(), string::npos);
329     try {
330         mActualVersion = NStr::StringToDouble(versionStr);
331     }
332     catch (std::exception except) {
333         CReaderMessage warning(
334             eDiag_Warning,
335             m_uLineNumber,
336             string("CVcfReader::xProcessMetaLineFileFormat: ") +
337             "Data file contains an unrecognized version string \"" +
338                 versionStr +
339                 "\". Assuming VCFv" +
340                 NStr::DoubleToString(mMaxSupportedVersion) +
341                 ". Proceed with care!");
342         m_pMessageHandler->Report(warning);
343         mActualVersion = mMaxSupportedVersion;
344         return;
345     }
346 
347     if (mActualVersion > mMaxSupportedVersion) {
348         CReaderMessage warning(
349             eDiag_Warning,
350             m_uLineNumber,
351             string("CVcfReader::xProcessMetaLineFileFormat: Data file format \"") +
352                 versionStr +
353                 "\" exceeds reader supported format \"" +
354                 NStr::DoubleToString(mMaxSupportedVersion) +
355                 "\". Proceed with care!");
356         m_pMessageHandler->Report(warning);
357         mActualVersion = mMaxSupportedVersion;
358         return;
359     }
360 }
361 
362 //  ----------------------------------------------------------------------------
363 bool
xProcessMetaLineInfo(const string & line,CSeq_annot & annot)364 CVcfReader::xProcessMetaLineInfo(
365     const string& line,
366     CSeq_annot& annot)
367 //  ----------------------------------------------------------------------------
368 {
369     const string prefix = "##INFO=<";
370     const string postfix = ">";
371 
372     if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
373         return false;
374     }
375 
376     try {
377         vector<string> fields;
378         string key, id, numcount, type, description;
379         string info = line.substr(
380             prefix.length(), line.length() - prefix.length() - postfix.length() );
381         NStr::Split( info, ",", fields );
382         NStr::SplitInTwo( fields[0], "=", key, id );
383         if ( key != "ID" ) {
384             AutoPtr<CObjReaderLineException> pErr(
385                 CObjReaderLineException::Create(
386                 eDiag_Error,
387                 0,
388                 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"ID\".",
389                 ILineError::eProblem_BadInfoLine) );
390             pErr->Throw();
391         }
392         NStr::SplitInTwo( fields[1], "=", key, numcount );
393         if ( key != "Number" ) {
394             AutoPtr<CObjReaderLineException> pErr(
395                 CObjReaderLineException::Create(
396                 eDiag_Error,
397                 0,
398                 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Number\".",
399                 ILineError::eProblem_BadInfoLine) );
400             pErr->Throw();
401         }
402         NStr::SplitInTwo( fields[2], "=", key, type );
403         if ( key != "Type" ) {
404             AutoPtr<CObjReaderLineException> pErr(
405                 CObjReaderLineException::Create(
406                 eDiag_Error,
407                 0,
408                 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Type\".",
409                 ILineError::eProblem_BadInfoLine) );
410             pErr->Throw();
411         }
412         NStr::SplitInTwo( fields[3], "=", key, description );
413         if ( key != "Description" ) {
414             AutoPtr<CObjReaderLineException> pErr(
415                 CObjReaderLineException::Create(
416                 eDiag_Error,
417                 0,
418                 "CVcfReader::xProcessMetaLineInfo: ##INFO with bad or missing \"Description\".",
419                 ILineError::eProblem_BadInfoLine) );
420             pErr->Throw();
421         }
422         m_InfoSpecs[id] = CVcfInfoSpec( id, numcount, type, description );
423     }
424     catch (CObjReaderLineException& err) {
425         ProcessError(err, nullptr);
426     }
427     return true;
428 }
429 
430 //  ----------------------------------------------------------------------------
431 bool
xProcessMetaLineFilter(const string & line,CSeq_annot & annot)432 CVcfReader::xProcessMetaLineFilter(
433     const string& line,
434     CSeq_annot& annot)
435 //  ----------------------------------------------------------------------------
436 {
437     const string prefix = "##FILTER=<";
438     const string postfix = ">";
439 
440     if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
441         return false;
442     }
443 
444     try {
445         vector<string> fields;
446         string key, id, description;
447         string info = line.substr(
448             prefix.length(), line.length() - prefix.length() - postfix.length() );
449         NStr::Split( info, ",", fields );
450         NStr::SplitInTwo( fields[0], "=", key, id );
451         if ( key != "ID" ) {
452             AutoPtr<CObjReaderLineException> pErr(
453                 CObjReaderLineException::Create(
454                 eDiag_Error,
455                 0,
456                 "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"ID\".",
457                 ILineError::eProblem_BadFilterLine) );
458             pErr->Throw();
459         }
460         NStr::SplitInTwo( fields[1], "=", key, description );
461         if ( key != "Description" ) {
462             AutoPtr<CObjReaderLineException> pErr(
463                 CObjReaderLineException::Create(
464                 eDiag_Error,
465                 0,
466                 "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"Description\".",
467                 ILineError::eProblem_BadFilterLine) );
468             pErr->Throw();
469         }
470         m_FilterSpecs[id] = CVcfFilterSpec( id, description );
471     }
472     catch (CObjReaderLineException& err) {
473         ProcessError(err, nullptr);
474     }
475     return true;
476 }
477 
478 //  ----------------------------------------------------------------------------
479 bool
xProcessMetaLineFormat(const string & line,CSeq_annot & annot)480 CVcfReader::xProcessMetaLineFormat(
481     const string& line,
482     CSeq_annot& annot)
483 //  ----------------------------------------------------------------------------
484 {
485     const string prefix = "##FORMAT=<";
486     const string postfix = ">";
487 
488     if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
489         return false;
490     }
491 
492     try {
493         vector<string> fields;
494         string key, id, numcount, type, description;
495         string info = line.substr(
496             prefix.length(), line.length() - prefix.length() - postfix.length() );
497         NStr::Split( info, ",", fields );
498         NStr::SplitInTwo( fields[0], "=", key, id );
499         if ( key != "ID" ) {
500             AutoPtr<CObjReaderLineException> pErr(
501                 CObjReaderLineException::Create(
502                 eDiag_Error,
503                 0,
504                 "CVcfReader::xProcessMetaLineInfo: ##FORMAT with bad or missing \"ID\".",
505                 ILineError::eProblem_BadFormatLine) );
506             pErr->Throw();
507         }
508         NStr::SplitInTwo( fields[1], "=", key, numcount );
509         if ( key != "Number" ) {
510             AutoPtr<CObjReaderLineException> pErr(
511                 CObjReaderLineException::Create(
512                 eDiag_Error,
513                 0,
514                 "CVcfReader::xProcessMetaLineInfo: "
515                     "##FORMAT with bad or missing \"Number\".",
516                 ILineError::eProblem_BadFormatLine) );
517             pErr->Throw();
518         }
519         NStr::SplitInTwo( fields[2], "=", key, type );
520         if ( key != "Type" ) {
521             AutoPtr<CObjReaderLineException> pErr(
522                 CObjReaderLineException::Create(
523                 eDiag_Error,
524                 0,
525                 "CVcfReader::xProcessMetaLineInfo: "
526                     "##FORMAT with bad or missing \"Type\".",
527                 ILineError::eProblem_BadFormatLine) );
528             pErr->Throw();
529         }
530         NStr::SplitInTwo( fields[3], "=", key, description );
531         if ( key != "Description" ) {
532             AutoPtr<CObjReaderLineException> pErr(
533                 CObjReaderLineException::Create(
534                 eDiag_Error,
535                 0,
536                 "CVcfReader::xProcessMetaLineInfo: "
537                     "##FORMAT with bad or missing \"Description\".",
538                 ILineError::eProblem_BadFormatLine) );
539             pErr->Throw();
540         }
541         m_FormatSpecs[id] = CVcfFormatSpec( id, numcount, type, description );
542     }
543     catch (CObjReaderLineException& err) {
544         ProcessError(err, nullptr);
545     }
546     return true;
547 }
548 
549 //  ----------------------------------------------------------------------------
550 bool
xProcessHeaderLine(const string & line,CSeq_annot & annot)551 CVcfReader::xProcessHeaderLine(
552     const string& line,
553     CSeq_annot& annot )
554 //  ----------------------------------------------------------------------------
555 {
556     if ( ! NStr::StartsWith( line, "#CHROM" ) ) {
557         return false;
558     }
559 
560     //
561     //  Per spec:
562     //  The header line provides the column headers for the data records that follow.
563     //  the first few are fixed and mandatory: CHROM .. FILTER.
564     //  If genotype data is present this is followed by the FORMAT header.
565     //  After that come the various headers for the genotype information, and these
566     //  need to be preserved:
567     //
568     NStr::Split(line, " \t", m_GenotypeHeaders, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
569     vector<string>::iterator pos_format = find(
570         m_GenotypeHeaders.begin(), m_GenotypeHeaders.end(), "FORMAT");
571     if ( pos_format == m_GenotypeHeaders.end() ) {
572         m_GenotypeHeaders.clear();
573     }
574     else {
575         m_GenotypeHeaders.erase( m_GenotypeHeaders.begin(), pos_format+1 );
576         m_Meta->SetUser().AddField("genotype-headers", m_GenotypeHeaders);
577     }
578 
579     //
580     //  The header line signals the end of meta information, so migrate the
581     //  accumulated meta information into the seq descriptor:
582     //
583     return true;
584 }
585 
586 //  ----------------------------------------------------------------------------
587 bool
xAssignVcfMeta(CSeq_annot & annot)588 CVcfReader::xAssignVcfMeta(
589     CSeq_annot& annot)
590 //  ----------------------------------------------------------------------------
591 {
592     if (m_Meta  &&  m_Meta->IsUser() &&  m_Meta->GetUser().IsSetData()) {
593         if (!annot.IsSetDesc()) {
594             CRef< CAnnot_descr > desc( new CAnnot_descr );
595             annot.SetDesc(*desc);
596         }
597         annot.SetDesc().Set().push_back( m_Meta );
598     }
599     //else { // VCF input ought to include a header
600     //    CReaderMessage warning(
601     //        eDiag_Warning,
602     //        m_uLineNumber,
603     //        "CVcfReader::xAssignVcfMeta: Missing VCF header data.");
604     //    m_pMessageHandler->Report(warning);
605     //}
606     return true;
607 }
608 
609 //  ----------------------------------------------------------------------------
610 bool
xProcessDataLine(const string & line,CSeq_annot & annot)611 CVcfReader::xProcessDataLine(
612     const string& line,
613     CSeq_annot& annot)
614 //  ----------------------------------------------------------------------------
615 {
616     if ( NStr::StartsWith( line, "#" ) ) {
617         return false;
618     }
619 
620     CVcfData data;
621     if (!xParseData(line, data, nullptr)) {
622         return false;
623     }
624     CRef<CSeq_feat> pFeat( new CSeq_feat );
625     pFeat->SetData().SetVariation().SetData().SetSet().SetType(
626         CVariation_ref::C_Data::C_Set::eData_set_type_package );
627     pFeat->SetData().SetVariation().SetVariant_prop().SetVersion( 5 );
628     CSeq_feat::TExt& ext = pFeat->SetExt();
629     ext.SetType().SetStr( "VcfAttributes" );
630 
631     if (!xAssignFeatureLocationSet(data, pFeat)) {
632         return false;
633     }
634     if (!xAssignVariationIds(data, pFeat)) {
635         return false;
636     }
637     if (!xAssignVariationAlleleSet(data, pFeat)) {
638         return false;
639     }
640     if (!xProcessScore(data, pFeat)) {
641         return false;
642     }
643     if (!xProcessFilter(data, pFeat)) {
644         return false;
645     }
646     if (!xProcessInfo( data, pFeat)) {
647         return false;
648     }
649     if (!xProcessFormat(data, pFeat)) {
650         return false;
651     }
652 
653     if ( pFeat->GetExt().GetData().empty() ) {
654         pFeat->ResetExt();
655     }
656     annot.SetData().SetFtable().push_back(pFeat);
657     return true;
658 }
659 
660 //  ----------------------------------------------------------------------------
661 bool
xAssignVariationAlleleSet(const CVcfData & data,CRef<CSeq_feat> pFeature)662 CVcfReader::xAssignVariationAlleleSet(
663     const CVcfData& data,
664     CRef<CSeq_feat> pFeature )
665 //  ----------------------------------------------------------------------------
666 {
667     CVariation_ref::TData::TSet::TVariations& variants =
668         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
669 
670     //make one variation for the reference
671     CRef<CVariation_ref> pIdentity(new CVariation_ref);
672     vector<string> variant;
673 
674     switch(data.m_SetType) {
675     case CVcfData::ST_ALL_INS:
676         pIdentity->SetDeletion();
677         break;
678     default:
679         variant.push_back(data.m_strRef);
680         pIdentity->SetSNV(variant, CVariation_ref::eSeqType_na);
681         break;
682     }
683     CVariation_inst& instance = pIdentity->SetData().SetInstance();
684     instance.SetType(CVariation_inst::eType_identity);
685     instance.SetObservation(CVariation_inst::eObservation_reference);
686     if (data.m_SetType != CVcfData::ST_ALL_INS) {
687         variants.push_back(pIdentity);
688     }
689 
690     bool no_alt = true;
691     for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
692         if (!NStr::Equal(data.m_Alt[i],".")) {
693             no_alt = false;
694         }
695     }
696     if (no_alt) {
697         instance.SetObservation() |= CVariation_inst::eObservation_variant;
698         return true;
699     }
700 
701 
702     //add additional variations, one for each alternative
703     for (unsigned int i=0; i < data.m_Alt.size(); ++i) {
704         if (NStr::Equal(data.m_Alt[i],".")) {
705             continue;
706         }
707         switch(data.m_SetType) {
708         default:
709             if (!xAssignVariantDelins(data, i, pFeature)) {
710                 return false;
711             }
712             break;
713         case CVcfData::ST_ALL_SNV:
714             if (!xAssignVariantSnv(data, i, pFeature)) {
715                 return false;
716             }
717             break;
718         case CVcfData::ST_ALL_MNV:
719             if (!xAssignVariantMnv(data, i, pFeature)) {
720                 return false;
721             }
722             break;
723         case CVcfData::ST_ALL_INS:
724             if (!xAssignVariantIns(data, i, pFeature)) {
725                 return false;
726             }
727             break;
728         case CVcfData::ST_ALL_DEL:
729             if (!xAssignVariantDel(data, i, pFeature)) {
730                 return false;
731             }
732             break;
733         }
734     }
735     return true;
736 }
737 
738 //  ----------------------------------------------------------------------------
739 bool
xAssignVariantSnv(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)740 CVcfReader::xAssignVariantSnv(
741     const CVcfData& data,
742     unsigned int index,
743     CRef<CSeq_feat> pFeature )
744 //  ----------------------------------------------------------------------------
745 {
746     CVariation_ref::TData::TSet::TVariations& variants =
747         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
748 
749     CRef<CVariation_ref> pVariant(new CVariation_ref);
750     {{
751         vector<string> variant;
752         variant.push_back(data.m_Alt[index]);
753         pVariant->SetSNV(variant, CVariation_ref::eSeqType_na);
754     }}
755     variants.push_back(pVariant);
756     return true;
757 }
758 
759 //  ----------------------------------------------------------------------------
760 bool
xAssignVariantMnv(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)761 CVcfReader::xAssignVariantMnv(
762     const CVcfData& data,
763     unsigned int index,
764     CRef<CSeq_feat> pFeature )
765 //  ----------------------------------------------------------------------------
766 {
767     CVariation_ref::TData::TSet::TVariations& variants =
768         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
769 
770     CRef<CVariation_ref> pVariant(new CVariation_ref);
771     {{
772         vector<string> variant;
773         variant.push_back(data.m_Alt[index]);
774         pVariant->SetMNP(variant, CVariation_ref::eSeqType_na);
775     }}
776     variants.push_back(pVariant);
777     return true;
778 }
779 
780 //  ----------------------------------------------------------------------------
781 static void
s_AddDeleteDeltaItem(CVariation_inst & instance)782 s_AddDeleteDeltaItem(
783     CVariation_inst& instance )
784 {
785     CRef<CDelta_item> pItem(new CDelta_item);
786 
787     pItem->SetSeq().SetThis();
788     instance.SetType(CVariation_inst::eType_del);
789     pItem->SetAction(CDelta_item::eAction_del_at);
790     instance.SetDelta().push_back(pItem);
791 }
792 
793 
794 //  ----------------------------------------------------------------------------
795 bool
xAssignVariantDel(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)796 CVcfReader::xAssignVariantDel(
797     const CVcfData& data,
798     unsigned int index,
799     CRef<CSeq_feat> pFeature )
800 //  ----------------------------------------------------------------------------
801 {
802     CVariation_ref::TData::TSet::TVariations& variants =
803         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
804 
805     CRef<CVariation_ref> pVariant(new CVariation_ref);
806     {{
807         //pVariant->SetData().SetNote("DEL");
808         pVariant->SetDeletion();
809         CVariation_inst& instance =  pVariant->SetData().SetInstance();
810         s_AddDeleteDeltaItem(instance);
811 
812     }}
813     variants.push_back(pVariant);
814     return true;
815 }
816 
817 //  ----------------------------------------------------------------------------
818 bool
xAssignVariantIns(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)819 CVcfReader::xAssignVariantIns(
820     const CVcfData& data,
821     unsigned int index,
822     CRef<CSeq_feat> pFeature )
823 //  ----------------------------------------------------------------------------
824 {
825     CVariation_ref::TData::TSet::TVariations& variants =
826         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
827 
828     CRef<CVariation_ref> pVariant(new CVariation_ref);
829     {{
830         string insertion(data.m_Alt[index]);
831         CRef<CSeq_literal> pLiteral(new CSeq_literal);
832         pLiteral->SetSeq_data().SetIupacna().Set(insertion);
833         pLiteral->SetLength(
834             static_cast<TSeqPos>(insertion.size()));
835         CRef<CDelta_item> pItem(new CDelta_item);
836         pItem->SetAction(CDelta_item::eAction_ins_before);
837         pItem->SetSeq().SetLiteral(*pLiteral);
838         CVariation_inst& instance =  pVariant->SetData().SetInstance();
839         instance.SetType(CVariation_inst::eType_ins);
840         instance.SetDelta().push_back(pItem);
841     }}
842     variants.push_back(pVariant);
843     return true;
844 }
845 
846 
847 
848 //  ----------------------------------------------------------------------------
849 bool
xAssignVariantDelins(const CVcfData & data,unsigned int index,CRef<CSeq_feat> pFeature)850 CVcfReader::xAssignVariantDelins(
851     const CVcfData& data,
852     unsigned int index,
853     CRef<CSeq_feat> pFeature )
854 //  ----------------------------------------------------------------------------
855 {
856     string insertion(data.m_Alt[index]);
857 
858     CVariation_ref::TData::TSet::TVariations& variants =
859         pFeature->SetData().SetVariation().SetData().SetSet().SetVariations();
860 
861     CRef<CVariation_ref> pVariant(new CVariation_ref);
862     CVariation_inst& instance = pVariant->SetData().SetInstance();
863 
864     // If it is a deletion, add Deleteion Delta-item and be done.
865     if (insertion.size() == 0) {
866 
867         s_AddDeleteDeltaItem(instance);
868         variants.push_back(pVariant);
869         return true;
870     }
871 
872     // Must be a SNV or delins
873     CRef<CSeq_literal> pLiteral(new CSeq_literal);
874     pLiteral->SetSeq_data().SetIupacna().Set(insertion);
875     pLiteral->SetLength(static_cast<TSeqPos>(insertion.size()));
876 
877     CRef<CDelta_item> pItem(new CDelta_item);
878     pItem->SetSeq().SetLiteral(*pLiteral);
879     instance.SetDelta().push_back(pItem);
880 
881     //Let's try to smartly set the Type.
882     if (insertion.size() == 1 && data.m_strRef.size() == 1) {
883         instance.SetType(CVariation_inst::eType_snv);
884     } else {
885         instance.SetType(CVariation_inst::eType_delins);
886     }
887 
888     variants.push_back(pVariant);
889     return true;
890 
891 }
892 
893 //  ----------------------------------------------------------------------------
894 bool
xParseData(const string & line,CVcfData & data,ILineErrorListener * pEC)895 CVcfReader::xParseData(
896     const string& line,
897     CVcfData& data,
898     ILineErrorListener* pEC)
899 //  ----------------------------------------------------------------------------
900 {
901     vector<string> columns;
902     NStr::Split(line, "\t", columns, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
903     if ( columns.size() < 8 ) {
904         return false;
905     }
906     try {
907         data.m_strLine = line;
908 
909         data.m_strChrom = columns[0];
910         data.m_iPos = NStr::StringToInt( columns[1] );
911         NStr::Split( columns[2], ";", data.m_Ids );
912         if ( (data.m_Ids.size() == 1)  &&  (data.m_Ids[0] == ".") ) {
913             data.m_Ids.clear();
914         }
915         data.m_strRef = columns[3];
916         NStr::Split( columns[4], ",", data.m_Alt );
917         if ( columns[5] != "." ) {
918             data.m_pdQual = new double( NStr::StringToDouble( columns[5] ) );
919         }
920         data.m_strFilter = columns[6];
921 
922         vector<string> infos;
923         if ( columns[7] != "." ) {
924             NStr::Split( columns[7], ";", infos, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
925             for ( vector<string>::iterator it = infos.begin();
926                 it != infos.end(); ++it )
927             {
928                 string key, value;
929                 NStr::SplitInTwo( *it, "=", key, value );
930                 data.m_Info[key] = vector<string>();
931                 NStr::Split( value, ",", data.m_Info[key] );
932             }
933         }
934         if ( columns.size() > 8 ) {
935             NStr::Split( columns[8], ":", data.m_FormatKeys, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
936 
937             for ( size_t u=9; u < columns.size(); ++u ) {
938                 vector<string> values;
939                 NStr::Split( columns[u], ":", values, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate );
940                 data.m_GenotypeData[ m_GenotypeHeaders[u-9] ] = values;
941             }
942         }
943     }
944     catch ( ... ) {
945         AutoPtr<CObjReaderLineException> pErr(
946             CObjReaderLineException::Create(
947             eDiag_Error,
948             0,
949             "Unable to parse given VCF data (syntax error).",
950             ILineError::eProblem_GeneralParsingError));
951         ProcessError(*pErr, pEC);
952         return false;
953     }
954 
955     if (!xNormalizeData(data, pEC)) {
956         return false;
957     }
958 
959     //assign set type:
960 
961     //test for all SNVs
962     bool maybeAllSnv = (data.m_strRef.size() == 1);
963     if (maybeAllSnv) {
964         for (size_t u=0; u < data.m_Alt.size(); ++u) {
965             if (data.m_Alt[u].size() != 1) {
966                 maybeAllSnv = false;
967                 break;
968             }
969         }
970         if (maybeAllSnv) {
971             data.m_SetType = CVcfData::ST_ALL_SNV;
972             return true;
973         }
974     }
975 
976     //test for all mnvs:
977     bool maybeAllMnv = true;
978     size_t refSize = data.m_strRef.size();
979     for (size_t u=0; u < data.m_Alt.size(); ++u) {
980         if (data.m_Alt[u].size() != refSize) {
981             maybeAllMnv = false;
982             break;
983         }
984     }
985     if (maybeAllMnv) {
986         data.m_SetType = CVcfData::ST_ALL_MNV;
987         return true;
988     }
989 
990     //test for all insertions:
991     bool maybeAllIns = true;
992     for (size_t u=0; u < data.m_Alt.size(); ++u) {
993         if (! NStr::StartsWith(data.m_Alt[u], data.m_strRef)) {
994             maybeAllIns = false;
995             break;
996         }
997     }
998     if (maybeAllIns) {
999         data.m_SetType = CVcfData::ST_ALL_INS;
1000         return true;
1001     }
1002 
1003     //test for all deletions:
1004     // note: even it is all deletions we are not able to process them
1005     // as such because those deletions would be at different ASN1
1006     // locations. Hence we punt to "indel" if there is more than one
1007     // alternative.
1008     bool maybeAllDel = false;
1009     for (size_t u=0; u < data.m_Alt.size(); ++u) {
1010         if (data.m_Alt.size() == 1  && data.m_Alt[0].empty()) {
1011             maybeAllDel = true;
1012         }
1013     }
1014     if (maybeAllDel) {
1015         data.m_SetType = CVcfData::ST_ALL_DEL;
1016         return true;
1017     }
1018 
1019     data.m_SetType = CVcfData::ST_MIXED;
1020     return true;
1021 }
1022 
1023 
1024 //  ---------------------------------------------------------------------------
1025 bool
xNormalizeData(CVcfData & data,ILineErrorListener * pEC)1026 CVcfReader::xNormalizeData(
1027     CVcfData& data,
1028     ILineErrorListener* pEC)
1029 //  ---------------------------------------------------------------------------
1030 {
1031     // make sure none of the alternatives is equal to the reference:
1032     for (size_t u=0; u < data.m_Alt.size(); ++u) {
1033         if (data.m_Alt[u] == data.m_strRef) {
1034             AutoPtr<CObjReaderLineException> pErr(
1035                 CObjReaderLineException::Create(
1036                 eDiag_Error,
1037                 0,
1038                 "CVcfReader::xNormalizeData: Invalid alternative.",
1039                 ILineError::eProblem_GeneralParsingError));
1040             ProcessError(*pErr, pEC);
1041             return false;
1042         }
1043     }
1044 
1045     // normalize ref/alt by trimming common prefices and adjusting location
1046     bool trimComplete = false;
1047     while (!data.m_strRef.empty()) {
1048         char leadBase = data.m_strRef[0];
1049         for (size_t u=0; u < data.m_Alt.size(); ++u) {
1050             if (!NStr::StartsWith(data.m_Alt[u], leadBase)) {
1051                 trimComplete = true;
1052                 break;
1053             }
1054         }
1055         if (trimComplete) {
1056             break;
1057         }
1058         data.m_strRef = data.m_strRef.substr(1);
1059         for (size_t u=0; u < data.m_Alt.size(); ++u) {
1060             data.m_Alt[u] = data.m_Alt[u].substr(1);
1061         }
1062         data.m_iPos++;
1063     }
1064 
1065     //  normalize ref/alt by trimming common postfixes and adjusting location
1066     trimComplete = false;
1067     size_t refSize = data.m_strRef.size();
1068     size_t trimSize = 0;
1069     while (refSize > trimSize) {
1070         string postfix = data.m_strRef.substr(refSize-1-trimSize, trimSize+1);
1071         for (size_t u=0; u < data.m_Alt.size(); ++u) {
1072             size_t altSize = data.m_Alt[u].size();
1073             if (altSize < trimSize+1) {
1074                 trimComplete = true;
1075                 break;
1076             }
1077             string postfixA = data.m_Alt[u].substr(altSize-1-trimSize, trimSize+1);
1078             if (postfix != postfixA) {
1079                 trimComplete = true;
1080                 break;
1081             }
1082         }
1083         if (trimComplete) {
1084             break;
1085         }
1086         trimSize++;
1087     }
1088     if (trimSize > 0) {
1089         data.m_strRef =
1090             data.m_strRef.substr(0, data.m_strRef.size()-trimSize);
1091         for (size_t u=0; u < data.m_Alt.size(); ++u) {
1092             data.m_Alt[u] =
1093                 data.m_Alt[u].substr(0, data.m_Alt[u].size()-trimSize);
1094         }
1095     }
1096     return true;
1097 }
1098 
1099 //  ---------------------------------------------------------------------------
1100 bool
xAssignFeatureLocationSet(const CVcfData & data,CRef<CSeq_feat> pFeat)1101 CVcfReader::xAssignFeatureLocationSet(
1102     const CVcfData& data,
1103     CRef<CSeq_feat> pFeat )
1104 //  ---------------------------------------------------------------------------
1105 {
1106     CRef<CSeq_id> pId(CReadUtil::AsSeqId(data.m_strChrom, m_iFlags));
1107 
1108     //context:
1109     // we are trying to package all the allele of this feature into a single
1110     // variation_ref, hence, they all need a common location.
1111     // Referenced location differ between the different types of variations,
1112     // so we need to find the most specific variation type that describes them
1113     // all. Once the actual variation type has been found we can set the location
1114     // accordingly.
1115 
1116     // in practice, we will choose the common variation type if it is indeed
1117     // common for all the alleles. Otherwise, we just make it a MNV.
1118 
1119     if (data.m_SetType == CVcfData::ST_ALL_SNV) {
1120         //set location for SNVs
1121         pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1122         pFeat->SetLocation().SetPnt().SetId(*pId);
1123         return true;
1124     }
1125     if (data.m_SetType == CVcfData::ST_ALL_MNV) {
1126         //set location for MNV. This will be the location of the reference
1127         pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1128         pFeat->SetLocation().SetInt().SetTo(
1129             static_cast<TSeqPos>(data.m_iPos + data.m_strRef.size() - 2));
1130         pFeat->SetLocation().SetInt().SetId(*pId);
1131         return true;
1132     }
1133     if (data.m_SetType == CVcfData::ST_ALL_INS) {
1134         //set location for INSs. Will always be a point!
1135         //m_iPos points to the 1-based position of the first
1136         //nt that is unique between alt and ref
1137         pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1138         pFeat->SetLocation().SetPnt().SetId(*pId);
1139         return true;
1140     }
1141     if (data.m_SetType == CVcfData::ST_ALL_DEL) {
1142         if (data.m_strRef.size() == 1) {
1143             //deletion of a single base
1144             pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1145             pFeat->SetLocation().SetPnt().SetId(*pId);
1146         }
1147         else {
1148             pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1149             //-1 for 0-based,
1150             //another -1 for inclusive end-point ( i.e. [], not [) )
1151             pFeat->SetLocation().SetInt().SetTo(
1152                  static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1153             pFeat->SetLocation().SetInt().SetId(*pId);
1154         }
1155         return true;
1156     }
1157 
1158     //default: For MNV's we will use the single starting point
1159     //NB: For references of size >=2, this location will not
1160     //match the reference allele.  Future Variation-ref
1161     //normalization code will address these issues,
1162     //and obviate the need for this code altogether.
1163     if (data.m_strRef.size() == 1) {
1164         //deletion of a single base
1165         pFeat->SetLocation().SetPnt().SetPoint(data.m_iPos-1);
1166         pFeat->SetLocation().SetPnt().SetId(*pId);
1167     }
1168     else {
1169         pFeat->SetLocation().SetInt().SetFrom(data.m_iPos-1);
1170         pFeat->SetLocation().SetInt().SetTo(
1171             static_cast<TSeqPos>(data.m_iPos -1 + data.m_strRef.length() - 1));
1172         pFeat->SetLocation().SetInt().SetId(*pId);
1173     }
1174     return true;
1175 }
1176 
1177 //  ----------------------------------------------------------------------------
1178 bool
xProcessScore(CVcfData & data,CRef<CSeq_feat> pFeature)1179 CVcfReader::xProcessScore(
1180     CVcfData& data,
1181     CRef<CSeq_feat> pFeature )
1182 //  ----------------------------------------------------------------------------
1183 {
1184     CSeq_feat::TExt& ext = pFeature->SetExt();
1185     if ( data.m_pdQual ) {
1186         ext.AddField( "score", *data.m_pdQual );
1187     }
1188     return true;
1189 }
1190 
1191 //  ----------------------------------------------------------------------------
1192 bool
xProcessFilter(CVcfData & data,CRef<CSeq_feat> pFeature)1193 CVcfReader::xProcessFilter(
1194     CVcfData& data,
1195     CRef<CSeq_feat> pFeature )
1196 //  ----------------------------------------------------------------------------
1197 {
1198     if(!NStr::Equal(data.m_strFilter,".")) {
1199       CSeq_feat::TExt& ext = pFeature->SetExt();
1200       ext.AddField( "filter", data.m_strFilter );
1201     }
1202     return true;
1203 }
1204 
1205 //  ----------------------------------------------------------------------------
1206 bool
xProcessInfo(CVcfData & data,CRef<CSeq_feat> pFeature)1207 CVcfReader::xProcessInfo(
1208     CVcfData& data,
1209     CRef<CSeq_feat> pFeature)
1210 //  ----------------------------------------------------------------------------
1211 {
1212     if (!xAssignVariantProps(data, pFeature)) {
1213         return false;
1214     }
1215     CSeq_feat::TExt& ext = pFeature->SetExt();
1216     if (data.m_Info.empty()) {
1217         return true;
1218     }
1219     vector<string> infos;
1220     for ( map<string,vector<string> >::const_iterator cit = data.m_Info.begin();
1221         cit != data.m_Info.end(); cit++ )
1222     {
1223         const string& key = cit->first;
1224         vector<string> value = cit->second;
1225         if ( value.empty() ) {
1226             infos.push_back( key );
1227         }
1228         else {
1229             string joined = NStr::Join( list<string>( value.begin(), value.end() ), "," );
1230             infos.push_back( key + "=" + joined );
1231         }
1232     }
1233     ext.AddField( "info", NStr::Join( infos, ";" ) );
1234     return true;
1235 }
1236 
1237 //  ----------------------------------------------------------------------------
1238 bool
xProcessTrackLine(const string & strLine,CSeq_annot & current)1239 CVcfReader::xProcessTrackLine(
1240     const string& strLine,
1241     CSeq_annot& current)
1242 //  ----------------------------------------------------------------------------
1243 {
1244     if (!xIsTrackLine(strLine)) {
1245         return false;
1246     }
1247     vector<string> parts;
1248     CReadUtil::Tokenize( strLine, " \t", parts );
1249     if (parts.size() >= 3) {
1250         const string digits("0123456789");
1251         bool col2_is_numeric =
1252             (string::npos == parts[1].find_first_not_of(digits));
1253         bool col3_is_numeric =
1254             (string::npos == parts[2].find_first_not_of(digits));
1255         if (col2_is_numeric  &&  col3_is_numeric) {
1256             return false;
1257         }
1258     }
1259     if (!CReaderBase::xParseTrackLine(strLine)) {
1260         CReaderMessage warning(
1261             eDiag_Warning,
1262             m_uLineNumber,
1263             "Bad track line: Expected \"track key1=value1 key2=value2 ...\". Ignored.");
1264         m_pMessageHandler->Report(warning);
1265     }
1266     return true;
1267 }
1268 
1269 //  ----------------------------------------------------------------------------
1270 bool
xProcessFormat(CVcfData & data,CRef<CSeq_feat> pFeature)1271 CVcfReader::xProcessFormat(
1272     CVcfData& data,
1273     CRef<CSeq_feat> pFeature )
1274 //  ----------------------------------------------------------------------------
1275 {
1276     if (data.m_FormatKeys.empty()) {
1277         return true;
1278     }
1279 
1280     CSeq_feat::TExt& ext = pFeature->SetExt();
1281     ext.AddField("format", data.m_FormatKeys);
1282 
1283     CRef<CUser_field> pGenotypeData( new CUser_field );
1284     pGenotypeData->SetLabel().SetStr("genotype-data");
1285 
1286     for ( CVcfData::GTDATA::const_iterator cit = data.m_GenotypeData.begin();
1287             cit != data.m_GenotypeData.end(); ++cit) {
1288         pGenotypeData->AddField(cit->first,cit->second);
1289     }
1290     ext.SetData().push_back(pGenotypeData);
1291     return true;
1292 }
1293 
1294 
1295 //  ----------------------------------------------------------------------------
xAssigndbSNPTag(const vector<string> & ids,CRef<CDbtag> pDbtag) const1296 bool CVcfReader::xAssigndbSNPTag(
1297     const vector<string>& ids,
1298     CRef<CDbtag> pDbtag) const
1299 //  ----------------------------------------------------------------------------
1300 {
1301     for (const string& id : ids) {
1302         if (NStr::StartsWith(id, "rs") ||
1303             NStr::StartsWith(id, "ss") )
1304         {
1305             try {
1306                 const int idval = NStr::StringToInt(id.substr(2));
1307                 pDbtag->SetDb("dbSNP");
1308                 pDbtag->SetTag().SetId(idval);
1309             }
1310             catch (...) {
1311                 continue;
1312             }
1313             return true;
1314         }
1315     }
1316     return false;
1317 }
1318 
1319 
1320 //  ----------------------------------------------------------------------------
1321 bool
xAssignVariationIds(CVcfData & data,CRef<CSeq_feat> pFeature)1322 CVcfReader::xAssignVariationIds(
1323     CVcfData& data,
1324     CRef<CSeq_feat> pFeature )
1325 //  ----------------------------------------------------------------------------
1326 {
1327     if ( data.m_Ids.empty() ) {
1328         return true;
1329     }
1330     CVariation_ref& variation = pFeature->SetData().SetVariation();
1331 //    CVariation_ref::TVariant_prop& var_prop = variation.SetVariant_prop();
1332 //    var_prop.SetVersion( 5 );
1333 
1334     auto it = data.m_Info.find("SOURCE");
1335 
1336     if (data.m_Info.end() != it) {
1337         vector<string> sources = it->second;
1338         if (sources.size() > 0 &&
1339             NStr::Equal(sources.front(), "dbsnp"))
1340         {
1341             CRef<CDbtag> pDbtag = Ref(new CDbtag());
1342             if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1343                 variation.SetId(pDbtag.GetNCObject());
1344                 return true;
1345             }
1346         }
1347     }
1348 
1349 
1350     if ( data.m_Info.find( "DB" ) != data.m_Info.end() ) {
1351         string id = data.m_Ids[0];
1352         NStr::ToLower(id);
1353         if (NStr::StartsWith(id, "rs")  ||  NStr::StartsWith(id, "ss")) {
1354             variation.SetId().SetDb("dbSNP");
1355         }
1356         else {
1357             variation.SetId().SetDb( "dbVar" );
1358         }
1359     }
1360     else if ( data.m_Info.find( "H2" ) != data.m_Info.end() ) {
1361         variation.SetId().SetDb( "HapMap2" );
1362     }
1363     else {
1364         variation.SetId().SetDb( "local" );
1365     }
1366     variation.SetId().SetTag().SetStr( data.m_Ids[0] );
1367 
1368     for ( size_t i=1; i < data.m_Ids.size(); ++i ) {
1369         if ( data.m_Info.find( "DB" ) != data.m_Info.end()
1370             &&  data.m_Info.find( "H2" ) != data.m_Info.end() )
1371         {
1372             variation.SetId().SetDb( "HapMap2" );
1373         }
1374         else {
1375             variation.SetId().SetDb( "local" );
1376         }
1377         variation.SetId().SetTag().SetStr( data.m_Ids[i] );
1378     }
1379     return true;
1380 }
1381 
1382 
1383 //  ----------------------------------------------------------------------------
1384 bool
xAssignVariantProps(CVcfData & data,CRef<CSeq_feat> pFeat)1385 CVcfReader::xAssignVariantProps(
1386     CVcfData& data,
1387     CRef<CSeq_feat> pFeat)
1388 //  ----------------------------------------------------------------------------
1389 {
1390     typedef CVariantProperties VP;
1391 
1392     CVcfData::INFOS& infos = data.m_Info;
1393     VP& props = pFeat->SetData().SetVariation().SetVariant_prop();
1394     CVcfData::INFOS::iterator it;
1395 
1396     props.SetResource_link() = 0;
1397     props.SetGene_location() = 0;
1398     props.SetEffect() = 0;
1399     props.SetMapping() = 0;
1400     props.SetFrequency_based_validation() = 0;
1401     props.SetGenotype() = 0;
1402     props.SetQuality_check() = 0;
1403 
1404     //byte F0
1405     props.SetVersion() = 5;
1406 
1407     //superbyte F1
1408     it = infos.find("SLO");
1409     if (infos.end() != it) {
1410         props.SetResource_link() |= VP::eResource_link_submitterLinkout;
1411         infos.erase(it);
1412     }
1413     it = infos.find("S3D");
1414     if (infos.end() != it) {
1415         props.SetResource_link() |= VP::eResource_link_has3D;
1416         infos.erase(it);
1417     }
1418     it = infos.find("TPA");
1419     if (infos.end() != it) {
1420         props.SetResource_link() |= VP::eResource_link_provisional;
1421         infos.erase(it);
1422     }
1423     it = infos.find("PM");
1424     if (infos.end() != it) {
1425         props.SetResource_link() |= VP::eResource_link_preserved;
1426         infos.erase(it);
1427     }
1428     it = infos.find("CLN");
1429     if (infos.end() != it) {
1430         props.SetResource_link() |= VP::eResource_link_clinical;
1431         infos.erase(it);
1432     }
1433     //todo: INFO ID=PMC
1434     it = infos.find("PMC");
1435     if (infos.end() != it) {
1436         infos.erase(it);
1437     }
1438     it = infos.find("PMID");
1439     if (infos.end() != it) {
1440         vector<string> pmids = it->second;
1441         for (vector<string>::const_iterator cit = pmids.begin();
1442             cit != pmids.end(); ++cit)
1443         {
1444             try {
1445                 string db, tag;
1446                 NStr::SplitInTwo(*cit, ":", db, tag);
1447                 if (db != "PM") {
1448                     CReaderMessage warning(
1449                         eDiag_Warning,
1450                         m_uLineNumber,
1451                         "CVcfReader::xAssignVariantProps: Invalid PMID database ID.");
1452                     m_pMessageHandler->Report(warning);
1453                     continue;
1454                 }
1455                 CRef<CDbtag> pDbtag(new CDbtag);
1456                 pDbtag->SetDb(db);
1457                 pDbtag->SetTag().SetId(
1458                     NStr::StringToInt(tag));
1459                 pFeat->SetDbxref().push_back(pDbtag);
1460             }
1461             catch(...) {}
1462         }
1463         infos.erase(it);
1464     }
1465 
1466     xAssignVariantSource(data, pFeat);
1467 
1468     //superbyte F2
1469     it = infos.find("R5");
1470     if (infos.end() != it) {
1471         props.SetGene_location() |= VP::eGene_location_near_gene_5;
1472         infos.erase(it);
1473     }
1474     it = infos.find("R3");
1475     if (infos.end() != it) {
1476         props.SetGene_location() |= VP::eGene_location_near_gene_3;
1477         infos.erase(it);
1478     }
1479     it = infos.find("INT");
1480     if (infos.end() != it) {
1481         props.SetGene_location() |= VP::eGene_location_intron;
1482         infos.erase(it);
1483     }
1484     it = infos.find("DSS");
1485     if (infos.end() != it) {
1486         props.SetGene_location() |= VP::eGene_location_donor;
1487         infos.erase(it);
1488     }
1489     it = infos.find("ASS");
1490     if (infos.end() != it) {
1491         props.SetGene_location() |= VP::eGene_location_acceptor;
1492         infos.erase(it);
1493     }
1494     it = infos.find("U5");
1495     if (infos.end() != it) {
1496         props.SetGene_location() |= VP::eGene_location_utr_5;
1497         infos.erase(it);
1498     }
1499     it = infos.find("U3");
1500     if (infos.end() != it) {
1501         props.SetGene_location() |= CVariantProperties::eGene_location_utr_3;
1502         infos.erase(it);
1503     }
1504 
1505     it = infos.find("SYN");
1506     if (infos.end() != it) {
1507         props.SetGene_location() |= VP::eEffect_synonymous;
1508         infos.erase(it);
1509     }
1510     it = infos.find("NSN");
1511     if (infos.end() != it) {
1512         props.SetGene_location() |= VP::eEffect_stop_gain;
1513         infos.erase(it);
1514     }
1515     it = infos.find("NSM");
1516     if (infos.end() != it) {
1517         props.SetGene_location() |= VP::eEffect_missense;
1518         infos.erase(it);
1519     }
1520     it = infos.find("NSF");
1521     if (infos.end() != it) {
1522         props.SetGene_location() |= VP::eEffect_frameshift;
1523         infos.erase(it);
1524     }
1525 
1526     //byte F3
1527     it = infos.find("WGT");
1528     if (infos.end() != it) {
1529         int weight = NStr::StringToInt( infos["WGT"][0] );
1530         switch(weight) {
1531         default:
1532             break;
1533         case 1:
1534             props.SetMap_weight() = VP::eMap_weight_is_uniquely_placed;
1535             infos.erase(it);
1536             break;
1537         case 2:
1538             props.SetMap_weight() = VP::eMap_weight_placed_twice_on_same_chrom;
1539             infos.erase(it);
1540             break;
1541         case 3:
1542             props.SetMap_weight() = VP::eMap_weight_placed_twice_on_diff_chrom;
1543             infos.erase(it);
1544             break;
1545         case 10:
1546             props.SetMap_weight() = VP::eMap_weight_many_placements;
1547             break;
1548         }
1549     }
1550 
1551     it = infos.find("ASP");
1552     if (infos.end() != it) {
1553         props.SetMapping() |= VP::eMapping_is_assembly_specific;
1554         infos.erase(it);
1555     }
1556     it = infos.find("CFL");
1557     if (infos.end() != it) {
1558         props.SetMapping() |= VP::eMapping_has_assembly_conflict;
1559         infos.erase(it);
1560     }
1561     it = infos.find("OTH");
1562     if (infos.end() != it) {
1563         props.SetMapping() |= VP::eMapping_has_other_snp;
1564         infos.erase(it);
1565     }
1566 
1567     //byte F4
1568     it = infos.find("OTH");
1569     if (infos.end() != it) {
1570         props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_all;
1571         infos.erase(it);
1572     }
1573     it = infos.find("G5A");
1574     if (infos.end() != it) {
1575         props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_above_5pct_1plus;
1576         infos.erase(it);
1577     }
1578     it = infos.find("VLD");
1579     if (infos.end() != it) {
1580         props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_validated;
1581         infos.erase(it);
1582     }
1583     it = infos.find("MUT");
1584     if (infos.end() != it) {
1585         props.SetFrequency_based_validation() |= VP::eFrequency_based_validation_is_mutation;
1586         infos.erase(it);
1587     }
1588     it = infos.find("GMAF");
1589     if (infos.end() != it) {
1590         props.SetAllele_frequency() = NStr::StringToDouble(infos["GMAF"][0]);
1591         infos.erase(it);
1592     }
1593 
1594     //byte F5
1595     it = infos.find("GNO");
1596     if (infos.end() != it) {
1597         props.SetGenotype() |= VP::eGenotype_has_genotypes;
1598         infos.erase(it);
1599     }
1600     it = infos.find("HD");
1601     if (infos.end() != it) {
1602         props.SetResource_link() |= VP::eResource_link_genotypeKit;
1603         infos.erase(it);
1604     }
1605 
1606     //byte F6
1607     if (infos.end() != infos.find("PH3")) {
1608         CRef<CDbtag> pDbtag(new CDbtag);
1609         pDbtag->SetDb("BioProject");
1610         pDbtag->SetTag().SetId(60835);
1611         pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1612     }
1613     if (infos.end() != infos.find("KGPhase1")) {
1614         CRef<CDbtag> pDbtag(new CDbtag);
1615         pDbtag->SetDb("BioProject");
1616         pDbtag->SetTag().SetId(28889);
1617         pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
1618     }
1619 
1620     //byte F7
1621 
1622     //byte F8
1623     //no relevant information found in VCF
1624 
1625     //byte F9
1626     it = infos.find("GCF");
1627     if (infos.end() != it) {
1628         props.SetQuality_check() |= VP::eQuality_check_genotype_conflict;
1629         infos.erase(it);
1630     }
1631     it = infos.find("NOV");
1632     if (infos.end() != it) {
1633         props.SetQuality_check() |= VP::eQuality_check_non_overlapping_alleles;
1634         infos.erase(it);
1635     }
1636     it = infos.find("WTD");
1637     if (infos.end() != it) {
1638         props.SetQuality_check() |= VP::eQuality_check_withdrawn_by_submitter;
1639         infos.erase(it);
1640     }
1641     it = infos.find("NOC");
1642     if (infos.end() != it) {
1643         props.SetQuality_check() |= VP::eQuality_check_contig_allele_missing;
1644         infos.erase(it);
1645     }
1646     return true;
1647 }
1648 
1649 
1650 //  ----------------------------------------------------------------------------
xAssignVariantSource(CVcfData & data,CRef<CSeq_feat> pFeat)1651 void CVcfReader::xAssignVariantSource(CVcfData& data,
1652     CRef<CSeq_feat> pFeat)
1653 //  ----------------------------------------------------------------------------
1654 {
1655     CVcfData::INFOS& infos = data.m_Info;
1656     auto it = infos.find("SOURCE");
1657     if (infos.end() != it) {
1658         vector<string> sources = it->second;
1659         if (sources.size() > 0 &&
1660             NStr::Equal(sources.front(),"dbsnp"))
1661         {
1662             bool valid_id=false;
1663             CRef<CDbtag> pDbtag(new CDbtag());
1664             if (xAssigndbSNPTag(data.m_Ids, pDbtag)) {
1665                 pFeat->SetDbxref().push_back(pDbtag);
1666                 valid_id = true;
1667             }
1668 
1669             if (!valid_id) {
1670                 CReaderMessage warning(
1671                     eDiag_Warning,
1672                     m_uLineNumber,
1673                     "CVcfReader::xAssignVariantProps: No valid dbSNP identifier");
1674                 m_pMessageHandler->Report(warning);
1675             }
1676             infos.erase(it);
1677         }
1678     }
1679 }
1680 
1681 
1682 //  ----------------------------------------------------------------------------
xIsCommentLine(const CTempString & strLine)1683 bool CVcfReader::xIsCommentLine(
1684     const CTempString& strLine)
1685 //  ----------------------------------------------------------------------------
1686 {
1687     if (NStr::StartsWith(strLine, "#CHROM")) {
1688         return false;
1689     }
1690     return CReaderBase::xIsCommentLine(strLine);
1691 }
1692 
1693 END_objects_SCOPE
1694 END_NCBI_SCOPE
1695