1 /*  $Id: gff3_reader.cpp 636167 2021-08-17 15:26:17Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Frank Ludwig
27  *
28  * File Description:
29  *   GFF3 file reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 
36 #include <util/line_reader.hpp>
37 
38 
39 #include <objects/general/Object_id.hpp>
40 #include <objects/general/User_object.hpp>
41 
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqloc/Seq_point.hpp>
46 
47 #include <objects/seq/Seq_annot.hpp>
48 #include <objects/seq/Annot_id.hpp>
49 #include <objects/seq/Annot_descr.hpp>
50 #include <objects/seq/so_map.hpp>
51 #include <objects/seqfeat/SeqFeatXref.hpp>
52 
53 #include <objects/seqfeat/Seq_feat.hpp>
54 #include <objects/seqfeat/Code_break.hpp>
55 #include <objects/seqfeat/Genetic_code.hpp>
56 #include <objects/seqfeat/RNA_ref.hpp>
57 #include <objects/seqfeat/Feat_id.hpp>
58 
59 #include <objtools/readers/gff3_reader.hpp>
60 #include "gff3_location_merger.hpp"
61 
62 #include "reader_message_handler.hpp"
63 
64 #include <algorithm>
65 
66 BEGIN_NCBI_SCOPE
67 BEGIN_objects_SCOPE
68 
69 unsigned int CGff3Reader::msGenericIdCounter = 0;
70 
71 //  ----------------------------------------------------------------------------
xMakeRecordId(const CGff2Record & record)72 string CGff3Reader::xMakeRecordId(
73     const CGff2Record& record)
74 //  ----------------------------------------------------------------------------
75 {
76     string id, parentId;
77     record.GetAttribute("ID", id);
78     record.GetAttribute("Parent", parentId);
79 
80     auto recordType = record.NormalizedType();
81     if (recordType == "cds") {
82         string cdsId = parentId;
83         if (cdsId.empty()) {
84             cdsId = (id.empty() ? xNextGenericId() : id);
85         }
86         else {
87             cdsId += ":cds";
88         }
89         return cdsId;
90     }
91     if (id.empty()) {
92         return xNextGenericId();
93     }
94     return id;
95 }
96 
97 //  ----------------------------------------------------------------------------
xNextGenericId()98 string CGff3Reader::xNextGenericId()
99 //  ----------------------------------------------------------------------------
100 {
101     return string("generic") + NStr::IntToString(msGenericIdCounter++);
102 }
103 
104 //  ----------------------------------------------------------------------------
AssignFromGff(const string & strRawInput)105 bool CGff3ReadRecord::AssignFromGff(
106     const string& strRawInput )
107 //  ----------------------------------------------------------------------------
108 {
109     if (!CGff2Record::AssignFromGff(strRawInput)) {
110         return false;
111     }
112     string id, parent;
113     GetAttribute("ID", id);
114     GetAttribute("Parent", parent);
115     if (id.empty()  &&  parent.empty()) {
116         m_Attributes["ID"] = CGff3Reader::xNextGenericId();
117     }
118     if (m_strType == "pseudogene") {
119         SetType("gene");
120         m_Attributes["pseudo"] = "true";
121         return true;
122     }
123     if (m_strType == "pseudogenic_transcript") {
124         SetType("transcript");
125         m_Attributes["pseudo"] = "true";
126         return true;
127     }
128     if (m_strType == "pseudogenic_tRNA") {
129         SetType("tRNA");
130         m_Attributes["pseudo"] = "true";
131         return true;
132     }
133     if (m_strType == "pseudogenic_rRNA") {
134         SetType("rRNA");
135         m_Attributes["pseudo"] = "true";
136         return true;
137     }
138     if (m_strType == "pseudogenic_exon") {
139         SetType("exon");
140         return true;
141     }
142     if (m_strType == "pseudogenic_CDS") {
143         SetType("CDS");
144         m_Attributes["pseudo"] = "true";
145         return true;
146     }
147     if (m_strType == "transcript") {
148         SetType("misc_RNA");
149         return true;
150     }
151     return true;
152 }
153 
154 //  ----------------------------------------------------------------------------
x_NormalizedAttributeKey(const string & strRawKey)155 string CGff3ReadRecord::x_NormalizedAttributeKey(
156     const string& strRawKey )
157 //  ---------------------------------------------------------------------------
158 {
159     string strKey = CGff2Record::xNormalizedAttributeKey( strRawKey );
160     if ( 0 == NStr::CompareNocase( strRawKey, "ID" ) ) {
161         return "ID";
162     }
163     if ( 0 == NStr::CompareNocase( strKey, "Name" ) ) {
164         return "Name";
165     }
166     if ( 0 == NStr::CompareNocase( strKey, "Alias" ) ) {
167         return "Alias";
168     }
169     if ( 0 == NStr::CompareNocase( strKey, "Parent" ) ) {
170         return "Parent";
171     }
172     if ( 0 == NStr::CompareNocase( strKey, "Target" ) ) {
173         return "Target";
174     }
175     if ( 0 == NStr::CompareNocase( strKey, "Gap" ) ) {
176         return "Gap";
177     }
178     if ( 0 == NStr::CompareNocase( strKey, "Derives_from" ) ) {
179         return "Derives_from";
180     }
181     if ( 0 == NStr::CompareNocase( strKey, "Note" ) ) {
182         return "Note";
183     }
184     if ( 0 == NStr::CompareNocase( strKey, "Dbxref" )  ||
185         0 == NStr::CompareNocase( strKey, "Db_xref" ) ) {
186         return "Dbxref";
187     }
188     if ( 0 == NStr::CompareNocase( strKey, "Ontology_term" ) ) {
189         return "Ontology_term";
190     }
191     return strKey;
192 }
193 
194 //  ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,const string & name,const string & title,SeqIdResolver resolver,CReaderListener * pRL)195 CGff3Reader::CGff3Reader(
196     unsigned int uFlags,
197     const string& name,
198     const string& title,
199     SeqIdResolver resolver,
200     CReaderListener* pRL):
201 //  ----------------------------------------------------------------------------
202     CGff2Reader( uFlags, name, title, resolver, pRL )
203 {
204     mpLocations.reset(new CGff3LocationMerger(uFlags, resolver, 0));
205     CGff2Record::ResetId();
206 }
207 
208 //  ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,CReaderListener * pRL)209 CGff3Reader::CGff3Reader(
210     unsigned int uFlags,
211     CReaderListener* pRL):
212 //  ----------------------------------------------------------------------------
213     CGff3Reader(uFlags, "", "", CReadUtil::AsSeqId, pRL)
214 {
215 }
216 
217 //  ----------------------------------------------------------------------------
~CGff3Reader()218 CGff3Reader::~CGff3Reader()
219 //  ----------------------------------------------------------------------------
220 {
221 }
222 
223 //  ----------------------------------------------------------------------------
224 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)225 CGff3Reader::ReadSeqAnnot(
226     ILineReader& lr,
227     ILineErrorListener* pEC )
228 //  ----------------------------------------------------------------------------
229 {
230     mCurrentFeatureCount = 0;
231     mParsingAlignment = false;
232     mAlignmentData.Reset();
233     mpLocations->Reset();
234     auto pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
235     if (pAnnot  &&  pAnnot->GetData().Which() == CSeq_annot::TData::e_not_set) {
236         return CRef<CSeq_annot>();
237     }
238     return pAnnot;
239 }
240 
241 //  ----------------------------------------------------------------------------
242 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)243 CGff3Reader::xProcessData(
244     const TReaderData& readerData,
245     CSeq_annot& annot)
246 //  ----------------------------------------------------------------------------
247 {
248     for (const auto& lineData: readerData) {
249         const auto& line = lineData.mData;
250         if (xParseStructuredComment(line)  &&
251                 !NStr::StartsWith(line, "##sequence-region") ) {
252             continue;
253         }
254         if (xParseBrowserLine(line, annot)) {
255             continue;
256         }
257         if (xParseFeature(line, annot, nullptr)) {
258             continue;
259         }
260     }
261 }
262 
263 //  ----------------------------------------------------------------------------
xProcessAlignmentData(CSeq_annot & annot)264 void CGff3Reader::xProcessAlignmentData(
265     CSeq_annot& annot)
266 //  ----------------------------------------------------------------------------
267 {
268     for (const string& id : mAlignmentData.mIds) {
269         CRef<CSeq_align> pAlign = Ref(new CSeq_align());
270         if (x_MergeAlignments(mAlignmentData.mAlignments.at(id), pAlign)) {
271             // if available, add current browser information
272             if ( m_CurrentBrowserInfo ) {
273                 annot.SetDesc().Set().push_back( m_CurrentBrowserInfo );
274             }
275 
276             annot.SetNameDesc("alignments");
277 
278             if ( !m_AnnotTitle.empty() ) {
279                 annot.SetTitleDesc(m_AnnotTitle);
280             }
281             // Add alignment
282             annot.SetData().SetAlign().push_back(pAlign);
283         }
284     }
285 }
286 
287 //  ----------------------------------------------------------------------------
288 bool
xParseFeature(const string & line,CSeq_annot & annot,ILineErrorListener * pEC)289 CGff3Reader::xParseFeature(
290     const string& line,
291     CSeq_annot& annot,
292     ILineErrorListener* pEC)
293 //  ----------------------------------------------------------------------------
294 {
295     if (CGff2Reader::IsAlignmentData(line)) {
296         return xParseAlignment(line);
297     }
298 
299     //parse record:
300     shared_ptr<CGff3ReadRecord> pRecord(x_CreateRecord());
301     try {
302         if (!pRecord->AssignFromGff(line)) {
303             return false;
304         }
305     }
306     catch(CObjReaderLineException& err) {
307         ProcessError(err, pEC);
308         return false;
309     }
310 
311     //make sure we are interested:
312     if (xIsIgnoredFeatureType(pRecord->Type())) {
313         return true;
314     }
315     if (xIsIgnoredFeatureId(pRecord->Id())) {
316         return true;
317     }
318 
319     //no support for multiparented features in genbank mode:
320     if (this->IsInGenbankMode()  &&  pRecord->IsMultiParent()) {
321         AutoPtr<CObjReaderLineException> pErr(
322             CObjReaderLineException::Create(
323             eDiag_Fatal,
324             0,
325             "Multiparented features are not supported in Genbank mode"));
326         ProcessError(*pErr, pEC);
327     }
328 
329     //append feature to annot:
330     if (!xUpdateAnnotFeature(*pRecord, annot, pEC)) {
331         return false;
332     }
333 
334     ++mCurrentFeatureCount;
335     mParsingAlignment = false;
336     return true;
337 }
338 
339 
340 //  ----------------------------------------------------------------------------
xParseAlignment(const string & strLine)341 bool CGff3Reader::xParseAlignment(
342     const string& strLine)
343 //  ----------------------------------------------------------------------------
344 {
345     if (IsInGenbankMode()) {
346         return true;
347     }
348     auto& ids = mAlignmentData.mIds;
349     auto& alignments = mAlignmentData.mAlignments;
350 
351     unique_ptr<CGff2Record> pRecord(x_CreateRecord());
352 
353     if ( !pRecord->AssignFromGff(strLine) ) {
354         return false;
355     }
356 
357     string id;
358     if ( !pRecord->GetAttribute("ID", id) ) {
359         id = pRecord->Id();
360     }
361 
362     if (alignments.find(id) == alignments.end()) {
363        ids.push_back(id);
364     }
365 
366     CRef<CSeq_align> alignment;
367     if (!x_CreateAlignment(*pRecord, alignment)) {
368         return false;
369     }
370 
371     alignments[id].push_back(alignment);
372 
373     ++mCurrentFeatureCount;
374     mParsingAlignment = true;
375     return true;
376 }
377 
378 //  ----------------------------------------------------------------------------
xUpdateAnnotFeature(const CGff2Record & gffRecord,CSeq_annot & annot,ILineErrorListener * pEC)379 bool CGff3Reader::xUpdateAnnotFeature(
380     const CGff2Record& gffRecord,
381     CSeq_annot& annot,
382     ILineErrorListener* pEC)
383 //  ----------------------------------------------------------------------------
384 {
385     //if (gffRecord.Type() == "CDS") {
386     //    if (gffRecord.SeqStart() == 114392786) {
387     //        cerr << "";
388     //    }
389     //}
390 
391     mpLocations->AddRecord(gffRecord);
392 
393     CRef< CSeq_feat > pFeature(new CSeq_feat);
394 
395     auto recType = gffRecord.NormalizedType();
396     if (recType == "exon" || recType == "five_prime_utr" || recType == "three_prime_utr") {
397         return xUpdateAnnotExon(gffRecord, pFeature, annot, pEC);
398     }
399     if (recType == "cds") {
400         return xUpdateAnnotCds(gffRecord, pFeature, annot, pEC);
401     }
402     if (recType == "gene") {
403          return xUpdateAnnotGene(gffRecord, pFeature, annot, pEC);
404     }
405     if (recType == "mrna") {
406         return xUpdateAnnotMrna(gffRecord, pFeature, annot, pEC);
407     }
408     if (recType == "region") {
409         return xUpdateAnnotRegion(gffRecord, pFeature, annot, pEC);
410     }
411     if (!xUpdateAnnotGeneric(gffRecord, pFeature, annot, pEC)) {
412         return false;
413     }
414     return true;
415 }
416 
417 
418 //  ----------------------------------------------------------------------------
xVerifyExonLocation(const string & mrnaId,const CGff2Record & exon)419 void CGff3Reader::xVerifyExonLocation(
420     const string& mrnaId,
421     const CGff2Record& exon)
422 //  ----------------------------------------------------------------------------
423 {
424     map<string,CRef<CSeq_interval> >::const_iterator cit = mMrnaLocs.find(mrnaId);
425     if (cit == mMrnaLocs.end()) {
426         string message = "Bad data line: ";
427         message += exon.Type();
428         message += " referring to non-existent parent feature.";
429         CReaderMessage error(
430             eDiag_Error,
431             m_uLineNumber,
432             message);
433         throw error;
434     }
435     const CSeq_interval& containingInt = cit->second.GetObject();
436     const CRef<CSeq_loc> pContainedLoc = exon.GetSeqLoc(m_iFlags, mSeqIdResolve);
437     const CSeq_interval& containedInt = pContainedLoc->GetInt();
438     if (containedInt.GetFrom() < containingInt.GetFrom()  ||
439             containedInt.GetTo() > containingInt.GetTo()) {
440         string message = "Bad data line: ";
441         message += exon.Type();
442         message += " extends beyond parent feature.";
443         CReaderMessage error(
444             eDiag_Error,
445             m_uLineNumber,
446             message);
447         throw error;
448     }
449 }
450 
451 //  ----------------------------------------------------------------------------
xUpdateAnnotExon(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)452 bool CGff3Reader::xUpdateAnnotExon(
453     const CGff2Record& record,
454     CRef<CSeq_feat> pFeature,
455     CSeq_annot& annot,
456     ILineErrorListener* pEC)
457 //  ----------------------------------------------------------------------------
458 {
459     list<string> parents;
460     if (record.GetAttribute("Parent", parents)) {
461         for (list<string>::const_iterator it = parents.begin(); it != parents.end();
462                 ++it) {
463             const string& parentId = *it;
464             CRef<CSeq_feat> pParent;
465             if (!x_GetFeatureById(parentId, pParent)) {
466                 xAddPendingExon(parentId, record);
467                 return true;
468             }
469             if (pParent->GetData().IsRna()) {
470                 xVerifyExonLocation(parentId, record);
471             }
472             if (pParent->GetData().IsGene()) {
473                 if  (!xInitializeFeature(record, pFeature)) {
474                     return false;
475                 }
476                 return xAddFeatureToAnnot(pFeature, annot);
477             }
478             IdToFeatureMap::iterator fit = m_MapIdToFeature.find(parentId);
479             if (fit != m_MapIdToFeature.end()) {
480                 CRef<CSeq_feat> pParent = fit->second;
481                 if (!record.UpdateFeature(m_iFlags, pParent)) {
482                     return false;
483                 }
484             }
485         }
486     }
487     return true;
488 }
489 
490 
491 //  ----------------------------------------------------------------------------
xJoinLocationIntoRna(const CGff2Record & record,ILineErrorListener * pEC)492 bool CGff3Reader::xJoinLocationIntoRna(
493     const CGff2Record& record,
494     //CRef<CSeq_feat> pFeature,
495     //CSeq_annot& annot,
496     ILineErrorListener* pEC)
497 //  ----------------------------------------------------------------------------
498 {
499     list<string> parents;
500     if (!record.GetAttribute("Parent", parents)) {
501         return true;
502     }
503     for (list<string>::const_iterator it = parents.begin(); it != parents.end();
504             ++it) {
505         const string& parentId = *it;
506         CRef<CSeq_feat> pParent;
507         if (!x_GetFeatureById(parentId, pParent)) {
508             // Danger:
509             // We don't know whether the CDS parent is indeed an RNA and it could
510             //  possible be a gene.
511             // If the parent is indeed a gene then gene construction will have to
512             //  purge this pending exon (or it will cause a sanity check to fail
513             //  during post processing).
514             xAddPendingExon(parentId, record);
515             continue;
516         }
517         if (!pParent->GetData().IsRna()) {
518             continue;
519         }
520         xVerifyExonLocation(parentId, record);
521         if (!record.UpdateFeature(m_iFlags, pParent)) {
522             return false;
523         }
524     }
525     return true;
526 }
527 
528 
529 //  ----------------------------------------------------------------------------
xUpdateAnnotCds(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)530 bool CGff3Reader::xUpdateAnnotCds(
531     const CGff2Record& record,
532     CRef<CSeq_feat> pFeature,
533     CSeq_annot& annot,
534     ILineErrorListener* pEC)
535 //  ----------------------------------------------------------------------------
536 {
537     if (!xJoinLocationIntoRna(record, pEC)) {
538         return false;
539     }
540     xVerifyCdsParents(record);
541 
542     string cdsId = xMakeRecordId(record);
543     mpLocations->AddRecordForId(cdsId, record);
544 
545     auto pExistingFeature = m_MapIdToFeature.find(cdsId);
546     if (pExistingFeature != m_MapIdToFeature.end()) {
547         return true;
548     }
549 
550     m_MapIdToFeature[cdsId] = pFeature;
551     xInitializeFeature(record, pFeature);
552     xAddFeatureToAnnot(pFeature, annot);
553 
554     string parentId;
555     record.GetAttribute("Parent", parentId);
556     if (!parentId.empty()) {
557         xFeatureSetQualifier("Parent", parentId, pFeature);
558         xFeatureSetXrefParent(parentId, pFeature);
559         if (m_iFlags & fGeneXrefs) {
560             xFeatureSetXrefGrandParent(parentId, pFeature);
561         }
562     }
563     return true;
564 }
565 
566 
567 //  ----------------------------------------------------------------------------
xFeatureSetXrefGrandParent(const string & parent,CRef<CSeq_feat> pFeature)568 bool CGff3Reader::xFeatureSetXrefGrandParent(
569     const string& parent,
570     CRef<CSeq_feat> pFeature)
571 //  ----------------------------------------------------------------------------
572 {
573     IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
574     if (it == m_MapIdToFeature.end()) {
575         return false;
576     }
577     CRef<CSeq_feat> pParent = it->second;
578     const string &grandParentsStr = pParent->GetNamedQual("Parent");
579     list<string> grandParents;
580     NStr::Split(grandParentsStr, ",", grandParents, 0);
581     for (list<string>::const_iterator gpcit = grandParents.begin();
582             gpcit != grandParents.end(); ++gpcit) {
583         IdToFeatureMap::iterator gpit = m_MapIdToFeature.find(*gpcit);
584         if (gpit == m_MapIdToFeature.end()) {
585             return false;
586         }
587         CRef<CSeq_feat> pGrandParent = gpit->second;
588 
589         //xref grandchild->grandparent
590         CRef<CFeat_id> pGrandParentId(new CFeat_id);
591         pGrandParentId->Assign(pGrandParent->GetId());
592         CRef<CSeqFeatXref> pGrandParentXref(new CSeqFeatXref);
593         pGrandParentXref->SetId(*pGrandParentId);
594         pFeature->SetXref().push_back(pGrandParentXref);
595 
596         //xref grandparent->grandchild
597         CRef<CFeat_id> pGrandChildId(new CFeat_id);
598         pGrandChildId->Assign(pFeature->GetId());
599         CRef<CSeqFeatXref> pGrandChildXref(new CSeqFeatXref);
600         pGrandChildXref->SetId(*pGrandChildId);
601         pGrandParent->SetXref().push_back(pGrandChildXref);
602     }
603     return true;
604 }
605 
606 //  ----------------------------------------------------------------------------
xFeatureSetXrefParent(const string & parent,CRef<CSeq_feat> pChild)607 bool CGff3Reader::xFeatureSetXrefParent(
608     const string& parent,
609     CRef<CSeq_feat> pChild)
610 //  ----------------------------------------------------------------------------
611 {
612     IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
613     if (it == m_MapIdToFeature.end()) {
614         return false;
615     }
616     CRef<CSeq_feat> pParent = it->second;
617 
618     //xref child->parent
619     CRef<CFeat_id> pParentId(new CFeat_id);
620     pParentId->Assign(pParent->GetId());
621     CRef<CSeqFeatXref> pParentXref(new CSeqFeatXref);
622     pParentXref->SetId(*pParentId);
623     pChild->SetXref().push_back(pParentXref);
624 
625     //xref parent->child
626     CRef<CFeat_id> pChildId(new CFeat_id);
627     pChildId->Assign(pChild->GetId());
628     CRef<CSeqFeatXref> pChildXref(new CSeqFeatXref);
629     pChildXref->SetId(*pChildId);
630     pParent->SetXref().push_back(pChildXref);
631     return true;
632 }
633 
634 //  ----------------------------------------------------------------------------
xFindFeatureUnderConstruction(const CGff2Record & record,CRef<CSeq_feat> & underConstruction)635 bool CGff3Reader::xFindFeatureUnderConstruction(
636     const CGff2Record& record,
637     CRef<CSeq_feat>& underConstruction)
638 //  ----------------------------------------------------------------------------
639 {
640     string id;
641     if (!record.GetAttribute("ID", id)) {
642         return false;
643     }
644     IdToFeatureMap::iterator it = m_MapIdToFeature.find(id);
645     if (it == m_MapIdToFeature.end()) {
646         return false;
647     }
648 
649     CReaderMessage fatal(
650         eDiag_Fatal,
651         m_uLineNumber,
652         "Bad data line:  Duplicate feature ID \"" + id + "\".");
653     CSeq_feat tempFeat;
654     if (CSoMap::SoTypeToFeature(record.Type(), tempFeat)) {
655         if (it->second->GetData().GetSubtype() != tempFeat.GetData().GetSubtype()) {
656             throw fatal;
657         }
658     }
659 
660     underConstruction = it->second;
661     return true;
662 }
663 
664 //  ----------------------------------------------------------------------------
xUpdateAnnotGeneric(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)665 bool CGff3Reader::xUpdateAnnotGeneric(
666     const CGff2Record& record,
667     CRef<CSeq_feat> pFeature,
668     CSeq_annot& annot,
669     ILineErrorListener* pEC)
670 //  ----------------------------------------------------------------------------
671 {
672     CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
673     if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
674         return record.UpdateFeature(m_iFlags, pUnderConstruction);
675     }
676 
677     string featType = record.Type();
678     if (featType == "stop_codon_read_through"  ||  featType == "selenocysteine") {
679         string cdsParent;
680         if (!record.GetAttribute("Parent", cdsParent)) {
681             CReaderMessage error(
682                 eDiag_Error,
683                 m_uLineNumber,
684                 "Bad data line: Unassigned code break.");
685             throw error;
686         }
687         IdToFeatureMap::iterator it = m_MapIdToFeature.find(cdsParent);
688         if (it == m_MapIdToFeature.end()) {
689             CReaderMessage error(
690                 eDiag_Error,
691                 m_uLineNumber,
692                 "Bad data line: Code break assigned to missing feature.");
693             throw error;
694         }
695 
696         CRef<CCode_break> pCodeBreak(new CCode_break);
697         CSeq_interval& cbLoc = pCodeBreak->SetLoc().SetInt();
698         CRef< CSeq_id > pId = mSeqIdResolve(record.Id(), m_iFlags, true);
699         cbLoc.SetId(*pId);
700         cbLoc.SetFrom(static_cast<TSeqPos>(record.SeqStart()));
701         cbLoc.SetTo(static_cast<TSeqPos>(record.SeqStop()));
702         if (record.IsSetStrand()) {
703             cbLoc.SetStrand(record.Strand());
704         }
705         pCodeBreak->SetAa().SetNcbieaa(
706             (featType == "selenocysteine") ? 'U' : 'X');
707         CRef<CSeq_feat> pCds = it->second;
708         CCdregion& cdRegion = pCds->SetData().SetCdregion();
709         list< CRef< CCode_break > >& codeBreaks = cdRegion.SetCode_break();
710         codeBreaks.push_back(pCodeBreak);
711         return true;
712     }
713     if (!xInitializeFeature(record, pFeature)) {
714         return false;
715     }
716     if (! xAddFeatureToAnnot(pFeature, annot)) {
717         return false;
718     }
719     string strId;
720     if ( record.GetAttribute("ID", strId)) {
721         m_MapIdToFeature[strId] = pFeature;
722     }
723     if (pFeature->GetData().IsRna()  ||
724             pFeature->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_RNA) {
725         CRef<CSeq_interval> rnaLoc(new CSeq_interval);
726         rnaLoc->Assign(pFeature->GetLocation().GetInt());
727         mMrnaLocs[strId] = rnaLoc;
728     }
729     return true;
730 }
731 
732 //  ----------------------------------------------------------------------------
xUpdateAnnotMrna(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)733 bool CGff3Reader::xUpdateAnnotMrna(
734     const CGff2Record& record,
735     CRef<CSeq_feat> pFeature,
736     CSeq_annot& annot,
737     ILineErrorListener* pEC)
738 //  ----------------------------------------------------------------------------
739 {
740     CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
741     if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
742         return record.UpdateFeature(m_iFlags, pUnderConstruction);
743     }
744 
745     if (!xInitializeFeature(record, pFeature)) {
746         return false;
747     }
748     string parentsStr;
749     if ((m_iFlags & fGeneXrefs)  &&  record.GetAttribute("Parent", parentsStr)) {
750         list<string> parents;
751         NStr::Split(parentsStr, ",", parents, 0);
752         for (list<string>::const_iterator cit = parents.begin();
753                 cit != parents.end();
754                 ++cit) {
755             if (!xFeatureSetXrefParent(*cit, pFeature)) {
756                 CReaderMessage error(
757                     eDiag_Error,
758                     m_uLineNumber,
759                     "Bad data line: mRNA record with bad parent assignment.");
760                 throw error;
761             }
762         }
763     }
764 
765     string strId;
766     if ( record.GetAttribute("ID", strId)) {
767         m_MapIdToFeature[strId] = pFeature;
768     }
769     CRef<CSeq_interval> mrnaLoc(new CSeq_interval);
770     CSeq_loc::E_Choice choice = pFeature->GetLocation().Which();
771     if (choice != CSeq_loc::e_Int) {
772         CReaderMessage error(
773             eDiag_Error,
774             m_uLineNumber,
775             "Internal error: Unexpected location type.");
776         throw error;
777     }
778     mrnaLoc->Assign(pFeature->GetLocation().GetInt());
779     mMrnaLocs[strId] = mrnaLoc;
780 
781     list<CGff2Record> pendingExons;
782     xGetPendingExons(strId, pendingExons);
783     for (auto exonRecord: pendingExons) {
784         CRef< CSeq_feat > pFeature(new CSeq_feat);
785         xUpdateAnnotExon(exonRecord, pFeature, annot, pEC);
786     }
787     if (! xAddFeatureToAnnot(pFeature, annot)) {
788         return false;
789     }
790     return true;
791 }
792 
793 //  ----------------------------------------------------------------------------
xUpdateAnnotGene(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)794 bool CGff3Reader::xUpdateAnnotGene(
795     const CGff2Record& record,
796     CRef<CSeq_feat> pFeature,
797     CSeq_annot& annot,
798     ILineErrorListener* pEC)
799 //  ----------------------------------------------------------------------------
800 {
801     CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
802     if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
803         return record.UpdateFeature(m_iFlags, pUnderConstruction);
804     }
805 
806     if (!xInitializeFeature(record, pFeature)) {
807         return false;
808     }
809     if (! xAddFeatureToAnnot(pFeature, annot)) {
810         return false;
811     }
812     string strId;
813     if ( record.GetAttribute("ID", strId)) {
814         m_MapIdToFeature[strId] = pFeature;
815     }
816     // address corner case:
817     // parent of CDS is a gene but the DCS is listed before the gene so at the
818     //  time we did not know the parent would be a gene.
819     // remedy: throw out any collected cds locations that were meant for RNA
820     //  construction.
821     list<CGff2Record> pendingExons;
822     xGetPendingExons(strId, pendingExons);
823     return true;
824 }
825 
826 //  ----------------------------------------------------------------------------
xUpdateAnnotRegion(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)827 bool CGff3Reader::xUpdateAnnotRegion(
828     const CGff2Record& record,
829     CRef<CSeq_feat> pFeature,
830     CSeq_annot& annot,
831     ILineErrorListener* pEC)
832 //  ----------------------------------------------------------------------------
833 {
834     if (!record.InitializeFeature(m_iFlags, pFeature)) {
835         return false;
836     }
837 
838     if (! xAddFeatureToAnnot(pFeature, annot)) {
839         return false;
840     }
841     string strId;
842     if ( record.GetAttribute("ID", strId)) {
843         mIdToSeqIdMap[strId] = record.Id();
844         m_MapIdToFeature[strId] = pFeature;
845     }
846     return true;
847 }
848 
849 //  ----------------------------------------------------------------------------
xAddFeatureToAnnot(CRef<CSeq_feat> pFeature,CSeq_annot & annot)850 bool CGff3Reader::xAddFeatureToAnnot(
851     CRef< CSeq_feat > pFeature,
852     CSeq_annot& annot )
853 //  ----------------------------------------------------------------------------
854 {
855     annot.SetData().SetFtable().push_back( pFeature ) ;
856     return true;
857 }
858 
859 //  ----------------------------------------------------------------------------
xVerifyCdsParents(const CGff2Record & record)860 void CGff3Reader::xVerifyCdsParents(
861     const CGff2Record& record)
862 //  ----------------------------------------------------------------------------
863 {
864     string id;
865     string parents;
866     if (!record.GetAttribute("ID", id)) {
867         return;
868     }
869     record.GetAttribute("Parent", parents);
870     map<string, string>::iterator it = mCdsParentMap.find(id);
871     if (it == mCdsParentMap.end()) {
872         mCdsParentMap[id] = parents;
873         return;
874     }
875     if (it->second == parents) {
876         return;
877     }
878     CReaderMessage error(
879         eDiag_Error,
880         m_uLineNumber,
881         "Bad data line: CDS record with bad parent assignments.");
882     throw error;
883 }
884 
885 //  ----------------------------------------------------------------------------
xReadInit()886 bool CGff3Reader::xReadInit()
887 //  ----------------------------------------------------------------------------
888 {
889     if (!CGff2Reader::xReadInit()) {
890         return false;
891     }
892     mCdsParentMap.clear();
893     return true;
894 }
895 
896 //  ----------------------------------------------------------------------------
xIsIgnoredFeatureType(const string & featureType)897 bool CGff3Reader::xIsIgnoredFeatureType(
898     const string& featureType)
899 //  ----------------------------------------------------------------------------
900 {
901     typedef CStaticArraySet<string, PNocase> STRINGARRAY;
902 
903     string ftype(CSoMap::ResolveSoAlias(featureType));
904 
905     static const char* const ignoredTypesAlways_[] = {
906         "protein",
907         "start_codon", // also part of a cds feature
908         "stop_codon", // in GFF3, also part of a cds feature
909     };
910     DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesAlways, ignoredTypesAlways_);
911     STRINGARRAY::const_iterator cit = ignoredTypesAlways.find(ftype);
912     if (cit != ignoredTypesAlways.end()) {
913         return true;
914     }
915     if (!IsInGenbankMode()) {
916         return false;
917     }
918 
919     /* -genbank mode:*/
920     static const char* const specialTypesGenbank_[] = {
921         "antisense_RNA",
922         "autocatalytically_spliced_intron",
923         "guide_RNA",
924         "hammerhead_ribozyme",
925         "lnc_RNA",
926         "miRNA",
927         "piRNA",
928         "rasiRNA",
929         "ribozyme",
930         "RNase_MRP_RNA",
931         "RNase_P_RNA",
932         "scRNA",
933         "selenocysteine",
934         "siRNA",
935         "snoRNA",
936         "snRNA",
937         "SRP_RNA",
938         "stop_codon_read_through",
939         "telomerase_RNA",
940         "vault_RNA",
941         "Y_RNA"
942     };
943     DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, specialTypesGenbank, specialTypesGenbank_);
944 
945     static const char* const ignoredTypesGenbank_[] = {
946         "apicoplast_chromosome",
947         "assembly",
948         "cDNA_match",
949         "chloroplast_chromosome",
950         "chromoplast_chromosome",
951         "chromosome",
952         "contig",
953         "cyanelle_chromosome",
954         "dna_chromosome",
955         "EST_match",
956         "expressed_sequence_match",
957         "intron",
958         "leucoplast_chromosome",
959         "macronuclear_chromosome",
960         "match",
961         "match_part",
962         "micronuclear_chromosome",
963         "mitochondrial_chromosome",
964         "nuclear_chromosome",
965         "nucleomorphic_chromosome",
966         "nucleotide_motif",
967         "nucleotide_to_protein_match",
968         "partial_genomic_sequence_assembly",
969         "protein_match",
970         "replicon",
971         "rna_chromosome",
972         "sequence_assembly",
973         "supercontig",
974         "translated_nucleotide_match",
975         "ultracontig",
976     };
977     DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesGenbank, ignoredTypesGenbank_);
978 
979     cit = specialTypesGenbank.find(ftype);
980     if (cit != specialTypesGenbank.end()) {
981         return false;
982     }
983 
984     cit = ignoredTypesGenbank.find(ftype);
985     if (cit != ignoredTypesGenbank.end()) {
986         return true;
987     }
988 
989     return false;
990 }
991 
992 //  ----------------------------------------------------------------------------
993 bool
xInitializeFeature(const CGff2Record & record,CRef<CSeq_feat> pFeature)994 CGff3Reader::xInitializeFeature(
995     const CGff2Record& record,
996     CRef<CSeq_feat> pFeature)
997 //  ----------------------------------------------------------------------------
998 {
999     if (!record.InitializeFeature(m_iFlags, pFeature)) {
1000         return false;
1001     }
1002     const auto& attrs = record.Attributes();
1003     const auto it = attrs.find("ID");
1004     if (it != attrs.end()) {
1005         mIdToSeqIdMap[it->second] = record.Id();
1006     }
1007     return true;
1008 }
1009 
1010 //  ----------------------------------------------------------------------------
1011 void
xAddPendingExon(const string & rnaId,const CGff2Record & exonRecord)1012 CGff3Reader::xAddPendingExon(
1013     const string& rnaId,
1014     const CGff2Record& exonRecord)
1015 //  ----------------------------------------------------------------------------
1016 {
1017     PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1018     if (it == mPendingExons.end()) {
1019         mPendingExons[rnaId] = list<CGff2Record>();
1020     }
1021     mPendingExons[rnaId].push_back(exonRecord);
1022 }
1023 
1024 //  ----------------------------------------------------------------------------
1025 void
xGetPendingExons(const string & rnaId,list<CGff2Record> & pendingExons)1026 CGff3Reader::xGetPendingExons(
1027     const string& rnaId,
1028     list<CGff2Record>& pendingExons)
1029 //  ----------------------------------------------------------------------------
1030 {
1031     PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1032     if (it == mPendingExons.end()) {
1033         return;
1034     }
1035     pendingExons.swap(mPendingExons[rnaId]);
1036     mPendingExons.erase(rnaId);
1037 }
1038 
1039 //  ----------------------------------------------------------------------------
xPostProcessAnnot(CSeq_annot & annot)1040 void CGff3Reader::xPostProcessAnnot(
1041     CSeq_annot& annot)
1042 //  ----------------------------------------------------------------------------
1043 {
1044     if (mAlignmentData) {
1045         xProcessAlignmentData(annot);
1046         return;
1047     }
1048     if (!mCurrentFeatureCount) {
1049         return;
1050     }
1051 
1052     for (const auto& it: mPendingExons) {
1053         CReaderMessage warning(
1054             eDiag_Warning,
1055             m_uLineNumber,
1056             "Bad data line: Record references non-existent Parent=" + it.first);
1057         m_pMessageHandler->Report(warning);
1058     }
1059 
1060     //location fixup:
1061     for (auto itLocation: mpLocations->LocationMap()) {
1062         auto id = itLocation.first;
1063         auto itFeature = m_MapIdToFeature.find(id);
1064         if (itFeature == m_MapIdToFeature.end()) {
1065             continue;
1066         }
1067         CRef<CSeq_loc> pNewLoc(new CSeq_loc);
1068         CCdregion::EFrame frame;
1069         mpLocations->MergeLocation(pNewLoc, frame, itLocation.second);
1070         CRef<CSeq_feat> pFeature = itFeature->second;
1071         pFeature->SetLocation(*pNewLoc);
1072         if (pFeature->GetData().IsCdregion()) {
1073             auto& cdrData = pFeature->SetData().SetCdregion();
1074             cdrData.SetFrame(
1075                 frame == CCdregion::eFrame_not_set ? CCdregion::eFrame_one : frame);
1076         }
1077     }
1078 
1079     return CGff2Reader::xPostProcessAnnot(annot);
1080 }
1081 
1082 //  ----------------------------------------------------------------------------
xProcessSequenceRegionPragma(const string & pragma)1083 void CGff3Reader::xProcessSequenceRegionPragma(
1084     const string& pragma)
1085 //  ----------------------------------------------------------------------------
1086 {
1087     TSeqPos sequenceSize(0);
1088     vector<string> tokens;
1089     NStr::Split(pragma, " \t", tokens, NStr::fSplit_MergeDelimiters);
1090     if (tokens.size() < 2) {
1091         CReaderMessage warning(
1092             ncbi::eDiag_Warning,
1093             m_uLineNumber,
1094             "Bad sequence-region pragma - ignored.");
1095         throw warning;
1096     }
1097     if (tokens.size() >= 4) {
1098         try {
1099             sequenceSize = NStr::StringToNonNegativeInt(tokens[3]);
1100         }
1101         catch(exception&) {
1102             CReaderMessage warning(
1103                 ncbi::eDiag_Warning,
1104                 m_uLineNumber,
1105                 "Bad sequence-region pragma - ignored.");
1106             throw warning;
1107         }
1108     }
1109     mpLocations->SetSequenceSize(tokens[1], sequenceSize);
1110     auto resolvedId = mSeqIdResolve(tokens[1], m_iFlags, true)->AsFastaString();
1111     mpLocations->SetSequenceSize(resolvedId, sequenceSize);
1112 
1113 }
1114 
1115 //  ----------------------------------------------------------------------------
SequenceSize() const1116 TSeqPos CGff3Reader::SequenceSize() const
1117 //  ----------------------------------------------------------------------------
1118 {
1119     return mpLocations->SequenceSize();
1120 }
1121 
1122 //  ----------------------------------------------------------------------------
GetSequenceSize(const string & seqId) const1123 TSeqPos CGff3Reader::GetSequenceSize(
1124     const string& seqId) const
1125 //  ----------------------------------------------------------------------------
1126 {
1127     return mpLocations->GetSequenceSize(seqId);
1128 }
1129 
1130 END_objects_SCOPE
1131 END_NCBI_SCOPE
1132