1 /* $Id: gff3_reader.cpp 632532 2021-06-02 17:25:46Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description:
29 * GFF3 file reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35
36 #include <util/line_reader.hpp>
37
38
39 #include <objects/general/Object_id.hpp>
40 #include <objects/general/User_object.hpp>
41
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqloc/Seq_point.hpp>
46
47 #include <objects/seq/Seq_annot.hpp>
48 #include <objects/seq/Annot_id.hpp>
49 #include <objects/seq/Annot_descr.hpp>
50 #include <objects/seq/so_map.hpp>
51 #include <objects/seqfeat/SeqFeatXref.hpp>
52
53 #include <objects/seqfeat/Seq_feat.hpp>
54 #include <objects/seqfeat/Code_break.hpp>
55 #include <objects/seqfeat/Genetic_code.hpp>
56 #include <objects/seqfeat/RNA_ref.hpp>
57 #include <objects/seqfeat/Feat_id.hpp>
58
59 #include <objtools/readers/gff3_reader.hpp>
60 #include "gff3_location_merger.hpp"
61
62 #include "reader_message_handler.hpp"
63
64 #include <algorithm>
65
66 BEGIN_NCBI_SCOPE
67 BEGIN_objects_SCOPE
68
69 unsigned int CGff3Reader::msGenericIdCounter = 0;
70
71 // ----------------------------------------------------------------------------
xMakeRecordId(const CGff2Record & record)72 string CGff3Reader::xMakeRecordId(
73 const CGff2Record& record)
74 // ----------------------------------------------------------------------------
75 {
76 string id, parentId;
77 record.GetAttribute("ID", id);
78 record.GetAttribute("Parent", parentId);
79
80 auto recordType = record.NormalizedType();
81 if (recordType == "cds") {
82 string cdsId = parentId;
83 if (cdsId.empty()) {
84 cdsId = (id.empty() ? xNextGenericId() : id);
85 }
86 else {
87 cdsId += ":cds";
88 }
89 return cdsId;
90 }
91 if (id.empty()) {
92 return xNextGenericId();
93 }
94 return id;
95 }
96
97 // ----------------------------------------------------------------------------
xNextGenericId()98 string CGff3Reader::xNextGenericId()
99 // ----------------------------------------------------------------------------
100 {
101 return string("generic") + NStr::IntToString(msGenericIdCounter++);
102 }
103
104 // ----------------------------------------------------------------------------
AssignFromGff(const string & strRawInput)105 bool CGff3ReadRecord::AssignFromGff(
106 const string& strRawInput )
107 // ----------------------------------------------------------------------------
108 {
109 if (!CGff2Record::AssignFromGff(strRawInput)) {
110 return false;
111 }
112 string id, parent;
113 GetAttribute("ID", id);
114 GetAttribute("Parent", parent);
115 if (id.empty() && parent.empty()) {
116 m_Attributes["ID"] = CGff3Reader::xNextGenericId();
117 }
118 if (m_strType == "pseudogene") {
119 SetType("gene");
120 m_Attributes["pseudo"] = "true";
121 return true;
122 }
123 if (m_strType == "pseudogenic_transcript") {
124 SetType("transcript");
125 m_Attributes["pseudo"] = "true";
126 return true;
127 }
128 if (m_strType == "pseudogenic_tRNA") {
129 SetType("tRNA");
130 m_Attributes["pseudo"] = "true";
131 return true;
132 }
133 if (m_strType == "pseudogenic_rRNA") {
134 SetType("rRNA");
135 m_Attributes["pseudo"] = "true";
136 return true;
137 }
138 if (m_strType == "pseudogenic_exon") {
139 SetType("exon");
140 return true;
141 }
142 if (m_strType == "pseudogenic_CDS") {
143 SetType("CDS");
144 m_Attributes["pseudo"] = "true";
145 return true;
146 }
147 if (m_strType == "transcript") {
148 SetType("misc_RNA");
149 return true;
150 }
151 return true;
152 }
153
154 // ----------------------------------------------------------------------------
x_NormalizedAttributeKey(const string & strRawKey)155 string CGff3ReadRecord::x_NormalizedAttributeKey(
156 const string& strRawKey )
157 // ---------------------------------------------------------------------------
158 {
159 string strKey = CGff2Record::xNormalizedAttributeKey( strRawKey );
160 if ( 0 == NStr::CompareNocase( strRawKey, "ID" ) ) {
161 return "ID";
162 }
163 if ( 0 == NStr::CompareNocase( strKey, "Name" ) ) {
164 return "Name";
165 }
166 if ( 0 == NStr::CompareNocase( strKey, "Alias" ) ) {
167 return "Alias";
168 }
169 if ( 0 == NStr::CompareNocase( strKey, "Parent" ) ) {
170 return "Parent";
171 }
172 if ( 0 == NStr::CompareNocase( strKey, "Target" ) ) {
173 return "Target";
174 }
175 if ( 0 == NStr::CompareNocase( strKey, "Gap" ) ) {
176 return "Gap";
177 }
178 if ( 0 == NStr::CompareNocase( strKey, "Derives_from" ) ) {
179 return "Derives_from";
180 }
181 if ( 0 == NStr::CompareNocase( strKey, "Note" ) ) {
182 return "Note";
183 }
184 if ( 0 == NStr::CompareNocase( strKey, "Dbxref" ) ||
185 0 == NStr::CompareNocase( strKey, "Db_xref" ) ) {
186 return "Dbxref";
187 }
188 if ( 0 == NStr::CompareNocase( strKey, "Ontology_term" ) ) {
189 return "Ontology_term";
190 }
191 return strKey;
192 }
193
194 // ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,const string & name,const string & title,SeqIdResolver resolver,CReaderListener * pRL)195 CGff3Reader::CGff3Reader(
196 unsigned int uFlags,
197 const string& name,
198 const string& title,
199 SeqIdResolver resolver,
200 CReaderListener* pRL):
201 // ----------------------------------------------------------------------------
202 CGff2Reader( uFlags, name, title, resolver, pRL )
203 {
204 mpLocations.reset(new CGff3LocationMerger(uFlags, resolver, 0));
205 CGff2Record::ResetId();
206 }
207
208 // ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,CReaderListener * pRL)209 CGff3Reader::CGff3Reader(
210 unsigned int uFlags,
211 CReaderListener* pRL):
212 // ----------------------------------------------------------------------------
213 CGff3Reader(uFlags, "", "", CReadUtil::AsSeqId, pRL)
214 {
215 }
216
217 // ----------------------------------------------------------------------------
~CGff3Reader()218 CGff3Reader::~CGff3Reader()
219 // ----------------------------------------------------------------------------
220 {
221 }
222
223 // ----------------------------------------------------------------------------
224 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)225 CGff3Reader::ReadSeqAnnot(
226 ILineReader& lr,
227 ILineErrorListener* pEC )
228 // ----------------------------------------------------------------------------
229 {
230 mCurrentFeatureCount = 0;
231 mParsingAlignment = false;
232 mAlignmentData.Reset();
233 mpLocations->Reset();
234 auto pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
235 if (pAnnot && pAnnot->GetData().Which() == CSeq_annot::TData::e_not_set) {
236 return CRef<CSeq_annot>();
237 }
238 return pAnnot;
239 }
240
241 // ----------------------------------------------------------------------------
242 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)243 CGff3Reader::xProcessData(
244 const TReaderData& readerData,
245 CSeq_annot& annot)
246 // ----------------------------------------------------------------------------
247 {
248 for (const auto& lineData: readerData) {
249 const auto& line = lineData.mData;
250 if (xParseStructuredComment(line) &&
251 !NStr::StartsWith(line, "##sequence-region") ) {
252 continue;
253 }
254 if (xParseBrowserLine(line, annot)) {
255 continue;
256 }
257 if (xParseFeature(line, annot, nullptr)) {
258 continue;
259 }
260 }
261 }
262
263 // ----------------------------------------------------------------------------
xProcessAlignmentData(CSeq_annot & annot)264 void CGff3Reader::xProcessAlignmentData(
265 CSeq_annot& annot)
266 // ----------------------------------------------------------------------------
267 {
268 for (const string& id : mAlignmentData.mIds) {
269 CRef<CSeq_align> pAlign = Ref(new CSeq_align());
270 if (x_MergeAlignments(mAlignmentData.mAlignments.at(id), pAlign)) {
271 // if available, add current browser information
272 if ( m_CurrentBrowserInfo ) {
273 annot.SetDesc().Set().push_back( m_CurrentBrowserInfo );
274 }
275
276 annot.SetNameDesc("alignments");
277
278 if ( !m_AnnotTitle.empty() ) {
279 annot.SetTitleDesc(m_AnnotTitle);
280 }
281 // Add alignment
282 annot.SetData().SetAlign().push_back(pAlign);
283 }
284 }
285 }
286
287 // ----------------------------------------------------------------------------
288 bool
xParseFeature(const string & line,CSeq_annot & annot,ILineErrorListener * pEC)289 CGff3Reader::xParseFeature(
290 const string& line,
291 CSeq_annot& annot,
292 ILineErrorListener* pEC)
293 // ----------------------------------------------------------------------------
294 {
295 if (CGff2Reader::IsAlignmentData(line)) {
296 return xParseAlignment(line);
297 }
298
299 //parse record:
300 shared_ptr<CGff3ReadRecord> pRecord(x_CreateRecord());
301 try {
302 if (!pRecord->AssignFromGff(line)) {
303 return false;
304 }
305 }
306 catch(CObjReaderLineException& err) {
307 ProcessError(err, pEC);
308 return false;
309 }
310
311 //make sure we are interested:
312 if (xIsIgnoredFeatureType(pRecord->Type())) {
313 return true;
314 }
315 if (xIsIgnoredFeatureId(pRecord->Id())) {
316 return true;
317 }
318
319 //no support for multiparented features in genbank mode:
320 if (this->IsInGenbankMode() && pRecord->IsMultiParent()) {
321 AutoPtr<CObjReaderLineException> pErr(
322 CObjReaderLineException::Create(
323 eDiag_Fatal,
324 0,
325 "Multiparented features are not supported in Genbank mode"));
326 ProcessError(*pErr, pEC);
327 }
328
329 //append feature to annot:
330 if (!xUpdateAnnotFeature(*pRecord, annot, pEC)) {
331 return false;
332 }
333
334 ++mCurrentFeatureCount;
335 mParsingAlignment = false;
336 return true;
337 }
338
339
340 // ----------------------------------------------------------------------------
xParseAlignment(const string & strLine)341 bool CGff3Reader::xParseAlignment(
342 const string& strLine)
343 // ----------------------------------------------------------------------------
344 {
345 if (IsInGenbankMode()) {
346 return true;
347 }
348 auto& ids = mAlignmentData.mIds;
349 auto& alignments = mAlignmentData.mAlignments;
350
351 unique_ptr<CGff2Record> pRecord(x_CreateRecord());
352
353 if ( !pRecord->AssignFromGff(strLine) ) {
354 return false;
355 }
356
357 string id;
358 if ( !pRecord->GetAttribute("ID", id) ) {
359 id = pRecord->Id();
360 }
361
362 if (alignments.find(id) == alignments.end()) {
363 ids.push_back(id);
364 }
365
366 CRef<CSeq_align> alignment;
367 if (!x_CreateAlignment(*pRecord, alignment)) {
368 return false;
369 }
370
371 alignments[id].push_back(alignment);
372
373 ++mCurrentFeatureCount;
374 mParsingAlignment = true;
375 return true;
376 }
377
378 // ----------------------------------------------------------------------------
xUpdateAnnotFeature(const CGff2Record & gffRecord,CSeq_annot & annot,ILineErrorListener * pEC)379 bool CGff3Reader::xUpdateAnnotFeature(
380 const CGff2Record& gffRecord,
381 CSeq_annot& annot,
382 ILineErrorListener* pEC)
383 // ----------------------------------------------------------------------------
384 {
385 //if (gffRecord.Type() == "CDS") {
386 // if (gffRecord.SeqStart() == 114392786) {
387 // cerr << "";
388 // }
389 //}
390
391 mpLocations->AddRecord(gffRecord);
392
393 CRef< CSeq_feat > pFeature(new CSeq_feat);
394
395 auto recType = gffRecord.NormalizedType();
396 if (recType == "exon" || recType == "five_prime_utr" || recType == "three_prime_utr") {
397 return xUpdateAnnotExon(gffRecord, pFeature, annot, pEC);
398 }
399 if (recType == "cds") {
400 return xUpdateAnnotCds(gffRecord, pFeature, annot, pEC);
401 }
402 if (recType == "gene") {
403 return xUpdateAnnotGene(gffRecord, pFeature, annot, pEC);
404 }
405 if (recType == "mrna") {
406 return xUpdateAnnotMrna(gffRecord, pFeature, annot, pEC);
407 }
408 if (recType == "region") {
409 return xUpdateAnnotRegion(gffRecord, pFeature, annot, pEC);
410 }
411 if (!xUpdateAnnotGeneric(gffRecord, pFeature, annot, pEC)) {
412 return false;
413 }
414 return true;
415 }
416
417
418 // ----------------------------------------------------------------------------
xVerifyExonLocation(const string & mrnaId,const CGff2Record & exon)419 void CGff3Reader::xVerifyExonLocation(
420 const string& mrnaId,
421 const CGff2Record& exon)
422 // ----------------------------------------------------------------------------
423 {
424 map<string,CRef<CSeq_interval> >::const_iterator cit = mMrnaLocs.find(mrnaId);
425 if (cit == mMrnaLocs.end()) {
426 string message = "Bad data line: ";
427 message += exon.Type();
428 message += " referring to non-existent parent feature.";
429 CReaderMessage error(
430 eDiag_Error,
431 m_uLineNumber,
432 message);
433 throw error;
434 }
435 const CSeq_interval& containingInt = cit->second.GetObject();
436 const CRef<CSeq_loc> pContainedLoc = exon.GetSeqLoc(m_iFlags, mSeqIdResolve);
437 const CSeq_interval& containedInt = pContainedLoc->GetInt();
438 if (containedInt.GetFrom() < containingInt.GetFrom() ||
439 containedInt.GetTo() > containingInt.GetTo()) {
440 string message = "Bad data line: ";
441 message += exon.Type();
442 message += " extends beyond parent feature.";
443 CReaderMessage error(
444 eDiag_Error,
445 m_uLineNumber,
446 message);
447 throw error;
448 }
449 }
450
451 // ----------------------------------------------------------------------------
xUpdateAnnotExon(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)452 bool CGff3Reader::xUpdateAnnotExon(
453 const CGff2Record& record,
454 CRef<CSeq_feat> pFeature,
455 CSeq_annot& annot,
456 ILineErrorListener* pEC)
457 // ----------------------------------------------------------------------------
458 {
459 list<string> parents;
460 if (record.GetAttribute("Parent", parents)) {
461 for (list<string>::const_iterator it = parents.begin(); it != parents.end();
462 ++it) {
463 const string& parentId = *it;
464 CRef<CSeq_feat> pParent;
465 if (!x_GetFeatureById(parentId, pParent)) {
466 xAddPendingExon(parentId, record);
467 return true;
468 }
469 if (pParent->GetData().IsRna()) {
470 xVerifyExonLocation(parentId, record);
471 }
472 if (pParent->GetData().IsGene()) {
473 if (!xInitializeFeature(record, pFeature)) {
474 return false;
475 }
476 return xAddFeatureToAnnot(pFeature, annot);
477 }
478 IdToFeatureMap::iterator fit = m_MapIdToFeature.find(parentId);
479 if (fit != m_MapIdToFeature.end()) {
480 CRef<CSeq_feat> pParent = fit->second;
481 if (!record.UpdateFeature(m_iFlags, pParent)) {
482 return false;
483 }
484 }
485 }
486 }
487 return true;
488 }
489
490
491 // ----------------------------------------------------------------------------
xJoinLocationIntoRna(const CGff2Record & record,ILineErrorListener * pEC)492 bool CGff3Reader::xJoinLocationIntoRna(
493 const CGff2Record& record,
494 //CRef<CSeq_feat> pFeature,
495 //CSeq_annot& annot,
496 ILineErrorListener* pEC)
497 // ----------------------------------------------------------------------------
498 {
499 list<string> parents;
500 if (!record.GetAttribute("Parent", parents)) {
501 return true;
502 }
503 for (list<string>::const_iterator it = parents.begin(); it != parents.end();
504 ++it) {
505 const string& parentId = *it;
506 CRef<CSeq_feat> pParent;
507 if (!x_GetFeatureById(parentId, pParent)) {
508 // Danger:
509 // We don't know whether the CDS parent is indeed an RNA and it could
510 // possible be a gene.
511 // If the parent is indeed a gene then gene construction will have to
512 // purge this pending exon (or it will cause a sanity check to fail
513 // during post processing).
514 xAddPendingExon(parentId, record);
515 continue;
516 }
517 if (!pParent->GetData().IsRna()) {
518 continue;
519 }
520 xVerifyExonLocation(parentId, record);
521 if (!record.UpdateFeature(m_iFlags, pParent)) {
522 return false;
523 }
524 }
525 return true;
526 }
527
528
529 // ----------------------------------------------------------------------------
xUpdateAnnotCds(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)530 bool CGff3Reader::xUpdateAnnotCds(
531 const CGff2Record& record,
532 CRef<CSeq_feat> pFeature,
533 CSeq_annot& annot,
534 ILineErrorListener* pEC)
535 // ----------------------------------------------------------------------------
536 {
537 if (!xJoinLocationIntoRna(record, pEC)) {
538 return false;
539 }
540 xVerifyCdsParents(record);
541
542 string cdsId = xMakeRecordId(record);
543 mpLocations->AddRecordForId(cdsId, record);
544
545 auto pExistingFeature = m_MapIdToFeature.find(cdsId);
546 if (pExistingFeature != m_MapIdToFeature.end()) {
547 return true;
548 }
549
550 m_MapIdToFeature[cdsId] = pFeature;
551 xInitializeFeature(record, pFeature);
552 xAddFeatureToAnnot(pFeature, annot);
553
554 string parentId;
555 record.GetAttribute("Parent", parentId);
556 if (!parentId.empty()) {
557 xFeatureSetQualifier("Parent", parentId, pFeature);
558 xFeatureSetXrefParent(parentId, pFeature);
559 if (m_iFlags & fGeneXrefs) {
560 xFeatureSetXrefGrandParent(parentId, pFeature);
561 }
562 }
563 return true;
564 }
565
566
567 // ----------------------------------------------------------------------------
xFeatureSetXrefGrandParent(const string & parent,CRef<CSeq_feat> pFeature)568 bool CGff3Reader::xFeatureSetXrefGrandParent(
569 const string& parent,
570 CRef<CSeq_feat> pFeature)
571 // ----------------------------------------------------------------------------
572 {
573 IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
574 if (it == m_MapIdToFeature.end()) {
575 return false;
576 }
577 CRef<CSeq_feat> pParent = it->second;
578 const string &grandParentsStr = pParent->GetNamedQual("Parent");
579 list<string> grandParents;
580 NStr::Split(grandParentsStr, ",", grandParents, 0);
581 for (list<string>::const_iterator gpcit = grandParents.begin();
582 gpcit != grandParents.end(); ++gpcit) {
583 IdToFeatureMap::iterator gpit = m_MapIdToFeature.find(*gpcit);
584 if (gpit == m_MapIdToFeature.end()) {
585 return false;
586 }
587 CRef<CSeq_feat> pGrandParent = gpit->second;
588
589 //xref grandchild->grandparent
590 CRef<CFeat_id> pGrandParentId(new CFeat_id);
591 pGrandParentId->Assign(pGrandParent->GetId());
592 CRef<CSeqFeatXref> pGrandParentXref(new CSeqFeatXref);
593 pGrandParentXref->SetId(*pGrandParentId);
594 pFeature->SetXref().push_back(pGrandParentXref);
595
596 //xref grandparent->grandchild
597 CRef<CFeat_id> pGrandChildId(new CFeat_id);
598 pGrandChildId->Assign(pFeature->GetId());
599 CRef<CSeqFeatXref> pGrandChildXref(new CSeqFeatXref);
600 pGrandChildXref->SetId(*pGrandChildId);
601 pGrandParent->SetXref().push_back(pGrandChildXref);
602 }
603 return true;
604 }
605
606 // ----------------------------------------------------------------------------
xFeatureSetXrefParent(const string & parent,CRef<CSeq_feat> pChild)607 bool CGff3Reader::xFeatureSetXrefParent(
608 const string& parent,
609 CRef<CSeq_feat> pChild)
610 // ----------------------------------------------------------------------------
611 {
612 IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
613 if (it == m_MapIdToFeature.end()) {
614 return false;
615 }
616 CRef<CSeq_feat> pParent = it->second;
617
618 //xref child->parent
619 CRef<CFeat_id> pParentId(new CFeat_id);
620 pParentId->Assign(pParent->GetId());
621 CRef<CSeqFeatXref> pParentXref(new CSeqFeatXref);
622 pParentXref->SetId(*pParentId);
623 pChild->SetXref().push_back(pParentXref);
624
625 //xref parent->child
626 CRef<CFeat_id> pChildId(new CFeat_id);
627 pChildId->Assign(pChild->GetId());
628 CRef<CSeqFeatXref> pChildXref(new CSeqFeatXref);
629 pChildXref->SetId(*pChildId);
630 pParent->SetXref().push_back(pChildXref);
631 return true;
632 }
633
634 // ----------------------------------------------------------------------------
xFindFeatureUnderConstruction(const CGff2Record & record,CRef<CSeq_feat> & underConstruction)635 bool CGff3Reader::xFindFeatureUnderConstruction(
636 const CGff2Record& record,
637 CRef<CSeq_feat>& underConstruction)
638 // ----------------------------------------------------------------------------
639 {
640 string id;
641 if (!record.GetAttribute("ID", id)) {
642 return false;
643 }
644 IdToFeatureMap::iterator it = m_MapIdToFeature.find(id);
645 if (it == m_MapIdToFeature.end()) {
646 return false;
647 }
648
649 CReaderMessage fatal(
650 eDiag_Fatal,
651 m_uLineNumber,
652 "Bad data line: Duplicate feature ID \"" + id + "\".");
653 CSeq_feat tempFeat;
654 if (CSoMap::SoTypeToFeature(record.Type(), tempFeat)) {
655 if (it->second->GetData().GetSubtype() != tempFeat.GetData().GetSubtype()) {
656 throw fatal;
657 }
658 }
659
660 underConstruction = it->second;
661 return true;
662 }
663
664 // ----------------------------------------------------------------------------
xUpdateAnnotGeneric(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)665 bool CGff3Reader::xUpdateAnnotGeneric(
666 const CGff2Record& record,
667 CRef<CSeq_feat> pFeature,
668 CSeq_annot& annot,
669 ILineErrorListener* pEC)
670 // ----------------------------------------------------------------------------
671 {
672 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
673 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
674 return record.UpdateFeature(m_iFlags, pUnderConstruction);
675 }
676
677 string featType = record.Type();
678 if (featType == "stop_codon_read_through" || featType == "selenocysteine") {
679 string cdsParent;
680 if (!record.GetAttribute("Parent", cdsParent)) {
681 CReaderMessage error(
682 eDiag_Error,
683 m_uLineNumber,
684 "Bad data line: Unassigned code break.");
685 throw error;
686 }
687 IdToFeatureMap::iterator it = m_MapIdToFeature.find(cdsParent);
688 if (it == m_MapIdToFeature.end()) {
689 CReaderMessage error(
690 eDiag_Error,
691 m_uLineNumber,
692 "Bad data line: Code break assigned to missing feature.");
693 throw error;
694 }
695
696 CRef<CCode_break> pCodeBreak(new CCode_break);
697 CSeq_interval& cbLoc = pCodeBreak->SetLoc().SetInt();
698 CRef< CSeq_id > pId = mSeqIdResolve(record.Id(), m_iFlags, true);
699 cbLoc.SetId(*pId);
700 cbLoc.SetFrom(static_cast<TSeqPos>(record.SeqStart()));
701 cbLoc.SetTo(static_cast<TSeqPos>(record.SeqStop()));
702 if (record.IsSetStrand()) {
703 cbLoc.SetStrand(record.Strand());
704 }
705 pCodeBreak->SetAa().SetNcbieaa(
706 (featType == "selenocysteine") ? 'U' : 'X');
707 CRef<CSeq_feat> pCds = it->second;
708 CCdregion& cdRegion = pCds->SetData().SetCdregion();
709 list< CRef< CCode_break > >& codeBreaks = cdRegion.SetCode_break();
710 codeBreaks.push_back(pCodeBreak);
711 return true;
712 }
713 if (!xInitializeFeature(record, pFeature)) {
714 return false;
715 }
716 if (! xAddFeatureToAnnot(pFeature, annot)) {
717 return false;
718 }
719 string strId;
720 if ( record.GetAttribute("ID", strId)) {
721 m_MapIdToFeature[strId] = pFeature;
722 }
723 if (pFeature->GetData().IsRna() ||
724 pFeature->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_RNA) {
725 CRef<CSeq_interval> rnaLoc(new CSeq_interval);
726 rnaLoc->Assign(pFeature->GetLocation().GetInt());
727 mMrnaLocs[strId] = rnaLoc;
728 }
729 return true;
730 }
731
732 // ----------------------------------------------------------------------------
xUpdateAnnotMrna(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)733 bool CGff3Reader::xUpdateAnnotMrna(
734 const CGff2Record& record,
735 CRef<CSeq_feat> pFeature,
736 CSeq_annot& annot,
737 ILineErrorListener* pEC)
738 // ----------------------------------------------------------------------------
739 {
740 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
741 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
742 return record.UpdateFeature(m_iFlags, pUnderConstruction);
743 }
744
745 if (!xInitializeFeature(record, pFeature)) {
746 return false;
747 }
748 string parentsStr;
749 if ((m_iFlags & fGeneXrefs) && record.GetAttribute("Parent", parentsStr)) {
750 list<string> parents;
751 NStr::Split(parentsStr, ",", parents, 0);
752 for (list<string>::const_iterator cit = parents.begin();
753 cit != parents.end();
754 ++cit) {
755 if (!xFeatureSetXrefParent(*cit, pFeature)) {
756 CReaderMessage error(
757 eDiag_Error,
758 m_uLineNumber,
759 "Bad data line: mRNA record with bad parent assignment.");
760 throw error;
761 }
762 }
763 }
764
765 string strId;
766 if ( record.GetAttribute("ID", strId)) {
767 m_MapIdToFeature[strId] = pFeature;
768 }
769 CRef<CSeq_interval> mrnaLoc(new CSeq_interval);
770 CSeq_loc::E_Choice choice = pFeature->GetLocation().Which();
771 if (choice != CSeq_loc::e_Int) {
772 CReaderMessage error(
773 eDiag_Error,
774 m_uLineNumber,
775 "Internal error: Unexpected location type.");
776 throw error;
777 }
778 mrnaLoc->Assign(pFeature->GetLocation().GetInt());
779 mMrnaLocs[strId] = mrnaLoc;
780
781 list<CGff2Record> pendingExons;
782 xGetPendingExons(strId, pendingExons);
783 for (auto exonRecord: pendingExons) {
784 CRef< CSeq_feat > pFeature(new CSeq_feat);
785 xUpdateAnnotExon(exonRecord, pFeature, annot, pEC);
786 }
787 if (! xAddFeatureToAnnot(pFeature, annot)) {
788 return false;
789 }
790 return true;
791 }
792
793 // ----------------------------------------------------------------------------
xUpdateAnnotGene(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)794 bool CGff3Reader::xUpdateAnnotGene(
795 const CGff2Record& record,
796 CRef<CSeq_feat> pFeature,
797 CSeq_annot& annot,
798 ILineErrorListener* pEC)
799 // ----------------------------------------------------------------------------
800 {
801 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
802 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
803 return record.UpdateFeature(m_iFlags, pUnderConstruction);
804 }
805
806 if (!xInitializeFeature(record, pFeature)) {
807 return false;
808 }
809 if (! xAddFeatureToAnnot(pFeature, annot)) {
810 return false;
811 }
812 string strId;
813 if ( record.GetAttribute("ID", strId)) {
814 m_MapIdToFeature[strId] = pFeature;
815 }
816 // address corner case:
817 // parent of CDS is a gene but the DCS is listed before the gene so at the
818 // time we did not know the parent would be a gene.
819 // remedy: throw out any collected cds locations that were meant for RNA
820 // construction.
821 list<CGff2Record> pendingExons;
822 xGetPendingExons(strId, pendingExons);
823 return true;
824 }
825
826 // ----------------------------------------------------------------------------
xUpdateAnnotRegion(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)827 bool CGff3Reader::xUpdateAnnotRegion(
828 const CGff2Record& record,
829 CRef<CSeq_feat> pFeature,
830 CSeq_annot& annot,
831 ILineErrorListener* pEC)
832 // ----------------------------------------------------------------------------
833 {
834 if (!record.InitializeFeature(m_iFlags, pFeature)) {
835 return false;
836 }
837
838 if (! xAddFeatureToAnnot(pFeature, annot)) {
839 return false;
840 }
841 string strId;
842 if ( record.GetAttribute("ID", strId)) {
843 mIdToSeqIdMap[strId] = record.Id();
844 m_MapIdToFeature[strId] = pFeature;
845 }
846 return true;
847 }
848
849 // ----------------------------------------------------------------------------
xAddFeatureToAnnot(CRef<CSeq_feat> pFeature,CSeq_annot & annot)850 bool CGff3Reader::xAddFeatureToAnnot(
851 CRef< CSeq_feat > pFeature,
852 CSeq_annot& annot )
853 // ----------------------------------------------------------------------------
854 {
855 annot.SetData().SetFtable().push_back( pFeature ) ;
856 return true;
857 }
858
859 // ----------------------------------------------------------------------------
xVerifyCdsParents(const CGff2Record & record)860 void CGff3Reader::xVerifyCdsParents(
861 const CGff2Record& record)
862 // ----------------------------------------------------------------------------
863 {
864 string id;
865 string parents;
866 if (!record.GetAttribute("ID", id)) {
867 return;
868 }
869 record.GetAttribute("Parent", parents);
870 if (parents.empty()) {
871 CReaderMessage error(
872 eDiag_Error,
873 m_uLineNumber,
874 "Bad data line: CDS record is missing mandatory Parent attribute.");
875 throw error;
876 }
877 map<string, string>::iterator it = mCdsParentMap.find(id);
878 if (it == mCdsParentMap.end()) {
879 mCdsParentMap[id] = parents;
880 return;
881 }
882 if (it->second == parents) {
883 return;
884 }
885 CReaderMessage error(
886 eDiag_Error,
887 m_uLineNumber,
888 "Bad data line: CDS record with bad parent assignments.");
889 throw error;
890 }
891
892 // ----------------------------------------------------------------------------
xReadInit()893 bool CGff3Reader::xReadInit()
894 // ----------------------------------------------------------------------------
895 {
896 if (!CGff2Reader::xReadInit()) {
897 return false;
898 }
899 mCdsParentMap.clear();
900 return true;
901 }
902
903 // ----------------------------------------------------------------------------
xIsIgnoredFeatureType(const string & featureType)904 bool CGff3Reader::xIsIgnoredFeatureType(
905 const string& featureType)
906 // ----------------------------------------------------------------------------
907 {
908 typedef CStaticArraySet<string, PNocase> STRINGARRAY;
909
910 string ftype(CSoMap::ResolveSoAlias(featureType));
911
912 static const char* const ignoredTypesAlways_[] = {
913 "protein",
914 "start_codon", // also part of a cds feature
915 "stop_codon", // in GFF3, also part of a cds feature
916 };
917 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesAlways, ignoredTypesAlways_);
918 STRINGARRAY::const_iterator cit = ignoredTypesAlways.find(ftype);
919 if (cit != ignoredTypesAlways.end()) {
920 return true;
921 }
922 if (!IsInGenbankMode()) {
923 return false;
924 }
925
926 /* -genbank mode:*/
927 static const char* const specialTypesGenbank_[] = {
928 "antisense_RNA",
929 "autocatalytically_spliced_intron",
930 "guide_RNA",
931 "hammerhead_ribozyme",
932 "lnc_RNA",
933 "miRNA",
934 "piRNA",
935 "rasiRNA",
936 "ribozyme",
937 "RNase_MRP_RNA",
938 "RNase_P_RNA",
939 "scRNA",
940 "selenocysteine",
941 "siRNA",
942 "snoRNA",
943 "snRNA",
944 "SRP_RNA",
945 "stop_codon_read_through",
946 "telomerase_RNA",
947 "vault_RNA",
948 "Y_RNA"
949 };
950 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, specialTypesGenbank, specialTypesGenbank_);
951
952 static const char* const ignoredTypesGenbank_[] = {
953 "apicoplast_chromosome",
954 "assembly",
955 "cDNA_match",
956 "chloroplast_chromosome",
957 "chromoplast_chromosome",
958 "chromosome",
959 "contig",
960 "cyanelle_chromosome",
961 "dna_chromosome",
962 "EST_match",
963 "expressed_sequence_match",
964 "intron",
965 "leucoplast_chromosome",
966 "macronuclear_chromosome",
967 "match",
968 "match_part",
969 "micronuclear_chromosome",
970 "mitochondrial_chromosome",
971 "nuclear_chromosome",
972 "nucleomorphic_chromosome",
973 "nucleotide_motif",
974 "nucleotide_to_protein_match",
975 "partial_genomic_sequence_assembly",
976 "protein_match",
977 "replicon",
978 "rna_chromosome",
979 "sequence_assembly",
980 "supercontig",
981 "translated_nucleotide_match",
982 "ultracontig",
983 };
984 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesGenbank, ignoredTypesGenbank_);
985
986 cit = specialTypesGenbank.find(ftype);
987 if (cit != specialTypesGenbank.end()) {
988 return false;
989 }
990
991 cit = ignoredTypesGenbank.find(ftype);
992 if (cit != ignoredTypesGenbank.end()) {
993 return true;
994 }
995
996 return false;
997 }
998
999 // ----------------------------------------------------------------------------
1000 bool
xInitializeFeature(const CGff2Record & record,CRef<CSeq_feat> pFeature)1001 CGff3Reader::xInitializeFeature(
1002 const CGff2Record& record,
1003 CRef<CSeq_feat> pFeature)
1004 // ----------------------------------------------------------------------------
1005 {
1006 if (!record.InitializeFeature(m_iFlags, pFeature)) {
1007 return false;
1008 }
1009 const auto& attrs = record.Attributes();
1010 const auto it = attrs.find("ID");
1011 if (it != attrs.end()) {
1012 mIdToSeqIdMap[it->second] = record.Id();
1013 }
1014 return true;
1015 }
1016
1017 // ----------------------------------------------------------------------------
1018 void
xAddPendingExon(const string & rnaId,const CGff2Record & exonRecord)1019 CGff3Reader::xAddPendingExon(
1020 const string& rnaId,
1021 const CGff2Record& exonRecord)
1022 // ----------------------------------------------------------------------------
1023 {
1024 PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1025 if (it == mPendingExons.end()) {
1026 mPendingExons[rnaId] = list<CGff2Record>();
1027 }
1028 mPendingExons[rnaId].push_back(exonRecord);
1029 }
1030
1031 // ----------------------------------------------------------------------------
1032 void
xGetPendingExons(const string & rnaId,list<CGff2Record> & pendingExons)1033 CGff3Reader::xGetPendingExons(
1034 const string& rnaId,
1035 list<CGff2Record>& pendingExons)
1036 // ----------------------------------------------------------------------------
1037 {
1038 PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1039 if (it == mPendingExons.end()) {
1040 return;
1041 }
1042 pendingExons.swap(mPendingExons[rnaId]);
1043 mPendingExons.erase(rnaId);
1044 }
1045
1046 // ----------------------------------------------------------------------------
xPostProcessAnnot(CSeq_annot & annot)1047 void CGff3Reader::xPostProcessAnnot(
1048 CSeq_annot& annot)
1049 // ----------------------------------------------------------------------------
1050 {
1051 if (mAlignmentData) {
1052 xProcessAlignmentData(annot);
1053 return;
1054 }
1055 if (!mCurrentFeatureCount) {
1056 return;
1057 }
1058
1059 for (const auto& it: mPendingExons) {
1060 CReaderMessage warning(
1061 eDiag_Warning,
1062 m_uLineNumber,
1063 "Bad data line: Record references non-existent Parent=" + it.first);
1064 m_pMessageHandler->Report(warning);
1065 }
1066
1067 //location fixup:
1068 for (auto itLocation: mpLocations->LocationMap()) {
1069 auto id = itLocation.first;
1070 auto itFeature = m_MapIdToFeature.find(id);
1071 if (itFeature == m_MapIdToFeature.end()) {
1072 continue;
1073 }
1074 CRef<CSeq_loc> pNewLoc(new CSeq_loc);
1075 CCdregion::EFrame frame;
1076 mpLocations->MergeLocation(pNewLoc, frame, itLocation.second);
1077 CRef<CSeq_feat> pFeature = itFeature->second;
1078 pFeature->SetLocation(*pNewLoc);
1079 if (pFeature->GetData().IsCdregion()) {
1080 auto& cdrData = pFeature->SetData().SetCdregion();
1081 cdrData.SetFrame(
1082 frame == CCdregion::eFrame_not_set ? CCdregion::eFrame_one : frame);
1083 }
1084 }
1085
1086 return CGff2Reader::xPostProcessAnnot(annot);
1087 }
1088
1089 // ----------------------------------------------------------------------------
xProcessSequenceRegionPragma(const string & pragma)1090 void CGff3Reader::xProcessSequenceRegionPragma(
1091 const string& pragma)
1092 // ----------------------------------------------------------------------------
1093 {
1094 TSeqPos sequenceSize(0);
1095 vector<string> tokens;
1096 NStr::Split(pragma, " \t", tokens, NStr::fSplit_MergeDelimiters);
1097 if (tokens.size() < 2) {
1098 CReaderMessage warning(
1099 ncbi::eDiag_Warning,
1100 m_uLineNumber,
1101 "Bad sequence-region pragma - ignored.");
1102 throw warning;
1103 }
1104 if (tokens.size() >= 4) {
1105 try {
1106 sequenceSize = NStr::StringToNonNegativeInt(tokens[3]);
1107 }
1108 catch(exception&) {
1109 CReaderMessage warning(
1110 ncbi::eDiag_Warning,
1111 m_uLineNumber,
1112 "Bad sequence-region pragma - ignored.");
1113 throw warning;
1114 }
1115 }
1116 mpLocations->SetSequenceSize(tokens[1], sequenceSize);
1117 }
1118
1119 // ----------------------------------------------------------------------------
SequenceSize() const1120 TSeqPos CGff3Reader::SequenceSize() const
1121 // ----------------------------------------------------------------------------
1122 {
1123 return mpLocations->SequenceSize();
1124 }
1125
1126 END_objects_SCOPE
1127 END_NCBI_SCOPE
1128