1 /* $Id: gff3_reader.cpp 636167 2021-08-17 15:26:17Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description:
29 * GFF3 file reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35
36 #include <util/line_reader.hpp>
37
38
39 #include <objects/general/Object_id.hpp>
40 #include <objects/general/User_object.hpp>
41
42 #include <objects/seqloc/Seq_id.hpp>
43 #include <objects/seqloc/Seq_loc.hpp>
44 #include <objects/seqloc/Seq_interval.hpp>
45 #include <objects/seqloc/Seq_point.hpp>
46
47 #include <objects/seq/Seq_annot.hpp>
48 #include <objects/seq/Annot_id.hpp>
49 #include <objects/seq/Annot_descr.hpp>
50 #include <objects/seq/so_map.hpp>
51 #include <objects/seqfeat/SeqFeatXref.hpp>
52
53 #include <objects/seqfeat/Seq_feat.hpp>
54 #include <objects/seqfeat/Code_break.hpp>
55 #include <objects/seqfeat/Genetic_code.hpp>
56 #include <objects/seqfeat/RNA_ref.hpp>
57 #include <objects/seqfeat/Feat_id.hpp>
58
59 #include <objtools/readers/gff3_reader.hpp>
60 #include "gff3_location_merger.hpp"
61
62 #include "reader_message_handler.hpp"
63
64 #include <algorithm>
65
66 BEGIN_NCBI_SCOPE
67 BEGIN_objects_SCOPE
68
69 unsigned int CGff3Reader::msGenericIdCounter = 0;
70
71 // ----------------------------------------------------------------------------
xMakeRecordId(const CGff2Record & record)72 string CGff3Reader::xMakeRecordId(
73 const CGff2Record& record)
74 // ----------------------------------------------------------------------------
75 {
76 string id, parentId;
77 record.GetAttribute("ID", id);
78 record.GetAttribute("Parent", parentId);
79
80 auto recordType = record.NormalizedType();
81 if (recordType == "cds") {
82 string cdsId = parentId;
83 if (cdsId.empty()) {
84 cdsId = (id.empty() ? xNextGenericId() : id);
85 }
86 else {
87 cdsId += ":cds";
88 }
89 return cdsId;
90 }
91 if (id.empty()) {
92 return xNextGenericId();
93 }
94 return id;
95 }
96
97 // ----------------------------------------------------------------------------
xNextGenericId()98 string CGff3Reader::xNextGenericId()
99 // ----------------------------------------------------------------------------
100 {
101 return string("generic") + NStr::IntToString(msGenericIdCounter++);
102 }
103
104 // ----------------------------------------------------------------------------
AssignFromGff(const string & strRawInput)105 bool CGff3ReadRecord::AssignFromGff(
106 const string& strRawInput )
107 // ----------------------------------------------------------------------------
108 {
109 if (!CGff2Record::AssignFromGff(strRawInput)) {
110 return false;
111 }
112 string id, parent;
113 GetAttribute("ID", id);
114 GetAttribute("Parent", parent);
115 if (id.empty() && parent.empty()) {
116 m_Attributes["ID"] = CGff3Reader::xNextGenericId();
117 }
118 if (m_strType == "pseudogene") {
119 SetType("gene");
120 m_Attributes["pseudo"] = "true";
121 return true;
122 }
123 if (m_strType == "pseudogenic_transcript") {
124 SetType("transcript");
125 m_Attributes["pseudo"] = "true";
126 return true;
127 }
128 if (m_strType == "pseudogenic_tRNA") {
129 SetType("tRNA");
130 m_Attributes["pseudo"] = "true";
131 return true;
132 }
133 if (m_strType == "pseudogenic_rRNA") {
134 SetType("rRNA");
135 m_Attributes["pseudo"] = "true";
136 return true;
137 }
138 if (m_strType == "pseudogenic_exon") {
139 SetType("exon");
140 return true;
141 }
142 if (m_strType == "pseudogenic_CDS") {
143 SetType("CDS");
144 m_Attributes["pseudo"] = "true";
145 return true;
146 }
147 if (m_strType == "transcript") {
148 SetType("misc_RNA");
149 return true;
150 }
151 return true;
152 }
153
154 // ----------------------------------------------------------------------------
x_NormalizedAttributeKey(const string & strRawKey)155 string CGff3ReadRecord::x_NormalizedAttributeKey(
156 const string& strRawKey )
157 // ---------------------------------------------------------------------------
158 {
159 string strKey = CGff2Record::xNormalizedAttributeKey( strRawKey );
160 if ( 0 == NStr::CompareNocase( strRawKey, "ID" ) ) {
161 return "ID";
162 }
163 if ( 0 == NStr::CompareNocase( strKey, "Name" ) ) {
164 return "Name";
165 }
166 if ( 0 == NStr::CompareNocase( strKey, "Alias" ) ) {
167 return "Alias";
168 }
169 if ( 0 == NStr::CompareNocase( strKey, "Parent" ) ) {
170 return "Parent";
171 }
172 if ( 0 == NStr::CompareNocase( strKey, "Target" ) ) {
173 return "Target";
174 }
175 if ( 0 == NStr::CompareNocase( strKey, "Gap" ) ) {
176 return "Gap";
177 }
178 if ( 0 == NStr::CompareNocase( strKey, "Derives_from" ) ) {
179 return "Derives_from";
180 }
181 if ( 0 == NStr::CompareNocase( strKey, "Note" ) ) {
182 return "Note";
183 }
184 if ( 0 == NStr::CompareNocase( strKey, "Dbxref" ) ||
185 0 == NStr::CompareNocase( strKey, "Db_xref" ) ) {
186 return "Dbxref";
187 }
188 if ( 0 == NStr::CompareNocase( strKey, "Ontology_term" ) ) {
189 return "Ontology_term";
190 }
191 return strKey;
192 }
193
194 // ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,const string & name,const string & title,SeqIdResolver resolver,CReaderListener * pRL)195 CGff3Reader::CGff3Reader(
196 unsigned int uFlags,
197 const string& name,
198 const string& title,
199 SeqIdResolver resolver,
200 CReaderListener* pRL):
201 // ----------------------------------------------------------------------------
202 CGff2Reader( uFlags, name, title, resolver, pRL )
203 {
204 mpLocations.reset(new CGff3LocationMerger(uFlags, resolver, 0));
205 CGff2Record::ResetId();
206 }
207
208 // ----------------------------------------------------------------------------
CGff3Reader(unsigned int uFlags,CReaderListener * pRL)209 CGff3Reader::CGff3Reader(
210 unsigned int uFlags,
211 CReaderListener* pRL):
212 // ----------------------------------------------------------------------------
213 CGff3Reader(uFlags, "", "", CReadUtil::AsSeqId, pRL)
214 {
215 }
216
217 // ----------------------------------------------------------------------------
~CGff3Reader()218 CGff3Reader::~CGff3Reader()
219 // ----------------------------------------------------------------------------
220 {
221 }
222
223 // ----------------------------------------------------------------------------
224 CRef<CSeq_annot>
ReadSeqAnnot(ILineReader & lr,ILineErrorListener * pEC)225 CGff3Reader::ReadSeqAnnot(
226 ILineReader& lr,
227 ILineErrorListener* pEC )
228 // ----------------------------------------------------------------------------
229 {
230 mCurrentFeatureCount = 0;
231 mParsingAlignment = false;
232 mAlignmentData.Reset();
233 mpLocations->Reset();
234 auto pAnnot = CReaderBase::ReadSeqAnnot(lr, pEC);
235 if (pAnnot && pAnnot->GetData().Which() == CSeq_annot::TData::e_not_set) {
236 return CRef<CSeq_annot>();
237 }
238 return pAnnot;
239 }
240
241 // ----------------------------------------------------------------------------
242 void
xProcessData(const TReaderData & readerData,CSeq_annot & annot)243 CGff3Reader::xProcessData(
244 const TReaderData& readerData,
245 CSeq_annot& annot)
246 // ----------------------------------------------------------------------------
247 {
248 for (const auto& lineData: readerData) {
249 const auto& line = lineData.mData;
250 if (xParseStructuredComment(line) &&
251 !NStr::StartsWith(line, "##sequence-region") ) {
252 continue;
253 }
254 if (xParseBrowserLine(line, annot)) {
255 continue;
256 }
257 if (xParseFeature(line, annot, nullptr)) {
258 continue;
259 }
260 }
261 }
262
263 // ----------------------------------------------------------------------------
xProcessAlignmentData(CSeq_annot & annot)264 void CGff3Reader::xProcessAlignmentData(
265 CSeq_annot& annot)
266 // ----------------------------------------------------------------------------
267 {
268 for (const string& id : mAlignmentData.mIds) {
269 CRef<CSeq_align> pAlign = Ref(new CSeq_align());
270 if (x_MergeAlignments(mAlignmentData.mAlignments.at(id), pAlign)) {
271 // if available, add current browser information
272 if ( m_CurrentBrowserInfo ) {
273 annot.SetDesc().Set().push_back( m_CurrentBrowserInfo );
274 }
275
276 annot.SetNameDesc("alignments");
277
278 if ( !m_AnnotTitle.empty() ) {
279 annot.SetTitleDesc(m_AnnotTitle);
280 }
281 // Add alignment
282 annot.SetData().SetAlign().push_back(pAlign);
283 }
284 }
285 }
286
287 // ----------------------------------------------------------------------------
288 bool
xParseFeature(const string & line,CSeq_annot & annot,ILineErrorListener * pEC)289 CGff3Reader::xParseFeature(
290 const string& line,
291 CSeq_annot& annot,
292 ILineErrorListener* pEC)
293 // ----------------------------------------------------------------------------
294 {
295 if (CGff2Reader::IsAlignmentData(line)) {
296 return xParseAlignment(line);
297 }
298
299 //parse record:
300 shared_ptr<CGff3ReadRecord> pRecord(x_CreateRecord());
301 try {
302 if (!pRecord->AssignFromGff(line)) {
303 return false;
304 }
305 }
306 catch(CObjReaderLineException& err) {
307 ProcessError(err, pEC);
308 return false;
309 }
310
311 //make sure we are interested:
312 if (xIsIgnoredFeatureType(pRecord->Type())) {
313 return true;
314 }
315 if (xIsIgnoredFeatureId(pRecord->Id())) {
316 return true;
317 }
318
319 //no support for multiparented features in genbank mode:
320 if (this->IsInGenbankMode() && pRecord->IsMultiParent()) {
321 AutoPtr<CObjReaderLineException> pErr(
322 CObjReaderLineException::Create(
323 eDiag_Fatal,
324 0,
325 "Multiparented features are not supported in Genbank mode"));
326 ProcessError(*pErr, pEC);
327 }
328
329 //append feature to annot:
330 if (!xUpdateAnnotFeature(*pRecord, annot, pEC)) {
331 return false;
332 }
333
334 ++mCurrentFeatureCount;
335 mParsingAlignment = false;
336 return true;
337 }
338
339
340 // ----------------------------------------------------------------------------
xParseAlignment(const string & strLine)341 bool CGff3Reader::xParseAlignment(
342 const string& strLine)
343 // ----------------------------------------------------------------------------
344 {
345 if (IsInGenbankMode()) {
346 return true;
347 }
348 auto& ids = mAlignmentData.mIds;
349 auto& alignments = mAlignmentData.mAlignments;
350
351 unique_ptr<CGff2Record> pRecord(x_CreateRecord());
352
353 if ( !pRecord->AssignFromGff(strLine) ) {
354 return false;
355 }
356
357 string id;
358 if ( !pRecord->GetAttribute("ID", id) ) {
359 id = pRecord->Id();
360 }
361
362 if (alignments.find(id) == alignments.end()) {
363 ids.push_back(id);
364 }
365
366 CRef<CSeq_align> alignment;
367 if (!x_CreateAlignment(*pRecord, alignment)) {
368 return false;
369 }
370
371 alignments[id].push_back(alignment);
372
373 ++mCurrentFeatureCount;
374 mParsingAlignment = true;
375 return true;
376 }
377
378 // ----------------------------------------------------------------------------
xUpdateAnnotFeature(const CGff2Record & gffRecord,CSeq_annot & annot,ILineErrorListener * pEC)379 bool CGff3Reader::xUpdateAnnotFeature(
380 const CGff2Record& gffRecord,
381 CSeq_annot& annot,
382 ILineErrorListener* pEC)
383 // ----------------------------------------------------------------------------
384 {
385 //if (gffRecord.Type() == "CDS") {
386 // if (gffRecord.SeqStart() == 114392786) {
387 // cerr << "";
388 // }
389 //}
390
391 mpLocations->AddRecord(gffRecord);
392
393 CRef< CSeq_feat > pFeature(new CSeq_feat);
394
395 auto recType = gffRecord.NormalizedType();
396 if (recType == "exon" || recType == "five_prime_utr" || recType == "three_prime_utr") {
397 return xUpdateAnnotExon(gffRecord, pFeature, annot, pEC);
398 }
399 if (recType == "cds") {
400 return xUpdateAnnotCds(gffRecord, pFeature, annot, pEC);
401 }
402 if (recType == "gene") {
403 return xUpdateAnnotGene(gffRecord, pFeature, annot, pEC);
404 }
405 if (recType == "mrna") {
406 return xUpdateAnnotMrna(gffRecord, pFeature, annot, pEC);
407 }
408 if (recType == "region") {
409 return xUpdateAnnotRegion(gffRecord, pFeature, annot, pEC);
410 }
411 if (!xUpdateAnnotGeneric(gffRecord, pFeature, annot, pEC)) {
412 return false;
413 }
414 return true;
415 }
416
417
418 // ----------------------------------------------------------------------------
xVerifyExonLocation(const string & mrnaId,const CGff2Record & exon)419 void CGff3Reader::xVerifyExonLocation(
420 const string& mrnaId,
421 const CGff2Record& exon)
422 // ----------------------------------------------------------------------------
423 {
424 map<string,CRef<CSeq_interval> >::const_iterator cit = mMrnaLocs.find(mrnaId);
425 if (cit == mMrnaLocs.end()) {
426 string message = "Bad data line: ";
427 message += exon.Type();
428 message += " referring to non-existent parent feature.";
429 CReaderMessage error(
430 eDiag_Error,
431 m_uLineNumber,
432 message);
433 throw error;
434 }
435 const CSeq_interval& containingInt = cit->second.GetObject();
436 const CRef<CSeq_loc> pContainedLoc = exon.GetSeqLoc(m_iFlags, mSeqIdResolve);
437 const CSeq_interval& containedInt = pContainedLoc->GetInt();
438 if (containedInt.GetFrom() < containingInt.GetFrom() ||
439 containedInt.GetTo() > containingInt.GetTo()) {
440 string message = "Bad data line: ";
441 message += exon.Type();
442 message += " extends beyond parent feature.";
443 CReaderMessage error(
444 eDiag_Error,
445 m_uLineNumber,
446 message);
447 throw error;
448 }
449 }
450
451 // ----------------------------------------------------------------------------
xUpdateAnnotExon(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)452 bool CGff3Reader::xUpdateAnnotExon(
453 const CGff2Record& record,
454 CRef<CSeq_feat> pFeature,
455 CSeq_annot& annot,
456 ILineErrorListener* pEC)
457 // ----------------------------------------------------------------------------
458 {
459 list<string> parents;
460 if (record.GetAttribute("Parent", parents)) {
461 for (list<string>::const_iterator it = parents.begin(); it != parents.end();
462 ++it) {
463 const string& parentId = *it;
464 CRef<CSeq_feat> pParent;
465 if (!x_GetFeatureById(parentId, pParent)) {
466 xAddPendingExon(parentId, record);
467 return true;
468 }
469 if (pParent->GetData().IsRna()) {
470 xVerifyExonLocation(parentId, record);
471 }
472 if (pParent->GetData().IsGene()) {
473 if (!xInitializeFeature(record, pFeature)) {
474 return false;
475 }
476 return xAddFeatureToAnnot(pFeature, annot);
477 }
478 IdToFeatureMap::iterator fit = m_MapIdToFeature.find(parentId);
479 if (fit != m_MapIdToFeature.end()) {
480 CRef<CSeq_feat> pParent = fit->second;
481 if (!record.UpdateFeature(m_iFlags, pParent)) {
482 return false;
483 }
484 }
485 }
486 }
487 return true;
488 }
489
490
491 // ----------------------------------------------------------------------------
xJoinLocationIntoRna(const CGff2Record & record,ILineErrorListener * pEC)492 bool CGff3Reader::xJoinLocationIntoRna(
493 const CGff2Record& record,
494 //CRef<CSeq_feat> pFeature,
495 //CSeq_annot& annot,
496 ILineErrorListener* pEC)
497 // ----------------------------------------------------------------------------
498 {
499 list<string> parents;
500 if (!record.GetAttribute("Parent", parents)) {
501 return true;
502 }
503 for (list<string>::const_iterator it = parents.begin(); it != parents.end();
504 ++it) {
505 const string& parentId = *it;
506 CRef<CSeq_feat> pParent;
507 if (!x_GetFeatureById(parentId, pParent)) {
508 // Danger:
509 // We don't know whether the CDS parent is indeed an RNA and it could
510 // possible be a gene.
511 // If the parent is indeed a gene then gene construction will have to
512 // purge this pending exon (or it will cause a sanity check to fail
513 // during post processing).
514 xAddPendingExon(parentId, record);
515 continue;
516 }
517 if (!pParent->GetData().IsRna()) {
518 continue;
519 }
520 xVerifyExonLocation(parentId, record);
521 if (!record.UpdateFeature(m_iFlags, pParent)) {
522 return false;
523 }
524 }
525 return true;
526 }
527
528
529 // ----------------------------------------------------------------------------
xUpdateAnnotCds(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)530 bool CGff3Reader::xUpdateAnnotCds(
531 const CGff2Record& record,
532 CRef<CSeq_feat> pFeature,
533 CSeq_annot& annot,
534 ILineErrorListener* pEC)
535 // ----------------------------------------------------------------------------
536 {
537 if (!xJoinLocationIntoRna(record, pEC)) {
538 return false;
539 }
540 xVerifyCdsParents(record);
541
542 string cdsId = xMakeRecordId(record);
543 mpLocations->AddRecordForId(cdsId, record);
544
545 auto pExistingFeature = m_MapIdToFeature.find(cdsId);
546 if (pExistingFeature != m_MapIdToFeature.end()) {
547 return true;
548 }
549
550 m_MapIdToFeature[cdsId] = pFeature;
551 xInitializeFeature(record, pFeature);
552 xAddFeatureToAnnot(pFeature, annot);
553
554 string parentId;
555 record.GetAttribute("Parent", parentId);
556 if (!parentId.empty()) {
557 xFeatureSetQualifier("Parent", parentId, pFeature);
558 xFeatureSetXrefParent(parentId, pFeature);
559 if (m_iFlags & fGeneXrefs) {
560 xFeatureSetXrefGrandParent(parentId, pFeature);
561 }
562 }
563 return true;
564 }
565
566
567 // ----------------------------------------------------------------------------
xFeatureSetXrefGrandParent(const string & parent,CRef<CSeq_feat> pFeature)568 bool CGff3Reader::xFeatureSetXrefGrandParent(
569 const string& parent,
570 CRef<CSeq_feat> pFeature)
571 // ----------------------------------------------------------------------------
572 {
573 IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
574 if (it == m_MapIdToFeature.end()) {
575 return false;
576 }
577 CRef<CSeq_feat> pParent = it->second;
578 const string &grandParentsStr = pParent->GetNamedQual("Parent");
579 list<string> grandParents;
580 NStr::Split(grandParentsStr, ",", grandParents, 0);
581 for (list<string>::const_iterator gpcit = grandParents.begin();
582 gpcit != grandParents.end(); ++gpcit) {
583 IdToFeatureMap::iterator gpit = m_MapIdToFeature.find(*gpcit);
584 if (gpit == m_MapIdToFeature.end()) {
585 return false;
586 }
587 CRef<CSeq_feat> pGrandParent = gpit->second;
588
589 //xref grandchild->grandparent
590 CRef<CFeat_id> pGrandParentId(new CFeat_id);
591 pGrandParentId->Assign(pGrandParent->GetId());
592 CRef<CSeqFeatXref> pGrandParentXref(new CSeqFeatXref);
593 pGrandParentXref->SetId(*pGrandParentId);
594 pFeature->SetXref().push_back(pGrandParentXref);
595
596 //xref grandparent->grandchild
597 CRef<CFeat_id> pGrandChildId(new CFeat_id);
598 pGrandChildId->Assign(pFeature->GetId());
599 CRef<CSeqFeatXref> pGrandChildXref(new CSeqFeatXref);
600 pGrandChildXref->SetId(*pGrandChildId);
601 pGrandParent->SetXref().push_back(pGrandChildXref);
602 }
603 return true;
604 }
605
606 // ----------------------------------------------------------------------------
xFeatureSetXrefParent(const string & parent,CRef<CSeq_feat> pChild)607 bool CGff3Reader::xFeatureSetXrefParent(
608 const string& parent,
609 CRef<CSeq_feat> pChild)
610 // ----------------------------------------------------------------------------
611 {
612 IdToFeatureMap::iterator it = m_MapIdToFeature.find(parent);
613 if (it == m_MapIdToFeature.end()) {
614 return false;
615 }
616 CRef<CSeq_feat> pParent = it->second;
617
618 //xref child->parent
619 CRef<CFeat_id> pParentId(new CFeat_id);
620 pParentId->Assign(pParent->GetId());
621 CRef<CSeqFeatXref> pParentXref(new CSeqFeatXref);
622 pParentXref->SetId(*pParentId);
623 pChild->SetXref().push_back(pParentXref);
624
625 //xref parent->child
626 CRef<CFeat_id> pChildId(new CFeat_id);
627 pChildId->Assign(pChild->GetId());
628 CRef<CSeqFeatXref> pChildXref(new CSeqFeatXref);
629 pChildXref->SetId(*pChildId);
630 pParent->SetXref().push_back(pChildXref);
631 return true;
632 }
633
634 // ----------------------------------------------------------------------------
xFindFeatureUnderConstruction(const CGff2Record & record,CRef<CSeq_feat> & underConstruction)635 bool CGff3Reader::xFindFeatureUnderConstruction(
636 const CGff2Record& record,
637 CRef<CSeq_feat>& underConstruction)
638 // ----------------------------------------------------------------------------
639 {
640 string id;
641 if (!record.GetAttribute("ID", id)) {
642 return false;
643 }
644 IdToFeatureMap::iterator it = m_MapIdToFeature.find(id);
645 if (it == m_MapIdToFeature.end()) {
646 return false;
647 }
648
649 CReaderMessage fatal(
650 eDiag_Fatal,
651 m_uLineNumber,
652 "Bad data line: Duplicate feature ID \"" + id + "\".");
653 CSeq_feat tempFeat;
654 if (CSoMap::SoTypeToFeature(record.Type(), tempFeat)) {
655 if (it->second->GetData().GetSubtype() != tempFeat.GetData().GetSubtype()) {
656 throw fatal;
657 }
658 }
659
660 underConstruction = it->second;
661 return true;
662 }
663
664 // ----------------------------------------------------------------------------
xUpdateAnnotGeneric(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)665 bool CGff3Reader::xUpdateAnnotGeneric(
666 const CGff2Record& record,
667 CRef<CSeq_feat> pFeature,
668 CSeq_annot& annot,
669 ILineErrorListener* pEC)
670 // ----------------------------------------------------------------------------
671 {
672 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
673 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
674 return record.UpdateFeature(m_iFlags, pUnderConstruction);
675 }
676
677 string featType = record.Type();
678 if (featType == "stop_codon_read_through" || featType == "selenocysteine") {
679 string cdsParent;
680 if (!record.GetAttribute("Parent", cdsParent)) {
681 CReaderMessage error(
682 eDiag_Error,
683 m_uLineNumber,
684 "Bad data line: Unassigned code break.");
685 throw error;
686 }
687 IdToFeatureMap::iterator it = m_MapIdToFeature.find(cdsParent);
688 if (it == m_MapIdToFeature.end()) {
689 CReaderMessage error(
690 eDiag_Error,
691 m_uLineNumber,
692 "Bad data line: Code break assigned to missing feature.");
693 throw error;
694 }
695
696 CRef<CCode_break> pCodeBreak(new CCode_break);
697 CSeq_interval& cbLoc = pCodeBreak->SetLoc().SetInt();
698 CRef< CSeq_id > pId = mSeqIdResolve(record.Id(), m_iFlags, true);
699 cbLoc.SetId(*pId);
700 cbLoc.SetFrom(static_cast<TSeqPos>(record.SeqStart()));
701 cbLoc.SetTo(static_cast<TSeqPos>(record.SeqStop()));
702 if (record.IsSetStrand()) {
703 cbLoc.SetStrand(record.Strand());
704 }
705 pCodeBreak->SetAa().SetNcbieaa(
706 (featType == "selenocysteine") ? 'U' : 'X');
707 CRef<CSeq_feat> pCds = it->second;
708 CCdregion& cdRegion = pCds->SetData().SetCdregion();
709 list< CRef< CCode_break > >& codeBreaks = cdRegion.SetCode_break();
710 codeBreaks.push_back(pCodeBreak);
711 return true;
712 }
713 if (!xInitializeFeature(record, pFeature)) {
714 return false;
715 }
716 if (! xAddFeatureToAnnot(pFeature, annot)) {
717 return false;
718 }
719 string strId;
720 if ( record.GetAttribute("ID", strId)) {
721 m_MapIdToFeature[strId] = pFeature;
722 }
723 if (pFeature->GetData().IsRna() ||
724 pFeature->GetData().GetSubtype() == CSeqFeatData::eSubtype_misc_RNA) {
725 CRef<CSeq_interval> rnaLoc(new CSeq_interval);
726 rnaLoc->Assign(pFeature->GetLocation().GetInt());
727 mMrnaLocs[strId] = rnaLoc;
728 }
729 return true;
730 }
731
732 // ----------------------------------------------------------------------------
xUpdateAnnotMrna(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)733 bool CGff3Reader::xUpdateAnnotMrna(
734 const CGff2Record& record,
735 CRef<CSeq_feat> pFeature,
736 CSeq_annot& annot,
737 ILineErrorListener* pEC)
738 // ----------------------------------------------------------------------------
739 {
740 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
741 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
742 return record.UpdateFeature(m_iFlags, pUnderConstruction);
743 }
744
745 if (!xInitializeFeature(record, pFeature)) {
746 return false;
747 }
748 string parentsStr;
749 if ((m_iFlags & fGeneXrefs) && record.GetAttribute("Parent", parentsStr)) {
750 list<string> parents;
751 NStr::Split(parentsStr, ",", parents, 0);
752 for (list<string>::const_iterator cit = parents.begin();
753 cit != parents.end();
754 ++cit) {
755 if (!xFeatureSetXrefParent(*cit, pFeature)) {
756 CReaderMessage error(
757 eDiag_Error,
758 m_uLineNumber,
759 "Bad data line: mRNA record with bad parent assignment.");
760 throw error;
761 }
762 }
763 }
764
765 string strId;
766 if ( record.GetAttribute("ID", strId)) {
767 m_MapIdToFeature[strId] = pFeature;
768 }
769 CRef<CSeq_interval> mrnaLoc(new CSeq_interval);
770 CSeq_loc::E_Choice choice = pFeature->GetLocation().Which();
771 if (choice != CSeq_loc::e_Int) {
772 CReaderMessage error(
773 eDiag_Error,
774 m_uLineNumber,
775 "Internal error: Unexpected location type.");
776 throw error;
777 }
778 mrnaLoc->Assign(pFeature->GetLocation().GetInt());
779 mMrnaLocs[strId] = mrnaLoc;
780
781 list<CGff2Record> pendingExons;
782 xGetPendingExons(strId, pendingExons);
783 for (auto exonRecord: pendingExons) {
784 CRef< CSeq_feat > pFeature(new CSeq_feat);
785 xUpdateAnnotExon(exonRecord, pFeature, annot, pEC);
786 }
787 if (! xAddFeatureToAnnot(pFeature, annot)) {
788 return false;
789 }
790 return true;
791 }
792
793 // ----------------------------------------------------------------------------
xUpdateAnnotGene(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)794 bool CGff3Reader::xUpdateAnnotGene(
795 const CGff2Record& record,
796 CRef<CSeq_feat> pFeature,
797 CSeq_annot& annot,
798 ILineErrorListener* pEC)
799 // ----------------------------------------------------------------------------
800 {
801 CRef<CSeq_feat> pUnderConstruction(new CSeq_feat);
802 if (xFindFeatureUnderConstruction(record, pUnderConstruction)) {
803 return record.UpdateFeature(m_iFlags, pUnderConstruction);
804 }
805
806 if (!xInitializeFeature(record, pFeature)) {
807 return false;
808 }
809 if (! xAddFeatureToAnnot(pFeature, annot)) {
810 return false;
811 }
812 string strId;
813 if ( record.GetAttribute("ID", strId)) {
814 m_MapIdToFeature[strId] = pFeature;
815 }
816 // address corner case:
817 // parent of CDS is a gene but the DCS is listed before the gene so at the
818 // time we did not know the parent would be a gene.
819 // remedy: throw out any collected cds locations that were meant for RNA
820 // construction.
821 list<CGff2Record> pendingExons;
822 xGetPendingExons(strId, pendingExons);
823 return true;
824 }
825
826 // ----------------------------------------------------------------------------
xUpdateAnnotRegion(const CGff2Record & record,CRef<CSeq_feat> pFeature,CSeq_annot & annot,ILineErrorListener * pEC)827 bool CGff3Reader::xUpdateAnnotRegion(
828 const CGff2Record& record,
829 CRef<CSeq_feat> pFeature,
830 CSeq_annot& annot,
831 ILineErrorListener* pEC)
832 // ----------------------------------------------------------------------------
833 {
834 if (!record.InitializeFeature(m_iFlags, pFeature)) {
835 return false;
836 }
837
838 if (! xAddFeatureToAnnot(pFeature, annot)) {
839 return false;
840 }
841 string strId;
842 if ( record.GetAttribute("ID", strId)) {
843 mIdToSeqIdMap[strId] = record.Id();
844 m_MapIdToFeature[strId] = pFeature;
845 }
846 return true;
847 }
848
849 // ----------------------------------------------------------------------------
xAddFeatureToAnnot(CRef<CSeq_feat> pFeature,CSeq_annot & annot)850 bool CGff3Reader::xAddFeatureToAnnot(
851 CRef< CSeq_feat > pFeature,
852 CSeq_annot& annot )
853 // ----------------------------------------------------------------------------
854 {
855 annot.SetData().SetFtable().push_back( pFeature ) ;
856 return true;
857 }
858
859 // ----------------------------------------------------------------------------
xVerifyCdsParents(const CGff2Record & record)860 void CGff3Reader::xVerifyCdsParents(
861 const CGff2Record& record)
862 // ----------------------------------------------------------------------------
863 {
864 string id;
865 string parents;
866 if (!record.GetAttribute("ID", id)) {
867 return;
868 }
869 record.GetAttribute("Parent", parents);
870 map<string, string>::iterator it = mCdsParentMap.find(id);
871 if (it == mCdsParentMap.end()) {
872 mCdsParentMap[id] = parents;
873 return;
874 }
875 if (it->second == parents) {
876 return;
877 }
878 CReaderMessage error(
879 eDiag_Error,
880 m_uLineNumber,
881 "Bad data line: CDS record with bad parent assignments.");
882 throw error;
883 }
884
885 // ----------------------------------------------------------------------------
xReadInit()886 bool CGff3Reader::xReadInit()
887 // ----------------------------------------------------------------------------
888 {
889 if (!CGff2Reader::xReadInit()) {
890 return false;
891 }
892 mCdsParentMap.clear();
893 return true;
894 }
895
896 // ----------------------------------------------------------------------------
xIsIgnoredFeatureType(const string & featureType)897 bool CGff3Reader::xIsIgnoredFeatureType(
898 const string& featureType)
899 // ----------------------------------------------------------------------------
900 {
901 typedef CStaticArraySet<string, PNocase> STRINGARRAY;
902
903 string ftype(CSoMap::ResolveSoAlias(featureType));
904
905 static const char* const ignoredTypesAlways_[] = {
906 "protein",
907 "start_codon", // also part of a cds feature
908 "stop_codon", // in GFF3, also part of a cds feature
909 };
910 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesAlways, ignoredTypesAlways_);
911 STRINGARRAY::const_iterator cit = ignoredTypesAlways.find(ftype);
912 if (cit != ignoredTypesAlways.end()) {
913 return true;
914 }
915 if (!IsInGenbankMode()) {
916 return false;
917 }
918
919 /* -genbank mode:*/
920 static const char* const specialTypesGenbank_[] = {
921 "antisense_RNA",
922 "autocatalytically_spliced_intron",
923 "guide_RNA",
924 "hammerhead_ribozyme",
925 "lnc_RNA",
926 "miRNA",
927 "piRNA",
928 "rasiRNA",
929 "ribozyme",
930 "RNase_MRP_RNA",
931 "RNase_P_RNA",
932 "scRNA",
933 "selenocysteine",
934 "siRNA",
935 "snoRNA",
936 "snRNA",
937 "SRP_RNA",
938 "stop_codon_read_through",
939 "telomerase_RNA",
940 "vault_RNA",
941 "Y_RNA"
942 };
943 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, specialTypesGenbank, specialTypesGenbank_);
944
945 static const char* const ignoredTypesGenbank_[] = {
946 "apicoplast_chromosome",
947 "assembly",
948 "cDNA_match",
949 "chloroplast_chromosome",
950 "chromoplast_chromosome",
951 "chromosome",
952 "contig",
953 "cyanelle_chromosome",
954 "dna_chromosome",
955 "EST_match",
956 "expressed_sequence_match",
957 "intron",
958 "leucoplast_chromosome",
959 "macronuclear_chromosome",
960 "match",
961 "match_part",
962 "micronuclear_chromosome",
963 "mitochondrial_chromosome",
964 "nuclear_chromosome",
965 "nucleomorphic_chromosome",
966 "nucleotide_motif",
967 "nucleotide_to_protein_match",
968 "partial_genomic_sequence_assembly",
969 "protein_match",
970 "replicon",
971 "rna_chromosome",
972 "sequence_assembly",
973 "supercontig",
974 "translated_nucleotide_match",
975 "ultracontig",
976 };
977 DEFINE_STATIC_ARRAY_MAP(STRINGARRAY, ignoredTypesGenbank, ignoredTypesGenbank_);
978
979 cit = specialTypesGenbank.find(ftype);
980 if (cit != specialTypesGenbank.end()) {
981 return false;
982 }
983
984 cit = ignoredTypesGenbank.find(ftype);
985 if (cit != ignoredTypesGenbank.end()) {
986 return true;
987 }
988
989 return false;
990 }
991
992 // ----------------------------------------------------------------------------
993 bool
xInitializeFeature(const CGff2Record & record,CRef<CSeq_feat> pFeature)994 CGff3Reader::xInitializeFeature(
995 const CGff2Record& record,
996 CRef<CSeq_feat> pFeature)
997 // ----------------------------------------------------------------------------
998 {
999 if (!record.InitializeFeature(m_iFlags, pFeature)) {
1000 return false;
1001 }
1002 const auto& attrs = record.Attributes();
1003 const auto it = attrs.find("ID");
1004 if (it != attrs.end()) {
1005 mIdToSeqIdMap[it->second] = record.Id();
1006 }
1007 return true;
1008 }
1009
1010 // ----------------------------------------------------------------------------
1011 void
xAddPendingExon(const string & rnaId,const CGff2Record & exonRecord)1012 CGff3Reader::xAddPendingExon(
1013 const string& rnaId,
1014 const CGff2Record& exonRecord)
1015 // ----------------------------------------------------------------------------
1016 {
1017 PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1018 if (it == mPendingExons.end()) {
1019 mPendingExons[rnaId] = list<CGff2Record>();
1020 }
1021 mPendingExons[rnaId].push_back(exonRecord);
1022 }
1023
1024 // ----------------------------------------------------------------------------
1025 void
xGetPendingExons(const string & rnaId,list<CGff2Record> & pendingExons)1026 CGff3Reader::xGetPendingExons(
1027 const string& rnaId,
1028 list<CGff2Record>& pendingExons)
1029 // ----------------------------------------------------------------------------
1030 {
1031 PENDING_EXONS::iterator it = mPendingExons.find(rnaId);
1032 if (it == mPendingExons.end()) {
1033 return;
1034 }
1035 pendingExons.swap(mPendingExons[rnaId]);
1036 mPendingExons.erase(rnaId);
1037 }
1038
1039 // ----------------------------------------------------------------------------
xPostProcessAnnot(CSeq_annot & annot)1040 void CGff3Reader::xPostProcessAnnot(
1041 CSeq_annot& annot)
1042 // ----------------------------------------------------------------------------
1043 {
1044 if (mAlignmentData) {
1045 xProcessAlignmentData(annot);
1046 return;
1047 }
1048 if (!mCurrentFeatureCount) {
1049 return;
1050 }
1051
1052 for (const auto& it: mPendingExons) {
1053 CReaderMessage warning(
1054 eDiag_Warning,
1055 m_uLineNumber,
1056 "Bad data line: Record references non-existent Parent=" + it.first);
1057 m_pMessageHandler->Report(warning);
1058 }
1059
1060 //location fixup:
1061 for (auto itLocation: mpLocations->LocationMap()) {
1062 auto id = itLocation.first;
1063 auto itFeature = m_MapIdToFeature.find(id);
1064 if (itFeature == m_MapIdToFeature.end()) {
1065 continue;
1066 }
1067 CRef<CSeq_loc> pNewLoc(new CSeq_loc);
1068 CCdregion::EFrame frame;
1069 mpLocations->MergeLocation(pNewLoc, frame, itLocation.second);
1070 CRef<CSeq_feat> pFeature = itFeature->second;
1071 pFeature->SetLocation(*pNewLoc);
1072 if (pFeature->GetData().IsCdregion()) {
1073 auto& cdrData = pFeature->SetData().SetCdregion();
1074 cdrData.SetFrame(
1075 frame == CCdregion::eFrame_not_set ? CCdregion::eFrame_one : frame);
1076 }
1077 }
1078
1079 return CGff2Reader::xPostProcessAnnot(annot);
1080 }
1081
1082 // ----------------------------------------------------------------------------
xProcessSequenceRegionPragma(const string & pragma)1083 void CGff3Reader::xProcessSequenceRegionPragma(
1084 const string& pragma)
1085 // ----------------------------------------------------------------------------
1086 {
1087 TSeqPos sequenceSize(0);
1088 vector<string> tokens;
1089 NStr::Split(pragma, " \t", tokens, NStr::fSplit_MergeDelimiters);
1090 if (tokens.size() < 2) {
1091 CReaderMessage warning(
1092 ncbi::eDiag_Warning,
1093 m_uLineNumber,
1094 "Bad sequence-region pragma - ignored.");
1095 throw warning;
1096 }
1097 if (tokens.size() >= 4) {
1098 try {
1099 sequenceSize = NStr::StringToNonNegativeInt(tokens[3]);
1100 }
1101 catch(exception&) {
1102 CReaderMessage warning(
1103 ncbi::eDiag_Warning,
1104 m_uLineNumber,
1105 "Bad sequence-region pragma - ignored.");
1106 throw warning;
1107 }
1108 }
1109 mpLocations->SetSequenceSize(tokens[1], sequenceSize);
1110 auto resolvedId = mSeqIdResolve(tokens[1], m_iFlags, true)->AsFastaString();
1111 mpLocations->SetSequenceSize(resolvedId, sequenceSize);
1112
1113 }
1114
1115 // ----------------------------------------------------------------------------
SequenceSize() const1116 TSeqPos CGff3Reader::SequenceSize() const
1117 // ----------------------------------------------------------------------------
1118 {
1119 return mpLocations->SequenceSize();
1120 }
1121
1122 // ----------------------------------------------------------------------------
GetSequenceSize(const string & seqId) const1123 TSeqPos CGff3Reader::GetSequenceSize(
1124 const string& seqId) const
1125 // ----------------------------------------------------------------------------
1126 {
1127 return mpLocations->GetSequenceSize(seqId);
1128 }
1129
1130 END_objects_SCOPE
1131 END_NCBI_SCOPE
1132