1 /*  $Id: gff3_annot_assembler.cpp 597788 2019-11-27 03:06:29Z ucko $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Frank Ludwig
27  *
28  * File Description:  Iterate through file names matching a given glob pattern
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 #include <objects/general/Object_id.hpp>
35 
36 #include <objects/seqfeat/Feat_id.hpp>
37 #include <objects/seqfeat/Gene_ref.hpp>
38 #include <objects/seqfeat/RNA_ref.hpp>
39 #include <objects/seqfeat/Imp_feat.hpp>
40 #include <objects/seqfeat/Cdregion.hpp>
41 #include <objects/seqfeat/SeqFeatData.hpp>
42 #include <objects/seqfeat/Gb_qual.hpp>
43 #include <objects/seqloc/Seq_interval.hpp>
44 
45 #include <objtools/import/import_error.hpp>
46 #include "feat_util.hpp"
47 #include "featid_generator.hpp"
48 #include "gff3_annot_assembler.hpp"
49 
50 #include <assert.h>
51 
52 USING_NCBI_SCOPE;
53 USING_SCOPE(objects);
54 
55 //  ============================================================================
CGff3AnnotAssembler(CImportMessageHandler & errorReporter)56 CGff3AnnotAssembler::CGff3AnnotAssembler(
57     CImportMessageHandler& errorReporter):
58 //  ============================================================================
59     CFeatAnnotAssembler(errorReporter)
60 {
61 }
62 
63 //  ============================================================================
~CGff3AnnotAssembler()64 CGff3AnnotAssembler::~CGff3AnnotAssembler()
65 //  ============================================================================
66 {
67 }
68 
69 //  ============================================================================
70 void
ProcessRecord(const CFeatImportData & record_,CSeq_annot & annot)71 CGff3AnnotAssembler::ProcessRecord(
72     const CFeatImportData& record_,
73     CSeq_annot& annot)
74 //  ============================================================================
75 {
76     assert(dynamic_cast<const CGff3ImportData*>(&record_));
77     const CGff3ImportData& record = static_cast<const CGff3ImportData&>(record_);
78 
79     auto recordId = record.Id();
80     auto parentId = record.Parent();
81     auto pFeature = record.GetData();
82 
83     switch (pFeature->GetData().GetSubtype()) {
84     default:
85         return xProcessFeatureDefault(recordId, parentId, pFeature, annot);
86     case CSeqFeatData::eSubtype_exon:
87         return xProcessFeatureExon(recordId, parentId, pFeature, annot);
88     case CSeqFeatData::eSubtype_mRNA:
89     case CSeqFeatData::eSubtype_misc_RNA:
90     case CSeqFeatData::eSubtype_ncRNA:
91     case CSeqFeatData::eSubtype_rRNA:
92     case CSeqFeatData::eSubtype_tmRNA:
93     case CSeqFeatData::eSubtype_tRNA:
94         return xProcessFeatureRna(recordId, parentId, pFeature, annot);
95     case CSeqFeatData::eSubtype_cdregion:
96         return xProcessFeatureCds(recordId, parentId, pFeature, annot);
97     }
98 }
99 
100 //  ============================================================================
101 void
xProcessFeatureDefault(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)102 CGff3AnnotAssembler::xProcessFeatureDefault(
103     const std::string& recordId,
104     const std::string& parentId,
105     CRef<CSeq_feat> pFeature,
106     CSeq_annot& annot)
107 //  ============================================================================
108 {
109     string featureType = CSeqFeatData::SubtypeValueToName(
110         pFeature->GetData().GetSubtype());
111     NStr::ToLower(featureType);
112     pFeature->SetId(*mIdGenerator.GetIdFor(featureType));
113     annot.SetData().SetFtable().push_back(pFeature);
114     if (!recordId.empty()) {
115         mFeatureMap.AddFeature(recordId, pFeature);
116     }
117 }
118 
119 
120 //  ============================================================================
121 void
xProcessFeatureCds(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)122 CGff3AnnotAssembler::xProcessFeatureCds(
123     const std::string& recordId,
124     const std::string& parentId,
125     CRef<CSeq_feat> pFeature,
126     CSeq_annot& annot)
127 //  ============================================================================
128 {
129     auto pExistingCds = mFeatureMap.FindFeature(recordId);
130     if (pExistingCds) {
131         // add new piece to existing piece
132         CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
133             pExistingCds->GetLocation(), pFeature->GetLocation());
134         pExistingCds->SetLocation().Assign(*pUpdatedLocation);
135 
136         // update frame if necessary
137         auto cdsStrand = pExistingCds->GetLocation().GetStrand();
138         auto& existingCds = pExistingCds->SetData().SetCdregion();
139         const auto& newCds = pFeature->GetData().GetCdregion();
140         if (cdsStrand == eNa_strand_plus) {
141             auto existingStart =
142                 pExistingCds->GetLocation().GetStart(eExtreme_Positional);
143             auto contributedStart =
144                 pFeature->GetLocation().GetStart(eExtreme_Positional);
145             if (existingStart == contributedStart) {
146                 existingCds.SetFrame(newCds.GetFrame());
147             }
148         }
149         else if (cdsStrand == eNa_strand_minus) {
150             auto existingStop =
151                 pExistingCds->GetLocation().GetStart(eExtreme_Positional);
152             auto contributedStop =
153                 pFeature->GetLocation().GetStart(eExtreme_Positional);
154             if (existingStop == contributedStop) {
155                 existingCds.SetFrame(newCds.GetFrame());
156             }
157         }
158     }
159     else {
160         pFeature->SetId(*mIdGenerator.GetIdFor("cds"));
161         annot.SetData().SetFtable().push_back(pFeature);
162         if (!recordId.empty()) {
163             mFeatureMap.AddFeature(recordId, pFeature);
164         }
165         if (!recordId.empty()  &&  !parentId.empty()) {
166             mXrefMap[recordId] = parentId;
167         }
168     }
169 }
170 
171 
172 //  ============================================================================
173 void
xProcessFeatureRna(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)174 CGff3AnnotAssembler::xProcessFeatureRna(
175     const std::string& recordId,
176     const std::string& parentId,
177     CRef<CSeq_feat> pFeature,
178     CSeq_annot& annot)
179 //  ============================================================================
180 {
181     annot.SetData().SetFtable().push_back(pFeature);
182     if (!recordId.empty()) {
183         mFeatureMap.AddFeature(recordId, pFeature);
184     }
185     if (!recordId.empty()  &&  !parentId.empty()) {
186         mXrefMap[recordId] = parentId;
187     }
188 
189     pFeature->SetId(*mIdGenerator.GetIdFor("mrna"));
190     xMarkLocationPending(*pFeature);
191 
192     vector<CRef<CSeq_feat>> pendingExons;
193     if (!mPendingFeatures.FindPendingFeatures(recordId, pendingExons)) {
194         return;
195     }
196     for (auto pExon: pendingExons) {
197         CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
198             pFeature->GetLocation(), pExon->GetLocation());
199         pFeature->SetLocation().Assign(*pUpdatedLocation);
200     }
201     mPendingFeatures.MarkFeaturesDone(recordId);
202 }
203 
204 
205 //  ============================================================================
206 void
xProcessFeatureExon(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)207 CGff3AnnotAssembler::xProcessFeatureExon(
208     const std::string& recordId,
209     const std::string& parentId,
210     CRef<CSeq_feat> pFeature,
211     CSeq_annot& annot)
212 //  ============================================================================
213 {
214     auto pParentRna = mFeatureMap.FindFeature(parentId);
215     if (pParentRna) {
216         if (xIsLocationPending(*pParentRna)) {
217             pParentRna->SetLocation().Assign(pFeature->GetLocation());
218             xUnmarkLocationPending(*pParentRna);
219         }
220         else {
221             CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
222                 pParentRna->GetLocation(), pFeature->GetLocation());
223             pParentRna->SetLocation().Assign(*pUpdatedLocation);
224         }
225     }
226     else {
227         mPendingFeatures.AddFeature(parentId, pFeature);
228     }
229 }
230 
231 //  ============================================================================
232 void
FinalizeAnnot(const CAnnotImportData & annotData,CSeq_annot & annot)233 CGff3AnnotAssembler::FinalizeAnnot(
234     const CAnnotImportData& annotData,
235     CSeq_annot& annot)
236 //  ============================================================================
237 {
238     // generate crefs between genes, mRNAs, and coding regions:
239     for (auto entry: mXrefMap) {
240         auto childId = entry.first;
241         auto parentId = entry.second;
242 
243         auto pChild = mFeatureMap.FindFeature(childId);
244         auto pParent = mFeatureMap.FindFeature(parentId);
245         if (!pChild  ||  !pParent) {
246             continue;
247         }
248         pChild->AddSeqFeatXref(pParent->GetId());
249         pParent->AddSeqFeatXref(pChild->GetId());
250 
251         auto itGrandParent = mXrefMap.find(parentId);
252         if (itGrandParent == mXrefMap.end()) {
253             continue;
254         }
255         auto grandParentId = itGrandParent->second;
256         auto pGrandParent = mFeatureMap.FindFeature(grandParentId);
257         if (!pGrandParent) {
258             continue;
259         }
260         pChild->AddSeqFeatXref(pGrandParent->GetId());
261         pGrandParent->AddSeqFeatXref(pChild->GetId());
262         pParent->AddSeqFeatXref(pGrandParent->GetId());
263         pGrandParent->AddSeqFeatXref(pParent->GetId());
264     }
265 
266     // remove any remaining "under construction" markers:
267     auto& ftable = annot.SetData().SetFtable();
268     for (auto& pFeature: ftable) {
269         xUnmarkLocationPending(*pFeature);
270     }
271 }
272 
273 //  ============================================================================
274 bool
xIsLocationPending(const CSeq_feat & feat)275 CGff3AnnotAssembler::xIsLocationPending(
276     const CSeq_feat& feat)
277 //  ============================================================================
278 {
279     if (!feat.IsSetQual()) {
280         return false;
281     }
282     for (const auto& pQual: feat.GetQual()) {
283         if (pQual->IsSetQual()  &&  pQual->GetQual() == "__location_pending") {
284             return true;
285         }
286     }
287     return false;
288 }
289 
290 //  ============================================================================
291 void
xMarkLocationPending(CSeq_feat & feat)292 CGff3AnnotAssembler::xMarkLocationPending(
293     CSeq_feat& feat)
294 //  ============================================================================
295 {
296     feat.AddQualifier("__location_pending", "true");
297 }
298 
299 //  ============================================================================
300 void
xUnmarkLocationPending(CSeq_feat & feat)301 CGff3AnnotAssembler::xUnmarkLocationPending(
302     CSeq_feat& feat)
303 //  ============================================================================
304 {
305     feat.RemoveQualifier("__location_pending");
306 }
307