1 /* $Id: gff3_annot_assembler.cpp 597788 2019-11-27 03:06:29Z ucko $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig
27 *
28 * File Description: Iterate through file names matching a given glob pattern
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbifile.hpp>
34 #include <objects/general/Object_id.hpp>
35
36 #include <objects/seqfeat/Feat_id.hpp>
37 #include <objects/seqfeat/Gene_ref.hpp>
38 #include <objects/seqfeat/RNA_ref.hpp>
39 #include <objects/seqfeat/Imp_feat.hpp>
40 #include <objects/seqfeat/Cdregion.hpp>
41 #include <objects/seqfeat/SeqFeatData.hpp>
42 #include <objects/seqfeat/Gb_qual.hpp>
43 #include <objects/seqloc/Seq_interval.hpp>
44
45 #include <objtools/import/import_error.hpp>
46 #include "feat_util.hpp"
47 #include "featid_generator.hpp"
48 #include "gff3_annot_assembler.hpp"
49
50 #include <assert.h>
51
52 USING_NCBI_SCOPE;
53 USING_SCOPE(objects);
54
55 // ============================================================================
CGff3AnnotAssembler(CImportMessageHandler & errorReporter)56 CGff3AnnotAssembler::CGff3AnnotAssembler(
57 CImportMessageHandler& errorReporter):
58 // ============================================================================
59 CFeatAnnotAssembler(errorReporter)
60 {
61 }
62
63 // ============================================================================
~CGff3AnnotAssembler()64 CGff3AnnotAssembler::~CGff3AnnotAssembler()
65 // ============================================================================
66 {
67 }
68
69 // ============================================================================
70 void
ProcessRecord(const CFeatImportData & record_,CSeq_annot & annot)71 CGff3AnnotAssembler::ProcessRecord(
72 const CFeatImportData& record_,
73 CSeq_annot& annot)
74 // ============================================================================
75 {
76 assert(dynamic_cast<const CGff3ImportData*>(&record_));
77 const CGff3ImportData& record = static_cast<const CGff3ImportData&>(record_);
78
79 auto recordId = record.Id();
80 auto parentId = record.Parent();
81 auto pFeature = record.GetData();
82
83 switch (pFeature->GetData().GetSubtype()) {
84 default:
85 return xProcessFeatureDefault(recordId, parentId, pFeature, annot);
86 case CSeqFeatData::eSubtype_exon:
87 return xProcessFeatureExon(recordId, parentId, pFeature, annot);
88 case CSeqFeatData::eSubtype_mRNA:
89 case CSeqFeatData::eSubtype_misc_RNA:
90 case CSeqFeatData::eSubtype_ncRNA:
91 case CSeqFeatData::eSubtype_rRNA:
92 case CSeqFeatData::eSubtype_tmRNA:
93 case CSeqFeatData::eSubtype_tRNA:
94 return xProcessFeatureRna(recordId, parentId, pFeature, annot);
95 case CSeqFeatData::eSubtype_cdregion:
96 return xProcessFeatureCds(recordId, parentId, pFeature, annot);
97 }
98 }
99
100 // ============================================================================
101 void
xProcessFeatureDefault(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)102 CGff3AnnotAssembler::xProcessFeatureDefault(
103 const std::string& recordId,
104 const std::string& parentId,
105 CRef<CSeq_feat> pFeature,
106 CSeq_annot& annot)
107 // ============================================================================
108 {
109 string featureType = CSeqFeatData::SubtypeValueToName(
110 pFeature->GetData().GetSubtype());
111 NStr::ToLower(featureType);
112 pFeature->SetId(*mIdGenerator.GetIdFor(featureType));
113 annot.SetData().SetFtable().push_back(pFeature);
114 if (!recordId.empty()) {
115 mFeatureMap.AddFeature(recordId, pFeature);
116 }
117 }
118
119
120 // ============================================================================
121 void
xProcessFeatureCds(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)122 CGff3AnnotAssembler::xProcessFeatureCds(
123 const std::string& recordId,
124 const std::string& parentId,
125 CRef<CSeq_feat> pFeature,
126 CSeq_annot& annot)
127 // ============================================================================
128 {
129 auto pExistingCds = mFeatureMap.FindFeature(recordId);
130 if (pExistingCds) {
131 // add new piece to existing piece
132 CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
133 pExistingCds->GetLocation(), pFeature->GetLocation());
134 pExistingCds->SetLocation().Assign(*pUpdatedLocation);
135
136 // update frame if necessary
137 auto cdsStrand = pExistingCds->GetLocation().GetStrand();
138 auto& existingCds = pExistingCds->SetData().SetCdregion();
139 const auto& newCds = pFeature->GetData().GetCdregion();
140 if (cdsStrand == eNa_strand_plus) {
141 auto existingStart =
142 pExistingCds->GetLocation().GetStart(eExtreme_Positional);
143 auto contributedStart =
144 pFeature->GetLocation().GetStart(eExtreme_Positional);
145 if (existingStart == contributedStart) {
146 existingCds.SetFrame(newCds.GetFrame());
147 }
148 }
149 else if (cdsStrand == eNa_strand_minus) {
150 auto existingStop =
151 pExistingCds->GetLocation().GetStart(eExtreme_Positional);
152 auto contributedStop =
153 pFeature->GetLocation().GetStart(eExtreme_Positional);
154 if (existingStop == contributedStop) {
155 existingCds.SetFrame(newCds.GetFrame());
156 }
157 }
158 }
159 else {
160 pFeature->SetId(*mIdGenerator.GetIdFor("cds"));
161 annot.SetData().SetFtable().push_back(pFeature);
162 if (!recordId.empty()) {
163 mFeatureMap.AddFeature(recordId, pFeature);
164 }
165 if (!recordId.empty() && !parentId.empty()) {
166 mXrefMap[recordId] = parentId;
167 }
168 }
169 }
170
171
172 // ============================================================================
173 void
xProcessFeatureRna(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)174 CGff3AnnotAssembler::xProcessFeatureRna(
175 const std::string& recordId,
176 const std::string& parentId,
177 CRef<CSeq_feat> pFeature,
178 CSeq_annot& annot)
179 // ============================================================================
180 {
181 annot.SetData().SetFtable().push_back(pFeature);
182 if (!recordId.empty()) {
183 mFeatureMap.AddFeature(recordId, pFeature);
184 }
185 if (!recordId.empty() && !parentId.empty()) {
186 mXrefMap[recordId] = parentId;
187 }
188
189 pFeature->SetId(*mIdGenerator.GetIdFor("mrna"));
190 xMarkLocationPending(*pFeature);
191
192 vector<CRef<CSeq_feat>> pendingExons;
193 if (!mPendingFeatures.FindPendingFeatures(recordId, pendingExons)) {
194 return;
195 }
196 for (auto pExon: pendingExons) {
197 CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
198 pFeature->GetLocation(), pExon->GetLocation());
199 pFeature->SetLocation().Assign(*pUpdatedLocation);
200 }
201 mPendingFeatures.MarkFeaturesDone(recordId);
202 }
203
204
205 // ============================================================================
206 void
xProcessFeatureExon(const std::string & recordId,const std::string & parentId,CRef<CSeq_feat> pFeature,CSeq_annot & annot)207 CGff3AnnotAssembler::xProcessFeatureExon(
208 const std::string& recordId,
209 const std::string& parentId,
210 CRef<CSeq_feat> pFeature,
211 CSeq_annot& annot)
212 // ============================================================================
213 {
214 auto pParentRna = mFeatureMap.FindFeature(parentId);
215 if (pParentRna) {
216 if (xIsLocationPending(*pParentRna)) {
217 pParentRna->SetLocation().Assign(pFeature->GetLocation());
218 xUnmarkLocationPending(*pParentRna);
219 }
220 else {
221 CRef<CSeq_loc> pUpdatedLocation = FeatUtil::AddLocations(
222 pParentRna->GetLocation(), pFeature->GetLocation());
223 pParentRna->SetLocation().Assign(*pUpdatedLocation);
224 }
225 }
226 else {
227 mPendingFeatures.AddFeature(parentId, pFeature);
228 }
229 }
230
231 // ============================================================================
232 void
FinalizeAnnot(const CAnnotImportData & annotData,CSeq_annot & annot)233 CGff3AnnotAssembler::FinalizeAnnot(
234 const CAnnotImportData& annotData,
235 CSeq_annot& annot)
236 // ============================================================================
237 {
238 // generate crefs between genes, mRNAs, and coding regions:
239 for (auto entry: mXrefMap) {
240 auto childId = entry.first;
241 auto parentId = entry.second;
242
243 auto pChild = mFeatureMap.FindFeature(childId);
244 auto pParent = mFeatureMap.FindFeature(parentId);
245 if (!pChild || !pParent) {
246 continue;
247 }
248 pChild->AddSeqFeatXref(pParent->GetId());
249 pParent->AddSeqFeatXref(pChild->GetId());
250
251 auto itGrandParent = mXrefMap.find(parentId);
252 if (itGrandParent == mXrefMap.end()) {
253 continue;
254 }
255 auto grandParentId = itGrandParent->second;
256 auto pGrandParent = mFeatureMap.FindFeature(grandParentId);
257 if (!pGrandParent) {
258 continue;
259 }
260 pChild->AddSeqFeatXref(pGrandParent->GetId());
261 pGrandParent->AddSeqFeatXref(pChild->GetId());
262 pParent->AddSeqFeatXref(pGrandParent->GetId());
263 pGrandParent->AddSeqFeatXref(pParent->GetId());
264 }
265
266 // remove any remaining "under construction" markers:
267 auto& ftable = annot.SetData().SetFtable();
268 for (auto& pFeature: ftable) {
269 xUnmarkLocationPending(*pFeature);
270 }
271 }
272
273 // ============================================================================
274 bool
xIsLocationPending(const CSeq_feat & feat)275 CGff3AnnotAssembler::xIsLocationPending(
276 const CSeq_feat& feat)
277 // ============================================================================
278 {
279 if (!feat.IsSetQual()) {
280 return false;
281 }
282 for (const auto& pQual: feat.GetQual()) {
283 if (pQual->IsSetQual() && pQual->GetQual() == "__location_pending") {
284 return true;
285 }
286 }
287 return false;
288 }
289
290 // ============================================================================
291 void
xMarkLocationPending(CSeq_feat & feat)292 CGff3AnnotAssembler::xMarkLocationPending(
293 CSeq_feat& feat)
294 // ============================================================================
295 {
296 feat.AddQualifier("__location_pending", "true");
297 }
298
299 // ============================================================================
300 void
xUnmarkLocationPending(CSeq_feat & feat)301 CGff3AnnotAssembler::xUnmarkLocationPending(
302 CSeq_feat& feat)
303 // ============================================================================
304 {
305 feat.RemoveQualifier("__location_pending");
306 }
307