1 /*
2 * $Id: gtf_location_merger.cpp 632531 2021-06-02 17:25:37Z ivanov $
3 *
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * Authors: Frank Ludwig
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objects/seqloc/Seq_interval.hpp>
35 #include "gtf_location_merger.hpp"
36
37 BEGIN_NCBI_SCOPE
38 BEGIN_SCOPE(objects);
39
40 CGtfLocationRecord::TYPEORDER_MAP CGtfLocationRecord::msTypeOrder = {
41 {"start_codon", TYPE_start_codon},
42 {"cds", TYPE_cds},
43 {"stop_codon", TYPE_stop_codon},
44
45 {"5utr", TYPE_5utr},
46 {"initial", TYPE_initial},
47 {"exon", TYPE_exon},
48 {"single", TYPE_exon},
49 {"internal", TYPE_exon},
50 {"terminal", TYPE_terminal},
51 {"3utr", TYPE_3utr},
52 };
53
54 // ----------------------------------------------------------------------------
CGtfLocationRecord(const CGtfReadRecord & record,unsigned int flags,CGff3ReadRecord::SeqIdResolver seqIdResolve)55 CGtfLocationRecord::CGtfLocationRecord(
56 const CGtfReadRecord& record,
57 unsigned int flags,
58 CGff3ReadRecord::SeqIdResolver seqIdResolve)
59 // ----------------------------------------------------------------------------
60 {
61 mId.Assign(*record.GetSeqId(flags, seqIdResolve));
62 mStart = static_cast<TSeqPos>(record.SeqStart());
63 mStop = static_cast<TSeqPos>(record.SeqStop());
64 mStrand = (record.IsSetStrand() ? record.Strand() : eNa_strand_plus);
65 mType = GetRecordType(record);
66
67 mPartNum = 0;
68 string recordPart = record.GtfAttributes().ValueOf("part");
69 if (recordPart.empty()) {
70 recordPart = record.GtfAttributes().ValueOf("exon_number");
71 }
72 try {
73 mPartNum = NStr::StringToInt(recordPart);
74 }
75 catch (CStringException&) {
76 //mPartNum = 0;
77 }
78 }
79
80 // ----------------------------------------------------------------------------
CGtfLocationRecord(const CGtfLocationRecord & other)81 CGtfLocationRecord::CGtfLocationRecord(
82 const CGtfLocationRecord& other)
83 // ----------------------------------------------------------------------------
84 {
85 mId.Assign(other.mId);
86 mStart = other.mStart;
87 mStop = other.mStop;
88 mStrand = other.mStrand;
89 mType = other.mType;
90 mPartNum = other.mPartNum;
91 }
92
93 // ----------------------------------------------------------------------------
94 CGtfLocationRecord::RecordType
GetRecordType(const CGtfReadRecord & record)95 CGtfLocationRecord::GetRecordType(
96 const CGtfReadRecord& record)
97 // ----------------------------------------------------------------------------
98 {
99 auto recType = record.NormalizedType();
100 auto typeIt = msTypeOrder.find(recType);
101 if (typeIt == msTypeOrder.end()) {
102 return TYPE_unspecified;
103 }
104 return typeIt->second;
105 }
106
107 // ----------------------------------------------------------------------------
108 CGtfLocationRecord&
operator =(const CGtfLocationRecord & other)109 CGtfLocationRecord::operator=(
110 const CGtfLocationRecord& other)
111 // ----------------------------------------------------------------------------
112 {
113 mId.Assign(other.mId);
114 mStart = other.mStart;
115 mStop = other.mStop;
116 mStrand = other.mStrand;
117 mType = other.mType;
118 mPartNum = other.mPartNum;
119 return *this;
120 }
121
122 // ----------------------------------------------------------------------------
123 bool
Contains(const CGtfLocationRecord & other) const124 CGtfLocationRecord::Contains(
125 const CGtfLocationRecord& other) const
126 // ----------------------------------------------------------------------------
127 {
128 if (mStrand != other.mStrand) {
129 return false;
130 }
131 return ((mStart <= other.mStart) && (mStop >= other.mStop));
132 }
133
134 // ----------------------------------------------------------------------------
135 bool
IsContainedBy(const CGtfLocationRecord & other) const136 CGtfLocationRecord::IsContainedBy(
137 const CGtfLocationRecord& other) const
138 // ----------------------------------------------------------------------------
139 {
140 return other.Contains(*this);
141 }
142
143 // ----------------------------------------------------------------------------
144 CRef<CSeq_loc>
GetLocation()145 CGtfLocationRecord::GetLocation()
146 // ----------------------------------------------------------------------------
147 {
148 CRef<CSeq_loc> pLocation(new CSeq_loc);
149 CRef<CSeq_interval> pInterval(new CSeq_interval);
150 pInterval->SetId().Assign(mId);
151 pInterval->SetFrom(mStart);
152 pInterval->SetTo(mStop);
153 pInterval->SetStrand(mStrand);
154 pLocation->SetInt(*pInterval);
155 return pLocation;
156 }
157
158 // -----------------------------------------------------------------------------
CompareTypeAndPartNumbers(const CGtfLocationRecord & lhs,const CGtfLocationRecord & rhs)159 bool CGtfLocationRecord::CompareTypeAndPartNumbers(
160 const CGtfLocationRecord& lhs,
161 const CGtfLocationRecord& rhs)
162 // ----------------------------------------------------------------------------
163 {
164 if (lhs.mType == rhs.mType) {
165 return (lhs.mPartNum < rhs.mPartNum);
166 }
167 return (lhs.mType < rhs.mType);
168 };
169
170
171 // ============================================================================
CGtfLocationMerger(unsigned int flags,CGff3ReadRecord::SeqIdResolver idResolver)172 CGtfLocationMerger::CGtfLocationMerger(
173 unsigned int flags,
174 CGff3ReadRecord::SeqIdResolver idResolver):
175 // ============================================================================
176 mFlags(flags),
177 mIdResolver(idResolver)
178 {
179 }
180
181 // ============================================================================
182 string
GetFeatureIdFor(const CGtfReadRecord & record,const string & prefixOverride)183 CGtfLocationMerger::GetFeatureIdFor(
184 const CGtfReadRecord& record,
185 const string& prefixOverride)
186 // ============================================================================
187 {
188 static const list<string> cdsTypes{ "start_codon", "stop_codon", "cds" };
189 static const list<string> transcriptTypes = { "5utr", "3utr", "exon",
190 "initial", "internal", "terminal", "single" };
191
192 auto prefix = prefixOverride;
193 if (prefixOverride.empty()) {
194 prefix = record.NormalizedType();
195 if (std::find(cdsTypes.begin(), cdsTypes.end(), prefix) != cdsTypes.end()) {
196 prefix = "cds";
197 }
198 else if (std::find(transcriptTypes.begin(), transcriptTypes.end(), prefix)
199 != transcriptTypes.end()) {
200 prefix = "transcript";
201 }
202 }
203 if (prefix == "gene") {
204 return (prefix + ":" + record.GeneKey());
205 }
206 return (prefix + ":" + record.FeatureKey());
207 }
208
209 // ============================================================================
210 void
AddRecord(const CGtfReadRecord & record)211 CGtfLocationMerger::AddRecord(
212 const CGtfReadRecord& record)
213 // ============================================================================
214 {
215 AddRecordForId(GetFeatureIdFor(record), record);
216 }
217
218 // ============================================================================
219 void
AddRecordForId(const string & id,const CGtfReadRecord & record)220 CGtfLocationMerger::AddRecordForId(
221 const string& id,
222 const CGtfReadRecord& record)
223 // ============================================================================
224 {
225 auto existingEntry = mMapIdToLocations.find(id);
226 if (existingEntry == mMapIdToLocations.end()) {
227 existingEntry = mMapIdToLocations.emplace(id, LOCATIONS()).first;
228 }
229
230 CGtfLocationRecord location(record, mFlags, mIdResolver);
231 auto& existingRecords = existingEntry->second;
232 for (auto& record: existingRecords) {
233 if (record.Contains(location)) {
234 if (location.mType == CGtfLocationRecord::TYPE_start_codon) {
235 record.mType = CGtfLocationRecord::TYPE_start_codon;
236 record.mPartNum = location.mPartNum;
237 }
238 return;
239 }
240 if (record.IsContainedBy(location)) {
241 if (record.mType == CGtfLocationRecord::TYPE_start_codon) {
242 location.mType = CGtfLocationRecord::TYPE_start_codon;
243 location.mPartNum = record.mPartNum;
244 }
245 record = location;
246 return;
247 }
248 }
249 existingEntry->second.push_back(location);
250 }
251
252 // ============================================================================
253 void
AddStubForId(const string & id)254 CGtfLocationMerger::AddStubForId(
255 const string& id)
256 // ============================================================================
257 {
258 auto existingEntry = mMapIdToLocations.find(id);
259 if (existingEntry == mMapIdToLocations.end()) {
260 return;
261 }
262 mMapIdToLocations.emplace(id, LOCATIONS());
263 }
264
265
266 // ============================================================================
267 CRef<CSeq_loc>
MergeLocation(CSeqFeatData::ESubtype subType,LOCATIONS & locations)268 CGtfLocationMerger::MergeLocation(
269 CSeqFeatData::ESubtype subType,
270 LOCATIONS& locations)
271 // ============================================================================
272 {
273 if (locations.empty()) {
274 CRef<CSeq_loc> pSeqloc(new CSeq_loc);
275 pSeqloc->SetNull();
276 return pSeqloc;
277 }
278
279 switch(subType) {
280 default: {
281 return MergeLocationDefault(locations);
282 }
283 case CSeqFeatData::eSubtype_cdregion:
284 return MergeLocationForCds(locations);
285 case CSeqFeatData::eSubtype_mRNA:
286 return MergeLocationForTranscript(locations);
287 case CSeqFeatData::eSubtype_gene:
288 return MergeLocationForGene(locations);
289 }
290 }
291
292
293
294 // ============================================================================
295 CRef<CSeq_loc>
MergeLocationDefault(LOCATIONS & locations)296 CGtfLocationMerger::MergeLocationDefault(
297 LOCATIONS& locations)
298 // ============================================================================
299 {
300 NCBI_ASSERT(!locations.empty(),
301 "Cannot call MergeLocationDefault with empty location");
302 CRef<CSeq_loc> pSeqloc(new CSeq_loc);
303 if (locations.size() == 1) {
304 auto& onlyOne = locations.front();
305 pSeqloc = onlyOne.GetLocation();
306 return pSeqloc;
307 }
308 locations.sort(CGtfLocationRecord::CompareTypeAndPartNumbers);
309
310 auto& mix = pSeqloc->SetMix();
311 for (auto& location: locations) {
312 mix.AddSeqLoc(*location.GetLocation());
313 }
314 return pSeqloc;
315 }
316
317 // ============================================================================
318 CRef<CSeq_loc>
MergeLocationForCds(LOCATIONS & locations)319 CGtfLocationMerger::MergeLocationForCds(
320 LOCATIONS& locations)
321 // ============================================================================
322 {
323 NCBI_ASSERT(!locations.empty(),
324 "Cannot call MergeLocationForCds with empty location");
325
326 locations.sort(CGtfLocationRecord::CompareTypeAndPartNumbers);
327 CRef<CSeq_loc> pSeqloc(new CSeq_loc);
328 auto& mix = pSeqloc->SetMix();
329 for (auto& location: locations) {
330 mix.AddSeqLoc(*location.GetLocation());
331 }
332 pSeqloc = pSeqloc->Merge(CSeq_loc::fMerge_All, 0);
333 return pSeqloc;
334 }
335
336 // ============================================================================
337 CRef<CSeq_loc>
MergeLocationForGene(LOCATIONS & locations)338 CGtfLocationMerger::MergeLocationForGene(
339 LOCATIONS& locations)
340 // ============================================================================
341 {
342 NCBI_ASSERT(!locations.empty(),
343 "Cannot call MergeLocationForGene with empty location");
344
345 CRef<CSeq_loc> pSeqloc = MergeLocationDefault(locations);
346 if (pSeqloc->IsInt()) {
347 return pSeqloc;
348 }
349
350 pSeqloc->ChangeToPackedInt();
351 auto seqlocIntervals = pSeqloc->GetPacked_int().Get();
352
353 CRef<CSeq_id> pRangeId(new CSeq_id);
354 pRangeId->Assign(*pSeqloc->GetId());
355 auto rangeStart = pSeqloc->GetStart(eExtreme_Biological);
356 auto rangeStop = pSeqloc->GetStop(eExtreme_Biological);
357 auto rangeStrand = eNa_strand_plus;
358 if (pSeqloc->IsSetStrand()) {
359 rangeStrand = pSeqloc->GetStrand();
360 }
361
362 if (rangeStrand == eNa_strand_minus) {
363 if (rangeStart >= rangeStop) {
364 CRef<CSeq_interval> pOverlap(
365 new CSeq_interval(*pRangeId, rangeStop, rangeStart, rangeStrand));
366 pSeqloc->SetInt(*pOverlap);
367 }
368 else {
369 CRef<CSeq_loc_mix> pMix(new CSeq_loc_mix);
370 auto it = seqlocIntervals.begin();
371 auto oldFrom = (*it)->GetFrom();
372 auto oldTo = (*it)->GetTo();
373
374 for (it++; it != seqlocIntervals.end(); it++) {
375 auto newFrom = (*it)->GetFrom();
376 auto newTo = (*it)->GetTo();
377 if (newTo >= oldFrom) {
378 oldFrom = newFrom;
379 }
380 else {
381 pMix->AddInterval(*pRangeId, 0, oldTo, rangeStrand);
382 oldFrom = newFrom, oldTo = newTo;
383 }
384 }
385 if (oldFrom > oldTo) {
386 return MergeLocationForTranscript(locations);
387 }
388 pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
389 pSeqloc->SetMix(*pMix);
390 }
391 }
392 else {
393 if (rangeStart <= rangeStop) {
394 CRef<CSeq_interval> pOverlap(
395 new CSeq_interval(*pRangeId, rangeStart, rangeStop, rangeStrand));
396 pSeqloc->SetInt(*pOverlap);
397 }
398 else {
399 CRef<CSeq_loc_mix> pMix(new CSeq_loc_mix);
400 auto it = seqlocIntervals.begin();
401 auto oldFrom = (*it)->GetFrom();
402 auto oldTo = (*it)->GetTo();
403 for (it++; it != seqlocIntervals.end(); it++) {
404 auto newFrom = (*it)->GetFrom();
405 auto newTo = (*it)->GetTo();
406 if (newFrom >= oldTo) {
407 oldTo = newTo;
408 }
409 else {
410 pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
411 oldFrom = 0, oldTo = newTo;
412 }
413 }
414 if (oldFrom > oldTo) {
415 return MergeLocationForTranscript(locations);
416 }
417 pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
418 pSeqloc->SetMix(*pMix);
419 }
420 }
421 return pSeqloc;
422 }
423
424 // ============================================================================
425 CRef<CSeq_loc>
MergeLocationForTranscript(LOCATIONS & locations)426 CGtfLocationMerger::MergeLocationForTranscript(
427 LOCATIONS& locations)
428 // ============================================================================
429 {
430 return MergeLocationDefault(locations);
431 }
432
433 END_SCOPE(objects)
434 END_NCBI_SCOPE
435