1 /*
2  * $Id: gtf_location_merger.cpp 632531 2021-06-02 17:25:37Z ivanov $
3  *
4  * ===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  * Authors:  Frank Ludwig
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistr.hpp>
34 #include <objects/seqloc/Seq_interval.hpp>
35 #include "gtf_location_merger.hpp"
36 
37 BEGIN_NCBI_SCOPE
38 BEGIN_SCOPE(objects);
39 
40 CGtfLocationRecord::TYPEORDER_MAP CGtfLocationRecord::msTypeOrder = {
41     {"start_codon", TYPE_start_codon},
42     {"cds",         TYPE_cds},
43     {"stop_codon",  TYPE_stop_codon},
44 
45     {"5utr",        TYPE_5utr},
46     {"initial",     TYPE_initial},
47     {"exon",        TYPE_exon},
48     {"single",      TYPE_exon},
49     {"internal",    TYPE_exon},
50     {"terminal",    TYPE_terminal},
51     {"3utr",        TYPE_3utr},
52 };
53 
54 //  ----------------------------------------------------------------------------
CGtfLocationRecord(const CGtfReadRecord & record,unsigned int flags,CGff3ReadRecord::SeqIdResolver seqIdResolve)55 CGtfLocationRecord::CGtfLocationRecord(
56     const CGtfReadRecord& record,
57     unsigned int flags,
58     CGff3ReadRecord::SeqIdResolver seqIdResolve)
59 //  ----------------------------------------------------------------------------
60 {
61     mId.Assign(*record.GetSeqId(flags, seqIdResolve));
62     mStart = static_cast<TSeqPos>(record.SeqStart());
63     mStop  = static_cast<TSeqPos>(record.SeqStop());
64     mStrand = (record.IsSetStrand() ? record.Strand() : eNa_strand_plus);
65     mType = GetRecordType(record);
66 
67     mPartNum = 0;
68     string recordPart = record.GtfAttributes().ValueOf("part");
69     if (recordPart.empty()) {
70         recordPart = record.GtfAttributes().ValueOf("exon_number");
71     }
72     try {
73         mPartNum = NStr::StringToInt(recordPart);
74     }
75     catch (CStringException&) {
76         //mPartNum = 0;
77     }
78 }
79 
80 //  ----------------------------------------------------------------------------
CGtfLocationRecord(const CGtfLocationRecord & other)81 CGtfLocationRecord::CGtfLocationRecord(
82     const CGtfLocationRecord& other)
83 //  ----------------------------------------------------------------------------
84 {
85     mId.Assign(other.mId);
86     mStart = other.mStart;
87     mStop = other.mStop;
88     mStrand = other.mStrand;
89     mType = other.mType;
90     mPartNum = other.mPartNum;
91 }
92 
93 //  ----------------------------------------------------------------------------
94 CGtfLocationRecord::RecordType
GetRecordType(const CGtfReadRecord & record)95 CGtfLocationRecord::GetRecordType(
96     const CGtfReadRecord& record)
97 //  ----------------------------------------------------------------------------
98 {
99     auto recType = record.NormalizedType();
100     auto typeIt = msTypeOrder.find(recType);
101     if (typeIt == msTypeOrder.end()) {
102         return TYPE_unspecified;
103     }
104     return typeIt->second;
105 }
106 
107 //  ----------------------------------------------------------------------------
108 CGtfLocationRecord&
operator =(const CGtfLocationRecord & other)109 CGtfLocationRecord::operator=(
110     const CGtfLocationRecord& other)
111 //  ----------------------------------------------------------------------------
112 {
113     mId.Assign(other.mId);
114     mStart = other.mStart;
115     mStop = other.mStop;
116     mStrand = other.mStrand;
117     mType = other.mType;
118     mPartNum = other.mPartNum;
119     return *this;
120 }
121 
122 //  ----------------------------------------------------------------------------
123 bool
Contains(const CGtfLocationRecord & other) const124 CGtfLocationRecord::Contains(
125     const CGtfLocationRecord& other) const
126 //  ----------------------------------------------------------------------------
127 {
128     if (mStrand != other.mStrand) {
129         return false;
130     }
131     return ((mStart <= other.mStart)  &&  (mStop >= other.mStop));
132 }
133 
134 //  ----------------------------------------------------------------------------
135 bool
IsContainedBy(const CGtfLocationRecord & other) const136 CGtfLocationRecord::IsContainedBy(
137     const CGtfLocationRecord& other) const
138 //  ----------------------------------------------------------------------------
139 {
140     return other.Contains(*this);
141 }
142 
143 //  ----------------------------------------------------------------------------
144 CRef<CSeq_loc>
GetLocation()145 CGtfLocationRecord::GetLocation()
146 //  ----------------------------------------------------------------------------
147 {
148     CRef<CSeq_loc> pLocation(new CSeq_loc);
149     CRef<CSeq_interval> pInterval(new CSeq_interval);
150     pInterval->SetId().Assign(mId);
151     pInterval->SetFrom(mStart);
152     pInterval->SetTo(mStop);
153     pInterval->SetStrand(mStrand);
154     pLocation->SetInt(*pInterval);
155     return pLocation;
156 }
157 
158 //  -----------------------------------------------------------------------------
CompareTypeAndPartNumbers(const CGtfLocationRecord & lhs,const CGtfLocationRecord & rhs)159 bool CGtfLocationRecord::CompareTypeAndPartNumbers(
160     const CGtfLocationRecord& lhs,
161     const CGtfLocationRecord& rhs)
162 //  ----------------------------------------------------------------------------
163 {
164     if (lhs.mType == rhs.mType) {
165         return (lhs.mPartNum < rhs.mPartNum);
166     }
167     return (lhs.mType < rhs.mType);
168 };
169 
170 
171 //  ============================================================================
CGtfLocationMerger(unsigned int flags,CGff3ReadRecord::SeqIdResolver idResolver)172 CGtfLocationMerger::CGtfLocationMerger(
173     unsigned int flags,
174     CGff3ReadRecord::SeqIdResolver idResolver):
175 //  ============================================================================
176     mFlags(flags),
177     mIdResolver(idResolver)
178 {
179 }
180 
181 //  ============================================================================
182 string
GetFeatureIdFor(const CGtfReadRecord & record,const string & prefixOverride)183 CGtfLocationMerger::GetFeatureIdFor(
184     const CGtfReadRecord& record,
185     const string& prefixOverride)
186 //  ============================================================================
187 {
188     static const list<string> cdsTypes{ "start_codon", "stop_codon", "cds" };
189     static const list<string> transcriptTypes = { "5utr", "3utr", "exon",
190         "initial", "internal", "terminal", "single" };
191 
192     auto prefix = prefixOverride;
193     if (prefixOverride.empty()) {
194         prefix = record.NormalizedType();
195         if (std::find(cdsTypes.begin(), cdsTypes.end(), prefix) != cdsTypes.end()) {
196             prefix = "cds";
197         }
198         else if (std::find(transcriptTypes.begin(), transcriptTypes.end(), prefix)
199                 != transcriptTypes.end()) {
200             prefix = "transcript";
201         }
202     }
203     if (prefix == "gene") {
204         return (prefix + ":" + record.GeneKey());
205     }
206     return (prefix + ":" + record.FeatureKey());
207 }
208 
209 //  ============================================================================
210 void
AddRecord(const CGtfReadRecord & record)211 CGtfLocationMerger::AddRecord(
212     const CGtfReadRecord& record)
213 //  ============================================================================
214 {
215     AddRecordForId(GetFeatureIdFor(record), record);
216 }
217 
218 //  ============================================================================
219 void
AddRecordForId(const string & id,const CGtfReadRecord & record)220 CGtfLocationMerger::AddRecordForId(
221     const string& id,
222     const CGtfReadRecord& record)
223 //  ============================================================================
224 {
225     auto existingEntry = mMapIdToLocations.find(id);
226     if (existingEntry == mMapIdToLocations.end()) {
227         existingEntry = mMapIdToLocations.emplace(id, LOCATIONS()).first;
228     }
229 
230     CGtfLocationRecord location(record, mFlags, mIdResolver);
231     auto& existingRecords = existingEntry->second;
232     for (auto& record: existingRecords) {
233         if (record.Contains(location)) {
234             if (location.mType == CGtfLocationRecord::TYPE_start_codon) {
235                 record.mType = CGtfLocationRecord::TYPE_start_codon;
236                 record.mPartNum = location.mPartNum;
237             }
238             return;
239         }
240         if (record.IsContainedBy(location)) {
241             if (record.mType == CGtfLocationRecord::TYPE_start_codon) {
242                 location.mType = CGtfLocationRecord::TYPE_start_codon;
243                 location.mPartNum = record.mPartNum;
244             }
245             record = location;
246             return;
247         }
248     }
249     existingEntry->second.push_back(location);
250 }
251 
252 //  ============================================================================
253 void
AddStubForId(const string & id)254 CGtfLocationMerger::AddStubForId(
255     const string& id)
256 //  ============================================================================
257 {
258     auto existingEntry = mMapIdToLocations.find(id);
259     if (existingEntry == mMapIdToLocations.end()) {
260         return;
261     }
262     mMapIdToLocations.emplace(id, LOCATIONS());
263 }
264 
265 
266 //  ============================================================================
267 CRef<CSeq_loc>
MergeLocation(CSeqFeatData::ESubtype subType,LOCATIONS & locations)268 CGtfLocationMerger::MergeLocation(
269     CSeqFeatData::ESubtype subType,
270     LOCATIONS& locations)
271 //  ============================================================================
272 {
273     if (locations.empty()) {
274         CRef<CSeq_loc> pSeqloc(new CSeq_loc);
275         pSeqloc->SetNull();
276         return pSeqloc;
277     }
278 
279     switch(subType) {
280         default: {
281             return MergeLocationDefault(locations);
282         }
283         case CSeqFeatData::eSubtype_cdregion:
284             return MergeLocationForCds(locations);
285         case CSeqFeatData::eSubtype_mRNA:
286             return MergeLocationForTranscript(locations);
287         case CSeqFeatData::eSubtype_gene:
288             return MergeLocationForGene(locations);
289     }
290 }
291 
292 
293 
294 //  ============================================================================
295 CRef<CSeq_loc>
MergeLocationDefault(LOCATIONS & locations)296 CGtfLocationMerger::MergeLocationDefault(
297     LOCATIONS& locations)
298 //  ============================================================================
299 {
300     NCBI_ASSERT(!locations.empty(),
301         "Cannot call MergeLocationDefault with empty location");
302     CRef<CSeq_loc> pSeqloc(new CSeq_loc);
303     if (locations.size() == 1) {
304         auto& onlyOne = locations.front();
305         pSeqloc = onlyOne.GetLocation();
306         return pSeqloc;
307     }
308     locations.sort(CGtfLocationRecord::CompareTypeAndPartNumbers);
309 
310     auto& mix = pSeqloc->SetMix();
311     for (auto& location: locations) {
312         mix.AddSeqLoc(*location.GetLocation());
313     }
314     return pSeqloc;
315 }
316 
317 //  ============================================================================
318 CRef<CSeq_loc>
MergeLocationForCds(LOCATIONS & locations)319 CGtfLocationMerger::MergeLocationForCds(
320     LOCATIONS& locations)
321 //  ============================================================================
322 {
323     NCBI_ASSERT(!locations.empty(),
324         "Cannot call MergeLocationForCds with empty location");
325 
326     locations.sort(CGtfLocationRecord::CompareTypeAndPartNumbers);
327     CRef<CSeq_loc> pSeqloc(new CSeq_loc);
328     auto& mix = pSeqloc->SetMix();
329     for (auto& location: locations) {
330         mix.AddSeqLoc(*location.GetLocation());
331     }
332     pSeqloc = pSeqloc->Merge(CSeq_loc::fMerge_All, 0);
333     return pSeqloc;
334 }
335 
336 //  ============================================================================
337 CRef<CSeq_loc>
MergeLocationForGene(LOCATIONS & locations)338 CGtfLocationMerger::MergeLocationForGene(
339     LOCATIONS& locations)
340 //  ============================================================================
341 {
342     NCBI_ASSERT(!locations.empty(),
343         "Cannot call MergeLocationForGene with empty location");
344 
345     CRef<CSeq_loc> pSeqloc = MergeLocationDefault(locations);
346     if (pSeqloc->IsInt()) {
347         return pSeqloc;
348     }
349 
350     pSeqloc->ChangeToPackedInt();
351     auto seqlocIntervals = pSeqloc->GetPacked_int().Get();
352 
353     CRef<CSeq_id> pRangeId(new CSeq_id);
354     pRangeId->Assign(*pSeqloc->GetId());
355     auto rangeStart = pSeqloc->GetStart(eExtreme_Biological);
356     auto rangeStop = pSeqloc->GetStop(eExtreme_Biological);
357     auto rangeStrand = eNa_strand_plus;
358     if (pSeqloc->IsSetStrand()) {
359         rangeStrand = pSeqloc->GetStrand();
360     }
361 
362     if (rangeStrand == eNa_strand_minus) {
363         if (rangeStart >= rangeStop) {
364             CRef<CSeq_interval> pOverlap(
365                 new CSeq_interval(*pRangeId, rangeStop, rangeStart, rangeStrand));
366             pSeqloc->SetInt(*pOverlap);
367         }
368         else {
369             CRef<CSeq_loc_mix>  pMix(new CSeq_loc_mix);
370             auto it = seqlocIntervals.begin();
371             auto oldFrom = (*it)->GetFrom();
372             auto oldTo = (*it)->GetTo();
373 
374             for (it++; it != seqlocIntervals.end(); it++) {
375                 auto newFrom = (*it)->GetFrom();
376                 auto newTo = (*it)->GetTo();
377                 if (newTo >= oldFrom) {
378                     oldFrom = newFrom;
379                 }
380                 else {
381                     pMix->AddInterval(*pRangeId, 0, oldTo, rangeStrand);
382                     oldFrom = newFrom, oldTo = newTo;
383                 }
384             }
385             if (oldFrom > oldTo) {
386                 return MergeLocationForTranscript(locations);
387             }
388             pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
389             pSeqloc->SetMix(*pMix);
390         }
391     }
392     else {
393         if (rangeStart <= rangeStop) {
394             CRef<CSeq_interval> pOverlap(
395                 new CSeq_interval(*pRangeId, rangeStart, rangeStop, rangeStrand));
396             pSeqloc->SetInt(*pOverlap);
397         }
398         else {
399             CRef<CSeq_loc_mix>  pMix(new CSeq_loc_mix);
400             auto it = seqlocIntervals.begin();
401             auto oldFrom = (*it)->GetFrom();
402             auto oldTo = (*it)->GetTo();
403             for (it++; it != seqlocIntervals.end(); it++) {
404                 auto newFrom = (*it)->GetFrom();
405                 auto newTo = (*it)->GetTo();
406                 if (newFrom >= oldTo) {
407                     oldTo = newTo;
408                 }
409                 else {
410                     pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
411                     oldFrom = 0, oldTo = newTo;
412                 }
413             }
414             if (oldFrom > oldTo) {
415                 return MergeLocationForTranscript(locations);
416             }
417             pMix->AddInterval(*pRangeId, oldFrom, oldTo, rangeStrand);
418             pSeqloc->SetMix(*pMix);
419         }
420     }
421     return pSeqloc;
422 }
423 
424 //  ============================================================================
425 CRef<CSeq_loc>
MergeLocationForTranscript(LOCATIONS & locations)426 CGtfLocationMerger::MergeLocationForTranscript(
427     LOCATIONS& locations)
428 //  ============================================================================
429 {
430     return MergeLocationDefault(locations);
431 }
432 
433 END_SCOPE(objects)
434 END_NCBI_SCOPE
435