1 /* $Id: sequence.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Clifford Clausen
27 *
28 * File Description:
29 * Sequence utilities requiring CScope
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <serial/iterator.hpp>
34 #include <util/static_map.hpp>
35
36 #include <objmgr/object_manager.hpp>
37 #include <objmgr/scope.hpp>
38 #include <objmgr/seq_vector.hpp>
39 #include <objmgr/seq_vector_ci.hpp>
40 #include <objmgr/seqdesc_ci.hpp>
41 #include <objmgr/feat_ci.hpp>
42 #include <objmgr/bioseq_ci.hpp>
43 #include <objmgr/seq_entry_handle.hpp>
44 #include <objmgr/impl/handle_range_map.hpp>
45 #include <objmgr/impl/synonyms.hpp>
46 #include <objmgr/util/seq_loc_util.hpp>
47 #include <objmgr/util/create_defline.hpp>
48
49 #include <objects/general/Int_fuzz.hpp>
50 #include <objects/general/Dbtag.hpp>
51 #include <objects/general/Object_id.hpp>
52 #include <objects/general/User_object.hpp>
53 #include <objects/general/User_field.hpp>
54 #include <objects/general/Date.hpp>
55 #include <objects/general/general_macros.hpp>
56
57 #include <objects/misc/sequence_util_macros.hpp>
58
59 #include <objects/seq/Bioseq.hpp>
60 #include <objects/seq/Delta_ext.hpp>
61 #include <objects/seq/Delta_seq.hpp>
62 #include <objects/seq/Linkage_evidence.hpp>
63 #include <objects/seq/MolInfo.hpp>
64 #include <objects/seq/Seg_ext.hpp>
65 #include <objects/seq/Seq_ext.hpp>
66 #include <objects/seq/Seq_gap.hpp>
67 #include <objects/seq/Seq_inst.hpp>
68 #include <objects/seq/Seq_literal.hpp>
69 #include <objects/seq/seqport_util.hpp>
70 #include <objects/seq/Seq_hist.hpp>
71 #include <objects/seq/Seq_hist_rec.hpp>
72 #include <objects/seq/seq_macros.hpp>
73
74 #include <objects/seqloc/Packed_seqpnt.hpp>
75 #include <objects/seqloc/Seq_bond.hpp>
76 #include <objects/seqloc/Seq_id.hpp>
77 #include <objects/seqloc/Seq_interval.hpp>
78 #include <objects/seqloc/Seq_loc.hpp>
79 #include <objects/seqloc/Seq_loc_equiv.hpp>
80 #include <objects/seqloc/Seq_loc_mix.hpp>
81 #include <objects/seqloc/Seq_point.hpp>
82
83 #include <objects/seqset/Seq_entry.hpp>
84
85 #include <objects/seqfeat/seqfeat__.hpp>
86
87 #include <objmgr/seq_loc_mapper.hpp>
88 #include <objmgr/seq_entry_ci.hpp>
89 #include <objmgr/util/sequence.hpp>
90 #include <objmgr/error_codes.hpp>
91 #include <util/strsearch.hpp>
92
93 #include <list>
94 #include <algorithm>
95
96
97 #define NCBI_USE_ERRCODE_X ObjMgr_SeqUtil
98
99 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)100 BEGIN_SCOPE(objects)
101 BEGIN_SCOPE(sequence)
102
103
104 const CBioSource* GetBioSource(const CBioseq& bioseq)
105 {
106 ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
107 {
108 if ((**it).IsSource())
109 return &(**it).GetSource();
110 }
111
112 return NULL;
113 }
114
GetBioSource(const CBioseq_Handle & handle)115 const CBioSource* GetBioSource(const CBioseq_Handle& handle)
116 {
117 {{
118 CSeqdesc_CI desc(handle, CSeqdesc::e_Source);
119 if (desc) {
120 return &desc->GetSource();
121 }
122 }}
123 {{
124 CSeqdesc_CI desc(handle.GetTopLevelEntry(), CSeqdesc::e_Source);
125 if (desc) {
126 return &desc->GetSource();
127 }
128 }}
129
130 return NULL;
131 }
132
133
GetOrg_refOrNull(const CBioseq_Handle & handle)134 const COrg_ref* GetOrg_refOrNull(const CBioseq_Handle& handle)
135 {
136 vector<CSeqdesc::E_Choice> types;
137 types.push_back(CSeqdesc::e_Source);
138 types.push_back(CSeqdesc::e_Org);
139 CSeqdesc_CI desc_it(handle, types);
140 if ( desc_it ) {
141 const CSeqdesc& desc = *desc_it;
142 if ( desc.IsSource() ) {
143 return &desc.GetSource().GetOrg();
144 }
145 if ( desc.IsOrg() ) {
146 return &desc.GetOrg();
147 }
148 }
149 return 0;
150 }
151
152
GetOrg_ref(const CBioseq_Handle & handle)153 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle)
154 {
155 const COrg_ref* org_ref = GetOrg_refOrNull(handle);
156 if ( org_ref ) {
157 return *org_ref;
158 }
159 NCBI_THROW(CException, eUnknown, "No organism set");
160 }
161
162
GetTaxId(const CBioseq_Handle & handle)163 TTaxId GetTaxId(const CBioseq_Handle& handle)
164 {
165 const COrg_ref* org_ref = GetOrg_refOrNull(handle);
166 if ( org_ref ) {
167 return org_ref->GetTaxId();
168 }
169 return ZERO_TAX_ID;
170 }
171
172
GetMolInfo(const CBioseq & bioseq)173 const CMolInfo* GetMolInfo(const CBioseq& bioseq)
174 {
175 ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
176 {
177 if ((**it).IsMolinfo())
178 return &(**it).GetMolinfo();
179 }
180 return NULL;
181 }
182
183
GetMolInfo(const CBioseq_Handle & handle)184 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle)
185 {
186 CSeqdesc_CI desc_iter(handle, CSeqdesc::e_Molinfo);
187 for ( ; desc_iter; ++desc_iter) {
188 return &desc_iter->GetMolinfo();
189 }
190
191 return NULL;
192 }
193
194
195
GetBioseqFromSeqLoc(const CSeq_loc & loc,CScope & scope,CScope::EGetBioseqFlag flag)196 CBioseq_Handle GetBioseqFromSeqLoc
197 (const CSeq_loc& loc,
198 CScope& scope,
199 CScope::EGetBioseqFlag flag)
200 {
201 CBioseq_Handle retval;
202
203 try {
204 if (IsOneBioseq(loc, &scope)) {
205 return scope.GetBioseqHandle(GetId(loc, &scope), flag);
206 }
207
208 // assuming location is annotated on parts of a segmented bioseq
209 for (CSeq_loc_CI it(loc); it; ++it) {
210 CBioseq_Handle part = scope.GetBioseqHandle(it.GetSeq_id(), flag);
211 if (part) {
212 retval = GetParentForPart(part);
213 }
214 break; // check only the first part
215 }
216
217 // if multiple intervals and not parts, look for the first loaded bioseq
218 if (!retval) {
219 for (CSeq_loc_CI it(loc); it; ++it) {
220 retval =
221 scope.GetBioseqHandle(it.GetSeq_id_Handle(), CScope::eGetBioseq_Loaded);
222 if (retval) {
223 break;
224 }
225 }
226 }
227
228 if (!retval && flag == CScope::eGetBioseq_All) {
229 for (CSeq_loc_CI it(loc); it; ++it) {
230 retval =
231 scope.GetBioseqHandle(it.GetSeq_id_Handle(), flag);
232 if (retval) {
233 break;
234 }
235 }
236 }
237 } catch (exception&) {
238 retval.Reset();
239 }
240
241 return retval;
242 }
243
244
GetProteinName(const CBioseq_Handle & seq)245 string GetProteinName(const CBioseq_Handle& seq)
246 {
247 if ( !seq ) {
248 NCBI_THROW(CObjMgrException, eInvalidHandle,
249 "GetProteinName: "
250 "null handle");
251 }
252 if ( !seq.IsProtein() ) {
253 NCBI_THROW_FMT(CObjmgrUtilException, eBadSequenceType,
254 "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
255 "the sequence is not a protein");
256 }
257 TSeqPos seq_length = seq.GetBioseqLength();
258 TSeqPos best_length = 0;
259 vector<CMappedFeat> best_feats;
260 for ( CFeat_CI it(seq, CSeqFeatData::e_Prot); it; ++it ) {
261 COpenRange<TSeqPos> range = it->GetRange();
262 if ( range.GetToOpen() > seq_length ) {
263 range.SetToOpen(seq_length);
264 }
265 TSeqPos length = range.GetLength();
266 if ( length > best_length ) {
267 best_length = length;
268 best_feats.clear();
269 }
270 if ( length == best_length ) {
271 best_feats.push_back(*it);
272 }
273 }
274 if ( best_feats.empty() ) {
275 NCBI_THROW_FMT(CObjMgrException, eFindFailed,
276 "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
277 "the sequence does't have prot feature");
278 }
279 if ( best_feats.size() > 1 ) {
280 NCBI_THROW_FMT(CObjMgrException, eFindConflict,
281 "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
282 "the sequence have ambiguous prot feature");
283 }
284 string ret;
285 best_feats[0].GetData().GetProt().GetLabel(&ret);
286 if ( ret.empty() ) {
287 NCBI_THROW_FMT(CObjmgrUtilException, eBadFeature,
288 "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
289 "the prot feature doesn't return name");
290 }
291 return ret;
292 }
293
294
GetErrCodeString(void) const295 const char* CSeqIdFromHandleException::GetErrCodeString(void) const
296 {
297 switch (GetErrCode()) {
298 case eNoSynonyms: return "eNoSynonyms";
299 case eRequestedIdNotFound: return "eRequestedIdNotFound";
300 default: return CException::GetErrCodeString();
301 }
302 }
303
304
Score_SeqIdHandle(const CSeq_id_Handle & idh)305 int Score_SeqIdHandle(const CSeq_id_Handle& idh)
306 {
307 CConstRef<CSeq_id> id = idh.GetSeqId();
308 CRef<CSeq_id> id_non_const
309 (const_cast<CSeq_id*>(id.GetPointer()));
310 return CSeq_id::Score(id_non_const);
311 }
312
313
BestRank_SeqIdHandle(const CSeq_id_Handle & idh)314 int BestRank_SeqIdHandle(const CSeq_id_Handle& idh)
315 {
316 CConstRef<CSeq_id> id = idh.GetSeqId();
317 CRef<CSeq_id> id_non_const
318 (const_cast<CSeq_id*>(id.GetPointer()));
319 return CSeq_id::BestRank(id_non_const);
320 }
321
322
WorstRank_SeqIdHandle(const CSeq_id_Handle & idh)323 int WorstRank_SeqIdHandle(const CSeq_id_Handle& idh)
324 {
325 CConstRef<CSeq_id> id = idh.GetSeqId();
326 CRef<CSeq_id> id_non_const
327 (const_cast<CSeq_id*>(id.GetPointer()));
328 return CSeq_id::WorstRank(id_non_const);
329 }
330
331
FastaAARank_SeqIdHandle(const CSeq_id_Handle & idh)332 int FastaAARank_SeqIdHandle(const CSeq_id_Handle& idh)
333 {
334 CConstRef<CSeq_id> id = idh.GetSeqId();
335 CRef<CSeq_id> id_non_const
336 (const_cast<CSeq_id*>(id.GetPointer()));
337 return CSeq_id::FastaAARank(id_non_const);
338 }
339
340
FastaNARank_SeqIdHandle(const CSeq_id_Handle & idh)341 int FastaNARank_SeqIdHandle(const CSeq_id_Handle& idh)
342 {
343 CConstRef<CSeq_id> id = idh.GetSeqId();
344 CRef<CSeq_id> id_non_const
345 (const_cast<CSeq_id*>(id.GetPointer()));
346 return CSeq_id::FastaNARank(id_non_const);
347 }
348
349
350
x_GetId(const CScope::TIds & ids,EGetIdType type)351 CSeq_id_Handle x_GetId(const CScope::TIds& ids, EGetIdType type)
352 {
353 if ( ids.empty() ) {
354 return CSeq_id_Handle();
355 }
356
357 switch ( (type & eGetId_TypeMask) ) {
358 case eGetId_ForceGi:
359 if ( !CSeq_id::AvoidGi() ) {
360 ITERATE (CScope::TIds, iter, ids) {
361 if (iter->IsGi()) {
362 return *iter;
363 }
364 }
365 }
366 if ((type & eGetId_ThrowOnError) != 0) {
367 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
368 "sequence::GetId(): gi seq-id not found in the list");
369 }
370 break;
371
372 case eGetId_ForceAcc:
373 {{
374 CSeq_id_Handle best = x_GetId(ids, eGetId_Best);
375 if (best &&
376 best.GetSeqId()->GetTextseq_Id() != NULL &&
377 best.GetSeqId()->GetTextseq_Id()->IsSetAccession()) {
378 return best;
379 }
380 }}
381 if ((type & eGetId_ThrowOnError) != 0) {
382 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
383 "sequence::GetId(): text seq-id not found in the list");
384 }
385 break;
386
387 case eGetId_Best:
388 {{
389 return FindBestChoice(ids, Score_SeqIdHandle);
390 }}
391
392 case eGetId_Seq_id_Score:
393 {{
394 return FindBestChoice(ids, Score_SeqIdHandle);
395 }}
396
397 case eGetId_Seq_id_BestRank:
398 {{
399 return FindBestChoice(ids, BestRank_SeqIdHandle);
400 }}
401
402 case eGetId_Seq_id_WorstRank:
403 {{
404 return FindBestChoice(ids, WorstRank_SeqIdHandle);
405 }}
406
407 case eGetId_Seq_id_FastaAARank:
408 {{
409 return FindBestChoice(ids, FastaAARank_SeqIdHandle);
410 }}
411
412 case eGetId_Seq_id_FastaNARank:
413 {{
414 return FindBestChoice(ids, FastaNARank_SeqIdHandle);
415 }}
416
417 default:
418 break;
419 }
420 return CSeq_id_Handle();
421 }
422
423
GetId(const CBioseq & seq,EGetIdType type)424 CSeq_id_Handle GetId(const CBioseq& seq, EGetIdType type)
425 {
426 return GetId(seq.GetId(), type);
427 }
428
429
GetId(const CBioseq::TId & ids_in,EGetIdType type)430 CSeq_id_Handle GetId(const CBioseq::TId& ids_in, EGetIdType type)
431 {
432 CScope::TIds ids;
433 ITERATE (CBioseq::TId, it, ids_in) {
434 ids.push_back(CSeq_id_Handle::GetHandle(**it));
435 }
436
437 return x_GetId(ids, type);
438 }
439
440
GetId(const CSeq_id & id,CScope & scope,EGetIdType type)441 CSeq_id_Handle GetId(const CSeq_id& id, CScope& scope, EGetIdType type)
442 {
443 return GetId(CSeq_id_Handle::GetHandle(id), scope, type);
444 }
445
446
GetId(const CSeq_id_Handle & idh,CScope & scope,EGetIdType type)447 CSeq_id_Handle GetId(const CSeq_id_Handle& idh, CScope& scope,
448 EGetIdType type)
449 {
450 CSeq_id_Handle ret;
451 if (!idh) return ret;
452 try {
453 if ( (type & eGetId_TypeMask) == eGetId_ForceGi ) {
454 if ( idh.IsGi() && (type & eGetId_VerifyId) == 0 ) {
455 return idh;
456 }
457 TGi gi = scope.GetGi(idh);
458 if (gi != ZERO_GI) {
459 ret = CSeq_id_Handle::GetGiHandle(gi);
460 }
461 }
462 else if ( (type & eGetId_TypeMask) == eGetId_Canonical) {
463 /// Short-cuts for commonly used IDs that are
464 /// known unambiguously to be canonical:
465 /// - ID/GenBank: GI
466 /// - Trace: gnl|ti|<tid> in the C++ Toolkit;
467 /// note that in the C Toolkit, the
468 /// canonical ID appears to be gnl|TRACE|<tid>.
469 /// - Short Read Archive: gnl|SRA|...
470 if (!CSeq_id::PreferAccessionOverGi() &&
471 idh.IsGi()) return idh;
472 if (idh.Which() == CSeq_id::e_General) {
473 CConstRef<CSeq_id> id = idh.GetSeqId();
474 _ASSERT(id && id->IsGeneral());
475 const CSeq_id::TGeneral::TDb& db = id->GetGeneral().GetDb();
476 if (db == "ti" || db == "SRA") return idh;
477 }
478
479 /// Fallback to retrieve IDs.
480 ret = x_GetId(scope.GetIds(idh), type);
481 if ( !ret ) {
482 /// failed to retrieve IDs
483 /// assume input is the best that we can do
484 ret = idh;
485 }
486 }
487 else if ( (type & eGetId_TypeMask) == eGetId_ForceAcc ) {
488 ret = scope.GetAccVer(idh);
489 }
490 else {
491 ret = x_GetId(scope.GetIds(idh), type);
492 }
493 }
494 catch (exception& e) {
495 ERR_POST("sequence::GetId(): exception: "<<e.what());
496 if ( (type & eGetId_ThrowOnError) != 0 ) {
497 throw;
498 }
499 ret.Reset();
500 return ret;
501 }
502 if ( !ret && (type & eGetId_ThrowOnError) != 0 ) {
503 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
504 "sequence::GetId(): seq-id not found in the scope");
505 }
506 return ret;
507 }
508
509
GetId(const CBioseq_Handle & handle,EGetIdType type)510 CSeq_id_Handle GetId(const CBioseq_Handle& handle,
511 EGetIdType type)
512 {
513 _ASSERT(handle);
514
515 const CScope::TIds& ids = handle.GetId();
516 CSeq_id_Handle idh = x_GetId(ids, type);
517
518 if ( !idh && (type & eGetId_ThrowOnError) != 0 ) {
519 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
520 "Unable to get Seq-id from handle");
521 }
522
523 return idh;
524 }
525
526
GetGiForAccession(const string & acc,CScope & scope,EGetIdType flags)527 TGi GetGiForAccession(const string& acc, CScope& scope, EGetIdType flags)
528 {
529 if ( CSeq_id::AvoidGi() ) return ZERO_GI;
530
531 // Clear throw-on-error flag
532 EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
533 try {
534 CSeq_id acc_id(acc);
535 // Get gi only if acc a real accession.
536 if ( acc_id.GetTextseq_Id() ) {
537 CSeq_id_Handle idh = GetId(acc_id, scope, get_id_flags);
538 if ( idh.IsGi() ) {
539 return idh.GetGi();
540 }
541 }
542 }
543 catch (exception& e) {
544 if ( (flags & eGetId_ThrowOnError) != 0 ) {
545 throw e;
546 }
547 return ZERO_GI;
548 }
549 if ( (flags & eGetId_ThrowOnError) != 0 ) {
550 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
551 "sequence::GetGiForAccession(): invalid seq-id type");
552 }
553 return ZERO_GI;
554 }
555
556
GetGiForId(const objects::CSeq_id & id,CScope & scope,EGetIdType flags)557 TGi GetGiForId(const objects::CSeq_id& id, CScope& scope, EGetIdType flags)
558 {
559 if ( CSeq_id::AvoidGi() ) return ZERO_GI;
560
561 // Clear throw-on-error flag
562 EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
563 CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
564 if ( idh.IsGi() ) {
565 return idh.GetGi();
566 }
567 if ( (flags & eGetId_ThrowOnError) != 0 ) {
568 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
569 "sequence::GetGiForId(): seq-id not found in the scope");
570 }
571 return ZERO_GI;
572 }
573
574
GetAccessionForGi(TGi gi,CScope & scope,EAccessionVersion use_version,EGetIdType flags)575 string GetAccessionForGi(TGi gi,
576 CScope& scope,
577 EAccessionVersion use_version,
578 EGetIdType flags)
579 {
580 // Clear throw-on-error flag
581 EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
582 bool with_version = (use_version == eWithAccessionVersion);
583
584 CSeq_id gi_id(CSeq_id::e_Gi, gi);
585 CSeq_id_Handle idh = GetId(gi_id, scope, get_id_flags);
586 if ( idh ) {
587 return idh.GetSeqId()->GetSeqIdString(with_version);
588 }
589 if ( (flags & eGetId_ThrowOnError) != 0 ) {
590 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
591 "sequence::GetAccessionForGi(): seq-id not found in the scope");
592 }
593 return kEmptyStr;
594 }
595
596
GetAccessionForId(const objects::CSeq_id & id,CScope & scope,EAccessionVersion use_version,EGetIdType flags)597 string GetAccessionForId(const objects::CSeq_id& id,
598 CScope& scope,
599 EAccessionVersion use_version,
600 EGetIdType flags)
601 {
602 // Clear throw-on-error flag
603 EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
604 bool with_version = (use_version == eWithAccessionVersion);
605
606 CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
607 if ( idh ) {
608 return idh.GetSeqId()->GetSeqIdString(with_version);
609 }
610 if ( (flags & eGetId_ThrowOnError) != 0 ) {
611 NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
612 "sequence::GetAccessionForId(): seq-id not found in the scope");
613 }
614 return kEmptyStr;
615 }
616
617
x_FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope,const CTime * tlim)618 CSeq_id_Handle x_FindLatestSequence(const CSeq_id_Handle& idh,
619 CScope& scope,
620 const CTime* tlim)
621 {
622 CBioseq_Handle h = scope.GetBioseqHandle(idh);
623 set<CSeq_id_Handle> visited;
624 CSeq_id_Handle next = idh;
625 while (h && h.IsSetInst() && h.GetInst().IsSetHist()
626 && h.GetInst().GetHist().IsSetReplaced_by()) {
627 const CSeq_hist_rec& rec = h.GetInst().GetHist().GetReplaced_by();
628
629 // Check if the next bioseq is newer than the limit.
630 if (tlim && rec.IsSetDate() &&
631 rec.GetDate().AsCTime().DiffTimeSpan(*tlim).GetSign() == ePositive) {
632 break;
633 }
634 // Make sure the list of ids is not empty
635 if ( rec.GetIds().empty() ) {
636 return CSeq_id_Handle();
637 }
638 visited.insert(next);
639 // If there are several replaced-by entries, use the first one
640 next = CSeq_id_Handle::GetHandle(
641 *h.GetInst().GetHist().GetReplaced_by().GetIds().front());
642 if (visited.find(next) != visited.end()) {
643 // Infinite recursion detected
644 return CSeq_id_Handle();
645 }
646 h = scope.GetBioseqHandle(next);
647 }
648 return h ? next : CSeq_id_Handle();
649 }
650
651
FindLatestSequence(const CSeq_id & id,CScope & scope)652 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id, CScope& scope)
653 {
654 return x_FindLatestSequence(CSeq_id_Handle::GetHandle(id),
655 scope, NULL).GetSeqId();
656 }
657
FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope)658 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh, CScope& scope)
659 {
660 return x_FindLatestSequence(idh, scope, NULL);
661 }
662
FindLatestSequence(const CSeq_id & id,CScope & scope,const CTime & tlim)663 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id,
664 CScope& scope,
665 const CTime& tlim)
666 {
667 return x_FindLatestSequence(CSeq_id_Handle::GetHandle(id),
668 scope, &tlim).GetSeqId();
669 }
670
FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope,const CTime & tlim)671 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh,
672 CScope& scope,
673 const CTime& tlim)
674 {
675 return x_FindLatestSequence(idh, scope, &tlim);
676 }
677
678
SourceToProduct(const CSeq_feat & feat,const CSeq_loc & source_loc,TS2PFlags flags,CScope * scope,int * frame)679 CRef<CSeq_loc> SourceToProduct(const CSeq_feat& feat,
680 const CSeq_loc& source_loc, TS2PFlags flags,
681 CScope* scope, int* frame)
682 {
683 SRelLoc::TFlags rl_flags = 0;
684 if (flags & fS2P_NoMerge) {
685 rl_flags |= SRelLoc::fNoMerge;
686 }
687 SRelLoc rl(feat.GetLocation(), source_loc, scope, rl_flags);
688 _ASSERT(!rl.m_Ranges.empty());
689 rl.m_ParentLoc.Reset(&feat.GetProduct());
690 if (feat.GetData().IsCdregion()) {
691 // 3:1 ratio
692 const CCdregion& cds = feat.GetData().GetCdregion();
693 int base_frame = cds.GetFrame();
694 if (base_frame > 0) {
695 --base_frame;
696 }
697 if (frame) {
698 *frame = (3 + rl.m_Ranges.front()->GetFrom() - base_frame) % 3 + 1;
699 }
700 TSeqPos prot_length;
701 try {
702 prot_length = GetLength(feat.GetProduct(), scope);
703 } catch (CObjmgrUtilException) {
704 prot_length = numeric_limits<TSeqPos>::max();
705 }
706 NON_CONST_ITERATE (SRelLoc::TRanges, it, rl.m_Ranges) {
707 if (IsReverse((*it)->GetStrand())) {
708 ERR_POST_X(6, Warning
709 << "SourceToProduct:"
710 " parent and child have opposite orientations");
711 }
712 TSeqPos fr = (*it)->GetFrom();
713 TSeqPos to = (*it)->GetTo();
714 (*it)->SetFrom(((*it)->GetFrom() - base_frame) / 3);
715 (*it)->SetTo (((*it)->GetTo() - base_frame) / 3);
716 if ((flags & fS2P_AllowTer) && to == prot_length * 3 && fr < to ) {
717 --(*it)->SetTo();
718 }
719 }
720 } else {
721 if (frame) {
722 *frame = 0; // not applicable; explicitly zero
723 }
724 }
725
726 return rl.Resolve(scope, rl_flags);
727 }
728
729
ProductToSource(const CSeq_feat & feat,const CSeq_loc & prod_loc,TP2SFlags flags,CScope * scope)730 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc,
731 TP2SFlags flags, CScope* scope)
732 {
733 SRelLoc rl(feat.GetProduct(), prod_loc, scope);
734 _ASSERT(!rl.m_Ranges.empty());
735 rl.m_ParentLoc.Reset(&feat.GetLocation());
736 if (feat.GetData().IsCdregion()) {
737 // 3:1 ratio
738 const CCdregion& cds = feat.GetData().GetCdregion();
739 int base_frame = cds.GetFrame();
740 if (base_frame > 0) {
741 --base_frame;
742 }
743 TSeqPos nuc_length, prot_length;
744 try {
745 nuc_length = GetLength(feat.GetLocation(), scope);
746 } catch (CObjmgrUtilException) {
747 nuc_length = numeric_limits<TSeqPos>::max();
748 }
749 try {
750 prot_length = GetLength(feat.GetProduct(), scope);
751 } catch (CObjmgrUtilException) {
752 prot_length = numeric_limits<TSeqPos>::max();
753 }
754 NON_CONST_ITERATE(SRelLoc::TRanges, it, rl.m_Ranges) {
755 _ASSERT( !IsReverse((*it)->GetStrand()) );
756 TSeqPos from, to;
757 if ((flags & fP2S_Extend) && (*it)->GetFrom() == 0) {
758 from = 0;
759 } else {
760 from = (*it)->GetFrom() * 3 + base_frame;
761 }
762 if ((flags & fP2S_Extend) && (*it)->GetTo() == prot_length - 1) {
763 to = nuc_length - 1;
764 } else {
765 to = (*it)->GetTo() * 3 + base_frame + 2;
766 }
767 (*it)->SetFrom(from);
768 (*it)->SetTo (to);
769 }
770 }
771
772 return rl.Resolve(scope);
773 }
774
775
776 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore;
777 typedef vector<TFeatScore> TFeatScores;
778
779 template <class T, class U>
780 struct SPairLessByFirst
781 {
operator ()SPairLessByFirst782 bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
783 {
784 return p1.first < p2.first;
785 }
786 };
787
788 template <class T, class U>
789 struct SPairLessBySecond
790 {
operator ()SPairLessBySecond791 bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
792 {
793 return p1.second < p2.second;
794 }
795 };
796
797 class COverlapPairLess
798 {
799 public:
COverlapPairLess(CScope * scope_arg)800 COverlapPairLess( CScope *scope_arg ) : scope(scope_arg) { }
801
operator ()(const pair<Int8,CConstRef<CSeq_feat>> & gene1,const pair<Int8,CConstRef<CSeq_feat>> & gene2)802 bool operator()( const pair<Int8,CConstRef<CSeq_feat> >& gene1,
803 const pair<Int8, CConstRef<CSeq_feat> >& gene2 )
804 {
805 // First, compare by overlap amount
806 if( gene1.first != gene2.first ) {
807 return gene1.first < gene2.first;
808 }
809
810 const CSeq_loc &loc1 = gene1.second->GetLocation();
811 const CSeq_loc &loc2 = gene2.second->GetLocation();
812
813 // If genes are at identical positions, we fall back on the label
814 if( sequence::Compare(loc1, loc2, scope, sequence::fCompareOverlapping ) ==
815 sequence::eSame) {
816 if( gene1.second->IsSetData() && gene1.second->GetData().IsGene() &&
817 gene2.second->IsSetData() && gene2.second->GetData().IsGene() )
818 {
819 string gene1_label;
820 string gene2_label;
821
822 gene1.second->GetData().GetGene().GetLabel( &gene1_label );
823 gene2.second->GetData().GetGene().GetLabel( &gene2_label );
824 return gene1_label < gene2_label;
825 }
826 }
827
828 return false;
829 }
830 private:
831 CScope *scope;
832 };
833
GetOverlappingFeatures(const CSeq_loc & loc,CSeqFeatData::E_Choice feat_type,CSeqFeatData::ESubtype feat_subtype,EOverlapType overlap_type,TFeatScores & feats,CScope & scope,const TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)834 void GetOverlappingFeatures(const CSeq_loc& loc,
835 CSeqFeatData::E_Choice feat_type,
836 CSeqFeatData::ESubtype feat_subtype,
837 EOverlapType overlap_type,
838 TFeatScores& feats,
839 CScope& scope,
840 const TBestFeatOpts opts,
841 CGetOverlappingFeaturesPlugin *plugin )
842 {
843 bool revert_locations = false;
844 SAnnotSelector::EOverlapType annot_overlap_type;
845 switch (overlap_type) {
846 case eOverlap_Simple:
847 case eOverlap_Contained:
848 case eOverlap_Contains:
849 // Require total range overlap
850 annot_overlap_type = SAnnotSelector::eOverlap_TotalRange;
851 break;
852 case eOverlap_Subset:
853 case eOverlap_SubsetRev:
854 case eOverlap_CheckIntervals:
855 case eOverlap_Interval:
856 case eOverlap_CheckIntRev:
857 revert_locations = true;
858 // there's no break here - proceed to "default"
859 default:
860 // Require intervals overlap
861 annot_overlap_type = SAnnotSelector::eOverlap_Intervals;
862 break;
863 }
864
865 CConstRef<CSeq_feat> feat_ref;
866 TOverlapFlags overlap_flags = fOverlap_Default;
867
868 CBioseq_Handle bioseq_handle;
869 CRange<TSeqPos> range;
870 ENa_strand strand = eNa_strand_unknown;
871 if ( loc.IsWhole() ) {
872 bioseq_handle = scope.GetBioseqHandle(loc.GetWhole());
873 range = range.GetWhole();
874 }
875 else if ( loc.IsInt() || loc.IsPnt() || loc.IsPacked_int() || loc.IsMix() || loc.IsPacked_pnt() ) {
876 const CSeq_id* id = loc.GetId();
877 if( NULL != id ) {
878 bioseq_handle = scope.GetBioseqHandle(*id);
879 range.SetFrom(loc.GetStart(eExtreme_Positional));
880 range.SetTo(loc.GetStop(eExtreme_Positional));
881 if ( loc.IsSetStrand() ) {
882 strand = loc.GetStrand();
883 }
884 }
885 }
886 else {
887 range = range.GetEmpty();
888 }
889
890 // Check if the sequence is circular
891 TSeqPos circular_length = kInvalidSeqPos;
892 CConstRef<CSeq_id> circular_id;
893 if ( bioseq_handle ) {
894 if ( bioseq_handle.IsSetInst_Topology() &&
895 bioseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
896 circular_length = bioseq_handle.GetBioseqLength();
897 circular_id = bioseq_handle.GetSeqId();
898 }
899 }
900 else {
901 try {
902 const CSeq_id* loc_id = nullptr;
903 try {
904 loc.CheckId(loc_id);
905 }
906 catch (exception&) {
907 loc_id = 0;
908 }
909 if ( loc_id ) {
910 circular_id.Reset(loc_id);
911 CBioseq_Handle bseq_handle = scope.GetBioseqHandle(*circular_id);
912 if ( bseq_handle && bseq_handle.IsSetInst_Topology() &&
913 bseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
914 circular_length = bseq_handle.GetBioseqLength();
915 }
916 }
917 }
918 catch (exception& _DEBUG_ARG(e)) {
919 _TRACE("test for circularity failed: " << e.what()) ;
920 }
921 }
922
923 CRef<CSeq_loc> circular_loc;
924 if (circular_id && range.GetFrom() > range.GetTo()) {
925 // Circular bioseq, the location crosses zero. Can't use a single
926 // total range.
927 circular_loc.Reset(new CSeq_loc);
928 CRef<CSeq_interval> sub_loc(new CSeq_interval);
929 sub_loc->SetId().Assign(*circular_id);
930 sub_loc->SetFrom(0);
931 sub_loc->SetTo(range.GetTo());
932 if ( loc.IsSetStrand() ) {
933 sub_loc->SetStrand(loc.GetStrand());
934 }
935 // First interval - no matter front or back
936 circular_loc->SetPacked_int().Set().push_back(sub_loc);
937 sub_loc.Reset(new CSeq_interval);
938 sub_loc->SetId().Assign(*circular_id);
939 sub_loc->SetFrom(range.GetFrom());
940 sub_loc->SetTo(circular_length == kInvalidSeqPos
941 ? kInvalidSeqPos : circular_length - 1);
942 if ( loc.IsSetStrand() ) {
943 sub_loc->SetStrand(loc.GetStrand());
944 }
945 if ( IsReverse(strand) ) {
946 circular_loc->SetPacked_int().Set().push_front(sub_loc);
947 }
948 else {
949 circular_loc->SetPacked_int().Set().push_back(sub_loc);
950 }
951 }
952 try {
953 SAnnotSelector sel;
954 sel.SetFeatType(feat_type)
955 .SetFeatSubtype(feat_subtype)
956 .SetOverlapType(annot_overlap_type)
957 .SetResolveTSE();
958 if( opts & fBestFeat_IgnoreStrand ) {
959 sel.SetIgnoreStrand();
960 if( ! circular_id && range.GetFrom() > range.GetTo() ) {
961 // switch from and to
962 range = CRange<TSeqPos>( range.GetTo(), range.GetFrom() );
963 }
964 }
965 if( plugin ) {
966 plugin->processSAnnotSelector( sel );
967 }
968
969 auto_ptr<CFeat_CI> feat_it_ptr;
970 if( plugin ) {
971 plugin->setUpFeatureIterator( bioseq_handle, feat_it_ptr,
972 circular_length, range, loc, sel, scope, strand);
973 } else {
974 if ( circular_loc ) {
975 if ( !bioseq_handle ) {
976 sel.SetSearchUnresolved();
977 }
978 feat_it_ptr.reset( new CFeat_CI(scope, *circular_loc, sel) );
979 }
980 else if ( bioseq_handle ) {
981 feat_it_ptr.reset( new CFeat_CI(bioseq_handle, range, strand, sel) );
982 }
983 else {
984 sel.SetSearchUnresolved();
985 feat_it_ptr.reset( new CFeat_CI(scope, loc, sel) );
986 }
987 }
988 // convenience variable so we don't have to keep dereferencing the auto_ptr
989 CFeat_CI &feat_it = *feat_it_ptr;
990
991 CRef<CSeq_loc> cleaned_loc( new CSeq_loc );
992 cleaned_loc->Assign( loc );
993 if( opts & fBestFeat_IgnoreStrand ) {
994 cleaned_loc->SetStrand(eNa_strand_plus);
995 overlap_flags |= fOverlap_IgnoreTopology;
996 }
997 if( plugin ) {
998 plugin->processLoc( bioseq_handle, cleaned_loc, circular_length );
999 }
1000
1001 for ( ; feat_it; ++feat_it) {
1002 CRef<CSeq_loc> cleaned_loc_this_iteration = cleaned_loc;
1003 CRef<CSeq_loc> candidate_feat_loc( new CSeq_loc );
1004 candidate_feat_loc->Assign( feat_it->GetOriginalFeature().GetLocation() );
1005 if( opts & fBestFeat_IgnoreStrand ) {
1006 candidate_feat_loc->SetStrand(eNa_strand_plus);
1007 }
1008 EOverlapType overlap_type_this_iteration = overlap_type;
1009 bool revert_locations_this_iteration = revert_locations;
1010
1011 if( plugin ) {
1012 bool shouldContinueToNextIteration = false;
1013 plugin->processMainLoop(
1014 shouldContinueToNextIteration,
1015 cleaned_loc_this_iteration,
1016 candidate_feat_loc,
1017 overlap_type_this_iteration,
1018 revert_locations_this_iteration,
1019 bioseq_handle,
1020 *feat_it,
1021 circular_length,
1022 annot_overlap_type);
1023 if( shouldContinueToNextIteration ) {
1024 continue;
1025 }
1026 }
1027
1028 try {
1029 // treat subset as a special case
1030 Int8 cur_diff = -1;
1031 if ( !revert_locations_this_iteration ) {
1032 if (overlap_flags == fOverlap_Default) {
1033 cur_diff = TestForOverlap64(*candidate_feat_loc,
1034 *cleaned_loc_this_iteration,
1035 overlap_type_this_iteration,
1036 circular_length,
1037 &scope);
1038 }
1039 else {
1040 cur_diff = TestForOverlapEx(*candidate_feat_loc,
1041 *cleaned_loc_this_iteration,
1042 overlap_type_this_iteration,
1043 &scope,
1044 overlap_flags);
1045 }
1046 }
1047 else {
1048 if (overlap_flags == fOverlap_Default) {
1049 cur_diff = TestForOverlap64(*cleaned_loc_this_iteration,
1050 *candidate_feat_loc,
1051 overlap_type_this_iteration,
1052 circular_length,
1053 &scope);
1054 }
1055 else {
1056 cur_diff = TestForOverlapEx(*cleaned_loc_this_iteration,
1057 *candidate_feat_loc,
1058 overlap_type_this_iteration,
1059 &scope,
1060 overlap_flags);
1061 }
1062 }
1063
1064 if( plugin ) {
1065 plugin->postProcessDiffAmount( cur_diff, cleaned_loc_this_iteration,
1066 candidate_feat_loc, scope, sel, circular_length );
1067 }
1068 if (cur_diff < 0) {
1069 continue;
1070 }
1071
1072 // quick fix for CFeat_CI returning wrong additional features
1073 if (overlap_type == eOverlap_Contained) {
1074 ECompare cmp = Compare(feat_it->GetLocation(), loc, &scope, fCompareOverlapping);
1075 if (cmp != eContains && cmp != eSame) {
1076 continue;
1077 }
1078 }
1079 TFeatScore sc(cur_diff, ConstRef(&feat_it->GetMappedFeature()));
1080 feats.push_back(sc);
1081 }
1082 catch (CObjmgrUtilException&) {
1083 // On TestForOverlap64 error proceed to the next feature.
1084 continue;
1085 }
1086 }
1087 }
1088 catch (exception&) {
1089 _TRACE("GetOverlappingFeatures(): error: feature iterator failed");
1090 }
1091
1092 std::stable_sort(feats.begin(), feats.end(),
1093 COverlapPairLess( &scope ) );
1094 }
1095
1096
GetBestOverlappingFeat(const CSeq_loc & loc,CSeqFeatData::E_Choice feat_type,EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1097 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
1098 CSeqFeatData::E_Choice feat_type,
1099 EOverlapType overlap_type,
1100 CScope& scope,
1101 TBestFeatOpts opts,
1102 CGetOverlappingFeaturesPlugin *plugin )
1103 {
1104 TFeatScores scores;
1105 GetOverlappingFeatures(loc,
1106 feat_type, CSeqFeatData::eSubtype_any,
1107 overlap_type, scores, scope, opts, plugin );
1108 if (scores.size()) {
1109 if (opts & fBestFeat_FavorLonger) {
1110 return scores.back().second;
1111 } else {
1112 return scores.front().second;
1113 }
1114 }
1115 return CConstRef<CSeq_feat>();
1116 }
1117
1118
GetBestOverlappingFeat(const CSeq_loc & loc,CSeqFeatData::ESubtype feat_type,EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1119 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
1120 CSeqFeatData::ESubtype feat_type,
1121 EOverlapType overlap_type,
1122 CScope& scope,
1123 TBestFeatOpts opts,
1124 CGetOverlappingFeaturesPlugin *plugin )
1125 {
1126 TFeatScores scores;
1127 GetOverlappingFeatures(loc,
1128 CSeqFeatData::GetTypeFromSubtype(feat_type), feat_type,
1129 overlap_type, scores, scope, opts, plugin );
1130
1131 if (scores.size()) {
1132 if (opts & fBestFeat_FavorLonger) {
1133 return scores.back().second;
1134 } else {
1135 return scores.front().second;
1136 }
1137 }
1138 return CConstRef<CSeq_feat>();
1139 }
1140
1141
1142 /// GetmRNAforCDS
1143 /// A function to find a CSeq_feat representing the
1144 /// appropriate mRNA for a given CDS.
1145 /// @param cds The feature for which the mRNA to be found
1146 /// @param scope The scope
1147 ///
1148 /// @return CConstRef<CSeq_feat> for new mRNA (will be NULL if none is found)
1149
GetmRNAforCDS(const CSeq_feat & cds,CScope & scope)1150 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope)
1151 {
1152 CConstRef<CSeq_feat> mrna;
1153
1154 bool has_xref = false;
1155 if (cds.IsSetXref()) {
1156 /* using FeatID from feature cross-references:
1157 * if CDS refers to an mRNA by feature ID, use that feature
1158 */
1159 CBioseq_Handle bsh;
1160 try {
1161 bsh = scope.GetBioseqHandle(cds.GetLocation());
1162 } catch (CException& ) {
1163 // multi-accession location, can't do this check
1164 return CConstRef<CSeq_feat>(NULL);
1165 }
1166 if (!bsh)
1167 {
1168 return CConstRef<CSeq_feat>(NULL);
1169 }
1170
1171 CTSE_Handle tse = bsh.GetTSE_Handle();
1172 ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1173 if ((*it)->IsSetId() && (*it)->GetId().IsLocal()) {
1174 CSeq_feat_Handle mrna_h = tse.GetFeatureWithId(CSeqFeatData::eSubtype_mRNA, (*it)->GetId().GetLocal());
1175 if (mrna_h) {
1176 mrna = mrna_h.GetSeq_feat();
1177 }
1178 has_xref = true;
1179 }
1180 }
1181 }
1182 if (!has_xref) {
1183 /* using original location to find mRNA:
1184 * mRNA must include the CDS location and the internal interval boundaries need to be identical
1185 */
1186 mrna = GetBestOverlappingFeat(cds.GetLocation(), CSeqFeatData::eSubtype_mRNA, sequence::eOverlap_CheckIntRev, scope);
1187 }
1188 return mrna;
1189 }
1190
1191
1192 static
x_GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::E_Choice type,CSeqFeatData::ESubtype subtype,CScope & scope,bool search_both_strands=true)1193 CConstRef<CSeq_feat> x_GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1194 CSeqFeatData::E_Choice type,
1195 CSeqFeatData::ESubtype subtype,
1196 CScope& scope,
1197 bool search_both_strands = true)
1198 {
1199 TFeatScores scores;
1200 CConstRef<CSeq_feat> overlap;
1201 GetOverlappingFeatures(snp_feat.GetLocation(),
1202 type, subtype,
1203 eOverlap_Contained, scores,
1204 scope);
1205 if (scores.size()) {
1206 overlap = scores.front().second;
1207 }
1208
1209 if (search_both_strands && !overlap) {
1210 CRef<CSeq_loc> loc(new CSeq_loc);
1211 loc->Assign(snp_feat.GetLocation());
1212
1213 ENa_strand strand = GetStrand(*loc, &scope);
1214 if (strand == eNa_strand_plus || strand == eNa_strand_minus) {
1215 loc->FlipStrand();
1216 } else if (strand == eNa_strand_unknown) {
1217 loc->SetStrand(eNa_strand_minus);
1218 }
1219
1220 scores.clear();
1221 GetOverlappingFeatures(*loc,
1222 type, subtype,
1223 eOverlap_Contained, scores,
1224 scope);
1225 if (scores.size()) {
1226 overlap = scores.front().second;
1227 }
1228 }
1229
1230 return overlap;
1231 }
1232
1233
GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::E_Choice type,CScope & scope,bool search_both_strands)1234 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1235 CSeqFeatData::E_Choice type,
1236 CScope& scope,
1237 bool search_both_strands)
1238 {
1239 return x_GetBestOverlapForSNP(snp_feat, type, CSeqFeatData::eSubtype_any,
1240 scope, search_both_strands);
1241 }
1242
1243
GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::ESubtype subtype,CScope & scope,bool search_both_strands)1244 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1245 CSeqFeatData::ESubtype subtype,
1246 CScope& scope,
1247 bool search_both_strands)
1248 {
1249 return x_GetBestOverlapForSNP(snp_feat,
1250 CSeqFeatData::GetTypeFromSubtype(subtype), subtype, scope,
1251 search_both_strands);
1252 }
1253
1254
GetOverlappingGene(const CSeq_loc & loc,CScope & scope,ETransSplicing eTransSplicing)1255 CConstRef<CSeq_feat> GetOverlappingGene(
1256 const CSeq_loc& loc, CScope& scope,
1257 ETransSplicing eTransSplicing )
1258 {
1259 switch ( eTransSplicing ) {
1260 case eTransSplicing_Auto:
1261 {
1262 ENa_strand strand = loc.GetStrand();
1263 if (strand == eNa_strand_both || strand == eNa_strand_other) {
1264 // Mixed strand indicates trans-splicing must be on.
1265 return GetOverlappingGene(loc, scope, eTransSplicing_Yes);
1266 }
1267 // Try with trans-splicing on first. If it finds nothing, try
1268 // to turn it off.
1269 CConstRef<CSeq_feat> ret = GetOverlappingGene(loc, scope, eTransSplicing_Yes);
1270 return ret ? ret : GetOverlappingGene(loc, scope, eTransSplicing_No);
1271 }
1272 case eTransSplicing_Yes:
1273 {
1274 // If trans-splicing is on, the result must be a multi-range gene.
1275 CConstRef<CSeq_feat> ret = GetBestOverlappingFeat(loc,
1276 CSeqFeatData::eSubtype_gene,
1277 eOverlap_Contained, scope, fBestFeat_IgnoreStrand);
1278 if ( ret ) {
1279 CSeq_loc_CI it(ret->GetLocation());
1280 ++it;
1281 if ( !it ) ret.Reset();
1282 }
1283 return ret;
1284 }
1285 case eTransSplicing_No:
1286 {
1287 // Multi-range genes assume trans-splicing=on and should not be included
1288 // when it's off.
1289 CConstRef<CSeq_feat> ret = GetBestOverlappingFeat(loc,
1290 CSeqFeatData::eSubtype_gene,
1291 eOverlap_Contained, scope, 0);
1292 if ( ret ) {
1293 CSeq_loc_CI it(ret->GetLocation());
1294 ++it;
1295 if ( it ) ret.Reset();
1296 }
1297 return ret;
1298 }
1299 }
1300 return null;
1301 }
1302
1303
IsTransSpliced(const CSeq_feat & feat)1304 bool IsTransSpliced(const CSeq_feat& feat)
1305 {
1306 // note - even if the exception says "trans-splicing", it isn't really trans-splicing if
1307 // it's a single interval
1308 if (feat.IsSetExcept_text() && NStr::Find(feat.GetExcept_text(), "trans-splicing") != string::npos
1309 && !feat.GetLocation().IsInt()) {
1310 return true;
1311 } else {
1312 return false;
1313 }
1314 }
1315
1316
IsPseudo(const CSeq_feat & feat,CScope & scope)1317 bool IsPseudo(const CSeq_feat& feat, CScope& scope)
1318 {
1319 if (feat.IsSetPseudo() && feat.GetPseudo()) {
1320 return true;
1321 }
1322 if (feat.IsSetQual()) {
1323 ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
1324 if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudogene")) {
1325 return true;
1326 }
1327 }
1328 }
1329 if (feat.GetData().IsGene()) {
1330 if (feat.GetData().GetGene().IsSetPseudo() && feat.GetData().GetGene().GetPseudo()) {
1331 return true;
1332 }
1333 } else {
1334 if (feat.IsSetXref()) {
1335 ITERATE(CSeq_feat::TXref, it, feat.GetXref()) {
1336 if ((*it)->IsSetData() && (*it)->GetData().IsGene() &&
1337 (*it)->GetData().GetGene().IsSetPseudo() &&
1338 (*it)->GetData().GetGene().GetPseudo()) {
1339 return true;
1340 }
1341 }
1342 }
1343 CConstRef<CSeq_feat> gene = GetGeneForFeature(feat, scope);
1344 if (gene && IsPseudo(*gene, scope)) {
1345 return true;
1346 }
1347 }
1348 return false;
1349 }
1350
GetLocalGeneByLocus(const string & locus,bool use_tag,CBioseq_Handle bsh)1351 CConstRef<CSeq_feat> GetLocalGeneByLocus(const string& locus, bool use_tag, CBioseq_Handle bsh)
1352 {
1353 CTSE_Handle tse = bsh.GetTSE_Handle();
1354 const CBioseq& b = *(bsh.GetCompleteBioseq());
1355
1356 CTSE_Handle::TSeq_feat_Handles potentials = tse.GetGenesWithLocus(locus, use_tag);
1357 //if (potentials.size() == 1) { // it may return wrong gene!
1358 // return potentials.front().GetSeq_feat();
1359 //}
1360 ITERATE(CTSE_Handle::TSeq_feat_Handles, p, potentials) {
1361 try {
1362 CSeq_id_Handle id_h = p->GetLocationId();
1363 if (id_h) {
1364 CConstRef<CSeq_id> p_id = id_h.GetSeqId();
1365 if (p_id) {
1366 ITERATE(CBioseq::TId, id, b.GetId()) {
1367 CSeq_id::E_SIC cmp = p_id->Compare(**id);
1368 if (cmp == CSeq_id::e_YES) {
1369 return p->GetSeq_feat();
1370 } else if (cmp == CSeq_id::e_NO) {
1371 break;
1372 }
1373 }
1374 }
1375 }
1376 } catch (CException&) {
1377 CSeq_loc_CI li(p->GetLocation());
1378 while (li) {
1379 try {
1380 const CSeq_id& this_id = li.GetSeq_id();
1381 ITERATE(CBioseq::TId, id, b.GetId()) {
1382 CSeq_id::E_SIC cmp = this_id.Compare(**id);
1383 if (cmp == CSeq_id::e_YES) {
1384 return p->GetSeq_feat();
1385 } else if (cmp == CSeq_id::e_NO) {
1386 break;
1387 }
1388 }
1389 } catch (CException& ) {
1390 // no Seq-id for this sublocation, keep trying
1391 }
1392 ++li;
1393 }
1394 }
1395 }
1396 return CConstRef<CSeq_feat>(NULL);
1397 }
1398
1399
GetLocalGeneByXref(const CGene_ref & gene,CBioseq_Handle bsh)1400 CConstRef<CSeq_feat> GetLocalGeneByXref(const CGene_ref& gene, CBioseq_Handle bsh)
1401 {
1402 if (gene.IsSetLocus_tag() && !(gene.GetLocus_tag().empty())) {
1403 CConstRef<CSeq_feat> f = GetLocalGeneByLocus(gene.GetLocus_tag(), true, bsh);
1404 if (f) {
1405 return f;
1406 }
1407 }
1408 if (gene.IsSetLocus() && !(gene.GetLocus().empty())) {
1409 CConstRef<CSeq_feat> f = GetLocalGeneByLocus(gene.GetLocus(), false, bsh);
1410 if (f) {
1411 return f;
1412 }
1413 }
1414 return CConstRef<CSeq_feat>(NULL);
1415 }
1416
1417
GetGeneForFeature(const CSeq_feat & feat,CScope & scope)1418 CConstRef<CSeq_feat> GetGeneForFeature(const CSeq_feat& feat, CScope& scope)
1419 {
1420 if (feat.IsSetXref()) {
1421 CBioseq_Handle bsh = GetBioseqFromSeqLoc(feat.GetLocation(), scope);
1422 if (!bsh) {
1423 return CConstRef<CSeq_feat>();
1424 }
1425 CTSE_Handle tse = bsh.GetTSE_Handle();
1426 ITERATE(CSeq_feat::TXref, xit, feat.GetXref()) {
1427 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() && (*xit)->GetData().GetGene().IsSuppressed()) {
1428 return (CConstRef <CSeq_feat>());
1429 }
1430 if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
1431 (!(*xit)->IsSetData() || (*xit)->GetData().IsGene())) {
1432 const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
1433 CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, feat_id);
1434 if (far_feats.size() > 0) {
1435 return far_feats.front().GetSeq_feat();
1436 }
1437 // if xref claims to point to gene feature but gene feature does not exist,
1438 // return NULL
1439 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
1440 return CConstRef<CSeq_feat>();
1441 }
1442 } else if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
1443 const CGene_ref& gene = (*xit)->GetData().GetGene();
1444 return GetLocalGeneByXref(gene, bsh);
1445 }
1446 }
1447 }
1448
1449 CConstRef <CSeq_feat> gf = GetOverlappingGene(feat.GetLocation(), scope, IsTransSpliced(feat) ? sequence::eTransSplicing_Yes : sequence::eTransSplicing_Auto);
1450 if (gf) {
1451 ECompare cmp = Compare(gf->GetLocation(), feat.GetLocation(), &scope, fCompareOverlapping);
1452 if (cmp == eContains || cmp == eSame) {
1453 return gf;
1454 }
1455 }
1456
1457 return CConstRef <CSeq_feat>();
1458 }
1459
1460
GetOverlappingmRNA(const CSeq_loc & loc,CScope & scope)1461 CConstRef<CSeq_feat> GetOverlappingmRNA(const CSeq_loc& loc, CScope& scope)
1462 {
1463 return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_mRNA,
1464 eOverlap_Contained, scope);
1465 }
1466
1467
GetOverlappingCDS(const CSeq_loc & loc,CScope & scope)1468 CConstRef<CSeq_feat> GetOverlappingCDS(const CSeq_loc& loc, CScope& scope)
1469 {
1470 return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_cdregion,
1471 eOverlap_Contained, scope);
1472 }
1473
1474
GetOverlappingPub(const CSeq_loc & loc,CScope & scope)1475 CConstRef<CSeq_feat> GetOverlappingPub(const CSeq_loc& loc, CScope& scope)
1476 {
1477 return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_pub,
1478 eOverlap_Contained, scope);
1479 }
1480
1481
GetOverlappingSource(const CSeq_loc & loc,CScope & scope)1482 CConstRef<CSeq_feat> GetOverlappingSource(const CSeq_loc& loc, CScope& scope)
1483 {
1484 return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_biosrc,
1485 eOverlap_Contained, scope);
1486 }
1487
1488
GetOverlappingOperon(const CSeq_loc & loc,CScope & scope)1489 CConstRef<CSeq_feat> GetOverlappingOperon(const CSeq_loc& loc, CScope& scope)
1490 {
1491 return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_operon,
1492 eOverlap_Contained, scope);
1493 }
1494
1495
1496 const char* kRibosomalSlippageText = "ribosomal slippage";
1497
GetBestMrnaForCds(const CSeq_feat & cds_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1498 CConstRef<CSeq_feat> GetBestMrnaForCds(const CSeq_feat& cds_feat,
1499 CScope& scope,
1500 TBestFeatOpts opts,
1501 CGetOverlappingFeaturesPlugin *plugin )
1502 {
1503 _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
1504 CConstRef<CSeq_feat> mrna_feat;
1505
1506 // search for a best overlapping mRNA
1507 // we start with a scan through the product accessions because we need
1508 // to insure that the chosen transcript does indeed match what we want
1509 TFeatScores feats;
1510 EOverlapType overlap_type = eOverlap_CheckIntRev;
1511 if (cds_feat.IsSetExcept() && cds_feat.GetExcept() &&
1512 cds_feat.IsSetExcept_text() &&
1513 cds_feat.GetExcept_text() == kRibosomalSlippageText) {
1514 overlap_type = eOverlap_SubsetRev;
1515 }
1516 GetOverlappingFeatures(cds_feat.GetLocation(),
1517 CSeqFeatData::e_Rna,
1518 CSeqFeatData::eSubtype_mRNA,
1519 overlap_type,
1520 feats, scope, opts, plugin );
1521 /// easy out: 0 or 1 possible features
1522 if (feats.size() < 2) {
1523 if (feats.size() == 1) {
1524 mrna_feat = feats.front().second;
1525 }
1526 return mrna_feat;
1527 }
1528
1529 if (cds_feat.IsSetProduct()) {
1530 try {
1531 // this may throw, if the product spans multiple sequences
1532 // this would be extremely unlikely, but we catch anyway
1533 const CSeq_id& product_id =
1534 sequence::GetId(cds_feat.GetProduct(), &scope);
1535
1536 ITERATE (TFeatScores, feat_iter, feats) {
1537 const CSeq_feat& feat = *feat_iter->second;
1538 if ( !feat.IsSetExt() ) {
1539 continue;
1540 }
1541
1542 /// scan the user object in the ext field
1543 /// we look for a user object of type MrnaProteinLink
1544 /// this should contain a seq-d string that we can match
1545 CTypeConstIterator<CUser_object> obj_iter(feat);
1546 for ( ; obj_iter; ++obj_iter) {
1547 if (obj_iter->IsSetType() &&
1548 obj_iter->GetType().IsStr() &&
1549 obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1550 string prot_id_str = obj_iter->GetField("protein seqID")
1551 .GetData().GetStr();
1552 CSeq_id prot_id(prot_id_str);
1553 vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1554 ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1555 ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1556 if (product_id.Match(*id_iter->GetSeqId())) {
1557 mrna_feat.Reset(&feat);
1558 return mrna_feat;
1559 }
1560 }
1561 }
1562 }
1563 }
1564 }
1565 catch (exception&) {
1566 }
1567 }
1568
1569 if (cds_feat.IsSetProduct() && !(opts & fBestFeat_NoExpensive) ) {
1570 try {
1571 // this may throw, if the product spans multiple sequences
1572 // this would be extremely unlikely, but we catch anyway
1573 const CSeq_id& product_id =
1574 sequence::GetId(cds_feat.GetProduct(), &scope);
1575
1576 TFeatScores matching_feats;
1577 ITERATE (TFeatScores, feat_iter, feats) {
1578
1579 // we grab the mRNA product, if available, and scan it for
1580 // a CDS feature. the CDS feature should point to the same
1581 // product as our current feature.
1582 const CSeq_feat& mrna = *feat_iter->second;
1583 if ( !mrna.IsSetProduct() ) {
1584 continue;
1585 }
1586
1587 CBioseq_Handle handle =
1588 scope.GetBioseqHandle(mrna.GetProduct());
1589 if ( !handle ) {
1590 continue;
1591 }
1592
1593 SAnnotSelector cds_sel;
1594 cds_sel.SetOverlapIntervals()
1595 .ExcludeNamedAnnots("SNP")
1596 .SetResolveTSE()
1597 .SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
1598 CFeat_CI other_iter(scope, mrna.GetProduct(), cds_sel);
1599 for ( ; other_iter && !mrna_feat; ++other_iter) {
1600 const CSeq_feat& cds = other_iter->GetOriginalFeature();
1601 if ( !cds.IsSetProduct() ) {
1602 continue;
1603 }
1604
1605 CBioseq_Handle prot_handle =
1606 scope.GetBioseqHandle(cds.GetProduct());
1607 if ( !prot_handle ) {
1608 continue;
1609 }
1610
1611 if (prot_handle.IsSynonym(product_id)) {
1612 // got it!
1613 matching_feats.push_back(*feat_iter);
1614 break;
1615 }
1616 }
1617 }
1618 if ( !matching_feats.empty() ) {
1619 // keep only matching features
1620 feats.swap(matching_feats);
1621 if ( feats.size() == 1 ) {
1622 mrna_feat = feats.front().second;
1623 return mrna_feat;
1624 }
1625 }
1626 }
1627 catch (exception&) {
1628 }
1629 }
1630
1631 // check for transcript_id; this is a fast check
1632 string transcript_id = cds_feat.GetNamedQual("transcript_id");
1633 if ( !transcript_id.empty() ) {
1634 ITERATE (vector<TFeatScore>, feat_iter, feats) {
1635 const CSeq_feat& feat = *feat_iter->second;
1636 string other_transcript_id =
1637 feat.GetNamedQual("transcript_id");
1638 if (transcript_id == other_transcript_id) {
1639 mrna_feat.Reset(&feat);
1640 return mrna_feat;
1641 }
1642 }
1643 }
1644
1645 //
1646 // try to find the best by overlaps alone
1647 //
1648
1649 if ( !mrna_feat && !(opts & fBestFeat_StrictMatch) ) {
1650 if (opts & fBestFeat_FavorLonger) {
1651 mrna_feat = feats.back().second;
1652 } else {
1653 mrna_feat = feats.front().second;
1654 }
1655 }
1656
1657 return mrna_feat;
1658 }
1659
1660
1661 // Plugin for GetOverlappingFeatures - uses eOverlap_CheckIntervals
1662 // or eOverlap_Subset depending on the "ribosomal slippage" flag
1663 // in the current feature.
1664
1665 class CCdsForMrnaPlugin : public CGetOverlappingFeaturesPlugin
1666 {
1667 public:
CCdsForMrnaPlugin(CGetOverlappingFeaturesPlugin * prev_plugin)1668 CCdsForMrnaPlugin(CGetOverlappingFeaturesPlugin* prev_plugin)
1669 : m_PrevPlugin(prev_plugin) {}
~CCdsForMrnaPlugin()1670 virtual ~CCdsForMrnaPlugin() {}
1671
processSAnnotSelector(SAnnotSelector & sel)1672 virtual void processSAnnotSelector(
1673 SAnnotSelector &sel)
1674 {
1675 if ( m_PrevPlugin ) {
1676 m_PrevPlugin->processSAnnotSelector(sel);
1677 }
1678 }
1679
setUpFeatureIterator(CBioseq_Handle & bioseq_handle,auto_ptr<CFeat_CI> & feat_ci,TSeqPos circular_length,CRange<TSeqPos> & range,const CSeq_loc & loc,SAnnotSelector & sel,CScope & scope,ENa_strand & strand)1680 virtual void setUpFeatureIterator(
1681 CBioseq_Handle &bioseq_handle,
1682 auto_ptr<CFeat_CI> &feat_ci,
1683 TSeqPos circular_length ,
1684 CRange<TSeqPos> &range,
1685 const CSeq_loc& loc,
1686 SAnnotSelector &sel,
1687 CScope &scope,
1688 ENa_strand &strand)
1689 {
1690 if ( m_PrevPlugin ) {
1691 m_PrevPlugin->setUpFeatureIterator(bioseq_handle,
1692 feat_ci, circular_length, range, loc, sel, scope, strand);
1693 return;
1694 }
1695 if ( bioseq_handle ) {
1696 feat_ci.reset(new CFeat_CI(bioseq_handle, range, strand, sel));
1697 } else {
1698 feat_ci.reset(new CFeat_CI(scope, loc, sel));
1699 }
1700 }
1701
processLoc(CBioseq_Handle & bioseq_handle,CRef<CSeq_loc> & loc,TSeqPos circular_length)1702 virtual void processLoc(
1703 CBioseq_Handle &bioseq_handle,
1704 CRef<CSeq_loc> &loc,
1705 TSeqPos circular_length)
1706 {
1707 if ( m_PrevPlugin ) {
1708 m_PrevPlugin->processLoc(bioseq_handle, loc, circular_length);
1709 }
1710 }
1711
processMainLoop(bool & shouldContinueToNextIteration,CRef<CSeq_loc> & cleaned_loc_this_iteration,CRef<CSeq_loc> & candidate_feat_loc,EOverlapType & overlap_type_this_iteration,bool & revert_locations_this_iteration,CBioseq_Handle & bioseq_handle,const CMappedFeat & feat,TSeqPos circular_length,SAnnotSelector::EOverlapType annot_overlap_type)1712 virtual void processMainLoop(
1713 bool &shouldContinueToNextIteration,
1714 CRef<CSeq_loc> &cleaned_loc_this_iteration,
1715 CRef<CSeq_loc> &candidate_feat_loc,
1716 EOverlapType &overlap_type_this_iteration,
1717 bool &revert_locations_this_iteration,
1718 CBioseq_Handle &bioseq_handle,
1719 const CMappedFeat &feat,
1720 TSeqPos circular_length,
1721 SAnnotSelector::EOverlapType annot_overlap_type)
1722 {
1723 const CSeq_feat& cds = feat.GetOriginalFeature();
1724 _ASSERT(cds.GetData().GetSubtype() ==
1725 CSeqFeatData::eSubtype_cdregion);
1726 // If the feature has "ribosomal slippage" flag set, use
1727 // eOverlap_Subset. Otherwise use more strict eOverlap_CheckIntervals.
1728 if (cds.IsSetExcept() && cds.GetExcept() &&
1729 cds.IsSetExcept_text() &&
1730 cds.GetExcept_text() == kRibosomalSlippageText) {
1731 overlap_type_this_iteration = eOverlap_Subset;
1732 }
1733 if ( m_PrevPlugin ) {
1734 m_PrevPlugin->processMainLoop(shouldContinueToNextIteration,
1735 cleaned_loc_this_iteration, candidate_feat_loc,
1736 overlap_type_this_iteration,
1737 revert_locations_this_iteration,
1738 bioseq_handle, feat, circular_length, annot_overlap_type);
1739 }
1740 }
1741
postProcessDiffAmount(Int8 & cur_diff,CRef<CSeq_loc> & cleaned_loc,CRef<CSeq_loc> & candidate_feat_loc,CScope & scope,SAnnotSelector & sel,TSeqPos circular_length)1742 virtual void postProcessDiffAmount(
1743 Int8 &cur_diff,
1744 CRef<CSeq_loc> &cleaned_loc,
1745 CRef<CSeq_loc> &candidate_feat_loc,
1746 CScope &scope,
1747 SAnnotSelector &sel,
1748 TSeqPos circular_length )
1749 {
1750 if ( m_PrevPlugin ) {
1751 m_PrevPlugin->postProcessDiffAmount(cur_diff,
1752 cleaned_loc, candidate_feat_loc,
1753 scope, sel, circular_length);
1754 }
1755 }
1756
1757 private:
1758 CGetOverlappingFeaturesPlugin* m_PrevPlugin;
1759 };
1760
1761
1762 CConstRef<CSeq_feat>
GetBestCdsForMrna(const CSeq_feat & mrna_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1763 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
1764 CScope& scope,
1765 TBestFeatOpts opts,
1766 CGetOverlappingFeaturesPlugin *plugin )
1767 {
1768 _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
1769 CConstRef<CSeq_feat> cds_feat;
1770
1771 auto_ptr<CGetOverlappingFeaturesPlugin> cds_plugin(
1772 new CCdsForMrnaPlugin(plugin));
1773 // search for a best overlapping CDS
1774 // we start with a scan through the product accessions because we need
1775 // to insure that the chosen transcript does indeed match what we want
1776 TFeatScores feats;
1777 GetOverlappingFeatures(mrna_feat.GetLocation(),
1778 CSeqFeatData::e_Cdregion,
1779 CSeqFeatData::eSubtype_cdregion,
1780 eOverlap_CheckIntervals,
1781 feats, scope, opts, cds_plugin.get());
1782
1783 /// easy out: 0 or 1 possible features
1784 if (feats.size() < 2) {
1785 if (feats.size() == 1) {
1786 cds_feat = feats.front().second;
1787 }
1788 return cds_feat;
1789 }
1790
1791 if (mrna_feat.IsSetExt()) {
1792 /// scan the user object in the ext field
1793 /// we look for a user object of type MrnaProteinLink
1794 /// this should contain a seq-d string that we can match
1795 string prot_id_str;
1796 CTypeConstIterator<CUser_object> obj_iter(mrna_feat);
1797 for ( ; obj_iter; ++obj_iter) {
1798 if (obj_iter->IsSetType() &&
1799 obj_iter->GetType().IsStr() &&
1800 obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1801 prot_id_str = obj_iter->GetField("protein seqID").GetData().GetStr();
1802 break;
1803 }
1804 }
1805 if ( !prot_id_str.empty() ) {
1806 CSeq_id prot_id(prot_id_str);
1807 vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1808 ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1809
1810 try {
1811 /// look for a CDS feature that matches this expected ID
1812 ITERATE (TFeatScores, feat_iter, feats) {
1813 const CSeq_feat& feat = *feat_iter->second;
1814 if ( !feat.IsSetProduct() ) {
1815 continue;
1816 }
1817 const CSeq_id& id =
1818 sequence::GetId(feat.GetLocation(), &scope);
1819 ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1820 if (id.Match(*id_iter->GetSeqId())) {
1821 cds_feat.Reset(&feat);
1822 return cds_feat;
1823 }
1824 }
1825 }
1826 }
1827 catch (exception&) {
1828 }
1829 }
1830 }
1831
1832 // scan through the product accessions because we need to insure that the
1833 // chosen transcript does indeed match what we want
1834 if (mrna_feat.IsSetProduct() && !(opts & fBestFeat_NoExpensive) ) {
1835 do {
1836 try {
1837 // this may throw, if the product spans multiple sequences
1838 // this would be extremely unlikely, but we catch anyway
1839 const CSeq_id& mrna_product =
1840 sequence::GetId(mrna_feat.GetProduct(), &scope);
1841 CBioseq_Handle mrna_handle =
1842 scope.GetBioseqHandle(mrna_product);
1843
1844 // find the ID of the protein accession we're looking for
1845 CConstRef<CSeq_id> protein_id;
1846 {{
1847 SAnnotSelector sel;
1848 sel.SetOverlapIntervals()
1849 .ExcludeNamedAnnots("SNP")
1850 .SetResolveTSE()
1851 .SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
1852
1853 CFeat_CI iter(mrna_handle, sel);
1854 for ( ; iter; ++iter) {
1855 if (iter->IsSetProduct()) {
1856 protein_id.Reset
1857 (&sequence::GetId(iter->GetProduct(),
1858 &scope));
1859 break;
1860 }
1861 }
1862 }}
1863
1864 if ( !protein_id ) {
1865 break;
1866 }
1867
1868 TFeatScores::const_iterator feat_iter = feats.begin();
1869 TFeatScores::const_iterator feat_end = feats.end();
1870 for ( ; feat_iter != feat_end && !cds_feat; ++feat_iter) {
1871 /// look for all contained CDS features; for each, check
1872 /// to see if the protein product is the expected protein
1873 /// product
1874 const CSeq_feat& cds = *feat_iter->second;
1875 if ( !cds.IsSetProduct() ) {
1876 continue;
1877 }
1878
1879 CBioseq_Handle prot_handle =
1880 scope.GetBioseqHandle(cds.GetProduct());
1881 if ( !prot_handle ) {
1882 continue;
1883 }
1884
1885 if (prot_handle.IsSynonym(*protein_id)) {
1886 // got it!
1887 cds_feat.Reset(&cds);
1888 return cds_feat;
1889 }
1890 }
1891 }
1892 catch ( exception& ) {
1893 }
1894 }
1895 while (false);
1896 }
1897
1898 // check for transcript_id
1899 // this is generally only available in GTF/GFF-imported features
1900 string transcript_id = mrna_feat.GetNamedQual("transcript_id");
1901 if ( !transcript_id.empty() ) {
1902 ITERATE (TFeatScores, feat_iter, feats) {
1903 const CSeq_feat& feat = *feat_iter->second;
1904 string other_transcript_id =
1905 feat.GetNamedQual("transcript_id");
1906 if (transcript_id == other_transcript_id) {
1907 cds_feat.Reset(&feat);
1908 return cds_feat;
1909 }
1910 }
1911 }
1912
1913 //
1914 // try to find the best by overlaps alone
1915 //
1916
1917 if ( !cds_feat && !(opts & fBestFeat_StrictMatch) ) {
1918 if (opts & fBestFeat_FavorLonger) {
1919 cds_feat = feats.back().second;
1920 } else {
1921 cds_feat = feats.front().second;
1922 }
1923 }
1924
1925 return cds_feat;
1926 }
1927
1928
GetBestGeneForMrna(const CSeq_feat & mrna_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1929 CConstRef<CSeq_feat> GetBestGeneForMrna(const CSeq_feat& mrna_feat,
1930 CScope& scope,
1931 TBestFeatOpts opts,
1932 CGetOverlappingFeaturesPlugin *plugin )
1933 {
1934 _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
1935 CConstRef<CSeq_feat> gene_feat;
1936
1937 // search for a best overlapping gene
1938 TFeatScores feats;
1939 GetOverlappingFeatures(mrna_feat.GetLocation(),
1940 CSeqFeatData::e_Gene,
1941 CSeqFeatData::eSubtype_any,
1942 eOverlap_Contained,
1943 feats, scope, opts, plugin );
1944 /// easy out: 0 or 1 possible features
1945 if (feats.size() < 2) {
1946 if (feats.size() == 1) {
1947 gene_feat = feats.front().second;
1948 }
1949 return gene_feat;
1950 }
1951
1952 ///
1953 /// compare gene xrefs to see if ew can find a match
1954 ///
1955 const CGene_ref* ref = mrna_feat.GetGeneXref();
1956 if (ref) {
1957 if (ref->IsSuppressed()) {
1958 /// 'suppress' case
1959 return gene_feat;
1960 }
1961
1962 string ref_str;
1963 ref->GetLabel(&ref_str);
1964
1965 ITERATE (TFeatScores, feat_it, feats) {
1966 const CSeq_feat& feat = *feat_it->second;
1967 const CGene_ref& other_ref = feat.GetData().GetGene();
1968 string other_ref_str;
1969 other_ref.GetLabel(&other_ref_str);
1970 if (ref_str == other_ref_str) {
1971 gene_feat = &feat;
1972 return gene_feat;
1973 }
1974 }
1975 }
1976
1977 ///
1978 /// compare by dbxrefs
1979 ///
1980 if (mrna_feat.IsSetDbxref()) {
1981 int gene_id = 0;
1982 ITERATE (CSeq_feat::TDbxref, dbxref, mrna_feat.GetDbxref()) {
1983 if ((*dbxref)->GetDb() == "GeneID" ||
1984 (*dbxref)->GetDb() == "LocusID") {
1985 gene_id = (*dbxref)->GetTag().GetId();
1986 break;
1987 }
1988 }
1989
1990 if (gene_id != 0) {
1991 ITERATE (TFeatScores, feat_it, feats) {
1992 const CSeq_feat& feat = *feat_it->second;
1993 ITERATE (CSeq_feat::TDbxref, dbxref, feat.GetDbxref()) {
1994 const string& db = (*dbxref)->GetDb();
1995 if ((db == "GeneID" || db == "LocusID") &&
1996 (*dbxref)->GetTag().GetId() == gene_id) {
1997 gene_feat = &feat;
1998 return gene_feat;
1999 }
2000 }
2001 }
2002 }
2003 }
2004
2005 if ( !gene_feat && !(opts & fBestFeat_StrictMatch) ) {
2006 if (opts & fBestFeat_FavorLonger) {
2007 gene_feat = feats.back().second;
2008 } else {
2009 gene_feat = feats.front().second;
2010 }
2011 }
2012
2013 return gene_feat;
2014 }
2015
2016
GetBestGeneForCds(const CSeq_feat & cds_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2017 CConstRef<CSeq_feat> GetBestGeneForCds(const CSeq_feat& cds_feat,
2018 CScope& scope,
2019 TBestFeatOpts opts,
2020 CGetOverlappingFeaturesPlugin *plugin )
2021 {
2022 _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2023
2024 CConstRef<CSeq_feat> feat_ref;
2025
2026 // search for a best overlapping gene
2027 TFeatScores feats;
2028 GetOverlappingFeatures(cds_feat.GetLocation(),
2029 CSeqFeatData::e_Gene,
2030 CSeqFeatData::eSubtype_any,
2031 eOverlap_Contained,
2032 feats, scope, opts, plugin );
2033 /// easy out: 0 or 1 possible features
2034 if (feats.size() < 2) {
2035 if (feats.size() == 1) {
2036 feat_ref = feats.front().second;
2037 }
2038 return feat_ref;
2039 }
2040
2041 // next: see if we can match based on gene xref
2042 const CGene_ref* ref = cds_feat.GetGeneXref();
2043 if (ref) {
2044 if (ref->IsSuppressed()) {
2045 /// 'suppress' case
2046 return feat_ref;
2047 }
2048
2049 ITERATE (TFeatScores, feat_it, feats) {
2050 const CSeq_feat& feat = *feat_it->second;
2051
2052 string ref_str;
2053 ref->GetLabel(&ref_str);
2054
2055 const CGene_ref& other_ref = feat.GetData().GetGene();
2056 string other_ref_str;
2057 other_ref.GetLabel(&other_ref_str);
2058 if (ref_str == other_ref_str) {
2059 feat_ref = &feat;
2060 return feat_ref;
2061 }
2062 }
2063 }
2064
2065 /// last check: expensive: need to proxy through mRNA match
2066 if ( !feat_ref && !(opts & fBestFeat_NoExpensive) ) {
2067 feat_ref = GetBestMrnaForCds(cds_feat, scope,
2068 opts | fBestFeat_StrictMatch);
2069 if (feat_ref) {
2070 feat_ref = GetBestGeneForMrna(*feat_ref, scope, opts);
2071 if (feat_ref) {
2072 return feat_ref;
2073 }
2074 }
2075 }
2076
2077 if ( !feat_ref && !(opts & fBestFeat_StrictMatch) ) {
2078 feat_ref = feats.front().second;
2079 }
2080 return feat_ref;
2081 }
2082
2083
GetMrnasForGene(const CSeq_feat & gene_feat,CScope & scope,list<CConstRef<CSeq_feat>> & mrna_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2084 void GetMrnasForGene(const CSeq_feat& gene_feat, CScope& scope,
2085 list< CConstRef<CSeq_feat> >& mrna_feats,
2086 TBestFeatOpts opts,
2087 CGetOverlappingFeaturesPlugin *plugin )
2088 {
2089 _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2090 SAnnotSelector sel;
2091 sel.SetResolveTSE()
2092 .SetAdaptiveDepth()
2093 .IncludeFeatSubtype(CSeqFeatData::eSubtype_mRNA);
2094 CFeat_CI feat_it(scope, gene_feat.GetLocation(), sel);
2095 if (feat_it.GetSize() == 0) {
2096 return;
2097 }
2098
2099 ///
2100 /// pass 1: compare by gene xref
2101 ///
2102 {{
2103 const CGene_ref& ref = gene_feat.GetData().GetGene();
2104 string ref_str;
2105 ref.GetLabel(&ref_str);
2106 size_t count = 0;
2107 for ( ; feat_it; ++feat_it) {
2108
2109 const CGene_ref* other_ref =
2110 feat_it->GetOriginalFeature().GetGeneXref();
2111 if ( !other_ref || other_ref->IsSuppressed() ) {
2112 continue;
2113 }
2114
2115 string other_ref_str;
2116 other_ref->GetLabel(&other_ref_str);
2117 if (other_ref_str != ref_str) {
2118 continue;
2119 }
2120
2121 ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2122 feat_it->GetLocation(),
2123 &scope,
2124 sequence::fCompareOverlapping);
2125 if (comp != eSame && comp != eContains) {
2126 continue;
2127 }
2128
2129 CConstRef<CSeq_feat> feat_ref(&feat_it->GetOriginalFeature());
2130 mrna_feats.push_back(feat_ref);
2131 ++count;
2132 }
2133
2134 if (count) {
2135 return;
2136 }
2137 }}
2138
2139 ///
2140 /// pass 2: compare by gene id
2141 ///
2142 {{
2143 int gene_id = 0;
2144 if (gene_feat.IsSetDbxref()) {
2145 ITERATE (CSeq_feat::TDbxref, dbxref, gene_feat.GetDbxref()) {
2146 if ((*dbxref)->GetDb() == "GeneID" ||
2147 (*dbxref)->GetDb() == "LocusID") {
2148 gene_id = (*dbxref)->GetTag().GetId();
2149 break;
2150 }
2151 }
2152 }
2153
2154 if (gene_id) {
2155 size_t count = 0;
2156 feat_it.Rewind();
2157 for ( ; feat_it; ++feat_it) {
2158 /// check the suppress case
2159 /// regardless of the gene-id binding, we always ignore these
2160 const CGene_ref* other_ref =
2161 feat_it->GetOriginalFeature().GetGeneXref();
2162 if ( other_ref && other_ref->IsSuppressed() ) {
2163 continue;
2164 }
2165
2166 CConstRef<CSeq_feat> ref(&feat_it->GetOriginalFeature());
2167
2168 ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2169 feat_it->GetLocation(),
2170 &scope,
2171 sequence::fCompareOverlapping);
2172 if (comp != eSame && comp != eContains) {
2173 continue;
2174 }
2175
2176 if (feat_it->IsSetDbxref()) {
2177 ITERATE (CSeq_feat::TDbxref, dbxref, feat_it->GetDbxref()) {
2178 if (((*dbxref)->GetDb() == "GeneID" ||
2179 (*dbxref)->GetDb() == "LocusID") &&
2180 (*dbxref)->GetTag().GetId() == gene_id) {
2181 mrna_feats.push_back(ref);
2182 ++count;
2183 break;
2184 }
2185 }
2186 }
2187 }
2188
2189 if (count) {
2190 return;
2191 }
2192 }
2193 }}
2194
2195 // gene doesn't have a gene_id or a gene ref
2196 CConstRef<CSeq_feat> feat =
2197 sequence::GetBestOverlappingFeat(gene_feat.GetLocation(),
2198 CSeqFeatData::eSubtype_mRNA,
2199 sequence::eOverlap_Contains,
2200 scope, opts, plugin );
2201 if (feat) {
2202 mrna_feats.push_back(feat);
2203 }
2204 }
2205
2206
GetCdssForGene(const CSeq_feat & gene_feat,CScope & scope,list<CConstRef<CSeq_feat>> & cds_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2207 void GetCdssForGene(const CSeq_feat& gene_feat, CScope& scope,
2208 list< CConstRef<CSeq_feat> >& cds_feats,
2209 TBestFeatOpts opts,
2210 CGetOverlappingFeaturesPlugin *plugin )
2211 {
2212 _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2213 list< CConstRef<CSeq_feat> > mrna_feats;
2214 GetMrnasForGene(gene_feat, scope, mrna_feats, opts);
2215 if (mrna_feats.size()) {
2216 ITERATE (list< CConstRef<CSeq_feat> >, iter, mrna_feats) {
2217 CConstRef<CSeq_feat> cds = GetBestCdsForMrna(**iter, scope, opts);
2218 if (cds) {
2219 cds_feats.push_back(cds);
2220 }
2221 }
2222 } else {
2223 CConstRef<CSeq_feat> feat =
2224 sequence::GetBestOverlappingFeat(gene_feat.GetLocation(),
2225 CSeqFeatData::eSubtype_cdregion,
2226 sequence::eOverlap_Subset,
2227 scope, opts, plugin );
2228 if (feat) {
2229 cds_feats.push_back(feat);
2230 }
2231 }
2232 }
2233
2234
2235 CConstRef<CSeq_feat>
GetBestOverlappingFeat(const CSeq_feat & feat,CSeqFeatData::E_Choice feat_type,sequence::EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2236 GetBestOverlappingFeat(const CSeq_feat& feat,
2237 CSeqFeatData::E_Choice feat_type,
2238 sequence::EOverlapType overlap_type,
2239 CScope& scope,
2240 TBestFeatOpts opts,
2241 CGetOverlappingFeaturesPlugin *plugin )
2242 {
2243 CConstRef<CSeq_feat> feat_ref;
2244 switch (feat_type) {
2245 case CSeqFeatData::e_Gene:
2246 return GetBestOverlappingFeat(feat,
2247 CSeqFeatData::eSubtype_gene,
2248 overlap_type, scope, opts, plugin );
2249
2250 case CSeqFeatData::e_Rna:
2251 feat_ref = GetBestOverlappingFeat(feat,
2252 CSeqFeatData::eSubtype_mRNA,
2253 overlap_type, scope, opts, plugin );
2254 break;
2255
2256 case CSeqFeatData::e_Cdregion:
2257 return GetBestOverlappingFeat(feat,
2258 CSeqFeatData::eSubtype_cdregion,
2259 overlap_type, scope, opts, plugin );
2260
2261 default:
2262 break;
2263 }
2264
2265 if ( !feat_ref ) {
2266 feat_ref = sequence::GetBestOverlappingFeat
2267 (feat.GetLocation(), feat_type, overlap_type, scope, opts, plugin );
2268 }
2269
2270 return feat_ref;
2271 }
2272
2273
2274 CConstRef<CSeq_feat>
GetBestOverlappingFeat(const CSeq_feat & feat,CSeqFeatData::ESubtype subtype,sequence::EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2275 GetBestOverlappingFeat(const CSeq_feat& feat,
2276 CSeqFeatData::ESubtype subtype,
2277 sequence::EOverlapType overlap_type,
2278 CScope& scope,
2279 TBestFeatOpts opts,
2280 CGetOverlappingFeaturesPlugin *plugin )
2281 {
2282 CConstRef<CSeq_feat> feat_ref;
2283 switch (feat.GetData().GetSubtype()) {
2284 case CSeqFeatData::eSubtype_mRNA:
2285 switch (subtype) {
2286 case CSeqFeatData::eSubtype_gene:
2287 return GetBestGeneForMrna(feat, scope, opts);
2288
2289 case CSeqFeatData::eSubtype_cdregion:
2290 return GetBestCdsForMrna(feat, scope, opts);
2291
2292 default:
2293 break;
2294 }
2295 break;
2296
2297 case CSeqFeatData::eSubtype_cdregion:
2298 switch (subtype) {
2299 case CSeqFeatData::eSubtype_mRNA:
2300 return GetBestMrnaForCds(feat, scope, opts);
2301
2302 case CSeqFeatData::eSubtype_gene:
2303 return GetBestGeneForCds(feat, scope, opts);
2304
2305 default:
2306 break;
2307 }
2308 break;
2309
2310 case CSeqFeatData::eSubtype_variation:
2311 return GetBestOverlapForSNP(feat, subtype, scope, true);
2312
2313 default:
2314 break;
2315 }
2316
2317 if ( !feat_ref ) {
2318 feat_ref = GetBestOverlappingFeat
2319 (feat.GetLocation(), subtype, overlap_type, scope, opts, plugin );
2320 }
2321
2322 return feat_ref;
2323 }
2324
2325
2326 namespace {
2327
x_GetFeatById(CSeqFeatData::ESubtype subtype,const CSeq_feat & feat,const CTSE_Handle & tse)2328 CConstRef<CSeq_feat> x_GetFeatById(CSeqFeatData::ESubtype subtype,
2329 const CSeq_feat& feat,
2330 const CTSE_Handle& tse)
2331 {
2332 if ( feat.IsSetXref() ) {
2333 ITERATE ( CSeq_feat::TXref, it, feat.GetXref() ) {
2334 const CSeqFeatXref& xref = **it;
2335 if ( xref.IsSetId() ) {
2336 const CFeat_id& id = xref.GetId();
2337 if ( id.IsLocal() ) {
2338 const CObject_id& obj_id = id.GetLocal();
2339 if ( obj_id.IsId() ) {
2340 int local_id = obj_id.GetId();
2341 CSeq_feat_Handle feat_handle =
2342 tse.GetFeatureWithId(subtype, local_id);
2343 if ( feat_handle ) {
2344 return feat_handle.GetSeq_feat();
2345 }
2346 }
2347 }
2348 }
2349 }
2350 }
2351 return null;
2352 }
2353
2354 }
2355
2356
2357 CConstRef<CSeq_feat>
GetBestGeneForMrna(const CSeq_feat & mrna_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2358 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
2359 const CTSE_Handle& tse,
2360 TBestFeatOpts opts,
2361 CGetOverlappingFeaturesPlugin *plugin)
2362 {
2363 _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
2364 CConstRef<CSeq_feat> ret =
2365 x_GetFeatById(CSeqFeatData::eSubtype_gene, mrna_feat, tse);
2366 if ( !ret ) {
2367 ret = GetBestGeneForMrna(mrna_feat, tse.GetScope(), opts);
2368 }
2369 return ret;
2370 }
2371
2372 CConstRef<CSeq_feat>
GetBestGeneForCds(const CSeq_feat & cds_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2373 GetBestGeneForCds(const CSeq_feat& cds_feat,
2374 const CTSE_Handle& tse,
2375 TBestFeatOpts opts,
2376 CGetOverlappingFeaturesPlugin *plugin)
2377 {
2378 _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2379 CConstRef<CSeq_feat> ret =
2380 x_GetFeatById(CSeqFeatData::eSubtype_gene, cds_feat, tse);
2381 if ( !ret ) {
2382 ret = GetBestGeneForCds(cds_feat, tse.GetScope(), opts);
2383 }
2384 return ret;
2385 }
2386
2387 CConstRef<CSeq_feat>
GetBestMrnaForCds(const CSeq_feat & cds_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2388 GetBestMrnaForCds(const CSeq_feat& cds_feat,
2389 const CTSE_Handle& tse,
2390 TBestFeatOpts opts,
2391 CGetOverlappingFeaturesPlugin *plugin)
2392 {
2393 _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2394 CConstRef<CSeq_feat> ret =
2395 x_GetFeatById(CSeqFeatData::eSubtype_mRNA, cds_feat, tse);
2396 if ( !ret ) {
2397 ret = GetBestMrnaForCds(cds_feat, tse.GetScope(), opts);
2398 }
2399 return ret;
2400 }
2401
2402 CConstRef<CSeq_feat>
GetBestCdsForMrna(const CSeq_feat & mrna_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2403 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
2404 const CTSE_Handle& tse,
2405 TBestFeatOpts opts,
2406 CGetOverlappingFeaturesPlugin *plugin)
2407 {
2408 _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
2409 CConstRef<CSeq_feat> ret =
2410 x_GetFeatById(CSeqFeatData::eSubtype_cdregion, mrna_feat, tse);
2411 if ( !ret ) {
2412 ret = GetBestCdsForMrna(mrna_feat, tse.GetScope(), opts);
2413 }
2414 return ret;
2415 }
2416
GetMrnasForGene(const CSeq_feat & gene_feat,const CTSE_Handle & tse,list<CConstRef<CSeq_feat>> & mrna_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2417 void GetMrnasForGene(const CSeq_feat& gene_feat,
2418 const CTSE_Handle& tse,
2419 list< CConstRef<CSeq_feat> >& mrna_feats,
2420 TBestFeatOpts opts,
2421 CGetOverlappingFeaturesPlugin *plugin)
2422 {
2423 _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2424 GetMrnasForGene(gene_feat, tse.GetScope(), mrna_feats, opts);
2425 }
2426
GetCdssForGene(const CSeq_feat & gene_feat,const CTSE_Handle & tse,list<CConstRef<CSeq_feat>> & cds_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2427 void GetCdssForGene(const CSeq_feat& gene_feat,
2428 const CTSE_Handle& tse,
2429 list< CConstRef<CSeq_feat> >& cds_feats,
2430 TBestFeatOpts opts,
2431 CGetOverlappingFeaturesPlugin *plugin)
2432 {
2433 _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2434 GetCdssForGene(gene_feat, tse.GetScope(), cds_feats, opts);
2435 }
2436
2437 // Get the encoding CDS feature of a given protein sequence.
GetCDSForProduct(const CBioseq & product,CScope * scope)2438 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope)
2439 {
2440 if ( scope == 0 ) {
2441 return 0;
2442 }
2443
2444 return GetCDSForProduct(scope->GetBioseqHandle(product));
2445 }
2446
GetCDSForProduct(const CBioseq_Handle & bsh)2447 const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& bsh)
2448 {
2449 CMappedFeat f = GetMappedCDSForProduct(bsh);
2450 if ( f ) {
2451 return &f.GetOriginalFeature();
2452 }
2453
2454 return 0;
2455 }
2456
GetMappedCDSForProduct(const CBioseq_Handle & bsh)2457 CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle& bsh)
2458 {
2459 if ( bsh ) {
2460 // try first in-TSE CDS
2461 CFeat_CI fi(bsh,
2462 SAnnotSelector(CSeqFeatData::e_Cdregion)
2463 .SetByProduct().SetLimitTSE(bsh.GetTSE_Handle()));
2464 if ( !fi ) {
2465 // then any other CDS
2466 fi = CFeat_CI(bsh,
2467 SAnnotSelector(CSeqFeatData::e_Cdregion)
2468 .SetByProduct().ExcludeTSE(bsh.GetTSE_Handle()));
2469 }
2470 if ( fi ) {
2471 // return the first one (should be the one packaged on the
2472 // nuc-prot set).
2473 return *fi;
2474 }
2475 }
2476
2477 return CMappedFeat();
2478 }
2479
2480
2481 // Get the mature peptide feature of a protein
GetPROTForProduct(const CBioseq & product,CScope * scope)2482 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope)
2483 {
2484 if ( scope == 0 ) {
2485 return 0;
2486 }
2487
2488 return GetPROTForProduct(scope->GetBioseqHandle(product));
2489 }
2490
GetPROTForProduct(const CBioseq_Handle & bsh)2491 const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& bsh)
2492 {
2493 if ( bsh ) {
2494 CFeat_CI fi(bsh, SAnnotSelector(CSeqFeatData::e_Prot).SetByProduct());
2495 if ( fi ) {
2496 return &(fi->GetOriginalFeature());
2497 }
2498 }
2499
2500 return 0;
2501 }
2502
2503
2504
2505 // Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
GetmRNAForProduct(const CBioseq & product,CScope * scope)2506 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope)
2507 {
2508 if ( scope == 0 ) {
2509 return 0;
2510 }
2511
2512 return GetmRNAForProduct(scope->GetBioseqHandle(product));
2513 }
2514
GetmRNAForProduct(const CBioseq_Handle & bsh)2515 const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& bsh)
2516 {
2517 if ( bsh ) {
2518 SAnnotSelector as(CSeqFeatData::eSubtype_mRNA);
2519 as.SetByProduct();
2520
2521 CFeat_CI fi(bsh, as);
2522 if ( fi ) {
2523 return &(fi->GetOriginalFeature());
2524 }
2525 }
2526
2527 return 0;
2528 }
2529
2530
GetMappedmRNAForProduct(const CBioseq_Handle & bsh)2531 CMappedFeat GetMappedmRNAForProduct(const CBioseq_Handle& bsh)
2532 {
2533 if ( bsh ) {
2534 CFeat_CI fi(bsh,
2535 SAnnotSelector(CSeqFeatData::eSubtype_mRNA)
2536 .SetByProduct());
2537 if ( fi ) {
2538 // return the first one (should be the one packaged on the
2539 // nuc-prot set).
2540 return *fi;
2541 }
2542 }
2543
2544 return CMappedFeat();
2545 }
2546
2547
2548 // Get the encoding sequence of a protein
GetNucleotideParent(const CBioseq & product,CScope * scope)2549 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope)
2550 {
2551 if ( scope == 0 ) {
2552 return 0;
2553 }
2554 CBioseq_Handle bsh = GetNucleotideParent(scope->GetBioseqHandle(product));
2555 return bsh ? bsh.GetCompleteBioseq() : reinterpret_cast<const CBioseq*>(0);
2556 }
2557
GetNucleotideParent(const CBioseq_Handle & bsh)2558 CBioseq_Handle GetNucleotideParent(const CBioseq_Handle& bsh)
2559 {
2560 // If protein use CDS to get to the encoding Nucleotide.
2561 // if nucleotide (cDNA) use mRNA feature.
2562 const CSeq_feat* sfp = bsh.GetInst().IsAa() ?
2563 GetCDSForProduct(bsh) : GetmRNAForProduct(bsh);
2564
2565 CBioseq_Handle ret;
2566 if ( sfp ) {
2567 try {
2568 ret = bsh.GetScope().GetBioseqHandle(sfp->GetLocation());
2569 } catch(...) {
2570 // may fail due to trans-splicing, e.g., on small-genome set
2571 }
2572 }
2573 return ret;
2574 }
2575
2576
GetParentForPart(const CBioseq_Handle & part)2577 CBioseq_Handle GetParentForPart(const CBioseq_Handle& part)
2578 {
2579 CBioseq_Handle seg;
2580
2581 if (part) {
2582 CSeq_entry_Handle segset =
2583 part.GetExactComplexityLevel(CBioseq_set::eClass_segset);
2584 if (segset) {
2585 for (CSeq_entry_CI it(segset); it; ++it) {
2586 if (it->IsSeq()) {
2587 seg = it->GetSeq();
2588 break;
2589 }
2590 }
2591 }
2592 }
2593
2594 return seg;
2595 }
2596
2597
END_SCOPE(sequence)2598 END_SCOPE(sequence)
2599
2600
2601
2602 CFastaOstream::CFastaOstream(CNcbiOstream& out)
2603 : m_Out(out),
2604 m_Flags(fInstantiateGaps | fAssembleParts | fEnableGI),
2605 m_GapMode(eGM_letters)
2606 {
2607 m_Gen.reset(new sequence::CDeflineGenerator);
2608 SetWidth(70);
2609 }
2610
~CFastaOstream()2611 CFastaOstream::~CFastaOstream()
2612 {
2613 m_Out << flush;
2614 }
2615
Write(const CSeq_entry_Handle & handle,const CSeq_loc * location)2616 void CFastaOstream::Write(const CSeq_entry_Handle& handle,
2617 const CSeq_loc* location)
2618 {
2619 for (CBioseq_CI it(handle); it; ++it) {
2620 if ( !SkipBioseq(*it) ) {
2621 if (location) {
2622 CSeq_loc loc2;
2623 loc2.SetWhole().Assign(*it->GetSeqId());
2624 int d = sequence::TestForOverlap
2625 (*location, loc2, sequence::eOverlap_Interval,
2626 kInvalidSeqPos, &handle.GetScope());
2627 if (d < 0) {
2628 continue;
2629 }
2630 }
2631 Write(*it, location);
2632 }
2633 }
2634 }
2635
2636
Write(const CBioseq_Handle & handle,const CSeq_loc * location,const string & custom_title)2637 void CFastaOstream::Write(const CBioseq_Handle& handle,
2638 const CSeq_loc* location,
2639 const string& custom_title)
2640 {
2641 WriteTitle(handle, location, custom_title);
2642 WriteSequence(handle, location);
2643 }
2644
2645
s_FastaGetOriginalID(const CBioseq & seq)2646 static string s_FastaGetOriginalID (const CBioseq& seq)
2647
2648 {
2649 FOR_EACH_SEQDESC_ON_BIOSEQ (it, seq) {
2650 const CSeqdesc& desc = **it;
2651 if (! desc.IsUser()) continue;
2652 if (! desc.GetUser().IsSetType()) continue;
2653 const CUser_object& usr = desc.GetUser();
2654 const CObject_id& oi = usr.GetType();
2655 if (! oi.IsStr()) continue;
2656 const string& type = oi.GetStr();
2657 if (! NStr::EqualNocase(type, "OrginalID") && ! NStr::EqualNocase(type, "OriginalID")) continue;
2658 FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
2659 const CUser_field& fld = **uitr;
2660 if (FIELD_IS_SET_AND_IS(fld, Label, Str)) {
2661 const string &label_str = GET_FIELD(fld.GetLabel(), Str);
2662 if (! NStr::EqualNocase(label_str, "LocalId")) continue;
2663 if (fld.IsSetData() && fld.GetData().IsStr()) {
2664 return fld.GetData().GetStr();
2665 }
2666 }
2667 }
2668 }
2669
2670 return "";
2671 }
2672
s_ShouldUseOriginalID(const CBioseq & seq)2673 static bool s_ShouldUseOriginalID (const CBioseq& seq)
2674 {
2675 FOR_EACH_SEQID_ON_BIOSEQ (id_itr, seq) {
2676 const CSeq_id& sid = **id_itr;
2677 switch (sid.Which()) {
2678 case CSeq_id::e_Local:
2679 break;
2680 case CSeq_id::e_General:
2681 {
2682 const CDbtag& dbtag = sid.GetGeneral();
2683 if (dbtag.IsSetDb()) {
2684 const string& db = dbtag.GetDb();
2685 if (! NStr::EqualNocase(db, "TMSMART") &&
2686 ! NStr::EqualNocase(db, "BankIt") &&
2687 ! NStr::EqualNocase(db, "NCBIFILE")) {
2688 return false;
2689 }
2690 }
2691 }
2692 break;
2693 default:
2694 return false;
2695 }
2696 }
2697
2698 return true;
2699 }
2700
x_GetBestId(CConstRef<CSeq_id> & gi_id,CConstRef<CSeq_id> & best_id,bool & hide_prefix,const CBioseq & bioseq)2701 void CFastaOstream::x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq)
2702 {
2703 bool is_na = bioseq.GetInst().GetMol() != CSeq_inst::eMol_aa;
2704 best_id = FindBestChoice(bioseq.GetId(), is_na ? CSeq_id::FastaNARank : CSeq_id::FastaAARank);
2705
2706 ITERATE(CBioseq::TId, id, bioseq.GetId()) {
2707 if ((*id)->IsGi()) {
2708 gi_id = *id;
2709 break;
2710 }
2711 }
2712
2713 // see SQD-4144, only Accession.Version should be shown, without prefixes and suffixes
2714 if (best_id.NotEmpty() &&
2715 (m_Flags & fEnableGI) == 0 &&
2716 (m_Flags & fHideGenBankPrefix) != 0)
2717 {
2718 switch (best_id->Which())
2719 {
2720 case CSeq_id::e_Genbank:
2721 case CSeq_id::e_Embl:
2722 case CSeq_id::e_Other:
2723 case CSeq_id::e_Ddbj:
2724 case CSeq_id::e_Tpg:
2725 case CSeq_id::e_Tpe:
2726 case CSeq_id::e_Tpd:
2727 hide_prefix = true;
2728 break;
2729 default:
2730 break;
2731 }
2732 }
2733 }
2734
x_WriteAsFasta(const CBioseq & bioseq)2735 void CFastaOstream::x_WriteAsFasta(const CBioseq& bioseq)
2736 {
2737 CConstRef<CSeq_id> best_id;
2738 CConstRef<CSeq_id> gi_id;
2739 bool hide_prefix = false;
2740
2741 // override this method and provide application specific 'best id' policy
2742 x_GetBestId(gi_id, best_id, hide_prefix, bioseq);
2743
2744 if (best_id.NotEmpty())
2745 {
2746 // RW-139, no GI in FASTA output
2747 if (gi_id.NotEmpty() && (m_Flags & fEnableGI) && !best_id->IsGi())
2748 {
2749 // FastA format
2750 // Here we have something like:
2751 // gi|###|SOME_ACCESSION|title
2752
2753 gi_id->WriteAsFasta(m_Out);
2754 m_Out << '|';
2755 }
2756
2757 const CTextseq_id* text_id = 0;
2758 if (hide_prefix)
2759 {
2760 text_id = best_id->GetTextseq_Id();
2761 }
2762
2763 if (text_id != 0)
2764 {
2765 if (text_id->IsSetAccession())
2766 {
2767 m_Out << text_id->GetAccession();
2768 if (text_id->IsSetVersion())
2769 {
2770 m_Out << "." << text_id->GetVersion();
2771 }
2772 }
2773 }
2774 else
2775 {
2776 best_id->WriteAsFasta(m_Out);
2777 }
2778 }
2779 }
2780
x_WriteSeqIds(const CBioseq & bioseq,const CSeq_loc * location)2781 void CFastaOstream::x_WriteSeqIds(const CBioseq& bioseq,
2782 const CSeq_loc* location)
2783 {
2784 bool have_range = (location != NULL && !location->IsWhole()
2785 && !(m_Flags & fSuppressRange) );
2786
2787 if ( !have_range && (m_Flags & fNoDupCheck) == 0) {
2788 ITERATE (CBioseq::TId, id, bioseq.GetId()) {
2789 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(**id);
2790 pair<TSeq_id_HandleSet::iterator, bool> p
2791 = m_PreviousWholeIds.insert(idh);
2792 if ( !p.second ) {
2793 NCBI_THROW(CObjmgrUtilException, eBadLocation,
2794 "Duplicate Seq-id " + (*id)->AsFastaString()
2795 + " in FASTA output");
2796 }
2797 }
2798 }
2799
2800 m_Out << '>';
2801 if (!(m_Flags & fIgnoreOriginalID) &&
2802 s_ShouldUseOriginalID(bioseq)) {
2803 string origID = s_FastaGetOriginalID(bioseq);
2804 if (! NStr::IsBlank(origID)) {
2805 m_Out << "lcl|" << origID;
2806 } else {
2807 x_WriteAsFasta(bioseq);
2808 }
2809 } else {
2810 x_WriteAsFasta(bioseq);
2811 }
2812
2813 if (have_range) {
2814 char delim = ':';
2815 for (CSeq_loc_CI it(*location); it; ++it) {
2816 CSeq_loc::TRange range = it.GetRange();
2817 TSeqPos from = range.GetFrom() + 1, to = range.GetTo() + 1;
2818 _ASSERT(from <= to);
2819 m_Out << delim;
2820 if (it.IsSetStrand() && IsReverse(it.GetStrand())) {
2821 m_Out << 'c' << to << '-' << from;
2822 } else {
2823 m_Out << from << '-' << to;
2824 }
2825 delim = ',';
2826 }
2827 }
2828 }
2829
2830 inline
2831 sequence::CDeflineGenerator::TUserFlags
x_GetTitleFlags(void) const2832 CFastaOstream::x_GetTitleFlags(void) const
2833 {
2834 sequence::TGetTitleFlags title_flags = 0;
2835 title_flags |= sequence::CDeflineGenerator::fFastaFormat;
2836
2837 if ((m_Flags & fNoExpensiveOps) != 0) {
2838 title_flags |= sequence::CDeflineGenerator::fNoExpensiveOps;
2839 }
2840 if ((m_Flags & fShowModifiers) != 0) {
2841 title_flags |= sequence::CDeflineGenerator::fShowModifiers;
2842 }
2843 /*
2844 if ((m_Flags & fUseAutoDef) != 0) {
2845 title_flags |= sequence::CDeflineGenerator::fUseAutoDef;
2846 }
2847 */
2848 if ((m_Flags & fDoNotUseAutoDef) == 0) {
2849 title_flags |= sequence::CDeflineGenerator::fUseAutoDef;
2850 }
2851 return title_flags;
2852 }
2853
x_WriteSeqTitle(const CBioseq_Handle & bioseq_handle,const string & custom_title)2854 void CFastaOstream::x_WriteSeqTitle(const CBioseq_Handle & bioseq_handle,
2855 const string& custom_title)
2856 {
2857 string safe_title = (!custom_title.empty()) ? custom_title
2858 : m_Gen->GenerateDefline(bioseq_handle, x_GetTitleFlags());
2859
2860 if ( !safe_title.empty() ) {
2861 if ( !(m_Flags & fKeepGTSigns) ) {
2862 NStr::ReplaceInPlace(safe_title, ">", "_");
2863 }
2864 if (safe_title[0] != ' ') {
2865 m_Out << ' ';
2866 }
2867
2868 if ((m_Flags & fHTMLEncode) != 0) {
2869 safe_title = NStr::HtmlEncode(safe_title);
2870 }
2871 m_Out << safe_title;
2872 }
2873 m_Out << '\n';
2874 }
2875
WriteTitle(const CBioseq & bioseq,const CSeq_loc * location,bool no_scope,const string & custom_title)2876 void CFastaOstream::WriteTitle(const CBioseq& bioseq,
2877 const CSeq_loc* location,
2878 bool no_scope, // not used
2879 const string& custom_title)
2880 {
2881 x_WriteSeqIds(bioseq, location);
2882 CScope scope(*CObjectManager::GetInstance());
2883 CBioseq_Handle bioseq_handle = scope.AddBioseq(bioseq);
2884 x_WriteSeqTitle(bioseq_handle, custom_title);
2885 }
2886
WriteTitle(const CBioseq_Handle & bioseq_handle,const CSeq_loc * location,const string & custom_title)2887 void CFastaOstream::WriteTitle(const CBioseq_Handle& bioseq_handle,
2888 const CSeq_loc* location,
2889 const string& custom_title)
2890 {
2891 const CBioseq& bioseq = *bioseq_handle.GetBioseqCore();
2892 x_WriteSeqIds(bioseq, location);
2893 x_WriteSeqTitle(bioseq_handle, custom_title);
2894 }
2895
2896
x_MapMask(CSeq_loc_Mapper & mapper,const CSeq_loc & mask,const CSeq_id * base_seq_id,CScope * scope)2897 CConstRef<CSeq_loc> CFastaOstream::x_MapMask(CSeq_loc_Mapper& mapper,
2898 const CSeq_loc& mask,
2899 const CSeq_id* base_seq_id,
2900 CScope* scope)
2901 {
2902 CConstRef<CSeq_loc> mapped_mask(&mask);
2903
2904 // Mapping down requires the higher-level ID as a reference, even
2905 // when given a scope, and as such should precede mapping up to
2906 // keep sequence::GetId from bombing out.
2907 if ((m_Flags & fMapMasksDown) != 0 && scope) {
2908 try {
2909 CSeq_loc_Mapper mapper_down
2910 (scope->GetBioseqHandle(sequence::GetId(*mapped_mask, scope)),
2911 CSeq_loc_Mapper::eSeqMap_Down);
2912 mapped_mask = mapped_mask->Add(*mapper_down.Map(*mapped_mask),
2913 CSeq_loc::fSortAndMerge_All, 0);
2914 } catch (CObjmgrUtilException&) {
2915 }
2916 }
2917 if ((m_Flags & fMapMasksUp) != 0 && scope && base_seq_id) {
2918 CSeq_loc_Mapper mapper_up(scope->GetBioseqHandle(*base_seq_id),
2919 CSeq_loc_Mapper::eSeqMap_Up);
2920 mapped_mask = mapped_mask->Add(*mapper_up.Map(*mapped_mask),
2921 CSeq_loc::fSortAndMerge_All, 0);
2922 }
2923 mapped_mask = mapper.Map(*mapped_mask);
2924 return mapped_mask;
2925 }
2926
2927
x_GetMaskingStates(TMSMap & masking_state,const CSeq_id * base_seq_id,const CSeq_loc * location,CScope * scope)2928 void CFastaOstream::x_GetMaskingStates(TMSMap& masking_state,
2929 const CSeq_id* base_seq_id,
2930 const CSeq_loc* location,
2931 CScope* scope)
2932 {
2933 CRef<CSeq_loc_Mapper> mapper;
2934 CBioseq_Handle bsh;
2935
2936 if (m_SoftMask.NotEmpty() || m_HardMask.NotEmpty()) {
2937 _ASSERT(base_seq_id);
2938 if (location) {
2939 CSeq_loc loc2;
2940 try {
2941 TSeqPos length = sequence::GetLength(*location, scope);
2942 loc2.SetInt().SetId().Assign(*base_seq_id);
2943 loc2.SetInt().SetFrom(0);
2944 loc2.SetInt().SetTo(length - 1);
2945 } catch (exception&) {
2946 loc2.SetWhole().Assign(*base_seq_id);
2947 }
2948 mapper.Reset(new CSeq_loc_Mapper(*location, loc2, scope));
2949 } else {
2950 // still useful for filtering out locations on other sequences
2951 CSeq_loc whole;
2952 whole.SetWhole().Assign(*base_seq_id);
2953 mapper.Reset(new CSeq_loc_Mapper(whole, whole, scope));
2954 }
2955 mapper->SetMergeAll();
2956 mapper->TruncateNonmappingRanges();
2957
2958 if (scope && (m_Flags & (fMapMasksUp | fMapMasksDown))) {
2959 bsh = scope->GetBioseqHandle(*base_seq_id);
2960 }
2961
2962 const CSeq_loc& mask = m_SoftMask ? *m_SoftMask : *m_HardMask;
2963 int type = m_SoftMask ? eSoftMask : eHardMask;
2964 CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, mask, base_seq_id,
2965 scope);
2966
2967 masking_state[0] = 0;
2968 for (CSeq_loc_CI it(*mapped_mask); it; ++it) {
2969 CSeq_loc_CI::TRange loc_range = it.GetRange();
2970 masking_state[loc_range.GetFrom()] = type;
2971 masking_state[loc_range.GetToOpen()] = 0;
2972 }
2973 }
2974
2975 if (m_SoftMask.NotEmpty() && m_HardMask.NotEmpty()) {
2976 CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, *m_HardMask,
2977 base_seq_id, scope);
2978 for (CSeq_loc_CI it(*mapped_mask); it; ++it) {
2979 CSeq_loc_CI::TRange loc_range = it.GetRange();
2980 TSeqPos from = loc_range.GetFrom();
2981 TSeqPos to = loc_range.GetToOpen();
2982 TMSMap::iterator ms_it = masking_state.lower_bound(from);
2983 int prev_state;
2984
2985 if (ms_it == masking_state.end()) {
2986 masking_state[loc_range.GetFrom()] = eHardMask;
2987 masking_state[loc_range.GetToOpen()] = 0;
2988 continue;
2989 } else if (ms_it->first == from) {
2990 prev_state = ms_it->second;
2991 ms_it->second |= eHardMask;
2992 } else {
2993 // NB: lower_bound's name is misleading, as it actually
2994 // returns the least element whose key >= from.
2995 _ASSERT(ms_it != masking_state.begin());
2996 TMSMap::iterator prev_it = ms_it;
2997 --prev_it;
2998 prev_state = prev_it->second;
2999 TMSMap::value_type value(from, prev_state | eHardMask);
3000
3001 // Add the new element (using ms_it as a position hint),
3002 // and repoint ms_it at it so that the below loop will
3003 // start at the correct position.
3004 ms_it = masking_state.insert(ms_it, value);
3005 }
3006 while (++ms_it != masking_state.end() && ms_it->first < to) {
3007 prev_state = ms_it->second;
3008 ms_it->second |= eHardMask;
3009 }
3010 if (ms_it == masking_state.end() || ms_it->first != to) {
3011 masking_state.insert(ms_it, TMSMap::value_type(to, prev_state));
3012 }
3013 }
3014 }
3015 }
3016
3017
x_WriteSequence(const CSeqVector & vec,const TMSMap & masking_state)3018 void CFastaOstream::x_WriteSequence(const CSeqVector& vec,
3019 const TMSMap& masking_state)
3020 {
3021 TSeqPos rem_line = m_Width;
3022 CSeqVector_CI it(vec);
3023 TMSMap::const_iterator ms_it = masking_state.begin();
3024 TSeqPos rem_state
3025 = (ms_it == masking_state.end() ? numeric_limits<TSeqPos>::max()
3026 : ms_it->first);
3027 int current_state = 0;
3028 CTempString uc_hard_mask_str
3029 (vec.IsProtein() ? m_UC_Xs.get() : m_UC_Ns.get(), m_Width);
3030 CTempString lc_hard_mask_str
3031 (vec.IsProtein() ? m_LC_Xs.get() : m_LC_Ns.get(), m_Width);
3032 EGapMode native_gap_mode
3033 = ((vec.GetGapChar() == '-') ? eGM_dashes : eGM_letters);
3034 CTempString alt_gap_str;
3035
3036 if (native_gap_mode == eGM_dashes) {
3037 alt_gap_str = uc_hard_mask_str;
3038 } else {
3039 alt_gap_str.assign(m_Dashes.get(), m_Width);
3040 }
3041
3042 if ((m_Flags & fReverseStrand) != 0) {
3043 it.SetStrand(Reverse(it.GetStrand()));
3044 }
3045
3046 while ( it ) {
3047 if (rem_state == 0) {
3048 _ASSERT(ms_it->first == it.GetPos());
3049 current_state = ms_it->second;
3050 if (++ms_it == masking_state.end()) {
3051 rem_state = numeric_limits<TSeqPos>::max();
3052 } else {
3053 rem_state = ms_it->first - it.GetPos();
3054 }
3055 }
3056 if( (m_Flags & fShowGapsOfSizeZero) != 0 &&
3057 it.HasZeroGapBefore() )
3058 {
3059 m_Out << "-\n";
3060 rem_line = m_Width;
3061 }
3062 if ((m_GapMode != native_gap_mode || (m_Flags & fInstantiateGaps) == 0)
3063 && it.GetGapSizeForward())
3064 {
3065 TSeqPos gap_size = it.GetGapSizeForward();
3066 if (m_GapMode == eGM_one_dash
3067 || (m_Flags & fInstantiateGaps) == 0) {
3068 m_Out << "-\n";
3069 rem_line = m_Width;
3070 } else if (m_GapMode == eGM_count) {
3071 if (rem_line < m_Width) {
3072 m_Out << '\n';
3073 }
3074 _ASSERT(it.GetCurrentSeqMap_CI().GetType() == CSeqMap::eSeqGap);
3075 if (it.GetCurrentSeqMap_CI().IsUnknownLength()) {
3076 // conventional designation, regardless of nominal length
3077 if( gap_size > 0 && (m_Flags & fKeepUnknGapNomLen) != 0 )
3078 {
3079 m_Out << ">?unk" << gap_size;
3080 } else {
3081 m_Out << ">?unk100";
3082 }
3083 } else {
3084 m_Out << ">?" << gap_size;
3085 }
3086 // print gap mods, if requested
3087 if( (m_Flags & fShowGapModifiers) != 0 )
3088 {
3089 CConstRef<CSeq_literal> pGapLiteral =
3090 it.GetCurrentSeqMap_CI().GetRefGapLiteral();
3091 if( pGapLiteral &&
3092 FIELD_IS_SET_AND_IS(*pGapLiteral, Seq_data, Gap) )
3093 {
3094 const CSeq_gap & seq_gap =
3095 pGapLiteral->GetSeq_data().GetGap();
3096 SGapModText gap_mod_text;
3097 GetGapModText(seq_gap, gap_mod_text);
3098
3099 CNcbiOstrstream gap_mod_strm;
3100 gap_mod_text.WriteAllModsAsFasta(gap_mod_strm);
3101 const string sGapModText =
3102 CNcbiOstrstreamToString(gap_mod_strm);
3103 if( ! sGapModText.empty() ) {
3104 m_Out << ' ' << sGapModText;
3105 }
3106 }
3107 }
3108 m_Out << '\n';
3109 rem_line = m_Width;
3110 } else {
3111 TSeqPos rem_gap = gap_size;
3112 while (rem_gap >= rem_line) {
3113 x_WriteBuffer(alt_gap_str.data(), rem_line);
3114 m_Out << '\n';
3115 rem_gap -= rem_line;
3116 rem_line = m_Width;
3117 }
3118 if (rem_gap > 0) {
3119 x_WriteBuffer(alt_gap_str.data(), rem_gap);
3120 rem_line -= rem_gap;
3121 }
3122 }
3123 it.SkipGap();
3124 if (rem_state >= gap_size) {
3125 rem_state -= gap_size;
3126 } else {
3127 while (++ms_it != masking_state.end()
3128 && ms_it->first < it.GetPos()) {
3129 current_state = ms_it->second;
3130 }
3131 if (ms_it == masking_state.end()) {
3132 rem_state = numeric_limits<TSeqPos>::max();
3133 } else {
3134 rem_state = ms_it->first - it.GetPos();
3135 }
3136 }
3137 } else {
3138 TSeqPos count = min(TSeqPos(it.GetBufferSize()), rem_state);
3139 TSeqPos new_pos = it.GetPos() + count;
3140 const char* ptr = it.GetBufferPtr();
3141 string lc_buffer;
3142
3143 rem_state -= count;
3144 if (current_state & eHardMask) {
3145 ptr = (current_state & eSoftMask) ? lc_hard_mask_str.data()
3146 : uc_hard_mask_str.data();
3147 } else if (current_state & eSoftMask) {
3148 // ToLower() always operates in place. :-/
3149 lc_buffer.assign(ptr, count);
3150 NStr::ToLower(lc_buffer);
3151 ptr = lc_buffer.data();
3152 }
3153 while ( count >= rem_line ) {
3154 x_WriteBuffer(ptr, rem_line);
3155 if ( !(current_state & eHardMask) ) {
3156 ptr += rem_line;
3157 }
3158 count -= rem_line;
3159 m_Out << '\n';
3160 rem_line = m_Width;
3161 }
3162 if ( count > 0 ) {
3163 x_WriteBuffer(ptr, count);
3164 rem_line -= count;
3165 }
3166 it.SetPos(new_pos);
3167 }
3168 }
3169 if ( rem_line < m_Width ) {
3170 m_Out << '\n';
3171 }
3172 // m_Out << NcbiFlush;
3173 }
3174
3175
WriteSequence(const CBioseq_Handle & handle,const CSeq_loc * location,const CSeq_loc::EOpFlags merge_flags)3176 void CFastaOstream::WriteSequence(const CBioseq_Handle& handle,
3177 const CSeq_loc* location,
3178 const CSeq_loc::EOpFlags merge_flags)
3179
3180 {
3181 vector<CTSE_Handle> used_tses;
3182 if ( !(m_Flags & fAssembleParts) && !handle.IsSetInst_Seq_data() ) {
3183 SSeqMapSelector sel(CSeqMap::fFindInnerRef, (size_t)-1);
3184 sel.SetLinkUsedTSE(handle.GetTSE_Handle());
3185 sel.SetLinkUsedTSE(used_tses);
3186 if ( !handle.GetSeqMap().CanResolveRange(&handle.GetScope(), sel) ) {
3187 return;
3188 }
3189 }
3190
3191 CScope& scope = handle.GetScope();
3192 CSeqVector v;
3193 if (location) {
3194 if (sequence::SeqLocCheck(*location, &scope)
3195 == sequence::eSeqLocCheck_error) {
3196 string label;
3197 location->GetLabel(&label);
3198 NCBI_THROW(CObjmgrUtilException, eBadLocation,
3199 "CFastaOstream: location out of range: " + label);
3200 }
3201 CRef<CSeq_loc> merged
3202 = sequence::Seq_loc_Merge(*location, merge_flags, &scope);
3203 v = CSeqVector(*merged, scope, CBioseq_Handle::eCoding_Iupac);
3204 } else {
3205 v = handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3206 }
3207 if (v.IsProtein()) { // allow extensions
3208 v.SetCoding(CSeq_data::e_Ncbieaa);
3209 }
3210
3211 TMSMap masking_state;
3212 if (m_SoftMask.NotEmpty() || m_HardMask.NotEmpty()) {
3213 x_GetMaskingStates(masking_state, handle.GetSeqId(), location, &scope);
3214 }
3215 x_WriteSequence(v, masking_state);
3216 }
3217
3218
Write(const CSeq_entry & entry,const CSeq_loc * location,bool no_scope)3219 void CFastaOstream::Write(const CSeq_entry& entry, const CSeq_loc* location,
3220 bool no_scope)
3221 {
3222 if (location || !no_scope) {
3223 CScope scope(*CObjectManager::GetInstance());
3224 Write(scope.AddTopLevelSeqEntry(entry), location);
3225 } else {
3226 switch (entry.Which()) {
3227 case CSeq_entry::e_Seq:
3228 Write(entry.GetSeq(), location, no_scope);
3229 break;
3230 case CSeq_entry::e_Set:
3231 ITERATE (CBioseq_set::TSeq_set, it, entry.GetSet().GetSeq_set()) {
3232 Write(**it, location, no_scope);
3233 }
3234 break;
3235 default:
3236 // throw
3237 break;
3238 }
3239 }
3240 }
3241
3242
Write(const CBioseq & seq,const CSeq_loc * location,bool no_scope,const string & custom_title)3243 void CFastaOstream::Write(const CBioseq& seq, const CSeq_loc* location,
3244 bool no_scope, const string& custom_title )
3245 {
3246 CScope scope(*CObjectManager::GetInstance());
3247 CBioseq_Handle bioseq_handle = scope.AddBioseq(seq);
3248 if (location || !no_scope) {
3249 Write(bioseq_handle, location, custom_title);
3250 } else {
3251 /// write our title
3252 x_WriteSeqIds(seq, NULL);
3253 x_WriteSeqTitle(bioseq_handle, custom_title);
3254
3255 /// write the sequence
3256 TMSMap masking_state;
3257 x_GetMaskingStates(masking_state, NULL, NULL, NULL);
3258
3259 /// check to see if all of our segments are resolvable
3260 bool is_raw = true;
3261 switch (seq.GetInst().GetRepr()) {
3262 case CSeq_inst::eRepr_raw:
3263 break;
3264 case CSeq_inst::eRepr_delta:
3265 ITERATE (CSeq_inst::TExt::TDelta::Tdata, iter,
3266 seq.GetInst().GetExt().GetDelta().Get()) {
3267 if ((*iter)->Which() == CDelta_seq::e_Loc) {
3268 is_raw = false;
3269 break;
3270 }
3271 }
3272 break;
3273 default:
3274 is_raw = false;
3275 break;
3276 }
3277
3278 if (is_raw) {
3279 CSeqVector vec(seq, NULL, CBioseq_Handle::eCoding_Iupac);
3280 if (vec.IsProtein()) { // allow extensions
3281 vec.SetCoding(CSeq_data::e_Ncbieaa);
3282 }
3283 x_WriteSequence(vec, masking_state);
3284 } else {
3285 /// we require far-pointer resolution
3286 CScope scope(*CObjectManager::GetInstance());
3287 CBioseq_Handle bsh = scope.AddBioseq(seq);
3288 CSeqVector vec(bsh, CBioseq_Handle::eCoding_Iupac);
3289 if (vec.IsProtein()) {
3290 vec.SetCoding(CSeq_data::e_Ncbieaa);
3291 }
3292 x_WriteSequence(vec, masking_state);
3293 }
3294 }
3295 }
3296
3297
GetMask(EMaskType type) const3298 CConstRef<CSeq_loc> CFastaOstream::GetMask(EMaskType type) const
3299 {
3300 return (type == eSoftMask) ? m_SoftMask : m_HardMask;
3301 }
3302
3303
SetMask(EMaskType type,CConstRef<CSeq_loc> location)3304 void CFastaOstream::SetMask(EMaskType type, CConstRef<CSeq_loc> location)
3305 {
3306 ((type == eSoftMask) ? m_SoftMask : m_HardMask) = location;
3307 }
3308
3309
SetWidth(TSeqPos width)3310 void CFastaOstream::SetWidth(TSeqPos width)
3311 {
3312 m_Width = width;
3313 m_Dashes.reset(new char[width]); memset(m_Dashes.get(), '-', width);
3314 m_LC_Ns .reset(new char[width]); memset(m_LC_Ns .get(), 'n', width);
3315 m_LC_Xs .reset(new char[width]); memset(m_LC_Xs .get(), 'x', width);
3316 m_UC_Ns .reset(new char[width]); memset(m_UC_Ns .get(), 'N', width);
3317 m_UC_Xs .reset(new char[width]); memset(m_UC_Xs .get(), 'X', width);
3318 }
3319
3320 void
WriteAllModsAsFasta(CNcbiOstream & out) const3321 CFastaOstream::SGapModText::WriteAllModsAsFasta(
3322 CNcbiOstream & out ) const
3323 {
3324 string sPrefix;
3325 if( ! gap_type.empty() ) {
3326 out << sPrefix << "[gap-type=" << gap_type << ']';
3327 sPrefix = " ";
3328 }
3329 if( ! gap_linkage_evidences.empty() ) {
3330 out << sPrefix << "[linkage-evidence=" << NStr::Join(gap_linkage_evidences, ";") << ']';
3331 sPrefix = " ";
3332 }
3333 }
3334
3335 // static
3336 void
GetGapModText(const CSeq_gap & seq_gap,SGapModText & out_gap_info)3337 CFastaOstream::GetGapModText(
3338 const CSeq_gap & seq_gap,
3339 SGapModText & out_gap_info )
3340 {
3341 // convenience references
3342 string & gap_type = out_gap_info.gap_type;
3343 vector<string> & gap_linkage_evidences =
3344 out_gap_info.gap_linkage_evidences;
3345
3346 // make sure initialized
3347 gap_type.clear();
3348 gap_linkage_evidences.clear();
3349
3350 // true if we need to have a /linkage-evidence tag.
3351 // Also, if this is false, we should *not* have any
3352 // linkage-evidence tag
3353 bool need_evidence = false;
3354
3355 // determine if we're linked, and also determine if
3356 // we need linkage-evidence
3357 bool is_linkage =
3358 seq_gap.CanGetLinkage() &&
3359 seq_gap.GetLinkage() == CSeq_gap::eLinkage_linked;
3360
3361 if ( seq_gap.IsSetLinkage_evidence() ) {
3362 is_linkage = true; /* do not rely solely on Seq-gap.linkage, which is not always set correctly */
3363 }
3364
3365 // For /gap_type qual
3366 if( seq_gap.CanGetType() ) {
3367 switch( seq_gap.GetType() ) {
3368 case CSeq_gap::eType_unknown:
3369 // don't show /gap_type - policy changed at SQD-1801
3370 gap_type = "unknown";
3371 need_evidence = is_linkage;
3372 break;
3373 case CSeq_gap::eType_fragment:
3374 gap_type = "within scaffold";
3375 need_evidence = true;
3376 break;
3377 case CSeq_gap::eType_clone:
3378 gap_type = ( is_linkage ? "within scaffold" : "between scaffolds" );
3379 need_evidence = is_linkage;
3380 break;
3381 case CSeq_gap::eType_short_arm:
3382 gap_type = "short arm";
3383 break;
3384 case CSeq_gap::eType_heterochromatin:
3385 gap_type = "heterochromatin";
3386 break;
3387 case CSeq_gap::eType_centromere:
3388 gap_type = "centromere";
3389 break;
3390 case CSeq_gap::eType_telomere:
3391 gap_type = "telomere";
3392 break;
3393 case CSeq_gap::eType_repeat:
3394 gap_type = ( is_linkage ?
3395 "repeat within scaffold" :
3396 "repeat between scaffolds" );
3397 need_evidence = is_linkage;
3398 break;
3399 case CSeq_gap::eType_contig:
3400 gap_type = "between scaffolds";
3401 break;
3402 case CSeq_gap::eType_scaffold:
3403 gap_type = "within scaffold";
3404 need_evidence = is_linkage;
3405 break;
3406 case CSeq_gap::eType_contamination:
3407 gap_type = "contamination";
3408 need_evidence = is_linkage;
3409 break;
3410 case CSeq_gap::eType_other:
3411 gap_type = "other";
3412 break;
3413 default:
3414 gap_type = "(ERROR: UNRECOGNIZED_GAP_TYPE:" +
3415 NStr::IntToString(seq_gap.GetType()) + ")";
3416 break;
3417 }
3418 }
3419
3420 // For linkage evidence
3421 if( seq_gap.CanGetLinkage_evidence() ) {
3422 ITERATE( CSeq_gap::TLinkage_evidence,
3423 evidence_iter,
3424 seq_gap.GetLinkage_evidence() )
3425 {
3426 const CLinkage_evidence & evidence = **evidence_iter;
3427 if( evidence.CanGetType() ) {
3428 switch( evidence.GetType() ) {
3429 case CLinkage_evidence::eType_paired_ends:
3430 gap_linkage_evidences.push_back("paired-ends");
3431 break;
3432 case CLinkage_evidence::eType_align_genus:
3433 gap_linkage_evidences.push_back("align genus");
3434 break;
3435 case CLinkage_evidence::eType_align_xgenus:
3436 gap_linkage_evidences.push_back("align xgenus");
3437 break;
3438 case CLinkage_evidence::eType_align_trnscpt:
3439 gap_linkage_evidences.push_back("align trnscpt");
3440 break;
3441 case CLinkage_evidence::eType_within_clone:
3442 gap_linkage_evidences.push_back("within clone");
3443 break;
3444 case CLinkage_evidence::eType_clone_contig:
3445 gap_linkage_evidences.push_back("clone contig");
3446 break;
3447 case CLinkage_evidence::eType_map:
3448 gap_linkage_evidences.push_back("map");
3449 break;
3450 case CLinkage_evidence::eType_strobe:
3451 gap_linkage_evidences.push_back("strobe");
3452 break;
3453 case CLinkage_evidence::eType_unspecified:
3454 gap_linkage_evidences.push_back("unspecified");
3455 break;
3456 case CLinkage_evidence::eType_pcr:
3457 gap_linkage_evidences.push_back("pcr");
3458 break;
3459 case CLinkage_evidence::eType_proximity_ligation:
3460 gap_linkage_evidences.push_back("proximity ligation");
3461 break;
3462 case CLinkage_evidence::eType_other:
3463 gap_linkage_evidences.push_back("other");
3464 break;
3465 default:
3466 gap_linkage_evidences.push_back("(UNRECOGNIZED LINKAGE EVIDENCE:" +
3467 NStr::IntToString( evidence.GetType() ) + ")");
3468 break;
3469 }
3470 }
3471 }
3472 }
3473
3474 if( need_evidence && gap_linkage_evidences.empty() ) {
3475 gap_linkage_evidences.push_back("unspecified");
3476 } else if( ! need_evidence && ! gap_linkage_evidences.empty() ) {
3477 // This case shouldn't happen if the validator is checking
3478 // records first.
3479 gap_linkage_evidences.clear();
3480 }
3481 }
3482
3483 /////////////////////////////////////////////////////////////////////////////
3484 //
3485 // sequence translation
3486 //
3487
3488
3489 template <class Container>
x_Translate(const Container & seq,string & prot,int frame,const CGenetic_code * code,bool is_5prime_complete,bool is_3prime_complete,bool include_stop,bool remove_trailing_X,bool * alt_start)3490 void x_Translate(const Container& seq,
3491 string& prot,
3492 int frame,
3493 const CGenetic_code* code,
3494 bool is_5prime_complete,
3495 bool is_3prime_complete,
3496 bool include_stop,
3497 bool remove_trailing_X,
3498 bool* alt_start)
3499 {
3500 // reserve our space
3501 const size_t usable_size = seq.size() > frame ? seq.size() - frame : 0;
3502 const size_t mod = usable_size % 3;
3503 prot.erase();
3504 prot.reserve((usable_size + 2) / 3);
3505
3506 // get appropriate translation table
3507 const CTrans_table & tbl =
3508 (code ? CGen_code_table::GetTransTable(*code) :
3509 CGen_code_table::GetTransTable(1));
3510
3511 char aa = '\0';
3512 int state = 0;
3513 int start_state = 0;
3514 try {
3515 // main loop through bases
3516 typename Container::const_iterator start = seq.begin();
3517 {{
3518 for (int i = 0; i < frame; ++i) {
3519 ++start;
3520 }
3521 }}
3522
3523 size_t i;
3524 size_t k;
3525 size_t length = usable_size / 3;
3526 bool check_start = (is_5prime_complete && frame == 0);
3527 bool first_time = true;
3528
3529 for (i = 0; i < length; ++i) {
3530
3531 // loop through one codon at a time
3532 for (k = 0; k < 3; ++k, ++start) {
3533 state = tbl.NextCodonState(state, *start);
3534 }
3535
3536 if (first_time) {
3537 start_state = state;
3538 }
3539
3540 // save translated amino acid
3541 if (first_time && check_start) {
3542 aa = tbl.GetStartResidue(state);
3543 prot.append(1, aa);
3544 } else {
3545 aa = tbl.GetCodonResidue(state);
3546 prot.append(1, aa);
3547 }
3548
3549 first_time = false;
3550 }
3551
3552 if (mod) {
3553 for (k = 0; k < mod; ++k, ++start) {
3554 state = tbl.NextCodonState(state, *start);
3555 }
3556
3557 for (; k < 3; ++k) {
3558 state = tbl.NextCodonState(state, 'N');
3559 }
3560
3561 if (first_time) {
3562 start_state = state;
3563 }
3564
3565 // save translated amino acid
3566 char c = tbl.GetCodonResidue(state);
3567 if (first_time && check_start) {
3568 aa = tbl.GetStartResidue(state);
3569 prot.append(1, aa);
3570 } else if (c != 'X') {
3571 // if padding was needed, trim ambiguous last residue
3572 aa = tbl.GetCodonResidue(state);
3573 prot.append(1, aa);
3574 }
3575 }
3576 } catch (CSeqVectorException& /*ex*/) {
3577 // ran out of sequence
3578 }
3579
3580 if ( aa != '*' && include_stop && (! mod) && prot.size() > 0 && is_3prime_complete ) {
3581 // check for stop codon that normally encodes an amino acid
3582 aa = tbl.GetStopResidue(state);
3583 if (aa == '*') {
3584 prot[prot.size()-1] = aa;
3585 }
3586 }
3587
3588 // check for alternative start codon
3589 if (alt_start && is_5prime_complete) {
3590 if ( tbl.IsAltStart(start_state) ) {
3591 *alt_start = true;
3592 } else {
3593 *alt_start = false;
3594 }
3595 }
3596
3597 if ( !include_stop ) {
3598 SIZE_TYPE sz = prot.find_first_of("*");
3599 if (sz != string::npos) {
3600 prot.resize(sz);
3601 }
3602 }
3603
3604 if (remove_trailing_X) {
3605 SIZE_TYPE sz;
3606 for (sz = prot.size(); sz > 0 && prot[sz - 1] == 'X'; --sz) {
3607 }
3608 prot.resize(sz);
3609 }
3610
3611 /**
3612 cerr << "source: ";
3613 ITERATE (typename Container, it, seq) {
3614 cerr << *it;
3615 }
3616 cerr << endl;
3617 cerr << "xlate: ";
3618 ITERATE (string, it, prot) {
3619 cerr << *it;
3620 }
3621 cerr << endl;
3622 **/
3623 }
3624
3625
AddAAToDeltaSeq(CRef<CBioseq> prot,char residue)3626 static void AddAAToDeltaSeq (CRef<CBioseq> prot, char residue)
3627 {
3628 if (prot->SetInst().SetExt().SetDelta().Set().empty()
3629 || prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetSeq_data().IsGap()) {
3630 // either first seg or transitioning from gap, need new seg
3631 CRef<CDelta_seq> seg(new CDelta_seq());
3632 seg->SetLiteral().SetLength(0);
3633 prot->SetInst().SetExt().SetDelta().Set().push_back(seg);
3634 }
3635
3636 CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3637
3638 if (residue == '*' || residue == '-') {
3639 // found a residue that is not part of the IUPACAA alphabet, must convert to NCBIEAA
3640 if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsIupacaa()) {
3641 // convert to ncbieaa
3642 string current = last->GetLiteral().GetSeq_data().GetIupacaa().Get();
3643 last->SetLiteral().SetSeq_data().SetNcbieaa().Set(current);
3644 }
3645 // add *
3646 last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3647 } else if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsNcbieaa()) {
3648 // already using NCBIEAA, must continue to do so
3649 last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3650 } else {
3651 // so far, have not found residues that are not part of IUPACAA, can continue to use IUPACAA
3652 last->SetLiteral().SetSeq_data().SetIupacaa().Set().append(1, residue);
3653 }
3654
3655 TSeqPos len = last->GetLiteral().GetLength();
3656 last->SetLiteral().SetLength(len + 1);
3657 }
3658
3659
AddGapToDeltaSeq(CRef<CBioseq> prot,bool unknown_length,TSeqPos add_len)3660 static void AddGapToDeltaSeq (CRef<CBioseq>prot, bool unknown_length, TSeqPos add_len)
3661 {
3662 if (prot->SetInst().SetExt().SetDelta().Set().empty()) {
3663 // create new segment for gap
3664 CRef<CDelta_seq> new_seg(new CDelta_seq());
3665 new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3666 new_seg->SetLiteral().SetLength(add_len);
3667 if (unknown_length) {
3668 new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3669 }
3670 prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3671 } else {
3672 CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3673 if (last->SetLiteral().GetSeq_data().IsGap()
3674 && ((unknown_length && last->SetLiteral().IsSetFuzz())
3675 || (!unknown_length && !last->SetLiteral().IsSetFuzz()))) {
3676 // ok, already creating gap segment with correct fuzz
3677 TSeqPos len = prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetLength();
3678 prot->SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(len + add_len);
3679 } else {
3680 // create new segment for gap
3681 CRef<CDelta_seq> new_seg(new CDelta_seq());
3682 new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3683 new_seg->SetLiteral().SetLength(add_len);
3684 if (unknown_length) {
3685 new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3686 }
3687 prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3688 }
3689 }
3690 }
3691
3692
TranslateToProtein(const CSeq_feat & cds,CScope & scope)3693 CRef<CBioseq> CSeqTranslator::TranslateToProtein(const CSeq_feat& cds,
3694 CScope& scope)
3695 {
3696 const CGenetic_code* code = NULL;
3697 int frame = 0;
3698 if (cds.GetData().IsCdregion()) {
3699 const CCdregion& cdr = cds.GetData().GetCdregion();
3700 if (cdr.IsSetFrame()) {
3701 switch (cdr.GetFrame()) {
3702 case CCdregion::eFrame_two:
3703 frame = 1;
3704 break;
3705 case CCdregion::eFrame_three:
3706 frame = 2;
3707 break;
3708 default:
3709 break;
3710 }
3711 }
3712 if (cdr.IsSetCode()) {
3713 code = &cdr.GetCode();
3714 }
3715 }
3716 bool is_5prime_complete = !cds.GetLocation().IsPartialStart(eExtreme_Biological);
3717
3718 CSeqVector seq(cds.GetLocation(), scope, CBioseq_Handle::eCoding_Iupac);
3719 CConstRef<CSeqMap> map;
3720 map.Reset(&seq.GetSeqMap());
3721
3722 CRef<CBioseq> prot(new CBioseq());
3723
3724 prot->SetInst().SetRepr(CSeq_inst::eRepr_delta);
3725 prot->SetInst().SetMol(CSeq_inst::eMol_aa);
3726 prot->SetInst().SetLength(0);
3727
3728 // reserve our space
3729 const TSeqPos usable_size = TSeqPos(seq.size()) - frame;
3730 const TSeqPos mod = usable_size % 3;
3731
3732 // get appropriate translation table
3733 const CTrans_table & tbl =
3734 (code ? CGen_code_table::GetTransTable(*code) :
3735 CGen_code_table::GetTransTable(1));
3736
3737 try {
3738 // main loop through bases
3739 CSeqVector::const_iterator start = seq.begin();
3740 for (int i = 0; i < frame; ++i) {
3741 ++start;
3742 }
3743
3744 TSeqPos i;
3745 TSeqPos k;
3746 int state = 0;
3747 TSeqPos length = usable_size / 3;
3748 bool check_start = (is_5prime_complete && frame == 0);
3749 bool first_time = true;
3750
3751 for (i = 0; i < length; ++i) {
3752 bool is_gap = true;
3753 bool unknown_length = false;
3754 TSeqPos pos = (i * 3) + frame;
3755
3756 if (start.HasZeroGapBefore()) {
3757 AddGapToDeltaSeq(prot, true, 0);
3758 }
3759
3760 // loop through one codon at a time
3761 for (k = 0; k < 3; ++k, ++start) {
3762 state = tbl.NextCodonState(state, *start);
3763 if (seq.IsInGap(pos + k)) {
3764 if (is_gap && !unknown_length) {
3765 CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3766 if (map_iter.GetType() == CSeqMap::eSeqGap
3767 && map_iter.IsUnknownLength()) {
3768 unknown_length = true;
3769 }
3770 }
3771 } else {
3772 is_gap = false;
3773 }
3774 }
3775
3776 if (is_gap) {
3777 AddGapToDeltaSeq(prot, unknown_length, 1);
3778 } else {
3779 // save translated amino acid
3780 if (first_time && check_start) {
3781 AddAAToDeltaSeq(prot, tbl.GetStartResidue(state));
3782 } else {
3783 AddAAToDeltaSeq(prot, tbl.GetCodonResidue(state));
3784 }
3785
3786 }
3787
3788 first_time = false;
3789 }
3790
3791 if (mod) {
3792 bool is_gap = true;
3793 bool unknown_length = false;
3794 TSeqPos pos = (length * 3) + frame;
3795 for (k = 0; k < mod; ++k, ++start) {
3796 state = tbl.NextCodonState(state, *start);
3797 if (seq.IsInGap(pos + k)) {
3798 if (is_gap && !unknown_length) {
3799 CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3800 if (map_iter.GetType() == CSeqMap::eSeqGap) {
3801 if (map_iter.IsUnknownLength()) {
3802 unknown_length = true;
3803 }
3804 }
3805 }
3806 } else {
3807 is_gap = false;
3808 }
3809 }
3810
3811 if (is_gap) {
3812 AddGapToDeltaSeq(prot, unknown_length, 1);
3813 } else {
3814 for (; k < 3; ++k) {
3815 state = tbl.NextCodonState(state, 'N');
3816 }
3817
3818 // save translated amino acid
3819 char c = tbl.GetCodonResidue(state);
3820 if (c != 'X') {
3821 if (first_time && check_start) {
3822 AddAAToDeltaSeq(prot, tbl.GetStartResidue(state));
3823 } else {
3824 AddAAToDeltaSeq(prot, tbl.GetCodonResidue(state));
3825 }
3826 }
3827 }
3828 }
3829 } catch (CSeqVectorException& /*ex*/) {
3830 // ran out of sequence
3831 }
3832
3833 TSeqPos prot_len = 0;
3834 ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
3835 prot_len += (*seg_it)->GetLiteral().GetLength();
3836 }
3837
3838 // code break substitution
3839 if (cds.GetData().IsCdregion() &&
3840 cds.GetData().GetCdregion().IsSetCode_break()) {
3841 const CCdregion& cdr = cds.GetData().GetCdregion();
3842 ITERATE(CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
3843 const CRef <CCode_break> brk = *code_break;
3844 const CSeq_loc& cbk_loc = brk->GetLoc();
3845 TSeqPos seq_pos =
3846 sequence::LocationOffset(cds.GetLocation(), cbk_loc,
3847 sequence::eOffset_FromStart,
3848 &scope);
3849 seq_pos -= frame;
3850 string::size_type j = seq_pos / 3;
3851 if (j < prot_len) {
3852 const CCode_break::C_Aa& c_aa = brk->GetAa();
3853 if (c_aa.IsNcbieaa()) {
3854 CDelta_ext::Tdata::iterator seg_it = prot->SetInst().SetExt().SetDelta().Set().begin();
3855 string::size_type offset = 0;
3856 while (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
3857 && offset + (*seg_it)->GetLiteral().GetLength() < j) {
3858 offset += (*seg_it)->GetLiteral().GetLength();
3859 ++seg_it;
3860 }
3861 if (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
3862 && !(*seg_it)->GetLiteral().GetSeq_data().IsGap()) {
3863 if ((*seg_it)->GetLiteral().GetSeq_data().IsIupacaa()) {
3864 (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set()[j - offset] = c_aa.GetNcbieaa();
3865 } else {
3866 (*seg_it)->SetLiteral().SetSeq_data().SetNcbieaa().Set()[j - offset] = c_aa.GetNcbieaa();
3867 }
3868 }
3869 }
3870 } else if (j == prot_len) {
3871 // add terminal exception
3872 const CCode_break::C_Aa& c_aa = brk->GetAa();
3873 if (c_aa.IsNcbieaa() && c_aa.GetNcbieaa() == 42) {
3874 AddAAToDeltaSeq(prot, c_aa.GetNcbieaa());
3875 }
3876 }
3877 }
3878 }
3879
3880 // remove stop codon from end
3881 CRef<CDelta_seq> end;
3882 if (!prot->SetInst().SetExt().SetDelta().Set().empty())
3883 {
3884 end = prot->SetInst().SetExt().SetDelta().Set().back();
3885 }
3886
3887 if (end && end->IsLiteral() && end->GetLiteral().IsSetSeq_data()) {
3888 if (end->GetLiteral().GetSeq_data().IsIupacaa()) {
3889 string& last_seg = end->SetLiteral().SetSeq_data().SetIupacaa().Set();
3890 if (NStr::EndsWith(last_seg, "*")) {
3891 last_seg = last_seg.substr(0, last_seg.length() - 1);
3892 end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
3893 }
3894 } else if (end->GetLiteral().GetSeq_data().IsNcbieaa()) {
3895 string& last_seg = end->SetLiteral().SetSeq_data().SetNcbieaa().Set();
3896 if (NStr::EndsWith(last_seg, "*")) {
3897 last_seg = last_seg.substr(0, last_seg.length() - 1);
3898 end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
3899 }
3900 }
3901 }
3902
3903 // recalculate protein length, check need for ncbieaa - may have been altered by removal of stop codon/transl_except
3904 prot_len = 0;
3905 NON_CONST_ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
3906 prot_len += (*seg_it)->GetLiteral().GetLength();
3907 if ((*seg_it)->GetLiteral().IsSetSeq_data()
3908 && (*seg_it)->GetLiteral().GetSeq_data().IsNcbieaa()) {
3909 string current = (*seg_it)->GetLiteral().GetSeq_data().GetNcbieaa();
3910 if (NStr::Find(current, "*") == string::npos && NStr::Find(current, "-") == string::npos) {
3911 (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set(current);
3912 }
3913 }
3914 }
3915 prot->SetInst().SetLength(prot_len);
3916
3917 if (prot->GetInst().GetLength() == 0) {
3918 prot.Reset(NULL);
3919 } else if (prot->SetInst().SetExt().SetDelta().Set().size() == 1
3920 && prot->SetInst().SetExt().SetDelta().Set().front()->IsLiteral()
3921 && prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().IsSetSeq_data()) {
3922 // only one segment, should be raw rather than delta
3923 if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsIupacaa()) {
3924 string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetIupacaa().Get();
3925 prot->SetInst().ResetExt();
3926 prot->SetInst().SetSeq_data().SetIupacaa().Set(data);
3927 prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
3928 } else if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsNcbieaa()) {
3929 string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetNcbieaa().Get();
3930 prot->SetInst().ResetExt();
3931 prot->SetInst().SetSeq_data().SetNcbieaa().Set(data);
3932 prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
3933 }
3934 }
3935
3936 return prot;
3937 }
3938
3939
ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein)3940 bool CSeqTranslator::ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein)
3941 {
3942 if (!protein || !protein->IsAa() || !protein->IsSetInst()) {
3943 return false;
3944 }
3945 return protein->SetInst().ConvertDeltaToRaw();
3946 }
3947
3948
Translate(const string & seq,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start,bool is_5prime_complete,bool is_3prime_complete)3949 void CSeqTranslator::Translate(const string& seq, string& prot,
3950 const CGenetic_code* code,
3951 bool include_stop,
3952 bool remove_trailing_X,
3953 bool* alt_start,
3954 bool is_5prime_complete,
3955 bool is_3prime_complete)
3956 {
3957 x_Translate(seq, prot, 0, code,
3958 is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
3959 }
3960
3961
Translate(const string & seq,string & prot,TTranslationFlags flags,const CGenetic_code * code,bool * alt_start)3962 void CSeqTranslator::Translate(const string& seq,
3963 string& prot,
3964 TTranslationFlags flags,
3965 const CGenetic_code* code,
3966 bool* alt_start)
3967 {
3968 x_Translate(seq, prot, 0, code,
3969 !(flags & fIs5PrimePartial),
3970 !(flags & fIs3PrimePartial),
3971 !(flags & fNoStop),
3972 flags & fRemoveTrailingX,
3973 alt_start);
3974 }
3975
3976
Translate(const CSeqVector & seq,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start,bool is_5prime_complete,bool is_3prime_complete)3977 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
3978 const CGenetic_code* code,
3979 bool include_stop,
3980 bool remove_trailing_X,
3981 bool* alt_start,
3982 bool is_5prime_complete,
3983 bool is_3prime_complete)
3984 {
3985 x_Translate(seq, prot, 0, code,
3986 is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
3987 }
3988
3989
Translate(const CSeqVector & seq,string & prot,TTranslationFlags flags,const CGenetic_code * code,bool * alt_start)3990 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
3991 TTranslationFlags flags,
3992 const CGenetic_code* code,
3993 bool* alt_start)
3994 {
3995 x_Translate(seq, prot, 0, code,
3996 !(flags & fIs5PrimePartial),
3997 !(flags & fIs3PrimePartial),
3998 !(flags & fNoStop),
3999 flags & fRemoveTrailingX,
4000 alt_start);
4001 }
4002
4003
Translate(const CSeq_loc & loc,const CBioseq_Handle & handle,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start)4004 void CSeqTranslator::Translate(const CSeq_loc& loc,
4005 const CBioseq_Handle& handle,
4006 string& prot,
4007 const CGenetic_code* code,
4008 bool include_stop,
4009 bool remove_trailing_X,
4010 bool* alt_start)
4011 {
4012 CSeqVector seq(loc, handle.GetScope(), CBioseq_Handle::eCoding_Iupac);
4013 x_Translate(seq, prot, 0, code,
4014 !loc.IsPartialStart(eExtreme_Biological),
4015 !loc.IsPartialStop(eExtreme_Biological),
4016 include_stop, remove_trailing_X, alt_start);
4017 }
4018
4019
4020
Translate(const CSeq_loc & loc,CScope & scope,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start)4021 void CSeqTranslator::Translate(const CSeq_loc& loc,
4022 CScope& scope,
4023 string& prot,
4024 const CGenetic_code* code,
4025 bool include_stop,
4026 bool remove_trailing_X,
4027 bool* alt_start)
4028 {
4029 CSeqVector seq(loc, scope, CBioseq_Handle::eCoding_Iupac);
4030 x_Translate(seq, prot, 0, code,
4031 !loc.IsPartialStart(eExtreme_Biological),
4032 !loc.IsPartialStop(eExtreme_Biological),
4033 include_stop, remove_trailing_X, alt_start);
4034 }
4035
4036
Translate(const CSeq_feat & feat,CScope & scope,string & prot,bool include_stop,bool remove_trailing_X,bool * alt_start)4037 void CSeqTranslator::Translate(const CSeq_feat& feat,
4038 CScope& scope,
4039 string& prot,
4040 bool include_stop,
4041 bool remove_trailing_X,
4042 bool* alt_start)
4043 {
4044 const CGenetic_code* code = NULL;
4045 int frame = 0;
4046 if (feat.GetData().IsCdregion()) {
4047 const CCdregion& cdr = feat.GetData().GetCdregion();
4048 if (cdr.IsSetFrame ()) {
4049 switch (cdr.GetFrame ()) {
4050 case CCdregion::eFrame_two :
4051 frame = 1;
4052 break;
4053 case CCdregion::eFrame_three :
4054 frame = 2;
4055 break;
4056 default :
4057 break;
4058 }
4059 }
4060 if (cdr.IsSetCode()) {
4061 code = &cdr.GetCode();
4062 }
4063 }
4064
4065 bool code_break_include_stop = include_stop;
4066 if (feat.GetData().IsCdregion() &&
4067 feat.GetData().GetCdregion().IsSetCode_break()) {
4068 code_break_include_stop = true;
4069 }
4070
4071 CSeqVector seq(feat.GetLocation(), scope, CBioseq_Handle::eCoding_Iupac);
4072 x_Translate(seq, prot, frame, code,
4073 !feat.GetLocation().IsPartialStart(eExtreme_Biological),
4074 !feat.GetLocation().IsPartialStop(eExtreme_Biological),
4075 code_break_include_stop, remove_trailing_X, alt_start);
4076
4077
4078 // code break substitution
4079 if (feat.GetData().IsCdregion() &&
4080 feat.GetData().GetCdregion().IsSetCode_break()) {
4081 const CCdregion& cdr = feat.GetData().GetCdregion();
4082 string::size_type protlen = prot.size();
4083 ITERATE (CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
4084 const CRef <CCode_break> brk = *code_break;
4085 const CSeq_loc& cbk_loc = brk->GetLoc();
4086 TSeqPos seq_pos =
4087 sequence::LocationOffset(feat.GetLocation(), cbk_loc,
4088 sequence::eOffset_FromStart,
4089 &scope);
4090 seq_pos -= frame;
4091 string::size_type i = seq_pos / 3;
4092 if (i < protlen) {
4093 const CCode_break::C_Aa& c_aa = brk->GetAa ();
4094 if (c_aa.IsNcbieaa ()) {
4095 prot [i] = c_aa.GetNcbieaa ();
4096 }
4097 } else if (i == protlen) {
4098 // add terminal exception
4099 const CCode_break::C_Aa& c_aa = brk->GetAa ();
4100 if (c_aa.IsNcbieaa () && c_aa.GetNcbieaa () == 42) {
4101 prot += c_aa.GetNcbieaa ();
4102 }
4103 }
4104 }
4105
4106 if ( !include_stop ) {
4107 SIZE_TYPE sz = prot.find_first_of("*");
4108 if (sz != string::npos) {
4109 prot.resize(sz);
4110 }
4111 }
4112 }
4113 }
4114
4115
4116 typedef struct {
4117 bool has_final_stop;
4118 bool has_internal_stop;
4119 bool has_start_m;
4120 size_t len;
4121 size_t frame_offset;
4122 } SFrameInfo;
4123
4124 typedef map<CCdregion::EFrame, SFrameInfo> TFrameInfoMap;
4125
FindBestFrame(const CSeq_feat & cds,CScope & scope,bool & ambiguous)4126 CCdregion::EFrame CSeqTranslator::FindBestFrame(const CSeq_feat& cds, CScope& scope, bool& ambiguous)
4127 {
4128 ambiguous = false;
4129 if (!cds.IsSetLocation() || !cds.IsSetData() || !cds.GetData().IsCdregion()) {
4130 return CCdregion::eFrame_not_set;
4131 }
4132 const CCdregion& cdr = cds.GetData().GetCdregion();
4133
4134 CCdregion::EFrame orig_frame = cdr.IsSetFrame() ? cdr.GetFrame() : CCdregion::eFrame_one;
4135 if (orig_frame == CCdregion::eFrame_not_set) {
4136 orig_frame = CCdregion::eFrame_one;
4137 }
4138
4139 CRef<CSeq_feat> tmp_cds(new CSeq_feat());
4140 tmp_cds->Assign(cds);
4141 TFrameInfoMap frame_map;
4142 frame_map[CCdregion::eFrame_one] = { false, false, false, NPOS, 0 };
4143 frame_map[CCdregion::eFrame_two] = { false, false, false, NPOS, 1 };
4144 frame_map[CCdregion::eFrame_three] = { false, false, false, NPOS, 2 };
4145
4146 bool is_3complete = !tmp_cds->GetLocation().IsPartialStop(eExtreme_Biological);
4147 bool is_5complete = !tmp_cds->GetLocation().IsPartialStart(eExtreme_Biological);
4148
4149 size_t leftover = sequence::GetLength(tmp_cds->GetLocation(), &scope) % 3;
4150
4151 for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4152 tmp_cds->SetData().SetCdregion().SetFrame(it->first);
4153 string prot;
4154 CSeqTranslator::Translate(*tmp_cds, scope, prot, true, false, NULL);
4155 size_t pos = NStr::Find(prot, "*");
4156 it->second.len = prot.length();
4157
4158 if ((pos == prot.length() - 1) && (leftover == it->second.frame_offset)) {
4159 it->second.has_final_stop = true;
4160 } else if (pos != NPOS) {
4161 it->second.has_internal_stop = true;
4162 }
4163
4164 if (NStr::StartsWith(prot, "M") && it->second.frame_offset == 0) {
4165 it->second.has_start_m = true;
4166 }
4167 }
4168
4169 // if the original frame has no internal stop codons and has a final
4170 // stop codon, keep the original frame
4171 if (frame_map[orig_frame].has_final_stop) {
4172 return orig_frame;
4173 }
4174
4175 if (is_3complete && !is_5complete) {
4176 // find a frame that has a stop codon
4177 for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4178 if (it->second.has_final_stop) {
4179 return it->first;
4180 }
4181 }
4182 }
4183
4184 if (is_5complete && !is_3complete) {
4185 // find a frame that has a start codon (could only be first frame)
4186 if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4187 return CCdregion::eFrame_one;
4188 }
4189 }
4190
4191 if (is_5complete) {
4192 // find a frame that has a start codon (could only be first frame)
4193 if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4194 return CCdregion::eFrame_one;
4195 }
4196 }
4197
4198 if (is_3complete) {
4199 // find a frame that has a stop codon
4200 for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4201 if (it->second.has_final_stop) {
4202 return it->first;
4203 }
4204 }
4205 }
4206
4207 // otherwise, just looking for no internal stop codon
4208 if (!frame_map[orig_frame].has_internal_stop) {
4209 return orig_frame;
4210 }
4211
4212 CCdregion::EFrame best_frame = CCdregion::eFrame_not_set;
4213 for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4214 if (!it->second.has_internal_stop) {
4215 if (best_frame == CCdregion::eFrame_not_set) {
4216 best_frame = it->first;
4217 } else {
4218 ambiguous = true;
4219 }
4220 }
4221 }
4222 if (best_frame != CCdregion::eFrame_not_set) {
4223 return best_frame;
4224 } else {
4225 return orig_frame;
4226 }
4227 }
4228
4229
FindBestFrame(const CSeq_feat & cds,CScope & scope)4230 CCdregion::EFrame CSeqTranslator::FindBestFrame(const CSeq_feat& cds, CScope& scope)
4231 {
4232 bool ambiguous = false;
4233
4234 return FindBestFrame(cds, scope, ambiguous);
4235 }
4236
4237
TranslateCdregion(string & prot,const CBioseq_Handle & bsh,const CSeq_loc & loc,const CCdregion & cdr,bool include_stop,bool remove_trailing_X,bool * alt_start,ETranslationLengthProblemOptions)4238 void CCdregion_translate::TranslateCdregion (string& prot,
4239 const CBioseq_Handle& bsh,
4240 const CSeq_loc& loc,
4241 const CCdregion& cdr,
4242 bool include_stop,
4243 bool remove_trailing_X,
4244 bool* alt_start,
4245 ETranslationLengthProblemOptions /*options*/)
4246 {
4247 CSeq_feat feat;
4248 feat.SetLocation(const_cast<CSeq_loc&>(loc));
4249 feat.SetData().SetCdregion(const_cast<CCdregion&>(cdr));
4250 CSeqTranslator::Translate(feat, bsh.GetScope(), prot,
4251 include_stop, remove_trailing_X, alt_start);
4252 }
4253
4254
TranslateCdregion(string & prot,const CSeq_feat & cds,CScope & scope,bool include_stop,bool remove_trailing_X,bool * alt_start,ETranslationLengthProblemOptions)4255 void CCdregion_translate::TranslateCdregion (
4256 string& prot,
4257 const CSeq_feat& cds,
4258 CScope& scope,
4259 bool include_stop,
4260 bool remove_trailing_X,
4261 bool* alt_start,
4262 ETranslationLengthProblemOptions /*options*/)
4263 {
4264 _ASSERT(cds.GetData().IsCdregion());
4265 prot.erase();
4266 CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
4267 if ( !bsh ) {
4268 return;
4269 }
4270 CSeqTranslator::Translate(cds, bsh.GetScope(), prot,
4271 include_stop, remove_trailing_X, alt_start);
4272 }
4273
4274
SRelLoc(const CSeq_loc & parent,const CSeq_loc & child,CScope * scope,SRelLoc::TFlags flags)4275 SRelLoc::SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope,
4276 SRelLoc::TFlags flags)
4277 : m_ParentLoc(&parent)
4278 {
4279 typedef CSeq_loc::TRange TRange0;
4280 for (CSeq_loc_CI cit(child); cit; ++cit) {
4281 const CSeq_id& cseqid = cit.GetSeq_id();
4282 TRange0 crange = cit.GetRange();
4283 if (crange.IsWholeTo() && scope) {
4284 // determine actual end
4285 crange.SetToOpen(sequence::GetLength(cit.GetSeq_id(), scope));
4286 }
4287 ENa_strand cstrand = cit.GetStrand();
4288 TSeqPos pos = 0;
4289 for (CSeq_loc_CI pit(parent); pit; ++pit) {
4290 ENa_strand pstrand = pit.GetStrand();
4291 TRange0 prange = pit.GetRange();
4292 if (prange.IsWholeTo() && scope) {
4293 // determine actual end
4294 prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4295 }
4296 if ( !sequence::IsSameBioseq(cseqid, pit.GetSeq_id(), scope) ) {
4297 pos += prange.GetLength();
4298 continue;
4299 }
4300 CRef<TRange> intersection(new TRange);
4301 TSeqPos abs_from, abs_to;
4302 CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
4303 if (crange.GetFrom() >= prange.GetFrom()) {
4304 abs_from = crange.GetFrom();
4305 fuzz_from = cit.GetFuzzFrom();
4306 if (abs_from == prange.GetFrom()) {
4307 // subtract out parent fuzz, if any
4308 const CInt_fuzz* pfuzz = pit.GetFuzzFrom();
4309 if (pfuzz) {
4310 if (fuzz_from) {
4311 CRef<CInt_fuzz> f(new CInt_fuzz);
4312 f->Assign(*fuzz_from);
4313 f->Subtract(*pfuzz, abs_from, abs_from);
4314 if (f->IsP_m() && !f->GetP_m() ) {
4315 fuzz_from.Reset(); // cancelled
4316 } else {
4317 fuzz_from = f;
4318 }
4319 } else {
4320 fuzz_from = pfuzz->Negative(abs_from);
4321 }
4322 }
4323 }
4324 } else {
4325 abs_from = prange.GetFrom();
4326 // fuzz_from = pit.GetFuzzFrom();
4327 CRef<CInt_fuzz> f(new CInt_fuzz);
4328 f->SetLim(CInt_fuzz::eLim_lt);
4329 fuzz_from = f;
4330 }
4331 if (crange.GetTo() <= prange.GetTo()) {
4332 abs_to = crange.GetTo();
4333 fuzz_to = cit.GetFuzzTo();
4334 if (abs_to == prange.GetTo()) {
4335 // subtract out parent fuzz, if any
4336 const CInt_fuzz* pfuzz = pit.GetFuzzTo();
4337 if (pfuzz) {
4338 if (fuzz_to) {
4339 CRef<CInt_fuzz> f(new CInt_fuzz);
4340 f->Assign(*fuzz_to);
4341 f->Subtract(*pfuzz, abs_to, abs_to);
4342 if (f->IsP_m() && !f->GetP_m() ) {
4343 fuzz_to.Reset(); // cancelled
4344 } else {
4345 fuzz_to = f;
4346 }
4347 } else {
4348 fuzz_to = pfuzz->Negative(abs_to);
4349 }
4350 }
4351 }
4352 } else {
4353 abs_to = prange.GetTo();
4354 // fuzz_to = pit.GetFuzzTo();
4355 CRef<CInt_fuzz> f(new CInt_fuzz);
4356 f->SetLim(CInt_fuzz::eLim_gt);
4357 fuzz_to = f;
4358 }
4359 if (abs_from <= abs_to) {
4360 if (IsReverse(pstrand)) {
4361 TSeqPos sigma = pos + prange.GetTo();
4362 intersection->SetFrom(sigma - abs_to);
4363 intersection->SetTo (sigma - abs_from);
4364 if (fuzz_from) {
4365 intersection->SetFuzz_to().AssignTranslated
4366 (*fuzz_from, intersection->GetTo(), abs_from);
4367 intersection->SetFuzz_to().Negate
4368 (intersection->GetTo());
4369 }
4370 if (fuzz_to) {
4371 intersection->SetFuzz_from().AssignTranslated
4372 (*fuzz_to, intersection->GetFrom(), abs_to);
4373 intersection->SetFuzz_from().Negate
4374 (intersection->GetFrom());
4375 }
4376 if (cstrand == eNa_strand_unknown) {
4377 intersection->SetStrand(pstrand);
4378 } else {
4379 intersection->SetStrand(Reverse(cstrand));
4380 }
4381 } else {
4382 TSignedSeqPos delta = pos - prange.GetFrom();
4383 intersection->SetFrom(abs_from + delta);
4384 intersection->SetTo (abs_to + delta);
4385 if (fuzz_from) {
4386 intersection->SetFuzz_from().AssignTranslated
4387 (*fuzz_from, intersection->GetFrom(), abs_from);
4388 }
4389 if (fuzz_to) {
4390 intersection->SetFuzz_to().AssignTranslated
4391 (*fuzz_to, intersection->GetTo(), abs_to);
4392 }
4393 if (cstrand == eNa_strand_unknown) {
4394 intersection->SetStrand(pstrand);
4395 } else {
4396 intersection->SetStrand(cstrand);
4397 }
4398 }
4399 // add to m_Ranges, combining with the previous
4400 // interval if possible
4401 if ( !(flags & fNoMerge) && !m_Ranges.empty()
4402 && SameOrientation(intersection->GetStrand(),
4403 m_Ranges.back()->GetStrand()) ) {
4404 if (m_Ranges.back()->GetTo() == intersection->GetFrom() - 1
4405 && !IsReverse(intersection->GetStrand()) ) {
4406 m_Ranges.back()->SetTo(intersection->GetTo());
4407 if (intersection->IsSetFuzz_to()) {
4408 m_Ranges.back()->SetFuzz_to
4409 (intersection->SetFuzz_to());
4410 } else {
4411 m_Ranges.back()->ResetFuzz_to();
4412 }
4413 } else if (m_Ranges.back()->GetFrom()
4414 == intersection->GetTo() + 1
4415 && IsReverse(intersection->GetStrand())) {
4416 m_Ranges.back()->SetFrom(intersection->GetFrom());
4417 if (intersection->IsSetFuzz_from()) {
4418 m_Ranges.back()->SetFuzz_from
4419 (intersection->SetFuzz_from());
4420 } else {
4421 m_Ranges.back()->ResetFuzz_from();
4422 }
4423 } else {
4424 m_Ranges.push_back(intersection);
4425 }
4426 } else {
4427 m_Ranges.push_back(intersection);
4428 }
4429 }
4430 pos += prange.GetLength();
4431 }
4432 }
4433 }
4434
4435
4436 // Bother trying to merge?
Resolve(const CSeq_loc & new_parent,CScope * scope,SRelLoc::TFlags) const4437 CRef<CSeq_loc> SRelLoc::Resolve(const CSeq_loc& new_parent, CScope* scope,
4438 SRelLoc::TFlags /* flags */)
4439 const
4440 {
4441 typedef CSeq_loc::TRange TRange0;
4442 CRef<CSeq_loc> result(new CSeq_loc);
4443 CSeq_loc_mix& mix = result->SetMix();
4444 ITERATE (TRanges, it, m_Ranges) {
4445 _ASSERT((*it)->GetFrom() <= (*it)->GetTo());
4446 TSeqPos pos = 0, start = (*it)->GetFrom();
4447 bool keep_going = true;
4448 for (CSeq_loc_CI pit(new_parent); pit; ++pit) {
4449 TRange0 prange = pit.GetRange();
4450 if (prange.IsWholeTo() && scope) {
4451 // determine actual end
4452 prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4453 }
4454 TSeqPos length = prange.GetLength();
4455 if (start >= pos && start < pos + length) {
4456 TSeqPos from, to;
4457 CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
4458 ENa_strand strand;
4459 if (IsReverse(pit.GetStrand())) {
4460 TSeqPos sigma = pos + prange.GetTo();
4461 from = sigma - (*it)->GetTo();
4462 to = sigma - start;
4463 if (from < prange.GetFrom() || from > sigma) {
4464 from = prange.GetFrom();
4465 keep_going = true;
4466 } else {
4467 keep_going = false;
4468 }
4469 if ( !(*it)->IsSetStrand()
4470 || (*it)->GetStrand() == eNa_strand_unknown) {
4471 strand = pit.GetStrand();
4472 } else {
4473 strand = Reverse((*it)->GetStrand());
4474 }
4475 if (from == prange.GetFrom()) {
4476 fuzz_from = pit.GetFuzzFrom();
4477 }
4478 if ( !keep_going && (*it)->IsSetFuzz_to() ) {
4479 CRef<CInt_fuzz> f(new CInt_fuzz);
4480 if (fuzz_from) {
4481 f->Assign(*fuzz_from);
4482 } else {
4483 f->SetP_m(0);
4484 }
4485 f->Subtract((*it)->GetFuzz_to(), from, (*it)->GetTo(),
4486 CInt_fuzz::eAmplify);
4487 if (f->IsP_m() && !f->GetP_m() ) {
4488 fuzz_from.Reset(); // cancelled
4489 } else {
4490 fuzz_from = f;
4491 }
4492 }
4493 if (to == prange.GetTo()) {
4494 fuzz_to = pit.GetFuzzTo();
4495 }
4496 if (start == (*it)->GetFrom()
4497 && (*it)->IsSetFuzz_from()) {
4498 CRef<CInt_fuzz> f(new CInt_fuzz);
4499 if (fuzz_to) {
4500 f->Assign(*fuzz_to);
4501 } else {
4502 f->SetP_m(0);
4503 }
4504 f->Subtract((*it)->GetFuzz_from(), to,
4505 (*it)->GetFrom(), CInt_fuzz::eAmplify);
4506 if (f->IsP_m() && !f->GetP_m() ) {
4507 fuzz_to.Reset(); // cancelled
4508 } else {
4509 fuzz_to = f;
4510 }
4511 }
4512 } else {
4513 TSignedSeqPos delta = prange.GetFrom() - pos;
4514 from = start + delta;
4515 to = (*it)->GetTo() + delta;
4516 if (to > prange.GetTo()) {
4517 to = prange.GetTo();
4518 keep_going = true;
4519 } else {
4520 keep_going = false;
4521 }
4522 if ( !(*it)->IsSetStrand()
4523 || (*it)->GetStrand() == eNa_strand_unknown) {
4524 strand = pit.GetStrand();
4525 } else {
4526 strand = (*it)->GetStrand();
4527 }
4528 if (from == prange.GetFrom()) {
4529 fuzz_from = pit.GetFuzzFrom();
4530 }
4531 if (start == (*it)->GetFrom()
4532 && (*it)->IsSetFuzz_from()) {
4533 CRef<CInt_fuzz> f(new CInt_fuzz);
4534 if (fuzz_from) {
4535 f->Assign(*fuzz_from);
4536 f->Add((*it)->GetFuzz_from(), from,
4537 (*it)->GetFrom());
4538 } else {
4539 f->AssignTranslated((*it)->GetFuzz_from(), from,
4540 (*it)->GetFrom());
4541 }
4542 if (f->IsP_m() && !f->GetP_m() ) {
4543 fuzz_from.Reset(); // cancelled
4544 } else {
4545 fuzz_from = f;
4546 }
4547 }
4548 if (to == prange.GetTo()) {
4549 fuzz_to = pit.GetFuzzTo();
4550 }
4551 if ( !keep_going && (*it)->IsSetFuzz_to() ) {
4552 CRef<CInt_fuzz> f(new CInt_fuzz);
4553 if (fuzz_to) {
4554 f->Assign(*fuzz_to);
4555 f->Add((*it)->GetFuzz_to(), to, (*it)->GetTo());
4556 } else {
4557 f->AssignTranslated((*it)->GetFuzz_to(), to,
4558 (*it)->GetTo());
4559 }
4560 if (f->IsP_m() && !f->GetP_m() ) {
4561 fuzz_to.Reset(); // cancelled
4562 } else {
4563 fuzz_to = f;
4564 }
4565 }
4566 }
4567 if (from == to
4568 && (fuzz_from == fuzz_to
4569 || (fuzz_from.GetPointer() && fuzz_to.GetPointer()
4570 && fuzz_from->Equals(*fuzz_to)))) {
4571 // just a point
4572 CRef<CSeq_loc> loc(new CSeq_loc);
4573 CSeq_point& point = loc->SetPnt();
4574 point.SetPoint(from);
4575 if (strand != eNa_strand_unknown) {
4576 point.SetStrand(strand);
4577 }
4578 if (fuzz_from) {
4579 point.SetFuzz().Assign(*fuzz_from);
4580 }
4581 point.SetId().Assign(pit.GetSeq_id());
4582 mix.Set().push_back(loc);
4583 } else {
4584 CRef<CSeq_loc> loc(new CSeq_loc);
4585 CSeq_interval& ival = loc->SetInt();
4586 ival.SetFrom(from);
4587 ival.SetTo(to);
4588 if (strand != eNa_strand_unknown) {
4589 ival.SetStrand(strand);
4590 }
4591 if (fuzz_from) {
4592 ival.SetFuzz_from().Assign(*fuzz_from);
4593 }
4594 if (fuzz_to) {
4595 ival.SetFuzz_to().Assign(*fuzz_to);
4596 }
4597 ival.SetId().Assign(pit.GetSeq_id());
4598 mix.Set().push_back(loc);
4599 }
4600 if (keep_going) {
4601 start = pos + length;
4602 } else {
4603 break;
4604 }
4605 }
4606 pos += length;
4607 }
4608 if (keep_going) {
4609 TSeqPos total_length;
4610 string label;
4611 new_parent.GetLabel(&label);
4612 try {
4613 total_length = sequence::GetLength(new_parent, scope);
4614 ERR_POST_X(8, Warning << "SRelLoc::Resolve: Relative position "
4615 << start << " exceeds length (" << total_length
4616 << ") of parent location " << label);
4617 } catch (CObjmgrUtilException) {
4618 ERR_POST_X(9, Warning << "SRelLoc::Resolve: Relative position "
4619 << start
4620 << " exceeds length (?\?\?) of parent location "
4621 << label);
4622 }
4623 }
4624 }
4625 // clean up output
4626 switch (mix.Get().size()) {
4627 case 0:
4628 result->SetNull();
4629 break;
4630 case 1:
4631 {{
4632 CRef<CSeq_loc> first = mix.Set().front();
4633 result = first;
4634 break;
4635 }}
4636 default:
4637 break;
4638 }
4639 return result;
4640 }
4641
4642
4643 //============================================================================//
4644 // SeqSearch //
4645 //============================================================================//
4646
4647 // Public:
4648 // =======
4649
4650 // Constructors and Destructors:
CSeqSearch(IClient * client,TSearchFlags flags)4651 CSeqSearch::CSeqSearch(IClient *client, TSearchFlags flags) :
4652 m_Client(client), m_Flags(flags), m_LongestPattern(0), m_Fsa(true)
4653 {
4654 }
4655
4656
~CSeqSearch(void)4657 CSeqSearch::~CSeqSearch(void)
4658 {
4659 }
4660
4661
4662 typedef SStaticPair<Char, Char> TCharPair;
4663 static const TCharPair sc_comp_tbl[32] = {
4664 // uppercase
4665 { 'A', 'T' },
4666 { 'B', 'V' },
4667 { 'C', 'G' },
4668 { 'D', 'H' },
4669 { 'G', 'C' },
4670 { 'H', 'D' },
4671 { 'K', 'M' },
4672 { 'M', 'K' },
4673 { 'N', 'N' },
4674 { 'R', 'Y' },
4675 { 'S', 'S' },
4676 { 'T', 'A' },
4677 { 'U', 'A' },
4678 { 'V', 'B' },
4679 { 'W', 'W' },
4680 { 'Y', 'R' },
4681 // lowercase
4682 { 'a', 'T' },
4683 { 'b', 'V' },
4684 { 'c', 'G' },
4685 { 'd', 'H' },
4686 { 'g', 'C' },
4687 { 'h', 'D' },
4688 { 'k', 'M' },
4689 { 'm', 'K' },
4690 { 'n', 'N' },
4691 { 'r', 'Y' },
4692 { 's', 'S' },
4693 { 't', 'A' },
4694 { 'u', 'A' },
4695 { 'v', 'B' },
4696 { 'w', 'W' },
4697 { 'y', 'R' },
4698 };
4699 typedef CStaticPairArrayMap<Char, Char> TComplement;
4700 DEFINE_STATIC_ARRAY_MAP(TComplement, sc_Complement, sc_comp_tbl);
4701
4702
4703 inline
s_GetComplement(char c)4704 static char s_GetComplement(char c)
4705 {
4706 TComplement::const_iterator comp_it = sc_Complement.find(c);
4707 return (comp_it != sc_Complement.end()) ? comp_it->second : '\0';
4708 }
4709
4710
s_GetReverseComplement(const string & sequence)4711 static string s_GetReverseComplement(const string& sequence)
4712 {
4713 string revcomp;
4714 revcomp.reserve(sequence.length());
4715 string::const_reverse_iterator rend = sequence.rend();
4716
4717 for (string::const_reverse_iterator rit = sequence.rbegin(); rit != rend; ++rit) {
4718 revcomp += s_GetComplement(*rit);
4719 }
4720
4721 return revcomp;
4722 }
4723
4724
AddNucleotidePattern(const string & name,const string & sequence,Int2 cut_site,TSearchFlags flags)4725 void CSeqSearch::AddNucleotidePattern
4726 (const string& name,
4727 const string& sequence,
4728 Int2 cut_site,
4729 TSearchFlags flags)
4730 {
4731 if (NStr::IsBlank(name) || NStr::IsBlank(sequence)) {
4732 NCBI_THROW(CUtilException, eNoInput, "Empty input value");
4733 }
4734
4735 // cleanup pattern
4736 string pattern = sequence;
4737 NStr::TruncateSpaces(pattern);
4738 NStr::ToUpper(pattern);
4739
4740 string revcomp = s_GetReverseComplement(pattern);
4741 bool symmetric = (pattern == revcomp);
4742 ENa_strand strand = symmetric ? eNa_strand_both : eNa_strand_plus;
4743
4744 // record expansion of entered pattern
4745 x_AddNucleotidePattern(name, pattern, cut_site, strand, flags);
4746
4747 // record expansion of reverse complement of asymmetric pattern
4748 if (!symmetric && (!x_IsJustTopStrand(flags))) {
4749 TSeqPos revcomp_cut_site = TSeqPos(pattern.length()) - cut_site;
4750 x_AddNucleotidePattern(name, revcomp, revcomp_cut_site,
4751 eNa_strand_minus, flags);
4752 }
4753 }
4754
4755
4756 // Program passes each character in turn to finite state machine.
Search(int current_state,char ch,int position,int length)4757 int CSeqSearch::Search
4758 (int current_state,
4759 char ch,
4760 int position,
4761 int length)
4762 {
4763 if (m_Client == NULL) {
4764 return 0;
4765 }
4766
4767 // on first character, populate state transition table
4768 if (!m_Fsa.IsPrimed()) {
4769 m_Fsa.Prime();
4770 }
4771
4772 int next_state = m_Fsa.GetNextState(current_state, ch);
4773
4774 // report matches (if any)
4775 if (m_Fsa.IsMatchFound(next_state)) {
4776 ITERATE(vector<TPatternInfo>, it, m_Fsa.GetMatches(next_state)) {
4777 int start = position - int(it->GetSequence().length()) + 1;
4778
4779 // prevent multiple reports of patterns for circular sequences.
4780 if (start < length) {
4781 bool keep_going = m_Client->OnPatternFound(*it, start);
4782 if (!keep_going) {
4783 break;
4784 }
4785 }
4786 }
4787 }
4788
4789 return next_state;
4790 }
4791
4792
4793 // Search entire bioseq.
Search(const CBioseq_Handle & bsh)4794 void CSeqSearch::Search(const CBioseq_Handle& bsh)
4795 {
4796 if (!bsh || m_Client == NULL) {
4797 return;
4798 }
4799
4800 CSeqVector seq_vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
4801 TSeqPos seq_len = seq_vec.size();
4802 TSeqPos search_len = seq_len;
4803
4804 // handle circular bioseqs
4805 CSeq_inst::ETopology topology = bsh.GetInst_Topology();
4806 if (topology == CSeq_inst::eTopology_circular) {
4807 search_len += TSeqPos(m_LongestPattern - 1);
4808 }
4809
4810 int state = m_Fsa.GetInitialState();
4811
4812 for (TSeqPos i = 0; i < search_len; ++i) {
4813 state = Search(state, seq_vec[i % seq_len], i, seq_len);
4814 }
4815 }
4816
4817
4818 // Private:
4819 // ========
4820
4821 /// translation finite state machine base codes - ncbi4na
4822 enum EBaseCode {
4823 eBase_A = 1, ///< A
4824 eBase_C, ///< C
4825 eBase_M, ///< AC
4826 eBase_G, ///< G
4827 eBase_R, ///< AG
4828 eBase_S, ///< CG
4829 eBase_V, ///< ACG
4830 eBase_T, ///< T
4831 eBase_W, ///< AT
4832 eBase_Y, ///< CT
4833 eBase_H, ///< ACT
4834 eBase_K, ///< GT
4835 eBase_D, ///< AGT
4836 eBase_B, ///< CGT
4837 eBase_N ///< ACGT
4838 };
4839
4840 /// conversion table from Ncbi4na / Iupacna to EBaseCode
4841 static const EBaseCode sc_CharToEnum[256] = {
4842 // Ncbi4na
4843 eBase_N, eBase_A, eBase_C, eBase_M,
4844 eBase_G, eBase_R, eBase_S, eBase_V,
4845 eBase_T, eBase_W, eBase_Y, eBase_H,
4846 eBase_K, eBase_D, eBase_B, eBase_N,
4847
4848 eBase_N, eBase_N, eBase_N, eBase_N,
4849 eBase_N, eBase_N, eBase_N, eBase_N,
4850 eBase_N, eBase_N, eBase_N, eBase_N,
4851 eBase_N, eBase_N, eBase_N, eBase_N,
4852 eBase_N, eBase_N, eBase_N, eBase_N,
4853 eBase_N, eBase_N, eBase_N, eBase_N,
4854 eBase_N, eBase_N, eBase_N, eBase_N,
4855 eBase_N, eBase_N, eBase_N, eBase_N,
4856 eBase_N, eBase_N, eBase_N, eBase_N,
4857 eBase_N, eBase_N, eBase_N, eBase_N,
4858 eBase_N, eBase_N, eBase_N, eBase_N,
4859 eBase_N, eBase_N, eBase_N, eBase_N,
4860 // Iupacna (uppercase)
4861 eBase_N, eBase_A, eBase_B, eBase_C,
4862 eBase_D, eBase_N, eBase_N, eBase_G,
4863 eBase_H, eBase_N, eBase_N, eBase_K,
4864 eBase_N, eBase_M, eBase_N, eBase_N,
4865 eBase_N, eBase_N, eBase_R, eBase_S,
4866 eBase_T, eBase_T, eBase_V, eBase_W,
4867 eBase_N, eBase_Y, eBase_N, eBase_N,
4868 eBase_N, eBase_N, eBase_N, eBase_N,
4869 // Iupacna (lowercase)
4870 eBase_N, eBase_A, eBase_B, eBase_C,
4871 eBase_D, eBase_N, eBase_N, eBase_G,
4872 eBase_H, eBase_N, eBase_N, eBase_K,
4873 eBase_N, eBase_M, eBase_N, eBase_N,
4874 eBase_N, eBase_N, eBase_R, eBase_S,
4875 eBase_T, eBase_T, eBase_V, eBase_W,
4876 eBase_N, eBase_Y, eBase_N, eBase_N,
4877
4878 eBase_N, eBase_N, eBase_N, eBase_N,
4879 eBase_N, eBase_N, eBase_N, eBase_N,
4880 eBase_N, eBase_N, eBase_N, eBase_N,
4881 eBase_N, eBase_N, eBase_N, eBase_N,
4882 eBase_N, eBase_N, eBase_N, eBase_N,
4883 eBase_N, eBase_N, eBase_N, eBase_N,
4884 eBase_N, eBase_N, eBase_N, eBase_N,
4885 eBase_N, eBase_N, eBase_N, eBase_N,
4886 eBase_N, eBase_N, eBase_N, eBase_N,
4887 eBase_N, eBase_N, eBase_N, eBase_N,
4888 eBase_N, eBase_N, eBase_N, eBase_N,
4889 eBase_N, eBase_N, eBase_N, eBase_N,
4890 eBase_N, eBase_N, eBase_N, eBase_N,
4891 eBase_N, eBase_N, eBase_N, eBase_N,
4892 eBase_N, eBase_N, eBase_N, eBase_N,
4893 eBase_N, eBase_N, eBase_N, eBase_N,
4894 eBase_N, eBase_N, eBase_N, eBase_N,
4895 eBase_N, eBase_N, eBase_N, eBase_N,
4896 eBase_N, eBase_N, eBase_N, eBase_N,
4897 eBase_N, eBase_N, eBase_N, eBase_N,
4898 eBase_N, eBase_N, eBase_N, eBase_N,
4899 eBase_N, eBase_N, eBase_N, eBase_N,
4900 eBase_N, eBase_N, eBase_N, eBase_N,
4901 eBase_N, eBase_N, eBase_N, eBase_N,
4902 eBase_N, eBase_N, eBase_N, eBase_N,
4903 eBase_N, eBase_N, eBase_N, eBase_N,
4904 eBase_N, eBase_N, eBase_N, eBase_N,
4905 eBase_N, eBase_N, eBase_N, eBase_N,
4906 eBase_N, eBase_N, eBase_N, eBase_N,
4907 eBase_N, eBase_N, eBase_N, eBase_N,
4908 eBase_N, eBase_N, eBase_N, eBase_N,
4909 eBase_N, eBase_N, eBase_N, eBase_N,
4910 eBase_N, eBase_N, eBase_N, eBase_N
4911 };
4912
4913 static const char sc_EnumToChar[16] = {
4914 '\0', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'
4915 };
4916
4917
x_AddNucleotidePattern(const string & name,string & pattern,Int2 cut_site,ENa_strand strand,TSearchFlags flags)4918 void CSeqSearch::x_AddNucleotidePattern
4919 (const string& name,
4920 string& pattern,
4921 Int2 cut_site,
4922 ENa_strand strand,
4923 TSearchFlags flags)
4924 {
4925 if (pattern.length() > m_LongestPattern) {
4926 m_LongestPattern = pattern.length();
4927 }
4928
4929 TPatternInfo pat_info(name, kEmptyStr, cut_site);
4930 pat_info.m_Strand = strand;
4931
4932 if (!x_IsExpandPattern(flags)) {
4933 pat_info.m_Sequence = pattern;
4934 x_AddPattern(pat_info, pattern, flags);
4935 } else {
4936 string buffer;
4937 buffer.reserve(pattern.length());
4938
4939 x_ExpandPattern(pattern, buffer, 0, pat_info, flags);
4940 }
4941 }
4942
4943
x_ExpandPattern(string & sequence,string & buf,size_t pos,TPatternInfo & pat_info,TSearchFlags flags)4944 void CSeqSearch::x_ExpandPattern
4945 (string& sequence,
4946 string& buf,
4947 size_t pos,
4948 TPatternInfo& pat_info,
4949 TSearchFlags flags)
4950 {
4951 static const EBaseCode expansion[] = { eBase_A, eBase_C, eBase_G, eBase_T };
4952
4953 if (pos < sequence.length()) {
4954 Uint4 code = static_cast<Uint4>(sc_CharToEnum[static_cast<Uint1>(sequence[pos])]);
4955
4956 for (int i = 0; i < 4; ++i) {
4957 if ((code & expansion[i]) != 0) {
4958 buf += sc_EnumToChar[expansion[i]];
4959 x_ExpandPattern(sequence, buf, pos + 1, pat_info, flags);
4960 buf.erase(pos);
4961 }
4962 }
4963 } else {
4964 // when position reaches pattern length, store one expanded string.
4965 x_AddPattern(pat_info, buf, flags);
4966 }
4967 }
4968
4969
x_AddPattern(TPatternInfo & pat_info,string & sequence,TSearchFlags flags)4970 void CSeqSearch::x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags)
4971 {
4972 x_StorePattern(pat_info, sequence);
4973
4974 if (x_IsAllowMismatch(flags)) {
4975 // put 'N' at every position if a single mismatch is allowed.
4976 char ch = 'N';
4977 NON_CONST_ITERATE (string, it, sequence) {
4978 swap(*it, ch);
4979
4980 x_StorePattern(pat_info, sequence);
4981
4982 // restore proper character, go on to put N in next position.
4983 swap(*it, ch);
4984 }
4985 }
4986 }
4987
4988
x_StorePattern(TPatternInfo & pat_info,string & sequence)4989 void CSeqSearch::x_StorePattern(TPatternInfo& pat_info, string& sequence)
4990 {
4991 pat_info.m_Sequence = sequence;
4992 m_Fsa.AddWord(sequence, pat_info);
4993 }
4994
4995
ReverseComplement(CSeq_inst & inst,CScope * scope)4996 void ReverseComplement(CSeq_inst& inst, CScope* scope)
4997 {
4998 switch (inst.GetRepr()) {
4999 case CSeq_inst::eRepr_raw:
5000 CSeqportUtil::ReverseComplement(&(inst.SetSeq_data()), 0, inst.GetLength());
5001 break;
5002 case CSeq_inst::eRepr_delta:
5003 if (!inst.IsSetExt() || !inst.GetExt().IsDelta()) {
5004 NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5005 "Sequence of this type cannot be reverse-complemented.");
5006 }
5007 // reverse order of segments
5008 inst.SetExt().SetDelta().Set().reverse();
5009 // reverse-complement individual segments
5010 NON_CONST_ITERATE(CSeq_inst::TExt::TDelta::Tdata, it, inst.SetExt().SetDelta().Set()) {
5011 switch ((*it)->Which()) {
5012 case CDelta_seq::e_Literal:
5013 if ((*it)->GetLiteral().IsSetSeq_data()) {
5014 CSeq_literal& lit = (*it)->SetLiteral();
5015 if (!lit.GetSeq_data().IsGap()) {
5016 CSeqportUtil::ReverseComplement(&(lit.SetSeq_data()), 0, lit.GetLength());
5017 }
5018 }
5019 break;
5020 case CDelta_seq::e_Loc:
5021 {{
5022 CRef<CSeq_loc> flip(sequence::SeqLocRevCmpl((*it)->SetLoc(), scope));
5023 (*it)->SetLoc(*flip);
5024 }}
5025 break;
5026 default:
5027 // do nothing
5028 break;
5029 }
5030 }
5031 break;
5032 default:
5033 NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5034 "Sequence of this type cannot be reverse-complemented.");
5035 break;
5036 }
5037 }
5038
5039
5040 END_SCOPE(objects)
5041 END_NCBI_SCOPE
5042
5043