1 /* $Id: variation_util2.cpp 415910 2013-10-22 16:06:39Z astashya 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * File Description: 27 * 28 */ 29 30 #include <ncbi_pch.hpp> 31 #include <objects/seqloc/Seq_point.hpp> 32 #include <objects/general/Object_id.hpp> 33 #include <objects/general/Dbtag.hpp> 34 35 #include <objects/seqalign/Seq_align.hpp> 36 #include <objects/seqalign/Spliced_seg.hpp> 37 #include <objects/seqalign/Spliced_exon.hpp> 38 #include <objects/seqalign/Product_pos.hpp> 39 #include <objects/seqalign/Prot_pos.hpp> 40 #include <objmgr/seq_loc_mapper.hpp> 41 #include <objmgr/feat_ci.hpp> 42 #include <objmgr/align_ci.hpp> 43 44 45 #include <objects/seq/seqport_util.hpp> 46 #include <objects/seqblock/GB_block.hpp> 47 48 #include <objects/seqfeat/Seq_feat.hpp> 49 #include <objects/seqfeat/Genetic_code.hpp> 50 #include <objects/seqfeat/Variation_ref.hpp> 51 #include <objects/seqfeat/Phenotype.hpp> 52 #include <objects/seqfeat/Variation_inst.hpp> 53 #include <objects/seqfeat/VariantProperties.hpp> 54 #include <objects/seqfeat/Delta_item.hpp> 55 #include <objects/seqfeat/Ext_loc.hpp> 56 #include <objects/seqfeat/BioSource.hpp> 57 #include <objects/seqfeat/SubSource.hpp> 58 #include <objects/seqfeat/SeqFeatXref.hpp> 59 60 #include <objects/pub/Pub_set.hpp> 61 #include <objects/pub/Pub.hpp> 62 #include <objects/biblio/Cit_art.hpp> 63 #include <objects/biblio/Cit_book.hpp> 64 #include <objects/biblio/Cit_jour.hpp> 65 66 67 #include <objects/variation/VariationMethod.hpp> 68 #include <objects/variation/VariationException.hpp> 69 70 #include <objects/seq/Seq_literal.hpp> 71 #include <objects/seq/Seq_data.hpp> 72 #include <objects/seq/Annot_descr.hpp> 73 #include <objects/seq/Annotdesc.hpp> 74 #include <objects/seq/Seq_descr.hpp> 75 #include <objects/seq/Seq_inst.hpp> 76 #include <objects/seq/Seqdesc.hpp> 77 78 #include <objects/general/User_object.hpp> 79 #include <objects/general/Object_id.hpp> 80 81 #include <objects/seqloc/Seq_loc_equiv.hpp> 82 83 #include <serial/iterator.hpp> 84 #include <objmgr/util/sequence.hpp> 85 #include <objmgr/bioseq_handle.hpp> 86 #include <objmgr/seq_vector.hpp> 87 #include <objmgr/util/seq_loc_util.hpp> 88 #include <misc/hgvs/variation_util2.hpp> 89 90 #include <objects/seqloc/Na_strand.hpp> 91 92 BEGIN_NCBI_SCOPE 93 94 namespace variation 95 { 96 CreateException(const string & message,CVariationException::ECode code=static_cast<CVariationException::ECode> (0))97 static CRef<CVariationException> CreateException( 98 const string& message, 99 CVariationException::ECode code = 100 static_cast<CVariationException::ECode>(0)) 101 { 102 CRef<CVariationException> e(new CVariationException); 103 e->SetMessage(message); 104 if(code) { 105 e->SetCode(code); 106 } 107 return e; 108 } 109 110 111 template<typename T> ChangeIdsInPlace(T & container,sequence::EGetIdType id_type,CScope & scope)112 static void ChangeIdsInPlace( 113 T& container, 114 sequence::EGetIdType id_type, 115 CScope& scope) 116 { 117 for(CTypeIterator<CSeq_id> it = Begin(container); it; ++it) { 118 CSeq_id& id = *it; 119 if((id_type == sequence::eGetId_ForceAcc && id.GetTextseq_Id()) 120 || (id_type == sequence::eGetId_ForceGi && id.IsGi())) { 121 continue; 122 } 123 124 const CSeq_id_Handle idh = sequence::GetId(id, scope, id_type); 125 126 if(idh) { 127 id.Assign(*idh.GetSeqId()); 128 } else if(id.IsGeneral() || id.IsLocal()) { 129 ; // e.g. ENST 130 } else { 131 NCBI_USER_THROW("Can't convert seq-id for " + it->AsFastaString()); 132 } 133 } 134 } 135 s_GetLength(const CVariantPlacement & p,CScope * scope)136 TSeqPos CVariationUtil::s_GetLength( 137 const CVariantPlacement& p, 138 CScope* scope) 139 { 140 int start_offset = p.IsSetStart_offset() ? p.GetStart_offset() : 0; 141 142 int stop_offset = p.IsSetStop_offset() ? p.GetStop_offset() 143 : p.GetLoc().IsPnt() ? start_offset 144 : 0; 145 return ncbi::sequence::GetLength(p.GetLoc(), scope) 146 + stop_offset 147 - start_offset; 148 } 149 150 // verify that exon_anchor_pos is represented in biostarts or biostops 151 // depending on the sign of offset_pos ValidExonTerminal(const set<TSeqPos> & exon_biostarts,const set<TSeqPos> & exon_biostops,TSeqPos exon_anchor_pos,int offset_pos)152 static bool ValidExonTerminal( 153 const set<TSeqPos>& exon_biostarts, 154 const set<TSeqPos>& exon_biostops, 155 TSeqPos exon_anchor_pos, 156 int offset_pos) 157 { 158 // VAR-1379, VAR-1500 159 // Note: offset=0 may appear in cases like NM_000535.5:c.1145-?_2174+?del 160 // We don't know here whether we're dealing with lt or gt, so will optimistically 161 // allow if we find the anchor in either starts or stops. 162 bool found_in_starts = exon_biostarts.find(exon_anchor_pos) 163 != exon_biostarts.end(); 164 165 bool found_in_stops = exon_biostops.find(exon_anchor_pos) 166 != exon_biostops.end(); 167 168 return (offset_pos < 0 && found_in_starts) 169 || (offset_pos > 0 && found_in_stops) 170 || (offset_pos == 0 && (found_in_starts || found_in_stops)); 171 } 172 173 GetFuzzSign(const CInt_fuzz & fuzz,int loc_sign)174 static int GetFuzzSign(const CInt_fuzz& fuzz, int loc_sign) 175 { 176 return !fuzz.IsLim() ? 0 177 : fuzz.GetLim() == CInt_fuzz::eLim_lt ? -1 178 : fuzz.GetLim() == CInt_fuzz::eLim_tl ? -1 * loc_sign 179 : fuzz.GetLim() == CInt_fuzz::eLim_gt ? 1 180 : fuzz.GetLim() == CInt_fuzz::eLim_tr ? 1 * loc_sign 181 : 0; 182 } 183 184 ValidExonTerminals(const set<TSeqPos> & exon_biostarts,const set<TSeqPos> & exon_biostops,const CVariantPlacement & p)185 static bool ValidExonTerminals( 186 const set<TSeqPos>& exon_biostarts, 187 const set<TSeqPos>& exon_biostops, 188 const CVariantPlacement& p) 189 { 190 int sign = sequence::GetStrand(p.GetLoc(), NULL) == eNa_strand_minus ? -1 : 1; 191 192 const int start_offset_sign = //VAR-1500 193 p.IsSetStart_offset() 194 && p.GetStart_offset() > 0 ? 1 195 196 : p.IsSetStart_offset() 197 && p.GetStart_offset() < 0 ? -1 198 199 : p.IsSetStart_offset_fuzz() ? GetFuzzSign(p.GetStart_offset_fuzz(), sign) 200 201 : 0; 202 203 const int stop_offset_sign = 204 p.IsSetStop_offset() 205 && p.GetStop_offset() > 0 ? 1 206 207 : p.IsSetStop_offset() 208 && p.GetStop_offset() < 0 ? -1 209 210 : p.IsSetStop_offset_fuzz() ? GetFuzzSign(p.GetStop_offset_fuzz(), sign) 211 212 : 0; 213 214 // if offset's sign is consistent with location's strand, we expect to find 215 // the anchor in exon-stops, otherwise in exon-starts. //VAR-1379 216 const bool start_ok = 217 (!p.IsSetStart_offset() && !p.IsSetStart_offset_fuzz()) 218 || ValidExonTerminal( 219 exon_biostarts, 220 exon_biostops, 221 sequence::GetStart(p.GetLoc(), NULL, eExtreme_Biological), 222 start_offset_sign * sign); 223 224 const bool stop_ok = 225 (!p.IsSetStop_offset() && !p.IsSetStop_offset_fuzz()) 226 || ValidExonTerminal( 227 exon_biostarts, 228 exon_biostops, 229 sequence::GetStop(p.GetLoc(), NULL, eExtreme_Biological), 230 stop_offset_sign * sign); 231 232 return start_ok && stop_ok; 233 } 234 235 CheckExonBoundary(const CVariantPlacement & p)236 CVariationUtil::ETestStatus CVariationUtil::CheckExonBoundary(const CVariantPlacement& p) 237 { 238 const CSeq_id* id = p.GetLoc().GetId(); 239 if(!id 240 || !(p.IsSetStart_offset() || p.IsSetStop_offset()) 241 || !(p.IsSetMol() && (p.GetMol() == CVariantPlacement::eMol_cdna 242 || p.GetMol() == CVariantPlacement::eMol_rna))) { 243 return eNotApplicable; 244 } 245 246 SAnnotSelector sel; 247 sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon); 248 CBioseq_Handle bsh = m_scope->GetBioseqHandle(*id); 249 250 set<TSeqPos> exon_biostarts, exon_biostops; 251 for(CFeat_CI ci(bsh, sel); ci; ++ci) { 252 const CMappedFeat& mf = *ci; 253 exon_biostarts.insert(sequence::GetStart(mf.GetLocation(), NULL)); 254 exon_biostops.insert(sequence::GetStop(mf.GetLocation(), NULL)); 255 } 256 257 return exon_biostarts.empty() && exon_biostops.empty() ? eNotApplicable 258 : ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass 259 : eFail; 260 } 261 CheckExonBoundary(const CVariantPlacement & p,const CSeq_align & aln)262 CVariationUtil::ETestStatus CVariationUtil::CheckExonBoundary( 263 const CVariantPlacement& p, 264 const CSeq_align& aln) 265 { 266 const CSeq_id* id = p.GetLoc().GetId(); 267 if(!aln.GetSegs().IsSpliced() 268 || !id 269 || !sequence::IsSameBioseq(aln.GetSeq_id(0), *id, m_scope) 270 || !(p.IsSetStart_offset() || p.IsSetStop_offset()) 271 || (p.IsSetMol() && p.GetMol() == CVariantPlacement::eMol_genomic) 272 ) { 273 return eNotApplicable; 274 } 275 276 set<TSeqPos> exon_biostarts, exon_biostops; 277 ITERATE(CSpliced_seg::TExons, it, aln.GetSegs().GetSpliced().GetExons()) 278 { 279 const CSpliced_exon& exon = **it; 280 exon_biostarts.insert(exon.GetProduct_start().IsNucpos() ? 281 exon.GetProduct_start().GetNucpos() : 282 exon.GetProduct_start().GetProtpos().GetAmin()); 283 284 exon_biostops.insert(exon.GetProduct_end().IsNucpos() ? 285 exon.GetProduct_end().GetNucpos() : 286 exon.GetProduct_end().GetProtpos().GetAmin()); 287 } 288 289 return ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass : eFail; 290 } 291 SwapLtGtFuzz(CInt_fuzz & fuzz)292 static void SwapLtGtFuzz(CInt_fuzz& fuzz) 293 { 294 if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_gt) { 295 fuzz.SetLim(CInt_fuzz::eLim_lt); 296 } else if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_lt) { 297 fuzz.SetLim(CInt_fuzz::eLim_gt); 298 } 299 } 300 301 302 //loc is presumed Int-loc ApplyOffsetFuzz(CSeq_loc & loc,const CInt_fuzz & offset_fuzz,bool is_start)303 static void ApplyOffsetFuzz( 304 CSeq_loc& loc, 305 const CInt_fuzz& offset_fuzz, 306 bool is_start) 307 { 308 bool minus_strand = loc.GetStrand() == eNa_strand_minus; 309 CInt_fuzz& fuzz = (minus_strand == is_start) ? loc.SetInt().SetFuzz_to() 310 : loc.SetInt().SetFuzz_from(); 311 if(!fuzz.Which()) { //will not apply this fuzz if mapping itself was fuzzy 312 fuzz.Assign(offset_fuzz); 313 if(minus_strand) { 314 SwapLtGtFuzz(fuzz); 315 } 316 if(fuzz.IsRange()) { 317 //fuzz range coordinates are invalid after remapping 318 fuzz.SetLim(CInt_fuzz::eLim_unk); 319 } 320 } 321 } 322 323 s_ResolveIntronicOffsets(CVariantPlacement & p)324 void CVariationUtil::s_ResolveIntronicOffsets(CVariantPlacement& p) 325 { 326 if(!p.IsSetStart_offset() 327 && !p.IsSetStop_offset()) { 328 return; 329 } 330 331 if(p.GetLoc().IsPnt() 332 && !p.IsSetStop_offset() 333 && p.IsSetStart_offset()) { 334 //In case of point, only start-offset may be specified. In this case 335 //temporarily set stop-offset to be the same (it will be reset below), 336 //such that start and stop of point-as-interval are adjusted consistently. 337 p.SetStop_offset(p.GetStart_offset()); 338 } 339 340 //Convert to interval. 341 //Need to do for a point so that we can deal with start and stop independently. 342 //Another scenario is when start/stop in CDNA coords lie in different exons, so 343 //the mapping is across one or more intron - we want to collapse the genomic loc 344 //(do we want to preselve the gaps caused by indels rather than introns?) 345 CRef<CSeq_loc> loc = 346 sequence::Seq_loc_Merge( 347 p.GetLoc(), 348 CSeq_loc::fMerge_SingleRange, 349 NULL); 350 351 if(!loc->IsInt() 352 && (p.IsSetStart_offset() || p.IsSetStop_offset())) { 353 NcbiCerr << MSerial_AsnText << p; 354 NCBI_THROW(CException, eUnknown, "Complex location"); 355 } 356 357 bool minus_strand = loc->GetStrand() == eNa_strand_minus; 358 359 if(p.IsSetStart_offset()) { 360 TSeqPos& bio_start_ref = minus_strand ? loc->SetInt().SetTo() 361 : loc->SetInt().SetFrom(); 362 363 bio_start_ref += p.GetStart_offset() * (minus_strand ? -1 : 1); 364 p.ResetStart_offset(); 365 } 366 367 if(p.IsSetStop_offset()) { 368 TSeqPos& bio_stop_ref = 369 loc->GetStrand() == eNa_strand_minus ? loc->SetInt().SetFrom() 370 : loc->SetInt().SetTo(); 371 372 bio_stop_ref += p.GetStop_offset() * (minus_strand ? -1 : 1); 373 p.ResetStop_offset(); 374 } 375 376 if(p.IsSetStart_offset_fuzz()) { 377 ApplyOffsetFuzz(*loc, p.GetStart_offset_fuzz(), true); 378 p.ResetStart_offset_fuzz(); 379 } 380 381 if(p.IsSetStop_offset_fuzz()) { 382 ApplyOffsetFuzz(*loc, p.GetStop_offset_fuzz(), false); 383 p.ResetStop_offset_fuzz(); 384 } 385 386 if(loc->GetTotalRange().GetLength() == 1) { 387 loc = sequence::Seq_loc_Merge(*loc, CSeq_loc::fSortAndMerge_All, NULL); 388 } 389 390 p.SetLoc().Assign(*loc); 391 } 392 393 s_AddIntronicOffsets(CVariantPlacement & p,const CSpliced_seg & ss,CScope * scope)394 void CVariationUtil::s_AddIntronicOffsets( 395 CVariantPlacement& p, 396 const CSpliced_seg& ss, 397 CScope* scope) 398 { 399 #if 0 400 if(!p.GetLoc().IsPnt() && !p.GetLoc().IsInt()) { 401 NCBI_THROW(CException, eUnknown, "Expected simple loc (int or pnt)"); 402 } 403 #endif 404 405 if(p.IsSetStart_offset() 406 || p.IsSetStop_offset() 407 || p.IsSetStart_offset_fuzz() 408 || p.IsSetStop_offset_fuzz()) { 409 NCBI_THROW(CException, eUnknown, "Expected offset-free placement"); 410 } 411 412 if(!p.GetLoc().GetId() 413 || !sequence::IsSameBioseq( 414 *p.GetLoc().GetId(), 415 ss.GetGenomic_id(), 416 scope)) { 417 NCBI_THROW(CException, eUnknown, 418 "Expected genomic_id in the variation to be the same as in spliced-seg"); 419 } 420 421 long start = p.GetLoc().GetStart(eExtreme_Positional); 422 long stop = p.GetLoc().GetStop(eExtreme_Positional); 423 424 long closest_start = ss.GetExons().front()->GetGenomic_start(); 425 // closest-exon-boundary for bio-start of variation location 426 427 long closest_stop = ss.GetExons().front()->GetGenomic_start(); 428 // closest-exon-boundary for bio-stop of variation location 429 430 ITERATE(CSpliced_seg::TExons, it, ss.GetExons()) 431 { 432 const CSpliced_exon& se = **it; 433 434 if(se.GetGenomic_end() >= start && se.GetGenomic_start() <= start) { 435 closest_start = start; //start is within exon - use itself. 436 } else { 437 if(abs((long)se.GetGenomic_end() - start) < abs(closest_start - start)) { 438 closest_start = (long)se.GetGenomic_end(); 439 } 440 if(abs((long)se.GetGenomic_start() - start) < abs(closest_start - start)) { 441 closest_start = (long)se.GetGenomic_start(); 442 } 443 } 444 445 if(se.GetGenomic_end() >= stop && se.GetGenomic_start() <= stop) { 446 closest_stop = stop; //end is within exon - use itself. 447 } else { 448 if(abs((long)se.GetGenomic_end() - stop) < abs(closest_stop - stop)) { 449 closest_stop = (long)se.GetGenomic_end(); 450 } 451 if(abs((long)se.GetGenomic_start() - stop) < abs(closest_stop - stop)) { 452 closest_stop = (long)se.GetGenomic_start(); 453 } 454 } 455 } 456 457 //convert to int, so we can set start and stop 458 CRef<CSeq_loc> int_loc = sequence::Seq_loc_Merge(p.GetLoc(), CSeq_loc::fMerge_SingleRange, NULL); 459 460 //adjust location 461 if(start != closest_start || stop != closest_stop) { 462 int_loc->SetInt().SetFrom(closest_start); 463 int_loc->SetInt().SetTo(closest_stop); 464 } 465 p.SetLoc(*int_loc); 466 467 //Add offsets. Note that start-offset and stop-offset are always biological. 468 //(e.g. start-offset applies to bio-start; value > 0 means downstream, and value <0 means upstream) 469 470 //Note: qry/sbj strandedness in the alignment is irrelevent - we're simply adjusting the 471 //location in the genomic coordinate system, conserving the strand 472 if(start != closest_start) { 473 long offset = (start - closest_start); 474 if(p.GetLoc().GetStrand() == eNa_strand_minus) { 475 p.SetStop_offset(offset * -1); 476 } else { 477 p.SetStart_offset(offset); 478 } 479 } 480 481 if(stop != closest_stop) { 482 long offset = (stop - closest_stop); 483 if(p.GetLoc().GetStrand() == eNa_strand_minus) { 484 p.SetStart_offset(offset * -1); 485 } else { 486 p.SetStop_offset(offset); 487 } 488 } 489 } 490 491 Remap(const CVariantPlacement & p,const CSeq_align & aln,bool check_placements)492 CRef<CVariantPlacement> CVariationUtil::Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements) 493 { 494 if(!p.GetLoc().GetId()) { 495 NCBI_THROW(CException, eUnknown, "Can't get seq-id"); 496 } 497 498 //note: the operations here are scope-free (i.e. seq-id types in aln must be consistent with p, 499 //or else will throw; using a scope will impose major slowdown, which is problematic if we're 500 //dealing with tens or hundreds of thousands of remappings. 501 502 CRef<CVariantPlacement> p2(new CVariantPlacement); 503 p2->Assign(p); 504 505 if(aln.GetSegs().IsSpliced() 506 && sequence::IsSameBioseq(aln.GetSegs().GetSpliced().GetGenomic_id(), 507 *p2->GetLoc().GetId(), NULL)) { 508 s_AddIntronicOffsets(*p2, aln.GetSegs().GetSpliced(), NULL); 509 } 510 511 CSeq_align::TDim target_row = -1; 512 for(int i = 0; i < 2; i++) { 513 if(sequence::IsSameBioseq(*p2->GetLoc().GetId(), aln.GetSeq_id(i), NULL)) { 514 target_row = 1 - i; 515 } 516 } 517 518 if(target_row == -1) { 519 NCBI_THROW(CException, 520 eUnknown, 521 "The alignment has no row for seq-id " 522 + p2->GetLoc().GetId()->AsFastaString()); 523 } 524 525 CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(aln, target_row, NULL)); 526 mapper->SetMergeAll(); 527 528 CRef<CVariantPlacement> p3 = x_Remap(*p2, *mapper); 529 530 if(p3->GetLoc().GetId() 531 && aln.GetSegs().IsSpliced() 532 && sequence::IsSameBioseq(aln.GetSegs().GetSpliced().GetGenomic_id(), 533 *p3->GetLoc().GetId(), NULL)) { 534 if(CheckExonBoundary(p, aln) == eFail) { 535 536 bool source_loc_is_projected = 537 p.IsSetPlacement_method() 538 && p.GetPlacement_method() == CVariantPlacement::ePlacement_method_projected; 539 // checkig exon-boundary only for non-projected cases, since an exon-boundary 540 // may become non-exon-boundary (e.g. transcript extended or exon boundary shifted). 541 // VAR-1309 542 543 CVariationException::ECode code = 544 source_loc_is_projected ? CVariationException::eCode_hgvs_exon_boundary_induced 545 : CVariationException::eCode_hgvs_exon_boundary; 546 547 const string msg = 548 "HGVS exon-boundary position not found in alignment of " 549 + aln.GetSeq_id(0).AsFastaString() 550 + "-vs-" 551 + aln.GetSeq_id(1).AsFastaString(); 552 553 p3->SetExceptions().push_back(CreateException(msg, code)); 554 } 555 556 s_ResolveIntronicOffsets(*p3); 557 } 558 559 //note: AttachSeq happens only after the intronic offsets are resolved. 560 AttachSeq(*p3); 561 562 // Note: as per SNP-5641 - will check for mismatches 563 // even in the presence of other more severe exceptions 564 // NcbiCerr << "Checking for mismatches-in-mapping" 565 // << MSerial_AsnText << p << MSerial_AsnText << *p3 << "\n\n"; 566 if(p.IsSetSeq() && p3->IsSetSeq() 567 && p.GetSeq().GetLength() == p3->GetSeq().GetLength() 568 && p.GetSeq().IsSetSeq_data() && p3->GetSeq().IsSetSeq_data() 569 && p.GetSeq().GetSeq_data().Which() == p3->GetSeq().GetSeq_data().Which() 570 && !p.GetSeq().GetSeq_data().Equals(p3->GetSeq().GetSeq_data())) { 571 p3->SetExceptions().push_back(CreateException( 572 "Mismatches in mapping", 573 CVariationException::eCode_mismatches_in_mapping)); 574 } 575 576 //VAR-1307 577 {{ 578 static const long thr = 5000; 579 long start_offset = !p2->IsSetStart_offset() ? 0 : p2->GetStart_offset(); 580 long stop_offset = !p2->IsSetStop_offset() ? 0 : p2->GetStop_offset(); 581 bool far_start = start_offset + thr < 0 582 && (p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStart(1) 583 || p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStop(1)); 584 585 bool far_stop = stop_offset > thr 586 && (p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStart(1) 587 || p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStop(1)); 588 589 if(far_start || far_stop) { 590 p3->SetExceptions().push_back(CreateException( 591 "Source location overhangs the alignment by at least 5kb ", 592 CVariationException::eCode_source_location_overhang)); 593 } 594 }} 595 596 if(check_placements) { 597 CheckPlacement(*p3); 598 } 599 return p3; 600 } 601 602 CreateSplicedSeqAlignFromFeat(const CSeq_feat & rna_feat)603 static CRef<CSeq_align> CreateSplicedSeqAlignFromFeat(const CSeq_feat& rna_feat) 604 { 605 CRef<CSeq_align> aln(new CSeq_align); 606 aln->SetType(CSeq_align::eType_other); 607 CSpliced_seg& ss = aln->SetSegs().SetSpliced(); 608 ss.SetProduct_id().Assign(sequence::GetId(rna_feat.GetProduct(), NULL)); 609 ss.SetGenomic_id().Assign(sequence::GetId(rna_feat.GetLocation(), NULL)); 610 ss.SetExons(); 611 ss.SetProduct_type(CSpliced_seg::eProduct_type_transcript); 612 ss.SetProduct_strand(eNa_strand_plus); 613 ss.SetGenomic_strand(sequence::GetStrand(rna_feat.GetLocation())); 614 615 TSignedSeqPos product_pos(0); 616 for(CSeq_loc_CI ci(rna_feat.GetLocation(), CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Biological); ci; ++ci) { 617 CRef<CSpliced_exon> se(new CSpliced_exon); 618 se->SetGenomic_start(ci.GetRange().GetFrom()); 619 se->SetGenomic_end(ci.GetRange().GetTo()); 620 se->SetProduct_start().SetNucpos(product_pos); 621 se->SetProduct_end().SetNucpos(product_pos + ci.GetRange().GetLength() - 1); 622 product_pos += ci.GetRange().GetLength(); 623 ss.SetExons().push_back(se); 624 } 625 626 return aln; 627 } 628 629 630 //todo: 1. what if we don't have the requested version of the product annotated? 631 // 2. do we need to support NGs, etc (will need to search all alignments on sequnece, as we can't find NG by product-id) 632 // 3. Do we want to return VariantPlacement instead, such that we can communicate auxiliary info? RemapToAnnotatedTarget(const CVariation & v,const CSeq_id & target)633 CRef<CVariantPlacement> CVariationUtil::RemapToAnnotatedTarget( 634 const CVariation& v, 635 const CSeq_id& target) 636 { 637 CRef<CVariantPlacement> result(new CVariantPlacement()); 638 result->SetLoc().SetNull(); 639 result->SetMol(CVariantPlacement::eMol_unknown); 640 CBioseq_Handle bsh = m_scope->GetBioseqHandle(target); 641 CRef<CSeq_loc_Mapper> upmapper; 642 643 if(v.IsSetPlacements()) { 644 //First see if we already have location on target in one of the placements 645 ITERATE(CVariation::TPlacements, it, v.GetPlacements()) 646 { 647 const CVariantPlacement& p = **it; 648 if(p.GetLoc().GetId() && sequence::IsSameBioseq(target, *p.GetLoc().GetId(), m_scope)) { 649 result->Assign(p); 650 break; 651 } 652 } 653 654 if(result->GetLoc().IsNull()) { 655 //did not find an existing placement for the target; will try to remap 656 ITERATE(CVariation::TPlacements, it, v.GetPlacements()) 657 { 658 const CVariantPlacement& placement = **it; 659 660 if(!placement.GetLoc().GetId()) { 661 continue; 662 } 663 CSeq_id_Handle p_idh = sequence::GetId(*placement.GetLoc().GetId(), 664 *m_scope, 665 sequence::eGetId_ForceAcc); 666 667 bool is_scaffold = p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_contig 668 || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_contig_ncbo 669 || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_wgs_nuc 670 || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_wgs_intermed; 671 672 if(placement.GetMol() == CVariantPlacement::eMol_protein) { 673 674 CRef<CVariation> precursor = InferNAfromAA(v); 675 result = RemapToAnnotatedTarget(*precursor, target); 676 677 } else if(placement.GetMol() == CVariantPlacement::eMol_genomic 678 && is_scaffold) { 679 // location is possibly a component of the target 680 if(!upmapper) { 681 upmapper.Reset(new CSeq_loc_Mapper(1, bsh, CSeq_loc_Mapper::eSeqMap_Up)); 682 } 683 result = Remap(placement, *upmapper); 684 ChangeIdsInPlace(*result, sequence::eGetId_ForceAcc, *m_scope); 685 686 } else { 687 688 // the annotation is gi-based, so we'll make sure the target 689 // is gi-based so we won't need to do conversion for all products while searching. 690 CSeq_id_Handle product_idh = sequence::GetId(*placement.GetLoc().GetId(), 691 *m_scope, 692 sequence::eGetId_ForceGi); 693 if(!product_idh) { 694 continue; 695 } 696 697 // note that here we are looking for annotated (packaged) alignment at the 698 // feature's location first (we could have searched the whole sequence instead, 699 // but it is quite slow for NCs) If we have the feature but no alignment, 700 // it is assumed that the alignment is perfect, and a synthetic seq-align 701 // is created based on RNA feature. 702 CRef<CSeq_align> aln; 703 {{ 704 CRef<CSeq_loc> target_loc = bsh.GetRangeSeq_loc(0, 0); //this returns "whole" 705 //will try to constrain target_loc from whole to product location only 706 707 SAnnotSelector sel; 708 sel.SetResolveAll(); 709 sel.SetAdaptiveDepth(true); 710 sel.IncludeFeatType(CSeqFeatData::e_Rna); 711 712 for(CFeat_CI ci(bsh, sel); ci; ++ci) { 713 const CMappedFeat& mf = *ci; 714 if(!mf.IsSetProduct()) { 715 continue; 716 } 717 const CSeq_id& product_id = sequence::GetId(mf.GetProduct(), NULL); 718 if(product_id.Equals(*product_idh.GetSeqId())) { 719 target_loc->Assign(mf.GetLocation()); 720 aln = CreateSplicedSeqAlignFromFeat(mf.GetMappedFeature()); 721 break; 722 } 723 } 724 725 //try to find explicit seq_align at the target-loc (if not found, will use the synthetic one) 726 for(CAlign_CI ci2(*m_scope, *target_loc, sel); ci2; ++ci2) { 727 const CSeq_align& current_aln = *ci2; 728 if(current_aln.GetSeq_id(0).Equals(*product_idh.GetSeqId())) { 729 aln = SerialClone<CSeq_align>(current_aln); 730 } 731 } 732 }} 733 734 if(aln) { 735 CRef<CVariantPlacement> p2(SerialClone<CVariantPlacement>(placement)); 736 ChangeIdsInPlace(*p2, sequence::eGetId_ForceAcc, *m_scope); 737 ChangeIdsInPlace(*aln, sequence::eGetId_ForceAcc, *m_scope); 738 result = Remap(*p2, *aln); 739 } 740 741 } 742 if(!result->GetLoc().IsNull()) { 743 break; 744 } 745 } 746 } 747 } else if(v.GetData().IsSet()) { //no placements at this level - try in sub-variations 748 ITERATE(CVariation::TData::TSet::TVariations, it, v.GetData().GetSet().GetVariations()) 749 { 750 const CVariation& v2 = **it; 751 CRef<CVariantPlacement> mapped_placement = RemapToAnnotatedTarget(v2, target); 752 if(!result->GetLoc().IsNull()) { //already have somthing - append 753 result->SetLoc().Add(mapped_placement->GetLoc()); 754 } else { 755 result->Assign(*mapped_placement); 756 } 757 } 758 } 759 760 CRef<CSeq_loc> loc = sequence::Seq_loc_Merge(result->GetLoc(), CSeq_loc::fSortAndMerge_All, NULL); 761 result->SetLoc().Assign(*loc); 762 return result; 763 } 764 765 766 767 template<typename T> ContainsIupacNaAmbiguities(const T & obj)768 static bool ContainsIupacNaAmbiguities(const T& obj) 769 { 770 for(CTypeConstIterator<CSeq_data> it(Begin(obj)); it; ++it) { 771 const CSeq_data& sd = *it; 772 if(!sd.IsIupacna()) { 773 continue; 774 } 775 ITERATE(string, it2, sd.GetIupacna().Get()) 776 { 777 char c = *it2; 778 if(c != 'A' && c != 'C' && c != 'G' && c != 'T') { 779 return true; 780 } 781 } 782 } 783 return false; 784 } 785 CheckAmbiguitiesInLiterals(CVariation & v)786 bool CVariationUtil::CheckAmbiguitiesInLiterals(CVariation& v) 787 { 788 bool had_ambiguities = false; 789 790 if(v.IsSetPlacements() && v.GetPlacements().size() > 0) { 791 if(ContainsIupacNaAmbiguities(v.GetData())) { 792 had_ambiguities = true; 793 v.SetPlacements().front()->SetExceptions().push_back(CreateException("Ambiguous residues in variation", CVariationException::eCode_ambiguous_sequence)); 794 } 795 } 796 797 if(v.GetData().IsSet()) { 798 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations()) 799 { 800 CVariation& v2 = **it; 801 had_ambiguities = had_ambiguities || CheckAmbiguitiesInLiterals(v2); 802 } 803 } 804 return had_ambiguities; 805 } 806 CheckPlacement(CVariantPlacement & p)807 bool CVariationUtil::CheckPlacement(CVariantPlacement& p) 808 { 809 bool invalid_location = false; 810 bool out_of_order = false; 811 812 if(sequence::SeqLocCheck(p.GetLoc(), m_scope) == sequence::eSeqLocCheck_error) { 813 invalid_location = true; 814 } 815 816 if(p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) { 817 if(sequence::GetStart(p.GetLoc(), NULL) > sequence::GetStop(p.GetLoc(), NULL)) { 818 out_of_order = true; 819 } 820 821 #if 0 822 //Note: not comparing offsets, as it may be legitimately out-of-order. JIRA:SNP-5238 823 if(p.IsSetStart_offset() && p.IsSetStop_offset() && sequence::GetLength(p.GetLoc(), NULL) == 1) { 824 if(p.GetStart_offset() > p.GetStop_offset()) { 825 //note: not checking for strand, as offsets are always in the order of transcription 826 out_of_order = true; 827 invalid_location = true; 828 } 829 } 830 #endif 831 } 832 833 if(invalid_location) { 834 p.SetExceptions().push_back(CreateException( 835 out_of_order ? "Invalid location - start and stop are out of order" 836 : "Invalid location", 837 CVariationException::eCode_hgvs_parsing)); 838 } 839 840 if(p.GetLoc().GetId()) { 841 842 CBioseq_Handle bsh = 843 m_scope->GetBioseqHandle( 844 *p.GetLoc().GetId()); 845 846 if(bsh && bsh.GetState() != 0 847 && bsh.GetState() != CBioseq_Handle::fState_dead) { //dead is OK, beacuse supporting old versions 848 p.SetExceptions().push_back( 849 CreateException( 850 "Bioseq is suppressed or withdrawn", 851 CVariationException::eCode_bioseq_state)); 852 } 853 } 854 855 return invalid_location; 856 } 857 858 Remap(const CVariantPlacement & p,CSeq_loc_Mapper & mapper)859 CRef<CVariantPlacement> CVariationUtil::Remap( 860 const CVariantPlacement& p, 861 CSeq_loc_Mapper& mapper) 862 { 863 CRef<CVariantPlacement> p2 = x_Remap(p, mapper); 864 865 if(((p.IsSetStart_offset() 866 || p.IsSetStop_offset()) 867 && p2->GetMol() == CVariantPlacement::eMol_genomic) 868 || (p.GetMol() == CVariantPlacement::eMol_genomic 869 && p2->GetMol() != CVariantPlacement::eMol_genomic 870 && p2->GetMol() != CVariantPlacement::eMol_unknown) /*in case remapped to nothing*/) { 871 NcbiCerr << "Mapped: " << MSerial_AsnText << *p2; 872 //When mapping an offset-placement to a genomic placement, may need to resolve offsets. 873 //or, when mapping from genomic to a product coordinates, may need to add offsets. In above cases 874 //we need to use the seq-align-based mapping. 875 NCBI_USER_THROW( 876 "Cannot use Mapper-based method to remap intronic cases;" 877 "must remap via spliced-seg alignment instead."); 878 } 879 880 AttachSeq(*p2); 881 882 if(p.IsSetSeq() && p2->IsSetSeq() 883 && p.GetSeq().GetLength() == p2->GetSeq().GetLength() 884 && p.GetSeq().IsSetSeq_data() && p2->GetSeq().IsSetSeq_data() 885 && p.GetSeq().GetSeq_data().Which() == p2->GetSeq().GetSeq_data().Which() 886 && !p.GetSeq().GetSeq_data().Equals(p2->GetSeq().GetSeq_data())) { 887 p2->SetExceptions().push_back( 888 CreateException( 889 "Mismatches in mapping", 890 CVariationException::eCode_mismatches_in_mapping)); 891 } 892 893 CheckPlacement(*p2); 894 return p2; 895 } 896 897 x_Remap(const CVariantPlacement & p,CSeq_loc_Mapper & mapper)898 CRef<CVariantPlacement> CVariationUtil::x_Remap( 899 const CVariantPlacement& p, 900 CSeq_loc_Mapper& mapper) 901 { 902 CRef<CVariantPlacement> p2(new CVariantPlacement); 903 904 905 if(p.IsSetStart_offset()) { 906 p2->SetStart_offset() = p.GetStart_offset(); 907 } 908 if(p.IsSetStop_offset()) { 909 p2->SetStop_offset() = p.GetStop_offset(); 910 } 911 if(p.IsSetStart_offset_fuzz()) { 912 p2->SetStart_offset_fuzz().Assign(p.GetStart_offset_fuzz()); 913 } 914 if(p.IsSetStop_offset_fuzz()) { 915 p2->SetStop_offset_fuzz().Assign(p.GetStop_offset_fuzz()); 916 } 917 918 CRef<CSeq_loc> mapped_loc = mapper.Map(p.GetLoc()); 919 920 {{ 921 // If we have offsets, then the distinction 922 // between point and one-point interval is important, e.g. 923 // NM_000155.3:c.-116-3_-116 - the location is an interval, 924 // but the anchor point is the same; we need to 925 // keep it as interval, as if we represent it as a point, 926 // then the corresponding HGVS is also a point: NM_000155.3:c.-116-3 927 const bool equal_offsets = ( 928 !p2->IsSetStart_offset() 929 && !p2->IsSetStop_offset()) 930 931 || (p2->IsSetStart_offset() 932 && p2->IsSetStop_offset() 933 934 && p2->GetStart_offset() 935 == p2->GetStop_offset()); 936 937 const bool merge_single_range = 938 p.GetLoc().IsInt() 939 && mapped_loc->IsPnt() 940 && !equal_offsets; 941 942 if(mapped_loc->IsInt() 943 && mapped_loc->GetInt().IsSetFuzz_from() 944 && mapped_loc->GetInt().IsSetFuzz_to() 945 && sequence::GetLength(*mapped_loc, NULL) == 1) { 946 // workaround for CXX-4376. single-nt locations will 947 // not be collapsed to a point if both From and To fuzz is present 948 mapped_loc->SetInt().ResetFuzz_to(); 949 } 950 951 mapped_loc = sequence::Seq_loc_Merge( 952 *mapped_loc, 953 merge_single_range ? CSeq_loc::fMerge_SingleRange 954 : CSeq_loc::fSortAndMerge_All, 955 NULL); 956 }} 957 958 #if 1 959 //SNP-5148 960 if(mapped_loc->IsNull() && p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) { 961 //If mapped to nothing, expand the loc and try again. The purpose is to know the seq-id of the 962 //neighborhood that we couldn't map to, so we can produce and Empty seq-loc that contains 963 //a seq-id, reporting that we tried to map to this sequence, but there's no mapping 964 CRef<CSeq_loc> loc1 = sequence::Seq_loc_Merge(p.GetLoc(), CSeq_loc::fMerge_SingleRange, NULL); 965 loc1->SetInt().SetFrom() = loc1->GetInt().GetFrom() < 500 ? 0 : loc1->SetInt().GetFrom() - 500; 966 loc1->SetInt().SetTo() += 500; 967 CRef<CSeq_loc> tmp_mapped_loc = mapper.Map(*loc1); 968 969 if(tmp_mapped_loc->GetId()) { 970 mapped_loc->SetEmpty().Assign(*tmp_mapped_loc->GetId()); 971 } 972 973 } 974 if(p2->GetLoc().IsEmpty()) { 975 p2->SetExceptions().push_back(CreateException("No mapping", CVariationException::eCode_no_mapping)); 976 } 977 #endif 978 979 980 p2->SetLoc(*mapped_loc); 981 p2->SetPlacement_method(CVariantPlacement::ePlacement_method_projected); 982 983 {{ 984 TSeqPos orig_len = p.GetLoc().Which() ? sequence::GetLength(p.GetLoc(), NULL) : 0; 985 TSeqPos mapped_len = mapped_loc->Which() ? sequence::GetLength(*mapped_loc, NULL) : 0; 986 bool orig_is_compound = p.GetLoc().IsMix() || p.GetLoc().IsPacked_int(); 987 bool mapped_is_compound = mapped_loc->IsMix() || mapped_loc->IsPacked_int(); 988 CRef<CVariationException> exception; 989 if(mapped_len == 0) { 990 exception.Reset(new CVariationException); 991 exception->SetCode(CVariationException::eCode_no_mapping); 992 } else if(mapped_len < orig_len) { 993 exception.Reset(new CVariationException); 994 exception->SetCode(CVariationException::eCode_partial_mapping); 995 } else if(!orig_is_compound && mapped_is_compound) { 996 exception.Reset(new CVariationException); 997 exception->SetCode(CVariationException::eCode_split_mapping); 998 } 999 if(exception) { 1000 exception->SetMessage(""); 1001 p2->SetExceptions().push_back(exception); 1002 } 1003 }} 1004 1005 if(p2->GetLoc().GetId()) { 1006 p2->SetMol(GetMolType(sequence::GetId(p2->GetLoc(), NULL))); 1007 } else { 1008 p2->SetMol(CVariantPlacement::eMol_unknown); 1009 } 1010 1011 //VAR-1307 1012 if(p2->GetLoc().GetId() 1013 && (p2->GetMol() == CVariantPlacement::eMol_genomic 1014 || p2->GetMol() == CVariantPlacement::eMol_mitochondrion 1015 || p2->GetMol() == CVariantPlacement::eMol_unknown) 1016 1017 && (p.GetMol() == CVariantPlacement::eMol_genomic 1018 || p.GetMol() == CVariantPlacement::eMol_mitochondrion 1019 || p.GetMol() == CVariantPlacement::eMol_unknown) 1020 1021 && s_GetLength(p, m_scope) 1022 > s_GetLength(*p2, m_scope) + 5000) { 1023 p2->SetExceptions().push_back(CreateException( 1024 "Source location overhangs the alignment by at least 5kb", 1025 CVariationException::eCode_source_location_overhang)); 1026 } 1027 1028 1029 ChangeIdsInPlace(*p2, sequence::eGetId_ForceAcc, *m_scope); 1030 return p2; 1031 } 1032 AttachSeq(CVariantPlacement & p,TSeqPos max_len)1033 bool CVariationUtil::AttachSeq(CVariantPlacement& p, TSeqPos max_len) 1034 { 1035 p.ResetSeq(); 1036 bool ret = false; 1037 TSeqPos length = s_GetLength(p, m_scope); 1038 1039 p.SetSeq().SetLength(length); 1040 1041 if((p.IsSetStart_offset() && p.GetStart_offset() != 0) 1042 || (p.IsSetStop_offset() && p.GetStop_offset() != 0)) { 1043 p.SetExceptions().push_back(CreateException( 1044 "Can't get sequence for an offset-based location", 1045 CVariationException::eCode_seqfetch_intronic)); 1046 ret = false; 1047 } else if(length > max_len) { 1048 p.SetExceptions().push_back(CreateException( 1049 "Sequence is longer than the cutoff threshold", 1050 CVariationException::eCode_seqfetch_too_long)); 1051 ret = false; 1052 } else { 1053 try { 1054 CRef<CSeq_literal> literal = x_GetLiteralAtLoc(p.GetLoc()); 1055 p.SetSeq(*literal); 1056 1057 if(ContainsIupacNaAmbiguities(*literal)) { 1058 p.SetExceptions().push_back(CreateException( 1059 "Ambiguous residues in reference", 1060 CVariationException::eCode_ambiguous_sequence)); 1061 } 1062 ret = true; 1063 } 1064 catch(CException&) { 1065 //location can be invalid - SNP-5510 1066 p.SetExceptions().push_back(CreateException( 1067 "Cannot fetch sequence at location", 1068 CVariationException::eCode_seqfetch_invalid)); 1069 ret = false; 1070 } 1071 } 1072 return ret; 1073 } 1074 AttachSeq(CVariation & v,TSeqPos max_len)1075 bool CVariationUtil::AttachSeq(CVariation& v, TSeqPos max_len) 1076 { 1077 v.Index(); 1078 bool had_exceptions = false; 1079 if(v.IsSetPlacements()) { 1080 1081 CConstRef<CSeq_literal> asserted_literal; 1082 CConstRef<CSeq_literal> variant_literal; 1083 // will only fetch for snp/mnp cases, 1084 // as ref_same_as_variant test is only applicable for these 1085 1086 // Find the asserted and variant literal for this variation 1087 // (don't look into consequence-variations that may also contain 1088 // protein-specific asserted and variant sequence) 1089 for(CTypeConstIterator<CVariation> it(Begin(v)); it; ++it) { 1090 const CVariation& v2 = *it; 1091 if(!v2.GetData().IsInstance() || (v2.GetConsequenceParent() && &v != &v2)) { 1092 continue; 1093 } 1094 1095 const CVariation_inst& inst = v2.GetData().GetInstance(); 1096 if(inst.GetDelta().size() == 1 1097 && inst.GetDelta().front()->IsSetSeq() 1098 && inst.GetDelta().front()->GetSeq().IsLiteral()) { 1099 CConstRef<CSeq_literal> literal(&inst.GetDelta().front()->GetSeq().GetLiteral()); 1100 if(!asserted_literal 1101 && inst.IsSetObservation() 1102 && inst.GetObservation() == CVariation_inst::eObservation_asserted) { 1103 asserted_literal = literal; 1104 } else if(!variant_literal //note: finding the very first one; will that work always? 1105 && (!inst.IsSetObservation() || inst.GetObservation() == CVariation_inst::eObservation_variant) 1106 && (!inst.GetDelta().front()->IsSetAction() || inst.GetDelta().front()->GetAction() == CDelta_item::eAction_morph) 1107 && (!inst.GetDelta().front()->IsSetMultiplier() && !inst.GetDelta().front()->IsSetMultiplier_fuzz()) 1108 ) { 1109 variant_literal = literal; 1110 } 1111 } 1112 } 1113 1114 #if 0 1115 if(variant_literal) { 1116 LOG_POST("Found variant-literal"); 1117 NcbiCout << MSerial_AsnText << *variant_literal; 1118 } else { 1119 LOG_POST("Did not find variant-literal"); 1120 } 1121 #endif 1122 1123 NON_CONST_ITERATE(CVariation::TPlacements, it, v.SetPlacements()) 1124 { 1125 CVariantPlacement& p = **it; 1126 bool attached = AttachSeq(p, max_len); 1127 1128 if(attached 1129 && asserted_literal 1130 && (p.GetSeq().GetLength() != asserted_literal->GetLength() 1131 || (p.GetSeq().IsSetSeq_data() && asserted_literal->IsSetSeq_data() 1132 && p.GetSeq().GetSeq_data().Which() == asserted_literal->GetSeq_data().Which() 1133 && !p.GetSeq().GetSeq_data().Equals(asserted_literal->GetSeq_data())))) { 1134 p.SetExceptions().push_back(CreateException( 1135 "Asserted sequence is inconsistent with reference", 1136 CVariationException::eCode_inconsistent_asserted_allele)); 1137 had_exceptions = true; 1138 } 1139 1140 if(attached 1141 && variant_literal 1142 && variant_literal->Equals(p.GetSeq())) { 1143 p.SetExceptions().push_back(CreateException( 1144 "Reference sequence is the same as variant", 1145 CVariationException::eCode_ref_same_as_variant)); 1146 had_exceptions = true; 1147 } 1148 } 1149 } 1150 1151 if(v.GetData().IsSet()) { 1152 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, 1153 v.SetData().SetSet().SetVariations()) 1154 { 1155 CVariation& v2 = **it; 1156 had_exceptions = had_exceptions || AttachSeq(v2, max_len); 1157 } 1158 } 1159 return !had_exceptions; 1160 } 1161 1162 GetMolType(const CSeq_id & id)1163 CVariantPlacement::TMol CVariationUtil::GetMolType(const CSeq_id& id) 1164 { 1165 //Most of the time can figure out from seq-id. Must be careful not to 1166 //automatically treat NC as "genomic", as we need to differentiate from 1167 //mitochondrion. 1168 CSeq_id::EAccessionInfo acinf = id.IdentifyAccession(); 1169 CVariantPlacement::TMol moltype = CVariantPlacement::eMol_unknown; 1170 1171 if(acinf == CSeq_id::eAcc_refseq_prot || acinf == CSeq_id::eAcc_refseq_prot_predicted) { 1172 moltype = CVariantPlacement::eMol_protein; 1173 } else if(acinf == CSeq_id::eAcc_refseq_mrna || acinf == CSeq_id::eAcc_refseq_mrna_predicted) { 1174 moltype = CVariantPlacement::eMol_cdna; 1175 } else if(acinf == CSeq_id::eAcc_refseq_ncrna || acinf == CSeq_id::eAcc_refseq_ncrna_predicted) { 1176 moltype = CVariantPlacement::eMol_rna; 1177 } else if(acinf == CSeq_id::eAcc_refseq_genomic 1178 || acinf == CSeq_id::eAcc_refseq_contig 1179 || acinf == CSeq_id::eAcc_refseq_wgs_intermed) { 1180 moltype = CVariantPlacement::eMol_genomic; 1181 } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome 1182 && NStr::StartsWith(id.GetTextseq_Id()->GetAccession(), "AC_")) { 1183 moltype = CVariantPlacement::eMol_genomic; 1184 } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome 1185 && ((id.GetTextseq_Id()->GetAccession() >= "NC_000001" && id.GetTextseq_Id()->GetAccession() <= "NC_000024") 1186 || (id.GetTextseq_Id()->GetAccession() >= "NC_000067" && id.GetTextseq_Id()->GetAccession() <= "NC_000087"))) { 1187 //hardcode most "popular" chromosomes that are not mitochondrions. 1188 moltype = CVariantPlacement::eMol_genomic; 1189 } else { 1190 CBioseq_Handle bsh = m_scope->GetBioseqHandle(id); 1191 if(!bsh) { 1192 moltype = CVariantPlacement::eMol_unknown; 1193 } else { 1194 //check if mitochondrion 1195 if(bsh.GetTopLevelEntry().IsSetDescr()) { 1196 ITERATE(CBioseq_Handle::TDescr::Tdata, it, bsh.GetTopLevelEntry().GetDescr().Get()) 1197 { 1198 const CSeqdesc& desc = **it; 1199 if(!desc.IsSource()) { 1200 continue; 1201 } 1202 if(desc.GetSource().IsSetGenome() && desc.GetSource().GetGenome() == CBioSource::eGenome_mitochondrion) { 1203 moltype = CVariantPlacement::eMol_mitochondrion; 1204 } 1205 } 1206 } 1207 1208 //check molinfo 1209 if(moltype == CVariantPlacement::eMol_unknown) { 1210 const CMolInfo* m = sequence::GetMolInfo(bsh); 1211 if(!m) { 1212 moltype = CVariantPlacement::eMol_unknown; 1213 } else if(m->GetBiomol() == CMolInfo::eBiomol_genomic 1214 || m->GetBiomol() == CMolInfo::eBiomol_other_genetic) { 1215 moltype = CVariantPlacement::eMol_genomic; 1216 } else if(m->GetBiomol() == CMolInfo::eBiomol_mRNA 1217 || m->GetBiomol() == CMolInfo::eBiomol_genomic_mRNA) { 1218 moltype = CVariantPlacement::eMol_cdna; 1219 } else if(m->GetBiomol() == CMolInfo::eBiomol_ncRNA 1220 || m->GetBiomol() == CMolInfo::eBiomol_rRNA 1221 || m->GetBiomol() == CMolInfo::eBiomol_scRNA 1222 || m->GetBiomol() == CMolInfo::eBiomol_snRNA 1223 || m->GetBiomol() == CMolInfo::eBiomol_snoRNA 1224 || m->GetBiomol() == CMolInfo::eBiomol_pre_RNA 1225 || m->GetBiomol() == CMolInfo::eBiomol_tmRNA 1226 || m->GetBiomol() == CMolInfo::eBiomol_cRNA 1227 || m->GetBiomol() == CMolInfo::eBiomol_tRNA 1228 || m->GetBiomol() == CMolInfo::eBiomol_transcribed_RNA) { 1229 moltype = CVariantPlacement::eMol_rna; 1230 } else if(m->GetBiomol() == CMolInfo::eBiomol_peptide) { 1231 moltype = CVariantPlacement::eMol_protein; 1232 } else { 1233 moltype = CVariantPlacement::eMol_unknown; 1234 } 1235 } 1236 } 1237 } 1238 return moltype; 1239 } 1240 1241 CreateFlankLocs(const CSeq_loc & loc,TSeqPos len)1242 CVariationUtil::SFlankLocs CVariationUtil::CreateFlankLocs(const CSeq_loc& loc, TSeqPos len) 1243 { 1244 SFlankLocs flanks; 1245 flanks.upstream.Reset(new CSeq_loc(CSeq_loc::e_Null)); 1246 flanks.downstream.Reset(new CSeq_loc(CSeq_loc::e_Null)); 1247 if(!loc.GetId()) { 1248 return flanks; 1249 } 1250 1251 TSignedSeqPos start = sequence::GetStart(loc, m_scope, eExtreme_Positional); 1252 TSignedSeqPos stop = sequence::GetStop(loc, m_scope, eExtreme_Positional); 1253 1254 const CSeq_id& seq_id = sequence::GetId(loc, NULL); 1255 1256 CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id); 1257 1258 if(!bsh) { 1259 NCBI_USER_THROW_FMT("Unable to retrieve bioseq from scope : id " << 1260 seq_id.GetSeqIdString(true /*with version*/)); 1261 } 1262 1263 TSignedSeqPos max_pos = bsh.GetInst_Length() - 1; 1264 1265 flanks.upstream->SetInt().SetId().Assign(sequence::GetId(loc, NULL)); 1266 flanks.upstream->SetInt().SetStrand(sequence::GetStrand(loc, NULL)); 1267 flanks.upstream->SetInt().SetTo(min(max_pos, stop + (TSignedSeqPos)len)); 1268 flanks.upstream->SetInt().SetFrom(max((TSignedSeqPos)0, start - (TSignedSeqPos)len)); 1269 1270 flanks.downstream->Assign(*flanks.upstream); 1271 1272 CSeq_loc& second = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.upstream : *flanks.downstream; 1273 CSeq_loc& first = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.downstream : *flanks.upstream; 1274 1275 if(start == 0) { 1276 first.SetNull(); 1277 } else { 1278 first.SetInt().SetTo(start - 1); 1279 } 1280 1281 if(stop == max_pos) { 1282 second.SetNull(); 1283 } else { 1284 second.SetInt().SetFrom(stop + 1); 1285 } 1286 1287 return flanks; 1288 } 1289 1290 1291 1292 /////////////////////////////////////////////////////////////////////////////// 1293 // 1294 // Methods and functions pertaining to converting protein variation in precursor coords 1295 // 1296 /////////////////////////////////////////////////////////////////////////////// s_UntranslateProt(const string & prot_str,vector<string> & codons)1297 void CVariationUtil::s_UntranslateProt(const string& prot_str, vector<string>& codons) 1298 { 1299 if(prot_str.size() != 1) { 1300 NCBI_THROW(CException, eUnknown, "Expected prot_str of length 1"); 1301 } 1302 1303 static const char* alphabet = "ACGT"; 1304 string codon = "AAA"; 1305 for(size_t i0 = 0; i0 < 4; i0++) { 1306 codon[0] = alphabet[i0]; 1307 for(size_t i1 = 0; i1 < 4; i1++) { 1308 codon[1] = alphabet[i1]; 1309 for(size_t i2 = 0; i2 < 4; i2++) { 1310 codon[2] = alphabet[i2]; 1311 string prot(""); 1312 CSeqTranslator::Translate(codon, prot, CSeqTranslator::fIs5PrimePartial); 1313 NStr::ReplaceInPlace(prot, "*", "X"); //Conversion to IUPAC produces "X", but Translate produces "*" 1314 1315 //LOG_POST(">>>" << codon << " " << prot << " " << prot_str); 1316 if(prot == prot_str) { 1317 codons.push_back(codon); 1318 } 1319 } 1320 } 1321 } 1322 } 1323 s_CountMatches(const string & a,const string & b)1324 size_t CVariationUtil::s_CountMatches(const string& a, const string& b) 1325 { 1326 size_t count(0); 1327 for(size_t i = 0; i < min(a.size(), b.size()); i++) { 1328 if(a[i] == b[i]) { 1329 count++; 1330 } 1331 } 1332 return count; 1333 } 1334 s_CalcPrecursorVariationCodon(const string & codon_from,const string & prot_to,vector<string> & codons_to)1335 void CVariationUtil::s_CalcPrecursorVariationCodon( 1336 const string& codon_from, //codon on cDNA 1337 const string& prot_to, //missense/nonsense AA 1338 vector<string>& codons_to) //calculated variation-codons 1339 { 1340 vector<string> candidates1; 1341 size_t max_matches(0); 1342 s_UntranslateProt(prot_to, candidates1); 1343 codons_to.clear(); 1344 bool have_silent = false; 1345 1346 ITERATE(vector<string>, it1, candidates1) 1347 { 1348 size_t matches = s_CountMatches(codon_from, *it1); 1349 1350 // LOG_POST("CalcPrecursorVariationCodon:" << codon_from << " " << prot_to << " " << *it1 << " " << matches); 1351 if(matches == 3) { 1352 //all three bases in a codon matched - we must be processing a silent mutation. 1353 //in this case we want to consider candidate codons other than itself. 1354 have_silent = true; 1355 continue; 1356 } 1357 1358 if(matches >= max_matches) { 1359 if(matches > max_matches) { 1360 codons_to.clear(); 1361 } 1362 codons_to.push_back(*it1); 1363 max_matches = matches; 1364 } 1365 } 1366 1367 // didn't find polymorphic candidate - must be dealing with no-change at nucleotide level 1368 if(codons_to.empty() && have_silent) { 1369 codons_to.push_back(codon_from); 1370 } 1371 } 1372 s_CollapseAmbiguities(const vector<string> & seqs)1373 string CVariationUtil::s_CollapseAmbiguities(const vector<string>& seqs) 1374 { 1375 string collapsed_seq; 1376 1377 vector<int> bits; //4-bit bitmask denoting whether a nucleotide occurs at this pos at any seq in seqs 1378 1379 typedef const vector<string> TConstStrs; 1380 ITERATE(TConstStrs, it, seqs) 1381 { 1382 const string& seq = *it; 1383 if(seq.size() > bits.size()) { 1384 bits.resize(seq.size()); 1385 } 1386 1387 for(size_t i = 0; i < seq.size(); i++) { 1388 char nt = seq[i]; 1389 int m = (nt == 'T' ? 1 1390 : nt == 'G' ? 2 1391 : nt == 'C' ? 4 1392 : nt == 'A' ? 8 : 0); 1393 if(!m) { 1394 NCBI_THROW(CException, eUnknown, "Expected [ACGT] alphabet"); 1395 } 1396 1397 bits[i] |= m; 1398 } 1399 } 1400 1401 static const char* iupac_nuc_ambiguity_codes = "NTGKCYSBAWRDMHVN"; 1402 collapsed_seq.resize(bits.size()); 1403 for(size_t i = 0; i < collapsed_seq.size(); i++) { 1404 collapsed_seq[i] = iupac_nuc_ambiguity_codes[bits[i]]; 1405 } 1406 return collapsed_seq; 1407 } 1408 1409 x_InferNAfromAA(CVariation & v,TAA2NAFlags flags)1410 void CVariationUtil::x_InferNAfromAA(CVariation& v, TAA2NAFlags flags) 1411 { 1412 if(v.GetData().IsSet()) { 1413 CVariation::TData::TSet::TVariations variations = v.SetData().SetSet().SetVariations(); 1414 v.SetData().SetSet().SetVariations().clear(); 1415 1416 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, variations) 1417 { 1418 CRef<CVariation> v2 = *it; 1419 1420 if(v2->GetData().IsInstance() 1421 && v2->GetData().GetInstance().IsSetObservation() 1422 && !(v2->GetData().GetInstance().GetObservation() & CVariation_inst::eObservation_variant)) { 1423 //only variant observations are propagated to the nuc level 1424 continue; 1425 } 1426 x_InferNAfromAA(*v2, flags); 1427 v.SetData().SetSet().SetVariations().push_back(v2); 1428 } 1429 v.ResetPlacements(); //nucleotide placement will be computed for each instance and attached at its level 1430 1431 if(v.GetData().GetSet().GetVariations().size() == 1) { 1432 v.Assign(*v.GetData().GetSet().GetVariations().front()); 1433 } 1434 return; 1435 } 1436 1437 //Remap the placement to nucleotide 1438 1439 const CVariation::TPlacements* placements = s_GetPlacements(v); 1440 if(!placements || placements->size() == 0) { 1441 NCBI_THROW(CException, eUnknown, "Expected a placement"); 1442 } 1443 1444 const CVariantPlacement& placement = *placements->front(); 1445 1446 if(placement.IsSetStart_offset() || placement.IsSetStop_offset()) { 1447 NCBI_THROW(CException, eUnknown, "Expected offset-free placement"); 1448 } 1449 1450 if(placement.IsSetMol() && placement.GetMol() != CVariantPlacement::eMol_protein) { 1451 NCBI_THROW(CException, eUnknown, "Expected a protein-type placement"); 1452 } 1453 1454 SAnnotSelector sel(CSeqFeatData::e_Cdregion, true); 1455 //note: sel by product; location is prot; found feature is mrna having this prot as product 1456 CRef<CSeq_loc_Mapper> prot2precursor_mapper; 1457 for(CFeat_CI ci(*m_scope, placement.GetLoc(), sel); ci; ++ci) { 1458 try { 1459 prot2precursor_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eProductToLocation, m_scope)); 1460 } 1461 catch(CException&) { 1462 ;// may legitimately throw if feature is not good for mapping 1463 } 1464 break; 1465 } 1466 1467 if(!prot2precursor_mapper) { 1468 NCBI_THROW(CException, eUnknown, "Can't create prot2precursor mapper. Is this a prot?"); 1469 } 1470 1471 CRef<CSeq_loc> nuc_loc = prot2precursor_mapper->Map(placement.GetLoc()); 1472 ChangeIdsInPlace(*nuc_loc, sequence::eGetId_ForceAcc, *m_scope); 1473 1474 CConstRef<CDelta_item> delta = 1475 !v.GetData().IsInstance() ? CConstRef<CDelta_item>(NULL) 1476 : v.GetData().GetInstance().GetDelta().size() != 1 ? CConstRef<CDelta_item>(NULL) 1477 : v.GetData().GetInstance().GetDelta().front(); 1478 1479 1480 if(!v.GetData().IsInstance()) { 1481 // variation has "special" payload - nothing to do other than remap location 1482 CRef<CVariantPlacement> p(new CVariantPlacement); 1483 p->SetLoc(*nuc_loc); 1484 p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown); 1485 CRef<CVariation> v2(new CVariation); 1486 v2->SetData().Assign(v.GetData()); 1487 v2->SetPlacements().push_back(p); 1488 v.Assign(*v2); 1489 return; 1490 } 1491 1492 //See if the variation is simple-enough to process; if too complex, return "unknown-variation" 1493 if(!nuc_loc->IsInt() //complex nucleotide location 1494 || sequence::GetLength(*nuc_loc, NULL) != 3 //does not remap to single codon 1495 || !delta //delta is not a single-element item 1496 || (delta->IsSetAction() && delta->GetAction() != CDelta_item::eAction_morph) //not a replace-at-location delta-type 1497 || !delta->IsSetSeq() //don't know what's the variant allele 1498 || !delta->GetSeq().IsLiteral() 1499 || delta->GetSeq().GetLiteral().GetLength() != 1) //not single-AA missense 1500 { 1501 CRef<CVariantPlacement> p(new CVariantPlacement); 1502 p->SetLoc(*nuc_loc); 1503 p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown); 1504 CRef<CVariation> v2(new CVariation); 1505 v2->SetData().SetUnknown(); 1506 v2->SetPlacements().push_back(p); 1507 v.Assign(*v2); 1508 return; 1509 } 1510 1511 1512 CSeqVector seqv(*nuc_loc, *m_scope, CBioseq_Handle::eCoding_Iupac); 1513 string original_allele_codon; //nucleotide allele on the sequence 1514 seqv.GetSeqData(seqv.begin(), seqv.end(), original_allele_codon); 1515 1516 string variant_codon; 1517 {{ 1518 CSeq_data variant_prot_seq; 1519 CSeqportUtil::Convert( 1520 delta->GetSeq().GetLiteral().GetSeq_data(), 1521 &variant_prot_seq, 1522 CSeq_data::e_Iupacaa); 1523 1524 vector<string> variant_codons; 1525 if(v.GetData().GetInstance().IsSetObservation() 1526 && v.GetData().GetInstance().GetObservation() 1527 == CVariation_inst::eObservation_asserted) { 1528 s_UntranslateProt( 1529 variant_prot_seq.GetIupacaa(), 1530 variant_codons); 1531 } else { 1532 s_CalcPrecursorVariationCodon( 1533 original_allele_codon, 1534 variant_prot_seq.GetIupacaa(), 1535 variant_codons); 1536 } 1537 variant_codon = s_CollapseAmbiguities(variant_codons); 1538 }} 1539 1540 if(flags & fAA2NA_truncate_common_prefix_and_suffix 1541 && !v.IsSetFrameshift() 1542 && variant_codon != original_allele_codon) { 1543 while(variant_codon.length() > 0 1544 && original_allele_codon.length() > 0 1545 && variant_codon.at(0) == original_allele_codon.at(0)) { 1546 variant_codon = variant_codon.substr(1); 1547 original_allele_codon = original_allele_codon.substr(1); 1548 if(nuc_loc->GetStrand() == eNa_strand_minus) { 1549 nuc_loc->SetInt().SetTo()--; 1550 } else { 1551 nuc_loc->SetInt().SetFrom()++; 1552 } 1553 } 1554 1555 while(variant_codon.length() > 0 1556 && original_allele_codon.length() > 0 1557 && variant_codon.at(variant_codon.length() - 1) 1558 == original_allele_codon.at(original_allele_codon.length() - 1)) { 1559 variant_codon.resize(variant_codon.length() - 1); 1560 original_allele_codon.resize(original_allele_codon.length() - 1); 1561 //Note: normally given a protein, the parent will be a mRNA and the CDS location 1562 //will have plus strand; however, the parent could be MT, so we can't assume plus strand 1563 if(nuc_loc->GetStrand() == eNa_strand_minus) { 1564 nuc_loc->SetInt().SetFrom()++; 1565 } else { 1566 nuc_loc->SetInt().SetTo()--; 1567 } 1568 } 1569 } 1570 1571 CRef<CDelta_item> delta2(new CDelta_item); 1572 delta2->SetSeq().SetLiteral().SetLength(variant_codon.length()); 1573 delta2->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(variant_codon); 1574 1575 CRef<CVariation> v2(new CVariation); 1576 1577 //set placement 1578 CRef<CVariantPlacement> p2(new CVariantPlacement); 1579 //merge loc to convert int of length 1 to a pnt as necessary 1580 p2->SetLoc(*sequence::Seq_loc_Merge(*nuc_loc, CSeq_loc::fSortAndMerge_All, NULL)); 1581 p2->SetMol(GetMolType(sequence::GetId(*nuc_loc, NULL))); 1582 AttachSeq(*p2); 1583 v2->SetPlacements().push_back(p2); 1584 1585 1586 if(v.IsSetFrameshift()) { 1587 v2->SetData().SetUnknown(); //don't know what is happening at nucleotide level 1588 } else { 1589 CVariation_inst& inst2 = v2->SetData().SetInstance(); 1590 inst2.SetType(variant_codon.length() == 1 ? CVariation_inst::eType_snv : CVariation_inst::eType_mnp); 1591 inst2.SetDelta().push_back(delta2); 1592 1593 if(v.GetData().GetInstance().IsSetObservation()) { 1594 inst2.SetObservation(v.GetData().GetInstance().GetObservation()); 1595 } 1596 } 1597 1598 v.Assign(*v2); 1599 } 1600 InferNAfromAA(const CVariation & v,TAA2NAFlags flags)1601 CRef<CVariation> CVariationUtil::InferNAfromAA(const CVariation& v, TAA2NAFlags flags) 1602 { 1603 CRef<CVariation> v2(new CVariation); 1604 v2->Assign(v); 1605 v2->Index(); 1606 x_InferNAfromAA(*v2, flags); 1607 s_FactorOutPlacements(*v2); 1608 v2->Index(); 1609 1610 CheckAmbiguitiesInLiterals(*v2); 1611 1612 //Note: The result describes whole codons, even for point mutations, i.e. common suffx/prefix are not truncated. 1613 return v2; 1614 } 1615 1616 GetLiteralAtLoc(const CSeq_loc & loc)1617 CRef<CSeq_literal> CVariationUtil::GetLiteralAtLoc(const CSeq_loc& loc) 1618 { 1619 return x_GetLiteralAtLoc(loc); 1620 } 1621 1622 IsRightPartial(CBioseq_Handle bsh)1623 static bool IsRightPartial(CBioseq_Handle bsh) 1624 { 1625 ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get()) 1626 { 1627 const CSeqdesc& desc = **it; 1628 if(!desc.IsMolinfo() 1629 || !desc.GetMolinfo().IsSetCompleteness()) { 1630 continue; 1631 } 1632 1633 return desc.GetMolinfo().GetCompleteness() == CMolInfo::eCompleteness_no_right 1634 || desc.GetMolinfo().GetCompleteness() == CMolInfo::eCompleteness_no_ends; 1635 } 1636 1637 return false; 1638 } 1639 1640 x_GetLiteralAtLoc(const CSeq_loc & loc)1641 CRef<CSeq_literal> CVariationUtil::x_GetLiteralAtLoc(const CSeq_loc& loc) 1642 { 1643 CRef<CSeq_literal> literal(new CSeq_literal); 1644 if(sequence::GetLength(loc, NULL) == 0) { 1645 literal->SetLength(0); 1646 } else { 1647 CRef<CSeq_literal> cached_literal = m_cdregion_index.GetCachedLiteralAtLoc(loc); 1648 if(cached_literal) { 1649 literal = cached_literal; 1650 } else { 1651 string seq; 1652 try { 1653 CSeqVector v(loc, *m_scope, CBioseq_Handle::eCoding_Iupac); 1654 1655 if(v.IsProtein()) { 1656 // if loc extends by 1 past the end protein - we'll need to 1657 // truncate the loc to the boundaries of prot, and then add "*" or "X" manually, 1658 // as otherwise fetching seq will throw. 1659 CBioseq_Handle bsh = m_scope->GetBioseqHandle(sequence::GetId(loc, NULL)); 1660 if(bsh.GetInst_Length() == loc.GetStop(eExtreme_Positional)) { 1661 1662 CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL); 1663 range_loc->SetInt().SetTo(bsh.GetInst_Length() - 1); 1664 1665 CRef<CSeq_literal> literal = x_GetLiteralAtLoc( 1666 *range_loc->Intersect(loc, 0, NULL)); 1667 1668 literal->SetLength() += 1; 1669 literal->SetSeq_data().SetNcbieaa().Set().push_back( 1670 IsRightPartial(bsh) ? 'X' : '*'); 1671 1672 return literal; 1673 } 1674 } 1675 1676 v.GetSeqData(v.begin(), v.end(), seq); 1677 literal->SetLength(seq.size()); 1678 if(v.IsProtein()) { 1679 literal->SetSeq_data().SetNcbieaa().Set(seq); 1680 } else if(v.IsNucleotide()) { 1681 literal->SetSeq_data().SetIupacna().Set(seq); 1682 } 1683 } 1684 catch(CException& e) { 1685 string loc_label; 1686 loc.GetLabel(&loc_label); 1687 NCBI_RETHROW_SAME(e, "Can't get literal for " + loc_label); 1688 } 1689 } 1690 } 1691 return literal; 1692 } 1693 1694 s_CatLiterals(const CSeq_literal & a,const CSeq_literal & b)1695 CRef<CSeq_literal> CVariationUtil::s_CatLiterals(const CSeq_literal& a, const CSeq_literal& b) 1696 { 1697 CRef<CSeq_literal> c(new CSeq_literal); 1698 1699 if(b.GetLength() == 0) { 1700 c->Assign(a); 1701 } else if(a.GetLength() == 0) { 1702 c->Assign(b); 1703 } else { 1704 c->SetLength(a.GetLength() + b.GetLength()); 1705 1706 if(a.IsSetFuzz() || b.IsSetFuzz()) { 1707 c->SetFuzz().SetLim(CInt_fuzz::eLim_unk); 1708 } 1709 1710 if(a.IsSetSeq_data() && b.IsSetSeq_data()) { 1711 CSeqportUtil::Append(&(c->SetSeq_data()), 1712 a.GetSeq_data(), 0, a.GetLength(), 1713 b.GetSeq_data(), 0, b.GetLength()); 1714 } 1715 } 1716 return c; 1717 } 1718 s_SpliceLiterals(const CSeq_literal & payload,const CSeq_literal & ref,TSeqPos pos)1719 CRef<CSeq_literal> CVariationUtil::s_SpliceLiterals( 1720 const CSeq_literal& payload, 1721 const CSeq_literal& ref, TSeqPos pos) 1722 { 1723 CRef<CSeq_literal> result; 1724 if(pos == 0) { 1725 result = s_CatLiterals(payload, ref); 1726 } else if(pos == ref.GetLength()) { 1727 result = s_CatLiterals(ref, payload); 1728 } else { 1729 CRef<CSeq_literal> prefix(new CSeq_literal); 1730 prefix->Assign(ref); 1731 prefix->SetLength(pos); 1732 CSeqportUtil::Keep(&prefix->SetSeq_data(), 0, prefix->GetLength()); 1733 1734 CRef<CSeq_literal> suffix(new CSeq_literal); 1735 suffix->Assign(ref); 1736 suffix->SetLength(ref.GetLength() - pos); 1737 CSeqportUtil::Keep(&suffix->SetSeq_data(), pos, suffix->GetLength()); 1738 1739 CRef<CSeq_literal> prefix_and_payload = s_CatLiterals(*prefix, payload); 1740 result = s_CatLiterals(*prefix_and_payload, *suffix); 1741 } 1742 return result; 1743 } 1744 1745 1746 1747 x_FindOrCreateLiteral(const CVariation & v)1748 CConstRef<CSeq_literal> CVariationUtil::x_FindOrCreateLiteral(const CVariation& v) 1749 { 1750 CConstRef<CSeq_literal> literal = s_FindFirstLiteral(v); 1751 if(literal) { 1752 return literal; 1753 } 1754 1755 //instantiate a literal from a placement (make sure there are no offsets) 1756 const CVariation::TPlacements* placements = s_GetPlacements(v); 1757 1758 if(!placements) { 1759 return literal; 1760 } 1761 1762 ITERATE(CVariation::TPlacements, it, *placements) 1763 { 1764 const CVariantPlacement& p = **it; 1765 if(p.IsSetStart_offset() || p.IsSetStop_offset()) { 1766 continue; 1767 } 1768 1769 literal = x_GetLiteralAtLoc(p.GetLoc()); 1770 break; 1771 } 1772 1773 return literal; 1774 } 1775 1776 1777 /*! 1778 * Convert any simple nucleotide variation to delins form, if possible; throw if not. 1779 * Precondition: location must be set. 1780 */ ChangeToDelins(CVariation & v)1781 void CVariationUtil::ChangeToDelins(CVariation& v) 1782 { 1783 v.Index(); 1784 1785 if(v.GetData().IsSet()) { 1786 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, 1787 v.SetData().SetSet().SetVariations()) 1788 { 1789 ChangeToDelins(**it); 1790 } 1791 } else if(v.GetData().IsInstance() 1792 && v.GetData().GetInstance().GetType() == CVariation_inst::eType_inv 1793 && v.GetData().GetInstance().GetDelta().empty()) { 1794 CVariation_inst& inst = v.SetData().SetInstance(); 1795 inst.SetType(CVariation_inst::eType_delins); 1796 1797 CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v); 1798 if(!this_literal) { 1799 NcbiCerr << MSerial_AsnText << v; 1800 NCBI_THROW(CException, eUnknown, 1801 "Could not find literal for 'this' location in placements"); 1802 } 1803 1804 CRef<CDelta_item> di(new CDelta_item); 1805 di->SetSeq().SetLiteral().Assign(*this_literal); 1806 this->FlipStrand(*di); 1807 inst.SetDelta().push_back(di); 1808 1809 } else if(v.GetData().IsInstance()) { 1810 CVariation_inst& inst = v.SetData().SetInstance(); 1811 inst.SetType(CVariation_inst::eType_delins); 1812 1813 if(inst.GetDelta().size() == 0) { 1814 CRef<CDelta_item> di(new CDelta_item); 1815 di->SetSeq().SetLiteral().SetLength(0); 1816 di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(""); 1817 inst.SetDelta().push_back(di); 1818 } else if(inst.GetDelta().size() > 1) { 1819 NCBI_THROW(CException, eUnknown, "Deltas of length >1 are not supported"); 1820 } else { 1821 CDelta_item& di = *inst.SetDelta().front(); 1822 1823 //convert 'del' to 'replace-with-empty-literal' 1824 if(di.GetAction() == CDelta_item::eAction_del_at) { 1825 di.ResetAction(); 1826 di.SetSeq().SetLiteral().SetLength(0); 1827 di.SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(""); 1828 } 1829 1830 //convert 'loc' or 'this'-based deltas to literals 1831 if(di.GetSeq().IsLoc()) { 1832 CRef<CSeq_literal> literal = x_GetLiteralAtLoc(di.GetSeq().GetLoc()); 1833 di.SetSeq().SetLiteral(*literal); 1834 } else if(di.GetSeq().IsThis()) { 1835 CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v); 1836 if(!this_literal) { 1837 NcbiCerr << MSerial_AsnText << v; 1838 NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements"); 1839 } else { 1840 di.SetSeq().SetLiteral().Assign(*this_literal); 1841 } 1842 } 1843 1844 //expand multipliers. 1845 if(di.IsSetMultiplier()) { 1846 if(di.GetMultiplier() < 0) { 1847 NCBI_THROW(CException, eUnknown, "Encountered negative multiplier"); 1848 } else { 1849 CSeq_literal& literal = di.SetSeq().SetLiteral(); 1850 1851 if(!literal.IsSetSeq_data() || !literal.GetSeq_data().IsIupacna()) { 1852 NCBI_THROW(CException, eUnknown, "Expected IUPACNA-type seq-literal"); 1853 } 1854 string str_kernel = literal.GetSeq_data().GetIupacna().Get(); 1855 literal.SetSeq_data().SetIupacna().Set(""); 1856 for(int i = 0; i < di.GetMultiplier(); i++) { 1857 literal.SetSeq_data().SetIupacna().Set() += str_kernel; 1858 } 1859 literal.SetLength(literal.GetSeq_data().GetIupacna().Get().size()); 1860 if(literal.IsSetFuzz()) { 1861 literal.SetFuzz().SetLim(CInt_fuzz::eLim_unk); 1862 } 1863 1864 di.ResetMultiplier(); 1865 if(di.IsSetMultiplier_fuzz()) { 1866 di.SetMultiplier_fuzz().SetLim(CInt_fuzz::eLim_unk); 1867 } 1868 } 1869 } 1870 1871 //Convert ins-X-before-loc to 'replace seq@loc with X + seq@loc' 1872 if(!di.IsSetAction() || di.GetAction() == CDelta_item::eAction_morph) { 1873 ; //already done 1874 } else if(di.GetAction() == CDelta_item::eAction_ins_before) { 1875 di.ResetAction(); 1876 CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v); 1877 1878 1879 if(!this_literal) { 1880 NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements"); 1881 } else if(this_literal->GetLength() == 0) { 1882 NCBI_THROW(CException, eUnknown, "Encountered empty literal"); 1883 } else { 1884 //Note: need to be careful here: if dinucleotide or multinucleotide, "before" really means "before last pos" 1885 // (as dinucleotide placement is used for insertions in order to be strand-flippable) 1886 1887 CRef<CSeq_literal> literal = s_SpliceLiterals(di.GetSeq().GetLiteral(), 1888 *this_literal, 1889 this_literal->GetLength() - 1); 1890 di.SetSeq().SetLiteral().Assign(*literal); 1891 } 1892 } 1893 } 1894 } 1895 } 1896 1897 1898 AttachProteinConsequences(CVariation & v,const CSeq_id * target_id,bool ignore_genomic)1899 void CVariationUtil::AttachProteinConsequences( 1900 CVariation& v, 1901 const CSeq_id* target_id, 1902 bool ignore_genomic) 1903 { 1904 v.Index(); 1905 1906 if(v.GetData().IsSet()) { 1907 1908 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, 1909 v.SetData().SetSet().SetVariations()) 1910 { 1911 AttachProteinConsequences(**it, target_id, ignore_genomic); 1912 } 1913 return; 1914 } 1915 1916 if(!v.GetData().IsInstance()) { 1917 return; 1918 } 1919 1920 1921 const CVariation::TPlacements* placements = s_GetPlacements(v); 1922 1923 if(!placements || placements->size() == 0) { 1924 return; 1925 } 1926 1927 1928 if(v.GetData().GetInstance().IsSetObservation() 1929 && !(v.GetData().GetInstance().GetObservation() 1930 & CVariation_inst::eObservation_variant)) { 1931 //only compute placements for variant instances; (as for asserted/reference there's no change) 1932 return; 1933 } 1934 1935 ITERATE(CVariation::TPlacements, it, *placements) 1936 { 1937 const CVariantPlacement& placement = **it; 1938 1939 if(!placement.GetLoc().GetId() 1940 || (target_id && !sequence::IsSameBioseq(*target_id, 1941 sequence::GetId(placement.GetLoc(), NULL), 1942 m_scope))) { 1943 continue; 1944 } 1945 1946 if(placement.IsSetStart_offset() && (placement.IsSetStop_offset() || placement.GetLoc().IsPnt())) { 1947 continue; //intronic. 1948 } 1949 1950 if(placement.GetMol() != CVariantPlacement::eMol_cdna 1951 && placement.GetMol() != CVariantPlacement::eMol_mitochondrion 1952 && (placement.GetMol() != CVariantPlacement::eMol_genomic || ignore_genomic)) { 1953 continue; 1954 } 1955 1956 1957 CCdregionIndex::TCdregions cdregions; 1958 m_cdregion_index.Get(placement.GetLoc(), cdregions); 1959 1960 ITERATE(CCdregionIndex::TCdregions, it, cdregions) 1961 { 1962 CRef<CVariation> prot_variation = TranslateNAtoAA(v.GetData().GetInstance(), placement, *it->cdregion_feat); 1963 CVariation::TConsequence::value_type consequence(new CVariation::TConsequence::value_type::TObjectType); 1964 consequence->SetVariation(*prot_variation); 1965 v.SetConsequence().push_back(consequence); 1966 } 1967 } 1968 } 1969 1970 //translate IUPACNA literal; do not translate last partial codon. Translate(const string & nuc_str,bool is_mito)1971 static string Translate(const string& nuc_str, bool is_mito) 1972 { 1973 string prot_str; 1974 1975 CGenetic_code code; 1976 code.SetId(is_mito ? 2 : 1); 1977 1978 CSeqTranslator::Translate( 1979 nuc_str, 1980 prot_str, 1981 CSeqTranslator::fIs5PrimePartial, 1982 &code); 1983 1984 if(prot_str.size() * 3 < nuc_str.size()) { 1985 prot_str.push_back('X'); 1986 } 1987 1988 //truncate everything past the first stop 1989 size_t stop_pos = prot_str.find('*'); 1990 if(stop_pos != NPOS) { 1991 prot_str.resize(stop_pos + 1); 1992 } 1993 1994 return prot_str; 1995 } 1996 CalcInstTypeForAA(const string & prot_ref_str,const string & prot_delta_str)1997 CVariation_inst::EType CalcInstTypeForAA( 1998 const string& prot_ref_str, 1999 const string& prot_delta_str) 2000 { 2001 CVariation_inst::EType inst_type = 2002 prot_delta_str.size() == 0 && prot_ref_str.size() > 0 ? CVariation_inst::eType_del 2003 : prot_delta_str.size() > 0 && prot_ref_str.size() == 0 ? CVariation_inst::eType_ins 2004 : prot_delta_str.size() != prot_ref_str.size() ? CVariation_inst::eType_prot_other 2005 : prot_ref_str == prot_delta_str ? CVariation_inst::eType_prot_silent 2006 : prot_ref_str.size() > 1 ? CVariation_inst::eType_prot_other 2007 : NStr::Find(prot_delta_str, "*") != NPOS ? CVariation_inst::eType_prot_nonsense 2008 : CVariation_inst::eType_prot_missense; 2009 return inst_type; 2010 } 2011 CalcEffectForProt(const string & prot_ref_str,const string & prot_delta_str)2012 static CVariantProperties::TEffect CalcEffectForProt( 2013 const string& prot_ref_str, 2014 const string& prot_delta_str) 2015 { 2016 CVariantProperties::TEffect effect = 0; 2017 for(size_t i = 0; i < prot_ref_str.size() && i < prot_delta_str.size(); i++) { 2018 if(prot_ref_str[i] == prot_delta_str[i]) { 2019 effect |= CVariantProperties::eEffect_synonymous; 2020 } else if(prot_ref_str[i] == '*') { 2021 effect |= CVariantProperties::eEffect_stop_loss; 2022 } else if(prot_delta_str[i] == '*') { 2023 effect |= CVariantProperties::eEffect_stop_gain; 2024 } else { 2025 effect |= CVariantProperties::eEffect_missense; 2026 } 2027 } 2028 return effect; 2029 } 2030 CalcSOTermsForProt(TSignedSeqPos nuc_delta_len,const string & prot_ref_str,const string & prot_variant_str)2031 static CVariationUtil::TSOTerms CalcSOTermsForProt( 2032 TSignedSeqPos nuc_delta_len, 2033 const string& prot_ref_str, 2034 const string& prot_variant_str) 2035 { 2036 CVariationUtil::TSOTerms terms; 2037 2038 bool stop_gain = false; 2039 bool stop_loss = false; 2040 for(size_t i = 0; i < max(prot_ref_str.size(), prot_variant_str.size()); i++) { 2041 char r = i >= prot_ref_str.size() ? '-' : prot_ref_str[i]; 2042 char v = i >= prot_variant_str.size() ? '-' : prot_variant_str[i]; 2043 2044 if(r == '*' && v != '*') { 2045 stop_loss = true; 2046 } 2047 2048 if(r != '*' && v == '*') { 2049 stop_gain = true; 2050 } 2051 } 2052 if(stop_gain) { 2053 terms.push_back(CVariationUtil::eSO_stop_gained); 2054 } 2055 if(stop_loss) { 2056 terms.push_back(CVariationUtil::eSO_stop_lost); 2057 } 2058 2059 if(nuc_delta_len == 0) { 2060 if(!stop_gain && !stop_loss) { 2061 terms.push_back(prot_ref_str == prot_variant_str ? CVariationUtil::eSO_synonymous_variant 2062 : CVariationUtil::eSO_missense_variant); 2063 } 2064 } else if(nuc_delta_len % 3 == 0) { 2065 terms.push_back(CVariationUtil::eSO_inframe_indel); 2066 } else { 2067 terms.push_back(CVariationUtil::eSO_frameshift_variant); 2068 } 2069 2070 return terms; 2071 } 2072 2073 FlipStrand(CVariantPlacement & vp) const2074 void CVariationUtil::FlipStrand(CVariantPlacement& vp) const 2075 { 2076 vp.SetLoc().FlipStrand(); 2077 if(vp.IsSetSeq() && vp.GetSeq().IsSetSeq_data()) { 2078 CSeqportUtil::ReverseComplement( 2079 vp.SetSeq().SetSeq_data(), 2080 &vp.SetSeq().SetSeq_data(), 2081 0, 2082 vp.GetSeq().GetLength()); 2083 } 2084 2085 CRef<CVariantPlacement> tmp(new CVariantPlacement); 2086 tmp->Assign(vp); 2087 2088 if(tmp->IsSetStart_offset()) { 2089 vp.SetStop_offset(tmp->GetStart_offset() * -1); 2090 } else { 2091 vp.ResetStop_offset(); 2092 } 2093 2094 if(tmp->IsSetStop_offset()) { 2095 vp.SetStart_offset(tmp->GetStop_offset() * -1); 2096 } else { 2097 vp.ResetStart_offset(); 2098 } 2099 2100 if(tmp->IsSetStart_offset_fuzz()) { 2101 vp.SetStop_offset_fuzz().Assign(tmp->GetStart_offset_fuzz()); 2102 } else { 2103 vp.ResetStop_offset_fuzz(); 2104 } 2105 2106 if(tmp->IsSetStop_offset_fuzz()) { 2107 vp.SetStart_offset_fuzz().Assign(tmp->GetStop_offset_fuzz()); 2108 } else { 2109 vp.ResetStart_offset_fuzz(); 2110 } 2111 } 2112 2113 s_IsInstStrandFlippable(const CVariation & v,const CVariation_inst & inst)2114 bool CVariationUtil::s_IsInstStrandFlippable( 2115 const CVariation& v, 2116 const CVariation_inst& inst) 2117 { 2118 bool ret = true; 2119 ITERATE(CVariation_inst::TDelta, it, v.GetData().GetInstance().GetDelta()) 2120 { 2121 const CDelta_item& di = **it; 2122 if(di.IsSetAction() && di.GetAction() == CDelta_item::eAction_ins_before) { 2123 const CVariation::TPlacements* p = s_GetPlacements(v); 2124 ret = false; //will set back to true if found acceptable placement 2125 if(p) { 2126 ITERATE(CVariation::TPlacements, it, *p) 2127 { 2128 if(s_GetLength(**it, NULL) >= 2) { 2129 ret = true; 2130 } 2131 } 2132 } 2133 } 2134 } 2135 return ret; 2136 } 2137 FlipStrand(CVariation & v) const2138 void CVariationUtil::FlipStrand(CVariation& v) const 2139 { 2140 v.Index(); //required so that can get factored placements from sub-variations. 2141 if(v.IsSetPlacements()) { 2142 NON_CONST_ITERATE(CVariation::TPlacements, it, v.SetPlacements()) 2143 { 2144 FlipStrand(**it); 2145 } 2146 } 2147 2148 if(v.GetData().IsSet()) { 2149 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, 2150 it, 2151 v.SetData().SetSet().SetVariations()) 2152 { 2153 FlipStrand(**it); 2154 } 2155 } else if(v.GetData().IsInstance()) { 2156 2157 if(!s_IsInstStrandFlippable(v, v.GetData().GetInstance())) { 2158 NCBI_THROW(CException, eUnknown, "Variation is not strand-flippable"); 2159 } else { 2160 FlipStrand(v.SetData().SetInstance()); 2161 } 2162 } 2163 } 2164 2165 FlipStrand(CDelta_item & di) const2166 void CVariationUtil::FlipStrand(CDelta_item& di) const 2167 { 2168 if(!di.IsSetSeq()) { 2169 return; 2170 } 2171 if(di.GetSeq().IsLoc()) { 2172 di.SetSeq().SetLoc().FlipStrand(); 2173 } else if(di.GetSeq().IsLiteral() && di.GetSeq().GetLiteral().IsSetSeq_data()) { 2174 CSeqportUtil::ReverseComplement( 2175 di.SetSeq().GetLiteral().GetSeq_data(), 2176 &di.SetSeq().SetLiteral().SetSeq_data(), 2177 0, di.GetSeq().GetLiteral().GetLength()); 2178 } 2179 } 2180 2181 FlipStrand(CVariation_inst & inst) const2182 void CVariationUtil::FlipStrand(CVariation_inst& inst) const 2183 { 2184 if(!inst.IsSetDelta()) { 2185 return; 2186 } 2187 2188 NON_CONST_ITERATE(CVariation_inst::TDelta, it, inst.SetDelta()) 2189 { 2190 FlipStrand(**it); 2191 } 2192 reverse(inst.SetDelta().begin(), inst.SetDelta().end()); 2193 } 2194 2195 x_AdjustDelinsToInterval(CVariation & v,const CSeq_loc & loc)2196 void CVariationUtil::x_AdjustDelinsToInterval(CVariation& v, const CSeq_loc& loc) 2197 { 2198 //note: loc may be a split loc when part of a codon is in another exon 2199 2200 if(!v.IsSetPlacements() || v.GetPlacements().size() != 1) { 2201 NCBI_THROW(CException, eUnknown, "Expected single placement"); 2202 } 2203 2204 CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL); 2205 2206 const CSeq_loc& orig_loc = v.GetPlacements().front()->GetLoc(); 2207 CRef<CSeq_loc> sub_loc = orig_loc.Subtract(loc, 0, NULL, NULL); 2208 if(sub_loc->Which() && sequence::GetLength(*sub_loc, NULL) > 0) { 2209 //NcbiCerr << MSerial_AsnText << v; 2210 //NcbiCerr << MSerial_AsnText << loc; 2211 NCBI_THROW(CException, eUnknown, "Location must be a superset of the variation's loc"); 2212 } 2213 2214 sub_loc->Assign(*range_loc); 2215 sub_loc->SetInt().SetTo(sequence::GetStop(orig_loc, NULL, eExtreme_Positional)); 2216 CRef<CSeq_loc> suffix_loc = sequence::Seq_loc_Subtract(loc, *sub_loc, CSeq_loc::fSortAndMerge_All, NULL); 2217 if(!suffix_loc->Which()) { 2218 suffix_loc->SetNull(); //bug in subtract 2219 } 2220 2221 sub_loc->Assign(*range_loc); 2222 sub_loc->SetInt().SetFrom(sequence::GetStart(orig_loc, NULL, eExtreme_Positional)); 2223 CRef<CSeq_loc> prefix_loc = sequence::Seq_loc_Subtract(loc, *sub_loc, CSeq_loc::fSortAndMerge_All, NULL); 2224 if(!prefix_loc->Which()) { 2225 prefix_loc->SetNull(); 2226 } 2227 2228 if(sequence::GetStrand(loc, NULL) == eNa_strand_minus) { 2229 swap(prefix_loc, suffix_loc); 2230 } 2231 2232 //NcbiCerr << "prefix loc" << MSerial_AsnText << *prefix_loc; 2233 //NcbiCerr << "suffix loc" << MSerial_AsnText << *suffix_loc; 2234 2235 2236 CRef<CSeq_literal> prefix_literal = x_GetLiteralAtLoc(*prefix_loc); 2237 CRef<CSeq_literal> suffix_literal = x_GetLiteralAtLoc(*suffix_loc); 2238 2239 for(CTypeIterator<CVariation_inst> it(Begin(v)); it; ++it) { 2240 CVariation_inst& inst = *it; 2241 if(inst.GetDelta().size() != 1) { 2242 NCBI_THROW(CException, eUnknown, "Expected single-element delta"); 2243 } 2244 2245 CDelta_item& delta = *inst.SetDelta().front(); 2246 2247 if(!delta.IsSetSeq() || !delta.GetSeq().IsLiteral()) { 2248 NCBI_THROW(CException, eUnknown, "Expected literal"); 2249 } 2250 2251 CRef<CSeq_literal> tmp_literal1 = s_CatLiterals(*prefix_literal, delta.SetSeq().SetLiteral()); 2252 CRef<CSeq_literal> tmp_literal2 = s_CatLiterals(*tmp_literal1, *suffix_literal); 2253 delta.SetSeq().SetLiteral(*tmp_literal2); 2254 } 2255 2256 v.SetPlacements().front()->SetLoc().Assign(loc); 2257 } 2258 Contains(const CSeq_loc & a,const CSeq_loc & b,CScope * scope)2259 static bool Contains(const CSeq_loc& a, const CSeq_loc& b, CScope* scope) 2260 { 2261 CRef<CSeq_loc> a1(new CSeq_loc); 2262 a1->Assign(a); 2263 a1->ResetStrand(); 2264 2265 CRef<CSeq_loc> b1(new CSeq_loc); 2266 b1->Assign(b); 2267 b1->ResetStrand(); 2268 2269 return sequence::Compare(*a1, *b1, scope, sequence::fCompareOverlapping) == sequence::eContains; 2270 } 2271 x_CreateUnknownVariation(const CSeq_id & id,CVariantPlacement::TMol mol)2272 CRef<CVariation> CVariationUtil::x_CreateUnknownVariation( 2273 const CSeq_id& id, 2274 CVariantPlacement::TMol mol) 2275 { 2276 CRef<CVariantPlacement> p(new CVariantPlacement); 2277 p->SetLoc().SetWhole().Assign(id); 2278 p->SetMol(mol); 2279 CRef<CVariation> v(new CVariation); 2280 v->SetData().SetUnknown(); 2281 v->SetPlacements().push_back(p); 2282 ChangeIdsInPlace(*v, sequence::eGetId_ForceAcc, *m_scope); 2283 return v; 2284 } 2285 CreateUnknownProtConsequenceVariation(const CVariantPlacement & nuc_p,const CSeq_feat & cds_feat,bool is_frameshifting,CScope & scope)2286 static CRef<CVariation> CreateUnknownProtConsequenceVariation( 2287 const CVariantPlacement& nuc_p, 2288 const CSeq_feat& cds_feat, 2289 bool is_frameshifting, 2290 CScope& scope) 2291 { 2292 CRef<CSeq_loc> prot_loc; 2293 CRef<CSeq_loc> codons_loc; 2294 try { 2295 CRef<CSeq_loc_Mapper> nuc2prot_mapper; 2296 CRef<CSeq_loc_Mapper> prot2nuc_mapper; 2297 nuc2prot_mapper.Reset(new CSeq_loc_Mapper( 2298 cds_feat, 2299 CSeq_loc_Mapper::eLocationToProduct, 2300 &scope)); 2301 prot2nuc_mapper.Reset(new CSeq_loc_Mapper( 2302 cds_feat, 2303 CSeq_loc_Mapper::eProductToLocation, 2304 &scope)); 2305 2306 prot_loc = nuc2prot_mapper->Map(nuc_p.GetLoc()); 2307 codons_loc = prot2nuc_mapper->Map(*prot_loc); 2308 2309 2310 2311 if(codons_loc->IsNull()) { 2312 // normally shouldn't happen, but may happen with 2313 // dubious annotation, e.g. BC149603.1:c.1A>G. 2314 // Mapping to protein coordinates returns NULL, 2315 // probably because protein and cds lengths are inconsistent. 2316 NCBI_THROW(CException, eUnknown, ""); 2317 } 2318 } 2319 catch(CException&) { 2320 // may legitimately throw if feature is not good for mapping (e.g. partial). 2321 prot_loc.Reset(new CSeq_loc()); 2322 prot_loc->SetWhole().Assign(sequence::GetId(cds_feat.GetProduct(), NULL)); 2323 codons_loc.Reset(SerialClone(nuc_p.GetLoc())); 2324 } 2325 2326 CRef<CVariation> v(new CVariation); 2327 v->SetData().SetUnknown(); 2328 2329 CRef<CVariantPlacement> prot_p(new CVariantPlacement); 2330 prot_p->SetLoc(*prot_loc); 2331 prot_p->SetMol(CVariantPlacement::eMol_protein); 2332 prot_p->SetExceptions().push_back(CreateException( 2333 "Cannot infer consequence; projecting location only", 2334 CVariationException::eCode_inconsistent_consequence)); 2335 v->SetPlacements().push_back(prot_p); 2336 2337 CRef<CVariantPlacement> codons_p(new CVariantPlacement); 2338 codons_p->SetLoc(*codons_loc); 2339 codons_p->SetMol(nuc_p.GetMol()); 2340 v->SetPlacements().push_back(codons_p); 2341 2342 ChangeIdsInPlace(*v, sequence::eGetId_ForceAcc, scope); 2343 2344 CVariationMethod& m = v->SetMethod(); 2345 m.SetMethod().push_back(CVariationMethod::eMethod_E_computational); 2346 2347 if(is_frameshifting) { 2348 v->SetSo_terms().push_back(CVariationUtil::eSO_frameshift_variant); 2349 } 2350 2351 return v; 2352 } 2353 2354 GetCommonPrefixLen(const string & a,const string & b)2355 static size_t GetCommonPrefixLen(const string& a, const string& b) 2356 { 2357 size_t i(0); 2358 while(i < a.size() && i < b.size() && a[i] == b[i]) { 2359 i++; 2360 } 2361 return i; 2362 } 2363 GetCommonSuffixLen(const string & a,const string & b)2364 static size_t GetCommonSuffixLen(const string& a, const string& b) 2365 { 2366 size_t i(0); 2367 while(i < a.size() && i < b.size() && a[a.size() - 1 - i] == b[b.size() - 1 - i]) { 2368 i++; 2369 } 2370 return i; 2371 } 2372 2373 2374 // Return true iff can't use the CDS for mapping between prot and mRNA; HasProblematicExceptions(const CSeq_feat & cds_feat)2375 static bool HasProblematicExceptions(const CSeq_feat& cds_feat) 2376 { 2377 return !cds_feat.IsSetExcept() ? false 2378 : !cds_feat.GetExcept() ? false 2379 : !cds_feat.IsSetExcept_text() ? true 2380 : (int)cds_feat.GetExcept_text().size() > ( //not all of except-text is explained by benign exceptions 2381 49 * (NStr::Find(cds_feat.GetExcept_text(), "translation initiation by" /* tRNA-Leu at CUG codon"*/) != NPOS) 2382 + 27 * (NStr::Find(cds_feat.GetExcept_text(), "mismatches in translation") != NPOS) 2383 + 20 * (NStr::Find(cds_feat.GetExcept_text(), "ribosomal slippage") != NPOS)); 2384 } 2385 2386 TranslateNAtoAA(const CVariation_inst & nuc_inst,const CVariantPlacement & nuc_p,const CSeq_feat & cds_feat)2387 CRef<CVariation> CVariationUtil::TranslateNAtoAA( 2388 const CVariation_inst& nuc_inst, 2389 const CVariantPlacement& nuc_p, 2390 const CSeq_feat& cds_feat) 2391 { 2392 const bool verbose = false; 2393 2394 if(verbose) NcbiCerr << "TranslateNAtoAA Inputs:\n" << MSerial_AsnText 2395 << nuc_inst << "\n\n" << MSerial_AsnText << nuc_p 2396 << "\n\n" << MSerial_AsnText << cds_feat << endl; 2397 2398 if(!sequence::IsSameBioseq(sequence::GetId(nuc_p.GetLoc(), NULL), 2399 sequence::GetId(cds_feat.GetLocation(), NULL), 2400 m_scope)) { 2401 NCBI_THROW(CException, eUnknown, "Placement and CDS are on different seqs"); 2402 } 2403 2404 //create an inst-variation from the provided inst with the single specified placement 2405 CRef<CVariation> v(new CVariation); 2406 v->SetData().SetInstance().Assign(nuc_inst); 2407 v->ResetPlacements(); 2408 2409 CRef<CVariantPlacement> p(new CVariantPlacement); 2410 p->Assign(nuc_p); 2411 p->ResetSeq(); //Will recalculate after adjusting to codon boundary 2412 v->SetPlacements().push_back(p); 2413 2414 //normalize to delins form so we can deal with it uniformly 2415 try { 2416 ChangeToDelins(*v); 2417 } 2418 catch(...) { 2419 //e.g. NM_000384.2:c.905-1_905dupGG - can't convert t odelins 2420 2421 return CreateUnknownProtConsequenceVariation( 2422 nuc_p, 2423 cds_feat, 2424 false, 2425 *m_scope); 2426 } 2427 2428 const CDelta_item& nuc_delta = *v->GetData().GetInstance().GetDelta().front(); 2429 2430 // note: using type long instead of TSignedSeqPos is a bug on 64-bit systems: 2431 // the result of the subtraction that's expected to be negative would be a 2432 // BIG_NUMBER that is cast to type long without the wrap-around into negatives. 2433 TSignedSeqPos nuc_delta_len = nuc_delta.GetSeq().GetLiteral().GetLength() 2434 - sequence::GetLength(p->GetLoc(), NULL); 2435 2436 2437 if(!SameOrientation(sequence::GetStrand(cds_feat.GetLocation(), NULL), 2438 sequence::GetStrand(p->GetLoc(), NULL))) { 2439 //need the variation and cds to be in the same orientation, such that 2440 //sequence represents the actual codons and AAs when mapped to protein 2441 FlipStrand(*v); 2442 } 2443 2444 if(verbose) NcbiCerr << "Normalized variation: " << MSerial_AsnText << *v; 2445 2446 2447 //Map to protein and back to get the affected codons. 2448 //Note: need the scope because the cds_feat is probably gi-based, 2449 //while our locs are probably accver-based 2450 CRef<CSeq_loc_Mapper> nuc2prot_mapper, prot2nuc_mapper; 2451 CRef<CSeq_loc> prot_loc, codons_loc; 2452 try { 2453 nuc2prot_mapper.Reset(new CSeq_loc_Mapper( 2454 cds_feat, 2455 CSeq_loc_Mapper::eLocationToProduct, 2456 m_scope)); 2457 prot2nuc_mapper.Reset(new CSeq_loc_Mapper(cds_feat, 2458 CSeq_loc_Mapper::eProductToLocation, 2459 m_scope)); 2460 2461 prot_loc = nuc2prot_mapper->Map(p->GetLoc()); 2462 codons_loc = prot2nuc_mapper->Map(*prot_loc); 2463 } 2464 catch(CException&) { 2465 // may legitimately throw if feature 2466 // is not good for mapping (e.g. partial). 2467 } 2468 2469 const bool is_within_cds = Contains(cds_feat.GetLocation(), nuc_p.GetLoc(), m_scope); 2470 2471 bool is_ok = !nuc_p.IsSetStart_offset() 2472 && !nuc_p.IsSetStop_offset() 2473 && is_within_cds 2474 && !codons_loc.IsNull() 2475 && !codons_loc->IsNull() 2476 && nuc_delta.GetSeq().IsLiteral() 2477 && nuc_delta.GetSeq().GetLiteral().IsSetSeq_data(); 2478 2479 int frameshift_phase = nuc_delta_len % 3; 2480 if(frameshift_phase < 0) { 2481 frameshift_phase += 3; 2482 } 2483 2484 if(!is_ok) { 2485 return CreateUnknownProtConsequenceVariation( 2486 nuc_p, 2487 cds_feat, 2488 frameshift_phase != 0, 2489 *m_scope); 2490 } 2491 2492 2493 string downstream_cds_suffix_seq_str; //cds sequence downstream of affected codons 2494 {{ 2495 //extend cds-loc downstream by a little bit to allow inferring changes in the stop-codon 2496 //(e.g. base in the stop-codon is deleted, and the sequence downstream of cds is now involved) 2497 CRef<CSeq_loc> ext_cds_loc; 2498 {{ 2499 SFlankLocs flocs = CreateFlankLocs(cds_feat.GetLocation(), 6); 2500 ext_cds_loc = sequence::Seq_loc_Add( 2501 cds_feat.GetLocation(), 2502 *flocs.downstream, 2503 CSeq_loc::fSortAndMerge_All, NULL); 2504 }} 2505 2506 CRef<CSeq_loc> downstream_cds_loc; 2507 {{ 2508 SFlankLocs flocs = CreateFlankLocs(*codons_loc, 1000); 2509 downstream_cds_loc = ext_cds_loc->Intersect( 2510 *flocs.downstream, 2511 CSeq_loc::fSortAndMerge_All, NULL); 2512 }} 2513 2514 CRef<CSeq_literal> literal = x_GetLiteralAtLoc(*downstream_cds_loc); 2515 if(verbose) NcbiCerr << "Downstream-cds: " << MSerial_AsnText << *literal; 2516 2517 if(literal->GetLength() > 0) { 2518 downstream_cds_suffix_seq_str = literal->GetSeq_data().GetIupacna().Get(); 2519 } 2520 }} 2521 2522 TSignedSeqPos frame = abs( 2523 (TSignedSeqPos)p->GetLoc().GetStart(eExtreme_Biological) 2524 - (TSignedSeqPos)codons_loc->GetStart(eExtreme_Biological)); 2525 2526 codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL)); 2527 // restore the original id, as mapping forward and back may have changed the type 2528 2529 ChangeIdsInPlace(*prot_loc, sequence::eGetId_ForceAcc, *m_scope); 2530 // not strictly required, but generally prefer accvers in the output for readability 2531 // as we're dealing with various types of mols 2532 2533 x_AdjustDelinsToInterval(*v, *codons_loc); 2534 AttachSeq(*v, 100000); //need enough sequnece to get create reference and variant translations downstream 2535 2536 if(!v->GetPlacements().front()->GetSeq().IsSetSeq_data()) { 2537 return CreateUnknownProtConsequenceVariation( 2538 nuc_p, 2539 cds_feat, 2540 frameshift_phase != 0, 2541 *m_scope); 2542 } 2543 2544 if(verbose) NcbiCerr << "Adjusted-for-codons " << MSerial_AsnText << *v; 2545 2546 string nuc_ref_prefix = v->GetPlacements().front()->GetSeq().GetSeq_data().GetIupacna().Get(); 2547 2548 const CSeq_literal& nuc_var_literal = v->GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral(); 2549 string nuc_var_prefix = nuc_var_literal.GetLength() == 0 ? "" : nuc_var_literal.GetSeq_data().GetIupacna().Get(); 2550 2551 string nuc_ref_str = nuc_ref_prefix + downstream_cds_suffix_seq_str; 2552 string nuc_var_str = nuc_var_prefix + downstream_cds_suffix_seq_str; 2553 2554 string prot_ref_str = Translate(nuc_ref_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion); 2555 string prot_var_str = Translate(nuc_var_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion); 2556 int num_ref_codons = (nuc_ref_prefix.size() + 2) / 3; 2557 int num_var_codons = (nuc_var_prefix.size() + 2) / 3; 2558 2559 2560 if(verbose) NcbiCerr << "nuc_ref_str: " << nuc_ref_str << "\n" 2561 << "nuc_var_str: " << nuc_var_str << "\n"; 2562 2563 2564 if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << "\n" 2565 << "prot_var_str: " << prot_var_str << "\n"; 2566 2567 2568 int common_prot_prefix_len(0); //will calculate length of unchanged leading AAs in translations (starting with codons of interest) 2569 2570 2571 if(prot_ref_str == prot_var_str) { 2572 //No-change variation. Will truncate to original counts of affected codons, such that the 2573 //no-change variation describes the original codons that did not affect the translation. 2574 prot_ref_str.resize(min(static_cast<int>(prot_ref_str.size()), num_ref_codons)); 2575 prot_var_str.resize(prot_ref_str.size()); 2576 2577 if(prot_ref_str.size() > 0 && *prot_ref_str.rbegin() == '*') { 2578 //If translated all the way to stop with no-changes, this is no longer a frameshift, even though the indel is not mod%3 2579 frameshift_phase = 0; 2580 } 2581 2582 } else { 2583 2584 //Justify the variation to first affected AA. 2585 // 2586 //Truncate common prefix of translations; truncate the 5'-end of the prot-loc and recalculate codons-loc accordingly. 2587 //Rationale: nucleotide level might not affect translation at the affected codons' location, but change the translation 2588 //downstream. That's why we have to skip unchanged translation to find the first affected AA. 2589 // 2590 //This does not apply to synonymous changes where there's no change at all rather than a downstream change. 2591 2592 common_prot_prefix_len = GetCommonPrefixLen(prot_ref_str, prot_var_str); 2593 if(common_prot_prefix_len > 0 2594 && common_prot_prefix_len == static_cast<int>(prot_ref_str.size())) { 2595 common_prot_prefix_len -= 1; // when truncating common prefx, don't want to truncate all the way 2596 // will leave last position as necessary 2597 } 2598 2599 prot_ref_str = prot_ref_str.substr(common_prot_prefix_len); 2600 prot_var_str = prot_var_str.substr(common_prot_prefix_len); 2601 2602 if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n" 2603 << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n"; 2604 2605 if(frameshift_phase == 0) { 2606 2607 size_t suffix_len = GetCommonSuffixLen(prot_ref_str, prot_var_str); 2608 size_t min_len = min(prot_ref_str.size(), prot_var_str.size()); 2609 size_t ref_stop_pos = prot_ref_str.find('*'); 2610 size_t var_stop_pos = prot_var_str.find('*'); 2611 size_t min_stop_pos = min(ref_stop_pos, var_stop_pos); 2612 2613 // VAR-699, VAR-872 2614 bool truncate_at_stop = min_stop_pos < min_len 2615 && ref_stop_pos != var_stop_pos 2616 && nuc_delta_len == 0; 2617 2618 if(truncate_at_stop) { 2619 prot_ref_str.resize(min_stop_pos + 1); 2620 prot_var_str.resize(min_stop_pos + 1); 2621 } else { 2622 prot_ref_str.resize(prot_ref_str.size() - suffix_len); 2623 prot_var_str.resize(prot_var_str.size() - suffix_len); 2624 } 2625 } else { 2626 //Keep the frst AA in case of frameshifts 2627 //NM_000492.3:c.3528delC -> NP_000483.3:p.Lys1177Serfs instead of NP_000483.3:p.Lys1177delfs 2628 prot_ref_str.resize(min(static_cast<size_t>(1), prot_ref_str.size())); 2629 prot_var_str.resize(min(static_cast<size_t>(1), prot_var_str.size())); 2630 } 2631 2632 //adjust the protein location. 2633 prot_loc = sequence::Seq_loc_Merge(*prot_loc, CSeq_loc::fMerge_SingleRange, NULL); //to convert point to interval 2634 if(prot_ref_str.size() == 0) { 2635 //After truncating common prefix and suffix the reference is reduced to nothing - we have pure insertion. 2636 //We need 2-AA location describing the point of insertion 2637 prot_loc->SetInt().SetFrom() += common_prot_prefix_len - 1; 2638 prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + 1); 2639 } else { 2640 //the location describes the sequence that's being changed 2641 prot_loc->SetInt().SetFrom() += common_prot_prefix_len; 2642 prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + prot_ref_str.size() - 1); 2643 } 2644 prot_loc = sequence::Seq_loc_Merge(*prot_loc, 0, NULL); //to convert single-pos int to point, as appropriate 2645 2646 codons_loc = prot2nuc_mapper->Map(*prot_loc); 2647 2648 if(codons_loc->IsNull()) { 2649 // may have gone past the end of the CDS, e.g. NG_011646.1:g.1508delT 2650 return CreateUnknownProtConsequenceVariation( 2651 nuc_p, 2652 cds_feat, 2653 frameshift_phase != 0, 2654 *m_scope); 2655 } 2656 2657 2658 codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL)); 2659 2660 if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n" 2661 << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n"; 2662 } 2663 2664 2665 if(verbose) NcbiCerr << "reference codons: " << num_ref_codons 2666 << "; variant codons: " << num_var_codons 2667 << "; common prefix: " << common_prot_prefix_len << "\n"; 2668 2669 2670 //if(verbose) NcbiCerr << "prot_ref : " << prot_ref_str << "\n" << "prot_delta: " << prot_var_str << "\ntruncated prefix: " << common_prot_prefix_len << "\n Adjusted codons loc: " << MSerial_AsnText << *codons_loc; 2671 2672 //Constructing protein-variation 2673 2674 CRef<CVariation> prot_v(new CVariation); 2675 2676 //will have two placements: one describing the AA, and the other describing codons 2677 CRef<CVariantPlacement> prot_p(new CVariantPlacement); 2678 2679 if(HasProblematicExceptions(cds_feat)) { 2680 //mapped protein position is bogus, as cds is either partial or there are indels. 2681 prot_p->SetLoc().SetEmpty().Assign(sequence::GetId(*prot_loc, NULL)); 2682 prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL))); 2683 prot_p->SetSeq().SetLength(prot_ref_str.size()); 2684 prot_p->SetSeq().SetSeq_data().SetNcbieaa().Set(prot_ref_str); 2685 } else { 2686 prot_p->SetLoc(*prot_loc); 2687 prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL))); 2688 AttachSeq(*prot_p); 2689 } 2690 prot_v->SetPlacements().push_back(prot_p); 2691 2692 2693 CRef<CVariantPlacement> codons_p(new CVariantPlacement); 2694 codons_p->SetLoc(*codons_loc); 2695 codons_p->SetMol(GetMolType(sequence::GetId(codons_p->GetLoc(), NULL))); 2696 codons_p->SetFrame(frame); 2697 AttachSeq(*codons_p); 2698 prot_v->SetPlacements().push_back(codons_p); 2699 2700 2701 //calculate properties 2702 {{ 2703 if(frameshift_phase == 0 && prot_ref_str.size() == prot_var_str.size()) { 2704 //VAR-267 - calculate missense/synonymous/stop-gain/loss for non-frameshifting and non-length-changing cases only 2705 CVariantProperties::TEffect prop = CalcEffectForProt(prot_ref_str, prot_var_str); 2706 if(prop != 0) { 2707 prot_v->SetVariant_prop().SetEffect(prop); 2708 } 2709 } 2710 2711 TSOTerms so_terms = CalcSOTermsForProt(nuc_delta_len, 2712 prot_ref_str, 2713 prot_var_str); 2714 copy(so_terms.begin(), so_terms.end(), back_inserter(prot_v->SetSo_terms())); 2715 }} 2716 2717 2718 prot_v->SetData().SetInstance().SetType(CalcInstTypeForAA(prot_ref_str, prot_var_str)); 2719 2720 //set inst data 2721 CRef<CDelta_item> di(new CDelta_item); 2722 prot_v->SetData().SetInstance().SetDelta().push_back(di); 2723 2724 if(prot_var_str.size() > 0) { 2725 2726 // Use NA alphabet instead of AA, as the dbSNP requested original codons. 2727 // The downstream process can always translate this to AAs. 2728 // 2729 // Note, however, that we cannot always use the original variant codons sequence, because 2730 // the location may have been adjusted downstream (see comments at TruncateCommonPrefixAndSuffix). 2731 // So we'll have to get this from nuc_var_str. 2732 if(false && common_prot_prefix_len == 0) { 2733 di->SetSeq().Assign(v->GetData().GetInstance().GetDelta().front()->GetSeq()); 2734 } else { 2735 2736 if(verbose) NcbiCerr 2737 << "inst-type: " << prot_v->GetData().GetInstance().GetType() 2738 << "; nuc_var_len: " << nuc_var_str.size() 2739 << "; nuc_var_str: " << nuc_var_str 2740 << "; prefix_len: " << common_prot_prefix_len * 3 2741 << "; var_codons:" << prot_var_str.size() * 3 << "\n"; 2742 2743 // Note: need to take min here, because common_prot_prefix_len * 3 may exceed 2744 // nuc_var_str.size() due to translation of a partial codon. 2745 string adjusted_codons_str = nuc_var_str.substr( 2746 min<int>(nuc_var_str.size(), common_prot_prefix_len * 3), 2747 prot_var_str.size() * 3); 2748 2749 if(adjusted_codons_str.size() > 0) { 2750 di->SetSeq().SetLiteral().SetLength(adjusted_codons_str.size()); 2751 di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set() = adjusted_codons_str; 2752 } else { 2753 di->SetSeq().SetThis(); 2754 di->SetAction(CDelta_item::eAction_del_at); 2755 } 2756 } 2757 2758 if(prot_ref_str.size() == 0) { 2759 di->SetAction(CDelta_item::eAction_ins_before); 2760 } 2761 } else { 2762 di->SetSeq().SetThis(); 2763 di->SetAction(CDelta_item::eAction_del_at); 2764 } 2765 2766 //set frameshift 2767 if(frameshift_phase != 0) { 2768 //note: SetEffect() contains magic-value in debug mode by default, not 0. 2769 //prot_v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_frameshift; 2770 prot_v->SetVariant_prop().SetEffect( 2771 CVariantProperties::eEffect_frameshift 2772 | (prot_v->IsSetVariant_prop() 2773 && prot_v->GetVariant_prop().IsSetEffect() 2774 ? prot_v->GetVariant_prop().GetEffect() : 0)); 2775 2776 prot_v->SetFrameshift().SetPhase(frameshift_phase); 2777 } 2778 2779 if(verbose) NcbiCerr << "protein variation:" << MSerial_AsnText << *prot_v; 2780 if(verbose) NcbiCerr << "Done with protein variation\n"; 2781 2782 return prot_v; 2783 } 2784 2785 2786 2787 //////////////////////////////////////////////////////////////////////////////// 2788 // 2789 // SO-terms calculations 2790 // 2791 //////////////////////////////////////////////////////////////////////////////// 2792 2793 AsSOTerms(const CVariantProperties & p,TSOTerms & terms)2794 void CVariationUtil::AsSOTerms(const CVariantProperties& p, TSOTerms& terms) 2795 { 2796 if(p.GetGene_location() & CVariantProperties::eGene_location_intergenic) { 2797 terms.push_back(eSO_intergenic_variant); 2798 } 2799 if(p.GetGene_location() & CVariantProperties::eGene_location_near_gene_5) { 2800 terms.push_back(eSO_2KB_upstream_variant); 2801 } 2802 if(p.GetGene_location() & CVariantProperties::eGene_location_near_gene_3) { 2803 terms.push_back(eSO_500B_downstream_variant); 2804 } 2805 if(p.GetGene_location() & CVariantProperties::eGene_location_donor) { 2806 terms.push_back(eSO_splice_donor_variant); 2807 } 2808 if(p.GetGene_location() & CVariantProperties::eGene_location_acceptor) { 2809 terms.push_back(eSO_splice_acceptor_variant); 2810 } 2811 if(p.GetGene_location() & CVariantProperties::eGene_location_intron) { 2812 terms.push_back(eSO_intron_variant); 2813 } 2814 if(p.GetGene_location() & CVariantProperties::eGene_location_utr_5) { 2815 terms.push_back(eSO_5_prime_UTR_variant); 2816 } 2817 if(p.GetGene_location() & CVariantProperties::eGene_location_utr_3) { 2818 terms.push_back(eSO_3_prime_UTR_variant); 2819 } 2820 if(p.GetGene_location() & CVariantProperties::eGene_location_conserved_noncoding) { 2821 terms.push_back(eSO_nc_transcript_variant); 2822 } 2823 if(p.GetGene_location() & CVariantProperties::eGene_location_in_start_codon) { 2824 terms.push_back(eSO_initiator_codon_variant); 2825 } 2826 if(p.GetGene_location() & CVariantProperties::eGene_location_in_stop_codon) { 2827 terms.push_back(eSO_terminator_codon_variant); 2828 } 2829 2830 if(p.GetEffect() & CVariantProperties::eEffect_frameshift) { 2831 terms.push_back(eSO_frameshift_variant); 2832 } 2833 if(p.GetEffect() & CVariantProperties::eEffect_missense) { 2834 terms.push_back(eSO_missense_variant); 2835 } 2836 2837 if(p.GetEffect() & CVariantProperties::eEffect_nonsense || p.GetEffect() & CVariantProperties::eEffect_stop_gain) { 2838 terms.push_back(eSO_stop_gained); 2839 } 2840 if(p.GetEffect() & CVariantProperties::eEffect_nonsense || p.GetEffect() & CVariantProperties::eEffect_stop_loss) { 2841 terms.push_back(eSO_stop_lost); 2842 } 2843 if(p.GetEffect() & CVariantProperties::eEffect_synonymous) { 2844 terms.push_back(eSO_synonymous_variant); 2845 } 2846 } 2847 AsString(ESOTerm term)2848 string CVariationUtil::AsString(ESOTerm term) 2849 { 2850 return 2851 term == eSO_intergenic_variant ? "intergenic_variant" 2852 : term == eSO_2KB_upstream_variant ? "2KB_upstream_variant" 2853 : term == eSO_500B_downstream_variant ? "500B_downstream_variant" 2854 : term == eSO_splice_donor_variant ? "splice_donor_variant" 2855 : term == eSO_splice_acceptor_variant ? "splice_acceptor_varian" 2856 : term == eSO_intron_variant ? "intron_variant" 2857 : term == eSO_5_prime_UTR_variant ? "5_prime_UTR_variant" 2858 : term == eSO_3_prime_UTR_variant ? "3_prime_UTR_variant" 2859 : term == eSO_coding_sequence_variant ? "coding_sequence_variant" 2860 : term == eSO_nc_transcript_variant ? "nc_transcript_variant" 2861 : term == eSO_initiator_codon_variant ? "initiator_codon_variant" 2862 : term == eSO_terminator_codon_variant ? "terminator_codon_variant" 2863 2864 : term == eSO_synonymous_variant ? "synonymous_variant" 2865 : term == eSO_missense_variant ? "missense_variant" 2866 : term == eSO_frameshift_variant ? "frameshift_variant" 2867 : term == eSO_inframe_indel ? "inframe_indel" 2868 : term == eSO_stop_gained ? "stop_gained" 2869 : term == eSO_stop_lost ? "stop_lost" 2870 : "other_variant"; 2871 }; 2872 2873 2874 s_GetPlacements(const CVariation & v)2875 const CVariation::TPlacements* CVariationUtil::s_GetPlacements(const CVariation& v) 2876 { 2877 if(v.IsSetPlacements()) { 2878 return &v.GetPlacements(); 2879 } else if(v.GetParent()) { 2880 return s_GetPlacements(*v.GetParent()); 2881 } else { 2882 return NULL; 2883 } 2884 } 2885 s_FindFirstLiteral(const CVariation & v)2886 const CConstRef<CSeq_literal> CVariationUtil::s_FindFirstLiteral(const CVariation& v) 2887 { 2888 const CVariation::TPlacements* placements = s_GetPlacements(v); 2889 ITERATE(CVariation::TPlacements, it, *placements) 2890 { 2891 const CVariantPlacement& p = **it; 2892 if(p.IsSetSeq()) { 2893 return CConstRef<CSeq_literal>(&p.GetSeq()); 2894 } 2895 } 2896 return CConstRef<CSeq_literal>(NULL); 2897 } 2898 s_FindAssertedLiteral(const CVariation & v)2899 const CConstRef<CSeq_literal> CVariationUtil::s_FindAssertedLiteral(const CVariation& v) 2900 { 2901 const CVariation* parent = v.GetParent(); 2902 if(parent == NULL) { 2903 return CConstRef<CSeq_literal>(NULL); 2904 } else { 2905 if(parent->GetData().IsSet()) { 2906 ITERATE(CVariation::TData::TSet::TVariations, it, parent->GetData().GetSet().GetVariations()) 2907 { 2908 const CVariation& sibling = **it; 2909 if(sibling.GetData().IsInstance() 2910 && sibling.GetData().GetInstance().IsSetObservation() 2911 && sibling.GetData().GetInstance().GetObservation() == CVariation_inst::eObservation_asserted 2912 && sibling.GetData().GetInstance().GetDelta().size() == 1 2913 && sibling.GetData().GetInstance().GetDelta().front()->IsSetSeq() 2914 && sibling.GetData().GetInstance().GetDelta().front()->GetSeq().IsLiteral()) { 2915 return CConstRef<CSeq_literal>(&sibling.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral()); 2916 } 2917 } 2918 } 2919 2920 return s_FindAssertedLiteral(*parent); 2921 } 2922 } 2923 2924 Equals(const CVariation::TPlacements & p1,const CVariation::TPlacements & p2)2925 static bool Equals( 2926 const CVariation::TPlacements& p1, 2927 const CVariation::TPlacements& p2) 2928 { 2929 if(p1.size() != p2.size()) { 2930 return false; 2931 } 2932 CVariation::TPlacements::const_iterator it1 = p1.begin(); 2933 CVariation::TPlacements::const_iterator it2 = p2.begin(); 2934 2935 for(; it1 != p1.end() && it2 != p2.end(); ++it1, ++it2) { 2936 const CVariantPlacement& p1 = **it1; 2937 const CVariantPlacement& p2 = **it2; 2938 if(!p1.Equals(p2)) { 2939 return false; 2940 } 2941 } 2942 return true; 2943 } 2944 s_FactorOutPlacements(CVariation & v)2945 void CVariationUtil::s_FactorOutPlacements(CVariation& v) 2946 { 2947 if(!v.GetData().IsSet()) { 2948 return; 2949 } 2950 2951 NON_CONST_ITERATE( 2952 CVariation::TData::TSet::TVariations, it, 2953 v.SetData().SetSet().SetVariations()) 2954 { 2955 s_FactorOutPlacements(**it); 2956 } 2957 2958 CVariation::TPlacements* p1 = NULL; 2959 2960 bool ok = true; 2961 NON_CONST_ITERATE( 2962 CVariation::TData::TSet::TVariations, it, 2963 v.SetData().SetSet().SetVariations()) 2964 { 2965 CVariation& v2 = **it; 2966 if(!v2.IsSetPlacements()) { 2967 ok = false; 2968 break; 2969 } else if(!p1) { 2970 p1 = &v2.SetPlacements(); 2971 continue; 2972 } else { 2973 if(!Equals(*p1, v2.GetPlacements())) { 2974 ok = false; 2975 break; 2976 } 2977 } 2978 } 2979 2980 if(ok && p1) { 2981 //transfer p1 placements to this level 2982 NON_CONST_ITERATE(CVariation::TPlacements, it, *p1) 2983 { 2984 v.SetPlacements().push_back(*it); 2985 } 2986 2987 //reset placements at the children level 2988 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, 2989 v.SetData().SetSet().SetVariations()) 2990 { 2991 CVariation& v2 = **it; 2992 v2.ResetPlacements(); 2993 } 2994 } 2995 } 2996 2997 s_FindConsequenceForPlacement(const CVariation & v,const CVariantPlacement & p)2998 CConstRef<CVariation> CVariationUtil::s_FindConsequenceForPlacement( 2999 const CVariation& v, 3000 const CVariantPlacement& p) 3001 { 3002 CConstRef<CVariation> cons_v(NULL); 3003 if(v.IsSetConsequence() && p.GetLoc().GetId()) { 3004 ITERATE(CVariation::TConsequence, it, v.GetConsequence()) 3005 { 3006 const CVariation::TConsequence::value_type::TObjectType& cons = **it; 3007 if(cons.IsVariation() 3008 && cons.GetVariation().IsSetPlacements()) { 3009 ITERATE(CVariation::TPlacements, it2, cons.GetVariation().GetPlacements()) 3010 { 3011 const CVariantPlacement& vp = **it2; 3012 if(vp.GetLoc().GetId() 3013 && p.GetLoc().GetId() 3014 && vp.GetLoc().GetId()->Equals(*p.GetLoc().GetId())) { 3015 cons_v.Reset(&cons.GetVariation()); 3016 break; 3017 } 3018 } 3019 } 3020 } 3021 } 3022 3023 if(!cons_v && v.GetData().IsSet()) { 3024 ITERATE(CVariation::TData::TSet::TVariations, it, 3025 v.GetData().GetSet().GetVariations()) 3026 { 3027 CConstRef<CVariation> cons_v1 = s_FindConsequenceForPlacement(**it, p); 3028 if(cons_v1) { 3029 cons_v = cons_v1; 3030 break; 3031 } 3032 } 3033 } 3034 3035 return cons_v; 3036 } 3037 3038 s_AttachGeneIDdbxref(CVariantPlacement & p,int gene_id)3039 void CVariationUtil::s_AttachGeneIDdbxref(CVariantPlacement& p, int gene_id) 3040 { 3041 ITERATE(CVariantPlacement::TDbxrefs, it, p.SetDbxrefs()) 3042 { 3043 const CDbtag& dbtag = **it; 3044 if(dbtag.GetDb() == "GeneID" 3045 && dbtag.GetTag().IsId() 3046 && dbtag.GetTag().GetId() == gene_id) { 3047 return; 3048 } 3049 } 3050 3051 CRef<CDbtag> dbtag(new CDbtag()); 3052 dbtag->SetDb("GeneID"); 3053 dbtag->SetTag().SetId(gene_id); 3054 p.SetDbxrefs().push_back(dbtag); 3055 } 3056 SetPlacementProperties(CVariantPlacement & placement)3057 void CVariationUtil::SetPlacementProperties(CVariantPlacement& placement) 3058 { 3059 if(!placement.IsSetGene_location()) { 3060 // need to zero-out the bitmask, 3061 // otherwise in debug mode it will be preset to a magic value, 3062 // and then modifying it with "|=" will produce garbage. 3063 placement.SetGene_location(0); 3064 } 3065 3066 3067 // for offset-style intronic locations (not genomic and have offset), 3068 // can infer where we are based on offset 3069 if(!placement.IsSetMol() 3070 || placement.GetMol() != CVariantPlacement::eMol_genomic) { 3071 CBioseq_Handle bsh = m_scope->GetBioseqHandle( 3072 sequence::GetId(placement.GetLoc(), NULL)); 3073 3074 if(placement.IsSetStart_offset() 3075 && placement.GetStart_offset() != 0) { 3076 x_SetVariantPropertiesForIntronic( 3077 placement, 3078 placement.GetStart_offset(), 3079 placement.GetLoc(), 3080 bsh); 3081 } 3082 3083 if(placement.IsSetStop_offset() 3084 && placement.GetStop_offset() != 0) { 3085 x_SetVariantPropertiesForIntronic( 3086 placement, 3087 placement.GetStop_offset(), 3088 placement.GetLoc(), 3089 bsh); 3090 } 3091 } 3092 3093 CVariantPropertiesIndex::TGeneIDAndPropVector v; 3094 m_variant_properties_index.GetLocationProperties(placement.GetLoc(), v); 3095 3096 // note: this assumes that the offsets are HGVS-spec compliant: 3097 // anchor locations are at the exon terminals, and the 3098 // offset values point into the intron. 3099 bool is_completely_intronic = false; 3100 {{ 3101 bool is_start_offset = placement.IsSetStart_offset() 3102 && placement.GetStart_offset() != 0; 3103 3104 bool is_stop_offset = placement.IsSetStop_offset() 3105 && placement.GetStop_offset() != 0; 3106 3107 // Single anchor point, and have any offset. 3108 bool is_case1 = sequence::GetLength(placement.GetLoc(), NULL) == 1 3109 && (is_start_offset || is_stop_offset); 3110 3111 // Other possibility is when start and stop are addressed from different exons: 3112 // The location length must be 2 (end of one exon and start of the other), 3113 // and the offsets point inwards. 3114 bool is_case2 = sequence::GetLength(placement.GetLoc(), NULL) == 2 3115 && is_start_offset && placement.GetStart_offset() > 0 3116 && is_stop_offset && placement.GetStop_offset() < 0; 3117 3118 is_completely_intronic = is_case1 || is_case2; 3119 }} 3120 3121 // collapse all gene-specific location properties into prop. 3122 // Will do it in three rounds so that more important properties 3123 // will have their gene-ids on top of the list: VAR-1297 3124 // First: ignore near-gene-or-intron. 3125 // Second: ignore near-gene. 3126 // Third: don't ignore any property. 3127 for(size_t i = 0; i < 3; i++) { 3128 static const CVariantProperties::TGene_location flags[3] = { 3129 CVariantProperties::eGene_location_near_gene_3 3130 | CVariantProperties::eGene_location_near_gene_5 3131 | CVariantProperties::eGene_location_intron 3132 , 3133 CVariantProperties::eGene_location_near_gene_3 3134 | CVariantProperties::eGene_location_near_gene_5 3135 , 3136 0 }; 3137 3138 ITERATE(CVariantPropertiesIndex::TGeneIDAndPropVector, it, v) 3139 { 3140 int gene_id = it->first; 3141 CVariantProperties::TGene_location loc_prop = it->second; 3142 3143 if(loc_prop & flags[i]) { 3144 continue; 3145 } 3146 3147 if(gene_id != 0) { 3148 s_AttachGeneIDdbxref(placement, gene_id); 3149 } 3150 3151 if(!is_completely_intronic) { 3152 //we don't want to compute properties for intronic anchor points. VAR-149 3153 placement.SetGene_location() |= loc_prop; 3154 } 3155 } 3156 } 3157 3158 } 3159 3160 FindLocationProperties(const CSeq_align & transcript_aln,const CSeq_loc & query_loc,TSOTerms & terms)3161 void CVariationUtil::FindLocationProperties( 3162 const CSeq_align& transcript_aln, 3163 const CSeq_loc& query_loc, 3164 TSOTerms& terms) 3165 { 3166 //note: initializing mapper with scope because the annotation from scope is gi-based, 3167 //while the parameters are normaly accver-based 3168 CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(transcript_aln, 1, m_scope)); 3169 3170 CConstRef<CSeq_loc> genomic_query_loc; 3171 if(query_loc.GetId() && query_loc.GetId()->Equals(transcript_aln.GetSeq_id(0))) { 3172 genomic_query_loc = mapper->Map(query_loc); 3173 } else { 3174 genomic_query_loc.Reset(&query_loc); 3175 } 3176 3177 CRef<CSeq_loc> rna_loc = transcript_aln.CreateRowSeq_loc(1); 3178 3179 CRef<CSeq_loc> cds_loc; 3180 {{ 3181 CBioseq_Handle bsh = m_scope->GetBioseqHandle(transcript_aln.GetSeq_id(0)); 3182 for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) { 3183 const CMappedFeat& mf = *ci; 3184 cds_loc = mapper->Map(mf.GetLocation()); 3185 3186 //remove indels from the mapped cds loc 3187 cds_loc = sequence::Seq_loc_Merge(*cds_loc, CSeq_loc::fMerge_SingleRange, NULL); 3188 cds_loc = rna_loc->Intersect(*cds_loc, 0, NULL); 3189 break; 3190 } 3191 }} 3192 3193 s_FindLocationProperties(rna_loc, cds_loc, *genomic_query_loc, terms); 3194 } 3195 3196 s_FindLocationProperties(CConstRef<CSeq_loc> rna_loc,CConstRef<CSeq_loc> cds_loc,const CSeq_loc & query_loc,CVariationUtil::TSOTerms & terms)3197 void CVariationUtil::s_FindLocationProperties( 3198 CConstRef<CSeq_loc> rna_loc, 3199 CConstRef<CSeq_loc> cds_loc, 3200 const CSeq_loc& query_loc, 3201 CVariationUtil::TSOTerms& terms) 3202 { 3203 struct SPropsMap 3204 { 3205 typedef CRangeMap<CVariationUtil::ESOTerm, TSeqPos> TRangeMap; 3206 typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap; 3207 TIdRangeMap loc_map; 3208 3209 void Add(CVariationUtil::ESOTerm term, const CSeq_loc& loc) 3210 { 3211 for(CSeq_loc_CI ci(loc); ci; ++ci) { 3212 loc_map[ci.GetSeq_id_Handle()][ci.GetRange()] = term; 3213 } 3214 } 3215 } props_map; 3216 3217 typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair; 3218 3219 if(!rna_loc && !cds_loc) { 3220 return; 3221 } 3222 3223 const CSeq_loc& main_loc = rna_loc ? *rna_loc : *cds_loc; 3224 3225 {{ 3226 TLocsPair p = CVariantPropertiesIndex::s_GetNeighborhoodLocs( 3227 main_loc, 3228 main_loc.GetStop(eExtreme_Positional) + 100000); 3229 3230 props_map.Add(eSO_2KB_upstream_variant, *p.first); 3231 props_map.Add(eSO_500B_downstream_variant, *p.second); 3232 }} 3233 3234 {{ 3235 TLocsPair p = CVariantPropertiesIndex::s_GetIntronsAndSpliceSiteLocs(main_loc); 3236 props_map.Add(eSO_intron_variant, *p.first); 3237 size_t i(0); 3238 for(CSeq_loc_CI ci(*p.second, 3239 CSeq_loc_CI::eEmpty_Skip, 3240 CSeq_loc_CI::eOrder_Biological); ci; ++ci) { 3241 props_map.Add((i % 2 ? eSO_splice_acceptor_variant 3242 : eSO_splice_donor_variant), 3243 *ci.GetRangeAsSeq_loc()); 3244 i++; 3245 } 3246 }} 3247 3248 if(!cds_loc) { 3249 props_map.Add(eSO_nc_transcript_variant, *rna_loc); 3250 } else { 3251 props_map.Add(eSO_coding_sequence_variant, *cds_loc); 3252 3253 {{ 3254 TLocsPair p = CVariantPropertiesIndex::s_GetStartAndStopCodonsLocs(*cds_loc); 3255 props_map.Add(eSO_initiator_codon_variant, *p.first); 3256 props_map.Add(eSO_terminator_codon_variant, *p.second); 3257 }} 3258 3259 if(rna_loc) { 3260 TLocsPair p = CVariantPropertiesIndex::s_GetUTRLocs(*cds_loc, *rna_loc); 3261 props_map.Add(eSO_5_prime_UTR_variant, *p.first); 3262 props_map.Add(eSO_3_prime_UTR_variant, *p.second); 3263 } 3264 } 3265 3266 set<CVariationUtil::ESOTerm> terms_set; 3267 for(CSeq_loc_CI ci(query_loc, CSeq_loc_CI::eEmpty_Skip); ci; ++ci) { 3268 const SPropsMap::TRangeMap& rm = props_map.loc_map[ci.GetSeq_id_Handle()]; 3269 for(SPropsMap::TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) { 3270 terms_set.insert(it2->second); 3271 } 3272 } 3273 copy(terms_set.begin(), terms_set.end(), back_inserter(terms)); 3274 } 3275 3276 3277 3278 3279 3280 //transcript length less polyA GetEffectiveTranscriptLength(const CBioseq_Handle & bsh)3281 TSeqPos CVariationUtil::GetEffectiveTranscriptLength(const CBioseq_Handle& bsh) 3282 { 3283 CVariantPlacement::TMol mol = GetMolType(*bsh.GetSeqId()); 3284 if(mol != CVariantPlacement::eMol_rna && mol != CVariantPlacement::eMol_cdna) { 3285 return bsh.GetInst_Length(); 3286 } 3287 3288 SAnnotSelector sel; 3289 sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon); 3290 sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_polyA_site); 3291 3292 TSeqPos last_exon_pos(0); 3293 TSeqPos last_polyA_pos(0); 3294 for(CFeat_CI ci(bsh, sel); ci; ++ci) { 3295 const CMappedFeat& mf = *ci; 3296 if(mf.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) { 3297 last_exon_pos = max(last_exon_pos, sequence::GetStop(mf.GetLocation(), NULL)); 3298 } else { 3299 last_polyA_pos = max(last_polyA_pos, sequence::GetStop(mf.GetLocation(), NULL)); 3300 } 3301 } 3302 3303 return last_exon_pos ? last_exon_pos + 1 3304 : last_polyA_pos ? last_polyA_pos + 1 3305 : bsh.GetInst_Length(); 3306 } 3307 3308 x_SetVariantPropertiesForIntronic(CVariantPlacement & p,int offset,const CSeq_loc & loc,CBioseq_Handle & bsh)3309 void CVariationUtil::x_SetVariantPropertiesForIntronic( 3310 CVariantPlacement& p, 3311 int offset, 3312 const CSeq_loc& loc, 3313 CBioseq_Handle& bsh) 3314 { 3315 if(!p.IsSetGene_location()) { 3316 //need to zero-out the bitmask, otherwise in debug mode it will be preset to a magic value, 3317 //and then modifying it with "|=" will produce garbage. 3318 p.SetGene_location(0); 3319 } 3320 3321 if(sequence::GetStrand(loc, NULL) == eNa_strand_minus) { 3322 offset *= -1; 3323 } 3324 3325 if(loc.GetStop(eExtreme_Positional) + 1 >= GetEffectiveTranscriptLength(bsh) && offset > 0) { 3326 //at the 3'-end; check if near-gene or intergenic 3327 if(offset <= 500) { 3328 p.SetGene_location() |= CVariantProperties::eGene_location_near_gene_3; 3329 } else { 3330 p.SetGene_location() |= CVariantProperties::eGene_location_intergenic; 3331 } 3332 } else if(loc.GetStart(eExtreme_Positional) == 0 && offset < 0) { 3333 //at the 5'-end; check if near-gene or intergenic 3334 if(offset >= -2000) { 3335 p.SetGene_location() |= CVariantProperties::eGene_location_near_gene_5; 3336 } else { 3337 p.SetGene_location() |= CVariantProperties::eGene_location_intergenic; 3338 } 3339 } else { 3340 //intronic or splice 3341 if(offset < 0 && offset >= -2) { 3342 p.SetGene_location() |= CVariantProperties::eGene_location_acceptor; 3343 } else if(offset > 0 && offset <= 2) { 3344 p.SetGene_location() |= CVariantProperties::eGene_location_donor; 3345 } else { 3346 p.SetGene_location() |= CVariantProperties::eGene_location_intron; 3347 } 3348 } 3349 3350 if(p.GetGene_location() == 0) { 3351 p.ResetGene_location(); 3352 } 3353 } 3354 3355 SetVariantProperties(CVariation & v)3356 void CVariationUtil::SetVariantProperties(CVariation& v) 3357 { 3358 v.Index(); 3359 3360 for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) { 3361 CVariation& v2 = *it; 3362 if(!v2.IsSetPlacements()) { 3363 continue; 3364 } 3365 3366 //will collapse placement-specific gene-location properties onto variation properties 3367 if(!v2.SetVariant_prop().IsSetGene_location()) { 3368 v2.SetVariant_prop().SetGene_location(0); //clear out the magic-value 3369 } 3370 3371 NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements()) 3372 { 3373 CVariantPlacement& p = **it2; 3374 SetPlacementProperties(p); 3375 3376 if(v2.GetConsequenceParent()) { 3377 //If this variation is a consequence of a parent variation, we are only interested 3378 //in the product-specific properties (as the consequence variation will have nucleotide-level placement 3379 //but we're not interested in recomputing multiple-genes-specific properties here. 3380 break; 3381 } 3382 3383 v2.SetVariant_prop().SetGene_location() |= p.GetGene_location(); 3384 } 3385 } 3386 } 3387 3388 GetLocationProperties(const CSeq_loc & loc,CVariantPropertiesIndex::TGeneIDAndPropVector & v)3389 void CVariationUtil::CVariantPropertiesIndex::GetLocationProperties( 3390 const CSeq_loc& loc, 3391 CVariantPropertiesIndex::TGeneIDAndPropVector& v) 3392 { 3393 typedef map<int, CVariantProperties::TGene_location> TMap; 3394 TMap m; //will collapse properties per gene-id (TGene_location is a bitmask) 3395 3396 CRef<CSeq_loc> loc2(new CSeq_loc); 3397 loc2->Assign(loc); 3398 ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope); 3399 3400 for(CSeq_loc_CI ci(*loc2); ci; ++ci) { 3401 CSeq_id_Handle idh = ci.GetSeq_id_Handle(); 3402 3403 if(m_loc2prop.find(idh) == m_loc2prop.end()) { 3404 //first time seeing this seq-id - index it 3405 try { 3406 x_Index(idh); 3407 } 3408 catch(CException& e) { 3409 NCBI_RETHROW_SAME(e, "Can't Index " + idh.AsString()); 3410 } 3411 } 3412 3413 TIdRangeMap::const_iterator it = m_loc2prop.find(idh); 3414 if(it == m_loc2prop.end()) { 3415 return; 3416 } 3417 const TRangeMap& rm = it->second; 3418 3419 for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) { 3420 ITERATE(TGeneIDAndPropVector, it3, it2->second) 3421 { 3422 TGeneIDAndProp gene_id_and_prop = *it3; 3423 int gene_id = gene_id_and_prop.first; 3424 CVariantProperties::TGene_location properties = gene_id_and_prop.second; 3425 3426 if(m.find(gene_id) == m.end()) { 3427 m[gene_id] = 0; //need to zero-out the bitmask in debug mode 3428 } 3429 m[gene_id] |= properties; 3430 } 3431 } 3432 } 3433 3434 ITERATE(TMap, it, m) 3435 { 3436 v.push_back(TGeneIDAndProp(it->first, it->second)); 3437 } 3438 } 3439 x_Add(const CSeq_loc & loc,int gene_id,CVariantProperties::TGene_location prop)3440 void CVariationUtil::CVariantPropertiesIndex::x_Add( 3441 const CSeq_loc& loc, 3442 int gene_id, 3443 CVariantProperties::TGene_location prop) 3444 { 3445 for(CSeq_loc_CI ci(loc); ci; ++ci) { 3446 try { 3447 m_loc2prop[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(TGeneIDAndProp(gene_id, prop)); 3448 } 3449 catch(CException& e) { 3450 NcbiCerr << MSerial_AsnText << loc << gene_id << " " << prop; 3451 NCBI_RETHROW_SAME(e, "Can't index location"); 3452 } 3453 } 3454 } 3455 3456 ///Note: this is strand-agnostic 3457 class SFastLocSubtract 3458 { 3459 public: SFastLocSubtract(const CSeq_loc & loc)3460 SFastLocSubtract(const CSeq_loc& loc) 3461 { 3462 for(CSeq_loc_CI ci(loc); ci; ++ci) { 3463 m_rangemap[ci.GetSeq_id_Handle()][ci.GetRange()] = ci.GetRangeAsSeq_loc(); 3464 } 3465 } 3466 operator ()(CSeq_loc & container_loc) const3467 void operator()(CSeq_loc& container_loc) const 3468 { 3469 for(CTypeIterator<CSeq_loc> it(Begin(container_loc)); it; ++it) { 3470 CSeq_loc& loc = *it; 3471 if(!loc.IsInt() && !loc.IsPnt()) { 3472 continue; 3473 } 3474 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*loc.GetId()); 3475 TIdRangeMap::const_iterator it2 = m_rangemap.find(idh); 3476 if(it2 == m_rangemap.end()) { 3477 continue; 3478 } 3479 const TRangeMap& rm = it2->second; 3480 3481 for(TRangeMap::const_iterator it3 = rm.begin(loc.GetTotalRange()); it3.Valid(); ++it3) { 3482 CConstRef<CSeq_loc> loc2 = it3->second; 3483 CRef<CSeq_loc> loc3 = sequence::Seq_loc_Subtract(loc, *loc2, 0, NULL); 3484 loc.Assign(*loc3); 3485 } 3486 } 3487 } 3488 private: 3489 typedef CRangeMap<CConstRef<CSeq_loc>, TSeqPos> TRangeMap; 3490 typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap; 3491 TIdRangeMap m_rangemap; 3492 }; 3493 3494 IsRefSeqGene(const CBioseq_Handle & bsh)3495 static bool IsRefSeqGene(const CBioseq_Handle& bsh) 3496 { 3497 if(!bsh.CanGetDescr()) { 3498 return false; 3499 } 3500 ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get()) 3501 { 3502 const CSeqdesc& desc = **it; 3503 if(desc.IsGenbank()) { 3504 const CGB_block::TKeywords& k = desc.GetGenbank().GetKeywords(); 3505 if(std::find(k.begin(), k.end(), "RefSeqGene") != k.end()) { 3506 return true; 3507 } 3508 } 3509 } 3510 return false; 3511 } 3512 3513 // VAR-239 3514 // If RefSeqGene NG, Return GeneIDs for loci having alignments to transcripts. 3515 // Otherwise, return empty set GetFocusLocusIDs(const CBioseq_Handle & bsh)3516 static set<int> GetFocusLocusIDs(const CBioseq_Handle& bsh) 3517 { 3518 set<int> gene_ids; 3519 if(!IsRefSeqGene(bsh)) { 3520 return gene_ids; 3521 } 3522 3523 set<CSeq_id_Handle> transcript_seq_ids; 3524 for(CAlign_CI ci(bsh); ci; ++ci) { 3525 const CSeq_align& aln = *ci; 3526 if(aln.GetSegs().IsSpliced()) { 3527 transcript_seq_ids.insert(CSeq_id_Handle::GetHandle(aln.GetSeq_id(0))); 3528 } 3529 } 3530 3531 SAnnotSelector sel; 3532 sel.IncludeFeatType(CSeqFeatData::e_Gene); 3533 sel.IncludeFeatType(CSeqFeatData::e_Rna); 3534 sel.IncludeFeatType(CSeqFeatData::e_Cdregion); 3535 for(CFeat_CI ci(bsh, sel); ci; ++ci) { 3536 const CMappedFeat& mf = *ci; 3537 if(!mf.IsSetProduct() || !mf.IsSetDbxref()) { 3538 continue; 3539 } 3540 3541 CSeq_id_Handle product_id = CSeq_id_Handle::GetHandle( 3542 sequence::GetId(mf.GetProduct(), NULL)); 3543 3544 if(transcript_seq_ids.find(product_id) == transcript_seq_ids.end()) { 3545 continue; 3546 } 3547 3548 //note: using temporary reference "dbxrefs" is necessary because 3549 //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode 3550 const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref(); 3551 ITERATE(CSeq_feat::TDbxref, it, dbxrefs) 3552 { 3553 const CDbtag& dbtag = **it; 3554 if(dbtag.GetDb() == "GeneID" 3555 || dbtag.GetDb() == "LocusID") { 3556 gene_ids.insert(dbtag.GetTag().GetId()); 3557 } 3558 } 3559 } 3560 return gene_ids; 3561 } 3562 3563 3564 // There's no GeneID on a protein bioseq. 3565 // Instead, need to find the corresponding cdregion (which may or may not have GeneID dbxref), 3566 // and get GeneID based on parent gene feature. s_GetGeneIdForProduct(CBioseq_Handle orig_bsh)3567 int CVariationUtil::CVariantPropertiesIndex::s_GetGeneIdForProduct( 3568 CBioseq_Handle orig_bsh) 3569 { 3570 CBioseq_Handle bsh = orig_bsh.GetInst_Mol() == CSeq_inst::eMol_aa ? 3571 sequence::GetNucleotideParent(orig_bsh) : orig_bsh; 3572 3573 SAnnotSelector sel; 3574 sel.SetResolveTSE(); 3575 sel.SetAdaptiveDepth(); 3576 sel.IncludeFeatType(CSeqFeatData::e_Gene); 3577 sel.IncludeFeatType(CSeqFeatData::e_Rna); 3578 sel.IncludeFeatType(CSeqFeatData::e_Cdregion); 3579 CFeat_CI ci(bsh, sel); 3580 feature::CFeatTree ft(ci); 3581 3582 for(ci.Rewind(); ci; ++ci) { 3583 const CMappedFeat& mf = *ci; 3584 if(mf.IsSetProduct() 3585 && mf.GetProduct().GetId() 3586 && sequence::IsSameBioseq( 3587 *orig_bsh.GetSeqId(), 3588 *mf.GetProduct().GetId(), 3589 &orig_bsh.GetScope())) { 3590 return s_GetGeneID(mf, ft); 3591 } 3592 } 3593 return 0; 3594 } 3595 3596 x_Index(const CSeq_id_Handle & idh)3597 void CVariationUtil::CVariantPropertiesIndex::x_Index(const CSeq_id_Handle& idh) 3598 { 3599 SAnnotSelector sel; 3600 sel.SetResolveAll(); //needs to work on NCs 3601 sel.SetAdaptiveDepth(); 3602 sel.IncludeFeatType(CSeqFeatData::e_Gene); 3603 sel.IncludeFeatType(CSeqFeatData::e_Rna); 3604 sel.IncludeFeatType(CSeqFeatData::e_Cdregion); 3605 3606 CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh); 3607 CFeat_CI ci(bsh, sel); 3608 feature::CFeatTree ft(ci); 3609 3610 m_loc2prop[idh].size(); //in case there's no annotation, simply add the entry to the map 3611 //se we know it has been indexed. 3612 3613 set<int> focus_loci = GetFocusLocusIDs(bsh); 3614 3615 //Note: A location can have in-neighborhood == true only if it is NOT in 3616 //range of another locus. However, if we're dealing with in-focus/out-of-focus 3617 //genes on NG, then for the purpose of calculating in-neighborhood flags we need to 3618 //put the non-focus genes out-of-scope. On the other hand, when calculating is-intergenic 3619 //flag, we have to assume that the non-focus genes are in scope (VAR-239). 3620 //That's why we need to collect focus_gene_ranges and non_focus_gene_ranges separately. 3621 //(See the code below that sets CVariantProperties::eGene_location_intergenic) 3622 3623 CRef<CSeq_loc> focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix)); 3624 CRef<CSeq_loc> non_focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix)); 3625 for(ci.Rewind(); ci; ++ci) { 3626 const CMappedFeat& mf = *ci; 3627 if(!mf.GetData().IsGene()) { 3628 continue; 3629 } 3630 3631 const int gene_id = s_GetGeneID(mf, ft); 3632 const bool is_focus_locus = focus_loci.empty() 3633 || focus_loci.count(gene_id); 3634 3635 (is_focus_locus ? focus_gene_ranges 3636 : non_focus_gene_ranges) 3637 ->SetMix().Set().push_back( 3638 sequence::Seq_loc_Merge( 3639 mf.GetLocation(), 3640 CSeq_loc::fMerge_SingleRange, 3641 NULL)); 3642 } 3643 focus_gene_ranges->ResetStrand(); 3644 non_focus_gene_ranges->ResetStrand(); 3645 SFastLocSubtract subtract_gene_ranges_from(*focus_gene_ranges); 3646 3647 CRef<CSeq_loc> all_gene_neighborhoods(new CSeq_loc(CSeq_loc::e_Mix)); 3648 3649 bool found_some_gene_ids = false; 3650 3651 for(ci.Rewind(); ci; ++ci) { 3652 const CMappedFeat& mf = *ci; 3653 CMappedFeat parent_mf = ft.GetParent(mf); 3654 3655 if(!mf.GetLocation().GetId()) { 3656 continue; 3657 } 3658 3659 const int gene_id = s_GetGeneID(mf, ft); 3660 if(!focus_loci.empty() 3661 && focus_loci.find(gene_id) == focus_loci.end()) { 3662 continue; //VAR-239 3663 } 3664 3665 if(!parent_mf && gene_id) { 3666 // Some locations currently may not be covered by 3667 // any variant-properties values (e.g. in-cds) 3668 // and yet we need to index gene_ids at those locations. 3669 // Since variant-properties is a bitmask, 3670 // we can simply use the value 0 for that. 3671 // Only need to do that for parent locs. 3672 x_Add(mf.GetLocation(), gene_id, 0); 3673 found_some_gene_ids = true; 3674 } 3675 3676 // compute neighbborhood locations. 3677 // Normally for gene features, or rna features lacking a parent gene feature. 3678 // Applicable for dna context only. 3679 if((mf.GetData().IsGene() || (!parent_mf && mf.GetData().IsRna())) 3680 && bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) { 3681 TLocsPair p = s_GetNeighborhoodLocs(mf.GetLocation(), bsh.GetInst_Length() - 1); 3682 3683 // exclude any gene overlap from neighborhood 3684 // (i.e. we don't care if location is in 3685 // neighborhood of gene-A if it is within gene-B. 3686 3687 // First, reset strands (as we did with gene_ranges), 3688 // as we need to do this in strand-agnostic manner. 3689 // Note: a reasonable thing to do would be to set 3690 // all-gene-ranges strand to "both", with an expectation 3691 // that subtracting such a loc from either plus or minus 3692 // loc would work, but unfortunately it doesn't. 3693 // Resetting strand, however, is fine, because our 3694 // indexing is strand-agnostic. 3695 p.first->ResetStrand(); 3696 p.second->ResetStrand(); 3697 3698 #if 0 3699 // Note: disabling subtraction of gene ranges because want to 3700 // report neargene-ness regardless of other loci SNP-5000 3701 #if 3702 //all_gene_ranges is a big complex loc, and subtracting with Seq_loc_Subtract 3703 //for every neighborhood is slow (takes almost 10 seconds for NC_000001), 3704 //so we'll use fast map-based subtractor instead 3705 p.first = sequence::Seq_loc_Subtract( 3706 *p.first, 3707 *all_gene_ranges, 3708 CSeq_loc::fSortAndMerge_All, 3709 NULL); 3710 3711 p.second = sequence::Seq_loc_Subtract( 3712 *p.second, 3713 *all_gene_ranges, 3714 CSeq_loc::fSortAndMerge_All, 3715 NULL); 3716 #else 3717 subtract_gene_ranges_from(*p.first); 3718 subtract_gene_ranges_from(*p.second); 3719 #endif 3720 #endif 3721 x_Add(*p.first, gene_id, CVariantProperties::eGene_location_near_gene_5); 3722 x_Add(*p.second, gene_id, CVariantProperties::eGene_location_near_gene_3); 3723 3724 all_gene_neighborhoods->SetMix().Set().push_back(p.first); 3725 all_gene_neighborhoods->SetMix().Set().push_back(p.second); 3726 } 3727 3728 //VAR-703: need to process both RNAs and CDSes to include IGs. 3729 if(mf.GetData().IsRna() || mf.GetData().IsCdregion()) { 3730 if(mf.GetData().IsRna() 3731 && mf.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA) { 3732 x_Add(mf.GetLocation(), 3733 gene_id, 3734 CVariantProperties::eGene_location_conserved_noncoding); 3735 } 3736 3737 TLocsPair p = s_GetIntronsAndSpliceSiteLocs(mf.GetLocation()); 3738 x_Add(*p.first, 3739 gene_id, 3740 CVariantProperties::eGene_location_intron); 3741 3742 size_t i(0); 3743 for(CSeq_loc_CI ci(*p.second, 3744 CSeq_loc_CI::eEmpty_Skip, 3745 CSeq_loc_CI::eOrder_Biological); 3746 ci; ++ci) { 3747 x_Add(*ci.GetRangeAsSeq_loc(), 3748 gene_id, 3749 i % 2 ? CVariantProperties::eGene_location_acceptor 3750 : CVariantProperties::eGene_location_donor); 3751 ++i; 3752 } 3753 } 3754 3755 if(mf.GetData().IsCdregion()) { 3756 TLocsPair p = s_GetStartAndStopCodonsLocs(mf.GetLocation()); 3757 3758 x_Add(*p.first, 3759 gene_id, 3760 CVariantProperties::eGene_location_in_start_codon); 3761 3762 x_Add(*p.second, 3763 gene_id, 3764 CVariantProperties::eGene_location_in_stop_codon); 3765 3766 CRef<CSeq_loc> rna_loc(new CSeq_loc(CSeq_loc::e_Null)); 3767 {{ 3768 if(parent_mf) { 3769 rna_loc->Assign(parent_mf.GetLocation()); 3770 } else if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) { 3771 //refseqs have gene feature annotated on rnas spanning the whole sequence, 3772 //but in general there may be free-floating CDS on an rna, in which case 3773 //parent loc is the whole seq. 3774 rna_loc = bsh.GetRangeSeq_loc( 3775 0, 3776 bsh.GetInst_Length() - 1, 3777 mf.GetLocation().GetStrand()); 3778 } 3779 }} 3780 3781 p = s_GetUTRLocs(mf.GetLocation(), *rna_loc); 3782 3783 x_Add(*p.first, 3784 gene_id, 3785 CVariantProperties::eGene_location_utr_5); 3786 3787 x_Add(*p.second, 3788 gene_id, 3789 CVariantProperties::eGene_location_utr_3); 3790 3791 } else if(mf.GetData().IsGene()) { 3792 if(ft.GetChildren(mf).size() == 0) { 3793 x_Add(mf.GetLocation(), 3794 gene_id, 3795 bsh.GetBioseqMolType() == CSeq_inst::eMol_rna ? 3796 CVariantProperties::eGene_location_conserved_noncoding 3797 : CVariantProperties::eGene_location_in_gene); 3798 } 3799 } 3800 } 3801 3802 if(bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) { 3803 3804 CRef<CSeq_loc> range_loc = 3805 bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus); 3806 3807 CRef<CSeq_loc> genes_and_neighborhoods_loc = 3808 sequence::Seq_loc_Add(*all_gene_neighborhoods, 3809 *focus_gene_ranges, 3810 0, NULL); 3811 3812 genes_and_neighborhoods_loc = 3813 sequence::Seq_loc_Add( 3814 *genes_and_neighborhoods_loc, 3815 *non_focus_gene_ranges, 3816 0, NULL); 3817 3818 genes_and_neighborhoods_loc->ResetStrand(); 3819 3820 CRef<CSeq_loc> intergenic_loc = 3821 sequence::Seq_loc_Subtract( 3822 *range_loc, 3823 *genes_and_neighborhoods_loc, 3824 CSeq_loc::fSortAndMerge_All, 3825 NULL); 3826 3827 x_Add(*intergenic_loc, 3828 0, 3829 CVariantProperties::eGene_location_intergenic); 3830 } 3831 3832 if(bsh.GetBioseqMolType() == CSeq_inst::eMol_aa 3833 && !found_some_gene_ids) { 3834 // JIRA:SNP-5390 3835 // in it's normal configuration the iterator won't 3836 // find a feature with GeneID on a protein, and so 3837 // it would not be reported in the protein placement. 3838 // If a CDS is annotated directly on a DNA molecule, 3839 // there would be no product-specific GeneID at all ( 3840 CRef<CSeq_loc> whole_range_loc = 3841 bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus); 3842 3843 int gene_id = s_GetGeneIdForProduct(bsh); 3844 if(gene_id) { 3845 x_Add(*whole_range_loc, gene_id, 0); 3846 } 3847 } 3848 } 3849 s_GetGeneID(const CMappedFeat & mf,feature::CFeatTree & ft)3850 int CVariationUtil::CVariantPropertiesIndex::s_GetGeneID( 3851 const CMappedFeat& mf, 3852 feature::CFeatTree& ft) 3853 { 3854 int gene_id = 0; 3855 if(mf.IsSetDbxref()) { 3856 //note: using temporary reference "dbxrefs" is necessary because 3857 //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode 3858 const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref(); 3859 ITERATE(CSeq_feat::TDbxref, it, dbxrefs) 3860 { 3861 const CDbtag& dbtag = **it; 3862 if(dbtag.GetDb() == "GeneID" 3863 || dbtag.GetDb() == "LocusID") { 3864 gene_id = dbtag.GetTag().GetId(); 3865 } 3866 } 3867 } 3868 3869 // try legacy representation, e.g. NM_000155.1: 3870 if(!gene_id 3871 && mf.GetData().IsGene() 3872 && mf.GetData().GetGene().IsSetDb()) { 3873 const CGene_ref::TDb& dbxrefs = mf.GetData().GetGene().GetDb(); 3874 ITERATE(CSeq_feat::TDbxref, it, dbxrefs) 3875 { 3876 const CDbtag& dbtag = **it; 3877 if(dbtag.GetDb() == "GeneID" 3878 || dbtag.GetDb() == "LocusID") { 3879 gene_id = dbtag.GetTag().GetId(); 3880 } 3881 } 3882 } 3883 3884 if(gene_id == 0) { 3885 CMappedFeat parent = ft.GetParent(mf); 3886 return parent ? s_GetGeneID(parent, ft) : gene_id; 3887 } else { 3888 return gene_id; 3889 } 3890 } 3891 3892 3893 CVariationUtil::CVariantPropertiesIndex::TLocsPair s_GetStartAndStopCodonsLocs(const CSeq_loc & cds_loc)3894 CVariationUtil::CVariantPropertiesIndex::s_GetStartAndStopCodonsLocs(const CSeq_loc& cds_loc) 3895 { 3896 TLocsPair p; 3897 p.first = sequence::Seq_loc_Merge(cds_loc, CSeq_loc::fMerge_SingleRange, NULL); 3898 p.second.Reset(new CSeq_loc); 3899 p.second->Assign(*p.first); 3900 p.first->SetInt().SetTo(p.first->GetInt().GetFrom() + 2); 3901 p.second->SetInt().SetFrom(p.second->GetInt().GetTo() - 2); 3902 3903 if(IsReverse(cds_loc.GetStrand())) { 3904 swap(p.first, p.second); 3905 } 3906 3907 if(cds_loc.IsPartialStart(eExtreme_Biological) || cds_loc.IsTruncatedStart(eExtreme_Biological)) { 3908 p.first->SetNull(); 3909 } 3910 if(cds_loc.IsPartialStop(eExtreme_Biological) || cds_loc.IsTruncatedStop(eExtreme_Biological)) { 3911 p.second->SetNull(); 3912 } 3913 3914 return p; 3915 } 3916 3917 CVariationUtil::CVariantPropertiesIndex::TLocsPair s_GetUTRLocs(const CSeq_loc & cds_loc,const CSeq_loc & parent_loc)3918 CVariationUtil::CVariantPropertiesIndex::s_GetUTRLocs(const CSeq_loc& cds_loc, const CSeq_loc& parent_loc) 3919 { 3920 TLocsPair p; 3921 CRef<CSeq_loc> sub_loc1 = sequence::Seq_loc_Merge(cds_loc, CSeq_loc::fMerge_SingleRange, NULL); 3922 CRef<CSeq_loc> sub_loc2(new CSeq_loc); 3923 sub_loc2->Assign(*sub_loc1); 3924 3925 sub_loc1->SetInt().SetTo(parent_loc.GetTotalRange().GetTo()); 3926 sub_loc2->SetInt().SetFrom(parent_loc.GetTotalRange().GetFrom()); 3927 3928 p.first = sequence::Seq_loc_Subtract(parent_loc, *sub_loc1, CSeq_loc::fSortAndMerge_All, NULL); 3929 p.second = sequence::Seq_loc_Subtract(parent_loc, *sub_loc2, CSeq_loc::fSortAndMerge_All, NULL); 3930 3931 if(IsReverse(cds_loc.GetStrand())) { 3932 swap(p.first, p.second); 3933 } 3934 return p; 3935 } 3936 3937 CVariationUtil::CVariantPropertiesIndex::TLocsPair s_GetNeighborhoodLocs(const CSeq_loc & gene_loc,TSeqPos max_pos)3938 CVariationUtil::CVariantPropertiesIndex::s_GetNeighborhoodLocs(const CSeq_loc& gene_loc, TSeqPos max_pos) 3939 { 3940 TSeqPos flank1_len(2000), flank2_len(500); 3941 if(IsReverse(gene_loc.GetStrand())) { 3942 swap(flank1_len, flank2_len); 3943 } 3944 3945 TLocsPair p; 3946 p.first = sequence::Seq_loc_Merge(gene_loc, CSeq_loc::fMerge_SingleRange, NULL); 3947 p.second.Reset(new CSeq_loc); 3948 p.second->Assign(*p.first); 3949 3950 if(p.first->GetTotalRange().GetFrom() == 0) { 3951 p.first->SetNull(); 3952 } else { 3953 p.first->SetInt().SetTo(p.first->GetTotalRange().GetFrom() - 1); 3954 p.first->SetInt().SetFrom(p.first->GetTotalRange().GetFrom() < flank1_len ? 0 : p.first->GetTotalRange().GetFrom() - flank1_len); 3955 } 3956 3957 if(p.second->GetTotalRange().GetTo() == max_pos) { 3958 p.second->SetNull(); 3959 } else { 3960 p.second->SetInt().SetFrom(p.second->GetTotalRange().GetTo() + 1); 3961 p.second->SetInt().SetTo(p.second->GetTotalRange().GetTo() > max_pos ? max_pos : p.second->GetTotalRange().GetTo() + flank2_len); 3962 } 3963 3964 if(IsReverse(gene_loc.GetStrand())) { 3965 swap(p.first, p.second); 3966 } 3967 return p; 3968 } 3969 3970 3971 CVariationUtil::CVariantPropertiesIndex::TLocsPair s_GetIntronsAndSpliceSiteLocs(const CSeq_loc & rna_loc)3972 CVariationUtil::CVariantPropertiesIndex::s_GetIntronsAndSpliceSiteLocs(const CSeq_loc& rna_loc) 3973 { 3974 CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(rna_loc, CSeq_loc::fMerge_SingleRange, NULL); 3975 3976 CRef<CSeq_loc> introns_loc_with_splice_sites = sequence::Seq_loc_Subtract(*range_loc, rna_loc, 0, NULL); 3977 CRef<CSeq_loc> introns_loc_without_splice_sites(new CSeq_loc); 3978 introns_loc_without_splice_sites->Assign(*introns_loc_with_splice_sites); 3979 3980 //truncate each terminal by two bases from each end to exclude splice sites. 3981 for(CTypeIterator<CSeq_interval> it(Begin(*introns_loc_without_splice_sites)); it; ++it) { 3982 CSeq_interval& seqint = *it; 3983 if(seqint.GetLength() >= 5) { 3984 seqint.SetFrom() += 2; 3985 seqint.SetTo() -= 2; 3986 } 3987 } 3988 3989 TLocsPair p; 3990 p.first = introns_loc_without_splice_sites; 3991 p.second = sequence::Seq_loc_Subtract(*introns_loc_with_splice_sites, 3992 *introns_loc_without_splice_sites, 3993 CSeq_loc::fSortAndMerge_All, 3994 NULL); 3995 return p; 3996 } 3997 3998 3999 4000 4001 4002 x_Index(const CSeq_id_Handle & idh)4003 void CVariationUtil::CCdregionIndex::x_Index(const CSeq_id_Handle& idh) 4004 { 4005 SAnnotSelector sel; 4006 sel.SetResolveAll(); //needs to work on NCs 4007 sel.SetAdaptiveDepth(); 4008 sel.IncludeFeatType(CSeqFeatData::e_Cdregion); 4009 sel.IncludeFeatType(CSeqFeatData::e_Rna); 4010 CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh); 4011 4012 m_data[idh].size(); //this will create m_data[idh] entry so that next time we don't try to reindex it if there are no cdregions 4013 m_seq_data_map[idh].mapper.Reset(); 4014 4015 CRef<CSeq_loc> all_rna_loc(new CSeq_loc(CSeq_loc::e_Null)); 4016 4017 for(CFeat_CI ci(bsh, sel); ci; ++ci) { 4018 const CMappedFeat& mf = *ci; 4019 if(!mf.IsSetProduct() || !mf.GetProduct().GetId()) { 4020 continue; //could be segment 4021 } 4022 4023 if(mf.GetData().IsRna()) { 4024 all_rna_loc->Add(mf.GetLocation()); 4025 } else { 4026 all_rna_loc->Add(mf.GetLocation()); //if on mRNA seq, there's no annotated mRNA feature - will just index cds 4027 4028 SCdregion s; 4029 s.cdregion_feat.Reset(mf.GetSeq_feat()); 4030 for(CSeq_loc_CI ci(mf.GetLocation()); ci; ++ci) { 4031 m_data[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(s); 4032 } 4033 4034 #if 0 4035 //could possibly cache product sequence as well 4036 if(m_options && CVariationUtil::fOpt_cache_exon_sequence) { 4037 x_CacheSeqData(mf.GetProduct(), CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId())); 4038 } 4039 #endif 4040 } 4041 } 4042 4043 all_rna_loc = sequence::Seq_loc_Merge(*all_rna_loc, CSeq_loc::fSortAndMerge_All, NULL); 4044 4045 if(m_options && CVariationUtil::fOpt_cache_exon_sequence) { 4046 x_CacheSeqData(*all_rna_loc, idh); 4047 } 4048 } 4049 x_CacheSeqData(const CSeq_loc & loc,const CSeq_id_Handle & idh)4050 void CVariationUtil::CCdregionIndex::x_CacheSeqData(const CSeq_loc& loc, const CSeq_id_Handle& idh) 4051 { 4052 CSeq_id_Handle idh2 = sequence::GetId(idh, *m_scope, sequence::eGetId_Canonical); 4053 SSeqData& d = m_seq_data_map[idh2]; 4054 4055 if(loc.IsNull() || loc.IsEmpty()) { 4056 return; 4057 } 4058 4059 CRef<CSeq_loc> loc2(new CSeq_loc); 4060 loc2->Assign(loc); 4061 4062 CSeqVector seqv; 4063 if(loc.IsWhole()) { 4064 CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh); 4065 loc2 = bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus); 4066 seqv = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac); 4067 } else { 4068 seqv = CSeqVector(*loc2, *m_scope, CBioseq_Handle::eCoding_Iupac); 4069 } 4070 4071 seqv.GetSeqData(seqv.begin(), seqv.end(), d.seq_data); 4072 4073 if(d.seq_data.size() != sequence::GetLength(*loc2, m_scope) /*could be whole, so need scope*/) { 4074 //expecting the length of returned sequence to be exactly the same as location; 4075 //otherwise can't associate sub-location with position in returned sequence 4076 d.seq_data = ""; 4077 d.mapper.Reset(); 4078 return; 4079 } 4080 4081 CRef<CSeq_loc> target_loc(new CSeq_loc); 4082 target_loc->SetInt().SetId().SetLocal().SetStr("all_cds"); 4083 target_loc->SetInt().SetFrom(0); 4084 target_loc->SetInt().SetTo(d.seq_data.size() - 1); 4085 target_loc->SetInt().SetStrand(eNa_strand_plus); 4086 4087 d.mapper.Reset(new CSeq_loc_Mapper(*loc2, *target_loc, NULL)); 4088 } 4089 GetCachedLiteralAtLoc(const CSeq_loc & loc)4090 CRef<CSeq_literal> CVariationUtil::CCdregionIndex::GetCachedLiteralAtLoc(const CSeq_loc& loc) 4091 { 4092 CRef<CSeq_literal> literal(new CSeq_literal); 4093 literal->SetSeq_data().SetIupacna().Set(""); 4094 4095 4096 CRef<CSeq_loc> loc2(new CSeq_loc); 4097 loc2->Assign(loc); 4098 ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope); 4099 4100 for(CSeq_loc_CI ci(*loc2); ci; ++ci) { 4101 if(m_seq_data_map.find(ci.GetSeq_id_Handle()) == m_seq_data_map.end()) { 4102 return CRef<CSeq_literal>(NULL); 4103 } 4104 const SSeqData& d = m_seq_data_map.find(ci.GetSeq_id_Handle())->second; 4105 if(!d.mapper) { 4106 return CRef<CSeq_literal>(NULL); 4107 } 4108 4109 CRef<CSeq_loc> mapped_loc = d.mapper->Map(*ci.GetRangeAsSeq_loc()); 4110 4111 string s = ""; 4112 mapped_loc->GetLabel(&s); 4113 if((!mapped_loc->IsInt() && !mapped_loc->IsPnt()) //split mapping 4114 || mapped_loc->GetTotalRange().GetLength() != ci.GetRange().GetLength() 4115 || mapped_loc->GetStrand() != eNa_strand_plus) //todo: can accomodate minus by reverse-complementing the chunk 4116 { 4117 return CRef<CSeq_literal>(NULL); 4118 } 4119 4120 string seq_chunk = d.seq_data.substr(mapped_loc->GetStart(eExtreme_Positional), mapped_loc->GetTotalRange().GetLength()); 4121 literal->SetSeq_data().SetIupacna().Set() += seq_chunk; 4122 } 4123 4124 literal->SetLength(literal->GetSeq_data().GetIupacna().Get().size()); 4125 return literal; 4126 } 4127 Get(const CSeq_loc & loc,TCdregions & cdregions)4128 void CVariationUtil::CCdregionIndex::Get(const CSeq_loc& loc, TCdregions& cdregions) 4129 { 4130 CRef<CSeq_loc> loc2(new CSeq_loc); 4131 loc2->Assign(loc); 4132 ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope); 4133 4134 for(CSeq_loc_CI ci(*loc2); ci; ++ci) { 4135 CSeq_id_Handle idh = ci.GetSeq_id_Handle(); 4136 4137 if(m_data.find(idh) == m_data.end()) { 4138 //first time seeing this seq-id - index it 4139 x_Index(idh); 4140 } 4141 4142 TIdRangeMap::const_iterator it = m_data.find(idh); 4143 if(it == m_data.end()) { 4144 return; 4145 } 4146 const TRangeMap& rm = it->second; 4147 4148 set<SCdregion> results; 4149 for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) { 4150 ITERATE(TCdregions, it3, it2->second) 4151 { 4152 results.insert(*it3); 4153 } 4154 } 4155 ITERATE(set<SCdregion>, it, results) 4156 { 4157 cdregions.push_back(*it); 4158 } 4159 } 4160 } 4161 4162 4163 4164 // 4165 // 4166 // Methods to suuport Variation / Variation_ref conversions. 4167 // 4168 // 4169 4170 //Make a copy of the child and inherit relevant (SNP-5716) fields from parent that are not set in the child InheritParentAttributes(const CVariation & child,const CVariation & parent)4171 CRef<CVariation> InheritParentAttributes(const CVariation& child, const CVariation& parent) 4172 { 4173 CRef<CVariation> v(SerialClone(child)); 4174 4175 if(!v->IsSetId() && parent.IsSetId()) { 4176 v->SetId().Assign(parent.GetId()); 4177 } 4178 4179 if(!v->IsSetParent_id() && parent.IsSetParent_id()) { 4180 v->SetParent_id().Assign(parent.GetParent_id()); 4181 } 4182 4183 if(!v->IsSetSample_id() && parent.IsSetSample_id()) { 4184 ITERATE(CVariation::TSample_id, it, parent.GetSample_id()) 4185 { 4186 v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(**it))); 4187 } 4188 } 4189 4190 if(!v->IsSetOther_ids() && parent.IsSetOther_ids()) { 4191 ITERATE(CVariation::TOther_ids, it, parent.GetOther_ids()) 4192 { 4193 v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it))); 4194 } 4195 } 4196 4197 return v; 4198 } 4199 AsVariation_feats(const CVariation & v,CSeq_annot::TData::TFtable & out_feats)4200 void CVariationUtil::AsVariation_feats(const CVariation& v, CSeq_annot::TData::TFtable& out_feats) 4201 { 4202 CSeq_annot::TData::TFtable feats; 4203 if(v.IsSetPlacements()) { 4204 ITERATE(CVariation::TPlacements, it, v.GetPlacements()) 4205 { 4206 const CVariantPlacement& p = **it; 4207 CRef<CVariation_ref> vr = x_AsVariation_ref(v, p); 4208 CRef<CSeq_feat> feat(new CSeq_feat); 4209 feat->SetLocation().Assign(p.GetLoc()); 4210 feat->SetData().SetVariation(*vr); 4211 feats.push_back(feat); 4212 } 4213 } else if(v.GetData().IsSet()) { 4214 ITERATE(CVariation::TData::TSet::TVariations, it, v.GetData().GetSet().GetVariations()) 4215 { 4216 AsVariation_feats(*InheritParentAttributes(**it, v), feats); 4217 } 4218 } 4219 4220 NON_CONST_ITERATE(CSeq_annot::TData::TFtable, it, feats) 4221 { 4222 CSeq_feat& feat = **it; 4223 if(v.IsSetPub() && !feat.IsSetCit()) { 4224 feat.SetCit().Assign(v.GetPub()); 4225 } 4226 if(v.IsSetExt()) { 4227 ITERATE(CVariation::TExt, it2, v.GetExt()) 4228 { 4229 CRef<CUser_object> uo(new CUser_object); 4230 uo->Assign(**it2); 4231 feat.SetExts().push_back(uo); 4232 } 4233 } 4234 } 4235 4236 //note: we can't attach user-objects to out_feats directly because it will result in duplicates 4237 //when out_feats recurses into next subvariation. Hence we are working with our own ftable of 4238 //feats, and attach the results in the end. 4239 out_feats.insert(out_feats.end(), feats.begin(), feats.end()); 4240 } 4241 x_AsVariation_ref(const CVariation & v,const CVariantPlacement & p)4242 CRef<CVariation_ref> CVariationUtil::x_AsVariation_ref(const CVariation& v, const CVariantPlacement& p) 4243 { 4244 CRef<CVariation_ref> vr(new CVariation_ref); 4245 4246 if(v.IsSetId()) { 4247 vr->SetId().Assign(v.GetId()); 4248 } 4249 4250 if(v.IsSetParent_id()) { 4251 vr->SetParent_id().Assign(v.GetParent_id()); 4252 } 4253 4254 if(v.IsSetSample_id() && v.GetSample_id().size() > 0) { 4255 vr->SetSample_id().Assign(*v.GetSample_id().front()); 4256 } 4257 4258 if(v.IsSetOther_ids()) { 4259 ITERATE(CVariation::TOther_ids, it, v.GetOther_ids()) 4260 { 4261 vr->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it))); 4262 } 4263 } 4264 4265 if(v.IsSetName()) { 4266 vr->SetName(v.GetName()); 4267 } 4268 4269 if(v.IsSetSynonyms()) { 4270 vr->SetSynonyms() = v.GetSynonyms(); 4271 } 4272 4273 if(v.IsSetDescription()) { 4274 vr->SetName(v.GetDescription()); 4275 } 4276 4277 if(v.IsSetPhenotype()) { 4278 ITERATE(CVariation::TPhenotype, it, v.GetPhenotype()) 4279 { 4280 CRef<CPhenotype> p(new CPhenotype); 4281 p->Assign(**it); 4282 vr->SetPhenotype().push_back(p); 4283 } 4284 } 4285 4286 if(v.IsSetMethod()) { 4287 vr->SetMethod() = v.GetMethod().GetMethod(); 4288 } 4289 4290 if(v.IsSetVariant_prop()) { 4291 vr->SetVariant_prop().Assign(v.GetVariant_prop()); 4292 } 4293 4294 //Note: Variation.frameshift <-> Variation-ref.consequence[].frameshift 4295 if(v.IsSetFrameshift()) { 4296 CVariation_ref::TConsequence::value_type fr_cons( 4297 new CVariation_ref::TConsequence::value_type::TObjectType); 4298 vr->SetConsequence().push_back(fr_cons); 4299 fr_cons->SetFrameshift(); 4300 if(v.GetFrameshift().IsSetPhase()) { 4301 fr_cons->SetFrameshift().SetPhase(v.GetFrameshift().GetPhase()); 4302 } 4303 if(v.GetFrameshift().IsSetX_length()) { 4304 fr_cons->SetFrameshift().SetX_length(v.GetFrameshift().GetX_length()); 4305 } 4306 } 4307 4308 if(v.GetData().IsComplex()) { 4309 vr->SetData().SetComplex(); 4310 } else if(v.GetData().IsInstance()) { 4311 vr->SetData().SetInstance().Assign(v.GetData().GetInstance()); 4312 s_AddInstOffsetsFromPlacementOffsets(vr->SetData().SetInstance(), p); 4313 } else if(v.GetData().IsNote()) { 4314 vr->SetData().SetNote() = v.GetData().GetNote(); 4315 } else if(v.GetData().IsUniparental_disomy()) { 4316 vr->SetData().SetUniparental_disomy(); 4317 } else if(v.GetData().IsUnknown()) { 4318 vr->SetData().SetUnknown(); 4319 } else if(v.GetData().IsSet()) { 4320 const CVariation::TData::TSet& v_set = v.GetData().GetSet(); 4321 CVariation_ref::TData::TSet& vr_set = vr->SetData().SetSet(); 4322 vr_set.SetType(v_set.GetType()); 4323 if(v_set.IsSetName()) { 4324 vr_set.SetName(v_set.GetName()); 4325 } 4326 ITERATE(CVariation::TData::TSet::TVariations, it, v_set.GetVariations()) 4327 { 4328 vr_set.SetVariations().push_back(x_AsVariation_ref(**it, p)); 4329 } 4330 } else { 4331 NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice"); 4332 } 4333 4334 if(v.IsSetConsequence()) { 4335 vr->SetConsequence(); 4336 ITERATE(CVariation::TConsequence, it, v.GetConsequence()) 4337 { 4338 const CVariation::TConsequence::value_type::TObjectType& v_cons = **it; 4339 CVariation_ref::TConsequence::value_type vr_cons( 4340 new CVariation_ref::TConsequence::value_type::TObjectType); 4341 vr->SetConsequence().push_back(vr_cons); 4342 vr_cons->SetUnknown(); 4343 4344 if(v_cons.IsSplicing()) { 4345 vr_cons->SetSplicing(); 4346 } else if(v_cons.IsNote()) { 4347 vr_cons->SetNote(v_cons.GetNote()); 4348 } else if(v_cons.IsVariation()) { 4349 CRef<CVariation_ref> cons_variation = x_AsVariation_ref(v_cons.GetVariation(), p); 4350 vr_cons->SetVariation(*cons_variation); 4351 } else if(v_cons.IsLoss_of_heterozygosity()) { 4352 vr_cons->SetLoss_of_heterozygosity(); 4353 if(v_cons.GetLoss_of_heterozygosity().IsSetReference()) { 4354 vr_cons->SetLoss_of_heterozygosity().SetReference( 4355 v_cons.GetLoss_of_heterozygosity().GetReference()); 4356 } 4357 if(v_cons.GetLoss_of_heterozygosity().IsSetTest()) { 4358 vr_cons->SetLoss_of_heterozygosity().SetTest( 4359 v_cons.GetLoss_of_heterozygosity().GetTest()); 4360 } 4361 } 4362 } 4363 } 4364 4365 if(v.IsSetSomatic_origin()) { 4366 vr->SetSomatic_origin(); 4367 ITERATE(CVariation::TSomatic_origin, it, v.GetSomatic_origin()) 4368 { 4369 const CVariation::TSomatic_origin::value_type::TObjectType& v_so = **it; 4370 4371 CVariation_ref::TSomatic_origin::value_type vr_so( 4372 new CVariation_ref::TSomatic_origin::value_type::TObjectType); 4373 4374 if(v_so.IsSetSource()) { 4375 vr_so->SetSource().Assign(v_so.GetSource()); 4376 } 4377 4378 if(v_so.IsSetCondition()) { 4379 vr_so->SetCondition(); 4380 if(v_so.GetCondition().IsSetDescription()) { 4381 vr_so->SetCondition().SetDescription( 4382 v_so.GetCondition().GetDescription()); 4383 } 4384 if(v_so.GetCondition().IsSetObject_id()) { 4385 vr_so->SetCondition().SetObject_id(); 4386 ITERATE(CVariation::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id, 4387 it, 4388 v_so.GetCondition().GetObject_id()) 4389 { 4390 CRef<CDbtag> dbtag(new CDbtag); 4391 dbtag->Assign(**it); 4392 vr_so->SetCondition().SetObject_id().push_back(dbtag); 4393 } 4394 } 4395 } 4396 4397 vr->SetSomatic_origin().push_back(vr_so); 4398 } 4399 } 4400 4401 4402 return vr; 4403 } 4404 CreateDeltaForOffset(int offset,const CInt_fuzz * fuzz)4405 static CRef<CDelta_item> CreateDeltaForOffset(int offset, const CInt_fuzz* fuzz) 4406 { 4407 CRef<CDelta_item> delta(new CDelta_item); 4408 delta->SetAction(CDelta_item::eAction_offset); 4409 delta->SetSeq().SetLiteral().SetLength(abs(offset)); 4410 if(offset < 0) { 4411 delta->SetMultiplier(-1); 4412 } 4413 if(fuzz) { 4414 delta->SetSeq().SetLiteral().SetFuzz().Assign(*fuzz); 4415 } 4416 return delta; 4417 } 4418 s_AddInstOffsetsFromPlacementOffsets(CVariation_inst & vi,const CVariantPlacement & p)4419 void CVariationUtil::s_AddInstOffsetsFromPlacementOffsets( 4420 CVariation_inst& vi, 4421 const CVariantPlacement& p) 4422 { 4423 if(p.IsSetStart_offset()) { 4424 vi.SetDelta().push_front(CreateDeltaForOffset( 4425 p.GetStart_offset(), 4426 p.IsSetStart_offset_fuzz() ? &p.GetStart_offset_fuzz() : NULL)); 4427 } 4428 if(p.IsSetStop_offset()) { 4429 vi.SetDelta().push_back(CreateDeltaForOffset( 4430 p.GetStop_offset(), 4431 p.IsSetStop_offset_fuzz() ? &p.GetStop_offset_fuzz() : NULL)); 4432 } 4433 } 4434 4435 AsVariation(const CSeq_feat & variation_feat)4436 CRef<CVariation> CVariationUtil::AsVariation(const CSeq_feat& variation_feat) 4437 { 4438 if(!variation_feat.GetData().IsVariation()) { 4439 NCBI_THROW(CException, eUnknown, "Expected variation-ref feature"); 4440 } 4441 4442 CRef<CVariation> v = x_AsVariation(variation_feat.GetData().GetVariation()); 4443 4444 CRef<CVariantPlacement> p(new CVariantPlacement); 4445 p->SetLoc().Assign(variation_feat.GetLocation()); 4446 if(p->GetLoc().GetId()) { 4447 p->SetMol(GetMolType(sequence::GetId(p->GetLoc(), NULL))); 4448 } else { 4449 p->SetMol(CVariantPlacement::eMol_unknown); 4450 } 4451 v->SetPlacements().push_back(p); 4452 4453 4454 if(variation_feat.IsSetCit()) { 4455 v->SetPub().Assign(variation_feat.GetCit()); 4456 } 4457 if(variation_feat.IsSetExt()) { 4458 v->SetExt(); 4459 CRef<CUser_object> uo(new CUser_object); 4460 uo->Assign(variation_feat.GetExt()); 4461 v->SetExt().push_back(uo); 4462 } 4463 if(variation_feat.IsSetExts()) { 4464 v->SetExt(); 4465 ITERATE(CSeq_feat::TExts, it, variation_feat.GetExts()) 4466 { 4467 CRef<CUser_object> uo(new CUser_object); 4468 uo->Assign(**it); 4469 v->SetExt().push_back(uo); 4470 } 4471 } 4472 4473 s_ConvertInstOffsetsToPlacementOffsets(*v, *p); 4474 return v; 4475 } 4476 x_AsVariation(const CVariation_ref & vr)4477 CRef<CVariation> CVariationUtil::x_AsVariation(const CVariation_ref& vr) 4478 { 4479 CRef<CVariation> v(new CVariation); 4480 if(vr.IsSetId()) { 4481 v->SetId().Assign(vr.GetId()); 4482 } 4483 4484 if(vr.IsSetParent_id()) { 4485 v->SetParent_id().Assign(vr.GetParent_id()); 4486 } 4487 4488 if(vr.IsSetSample_id()) { 4489 v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(vr.GetSample_id()))); 4490 } 4491 4492 if(vr.IsSetOther_ids()) { 4493 ITERATE(CVariation_ref::TOther_ids, it, vr.GetOther_ids()) 4494 { 4495 v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it))); 4496 } 4497 } 4498 4499 if(vr.IsSetName()) { 4500 v->SetName(vr.GetName()); 4501 } 4502 4503 if(vr.IsSetSynonyms()) { 4504 v->SetSynonyms() = vr.GetSynonyms(); 4505 } 4506 4507 if(vr.IsSetDescription()) { 4508 v->SetName(vr.GetDescription()); 4509 } 4510 4511 if(vr.IsSetPhenotype()) { 4512 ITERATE(CVariation_ref::TPhenotype, it, vr.GetPhenotype()) 4513 { 4514 CRef<CPhenotype> p(new CPhenotype); 4515 p->Assign(**it); 4516 v->SetPhenotype().push_back(p); 4517 } 4518 } 4519 4520 if(vr.IsSetMethod()) { 4521 v->SetMethod().SetMethod() = vr.GetMethod(); 4522 } 4523 4524 if(vr.IsSetVariant_prop()) { 4525 v->SetVariant_prop().Assign(vr.GetVariant_prop()); 4526 } 4527 4528 if(vr.GetData().IsComplex()) { 4529 v->SetData().SetComplex(); 4530 } else if(vr.GetData().IsInstance()) { 4531 v->SetData().SetInstance().Assign(vr.GetData().GetInstance()); 4532 } else if(vr.GetData().IsNote()) { 4533 v->SetData().SetNote() = vr.GetData().GetNote(); 4534 } else if(vr.GetData().IsUniparental_disomy()) { 4535 v->SetData().SetUniparental_disomy(); 4536 } else if(vr.GetData().IsUnknown()) { 4537 v->SetData().SetUnknown(); 4538 } else if(vr.GetData().IsSet()) { 4539 const CVariation_ref::TData::TSet& vr_set = vr.GetData().GetSet(); 4540 CVariation::TData::TSet& v_set = v->SetData().SetSet(); 4541 v_set.SetType(vr_set.GetType()); 4542 if(vr_set.IsSetName()) { 4543 v_set.SetName(vr_set.GetName()); 4544 } 4545 ITERATE(CVariation_ref::TData::TSet::TVariations, it, vr_set.GetVariations()) 4546 { 4547 v_set.SetVariations().push_back(x_AsVariation(**it)); 4548 } 4549 } else { 4550 NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice"); 4551 } 4552 4553 if(vr.IsSetConsequence()) { 4554 v->SetConsequence(); 4555 ITERATE(CVariation_ref::TConsequence, it, vr.GetConsequence()) 4556 { 4557 const CVariation_ref::TConsequence::value_type::TObjectType& vr_cons = **it; 4558 4559 if(vr_cons.IsFrameshift()) { 4560 // frameshift is represented in consequence in Variation-ref, and 4561 // directly as an attribute in Variation. 4562 CVariation& cons_variation = *v; 4563 cons_variation.SetFrameshift(); 4564 if(vr_cons.GetFrameshift().IsSetPhase()) { 4565 cons_variation.SetFrameshift().SetPhase(vr_cons.GetFrameshift().GetPhase()); 4566 } 4567 if(vr_cons.GetFrameshift().IsSetX_length()) { 4568 cons_variation.SetFrameshift().SetX_length(vr_cons.GetFrameshift().GetX_length()); 4569 } 4570 continue; 4571 } 4572 4573 CVariation::TConsequence::value_type v_cons(new CVariation::TConsequence::value_type::TObjectType); 4574 4575 if(vr_cons.IsUnknown()) { 4576 v_cons->SetUnknown(); 4577 } else if(vr_cons.IsSplicing()) { 4578 v_cons->SetSplicing(); 4579 } else if(vr_cons.IsNote()) { 4580 v_cons->SetNote(vr_cons.GetNote()); 4581 } else if(vr_cons.IsVariation()) { 4582 CRef<CVariation> cons_variation = x_AsVariation(vr_cons.GetVariation()); 4583 v_cons->SetVariation(*cons_variation); 4584 } else if(vr_cons.IsLoss_of_heterozygosity()) { 4585 v_cons->SetLoss_of_heterozygosity(); 4586 if(vr_cons.GetLoss_of_heterozygosity().IsSetReference()) { 4587 v_cons->SetLoss_of_heterozygosity().SetReference(vr_cons.GetLoss_of_heterozygosity().GetReference()); 4588 } 4589 if(vr_cons.GetLoss_of_heterozygosity().IsSetTest()) { 4590 v_cons->SetLoss_of_heterozygosity().SetTest(vr_cons.GetLoss_of_heterozygosity().GetTest()); 4591 } 4592 } 4593 4594 v->SetConsequence().push_back(v_cons); 4595 } 4596 if(v->GetConsequence().empty()) { 4597 v->ResetConsequence(); 4598 } 4599 } 4600 4601 if(vr.IsSetSomatic_origin()) { 4602 v->SetSomatic_origin(); 4603 ITERATE(CVariation_ref::TSomatic_origin, it, vr.GetSomatic_origin()) 4604 { 4605 const CVariation_ref::TSomatic_origin::value_type::TObjectType& vr_so = **it; 4606 CVariation::TSomatic_origin::value_type v_so(new CVariation::TSomatic_origin::value_type::TObjectType); 4607 4608 if(vr_so.IsSetSource()) { 4609 v_so->SetSource().Assign(vr_so.GetSource()); 4610 } 4611 4612 if(vr_so.IsSetCondition()) { 4613 v_so->SetCondition(); 4614 if(vr_so.GetCondition().IsSetDescription()) { 4615 v_so->SetCondition().SetDescription(vr_so.GetCondition().GetDescription()); 4616 } 4617 if(vr_so.GetCondition().IsSetObject_id()) { 4618 v_so->SetCondition().SetObject_id(); 4619 ITERATE(CVariation_ref::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id, 4620 it, 4621 vr_so.GetCondition().GetObject_id()) 4622 { 4623 CRef<CDbtag> dbtag(new CDbtag); 4624 dbtag->Assign(**it); 4625 v_so->SetCondition().SetObject_id().push_back(dbtag); 4626 } 4627 } 4628 } 4629 4630 v->SetSomatic_origin().push_back(v_so); 4631 } 4632 } 4633 4634 return v; 4635 } 4636 4637 GetSignedOffset(const CDelta_item & delta)4638 static int GetSignedOffset(const CDelta_item& delta) 4639 { 4640 return delta.GetSeq().GetLiteral().GetLength() 4641 * (delta.IsSetMultiplier() ? delta.GetMultiplier() : 1); 4642 } 4643 GetFuzz(const CDelta_item & delta)4644 static const CInt_fuzz* GetFuzz(const CDelta_item& delta) 4645 { 4646 return delta.GetSeq().GetLiteral().IsSetFuzz() ? 4647 &delta.GetSeq().GetLiteral().GetFuzz() : NULL; 4648 } 4649 s_ConvertInstOffsetsToPlacementOffsets(CVariation & v,CVariantPlacement & p)4650 void CVariationUtil::s_ConvertInstOffsetsToPlacementOffsets(CVariation& v, CVariantPlacement& p) 4651 { 4652 if(v.GetData().IsSet()) { 4653 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, 4654 it, 4655 v.SetData().SetSet().SetVariations()) 4656 { 4657 s_ConvertInstOffsetsToPlacementOffsets(**it, p); 4658 } 4659 } else if(v.GetData().IsInstance() && v.GetData().GetInstance().GetDelta().size() > 0) { 4660 CConstRef<CDelta_item> delta_first = v.GetData().GetInstance().GetDelta().front(); 4661 CConstRef<CDelta_item> delta_last = v.GetData().GetInstance().GetDelta().back(); 4662 4663 if(delta_first->IsSetAction() 4664 && delta_first->GetAction() == CDelta_item::eAction_offset) { 4665 p.SetStart_offset(GetSignedOffset(*delta_first)); 4666 const CInt_fuzz* fuzz = GetFuzz(*delta_first); 4667 if(fuzz) { 4668 p.SetStart_offset_fuzz().Assign(*fuzz); 4669 } 4670 4671 v.SetData().SetInstance().SetDelta().pop_front(); 4672 } 4673 4674 if(delta_last != delta_first 4675 && delta_last->IsSetAction() 4676 && delta_last->GetAction() == CDelta_item::eAction_offset) { 4677 p.SetStop_offset(GetSignedOffset(*delta_last)); 4678 const CInt_fuzz* fuzz = GetFuzz(*delta_last); 4679 if(fuzz) { 4680 p.SetStop_offset_fuzz().Assign(*fuzz); 4681 } 4682 4683 v.SetData().SetInstance().SetDelta().pop_back(); 4684 } 4685 } 4686 } 4687 4688 4689 4690 #if 0 4691 CVariationUtil::ETestStatus CVariationUtil::CheckAssertedAllele( 4692 const CSeq_feat& variation_feat, 4693 string* asserted_out, 4694 string* actual_out) 4695 { 4696 return eNotApplicable; 4697 if(!variation_feat.GetData().IsVariation()) { 4698 return eNotApplicable; 4699 } 4700 4701 CVariation_ref vr; 4702 vr.Assign(variation_feat.GetData().GetVariation()); 4703 if(!vr.IsSetLocation()) { 4704 vr.SetLocation().Assign(variation_feat.GetLocation()); 4705 } 4706 s_PropagateLocsInPlace(vr); 4707 4708 4709 bool have_asserted_seq = false; 4710 bool is_ok = true; 4711 for(CTypeIterator<CVariation_ref> it1(Begin(vr)); it1; ++it1) { 4712 const CVariation_ref& vr1 = *it1; 4713 if(vr1.GetData().IsInstance() 4714 && vr1.GetData().GetInstance().IsSetObservation() 4715 && vr1.GetData().GetInstance().GetObservation() == CVariation_inst::eObservation_asserted) 4716 { 4717 string asserted_seq; 4718 const CSeq_literal& literal = vr1.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral(); 4719 if(literal.GetSeq_data().IsIupacna()) { 4720 asserted_seq = literal.GetSeq_data().GetIupacna(); 4721 have_asserted_seq = true; 4722 } else if(literal.GetSeq_data().IsNcbieaa()) { 4723 asserted_seq = literal.GetSeq_data().GetNcbieaa(); 4724 have_asserted_seq = true; 4725 } 4726 4727 //an asserted sequnece may be of the form "A..BC", where ".." is to be interpreted as a 4728 //gap of arbitrary length - we need to match prefix and suffix separately 4729 string prefix, suffix; 4730 string str_tmp = NStr::Replace(asserted_seq, "..", "\t"); //SplitInTwo's delimiter must be single-character 4731 NStr::SplitInTwo(str_tmp, "\t", prefix, suffix); 4732 4733 CSeqVector v(vr1.GetLocation(), *m_scope, CBioseq_Handle::eCoding_Iupac); 4734 string actual_seq; 4735 v.GetSeqData(v.begin(), v.end(), actual_seq); 4736 4737 if( prefix.size() > 0 && !NStr::StartsWith(actual_seq, prefix) 4738 || suffix.size() > 0 && !NStr::EndsWith(actual_seq, suffix)) 4739 { 4740 is_ok = false; 4741 if(asserted_out) { 4742 *asserted_out = asserted_seq; 4743 } 4744 if(actual_out) { 4745 *actual_out = actual_seq; 4746 } 4747 break; 4748 } 4749 } 4750 } 4751 4752 return !have_asserted_seq ? eNotApplicable : is_ok ? ePass : eFail; 4753 } 4754 #endif 4755 4756 4757 }; 4758 4759 END_NCBI_SCOPE 4760 4761