1 /* $Id: cdd_pssm_input.cpp 500404 2016-05-04 14:59:01Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Greg Boratyn
27 *
28 */
29
30 /** @file psi_pssm_input.cpp
31 * Implementation of the concrete strategy to obtain PSSM input data for
32 * PSI-BLAST.
33 */
34 #include <ncbi_pch.hpp>
35
36 // BLAST includes
37 //#include <algo/blast/api/psi_pssm_input.hpp>
38 #include <algo/blast/api/cdd_pssm_input.hpp>
39 #include <algo/blast/api/blast_exception.hpp>
40 #include "../core/blast_psi_priv.h"
41
42 // Object includes
43 #include <objects/seqalign/Seq_align.hpp>
44 #include <objects/seqalign/Seq_align_set.hpp>
45 #include <objects/seqalign/Dense_seg.hpp>
46 #include <objects/seqalign/Score.hpp>
47 #include <objects/seq/Seq_descr.hpp>
48
49 // Object manager includes
50 #include <objmgr/scope.hpp>
51 #include <objmgr/seq_vector.hpp>
52
53
54 /** @addtogroup AlgoBlast
55 *
56 * @{
57 */
58
59 BEGIN_NCBI_SCOPE
60 USING_SCOPE(objects);
BEGIN_SCOPE(blast)61 BEGIN_SCOPE(blast)
62
63 #ifndef GAP_IN_ALIGNMENT
64 /// Representation of GAP in Seq-align
65 # define GAP_IN_ALIGNMENT ((Uint4)-1)
66 #endif
67
68 //////////////////////////////////////////////////////////////////////////////
69
70
71 CCddInputData::CCddInputData(const Uint1* query, unsigned int query_length,
72 CConstRef<objects::CSeq_align_set> seqaligns,
73 const PSIBlastOptions& opts,
74 const string& dbname,
75 const string& matrix_name /* = "BLOSUM62" */,
76 int gap_existence /* = 0 */,
77 int gap_extension /* = 0 */,
78 PSIDiagnosticsRequest* diags /* = NULL */,
79 const string& query_title /* = "" */)
80 : m_QueryTitle(query_title),
81 m_DbName(dbname),
82 m_SeqalignSet(seqaligns),
83 m_Msa(NULL),
84 m_Opts(opts),
85 m_MatrixName(matrix_name),
86 m_DiagnosticsRequest(diags),
87 m_MinEvalue(-1.0),
88 m_GapExistence(gap_existence),
89 m_GapExtension(gap_extension)
90 {
91 if (!query) {
92 NCBI_THROW(CBlastException, eInvalidArgument, "NULL query");
93 }
94
95 if (seqaligns.Empty()) {
96 NCBI_THROW(CBlastException, eInvalidArgument, "NULL alignments");
97 }
98
99 m_QueryData.resize(query_length);
100 memcpy(&m_QueryData[0], query, query_length * sizeof(Uint1));
101 }
102
103
~CCddInputData()104 CCddInputData::~CCddInputData()
105 {
106 for (unsigned int i=0;i < m_Hits.size();i++) {
107 delete m_Hits[i];
108 }
109
110 delete [] m_Msa;
111 }
112
Process(void)113 void CCddInputData::Process(void)
114 {
115 if (m_MinEvalue > m_Opts.inclusion_ethresh) {
116
117 NCBI_THROW(CBlastException, eInvalidOptions,
118 "Minimum RPS-BLAST e-value is larger than the maximum one");
119 }
120
121 m_CddData.query = &m_QueryData[0];
122
123 // process primary alignments
124 x_ProcessAlignments(m_MinEvalue, m_Opts.inclusion_ethresh);
125
126 // remove overlaping mutliple hits to the same CD
127 x_RemoveMultipleCdHits();
128
129 // this is required by PSSM engine code
130 m_MsaDimensions.query_length = m_QueryData.size();
131 m_MsaDimensions.num_seqs = m_Hits.size();
132 m_CddData.dimensions = &m_MsaDimensions;
133
134 x_FillHitsData();
135 // this validation has only assertions, no use calling it
136 // for non-debug builds
137 _ASSERT(x_ValidateHits());
138
139 x_CreateMsa();
140 // the same validation is done on the core level
141 _ASSERT(x_ValidateMsa());
142
143 // extract query as Bioseq, needed so that query information can be stored
144 // in PssmWithParameters
145 x_ExtractQueryForPssm();
146
147 _ASSERT(m_MsaDimensions.query_length == m_QueryData.size());
148 _ASSERT(m_MsaDimensions.num_seqs == m_Hits.size());
149 }
150
151
x_ProcessAlignments(double min_evalue,double max_evalue)152 void CCddInputData::x_ProcessAlignments(double min_evalue, double max_evalue)
153 {
154 ITERATE (CSeq_align_set::Tdata, it, m_SeqalignSet->Get()) {
155 double evalue;
156 if (!(*it)->GetNamedScore(CSeq_align::eScore_EValue, evalue)) {
157 NCBI_THROW(CBlastException, eInvalidArgument,
158 "Evalue not found in Seq-align");
159 }
160
161 if (evalue >= min_evalue && evalue < max_evalue) {
162 m_Hits.push_back(new CHit((*it)->GetSegs().GetDenseg(), evalue));
163 }
164 }
165 }
166
167
x_RemoveMultipleCdHits(void)168 void CCddInputData::x_RemoveMultipleCdHits(void)
169 {
170 // if less than 2 hits, do nothing
171 if (m_Hits.size() < 2) {
172 return;
173 }
174
175 // sort by accession and e-value
176 sort(m_Hits.begin(), m_Hits.end(), compare_hits_by_seqid_eval());
177 vector<CHit*> new_hits;
178 new_hits.reserve(m_Hits.size());
179
180 new_hits.push_back(m_Hits[0]);
181
182 vector<CHit*>::iterator it(m_Hits.begin());
183 ++it;
184
185 // for each hit
186 for (;it != m_Hits.end();++it) {
187
188 // for each kept hit with the same subject accession as it and better
189 // e-value
190 for (int i=new_hits.size() - 1;i >= 0
191 && (*it)->m_SubjectId->Match(*new_hits[i]->m_SubjectId);i--) {
192
193 const CHit* kept_hit = new_hits[i];
194
195 // find intersection between hits on subjects,
196 // intersection needs to have query range from kept_hit for
197 // later subtraction
198 CHit intersection(*kept_hit);
199 intersection.IntersectWith(**it, CHit::eSubject);
200
201 // subtract the subject intersection using query ranges,
202 // hits to different ranges of the same CD are treated as
203 // different hits
204 (*it)->Subtract(intersection);
205
206 if ((*it)->IsEmpty()) {
207 delete *it;
208 *it = NULL;
209 break;
210 }
211 }
212 if (*it) {
213 new_hits.push_back(*it);
214 }
215
216 }
217 m_Hits.swap(new_hits);
218 }
219
220
x_FillHitsData(void)221 void CCddInputData::x_FillHitsData(void)
222 {
223 // initialize seqdb
224 CSeqDB seqdb(m_DbName, CSeqDB::eProtein);
225
226 // load residue counts from file
227 CRef<CBlastRPSInfo> profile_data(
228 new CBlastRPSInfo(m_DbName, CBlastRPSInfo::fDeltaBlast));
229
230 // Set data for each hit
231 NON_CONST_ITERATE (vector<CHit*>, it, m_Hits) {
232
233 _ASSERT(*it);
234
235 (*it)->FillData(seqdb, *profile_data);
236 }
237 }
238
239
x_CreateMsa(void)240 void CCddInputData::x_CreateMsa(void)
241 {
242 const int kQueryLength = m_QueryData.size();
243 const int kNumCds = m_Hits.size();
244
245 // initialize msa map
246 PSICdMsaCell cell;
247 cell.is_aligned = (Uint1)false;
248 cell.data = NULL;
249 // allocate memory for num cdds + query
250 m_MsaData.resize(kQueryLength * (kNumCds), cell);
251 m_Msa = new PSICdMsaCell*[kNumCds];
252 if (!m_Msa) {
253 NCBI_THROW(CBlastSystemException, eOutOfMemory,
254 "Multiple alignment data structure");
255 }
256 for (int i=0;i < kNumCds;i++) {
257 m_Msa[i] = &m_MsaData[i * (int)kQueryLength];
258 }
259
260 // fot each hit
261 for (size_t hit_idx=0;hit_idx < m_Hits.size();hit_idx++) {
262
263 // for each hit segment
264 NON_CONST_ITERATE(vector<CHitSegment*>, it,
265 m_Hits[hit_idx]->GetSegments()) {
266
267 const int kNumQueryColumns
268 = (*it)->m_QueryRange.GetTo() - (*it)->m_QueryRange.GetFrom();
269
270 int q_from = (*it)->m_QueryRange.GetFrom();
271
272 // for each position in the hit segemnt
273 for (int i=0;i < kNumQueryColumns; i++) {
274 // set as aligned and point to data
275 m_Msa[hit_idx][q_from + i].is_aligned = (Uint1)true;
276 m_Msa[hit_idx][q_from + i].data = &(*it)->m_MsaData[i];
277 }
278 }
279 m_Hits[hit_idx]->m_MsaIdx = hit_idx;
280 }
281
282 m_CddData.msa = m_Msa;
283 }
284
285
x_ValidateMsa(void) const286 bool CCddInputData::x_ValidateMsa(void) const
287 {
288 _ASSERT(m_Msa);
289 const int kQueryLength = m_QueryData.size();
290 const int kNumCds = m_Hits.size();
291 const Uint1 kGapChar = AMINOACID_TO_NCBISTDAA[(int)'-'];
292 for (int i=0;i < kNumCds;i++) {
293 _ASSERT(m_Msa[i]);
294 }
295
296 for (int i=0;i < kNumCds;i++) {
297 for (int j=0;j < kQueryLength;j++) {
298
299 if (m_QueryData[i] == kGapChar) {
300 NCBI_THROW(CBlastException, eInvalidArgument,
301 "Query sequence cannot contain gaps");
302 }
303
304 if (m_Msa[i][j].is_aligned) {
305 _ASSERT(m_Msa[i][j].data);
306 const PSICdMsaCellData* data = m_Msa[i][j].data;
307
308 // some domain models have incomplete data and are supposed to
309 // be removed from the database or search results,
310 // this exception checks whether one of these domains
311 // has slipped in
312 if (data->iobsr <= 0.0) {
313 NCBI_THROW(CBlastException, eInvalidArgument,
314 "Zero independent observations in domain model");
315 }
316
317 _ASSERT(data->wfreqs);
318 double s = 0;
319 for (int k=0;k < kAlphabetSize;k++) {
320 if (data->wfreqs[k] < 0.0) {
321 NCBI_THROW(CBlastException, eInvalidArgument,
322 "Negative residue frequency in a domain "
323 "model");
324 }
325 s += data->wfreqs[k];
326 }
327 // some domain models have incomplete data and are supposed to
328 // be removed from the database or search results,
329 // this exception checks whether one of these domains
330 // has slipped in
331 if (fabs(s - 1.0) > 1e-5) {
332 NCBI_THROW(CBlastException, eInvalidArgument,
333 "Domain residue frequencies do not sum to 1");
334 }
335 }
336 }
337 }
338
339 return true;
340 }
341
342
CHit(const CDense_seg & denseg,double evalue)343 CCddInputData::CHit::CHit(const CDense_seg& denseg, double evalue)
344 : m_Evalue(evalue), m_MsaIdx(-1)
345 {
346 const int kNumDims = denseg.GetDim();
347 const int kNumSegments = denseg.GetNumseg();
348
349 _ASSERT(kNumDims == 2);
350
351 m_SubjectId.Reset(denseg.GetIds()[1].GetNonNullPointer());
352
353 const vector<TSignedSeqPos>& starts = denseg.GetStarts();
354 const vector<TSeqPos>& lens = denseg.GetLens();
355
356 TSeqPos query_index = 0;
357 TSeqPos subject_index = 1;
358
359 for (int seg=0;seg < kNumSegments;seg++) {
360 TSeqPos query_offset = starts[query_index];
361 TSeqPos subject_offset = starts[subject_index];
362
363 query_index += kNumDims;
364 subject_index += kNumDims;
365
366 // segments of gaps in query or subject are ignored
367 if (query_offset != GAP_IN_ALIGNMENT
368 && subject_offset != GAP_IN_ALIGNMENT) {
369
370 m_SegmentList.push_back(new CHitSegment(
371 TRange(query_offset, query_offset + lens[seg]),
372 TRange(subject_offset, subject_offset
373 + lens[seg])));
374
375 query_offset += lens[seg];
376 subject_offset += lens[seg];
377 }
378 }
379 }
380
381
CHit(const CHit & hit)382 CCddInputData::CHit::CHit(const CHit& hit)
383 : m_SubjectId(hit.m_SubjectId),
384 m_Evalue(hit.m_Evalue),
385 m_MsaIdx(hit.m_MsaIdx)
386 {
387 m_SegmentList.reserve(hit.m_SegmentList.size());
388 ITERATE (vector<CHitSegment*>, it, hit.m_SegmentList) {
389 m_SegmentList.push_back(new CHitSegment(**it));
390 }
391 }
392
393
~CHit()394 CCddInputData::CHit::~CHit()
395 {
396 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
397 delete *it;
398 }
399 }
400
401
GetLength(void) const402 int CCddInputData::CHit::GetLength(void) const
403 {
404 if (IsEmpty()) {
405 return 0;
406 }
407
408 unsigned int result = 0;
409 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
410 result += (*it)->GetLength();
411 }
412
413 return result;
414 }
415
416
FillData(const CSeqDB & seqdb,const CBlastRPSInfo & profile_data)417 void CCddInputData::CHit::FillData(const CSeqDB& seqdb,
418 const CBlastRPSInfo& profile_data)
419 {
420 // get record index of the CD in the database
421 int db_oid;
422 seqdb.SeqidToOid(*m_SubjectId, db_oid);
423
424 // fill segment data
425 NON_CONST_ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
426 (*it)->FillData(db_oid, profile_data);
427 }
428 }
429
430
x_ValidateHits(void) const431 bool CCddInputData::x_ValidateHits(void) const
432 {
433 ITERATE (vector<CHit*>, it, m_Hits) {
434 _ASSERT(*it);
435 (*it)->Validate();
436 }
437 return true;
438 }
439
440
x_ExtractQueryForPssm(void)441 void CCddInputData::x_ExtractQueryForPssm(void)
442 {
443 // Test our pre-conditions
444 _ASSERT(m_QueryData.size() && m_SeqalignSet.NotEmpty());
445 _ASSERT(m_QueryBioseq.Empty());
446
447 m_QueryBioseq.Reset(new CBioseq);
448
449 // set the sequence id
450 if (!m_SeqalignSet->Get().empty()) {
451 CRef<CSeq_align> aln =
452 const_cast<CSeq_align_set*>(&*m_SeqalignSet)->Set().front();
453 CRef<CSeq_id> query_id(const_cast<CSeq_id*>(&aln->GetSeq_id(0)));
454 m_QueryBioseq->SetId().push_back(query_id);
455 }
456
457 // set required Seq-inst fields
458 m_QueryBioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
459 m_QueryBioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
460 m_QueryBioseq->SetInst().SetLength(GetQueryLength());
461
462 // set the sequence data in ncbistdaa format
463 CNCBIstdaa& seq = m_QueryBioseq->SetInst().SetSeq_data().SetNcbistdaa();
464 seq.Set().reserve(GetQueryLength());
465 for (TSeqPos i = 0; i < GetQueryLength(); i++) {
466 seq.Set().push_back(m_QueryData[i]);
467 }
468
469 if (!m_QueryTitle.empty()) {
470 CRef<CSeqdesc> desc(new CSeqdesc());
471 desc->SetTitle(m_QueryTitle);
472 m_QueryBioseq->SetDescr().Set().push_back(desc);
473 }
474
475 // Test our post-condition
476 _ASSERT(m_QueryBioseq.NotEmpty());
477 }
478
479
Validate(void) const480 bool CCddInputData::CHit::Validate(void) const
481 {
482 _ASSERT(!m_SubjectId.Empty());
483
484 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
485 _ASSERT(*it);
486 (*it)->Validate();
487 }
488
489 return true;
490 }
491
492
IsEmpty(void) const493 bool CCddInputData::CHit::IsEmpty(void) const
494 {
495 if (m_SegmentList.empty()) {
496 return true;
497 }
498
499 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
500 if (!(*it)->IsEmpty()) {
501 return false;
502 }
503 }
504
505 return true;
506 }
507
508
IntersectWith(const vector<TRange> & ranges,CCddInputData::CHit::EApplyTo app)509 void CCddInputData::CHit::IntersectWith(const vector<TRange>& ranges,
510 CCddInputData::CHit::EApplyTo app)
511 {
512 // This function assumes that input ranges and hit segments are sorted
513 // by range and mutually exclusive
514
515 vector<TRange>::const_iterator r_itr = ranges.begin();
516 vector<CHitSegment*>::iterator seg_it = m_SegmentList.begin();
517 vector<CHitSegment*> new_segs;
518 while (seg_it != m_SegmentList.end() && r_itr != ranges.end()) {
519
520 // get current hit segment range
521 const TRange seg_range
522 = (app == eSubject ? (*seg_it)->m_SubjectRange
523 : (*seg_it)->m_QueryRange);
524
525 // skip all ranges strictly below current hit segment
526 while (r_itr != ranges.end() && r_itr->GetTo() < seg_range.GetFrom()) {
527 r_itr++;
528 }
529
530 if (r_itr == ranges.end()) {
531 break;
532 }
533
534 // find intersection with current hit segment
535 TRange intersection(seg_range.IntersectionWith(*r_itr));
536
537 // if intersection is the same as hit segment, do nothing
538 if (intersection == seg_range) {
539 seg_it++;
540 continue;
541 }
542
543 // if intersection is empty, delete current hit segment
544 if (intersection.Empty()) {
545 delete *seg_it;
546 *seg_it = NULL;
547
548 seg_it++;
549 continue;
550 }
551
552 // otherwise find intersections with current hit segment
553 // for each range that intersects with current hit segment
554 while (r_itr != ranges.end() && r_itr->GetFrom() < seg_range.GetTo()) {
555
556 // get and save intersection
557 int d_from = max(seg_range.GetFrom(),
558 r_itr->GetFrom()) - seg_range.GetFrom();
559 int d_to = min(seg_range.GetTo(),
560 r_itr->GetTo()) - seg_range.GetTo();
561
562 CHitSegment* new_seg = new CHitSegment(**seg_it);
563 new_seg->AdjustRanges(d_from, d_to);
564 _ASSERT(!new_seg->IsEmpty());
565 new_segs.push_back(new_seg);
566
567 // move to the next range
568 r_itr++;
569 }
570
571 // current hit segment will be replaced with intersection, hence it
572 // is deleted
573 delete *seg_it;
574 *seg_it = NULL;
575 seg_it++;
576 }
577
578 // each hit segment behind the last input range will have an empty
579 // interesection hence it is removed
580 while (seg_it != m_SegmentList.end()) {
581 delete *seg_it;
582 *seg_it = NULL;
583 seg_it++;
584 }
585
586 // remove empty hit segments, add new intersections and sort the list
587 ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
588 if (*it) {
589 new_segs.push_back(*it);
590 }
591 }
592 sort(new_segs.begin(), new_segs.end(), compare_hitseg_range());
593
594 m_SegmentList.swap(new_segs);
595 }
596
597
IntersectWith(const CCddInputData::CHit & hit,CCddInputData::CHit::EApplyTo app)598 void CCddInputData::CHit::IntersectWith(const CCddInputData::CHit& hit,
599 CCddInputData::CHit::EApplyTo app)
600 {
601 vector<TRange> ranges;
602 ranges.reserve(hit.GetSegments().size());
603 ITERATE (vector<CHitSegment*>, it, hit.GetSegments()) {
604 ranges.push_back(app == eQuery ? (*it)->m_QueryRange
605 : (*it)->m_SubjectRange);
606 }
607
608 sort(ranges.begin(), ranges.end(), compare_range());
609
610 IntersectWith(ranges, app);
611 }
612
613
Subtract(const CHit & hit)614 void CCddInputData::CHit::Subtract(const CHit& hit)
615 {
616 // if either hit is empty than the result is the same as current
617 // object
618 if (IsEmpty() || hit.IsEmpty()) {
619 return;
620 }
621
622 // This function assumes that input ranges and hit segments are sorted
623 // by range and mutually exclusive
624
625 // find alignment start and stop of the hit to be subtracted
626 int from = hit.GetSegments().front()->m_QueryRange.GetFrom();
627 int to = hit.GetSegments().back()->m_QueryRange.GetTo();
628
629 // if there is no overlap between hits, then do nothing
630 if (m_SegmentList.front()->m_QueryRange.GetFrom() >= to
631 || m_SegmentList.back()->m_QueryRange.GetTo() <= from) {
632
633 return;
634 }
635
636 // iterate over segments
637 vector<CHitSegment*>::iterator it = m_SegmentList.begin();
638
639 vector<CHitSegment*> new_segments;
640 new_segments.reserve(m_SegmentList.size());
641
642 // keep all segments that end before the subtracted hits starts
643 // unchanged
644 while (it != m_SegmentList.end() && (*it)->m_QueryRange.GetTo() <= from) {
645
646 new_segments.push_back(*it);
647 ++it;
648 }
649
650 // if all segments end before the subctracted hit starts
651 // or none of the segments overlaps with the subtracted hit,
652 // there is nothing to subtract, exit
653 if (it == m_SegmentList.end() || (*it)->m_QueryRange.GetFrom() > to) {
654 return;
655 }
656
657 // if the current segment covers the whole subtracted hit
658 if ((*it)->m_QueryRange.GetTo() > to) {
659
660 // make two segments for what is to the left and right of
661 // the subtracted hit
662
663 CHitSegment* new_seg;
664
665 if ((*it)->m_QueryRange.GetFrom() < from) {
666
667 new_seg = new CHitSegment(**it);
668
669 // left part
670 int d_to = from - (*it)->m_QueryRange.GetTo();
671 _ASSERT(d_to < 0);
672 (*it)->AdjustRanges(0, d_to);
673 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
674 new_segments.push_back(*it);
675 }
676 else {
677 new_seg = *it;
678 }
679
680 // right part
681 int d_from = to - new_seg->m_QueryRange.GetFrom();
682 _ASSERT(d_from >= 0);
683 new_seg->AdjustRanges(d_from, 0);
684 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
685 new_segments.push_back(new_seg);
686
687 // the following segments do not intersect with subtracted hit
688 ++it;
689 for (;it != m_SegmentList.end();++it) {
690 new_segments.push_back(*it);
691 }
692 }
693 else {
694
695 // if the segment overlaps completely with the subtracted hit,
696 // delete it
697 if ((*it)->m_QueryRange.GetFrom() >= from) {
698 delete *it;
699 *it = NULL;
700 }
701 else {
702
703 // otherwise adjust segment end
704 int d_to = from - (*it)->m_QueryRange.GetTo();
705 _ASSERT(d_to < 0);
706
707 (*it)->AdjustRanges(0, d_to);
708 _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
709 new_segments.push_back(*it);
710 }
711
712 // delete all segments that completely overlap with subtracted hit
713 ++it;
714 while (it != m_SegmentList.end()
715 && (*it)->m_QueryRange.GetTo() <= to) {
716
717 delete *it;
718 *it = NULL;
719
720 ++it;
721 }
722
723 if (it != m_SegmentList.end()) {
724
725 if ((*it)->m_QueryRange.GetFrom() < to) {
726 int d_from = to - (*it)->m_QueryRange.GetFrom();
727 _ASSERT(d_from > 0);
728
729 (*it)->AdjustRanges(d_from, 0);
730 _ASSERT((*it)->m_QueryRange.GetFrom()
731 < (*it)->m_QueryRange.GetTo());
732
733 new_segments.push_back(*it);
734 }
735 else {
736 delete *it;
737 *it = NULL;
738 }
739
740 // keep all segments above subtracted hit
741 ++it;
742 while (it != m_SegmentList.end()) {
743 new_segments.push_back(*it);
744 ++it;
745 }
746 }
747 }
748
749 m_SegmentList.swap(new_segments);
750 }
751
752
FillData(int db_oid,const CBlastRPSInfo & profile_data)753 void CCddInputData::CHitSegment::FillData(int db_oid,
754 const CBlastRPSInfo& profile_data)
755 {
756 PSICdMsaCellData d;
757 d.wfreqs = NULL;
758 d.iobsr = -1.0;
759 m_MsaData.resize(m_QueryRange.GetTo() - m_QueryRange.GetFrom(), d);
760
761 x_FillResidueCounts(db_oid, profile_data);
762 x_FillObservations(db_oid, profile_data);
763 }
764
765
Validate(void) const766 bool CCddInputData::CHitSegment::Validate(void) const
767 {
768 _ASSERT(m_QueryRange.GetFrom() >= 0 && m_QueryRange.GetTo() >= 0);
769 _ASSERT(m_SubjectRange.GetFrom() >= 0 && m_SubjectRange.GetTo() >= 0);
770
771 const int kQueryLength = m_QueryRange.GetTo() - m_QueryRange.GetFrom();
772 const int kSubjectLength = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
773
774 if (kQueryLength != kSubjectLength) {
775 return false;
776 }
777
778 _ASSERT((int)m_WFreqsData.size() == kSubjectLength * kAlphabetSize);
779 _ASSERT((int)m_MsaData.size() == kSubjectLength);
780
781 ITERATE (vector<PSICdMsaCellData>, it, m_MsaData) {
782 _ASSERT(it->wfreqs);
783 }
784
785 return true;
786 }
787
AdjustRanges(int d_from,int d_to)788 void CCddInputData::CHitSegment::AdjustRanges(int d_from, int d_to)
789 {
790 m_QueryRange.SetFrom(m_QueryRange.GetFrom() + d_from);
791 m_QueryRange.SetTo(m_QueryRange.GetTo() + d_to);
792
793 m_SubjectRange.SetFrom(m_SubjectRange.GetFrom() + d_from);
794 m_SubjectRange.SetTo(m_SubjectRange.GetTo() + d_to);
795 }
796
797
IsEmpty(void) const798 bool CCddInputData::CHitSegment::IsEmpty(void) const
799 {
800 return m_QueryRange.GetFrom() > m_QueryRange.GetTo()
801 || m_SubjectRange.GetFrom() > m_SubjectRange.GetTo();
802 }
803
x_FillResidueCounts(int db_oid,const CBlastRPSInfo & profile_data)804 void CCddInputData::CHitSegment::x_FillResidueCounts(int db_oid,
805 const CBlastRPSInfo& profile_data)
806 {
807 _ASSERT(profile_data()->freq_header);
808
809 BlastRPSProfileHeader* header = profile_data()->freq_header;
810 int num_profiles = header->num_profiles;
811
812 _ASSERT(db_oid < num_profiles);
813
814 // Get weighted residue counts for CD
815 const Int4* db_seq_offsets = header->start_offsets;
816 const TFreqs* db_counts =
817 (TFreqs*)(header->start_offsets + num_profiles + 1);
818
819 // extract residue counts
820 const TFreqs* counts = db_counts + db_seq_offsets[db_oid] * kAlphabetSize;
821 int db_seq_length = db_seq_offsets[db_oid + 1] - db_seq_offsets[db_oid];
822
823 // correct seq length for column of zero counts in cdd counts file
824 db_seq_length--;
825 _ASSERT(db_seq_length > 0);
826 _ASSERT(m_SubjectRange.GetTo() <= db_seq_length);
827
828
829 int num_columns = (int)m_MsaData.size();
830 m_WFreqsData.resize(num_columns * kAlphabetSize);
831 for (int i=0;i < num_columns;i++) {
832 m_MsaData[i].wfreqs = &m_WFreqsData[i * kAlphabetSize];
833
834 // column frequencies for a column must sum to 1, but they may not due
835 // to storing in CDD as integers, the difference is distributed equally
836 // among all the non-zero frequencies
837 TFreqs sum_freqs = 0;
838 for (int j=0;j < kAlphabetSize;j++) {
839 sum_freqs +=
840 counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j];
841 }
842
843 for (int j=0;j < kAlphabetSize;j++) {
844 m_MsaData[i].wfreqs[j] =
845 (double)counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j]
846 / (double)sum_freqs;
847 }
848 }
849 }
850
x_FillObservations(int db_oid,const CBlastRPSInfo & profile_data)851 void CCddInputData::CHitSegment::x_FillObservations(int db_oid,
852 const CBlastRPSInfo& profile_data)
853 {
854 // Get effective numbers of independent observations
855
856 _ASSERT(profile_data()->obsr_header);
857
858 BlastRPSProfileHeader* header = profile_data()->obsr_header;
859 int num_profiles = header->num_profiles;
860
861 _ASSERT(db_oid < num_profiles);
862
863 // find poiter to eff number of observations
864 const Int4* offsets = header->start_offsets;
865 const TObsr* data_start
866 = (TObsr*)(header->start_offsets + num_profiles + 1);
867
868 const TObsr* data = data_start + offsets[db_oid];
869 int data_size = offsets[db_oid + 1] - offsets[db_oid];
870
871 // extract effective numbers of obaservations
872 vector<TObsr> obsr;
873 for (int i=0;i < data_size;i+=2) {
874 TObsr val = data[i];
875 Int4 num = (Int4)data[i + 1];
876 _ASSERT(fabs((double)num - data[i + 1]) < 1e-05);
877
878 for (int j=0;j < num;j++) {
879 obsr.push_back(val);
880 }
881 }
882
883 int num_columns = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
884 for (int i=0;i < num_columns;i++) {
885 m_MsaData[i].iobsr =
886 (double)obsr[m_SubjectRange.GetFrom() + i] / kRpsScaleFactor;
887 }
888 }
889
890
891 END_SCOPE(blast)
892 END_NCBI_SCOPE
893
894 /* @} */
895