1 /*  $Id: cdd_pssm_input.cpp 500404 2016-05-04 14:59:01Z camacho $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Greg Boratyn
27  *
28  */
29 
30 /** @file psi_pssm_input.cpp
31  * Implementation of the concrete strategy to obtain PSSM input data for
32  * PSI-BLAST.
33  */
34 #include <ncbi_pch.hpp>
35 
36 // BLAST includes
37 //#include <algo/blast/api/psi_pssm_input.hpp>
38 #include <algo/blast/api/cdd_pssm_input.hpp>
39 #include <algo/blast/api/blast_exception.hpp>
40 #include "../core/blast_psi_priv.h"
41 
42 // Object includes
43 #include <objects/seqalign/Seq_align.hpp>
44 #include <objects/seqalign/Seq_align_set.hpp>
45 #include <objects/seqalign/Dense_seg.hpp>
46 #include <objects/seqalign/Score.hpp>
47 #include <objects/seq/Seq_descr.hpp>
48 
49 // Object manager includes
50 #include <objmgr/scope.hpp>
51 #include <objmgr/seq_vector.hpp>
52 
53 
54 /** @addtogroup AlgoBlast
55  *
56  * @{
57  */
58 
59 BEGIN_NCBI_SCOPE
60 USING_SCOPE(objects);
BEGIN_SCOPE(blast)61 BEGIN_SCOPE(blast)
62 
63 #ifndef GAP_IN_ALIGNMENT
64     /// Representation of GAP in Seq-align
65 #   define GAP_IN_ALIGNMENT     ((Uint4)-1)
66 #endif
67 
68 //////////////////////////////////////////////////////////////////////////////
69 
70 
71 CCddInputData::CCddInputData(const Uint1* query, unsigned int query_length,
72                              CConstRef<objects::CSeq_align_set> seqaligns,
73                              const PSIBlastOptions& opts,
74                              const string& dbname,
75                              const string& matrix_name /* = "BLOSUM62" */,
76                              int gap_existence /* = 0 */,
77                              int gap_extension /* = 0 */,
78                              PSIDiagnosticsRequest* diags /* = NULL */,
79                              const string& query_title /* = "" */)
80     : m_QueryTitle(query_title),
81       m_DbName(dbname),
82       m_SeqalignSet(seqaligns),
83       m_Msa(NULL),
84       m_Opts(opts),
85       m_MatrixName(matrix_name),
86       m_DiagnosticsRequest(diags),
87       m_MinEvalue(-1.0),
88       m_GapExistence(gap_existence),
89       m_GapExtension(gap_extension)
90 {
91     if (!query) {
92         NCBI_THROW(CBlastException, eInvalidArgument, "NULL query");
93     }
94 
95     if (seqaligns.Empty()) {
96         NCBI_THROW(CBlastException, eInvalidArgument, "NULL alignments");
97     }
98 
99     m_QueryData.resize(query_length);
100     memcpy(&m_QueryData[0], query, query_length * sizeof(Uint1));
101 }
102 
103 
~CCddInputData()104 CCddInputData::~CCddInputData()
105 {
106     for (unsigned int i=0;i < m_Hits.size();i++) {
107         delete m_Hits[i];
108     }
109 
110     delete [] m_Msa;
111 }
112 
Process(void)113 void CCddInputData::Process(void)
114 {
115     if (m_MinEvalue > m_Opts.inclusion_ethresh) {
116 
117         NCBI_THROW(CBlastException, eInvalidOptions,
118                    "Minimum RPS-BLAST e-value is larger than the maximum one");
119     }
120 
121     m_CddData.query = &m_QueryData[0];
122 
123     // process primary alignments
124     x_ProcessAlignments(m_MinEvalue, m_Opts.inclusion_ethresh);
125 
126     // remove overlaping mutliple hits to the same CD
127     x_RemoveMultipleCdHits();
128 
129     // this is required by PSSM engine code
130     m_MsaDimensions.query_length = m_QueryData.size();
131     m_MsaDimensions.num_seqs = m_Hits.size();
132     m_CddData.dimensions = &m_MsaDimensions;
133 
134     x_FillHitsData();
135     // this validation has only assertions, no use calling it
136     // for non-debug builds
137     _ASSERT(x_ValidateHits());
138 
139     x_CreateMsa();
140     // the same validation is done on the core level
141     _ASSERT(x_ValidateMsa());
142 
143     // extract query as Bioseq, needed so that query information can be stored
144     // in PssmWithParameters
145     x_ExtractQueryForPssm();
146 
147     _ASSERT(m_MsaDimensions.query_length == m_QueryData.size());
148     _ASSERT(m_MsaDimensions.num_seqs == m_Hits.size());
149 }
150 
151 
x_ProcessAlignments(double min_evalue,double max_evalue)152 void CCddInputData::x_ProcessAlignments(double min_evalue, double max_evalue)
153 {
154     ITERATE (CSeq_align_set::Tdata, it, m_SeqalignSet->Get()) {
155         double evalue;
156         if (!(*it)->GetNamedScore(CSeq_align::eScore_EValue, evalue)) {
157             NCBI_THROW(CBlastException, eInvalidArgument,
158                        "Evalue not found in Seq-align");
159         }
160 
161         if (evalue >= min_evalue && evalue < max_evalue) {
162             m_Hits.push_back(new CHit((*it)->GetSegs().GetDenseg(), evalue));
163         }
164     }
165 }
166 
167 
x_RemoveMultipleCdHits(void)168 void CCddInputData::x_RemoveMultipleCdHits(void)
169 {
170     // if less than 2 hits, do nothing
171     if (m_Hits.size() < 2) {
172         return;
173     }
174 
175     // sort by accession and e-value
176     sort(m_Hits.begin(), m_Hits.end(), compare_hits_by_seqid_eval());
177     vector<CHit*> new_hits;
178     new_hits.reserve(m_Hits.size());
179 
180     new_hits.push_back(m_Hits[0]);
181 
182     vector<CHit*>::iterator it(m_Hits.begin());
183     ++it;
184 
185     // for each hit
186     for (;it != m_Hits.end();++it) {
187 
188         // for each kept hit with the same subject accession as it and better
189         // e-value
190         for (int i=new_hits.size() - 1;i >= 0
191                  && (*it)->m_SubjectId->Match(*new_hits[i]->m_SubjectId);i--) {
192 
193             const CHit* kept_hit = new_hits[i];
194 
195             // find intersection between hits on subjects,
196             // intersection needs to have query range from kept_hit for
197             // later subtraction
198             CHit intersection(*kept_hit);
199             intersection.IntersectWith(**it, CHit::eSubject);
200 
201             // subtract the subject intersection using query ranges,
202             // hits to different ranges of the same CD are treated as
203             // different hits
204             (*it)->Subtract(intersection);
205 
206             if ((*it)->IsEmpty()) {
207                 delete *it;
208                 *it = NULL;
209                 break;
210             }
211         }
212         if (*it) {
213             new_hits.push_back(*it);
214         }
215 
216     }
217     m_Hits.swap(new_hits);
218 }
219 
220 
x_FillHitsData(void)221 void CCddInputData::x_FillHitsData(void)
222 {
223     // initialize seqdb
224     CSeqDB seqdb(m_DbName, CSeqDB::eProtein);
225 
226     // load residue counts from file
227     CRef<CBlastRPSInfo> profile_data(
228                   new CBlastRPSInfo(m_DbName, CBlastRPSInfo::fDeltaBlast));
229 
230     // Set data for each hit
231     NON_CONST_ITERATE (vector<CHit*>, it, m_Hits) {
232 
233         _ASSERT(*it);
234 
235         (*it)->FillData(seqdb, *profile_data);
236     }
237 }
238 
239 
x_CreateMsa(void)240 void CCddInputData::x_CreateMsa(void)
241 {
242     const int kQueryLength = m_QueryData.size();
243     const int kNumCds = m_Hits.size();
244 
245     // initialize msa map
246     PSICdMsaCell cell;
247     cell.is_aligned = (Uint1)false;
248     cell.data = NULL;
249     // allocate memory for num cdds + query
250     m_MsaData.resize(kQueryLength * (kNumCds), cell);
251     m_Msa = new PSICdMsaCell*[kNumCds];
252     if (!m_Msa) {
253         NCBI_THROW(CBlastSystemException, eOutOfMemory,
254                    "Multiple alignment data structure");
255     }
256     for (int i=0;i < kNumCds;i++) {
257         m_Msa[i] = &m_MsaData[i * (int)kQueryLength];
258     }
259 
260     // fot each hit
261     for (size_t hit_idx=0;hit_idx < m_Hits.size();hit_idx++) {
262 
263         // for each hit segment
264         NON_CONST_ITERATE(vector<CHitSegment*>, it,
265                           m_Hits[hit_idx]->GetSegments()) {
266 
267             const int kNumQueryColumns
268                 = (*it)->m_QueryRange.GetTo() - (*it)->m_QueryRange.GetFrom();
269 
270             int q_from = (*it)->m_QueryRange.GetFrom();
271 
272             // for each position in the hit segemnt
273             for (int i=0;i < kNumQueryColumns; i++) {
274                 // set as aligned and point to data
275                 m_Msa[hit_idx][q_from + i].is_aligned = (Uint1)true;
276                 m_Msa[hit_idx][q_from + i].data = &(*it)->m_MsaData[i];
277             }
278         }
279         m_Hits[hit_idx]->m_MsaIdx = hit_idx;
280     }
281 
282     m_CddData.msa = m_Msa;
283 }
284 
285 
x_ValidateMsa(void) const286 bool CCddInputData::x_ValidateMsa(void) const
287 {
288     _ASSERT(m_Msa);
289     const int kQueryLength = m_QueryData.size();
290     const int kNumCds = m_Hits.size();
291     const Uint1 kGapChar = AMINOACID_TO_NCBISTDAA[(int)'-'];
292     for (int i=0;i < kNumCds;i++) {
293         _ASSERT(m_Msa[i]);
294     }
295 
296     for (int i=0;i < kNumCds;i++) {
297         for (int j=0;j < kQueryLength;j++) {
298 
299             if (m_QueryData[i] == kGapChar) {
300                 NCBI_THROW(CBlastException, eInvalidArgument,
301                            "Query sequence cannot contain gaps");
302             }
303 
304             if (m_Msa[i][j].is_aligned) {
305                 _ASSERT(m_Msa[i][j].data);
306                 const PSICdMsaCellData* data = m_Msa[i][j].data;
307 
308                 // some domain models have incomplete data and are supposed to
309                 // be removed from the database or search results,
310                 // this exception checks whether one of these domains
311                 // has slipped in
312                 if (data->iobsr <= 0.0) {
313                     NCBI_THROW(CBlastException, eInvalidArgument,
314                                "Zero independent observations in domain model");
315                 }
316 
317                 _ASSERT(data->wfreqs);
318                 double s = 0;
319                 for (int k=0;k < kAlphabetSize;k++) {
320                     if (data->wfreqs[k] < 0.0) {
321                         NCBI_THROW(CBlastException, eInvalidArgument,
322                                    "Negative residue frequency in a domain "
323                                    "model");
324                     }
325                     s += data->wfreqs[k];
326                 }
327                 // some domain models have incomplete data and are supposed to
328                 // be removed from the database or search results,
329                 // this exception checks whether one of these domains
330                 // has slipped in
331                 if (fabs(s - 1.0) > 1e-5) {
332                     NCBI_THROW(CBlastException, eInvalidArgument,
333                                "Domain residue frequencies do not sum to 1");
334                 }
335             }
336         }
337     }
338 
339     return true;
340 }
341 
342 
CHit(const CDense_seg & denseg,double evalue)343 CCddInputData::CHit::CHit(const CDense_seg& denseg, double evalue)
344     : m_Evalue(evalue), m_MsaIdx(-1)
345 {
346     const int kNumDims = denseg.GetDim();
347     const int kNumSegments = denseg.GetNumseg();
348 
349     _ASSERT(kNumDims == 2);
350 
351     m_SubjectId.Reset(denseg.GetIds()[1].GetNonNullPointer());
352 
353     const vector<TSignedSeqPos>& starts = denseg.GetStarts();
354     const vector<TSeqPos>& lens = denseg.GetLens();
355 
356     TSeqPos query_index = 0;
357     TSeqPos subject_index = 1;
358 
359     for (int seg=0;seg < kNumSegments;seg++) {
360         TSeqPos query_offset = starts[query_index];
361         TSeqPos subject_offset = starts[subject_index];
362 
363         query_index += kNumDims;
364         subject_index += kNumDims;
365 
366         // segments of gaps in query or subject are ignored
367          if (query_offset != GAP_IN_ALIGNMENT
368             && subject_offset != GAP_IN_ALIGNMENT) {
369 
370             m_SegmentList.push_back(new CHitSegment(
371                       TRange(query_offset, query_offset + lens[seg]),
372                       TRange(subject_offset, subject_offset
373                                      + lens[seg])));
374 
375             query_offset += lens[seg];
376             subject_offset += lens[seg];
377         }
378     }
379 }
380 
381 
CHit(const CHit & hit)382 CCddInputData::CHit::CHit(const CHit& hit)
383     : m_SubjectId(hit.m_SubjectId),
384       m_Evalue(hit.m_Evalue),
385       m_MsaIdx(hit.m_MsaIdx)
386 {
387     m_SegmentList.reserve(hit.m_SegmentList.size());
388     ITERATE (vector<CHitSegment*>, it, hit.m_SegmentList) {
389         m_SegmentList.push_back(new CHitSegment(**it));
390     }
391 }
392 
393 
~CHit()394 CCddInputData::CHit::~CHit()
395 {
396     ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
397         delete *it;
398     }
399 }
400 
401 
GetLength(void) const402 int CCddInputData::CHit::GetLength(void) const
403 {
404     if (IsEmpty()) {
405         return 0;
406     }
407 
408     unsigned int result = 0;
409     ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
410         result += (*it)->GetLength();
411     }
412 
413     return result;
414 }
415 
416 
FillData(const CSeqDB & seqdb,const CBlastRPSInfo & profile_data)417 void CCddInputData::CHit::FillData(const CSeqDB& seqdb,
418                                    const CBlastRPSInfo& profile_data)
419 {
420     // get record index of the CD in the database
421     int db_oid;
422     seqdb.SeqidToOid(*m_SubjectId, db_oid);
423 
424     // fill segment data
425     NON_CONST_ITERATE(vector<CHitSegment*>, it, m_SegmentList) {
426         (*it)->FillData(db_oid, profile_data);
427     }
428 }
429 
430 
x_ValidateHits(void) const431 bool CCddInputData::x_ValidateHits(void) const
432 {
433     ITERATE (vector<CHit*>, it, m_Hits) {
434         _ASSERT(*it);
435         (*it)->Validate();
436     }
437     return true;
438 }
439 
440 
x_ExtractQueryForPssm(void)441 void CCddInputData::x_ExtractQueryForPssm(void)
442 {
443     // Test our pre-conditions
444     _ASSERT(m_QueryData.size() && m_SeqalignSet.NotEmpty());
445     _ASSERT(m_QueryBioseq.Empty());
446 
447     m_QueryBioseq.Reset(new CBioseq);
448 
449     // set the sequence id
450     if (!m_SeqalignSet->Get().empty()) {
451         CRef<CSeq_align> aln =
452             const_cast<CSeq_align_set*>(&*m_SeqalignSet)->Set().front();
453         CRef<CSeq_id> query_id(const_cast<CSeq_id*>(&aln->GetSeq_id(0)));
454         m_QueryBioseq->SetId().push_back(query_id);
455     }
456 
457     // set required Seq-inst fields
458     m_QueryBioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
459     m_QueryBioseq->SetInst().SetMol(CSeq_inst::eMol_aa);
460     m_QueryBioseq->SetInst().SetLength(GetQueryLength());
461 
462     // set the sequence data in ncbistdaa format
463     CNCBIstdaa& seq = m_QueryBioseq->SetInst().SetSeq_data().SetNcbistdaa();
464     seq.Set().reserve(GetQueryLength());
465     for (TSeqPos i = 0; i < GetQueryLength(); i++) {
466         seq.Set().push_back(m_QueryData[i]);
467     }
468 
469     if (!m_QueryTitle.empty()) {
470         CRef<CSeqdesc> desc(new CSeqdesc());
471         desc->SetTitle(m_QueryTitle);
472         m_QueryBioseq->SetDescr().Set().push_back(desc);
473     }
474 
475     // Test our post-condition
476     _ASSERT(m_QueryBioseq.NotEmpty());
477 }
478 
479 
Validate(void) const480 bool CCddInputData::CHit::Validate(void) const
481 {
482     _ASSERT(!m_SubjectId.Empty());
483 
484     ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
485         _ASSERT(*it);
486         (*it)->Validate();
487     }
488 
489     return true;
490 }
491 
492 
IsEmpty(void) const493 bool CCddInputData::CHit::IsEmpty(void) const
494 {
495     if (m_SegmentList.empty()) {
496         return true;
497     }
498 
499     ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
500         if (!(*it)->IsEmpty()) {
501             return false;
502         }
503     }
504 
505     return true;
506 }
507 
508 
IntersectWith(const vector<TRange> & ranges,CCddInputData::CHit::EApplyTo app)509 void CCddInputData::CHit::IntersectWith(const vector<TRange>& ranges,
510                                         CCddInputData::CHit::EApplyTo app)
511 {
512     // This function assumes that input ranges and hit segments are sorted
513     // by range and mutually exclusive
514 
515     vector<TRange>::const_iterator r_itr = ranges.begin();
516     vector<CHitSegment*>::iterator seg_it = m_SegmentList.begin();
517     vector<CHitSegment*> new_segs;
518     while (seg_it != m_SegmentList.end() && r_itr != ranges.end()) {
519 
520         // get current hit segment range
521         const TRange seg_range
522             = (app == eSubject ? (*seg_it)->m_SubjectRange
523                : (*seg_it)->m_QueryRange);
524 
525         // skip all ranges strictly below current hit segment
526         while (r_itr != ranges.end() && r_itr->GetTo() < seg_range.GetFrom()) {
527             r_itr++;
528         }
529 
530         if (r_itr == ranges.end()) {
531             break;
532         }
533 
534         // find intersection with current hit segment
535         TRange intersection(seg_range.IntersectionWith(*r_itr));
536 
537         // if intersection is the same as hit segment, do nothing
538         if (intersection == seg_range) {
539             seg_it++;
540             continue;
541         }
542 
543         // if intersection is empty, delete current hit segment
544         if (intersection.Empty()) {
545             delete *seg_it;
546             *seg_it = NULL;
547 
548             seg_it++;
549             continue;
550         }
551 
552         // otherwise find intersections with current hit segment
553         // for each range that intersects with current hit segment
554         while (r_itr != ranges.end() && r_itr->GetFrom() < seg_range.GetTo()) {
555 
556             // get and save intersection
557             int d_from = max(seg_range.GetFrom(),
558                              r_itr->GetFrom()) - seg_range.GetFrom();
559             int d_to = min(seg_range.GetTo(),
560                            r_itr->GetTo()) - seg_range.GetTo();
561 
562             CHitSegment* new_seg = new CHitSegment(**seg_it);
563             new_seg->AdjustRanges(d_from, d_to);
564             _ASSERT(!new_seg->IsEmpty());
565             new_segs.push_back(new_seg);
566 
567             // move to the next range
568             r_itr++;
569         }
570 
571         // current hit segment will be replaced with intersection, hence it
572         // is deleted
573         delete *seg_it;
574         *seg_it = NULL;
575         seg_it++;
576     }
577 
578     // each hit segment behind the last input range will have an empty
579     // interesection hence it is removed
580     while (seg_it != m_SegmentList.end()) {
581         delete *seg_it;
582         *seg_it = NULL;
583         seg_it++;
584     }
585 
586     // remove empty hit segments, add new intersections and sort the list
587     ITERATE (vector<CHitSegment*>, it, m_SegmentList) {
588         if (*it) {
589             new_segs.push_back(*it);
590         }
591     }
592     sort(new_segs.begin(), new_segs.end(), compare_hitseg_range());
593 
594     m_SegmentList.swap(new_segs);
595 }
596 
597 
IntersectWith(const CCddInputData::CHit & hit,CCddInputData::CHit::EApplyTo app)598 void CCddInputData::CHit::IntersectWith(const CCddInputData::CHit& hit,
599                                         CCddInputData::CHit::EApplyTo app)
600 {
601     vector<TRange> ranges;
602     ranges.reserve(hit.GetSegments().size());
603     ITERATE (vector<CHitSegment*>, it, hit.GetSegments()) {
604         ranges.push_back(app == eQuery ? (*it)->m_QueryRange
605                          : (*it)->m_SubjectRange);
606     }
607 
608     sort(ranges.begin(), ranges.end(), compare_range());
609 
610     IntersectWith(ranges, app);
611 }
612 
613 
Subtract(const CHit & hit)614 void CCddInputData::CHit::Subtract(const CHit& hit)
615 {
616     // if either hit is empty than the result is the same as current
617     // object
618     if (IsEmpty() || hit.IsEmpty()) {
619         return;
620     }
621 
622     // This function assumes that input ranges and hit segments are sorted
623     // by range and mutually exclusive
624 
625     // find alignment start and stop of the hit to be subtracted
626     int from = hit.GetSegments().front()->m_QueryRange.GetFrom();
627     int to = hit.GetSegments().back()->m_QueryRange.GetTo();
628 
629     // if there is no overlap between hits, then do nothing
630     if (m_SegmentList.front()->m_QueryRange.GetFrom() >= to
631         || m_SegmentList.back()->m_QueryRange.GetTo() <= from) {
632 
633         return;
634     }
635 
636     // iterate over segments
637     vector<CHitSegment*>::iterator it = m_SegmentList.begin();
638 
639     vector<CHitSegment*> new_segments;
640     new_segments.reserve(m_SegmentList.size());
641 
642     // keep all segments that end before the subtracted hits starts
643     // unchanged
644     while (it != m_SegmentList.end() && (*it)->m_QueryRange.GetTo() <= from) {
645 
646         new_segments.push_back(*it);
647         ++it;
648     }
649 
650     // if all segments end before the subctracted hit starts
651     // or none of the segments overlaps with the subtracted hit,
652     // there is nothing to subtract, exit
653     if (it == m_SegmentList.end() || (*it)->m_QueryRange.GetFrom() > to) {
654         return;
655     }
656 
657     // if the current segment covers the whole subtracted hit
658     if ((*it)->m_QueryRange.GetTo() > to) {
659 
660         // make two segments for what is to the left and right of
661         // the subtracted hit
662 
663         CHitSegment* new_seg;
664 
665         if ((*it)->m_QueryRange.GetFrom() < from) {
666 
667             new_seg = new CHitSegment(**it);
668 
669             // left part
670             int d_to = from - (*it)->m_QueryRange.GetTo();
671             _ASSERT(d_to < 0);
672             (*it)->AdjustRanges(0, d_to);
673             _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
674             new_segments.push_back(*it);
675         }
676         else {
677             new_seg = *it;
678         }
679 
680         // right part
681         int d_from = to - new_seg->m_QueryRange.GetFrom();
682         _ASSERT(d_from >= 0);
683         new_seg->AdjustRanges(d_from, 0);
684         _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
685         new_segments.push_back(new_seg);
686 
687         // the following segments do not intersect with subtracted hit
688         ++it;
689         for (;it != m_SegmentList.end();++it) {
690             new_segments.push_back(*it);
691         }
692     }
693     else {
694 
695         // if the segment overlaps completely with the subtracted hit,
696         // delete it
697         if ((*it)->m_QueryRange.GetFrom() >= from) {
698             delete *it;
699             *it = NULL;
700         }
701         else {
702 
703             // otherwise adjust segment end
704             int d_to = from - (*it)->m_QueryRange.GetTo();
705             _ASSERT(d_to < 0);
706 
707             (*it)->AdjustRanges(0, d_to);
708             _ASSERT((*it)->m_QueryRange.GetFrom() < (*it)->m_QueryRange.GetTo());
709             new_segments.push_back(*it);
710         }
711 
712         // delete all segments that completely overlap with subtracted hit
713         ++it;
714         while (it != m_SegmentList.end()
715                && (*it)->m_QueryRange.GetTo() <= to) {
716 
717             delete *it;
718             *it = NULL;
719 
720             ++it;
721         }
722 
723         if (it != m_SegmentList.end()) {
724 
725             if ((*it)->m_QueryRange.GetFrom() < to) {
726                 int d_from = to - (*it)->m_QueryRange.GetFrom();
727                 _ASSERT(d_from > 0);
728 
729                 (*it)->AdjustRanges(d_from, 0);
730                 _ASSERT((*it)->m_QueryRange.GetFrom()
731                         < (*it)->m_QueryRange.GetTo());
732 
733                 new_segments.push_back(*it);
734             }
735             else {
736                 delete *it;
737                 *it = NULL;
738             }
739 
740             // keep all segments above subtracted hit
741             ++it;
742             while (it != m_SegmentList.end()) {
743                 new_segments.push_back(*it);
744                 ++it;
745             }
746         }
747     }
748 
749     m_SegmentList.swap(new_segments);
750 }
751 
752 
FillData(int db_oid,const CBlastRPSInfo & profile_data)753 void CCddInputData::CHitSegment::FillData(int db_oid,
754                                           const CBlastRPSInfo& profile_data)
755 {
756     PSICdMsaCellData d;
757     d.wfreqs = NULL;
758     d.iobsr = -1.0;
759     m_MsaData.resize(m_QueryRange.GetTo() - m_QueryRange.GetFrom(), d);
760 
761     x_FillResidueCounts(db_oid, profile_data);
762     x_FillObservations(db_oid, profile_data);
763 }
764 
765 
Validate(void) const766 bool CCddInputData::CHitSegment::Validate(void) const
767 {
768     _ASSERT(m_QueryRange.GetFrom() >= 0 && m_QueryRange.GetTo() >= 0);
769     _ASSERT(m_SubjectRange.GetFrom() >= 0 && m_SubjectRange.GetTo() >= 0);
770 
771     const int kQueryLength = m_QueryRange.GetTo() - m_QueryRange.GetFrom();
772     const int kSubjectLength = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
773 
774     if (kQueryLength != kSubjectLength) {
775         return false;
776     }
777 
778     _ASSERT((int)m_WFreqsData.size() == kSubjectLength * kAlphabetSize);
779     _ASSERT((int)m_MsaData.size() == kSubjectLength);
780 
781     ITERATE (vector<PSICdMsaCellData>, it, m_MsaData) {
782         _ASSERT(it->wfreqs);
783     }
784 
785     return true;
786 }
787 
AdjustRanges(int d_from,int d_to)788 void CCddInputData::CHitSegment::AdjustRanges(int d_from, int d_to)
789 {
790     m_QueryRange.SetFrom(m_QueryRange.GetFrom() + d_from);
791     m_QueryRange.SetTo(m_QueryRange.GetTo() + d_to);
792 
793     m_SubjectRange.SetFrom(m_SubjectRange.GetFrom() + d_from);
794     m_SubjectRange.SetTo(m_SubjectRange.GetTo() + d_to);
795 }
796 
797 
IsEmpty(void) const798 bool CCddInputData::CHitSegment::IsEmpty(void) const
799 {
800     return m_QueryRange.GetFrom() > m_QueryRange.GetTo()
801         || m_SubjectRange.GetFrom() > m_SubjectRange.GetTo();
802 }
803 
x_FillResidueCounts(int db_oid,const CBlastRPSInfo & profile_data)804 void CCddInputData::CHitSegment::x_FillResidueCounts(int db_oid,
805                                      const CBlastRPSInfo& profile_data)
806 {
807     _ASSERT(profile_data()->freq_header);
808 
809     BlastRPSProfileHeader* header = profile_data()->freq_header;
810     int num_profiles = header->num_profiles;
811 
812     _ASSERT(db_oid < num_profiles);
813 
814     // Get weighted residue counts for CD
815     const Int4* db_seq_offsets = header->start_offsets;
816     const TFreqs* db_counts =
817         (TFreqs*)(header->start_offsets + num_profiles + 1);
818 
819     // extract residue counts
820     const TFreqs* counts = db_counts + db_seq_offsets[db_oid] * kAlphabetSize;
821     int db_seq_length = db_seq_offsets[db_oid + 1] - db_seq_offsets[db_oid];
822 
823     // correct seq length for column of zero counts in cdd counts file
824     db_seq_length--;
825     _ASSERT(db_seq_length > 0);
826     _ASSERT(m_SubjectRange.GetTo() <= db_seq_length);
827 
828 
829     int num_columns = (int)m_MsaData.size();
830     m_WFreqsData.resize(num_columns * kAlphabetSize);
831     for (int i=0;i < num_columns;i++) {
832         m_MsaData[i].wfreqs = &m_WFreqsData[i * kAlphabetSize];
833 
834         // column frequencies for a column must sum to 1, but they may not due
835         // to storing in CDD as integers, the difference is distributed equally
836         // among all the non-zero frequencies
837         TFreqs sum_freqs = 0;
838         for (int j=0;j < kAlphabetSize;j++) {
839             sum_freqs +=
840                 counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j];
841         }
842 
843         for (int j=0;j < kAlphabetSize;j++) {
844             m_MsaData[i].wfreqs[j] =
845                 (double)counts[(m_SubjectRange.GetFrom() + i) * kAlphabetSize + j]
846                 / (double)sum_freqs;
847         }
848     }
849 }
850 
x_FillObservations(int db_oid,const CBlastRPSInfo & profile_data)851 void CCddInputData::CHitSegment::x_FillObservations(int db_oid,
852                                             const CBlastRPSInfo& profile_data)
853 {
854     // Get effective numbers of independent observations
855 
856     _ASSERT(profile_data()->obsr_header);
857 
858     BlastRPSProfileHeader* header = profile_data()->obsr_header;
859     int num_profiles = header->num_profiles;
860 
861     _ASSERT(db_oid < num_profiles);
862 
863     // find poiter to eff number of observations
864     const Int4* offsets = header->start_offsets;
865     const TObsr* data_start
866         = (TObsr*)(header->start_offsets + num_profiles + 1);
867 
868     const TObsr* data = data_start + offsets[db_oid];
869     int data_size = offsets[db_oid + 1] - offsets[db_oid];
870 
871     // extract effective numbers of obaservations
872     vector<TObsr> obsr;
873     for (int i=0;i < data_size;i+=2) {
874         TObsr val = data[i];
875         Int4 num = (Int4)data[i + 1];
876         _ASSERT(fabs((double)num - data[i + 1]) < 1e-05);
877 
878         for (int j=0;j < num;j++) {
879             obsr.push_back(val);
880         }
881     }
882 
883     int num_columns = m_SubjectRange.GetTo() - m_SubjectRange.GetFrom();
884     for (int i=0;i < num_columns;i++) {
885         m_MsaData[i].iobsr =
886             (double)obsr[m_SubjectRange.GetFrom() + i] / kRpsScaleFactor;
887     }
888 }
889 
890 
891 END_SCOPE(blast)
892 END_NCBI_SCOPE
893 
894 /* @} */
895