1 // $Id: dlcell.cpp,v 1.43 2012/05/22 18:17:01 ewalkup Exp $
2 
3 /*
4   Copyright 2002  Peter Beerli, Mary Kuhner, Jon Yamato and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 // This file contains the implementation code for the data-likelihood
12 // storage object.
13 
14 #include <cassert>
15 #include <cmath>
16 #include <cstring>
17 #include <iostream>                     //debug
18 
19 #include "local_build.h"
20 
21 #include "dlcell.h"
22 #include "dlmodel.h"
23 #include "errhandling.h"
24 #include "definitions.h"
25 #include "registry.h"
26 #include "cellmanager.h"
27 #include "stringx.h"                    // for ToString() use in debug function DLsToString()
28 
29 #ifdef DMALLOC_FUNC_CHECK
30 #include "/usr/local/include/dmalloc.h"
31 #endif
32 
33 // this turns on the data likelihood details for testing
34 //#define DATA_LIKELIHOOD_DETAILS
35 
36 using namespace std;
37 
38 //------------------------------------------------------------------------------------
39 // DLCell
40 //------------------------------------------------------------------------------------
41 
DLCell(long markers,long cats,long bins)42 DLCell::DLCell(long markers, long cats, long bins)
43     : Cell(),
44       m_nmarkers(markers),
45       m_ncats(cats),
46       m_nbins(bins),
47       m_norms(markers, 0.0)
48 {
49     // obtain an internal array from the free store
50     m_identifier.first = m_nmarkers;
51     m_identifier.second = m_ncats;
52     m_identifier.third = m_nbins;
53 
54     m_DLs = registry.GetCellManager().GetArray(m_identifier, *this);
55 } // DLCell constructor
56 
57 //------------------------------------------------------------------------------------
58 
~DLCell()59 DLCell::~DLCell()
60 {
61     // return the internal array to the free store
62     registry.GetCellManager().FreeArray(m_identifier, m_DLs);
63     m_DLs = NULL;
64 } // DLCell destructor
65 
66 //------------------------------------------------------------------------------------
67 
Copy() const68 Cell* DLCell::Copy() const
69 {
70     Cell* pCell = Clone();
71     pCell->CopyInitialize(*this);
72     return pCell;
73 
74 } // Copy
75 
76 //------------------------------------------------------------------------------------
77 
EmptyCell()78 void DLCell::EmptyCell()
79 {
80     for (long posn = 0; posn < m_nmarkers; ++posn)
81     {
82         for (long cat = 0; cat < m_ncats; ++cat)
83         {
84             for (long base = 0; base < m_nbins; ++base)
85             {
86                 m_DLs[posn][cat][base] = 0;
87             }
88         }
89     }
90 }
91 
92 //------------------------------------------------------------------------------------
93 
CopyInitialize(const Cell & src)94 void DLCell::CopyInitialize(const Cell& src)
95 {
96 
97     const DLCell& srcell = dynamic_cast<const DLCell&>(src);
98 
99     assert(m_nmarkers == srcell.m_nmarkers);
100     assert(m_ncats == srcell.m_ncats);
101     assert(m_nbins == srcell.m_nbins);
102 
103 
104     m_norms = srcell.m_norms;
105     // m_nmarkers+1 because there is an extra cell at the end, to
106     // allow an STL-like interface where one past the end is a valid
107     // address
108 
109     long arraySize = (m_nmarkers+1) * m_ncats * m_nbins;
110     //  cout << "arraysize=" << arraySize << endl;
111     // memcpy for speed--be careful!
112     memcpy(m_DLs[0][0], srcell.m_DLs[0][0], arraySize*sizeof(double));
113 
114 } // CopyInitialize
115 
116 //------------------------------------------------------------------------------------
117 // Debugging function.
118 
IsValidPosition(long pos) const119 bool DLCell::IsValidPosition(long pos) const
120 {
121     // for some purposes, pos == m_nmarkers is in fact valid; this
122     // allows "one past the end" logic emulating the STL
123 
124     if (0 <= pos && pos <= m_nmarkers) return true;
125     return false;
126 } // IsValidPosition
127 
128 //------------------------------------------------------------------------------------
129 // Make a datalikelihood array appropriately sized for this Cell
130 
MakeArray()131 cellarray DLCell::MakeArray()
132 {
133     long len = m_nmarkers + 1;
134     // provide a STL like interface into the m_DLs array.
135     // (i.e. provides a legal reference one past the normal end
136     // of the array)
137 
138     // set up a 3-dimensional array in which all positions are
139     // contiguous, so that memcpy can be used on it (a speed
140     // optimization).
141 
142     long site, category;
143     cellarray dl;
144 
145     dl = new double**[len];
146 
147     dl[0] = new double*[len * m_ncats];
148     for (site = 0; site < len; ++site)
149     {
150         dl[site] = dl[0] + site * m_ncats;
151     }
152 
153     dl[0][0] = new double[len * m_ncats * m_nbins];
154     for (site = 0; site < len; ++site)
155     {
156         for (category = 0; category < m_ncats; ++category)
157         {
158             dl[site][category] = dl[0][0] + site * m_ncats * m_nbins + category * m_nbins;
159         }
160     }
161 
162     return dl;
163 
164 } // MakeArray
165 
166 //------------------------------------------------------------------------------------
167 
SwapDLs(Cell_ptr othercell,long pos)168 void DLCell::SwapDLs(Cell_ptr othercell, long pos)
169 {
170     // This creates a 2D array (categories x bins) and uses it to
171     // swap the information for a single marker between two DLCells.
172     // All bets are off if they are not the same type!
173 
174     // WARNING -- Not exception safe due to raw new.
175 
176     // take pointers to the two things to be swapped
177     double** otherDLs = othercell->GetSiteDLs(pos);
178     double** myDLs = GetSiteDLs(pos);
179 
180     // create temporary storage for the swap
181     double** newDLs = new double* [m_ncats];
182     newDLs[0] = new double [m_ncats * m_nbins];
183     long cat;
184     for(cat = 1; cat < m_ncats; ++cat)
185         newDLs[cat] = newDLs[0] + cat * m_nbins;
186 
187     // memcpy for speed
188     memcpy(newDLs[0], myDLs[0], m_ncats * m_nbins * sizeof(double));
189 
190     // swap
191     SetSiteDLs(pos, otherDLs);
192     othercell->SetSiteDLs(pos, newDLs);
193 
194     // release temporary storage
195     delete [] newDLs[0];
196     delete [] newDLs;
197 
198 } // SwapDLs
199 
200 //------------------------------------------------------------------------------------
201 
SumMarkers(long startpos,long endpos,bool normalize)202 void DLCell::SumMarkers(long startpos, long endpos, bool normalize)
203 {
204     long cat, bin, pos;
205 
206     if (normalize)
207     {
208 
209         for (cat = 0; cat < m_ncats; ++cat)
210         {
211             for (bin = 0; bin < m_nbins; ++bin)
212             {
213                 double result = 0.0;
214                 for (pos = startpos; pos != endpos; ++pos)
215                 {
216                     double markerval = log(m_DLs[pos][cat][bin]) + GetNorms(pos);
217                     if (markerval > EXPMIN) result += exp(markerval);
218                 }
219 #ifdef DATA_LIKELIHOOD_DETAILS
220                 cout<< "SumMarkers-normalized cat:" << cat
221                     << " bin:" << bin
222                     << " result:" << result
223                     <<endl;
224 #endif
225                 if (result == 0.0)   // never found a good value?
226                     m_DLs[endpos][cat][bin] = exp(EXPMIN);
227                 else
228                     m_DLs[endpos][cat][bin] = result;
229             }
230         }
231 
232         // renormalize and re-set norms
233         double newnorm = Normalize(m_DLs[endpos]);
234         SetNorms(newnorm, endpos);
235 
236     }
237     else                                // no normalization
238     {
239 
240         for (cat = 0; cat < m_ncats; ++cat)
241         {
242             for (bin = 0; bin < m_nbins; ++bin)
243             {
244                 double result = 0.0;
245                 for (pos = startpos; pos != endpos; ++pos)
246                 {
247                     result += m_DLs[pos][cat][bin];
248                 }
249 #ifdef DATA_LIKELIHOOD_DETAILS
250                 cout<< "SumMarkers-not normalized cat:" << cat
251                     << " bin:" << bin
252                     << " result:" << result
253                     <<endl;
254 #endif
255 
256                 m_DLs[endpos][cat][bin] = result;
257             }
258         }
259     }
260 
261 } // SumMarkers
262 
263 //------------------------------------------------------------------------------------
264 
IsSameAs(const Cell_ptr othercell,long pos) const265 bool DLCell::IsSameAs(const Cell_ptr othercell, long pos) const
266 {
267     double** otherDLs = othercell->GetSiteDLs(pos);
268     double** myDLs = GetSiteDLs(pos);
269     long cat, bin;
270     for(cat = 0; cat < m_ncats; ++cat)
271         for (bin = 0; bin < m_nbins; ++bin)
272             if (myDLs[cat][bin] != otherDLs[cat][bin]) return false;
273 
274     return true;
275 } // DLCell::IsSameAs
276 
277 //------------------------------------------------------------------------------------
278 
DiffersFrom(Cell_ptr othercell) const279 long DLCell::DiffersFrom(Cell_ptr othercell) const
280 {
281     long marker;
282     for(marker = 0; marker < m_nmarkers; ++marker)
283         if (!IsSameAs(othercell,marker)) return marker;
284 
285     return FLAGLONG;
286 
287 } // DLCell::DiffersFrom
288 
289 //------------------------------------------------------------------------------------
290 
SetAllCategoriesTo(DoubleVec1d & state,long posn)291 void DLCell::SetAllCategoriesTo(DoubleVec1d& state, long posn)
292 {
293     for (long cat = 0; cat < m_ncats; cat++)
294     {
295         for (unsigned long nstate=0; nstate < state.size(); nstate++)
296         {
297             m_DLs[posn][cat][nstate] = state[nstate];
298         }
299     }
300 }
301 
302 //------------------------------------------------------------------------------------
303 
GetStateFor(long posn,long cat) const304 DoubleVec1d DLCell::GetStateFor(long posn, long cat) const
305 {
306     DoubleVec1d state;
307     for (long nstate = 0; nstate < m_nbins; nstate++)
308     {
309         state.push_back(m_DLs[posn][cat][nstate]);
310     }
311     return state;
312 }
313 
314 
315 //------------------------------------------------------------------------------------
316 
SetStateTo(long posn,long cat,DoubleVec1d state)317 void DLCell::SetStateTo (long posn, long cat, DoubleVec1d state)
318 {
319     for (long nstate = 0; nstate < m_nbins; nstate++)
320     {
321         m_DLs[posn][cat][nstate] = state[nstate];
322     }
323 }
324 
325 //------------------------------------------------------------------------------------
326 
AddTo(const Cell_ptr othercell)327 void DLCell::AddTo(const Cell_ptr othercell)
328 {
329     for (long pos = 0; pos < m_nmarkers; pos++)
330     {
331         double** otherDLs = othercell->GetSiteDLs(pos);
332         double** myDLs = GetSiteDLs(pos);
333         long cat, bin;
334         for(cat = 0; cat < m_ncats; ++cat)
335         {
336             for (bin = 0; bin < m_nbins; ++bin)
337             {
338                 myDLs[cat][bin] += otherDLs[cat][bin];
339             }
340         }
341     }
342 }
343 
344 //------------------------------------------------------------------------------------
345 
SubtractFrom(const Cell_ptr othercell)346 void DLCell::SubtractFrom(const Cell_ptr othercell)
347 {
348     for (long pos = 0; pos < m_nmarkers; pos++)
349     {
350         double** otherDLs = othercell->GetSiteDLs(pos);
351         double** myDLs = GetSiteDLs(pos);
352         long cat, bin;
353         for(cat = 0; cat < m_ncats; ++cat)
354         {
355             for (bin = 0; bin < m_nbins; ++bin)
356             {
357                 myDLs[cat][bin] -= otherDLs[cat][bin];
358             }
359         }
360     }
361 }
362 
363 //------------------------------------------------------------------------------------
364 
MultiplyBy(double mult)365 void DLCell::MultiplyBy(double mult)
366 {
367     for (long pos = 0; pos < m_nmarkers; pos++)
368     {
369         double** myDLs = GetSiteDLs(pos);
370         long cat, bin;
371         for(cat = 0; cat < m_ncats; ++cat)
372         {
373             for (bin = 0; bin < m_nbins; ++bin)
374             {
375                 myDLs[cat][bin] *= mult;
376             }
377         }
378     }
379 }
380 
381 //------------------------------------------------------------------------------------
382 
MultiplyBy(const Cell_ptr othercell)383 void DLCell::MultiplyBy(const Cell_ptr othercell)
384 {
385     for (long pos = 0; pos < m_nmarkers; pos++)
386     {
387         double** otherDLs = othercell->GetSiteDLs(pos);
388         double** myDLs = GetSiteDLs(pos);
389         long cat, bin;
390         for(cat = 0; cat < m_ncats; ++cat)
391         {
392             for (bin = 0; bin < m_nbins; ++bin)
393             {
394                 myDLs[cat][bin] *= otherDLs[cat][bin];
395             }
396         }
397     }
398 }
399 
400 //------------------------------------------------------------------------------------
401 
GetOnes(long marker) const402 LongVec1d DLCell::GetOnes(long marker) const
403 {
404     LongVec1d ones;
405     double** dls = GetSiteDLs(marker);
406     //Just mess with the first category
407     for (long bin = 0; bin < m_nbins; ++bin)
408     {
409         if (dls[0][bin] == 1.0)
410         {
411             ones.push_back(bin);
412         }
413     }
414     return ones;
415 }
416 
417 //------------------------------------------------------------------------------------
418 // Debugging function.
419 
DLsToString(long start,long end) const420 string DLCell::DLsToString(long start, long end) const
421 {
422     string lines;
423 
424     long line;
425     for(line = start; line <= end; ++line)
426     {
427         lines += "marker " + ToString(line) + ": ";
428         double** dls = GetSiteDLs(line);
429         long cat;
430         for(cat = 0; cat < m_ncats; ++cat)
431         {
432             lines += "{";
433             long bin;
434             for(bin = 0; bin < m_nbins; ++bin)
435             {
436                 lines += ToString(dls[cat][bin]);
437                 if (bin != m_nbins - 1) lines += ",";
438             }
439             lines += "}";
440             if (cat != m_ncats - 1) lines += ", ";
441         }
442         lines += "\n";
443     }
444 
445     return lines;
446 
447 } // DLCell::DLsToString
448 
449 //------------------------------------------------------------------------------------
450 
Normalize(double ** siteDLs)451 double DLCell::Normalize(double** siteDLs)
452 {
453     double biggest = NEG_MAX;
454     long cat, bin;
455     for(cat = 0; cat < m_ncats; ++cat)
456     {
457         for(bin = 0; bin < m_nbins; ++bin)
458         {
459             if (siteDLs[cat][bin] > biggest) biggest = siteDLs[cat][bin];
460         }
461     }
462 
463     for(cat = 0; cat < m_ncats; ++cat)
464     {
465         for(bin = 0; bin < m_nbins; ++bin)
466         {
467             siteDLs[cat][bin] /= biggest;
468         }
469     }
470 
471     return log(biggest);
472 
473 } // Normalize
474 
475 //------------------------------------------------------------------------------------
476 
SetSiteDLs(long posn,double ** siteDLs)477 void DLCell::SetSiteDLs(long posn, double **siteDLs)
478 {
479     assert(IsValidPosition(posn));
480     long siteSize = m_ncats * m_nbins;
481     memcpy(m_DLs[posn][0], siteDLs[0], siteSize * sizeof(double));
482 }
483 
484 //------------------------------------------------------------------------------------
485 
AddToSiteDLs(long posn,double ** siteDLs)486 void DLCell::AddToSiteDLs(long posn, double **siteDLs)
487 {
488     assert(IsValidPosition(posn));
489     long cat, bin;
490     for(cat = 0; cat < m_ncats; ++cat)
491     {
492         for (bin = 0; bin < m_nbins; ++bin)
493         {
494             m_DLs[posn][cat][bin] += siteDLs[cat][bin];
495         }
496     }
497 }
498 
499 //------------------------------------------------------------------------------------
500 // NullCell
501 //------------------------------------------------------------------------------------
502 
NullCell()503 NullCell::NullCell()
504     : Cell()
505 {
506     // deliberately blank
507 } // NullCell constructor
508 
509 //------------------------------------------------------------------------------------
510 
Initialize(const StringVec1d &,const DataModel_ptr)511 void NullCell::Initialize (const StringVec1d&, const DataModel_ptr)
512 {
513     assert (false); // should never call this!
514 
515 } // Initialize
516 
GetStateFor(long posn,long cat) const517 DoubleVec1d NullCell::GetStateFor(long posn, long cat) const
518 {
519     throw implementation_error("No state for this data likelihood.");
520 }
521 
522 //------------------------------------------------------------------------------------
523 // NucCell
524 //------------------------------------------------------------------------------------
525 
NucCell(long markers,long cats)526 NucCell::NucCell(long markers, long cats)
527     : DLCell(markers, cats, BASES)
528 {
529     // deliberately left blank
530 } // NucCell constructor
531 
532 //------------------------------------------------------------------------------------
533 
Initialize(const StringVec1d & sequence,const DataModel_ptr trans)534 void NucCell::Initialize(const StringVec1d &sequence, const DataModel_ptr trans)
535 {
536     long posn, base, cat;
537 
538     string postring;
539     vector<double> likes;
540 
541     // could be OPTIMIZED
542 
543     for (posn = 0; posn < m_nmarkers; ++posn)
544     {
545         postring = sequence[posn];
546         likes = trans->DataToLikes(postring);
547         for (cat = 0; cat < m_ncats; ++cat)
548         {
549             for (base = 0; base < m_nbins; ++base)
550             {
551                 m_DLs[posn][cat][base] = likes[base];
552             }
553         }
554     }
555 } // Initialize
556 
557 //------------------------------------------------------------------------------------
558 // DNACell
559 //------------------------------------------------------------------------------------
560 
DNACell(long markers,long cats)561 DNACell::DNACell(long markers, long cats)
562     : NucCell(markers, cats)
563 {
564     // intentionally blank
565 } // DNACell constructor
566 
567 //------------------------------------------------------------------------------------
568 
Clone() const569 Cell *DNACell::Clone() const
570 {
571     Cell *pDLCell = new DNACell(m_nmarkers, m_ncats);
572     return pDLCell;
573 }
574 
575 //------------------------------------------------------------------------------------
576 // SNPCell
577 //------------------------------------------------------------------------------------
578 
SNPCell(long markers,long cats)579 SNPCell::SNPCell(long markers, long cats)
580     : NucCell(markers, cats)
581 {
582     // deliberately blank
583 } // SNPCell constructor
584 
585 //------------------------------------------------------------------------------------
586 
~SNPCell()587 SNPCell::~SNPCell()
588 {
589     // deliberately blank
590 }
591 
592 //------------------------------------------------------------------------------------
593 
Clone() const594 Cell *SNPCell::Clone() const
595 {
596     Cell *pDLCell  = new SNPCell(m_nmarkers, m_ncats);
597     return pDLCell;
598 }
599 
600 //------------------------------------------------------------------------------------
601 
Initialize(const StringVec1d &,const DataModel_ptr trans)602 void SNPCell::Initialize(const StringVec1d&, const DataModel_ptr trans)
603 {
604     // We do not use the sequence input -- SNPCell::Initialize should
605     // only be used to initialize invariant cells. We keep the first
606     // argument for conformity with the base class interface.
607 
608     long categ, invar, base;
609 
610     // In the m_DLs array, the basic idea is to set all entries to 0.0
611     // except for the entries where the invariant equals the base,
612     // where they are set to 1.0.
613     // HOWEVER,
614     // since we've now got a data uncertainty model, each of the
615     // invariant bases needs to go through the error correction
616     // transformation first.
617 
618     for (invar = 0; invar < INVARIANTS; ++invar)
619     {
620         vector<double> likes = trans->DataToLikes(SINGLEBASES[invar]);
621         for (categ = 0; categ < m_ncats; ++categ)
622         {
623             for (base = 0; base < m_nbins; ++base)
624             {
625                 m_DLs[invar][categ][base] = likes[base];
626             }
627         }
628     }
629 }
630 
631 //------------------------------------------------------------------------------------
632 // AlleleCell
633 //------------------------------------------------------------------------------------
634 
AlleleCell(long markers,long cats,long bins)635 AlleleCell::AlleleCell(long markers, long cats, long bins)
636     : DLCell(markers, cats, bins)
637 {
638     // deliberately blank
639 } // AlleleCell constructor
640 
641 //------------------------------------------------------------------------------------
642 
Clone() const643 Cell *AlleleCell::Clone() const
644 {
645     Cell *pDLCell  = new AlleleCell(m_nmarkers, m_ncats, m_nbins);
646     return pDLCell;
647 } // Clone
648 
649 //------------------------------------------------------------------------------------
650 
Initialize(const StringVec1d & sequence,const DataModel_ptr trans)651 void AlleleCell::Initialize(const StringVec1d &sequence, const DataModel_ptr trans)
652 {
653     long posn, bin, cat;
654     vector<double> likes;
655 
656     // could be OPTIMIZED
657 
658     for (posn = 0; posn < m_nmarkers; ++posn)
659     {
660         likes = trans->DataToLikes(sequence[posn], posn);
661         for (cat = 0; cat < m_ncats; ++cat)
662         {
663             for (bin = 0; bin < m_nbins; ++bin)
664             {
665                 m_DLs[posn][cat][bin] = likes[bin];
666             }
667         }
668     }
669 } // Initialize
670 
671 //------------------------------------------------------------------------------------
672 //------------------------------------------------------------------------------------
673 
triplet()674 triplet::triplet()
675     : first(0),
676       second(0),
677       third(0)
678 {
679     // intentionally blank
680 } // triplet default constructor
681 
682 //------------------------------------------------------------------------------------
683 
triplet(long f,long s,long t)684 triplet::triplet(long f, long s, long t)
685     : first(f),
686       second(s),
687       third(t)
688 {
689     // intentionally blank
690 } // triplet constructor
691 
692 //------------------------------------------------------------------------------------
693 
operator <(const triplet & rhs) const694 bool triplet::operator<(const triplet& rhs) const
695 {
696     if (first != rhs.first) return (first < rhs.first);
697     if (second != rhs.second) return (second < rhs.second);
698     return (third < rhs.third);
699 } // triplet operator<
700 
701 //____________________________________________________________________________________
702