1 package structures; 2 3 import java.util.Arrays; 4 5 import dna.AminoAcid; 6 import shared.KillSwitch; 7 import shared.Tools; 8 import stream.Read; 9 10 /** 11 * Tracks the number of homopolymers observed of given lengths. 12 * Only the longest homopolymer for a given base is counted per sequence. 13 * @author Brian Bushnell 14 * @date August 27, 2018 15 * 16 */ 17 public class PolymerTracker { 18 19 /*--------------------------------------------------------------*/ 20 /*---------------- Initialization ----------------*/ 21 /*--------------------------------------------------------------*/ 22 PolymerTracker()23 public PolymerTracker(){ 24 reset(); 25 } 26 reset()27 public void reset(){ 28 Arrays.fill(maxACGTN, 0); 29 for(int i=0; i<5; i++){ 30 countsACGTN[i]=new LongList(); 31 } 32 } 33 34 /*--------------------------------------------------------------*/ 35 /*---------------- Public Add Methods ----------------*/ 36 /*--------------------------------------------------------------*/ 37 addPair(Read r)38 public void addPair(Read r){ 39 if(r==null){return;} 40 add(r.bases); 41 add(r.mate); 42 } 43 add(Read r)44 public void add(Read r){ 45 if(r==null){return;} 46 add(r.bases); 47 } 48 add(PolymerTracker pt)49 public void add(PolymerTracker pt){ 50 for(int i=0; i<5; i++){ 51 LongList list=countsACGTN[i]; 52 LongList ptList=pt.countsACGTN[i]; 53 for(int len=0; len<ptList.size; len++){ 54 long count=ptList.get(len); 55 list.increment(len, count); 56 } 57 } 58 } 59 add(byte[] bases)60 public void add(byte[] bases){ 61 if(bases==null || bases.length<1){return;} 62 if(PER_SEQUENCE){ 63 addPerSequence(bases); 64 }else{ 65 addPerPolymer(bases); 66 } 67 } 68 69 /*--------------------------------------------------------------*/ 70 /*---------------- Methods ----------------*/ 71 /*--------------------------------------------------------------*/ 72 accumulate()73 public LongList[] accumulate(){ 74 if(cumulativeACGTN!=null){return cumulativeACGTN;} 75 LongList[] sums=new LongList[5]; 76 for(int i=0; i<5; i++){//Make reverse-cumulative version 77 LongList list=countsACGTN[i]; 78 LongList sumList=new LongList(list.size); 79 for(int len=list.size-1; len>=0; len--){ 80 sumList.set(len, sumList.get(len+1)+list.get(len)); 81 } 82 sums[i]=sumList; 83 } 84 cumulativeACGTN=sums; 85 return sums; 86 } 87 88 //Non-cumulative toHistogram()89 public String toHistogram(){ 90 StringBuilder sb=new StringBuilder(); 91 sb.append("#Length\tA\tC\tG\tT\tN\n"); 92 93 final int maxIndex=longest(); 94 for(int len=0; len<maxIndex; len++){ 95 sb.append(len); 96 for(int i=0; i<5; i++){ 97 long count=countsACGTN[i].get(len); 98 sb.append('\t').append(count); 99 } 100 sb.append('\n'); 101 } 102 return sb.toString(); 103 } 104 105 //Cumulative toHistogramCumulative()106 public String toHistogramCumulative(){ 107 LongList[] sums=accumulate(); 108 109 StringBuilder sb=new StringBuilder(); 110 sb.append("#Length\tA\tC\tG\tT\tN\n"); 111 112 final int maxIndex=longest(); 113 for(int len=0; len<maxIndex; len++){ 114 sb.append(len); 115 for(int i=0; i<5; i++){ 116 long count=sums[i].get(len); 117 sb.append('\t').append(count); 118 } 119 sb.append('\n'); 120 } 121 return sb.toString(); 122 } 123 calcRatio(byte base1, byte base2, int length)124 public double calcRatio(byte base1, byte base2, int length){ 125 long count1=getCount(base1, length); 126 long count2=getCount(base2, length); 127 return count1/Tools.max(1.0, count2); 128 } 129 getCount(byte base, int length)130 public long getCount(byte base, int length) { 131 int x=AminoAcid.baseToNumberACGTN[base]; 132 return countsACGTN[x].get(length); 133 } 134 calcRatioCumulative(byte base1, byte base2, int length)135 public double calcRatioCumulative(byte base1, byte base2, int length){ 136 long count1=getCountCumulative(base1, length); 137 long count2=getCountCumulative(base2, length); 138 return count1/Tools.max(1.0, count2); 139 } 140 getCountCumulative(byte base, int length)141 public long getCountCumulative(byte base, int length) { 142 int x=AminoAcid.baseToNumberACGTN[base]; 143 return accumulate()[x].get(length); 144 } 145 146 /*--------------------------------------------------------------*/ 147 /*---------------- Inner Methods ----------------*/ 148 /*--------------------------------------------------------------*/ 149 150 /** Increment once per sequence, 151 * for the longest homopolymer of each base */ addPerSequence(byte[] bases)152 private void addPerSequence(byte[] bases){ 153 Arrays.fill(maxACGTN, 0); 154 byte prev=-1; 155 int current=0; 156 for(byte b : bases){ 157 if(b==prev){ 158 current++; 159 }else{ 160 recordMax(prev, current); 161 prev=b; 162 current=1; 163 } 164 } 165 recordMax(prev, current); 166 167 for(int i=0; i<maxACGTN.length; i++){ 168 countsACGTN[i].increment(maxACGTN[i], 1); 169 } 170 } 171 172 /** Increment once per homopolymer */ addPerPolymer(byte[] bases)173 private void addPerPolymer(byte[] bases){ 174 byte prev=-1; 175 int current=0; 176 for(byte b : bases){ 177 if(b==prev){ 178 current++; 179 }else{ 180 recordCounts(prev, current); 181 prev=b; 182 current=1; 183 } 184 } 185 recordCounts(prev, current); 186 } 187 recordMax(byte base, int len)188 private void recordMax(byte base, int len){ 189 if(base<0){return;} 190 int x=AminoAcid.baseToNumberACGTN[base]; 191 if(x<0){return;} 192 maxACGTN[x]=Tools.max(maxACGTN[x], len); 193 } 194 recordCounts(byte base, int len)195 private void recordCounts(byte base, int len){ 196 if(base<0){return;} 197 int x=AminoAcid.baseToNumberACGTN[base]; 198 if(x<0){return;} 199 countsACGTN[x].increment(len, 1); 200 } 201 longest()202 private int longest(){ 203 int max=0; 204 for(LongList list : countsACGTN){ 205 max=Tools.max(list.size(), max); 206 } 207 return max; 208 } 209 210 /*--------------------------------------------------------------*/ 211 /*---------------- Fields ----------------*/ 212 /*--------------------------------------------------------------*/ 213 214 private final int[] maxACGTN=KillSwitch.allocInt1D(5); 215 final LongList[] countsACGTN=new LongList[5]; 216 private LongList[] cumulativeACGTN; 217 218 /*--------------------------------------------------------------*/ 219 /*---------------- Static Fields ----------------*/ 220 /*--------------------------------------------------------------*/ 221 222 public static boolean PER_SEQUENCE=true; 223 public static boolean CUMULATIVE=true; 224 225 } 226