1 package stream;
2 
3 import java.io.Serializable;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Locale;
7 
8 import dna.AminoAcid;
9 import dna.ChromosomeArray;
10 import dna.Data;
11 import dna.Gene;
12 import dna.ScafLoc;
13 import shared.KillSwitch;
14 import shared.Parse;
15 import shared.Shared;
16 import shared.Tools;
17 import structures.ByteBuilder;
18 import var2.ScafMap;
19 import var2.Scaffold;
20 
21 
22 public class SamLine implements Serializable {
23 
24 //	426_647_582	161	chr1	10159	0	26M9H	chr3	170711991	0	TCCCTAACCCTAACCCTAACCTAACC	IIFIIIIIIIIIIIIIIIIIICH2<>	RG:Z:20110708003021394	NH:i:3	CM:i:2	SM:i:1	CQ:Z:A9?(BB?:<A?>=>B67=:7A);.%8'%))/%*%'	CS:Z:G12002301002301002301023010200000003	XS:A:+
25 
26 //	1 QNAME String [!-?A-~]f1,255g Query template NAME
27 //	2 FLAG Int [0,216-1] bitwise FLAG
28 //	3 RNAME String \*|[!-()+-<>-~][!-~]* Reference sequence NAME
29 //	4 POS Int [0,229-1] 1-based leftmost mapping POSition
30 //	5 MAPQ Int [0,28-1] MAPping Quality
31 //	6 CIGAR String \*|([0-9]+[MIDNSHPX=])+ CIGAR string
32 //	7 RNEXT String \*|=|[!-()+-<>-~][!-~]* Ref. name of the mate/next fragment
33 //	8 PNEXT Int [0,229-1] Position of the mate/next fragment
34 //	9 TLEN Int [-229+1,229-1] observed Template LENgth
35 //	10 SEQ String \*|[A-Za-z=.]+ fragment SEQuence
36 //	11 QUAL String [!-~]+ ASCII of Phred-scaled base QUALity+33
37 
38 
39 //	FCB062MABXX:1:1101:1177:2115#GGCTACAA	147	chr11	47765857	29	90M	=	47765579	-368	CCTCTGTGGCCCGGGTTGGAGTGCAGTGTCATGATCATGGCTCGCTGTAGCTACACCCTTCTGAGCTCAAGCAATCCTCCCACCTCTCCC	############################################################A@@><D<AAAB<=A2BD/BC<7:<4<%679	XT:A:M	NM:i:5	SM:i:29	AM:i:29	XM:i:5	XO:i:0	XG:i:0	MD:Z:7T4A15G26A30A3
40 //	FCB062MABXX:1:1101:1193:2122#GGCTACAA	77	*	    0	         0	*	*	0	           0	TATATATGTGCTATGTACAGCATTGGAATTCACACCCTACACTTTCAAAAGNGAGCCCTAAATAAATGTTAGATCGGAAGAGCACACGTC	FCFCFDDDADDEDEBDAEDFEDEFFGGFGGHEEFHHHHHHEDDDEDFFEFB#CBBA@B8BGGFGEEEC>DGGGDFBGGGGHHHHH9<@##
41 
42 
43 	/*--------------------------------------------------------------*/
44 	/*----------------        Initialization        ----------------*/
45 	/*--------------------------------------------------------------*/
46 
47 	/**
48 	 *
49 	 */
50 	private static final long serialVersionUID = -4180486051387471116L;
51 
SamLine(String s)52 	public SamLine(String s){
53 		this(s.split("\t"));
54 	}
55 
SamLine(SamLine sl)56 	public SamLine(SamLine sl){
57 		setFrom(sl);
58 	}
59 
60 	/** Prevents references to original string, in case of e.g. very long MD tags. */
toSamLine(String s)61 	public SamLine toSamLine(String s){
62 		String[] split=s.split("\t");
63 		split[0]=new String(split[0]);
64 		split[5]=new String(split[5]);
65 		split[9]=new String(split[9]);
66 		split[10]=new String(split[10]);
67 		for(int i=11; i<split.length; i++){
68 			split[i]=new String(split[i]);
69 		}
70 		return new SamLine(split);
71 	}
72 
setFrom(SamLine sl)73 	private void setFrom(SamLine sl){
74 		qname=sl.qname;
75 		flag=sl.flag;
76 		rname=sl.rname;
77 		rnameS=sl.rnameS;
78 		pos=sl.pos;
79 		mapq=sl.mapq;
80 		cigar=sl.cigar;
81 		rnext=sl.rnext;
82 		pnext=sl.pnext;
83 		tlen=sl.tlen;
84 		seq=sl.seq;
85 		qual=sl.qual;
86 		optional=sl.optional;
87 	}
88 
89 
SamLine(Read r1, int fragNum)90 	public SamLine(Read r1, int fragNum){
91 
92 		if(verbose){
93 			System.err.println("new SamLine for read with match "+(r1.match==null ? "null" : new String(r1.match)));
94 		}
95 
96 		Read r2=r1.mate;
97 		final boolean perfect=r1.perfect();
98 
99 		if(Data.scaffoldLocs==null && r1.samline!=null){
100 			assert(SET_FROM_OK) : "Sam format cannot be used as input to this program when no genome build is loaded.\n" +
101 					"Please index the reference first and rerun with e.g. 'build=1', or use a different input format.";
102 			setFrom(r1.samline);
103 			return;
104 		}
105 
106 //		qname=r.id.replace(' ', '_').replace('\t', '_');
107 //		qname=r.id.split("\\s+")[0];
108 		qname=r1.id.replace('\t', '_');
109 //		if(!KEEP_NAMES && qname.length()>2 && r2!=null){
110 //			if(qname.endsWith("/1") || qname.endsWith("/2") || qname.endsWith(" 1") || qname.endsWith(" 2")){}
111 //		}
112 
113 		if(!KEEP_NAMES && qname.length()>2 && r2!=null){
114 			char c=qname.charAt(qname.length()-2);
115 			int num=(qname.charAt(qname.length()-1))-'1';
116 			if((num==0 || num==1) && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);}
117 //			if(r.pairnum()==num && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);}
118 		}
119 //		flag=Integer.parseInt(s[1]);
120 
121 		int idx1=-1, idx2=-1;
122 		int chrom1=-1, chrom2=-1;
123 		int start1=-1, start2=-1, a1=0, a2=0;
124 		int stop1=-1, stop2=-1, b1=0, b2=0;
125 		int scaflen=0, scafloc=0, scaflen2=0;
126 		byte[] name1=bytestar, name2=bytestar;
127 		if(r1.mapped()){
128 			assert(r1.chrom>=0);
129 			chrom1=r1.chrom;
130 			start1=r1.start;
131 			stop1=r1.stop;
132 			if(Data.isSingleScaffold(chrom1, start1, stop1)){
133 				assert(Data.scaffoldLocs!=null) : "\n\n"+r1+"\n\n"+r1.obj+"\n\n";
134 				idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2);
135 				name1=Data.scaffoldNames[chrom1][idx1];
136 				scaflen=Data.scaffoldLengths[chrom1][idx1];
137 				scafloc=Data.scaffoldLocs[chrom1][idx1];
138 				a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1);
139 				b1=a1-start1+stop1;
140 			}else{
141 				if(verbose){System.err.println("------------- Found multi-scaffold alignment! -------------");}
142 				r1.setMapped(false);
143 				r1.setPaired(false);
144 				r1.match=null;
145 				if(r2!=null){r2.setPaired(false);}
146 			}
147 		}
148 		if(r2!=null && r2.mapped()){
149 			chrom2=r2.chrom;
150 			start2=r2.start;
151 			stop2=r2.stop;
152 			if(Data.isSingleScaffold(chrom2, start2, stop2)){
153 				idx2=Data.scaffoldIndex(chrom2, (start2+stop2)/2);
154 				name2=Data.scaffoldNames[chrom2][idx2];
155 				scaflen2=Data.scaffoldLengths[chrom2][idx2];
156 				a2=Data.scaffoldRelativeLoc(chrom2, start2, idx2);
157 				b2=a2-start2+stop2;
158 			}else{
159 				if(verbose){System.err.println("------------- Found multi-scaffold alignment for r2! -------------");}
160 				r2.setMapped(false);
161 				r2.setPaired(false);
162 				r2.match=null;
163 				if(r1!=null){r1.setPaired(false);}
164 			}
165 		}
166 
167 		final boolean sameScaf=(r2!=null && idx1>-1 && idx1==idx2 && r1.chrom==r2.chrom);
168 		flag=makeFlag(r1, r2, fragNum, sameScaf);
169 
170 		rname=r1.mapped() ? name1 : ((r2!=null && r2.mapped()) ? name2 : null);
171 
172 		{
173 			int pos0, pos0_mate; //start pos
174 			int pos1, pos1_mate; //stop pos
175 
176 			if(r1.mapped()){
177 //				int leadingClip=countLeadingClip(cigar);
178 				int clip=countLeadingClip(r1.match);
179 				int clippedIndels=countLeadingIndels(a1, r1.match);
180 				int tclip=countTrailingClip(r1.match);
181 				int tclippedIndels=countTrailingIndels(b1, scaflen, r1.match);
182 
183 				if(verbose){
184 					System.err.println("leadingClip="+clip);
185 					System.err.println("clippedDels="+clippedIndels);
186 				}
187 				pos0=(a1+1)+clip+clippedIndels;
188 				pos1=(b1+1)-tclip-tclippedIndels;
189 				if(pos1>scaflen){pos1=scaflen;}
190 
191 				if(pos0<1){
192 					//This is necessary to prevent mapped reads from having POS less than 1.
193 					pos0=1;
194 				}
195 				assert(pos1>=pos0) : pos0+", "+pos1+"\n"+r1+"\n"+r2+"\n";
196 
197 			}else{
198 				pos0=0;
199 				pos1=0;
200 			}
201 
202 			if(r2!=null && r2.mapped()){
203 				int clip=countLeadingClip(r2.match);
204 				int clippedIndels=countLeadingIndels(a2, r2.match);
205 				int tclip=countTrailingClip(r2.match);
206 				int tclippedIndels=countTrailingIndels(b2, scaflen, r2.match);
207 				if(verbose){
208 					System.err.println("leadingClip="+clip);
209 					System.err.println("clippedDels="+clippedIndels);
210 				}
211 				pos0_mate=(a2+1)+clip+clippedIndels;
212 				pos1_mate=(b2+1)-tclip-tclippedIndels;
213 				if(pos1_mate>scaflen){pos1=scaflen;}
214 
215 				if(pos0_mate<1){
216 					//This is necessary to prevent mapped reads from having POS less than 1.
217 					pos0_mate=1;
218 				}
219 				assert(!sameScaf || pos1_mate>=pos0_mate) : pos0_mate+", "+pos1_mate+", "+scaflen+"\n"+r1+"\n"+r2+"\n";
220 
221 			}else{
222 				pos0_mate=0;
223 				pos1_mate=0;
224 			}
225 
226 			if(r2==null){
227 				pos=pos0;
228 				pnext=pos0_mate;
229 				tlen=0;
230 				assert(((pos>0 && r1.mapped()) || (pos==0 && !r1.mapped())) && pnext==0);
231 			}else{
232 				if(r1.mapped() && r2.mapped()){
233 					pos=pos0;
234 					pnext=pos0_mate;
235 					if(sameScaf){
236 //						tlen=1+(Data.max(r.stop, r2.stop)-Data.min(r.start, r2.start));
237 						tlen=1+(Data.max(pos1, pos1_mate)-Data.min(pos0, pos0_mate));
238 					}else{
239 						tlen=0;
240 					}
241 					assert(pos>0) : pos+"\n"+r1+"\n"+r2;
242 					assert(pnext>0) : pnext+"\n"+r1+"\n"+r2;
243 				}else if(r1.mapped() && !r2.mapped()){
244 					pos=pos0;
245 					pnext=pos0;
246 					tlen=0;
247 					assert(pos>0 && pnext>0);
248 				}else if(!r1.mapped() && r2.mapped()){
249 					pos=pos0_mate;
250 					pnext=pos0_mate;
251 					tlen=0;
252 					assert(pos>0 && pnext>0);
253 				}else if(!r1.mapped() && !r2.mapped()){
254 					pos=pos0;
255 					pnext=pos0_mate;
256 					tlen=0;
257 					assert(pos==0 && pnext==0);
258 				}else{assert(false);}
259 			}
260 
261 			assert(pos>=0) : "Negative coordinate "+pos+" for read:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+
262 				", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false)+", clipM="+countLeadingClip(r1.match);
263 			assert(pnext>=0) : "Negative coordinate "+pnext+" for mate:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+
264 				", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false);
265 		}
266 
267 		mapq=toMapq(r1, null);
268 
269 		if(verbose){
270 			System.err.println("Making cigar for "+(r1.match==null ? "null" : new String(r1.match)));
271 		}
272 
273 		final boolean inbounds=!r1.mapped() ? false : (a1>=0 && b1<scaflen);
274 		final boolean inbounds2=(r2==null ? true : !r2.mapped() ? false : (a2>=0 && b2<scaflen2));
275 		if(r1.bases!=null && r1.mapped() && r1.match!=null){
276 			if(VERSION>1.3f){
277 				if(inbounds && perfect && !r1.containsNonM()){//r.containsNonM() should be unnecessary...  it's there in case of clipping...
278 					cigar=(r1.length()+"=");
279 //					System.err.println("SETTING cigar14="+cigar);
280 //
281 //					byte[] match=r.match;
282 //					if(r.shortmatch()){match=Read.toLongMatchString(match);}
283 //					cigar=toCigar13(match, a1, b1, scaflen, r.bases);
284 //					System.err.println("RESETTING cigar14="+cigar+" from toCigar14("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
285 				}else{
286 					byte[] match=r1.match;
287 					if(r1.shortmatch()){match=Read.toLongMatchString(match);}
288 					cigar=toCigar14(match, a1, b1, scaflen, r1.bases);
289 //					System.err.println("CALLING toCigar14("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
290 				}
291 			}else{
292 				if(inbounds && (perfect || !r1.containsNonNMS())){
293 					cigar=(r1.length()+"M");
294 //					System.err.println("SETTING cigar13="+cigar);
295 //
296 //					byte[] match=r.match;
297 //					if(r.shortmatch()){match=Read.toLongMatchString(match);}
298 //					cigar=toCigar13(match, a1, b1, scaflen, r.bases);
299 //					System.err.println("RESETTING cigar13="+cigar+" from toCigar13("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
300 				}else{
301 					byte[] match=r1.match;
302 					if(r1.shortmatch()){match=Read.toLongMatchString(match);}
303 					cigar=toCigar13(match, a1, b1, scaflen, r1.bases);
304 //					System.err.println("CALLING toCigar13("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")");
305 				}
306 			}
307 		}
308 
309 		if(verbose){
310 			System.err.println("cigar="+cigar);
311 		}
312 
313 //		assert(false);
314 
315 //		assert(primary() || cigar.equals(stringstar)) : cigar;
316 //		if(pos<0){pos=0;cigar=null;rname=bytestar;mapq=0;flag|=0x4;}
317 
318 //		assert(false) : "\npos="+pos+"\ncigar='"+cigar+"'\nVERSION="+VERSION+"\na1="+a1+", b1="+b1+"\n\n"+r.toString();
319 
320 //		rnext=(r2==null ? stringstar : (r.mapped() && !r2.mapped()) ? "chr"+Gene.chromCodes[r.chrom] : "chr"+Gene.chromCodes[r2.chrom]);
321 		rnext=((r2==null || (!r1.mapped() && !r2.mapped())) ? bytestar : (r1.mapped() && r2.mapped()) ? (sameScaf ? byteequals : name2) : byteequals);
322 
323 		assert(rnext!=byteequals || name1==name2 || name1==bytestar || name2==bytestar) :
324 			new String(rname)+", "+new String(rnext)+", "+new String(name1)+", "+new String(name2)+"\n"+r1+"\n"+r2;
325 
326 //		assert(r1.pairnum()==0) : r1.mapped()+", "+r2.mapped()+"fragNum="+fragNum+
327 //			"\nname1="+new String(name1)+"\nname2="+new String(name2)+"\nrname="+new String(rname)+"\nrnext="+new String(rnext)+
328 //			"\nname1="+name1+"\nname2="+name2+"\nrname="+rname+"\nrnext="+rnext+"\nidx1="+idx1+"\nidx2="+idx2;
329 
330 		if(Data.scaffoldPrefixes){
331 			 if(rname!=null && rname!=bytestar){
332 				 int k=Tools.indexOf(rname, (byte)'$');
333 				 rname=KillSwitch.copyOfRange(rname, k+1, rname.length);
334 			 }
335 			 if(rnext!=null && rnext!=bytestar){
336 				 int k=Tools.indexOf(rnext, (byte)'$');
337 				 rnext=KillSwitch.copyOfRange(rnext, k+1, rnext.length);
338 			 }
339 		}
340 
341 //		if(r2==null || r.stop<=r2.start){
342 //			//plus sign
343 //		}else if(r2.stop<=r.start){
344 //			//minus sign
345 //			tlen=-tlen;
346 //		}else{
347 //			//They overlap... a lot.  Physically shorter than read length.
348 //			if(r.start<=r2.start){
349 //
350 //			}else{
351 //				tlen=-tlen;
352 //			}
353 //		}
354 		//This version is less technically correct (does not account for very short insert reads) but probably more in line with what is expected
355 		if(r2==null || r1.start<r2.start || (r1.start==r2.start && r1.pairnum()==0)){
356 			//plus sign
357 		}else{
358 			//minus sign
359 			tlen=-tlen;
360 		}
361 
362 //		if(r.secondary()){
363 ////			seq=qual=stringstar;
364 //			seq=qual=bytestar;
365 //		}else{
366 //			if(r.strand()==Gene.PLUS){
367 ////				seq=new String(r.bases);
368 //				seq=r.bases.clone();
369 //				if(r.quality==null){
370 ////					qual=stringstar;
371 //					qual=bytestar;
372 //				}else{
373 ////					StringBuilder q=new StringBuilder(r.quality.length);
374 ////					for(byte b : r.quality){
375 ////						q.append((char)(b+33));
376 ////					}
377 ////					qual=q.toString();
378 //					qual=new byte[r.quality.length];
379 //					for(int i=0, j=qual.length-1; i<qual.length; i++, j--){
380 //						qual[i]=(byte)(r.quality[j]+33);
381 //					}
382 //				}
383 //			}else{
384 ////				seq=new String(AminoAcid.reverseComplementBases(r.bases));
385 //				seq=AminoAcid.reverseComplementBases(r.bases);
386 //				if(r.quality==null){
387 ////					qual=stringstar;
388 //					qual=bytestar;
389 //				}else{
390 ////					StringBuilder q=new StringBuilder(r.quality.length);
391 ////					for(int i=r.quality.length-1; i>=0; i--){
392 ////						q.append((char)(r.quality[i]+33));
393 ////					}
394 ////					qual=q.toString();
395 //					qual=new byte[r.quality.length];
396 //					for(int i=0, j=qual.length-1; i<qual.length; i++, j--){
397 //						qual[i]=(byte)(r.quality[j]+33);
398 //					}
399 //				}
400 //			}
401 //		}
402 
403 		if(r1.secondary() && SECONDARY_ALIGNMENT_ASTERISKS){
404 //			seq=qual=bytestar;
405 			seq=qual=null;
406 		}else{
407 			seq=r1.bases;
408 			if(r1.quality==null){
409 //				qual=bytestar;
410 				qual=null;
411 			}else{
412 				qual=r1.quality;
413 			}
414 		}
415 
416 		trimNames();
417 		optional=makeOptionalTags(r1, r2, perfect, scafloc, scaflen, inbounds, inbounds2);
418 //		assert(r.pairnum()==1) : "\n"+r.toText(false)+"\n"+this+"\n"+r2;
419 	}
420 
SamLine(String[] s)421 	public SamLine(String[] s){
422 		assert(!s[0].startsWith("@")) : "Tried to make a SamLine from a header: "+s[0];
423 		assert(s.length>=11) : "\nNot all required fields are present: "+s.length+"\nline='"+Arrays.toString(s)+"'\n";
424 		if(s.length<11){
425 			System.err.println("Invalid SamLine: "+Arrays.toString(s));
426 			return;
427 		}
428 		qname=s[0];
429 		flag=Integer.parseInt(s[1]);
430 		if(RNAME_AS_BYTES){
431 			rname=s[2].getBytes();
432 		}else{
433 			rnameS=s[2];
434 		}
435 		pos=Integer.parseInt(s[3]);
436 //		try {
437 //			Integer.parseInt(s[4]);
438 //		} catch (NumberFormatException e) {
439 //			System.err.println(Arrays.toString(s));
440 //		}
441 		mapq=Tools.isDigit(s[4].charAt(0)) ? Integer.parseInt(s[4]) : 99; //Added for non-compliant mappers that put * here
442 		cigar=s[5];
443 		rnext=s[6].getBytes();
444 		pnext=(s[7].charAt(0)=='*' ? 0 : Integer.parseInt(s[7]));
445 		tlen=Tools.isDigit(s[8].charAt(0)) ? Integer.parseInt(s[8]) : 0; //Added for non-compliant mappers that put * here
446 //		seq=s[9];
447 //		qual=s[10];
448 		seq=(s[9].equals(stringstar) ? null : s[9].getBytes());
449 		qual=(s[10].equals(stringstar) ? null : s[10].getBytes());
450 
451 		if(mapped() && strand()==Shared.MINUS){
452 			if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);}
453 			if(qual!=bytestar){Tools.reverseInPlace(qual);}
454 		}
455 
456 		if(qual!=null && qual!=bytestar){
457 			for(int i=0; i<qual.length; i++){qual[i]-=33;}
458 		}
459 
460 		if(!PARSE_OPTIONAL || s.length<=11){return;}
461 
462 		if(PARSE_OPTIONAL_MD_ONLY){
463 			optional=new ArrayList<String>(1);
464 			for(int i=11; i<s.length; i++){
465 				if(s[i].startsWith("MD:")){
466 					optional.add(s[i]);
467 					break;
468 				}
469 			}
470 		}else{
471 			optional=new ArrayList<String>(s.length-11);
472 			for(int i=11; i<s.length; i++){
473 				optional.add(s[i]);
474 			}
475 		}
476 
477 		trimNames();
478 	}
479 
SamLine(byte[] s)480 	public SamLine(byte[] s){
481 		assert(s[0]!='@') : "Tried to make a SamLine from a header: "+new String(s);
482 
483 		int a=0, b=0;
484 
485 		while(b<s.length && s[b]!='\t'){b++;}
486 		assert(b>a) : "Missing field 0: "+new String(s);
487 		if(PARSE_0){qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));}
488 		b++;
489 		a=b;
490 
491 		while(b<s.length && s[b]!='\t'){b++;}
492 		assert(b>a) : "Missing field 1: "+new String(s);
493 		flag=Parse.parseInt(s, a, b);
494 		b++;
495 		a=b;
496 
497 		while(b<s.length && s[b]!='\t'){b++;}
498 		assert(b>a) : "Missing field 2: "+new String(s);
499 		if(PARSE_2){
500 			if(RNAME_AS_BYTES){
501 				rname=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));
502 			}else{
503 				rnameS=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));
504 			}
505 		}
506 		b++;
507 		a=b;
508 
509 		while(b<s.length && s[b]!='\t'){b++;}
510 		assert(b>a) : "Missing field 3: "+new String(s);
511 		pos=Parse.parseInt(s, a, b);
512 		b++;
513 		a=b;
514 
515 		while(b<s.length && s[b]!='\t'){b++;}
516 		assert(b>a) : "Missing field 4: "+new String(s);
517 		mapq=Parse.parseInt(s, a, b);
518 		b++;
519 		a=b;
520 
521 		while(b<s.length && s[b]!='\t'){b++;}
522 		assert(b>a) : "Missing field 5: "+new String(s);
523 		if(PARSE_5){cigar=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));}
524 		b++;
525 		a=b;
526 
527 		while(b<s.length && s[b]!='\t'){b++;}
528 		assert(b>a) : "Missing field 6: "+new String(s);
529 		if(PARSE_6){rnext=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));}
530 		b++;
531 		a=b;
532 
533 		while(b<s.length && s[b]!='\t'){b++;}
534 		assert(b>a) : "Missing field 7: "+new String(s);
535 		if(PARSE_7){pnext=(b==a+1 && s[a]=='*' ? 0 :Parse.parseInt(s, a, b));}
536 		b++;
537 		a=b;
538 
539 		while(b<s.length && s[b]!='\t'){b++;}
540 		assert(b>a) : "Missing field 8: "+new String(s);
541 		if(PARSE_8){tlen=Parse.parseInt(s, a, b);}
542 		b++;
543 		a=b;
544 
545 		while(b<s.length && s[b]!='\t'){b++;}
546 		assert(b>a) : "Missing field 9: "+new String(s);
547 //		seq=new String(s, a, b-a);
548 		seq=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));
549 		b++;
550 		a=b;
551 
552 		while(b<s.length && s[b]!='\t'){b++;}
553 		assert(b>a) : "Missing field 10: "+new String(s);
554 //		qual=new String(s, a, b-a);
555 		if(PARSE_10){qual=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));}
556 		b++;
557 		a=b;
558 
559 		assert((seq==bytestar)==(Tools.equals(seq, bytestar)));
560 		assert((qual==bytestar)==(Tools.equals(qual, bytestar)));
561 
562 		if(mapped() && strand()==Shared.MINUS && FLIP_ON_LOAD){
563 			if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);}
564 			if(qual!=bytestar){Tools.reverseInPlace(qual);}
565 		}
566 
567 		if(qual!=null && qual!=bytestar){
568 			for(int i=0; i<qual.length; i++){qual[i]-=33;}
569 		}
570 
571 		if(!PARSE_OPTIONAL || b>=s.length){return;}
572 
573 		if(PARSE_OPTIONAL_MD_ONLY){
574 			optional=new ArrayList<String>(1);
575 			while(b<s.length){
576 				while(b<s.length && s[b]!='\t'){b++;}
577 				if(b>a){
578 					if(b>=a+3 && s[a]=='M' && s[a+1]=='D' && s[a+2]==':'){
579 						String x=new String(s, a, b-a);
580 						optional.add(x);
581 						return;
582 					}
583 				}else{
584 					//Empty field
585 				}
586 				b++;
587 				a=b;
588 			}
589 		}else{
590 			optional=new ArrayList<String>(4);
591 			while(b<s.length){
592 				while(b<s.length && s[b]!='\t'){b++;}
593 				if(b>a){
594 					String x=new String(s, a, b-a);
595 					optional.add(x);
596 				}else{
597 					//Empty field
598 				}
599 				b++;
600 				a=b;
601 			}
602 		}
603 
604 		trimNames();
605 	}
606 
trimNames()607 	public void trimNames(){
608 //		System.err.println();
609 //		System.err.println("rname= "+new String(rname));
610 //		System.err.println("qname= "+new String(qname));
611 		if(Shared.TRIM_RNAME){
612 			if(RNAME_AS_BYTES){
613 				setRname(Tools.trimToWhitespace(rname()));
614 				setRnext(Tools.trimToWhitespace(rnext()));
615 			}else{
616 				setRname(Tools.trimToWhitespace(rnameS()));
617 				setRnext(Tools.trimToWhitespace(rnext()));
618 			}
619 		}
620 		if(Shared.TRIM_READ_COMMENTS){
621 			qname=(Tools.trimToWhitespace(qname));
622 		}
623 //		System.err.println("rname2="+new String(rname));
624 //		System.err.println("qname2="+new String(qname));
625 //		assert(false) : Shared.TRIM_RNAME+", "+Shared.TRIM_READ_COMMENTS+", "+new String(rname)+", "+qname;
626 	}
627 
parseFlagOnly(byte[] s)628 	public static final int parseFlagOnly(byte[] s){
629 		assert(s!=null && s.length>0) : "Blank line.";
630 		if(s[0]=='@'){return -1;}
631 
632 		int a=0, b=0;
633 
634 		while(b<s.length && s[b]!='\t'){b++;}
635 		assert(b>a) : "Missing field 0: "+new String(s);
636 		b++;
637 		a=b;
638 
639 		while(b<s.length && s[b]!='\t'){b++;}
640 		assert(b>a) : "Missing field 1: "+new String(s);
641 		int flag=Parse.parseInt(s, a, b);
642 		return flag;
643 	}
644 
parseNameOnly(byte[] s)645 	public static final String parseNameOnly(byte[] s){
646 		assert(s!=null && s.length>0) : "Blank line.";
647 		if(s[0]=='@'){return null;}
648 
649 		int a=0, b=0;
650 
651 		while(b<s.length && s[b]!='\t'){b++;}
652 		assert(b>a) : "Missing field 0: "+new String(s);
653 		String qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));
654 		return qname;
655 	}
656 
657 	/*--------------------------------------------------------------*/
658 	/*----------------             Cigar            ----------------*/
659 	/*--------------------------------------------------------------*/
660 
toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases)661 	public static String toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases){
662 		if(match==null || readStart==readStop){return null;}
663 		ByteBuilder sb=new ByteBuilder(8);
664 		int count=0;
665 		char mode='=';
666 		char lastMode='=';
667 
668 		int refloc=readStart;
669 
670 		int cigarlen=0; //for debugging
671 		int opcount=0; //for debugging
672 
673 		for(int mpos=0; mpos<match.length; mpos++){
674 
675 			byte m=match[mpos];
676 
677 			boolean sfdflag=false;
678 			if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){
679 				mode='S'; //soft-clip out-of-bounds
680 				if(m!='I'){refloc++;}
681 				if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions!
682 			}else if(m=='m' || m=='s' || m=='S' || m=='N' || m=='B'){//Little 's' is for a match classified as a sub to improve the affine score.
683 				mode='M';
684 				refloc++;
685 			}else if(m=='I' || m=='X' || m=='Y'){
686 				mode='I';
687 			}else if(m=='D'){
688 				mode='D';
689 				refloc++;
690 			}else if(m=='C'){
691 				mode='S';
692 				refloc++;
693 			}else{
694 				throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii).  " +
695 						"Match string should be in long format here.");
696 			}
697 
698 			if(mode!=lastMode){
699 				if(count>0){//Prevents an initial length-0 match
700 					sb.append(count);
701 //					sb.append(lastMode);
702 					if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');}
703 					else{sb.append(lastMode);}
704 					if(lastMode!='D'){cigarlen+=count;}
705 					opcount+=count;
706 				}
707 				count=0;
708 				lastMode=mode;
709 			}
710 
711 			count++;
712 			if(sfdflag){count--;}
713 		}
714 		sb.append(count);
715 		if(mode=='D' && count>INTRON_LIMIT){sb.append('N');}
716 		else{sb.append(mode);}
717 		if(mode!='D'){cigarlen+=count;}
718 		opcount+=count;
719 
720 		assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" +
721 				"cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n";
722 
723 		return sb.toString();
724 	}
725 
toCigar13(String cigar14)726 	public static String toCigar13(String cigar14) {
727 		if(cigar14==null){return null;}
728 		final int len=cigar14.length();
729 
730 		int current=0;
731 		int mcount=0;
732 		ByteBuilder sb=new ByteBuilder(len);
733 
734 		for(int i=0; i<len; i++){
735 			char b=cigar14.charAt(i);
736 			if(Tools.isDigit(b)){
737 				current=(10*current)+(b-'0');
738 			}else{
739 				if(b=='X' || b=='=' || b=='M'){
740 					mcount+=current;
741 				}else{
742 					if(mcount>0){
743 						sb.append(mcount);
744 						sb.append('M');
745 						mcount=0;
746 					}
747 					sb.append(current);
748 					sb.append(b);
749 				}
750 				current=0;
751 			}
752 		}
753 		assert(current==0);
754 		if(mcount>0){
755 			sb.append(mcount);
756 			sb.append('M');
757 			mcount=0;
758 		}
759 		return sb.toString();
760 	}
761 
762 
toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases)763 	public static String toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases){
764 //		assert(false) : readStart+", "+readStop+", "+reflen;
765 		if(match==null || readStart==readStop){return null;}
766 		ByteBuilder sb=new ByteBuilder(8);
767 		int count=0;
768 		char mode='=';
769 		char lastMode='=';
770 
771 		int refloc=readStart;
772 
773 		int cigarlen=0; //for debugging
774 		int opcount=0; //for debugging
775 
776 		for(int mpos=0; mpos<match.length; mpos++){
777 
778 			byte m=match[mpos];
779 
780 			boolean sfdflag=false;
781 			if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){
782 				mode='S'; //soft-clip out-of-bounds
783 				if(m!='I'){refloc++;}
784 				if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions!
785 			}else if(m=='m' || m=='s'){//Little 's' is for a match classified as a sub to improve the affine score.
786 				mode='=';
787 				refloc++;
788 			}else if(m=='S' || m=='V'){
789 				mode='X';
790 				refloc++;
791 			}else if(m=='I' || m=='X' || m=='Y'){
792 				mode='I';
793 			}else if(m=='D'){
794 				mode='D';
795 				refloc++;
796 			}else if(m=='C'){
797 				mode='S';
798 				refloc++;
799 			}else if(m=='N' || m=='B'){
800 				mode='M';
801 				refloc++;
802 			}else{
803 				throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii).  " +
804 						"Match string should be in long format here.");
805 			}
806 
807 			if(mode!=lastMode){
808 				if(count>0){//Prevents an initial length-0 match
809 					sb.append(count);
810 					if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');}
811 					else{sb.append(lastMode);}
812 					if(lastMode!='D'){cigarlen+=count;}
813 					opcount+=count;
814 				}
815 				count=0;
816 				lastMode=mode;
817 			}
818 
819 			count++;
820 			if(sfdflag){count--;}
821 		}
822 		sb.append(count);
823 		if(mode=='D' && count>INTRON_LIMIT){
824 			sb.append('N');
825 		}else{
826 			sb.append(mode);
827 		}
828 		if(mode!='D'){cigarlen+=count;}
829 		opcount+=count;
830 
831 		assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" +
832 				"cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n";
833 
834 		return sb.toString();
835 	}
836 
calcCigarLength(boolean includeSoftClip, boolean includeHardClip)837 	public int calcCigarLength(boolean includeSoftClip, boolean includeHardClip){
838 		return calcCigarLength(cigar, includeSoftClip, includeHardClip);
839 	}
840 
calcCigarReadLength(boolean includeSoftClip, boolean includeHardClip)841 	public int calcCigarReadLength(boolean includeSoftClip, boolean includeHardClip){
842 		return calcCigarReadLength(cigar, includeSoftClip, includeHardClip);
843 	}
844 
845 	/** Reference length of cigar string */
calcCigarLength(String cigar, boolean includeSoftClip, boolean includeHardClip)846 	public static int calcCigarLength(String cigar, boolean includeSoftClip, boolean includeHardClip){
847 		if(cigar==null){return 0;}
848 		int len=0;
849 		int current=0;
850 		for(int i=0; i<cigar.length(); i++){
851 			char c=cigar.charAt(i);
852 			if(Tools.isDigit(c)){
853 				current=(current*10)+(c-'0');
854 			}else{
855 				if(c=='M' || c=='=' || c=='X' || c=='D' || c=='N'){
856 					len+=current;
857 				}else if(c=='S'){
858 					if(includeSoftClip){len+=current;}
859 				}else if (c=='H'){
860 					//In this case, the base string is the wrong length since letters were truncated.
861 					//Therefore, the bases cannot be used for calling variations after mapping.
862 					//Hard clipping messes up original location verification.
863 					//Therefore...  len+=current would be best in practice, but for GRADING purposes, leaving it disabled is best.
864 
865 					if(includeHardClip){len+=current;}
866 				}else if(c=='I'){
867 					//do nothing
868 				}else if(c=='P'){
869 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
870 					//'P' is currently poorly defined
871 				}else{
872 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
873 				}
874 				current=0;
875 			}
876 		}
877 		return len;
878 	}
879 
880 	/** Reference length of cigar string */
calcCigarReadLength(String cigar, boolean includeSoftClip, boolean includeHardClip)881 	public static int calcCigarReadLength(String cigar, boolean includeSoftClip, boolean includeHardClip){
882 		if(cigar==null){return 0;}
883 		int len=0;
884 		int current=0;
885 		for(int i=0; i<cigar.length(); i++){
886 			char c=cigar.charAt(i);
887 			if(Tools.isDigit(c)){
888 				current=(current*10)+(c-'0');
889 			}else{
890 				if(c=='M' || c=='=' || c=='X' || c=='I'){
891 					len+=current;
892 				}else if(c=='S'){
893 					if(includeSoftClip){len+=current;}
894 				}else if (c=='H'){
895 					//In this case, the base string is the wrong length since letters were truncated.
896 					//Therefore, the bases cannot be used for calling variations after mapping.
897 					//Hard clipping messes up original location verification.
898 					//Therefore...  len+=current would be best in practice, but for GRADING purposes, leaving it disabled is best.
899 
900 					if(includeHardClip){len+=current;}
901 				}else if(c=='D' || c=='N'){
902 					//do nothing
903 				}else if(c=='P'){
904 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
905 					//'P' is currently poorly defined
906 				}else{
907 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
908 				}
909 				current=0;
910 			}
911 		}
912 		return len;
913 	}
914 
915 	/** Number of query bases in cigar string */
calcCigarBases(String cigar, boolean includeSoftClip, boolean includeHardClip)916 	public static int calcCigarBases(String cigar, boolean includeSoftClip, boolean includeHardClip){
917 		if(cigar==null){return 0;}
918 		int len=0;
919 		int current=0;
920 		for(int i=0; i<cigar.length(); i++){
921 			char c=cigar.charAt(i);
922 			if(Tools.isDigit(c)){
923 				current=(current*10)+(c-'0');
924 			}else{
925 				if(c=='M' || c=='=' || c=='X' || c=='I'){
926 					len+=current;
927 				}else if(c=='D' || c=='N'){
928 					//do nothing
929 				}else if (c=='H'){
930 					if(includeHardClip){len+=current;}
931 				}else if(c=='S'){
932 					if(includeSoftClip){len+=current;}
933 				}else if(c=='P'){
934 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
935 					//'P' is currently poorly defined
936 				}else{
937 					throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n");
938 				}
939 				current=0;
940 			}
941 		}
942 		return len;
943 	}
944 
945 	/** Length of clipped initial bases.  Used to calculate correct start location of clipped reads. */
countLeadingClip(String cigar, boolean includeSoftClip, boolean includeHardClip)946 	public static int countLeadingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){
947 		if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;}
948 		int len=0;
949 		int current=0;
950 		for(int i=0; i<cigar.length(); i++){
951 			char c=cigar.charAt(i);
952 			if(Tools.isLetter(c) || c=='='){
953 				if(c=='H'){
954 					if(includeHardClip){
955 						len+=current;
956 					}
957 				}else if(c=='S'){
958 					if(includeSoftClip){
959 						len+=current;
960 					}
961 				}else{
962 					break;
963 				}
964 				current=0;
965 			}else{
966 				current=(current*10)+(c-'0');
967 			}
968 		}
969 		return len;
970 	}
971 
972 	/** Length of clipped final bases.  Used to calculate correct stop location of clipped reads. */
countTrailingClip(String cigar, boolean includeSoftClip, boolean includeHardClip)973 	public static int countTrailingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){
974 		if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;}
975 		int len=0;
976 		if(includeHardClip){len+=countTrailingHardClip(cigar);}
977 		int last=cigar.lastIndexOf('S');
978 
979 		int mult=1;
980 		int i;
981 		for(i=last-1; i>=0; i--){
982 			char c=cigar.charAt(i);
983 			if(Tools.isLetter(c) || c=='='){
984 				break;
985 			}
986 			len+=(len+(c-'0')*mult);
987 			mult*=10;
988 		}
989 		if(i<0){return 0;}
990 		return len;
991 	}
992 
993 	/** Length of clipped final bases.  Used to calculate correct stop location of clipped reads. */
countTrailingHardClip(String cigar)994 	public static int countTrailingHardClip(String cigar){
995 		if(cigar==null){return 0;}
996 		int last=cigar.lastIndexOf('H');
997 
998 		int mult=1, len=0;
999 		int i;
1000 		for(i=last-1; i>=0; i--){
1001 			char c=cigar.charAt(i);
1002 			if(Tools.isLetter(c) || c=='='){
1003 				break;
1004 			}
1005 			len+=(len+(c-'0')*mult);
1006 			mult*=10;
1007 		}
1008 		if(i<0){return 0;}
1009 		return len;
1010 	}
1011 
countMdSubs(String mdTag)1012 	public static int countMdSubs(String mdTag){
1013 		assert(mdTag!=null);
1014 
1015 		final int NORMAL=0, SUB=1, DEL=2;
1016 		int dels=0, subs=0, normals=0;
1017 
1018 		if(mdTag!=null){
1019 			int current=0;
1020 			int mode=NORMAL;
1021 			int i=0;
1022 			if(mdTag.startsWith("MD:Z:")){i=5;}
1023 			for(final int max=mdTag.length(); i<max; i++){
1024 				char c=mdTag.charAt(i);
1025 				if(Tools.isDigit(c)){
1026 					current=(current*10)+(c-'0');
1027 					mode=NORMAL;
1028 				}else{
1029 					if(current>0){
1030 						if(mode==NORMAL){normals+=current;}
1031 						else{assert(false) : mode+", "+current;}
1032 						current=0;
1033 					}
1034 					if(c=='^'){mode=DEL;}
1035 					else if(mode==DEL){
1036 						dels++;
1037 					}else if(mode==NORMAL || mode==SUB){
1038 						mode=SUB;
1039 						subs++;
1040 					}
1041 				}
1042 			}
1043 		}
1044 		return subs;
1045 	}
1046 
1047 	/** Length of clipped initial bases. */
countLeadingClip(byte[] match)1048 	public static int countLeadingClip(byte[] match){
1049 		if(match==null || match.length<1 || match[0]!='C'){return 0;}
1050 		int clips=0;
1051 		int current=0;
1052 		for(int mloc=0; mloc<match.length; mloc++){
1053 			byte b=match[mloc];
1054 			if(Tools.isDigit(b)){
1055 				current=current*10+(b-'0');
1056 			}else{
1057 				if(current>0){
1058 					clips=clips+current-1;
1059 				}
1060 				current=0;
1061 				if(b!='C'){break;}
1062 				clips++;
1063 			}
1064 		}
1065 		if(current>0){
1066 			clips=clips+current-1;
1067 		}
1068 		return clips;
1069 	}
1070 
1071 	/** Length of match string portion describing clipped initial bases. */
countLeadingClip2(byte[] match)1072 	public static int countLeadingClip2(byte[] match){
1073 		if(match==null || match.length<1 || match[0]!='C'){return 0;}
1074 		int mloc=0;
1075 		for(; mloc<match.length; mloc++){
1076 			byte b=match[mloc];
1077 			if(b!='C' && !Tools.isDigit(b)){return mloc;}
1078 		}
1079 		return match.length;
1080 	}
1081 
1082 	/** Length of clipped trailing bases. */
countTrailingClip(byte[] match)1083 	public static int countTrailingClip(byte[] match){
1084 		if(match==null){return 0;}
1085 		int clips=0;
1086 		for(int mloc=match.length-1; mloc>=0; mloc--){
1087 			byte b=match[mloc];
1088 			assert(!Tools.isDigit(b)) : new String(match);
1089 			if(b=='C'){
1090 				clips++;
1091 			}else{
1092 				break;
1093 			}
1094 		}
1095 		return clips;
1096 	}
1097 
1098 	/** Length of clipped (out of bounds) initial insertions and deletions. */
countLeadingIndels(int rloc, byte[] match)1099 	public static int countLeadingIndels(int rloc, byte[] match){
1100 		if(match==null || rloc>=0){return 0;}
1101 		int dels=0;
1102 		int inss=0;
1103 		int cloc=0;
1104 		for(int mloc=0; mloc<match.length && rloc<0; mloc++){
1105 			byte b=match[mloc];
1106 			assert(!Tools.isDigit(b));
1107 			if(b=='D'){
1108 				dels++;
1109 				rloc++;
1110 			}else if(b=='I'){
1111 				inss++;
1112 				cloc++;
1113 			}else{
1114 				rloc++;
1115 				cloc++;
1116 			}
1117 		}
1118 		return dels-inss;
1119 	}
1120 
1121 	/** Length of clipped (out of bounds) trialing insertions and deletions. */
countTrailingIndels(int rloc, int rlen, byte[] match)1122 	public static int countTrailingIndels(int rloc, int rlen, byte[] match){
1123 		if(match==null || rloc>=0){return 0;}
1124 		int dels=0;
1125 		int inss=0;
1126 		int cloc=0;
1127 		for(int mloc=match.length; mloc>=0 && rloc>=rlen; mloc--){
1128 			byte b=match[mloc];
1129 			assert(!Tools.isDigit(b));
1130 			if(b=='D'){
1131 				dels++;
1132 				rloc--;
1133 			}else if(b=='I'){
1134 				inss++;
1135 				cloc--;
1136 			}else{
1137 				rloc--;
1138 				cloc--;
1139 			}
1140 		}
1141 		return dels-inss;
1142 	}
1143 
mappedNonClippedBases()1144 	public int mappedNonClippedBases() {
1145 		if(!mapped() || cigar==null || seq==null){return 0;}
1146 
1147 		int len=0;
1148 		int current=0;
1149 		for(int i=0; i<cigar.length(); i++){
1150 			char c=cigar.charAt(i);
1151 			if(Tools.isDigit(c)){
1152 				current=(current*10)+(c-'0');
1153 			}else{
1154 				if(c=='M' || c=='='){
1155 					len+=current;
1156 				}else if(c=='X'){
1157 					len+=current;
1158 				}else if(c=='D' || c=='N'){
1159 
1160 				}else if(c=='I'){
1161 					len+=current;
1162 				}else if(c=='S' || c=='H' || c=='P'){
1163 
1164 				}
1165 				current=0;
1166 			}
1167 		}
1168 		return len;
1169 	}
1170 
1171 	/**
1172 	 * @param cigar
1173 	 * @return Max consecutive match, sub, del, ins, or clip symbols
1174 	 */
cigarToMdsiMax(String cigar)1175 	public static final int[] cigarToMdsiMax(String cigar) {
1176 		if(cigar==null){return null;}
1177 		int[] msdic=KillSwitch.allocInt1D(5);
1178 
1179 		int current=0;
1180 		for(int i=0; i<cigar.length(); i++){
1181 			char c=cigar.charAt(i);
1182 			if(Tools.isDigit(c)){
1183 				current=(current*10)+(c-'0');
1184 			}else{
1185 				if(c=='M' || c=='='){
1186 					msdic[0]=Tools.max(msdic[0], current);
1187 				}else if(c=='X'){
1188 					msdic[1]=Tools.max(msdic[1], current);
1189 				}else if(c=='D' || c=='N'){
1190 					msdic[2]=Tools.max(msdic[2], current);
1191 				}else if(c=='I'){
1192 					msdic[3]=Tools.max(msdic[3], current);
1193 				}else if(c=='S' || c=='H' || c=='P'){
1194 					msdic[4]=Tools.max(msdic[4], current);
1195 				}
1196 				current=0;
1197 			}
1198 		}
1199 		return msdic;
1200 	}
1201 
calcIdentity()1202 	public float calcIdentity() {
1203 		assert(cigar!=null);
1204 		int match=0, other=0;
1205 
1206 		int current=0;
1207 		for(int i=0; i<cigar.length(); i++){
1208 			char c=cigar.charAt(i);
1209 			if(Tools.isDigit(c)){
1210 				current=(current*10)+(c-'0');
1211 			}else{
1212 				if(c=='='){
1213 					match+=current;
1214 				}else if(c=='M'){
1215 
1216 				}else if(c=='X'){
1217 					other+=current;
1218 				}else if(c=='D'){
1219 					other+=current;
1220 				}else if(c=='N'){
1221 
1222 				}else if(c=='I'){
1223 					other+=current;
1224 				}else if(c=='S' || c=='H' || c=='P'){
1225 
1226 				}
1227 				current=0;
1228 			}
1229 		}
1230 		return match/(float)Tools.max(match+other, 1);
1231 	}
1232 
1233 	/**
1234 	 * @param cigar
1235 	 * @return Total number of match, sub, del, ins, or clip symbols
1236 	 */
cigarToMsdic(String cigar)1237 	public static final int[] cigarToMsdic(String cigar) {
1238 		if(cigar==null){return null;}
1239 		int[] msdic=KillSwitch.allocInt1D(5);
1240 
1241 		int current=0;
1242 		for(int i=0; i<cigar.length(); i++){
1243 			char c=cigar.charAt(i);
1244 			if(Tools.isDigit(c)){
1245 				current=(current*10)+(c-'0');
1246 			}else{
1247 				if(c=='M' || c=='='){
1248 					msdic[0]+=current;
1249 				}else if(c=='X'){
1250 					msdic[1]+=current;
1251 				}else if(c=='D' || c=='N'){
1252 					msdic[2]+=current;
1253 				}else if(c=='I'){
1254 					msdic[3]+=current;
1255 				}else if(c=='S' || c=='H' || c=='P'){
1256 					msdic[4]+=current;
1257 				}
1258 				current=0;
1259 			}
1260 		}
1261 		return msdic;
1262 	}
1263 
1264 	/**
1265 	 * @param allowM Allow M symbols in the cigar string
1266 	 * @return Match string of this cigar string when possible, otherwise null.
1267 	 * Takes into account MD tag and bases, but not reference (other than in MD tag).
1268 	 */
toShortMatch(boolean allowM)1269 	public final byte[] toShortMatch(boolean allowM) {
1270 		if(cigar==null || cigar.equals(stringstar)){return null;}
1271 		if(allowM){return cigarToShortMatch_old(cigar, allowM);}
1272 
1273 //		System.err.println("\nInput: cigar="+cigar+", MD="+mdTag());//123
1274 
1275 		final boolean fixMatchSubs;
1276 		final boolean fixMatchNs;
1277 		boolean foundE=false;
1278 		boolean foundX=false;
1279 		boolean foundM=false;
1280 //		System.err.println("Block 1.");//123
1281 		{
1282 			int current=0, total=0;
1283 			for(int i=0; i<cigar.length(); i++){
1284 				char c=cigar.charAt(i);
1285 
1286 				if(Tools.isDigit(c)){
1287 					current=(current*10)+(c-'0');
1288 				}else{
1289 
1290 					if(c=='H'){
1291 						current=0; //Information destroyed
1292 					}else if(c=='P'){
1293 						return null; //Undefined symbol
1294 					}
1295 					foundE|=(c=='=');
1296 					foundX|=(c=='X');
1297 					foundM|=(c=='M');
1298 
1299 					total+=current;
1300 					current=0;
1301 				}
1302 			}
1303 			if(total<1){return null;}
1304 			fixMatchSubs=(!allowM && foundM && !foundX && !foundE);//Note: allowM already exited.
1305 			fixMatchNs=(FIX_MATCH_NS && foundX && !foundM); //Means no-calls are possibly marked as X, which is technically OK.
1306 
1307 //			System.err.println("allowM="+allowM);//123
1308 //			System.err.println("foundE="+foundE);//123
1309 //			System.err.println("foundX="+foundX);//123
1310 //			System.err.println("foundM="+foundM);//123
1311 		}
1312 
1313 //		System.err.println("Block 2.");//123
1314 		final String mdTag;
1315 		final int mdSubs;
1316 		final byte[] refBases;
1317 
1318 		//1) if fixMatch, grab MD tag
1319 		//2) if MD and no subs, return
1320 		//3) grab ref bases
1321 
1322 		if(fixMatchSubs || fixMatchNs){
1323 			final String md0=mdTag();
1324 			mdSubs=(md0==null ? -1 : countMdSubs(md0));
1325 			if(mdSubs==0 && !fixMatchNs){
1326 				refBases=null;
1327 				mdTag=null;
1328 			}else{
1329 				mdTag=md0;
1330 				if(mdTag!=null && PREFER_MDTAG){
1331 					refBases=null;
1332 				}else{
1333 					ScafMap map=ScafMap.defaultScafMap();
1334 					assert(!fixMatchSubs || mdTag!=null || map!=null) : "TODO: Encountered a read with 'M' in cigar string but no MD tag and no ScafMap loaded.\n"
1335 							+ "This can normally be resolved by adding the flag ref=file, where file is the fasta file to which the reads were mapped.\n\n"+this;
1336 					if(map==null){
1337 						refBases=null;
1338 					}else{
1339 						Scaffold scaf=map.getScaffold(rnameS());
1340 						assert(!fixMatchSubs || mdTag!=null || scaf!=null) : "Encountered a read with 'M' in cigar string but no scaffold loaded for "+rnameS();
1341 						if(scaf==null){
1342 							refBases=null;
1343 						}else{
1344 							refBases=scaf.bases;
1345 							assert(!fixMatchSubs || mdTag!=null || refBases!=null) : "Encountered a read with 'M' in cigar string but no ref bases loaded for "+rnameS();
1346 							if(fixMatchSubs && refBases==null && mdTag==null){
1347 								return null;
1348 							}
1349 						}
1350 					}
1351 				}
1352 			}
1353 		}else{
1354 			mdSubs=-1;
1355 			mdTag=null;
1356 			refBases=null;
1357 		}
1358 
1359 //		System.err.println("mdTag="+mdTag);//123
1360 //		System.err.println("mdSubs="+mdSubs);//123
1361 
1362 		char mSymbol=((foundX || foundE || !foundM) ? 'N' : mdSubs>=0 ? 'm' : 'N');
1363 
1364 //		System.err.println("Block 3.");//123
1365 		final byte[] match0;
1366 		{
1367 			ByteBuilder sb=new ByteBuilder(cigar.length());
1368 			int current=0;
1369 			for(int cpos=0, max=cigar.length(); cpos<max; cpos++){
1370 				char c=cigar.charAt(cpos);
1371 				if(Tools.isDigit(c)){
1372 					current=(current*10)+(c-'0');
1373 				}else{
1374 					if(c=='='){
1375 						sb.append('m');
1376 						if(current>1){sb.append(current);}
1377 					}else if(c=='X'){
1378 						sb.append('S');
1379 						if(current>1){sb.append(current);}
1380 					}else if(c=='D' || c=='N'){
1381 						sb.append('D');
1382 						if(current>1){sb.append(current);}
1383 					}else if(c=='I'){
1384 						sb.append('I');
1385 						if(current>1){sb.append(current);}
1386 					}else if(c=='S'){
1387 						sb.append('C');
1388 						if(current>1){sb.append(current);}
1389 					}else if(c=='M'){
1390 						sb.append(mSymbol);
1391 						if(current>1){sb.append(current);}
1392 					}
1393 					current=0;
1394 				}
1395 			}
1396 			match0=(sb.array.length==sb.length() ? sb.array : sb.toBytes());
1397 //			System.err.println("match="+new String(match0));//123
1398 		}
1399 
1400 //		System.err.println("Block 4.");//123
1401 
1402 		if((!fixMatchSubs || mdSubs==0) && (!fixMatchNs || seq==null || refBases==null)){return match0;}
1403 		assert(((mdTag!=null || refBases!=null) && fixMatchSubs && mdSubs!=0) || fixMatchNs /*|| noCalls>0*/) :
1404 			(mdTag!=null)+", "+(refBases!=null)+", "+(fixMatchSubs)+", "+(mdSubs!=0)+", "+fixMatchNs/*+", "+(noCalls)*/;
1405 
1406 //		assert(false) : mdTag+", "+refBases+", "+processMD+", "+mdSubs+"\n"+this;
1407 
1408 //		System.err.println("processMD="+processMD+", mdSubs="+mdSubs+", mdTag="+mdTag);//123
1409 
1410 		final byte[] bases;
1411 		if(refBases!=null){
1412 //			bases=(strand()==1) ? AminoAcid.reverseComplementBases(seq) : seq;//Why not reverse in place?
1413 			if(strand()==1){AminoAcid.reverseComplementBasesInPlace(seq);}
1414 			bases=seq;
1415 		}else{bases=null;}
1416 
1417 //		System.err.println("Block 5.");//123
1418 		final byte[] longmatch=Read.toLongMatchString(match0);
1419 
1420 //		final int noCalls=((foundE || foundX) && foundM ? 1 : seq==null ? -1 : Read.countNocalls(seq));
1421 
1422 //		System.err.println("Block 6");//123
1423 
1424 		if(mdTag!=null && (refBases==null || PREFER_MDTAG)){
1425 //			System.err.println("match="+new String(longmatch));//123
1426 
1427 			final int noCalls=seq==null ? -1 : Read.countNocalls(seq);
1428 			if(noCalls>0 && bases!=null && refBases==null){//Not necessary if ref sequence is present
1429 				int bpos=0;
1430 				for(int mpos=0; mpos<longmatch.length; mpos++){
1431 					final byte m=longmatch[mpos];
1432 					if(m=='C'){
1433 						bpos++;
1434 					}else if(m=='m' || m=='s' || m=='S' || m=='N'){
1435 						if(!AminoAcid.isFullyDefined(bases[bpos])){longmatch[mpos]='N';}
1436 						bpos++;
1437 					}else if(m=='I' || m=='X' || m=='Y'){
1438 						bpos++;
1439 					}else if(m=='D'){
1440 						//do nothing
1441 					}else{
1442 						assert(false) : m;
1443 					}
1444 				}
1445 			}
1446 
1447 			final MDWalker walker=new MDWalker(mdTag, cigar, longmatch, this);
1448 
1449 			walker.fixMatch(bases);
1450 		}else
1451 		if(refBases!=null){
1452 			final int refStart=start(true, false);
1453 			fixMatch(bases, refBases, longmatch, refStart, false);
1454 		}
1455 
1456 //		else if(refBases!=null){
1457 //			final int refStart=start(true, false);
1458 //			fixMatch(bases, refBases, longmatch, refStart, false);
1459 //		}
1460 		else{
1461 			assert(false) : "Fallthorugh.";
1462 		}
1463 
1464 		final byte[] match=Read.toShortMatchString(longmatch);
1465 
1466 		if(bases!=null && strand()==1){AminoAcid.reverseComplementBasesInPlace(seq);}
1467 
1468 //		System.err.println("Block 7.");//123
1469 //		System.err.println("Returning "+new String(match));//123
1470 		return match;
1471 	}
1472 
1473 	/** Requires longmatch.
1474 	 * Replaces M  */
fixMatch(byte[] call, byte[] ref, byte[] match, int refstart, boolean unClip)1475 	public static void fixMatch(byte[] call, byte[] ref, byte[] match, int refstart, boolean unClip){
1476 		for(int mpos=0, rpos=refstart, cpos=0; mpos<match.length; mpos++){
1477 			assert(cpos>=0 && cpos<call.length) : "\n"+new String(match)+"\n"+new String(call)+"\n"+mpos+", "+cpos;
1478 			final byte m=match[mpos];
1479 
1480 			if(rpos<0 || rpos>=ref.length){
1481 				if(m=='I'){
1482 					assert(false) : "Insertion off scaffold end: "+refstart+", "+ref.length+"\n"+new String(call)+"\n"+new String(match);
1483 					cpos++;
1484 				}else if(m=='D'){
1485 					assert(false) : "Deletion off scaffold end: "+refstart+", "+ref.length+"\n"+new String(call)+"\n"+new String(match);
1486 					rpos++;
1487 				}else{
1488 					match[mpos]='C';
1489 					rpos++;
1490 					cpos++;
1491 				}
1492 			}else if(m=='m' || m=='S' || m=='N' || m=='s' || (m=='C' && unClip)){
1493 				final byte c=Tools.toUpperCase(call[cpos]);
1494 				final byte r=Tools.toUpperCase(ref[rpos]);
1495 				final boolean defined=(AminoAcid.isFullyDefined(c) && AminoAcid.isFullyDefined(r));
1496 				if(!defined){
1497 					match[mpos]='N';
1498 				}else if(c==r){
1499 					match[mpos]='m';
1500 				}else{
1501 					match[mpos]='S';
1502 				}
1503 				rpos++;
1504 				cpos++;
1505 			}else if(m=='C'){ //Do nothing for clipped call
1506 				rpos++;
1507 				cpos++;
1508 			}else if(m=='I' || m=='X' || m=='Y'){
1509 				cpos++;
1510 			}else if(m=='D'){
1511 				rpos++;
1512 			}else{
1513 				assert(false) : Character.toString((char)m);
1514 			}
1515 		}
1516 	}
1517 
1518 	/**
1519 	 * @param cigar
1520 	 * @return Match string of this cigar string when possible, otherwise null
1521 	 */
cigarToShortMatch_old(String cigar, boolean allowM)1522 	public static final byte[] cigarToShortMatch_old(String cigar, boolean allowM) {
1523 
1524 		int current=0;
1525 		ByteBuilder sb=new ByteBuilder(cigar.length());
1526 
1527 		for(int i=0; i<cigar.length(); i++){
1528 			char c=cigar.charAt(i);
1529 			if(Tools.isDigit(c)){
1530 				current=(current*10)+(c-'0');
1531 			}else{
1532 				if(c=='='){
1533 					sb.append('m');
1534 					if(current>1){sb.append(current);}
1535 				}else if(c=='X'){
1536 					sb.append('S');
1537 					if(current>1){sb.append(current);}
1538 				}else if(c=='D' || c=='N'){
1539 					sb.append('D');
1540 					if(current>1){sb.append(current);}
1541 				}else if(c=='I'){
1542 					sb.append('I');
1543 					if(current>1){sb.append(current);}
1544 				}else if(c=='S'){
1545 					sb.append('C');
1546 					if(current>1){sb.append(current);}
1547 				}else if(c=='M'){
1548 					if(!allowM){return null;}
1549 //					sb.append('B');
1550 					sb.append('N');
1551 					if(current>1){sb.append(current);}
1552 				}
1553 				current=0;
1554 			}
1555 		}
1556 
1557 		if(sb.array.length==sb.length()){return sb.array;}
1558 		return sb.toBytes();
1559 	}
1560 
1561 	/*--------------------------------------------------------------*/
1562 	/*----------------             Tags             ----------------*/
1563 	/*--------------------------------------------------------------*/
1564 
makeStopTag(int pos, int seqLength, String cigar, boolean perfect)1565 	public static String makeStopTag(int pos, int seqLength, String cigar, boolean perfect){
1566 //		return "YS:i:"+(pos+((cigar==null || perfect) ? seqLength : -countLeadingClip(cigar, false)+calcCigarLength(cigar, false))-1); //123456789
1567 		return "YS:i:"+(pos+((cigar==null || perfect) ? seqLength : calcCigarLength(cigar, true, false))-1);
1568 	}
1569 
makeLengthTag(int pos, int seqLength, String cigar, boolean perfect)1570 	public static String makeLengthTag(int pos, int seqLength, String cigar, boolean perfect){
1571 		if(cigar==null || perfect){return "YL:Z:"+seqLength+","+seqLength;}
1572 		return "YL:Z:"+(seqLength-countLeadingClip(cigar, true, false))+","+calcCigarLength(cigar, false, false);
1573 	}
1574 
makeIdentityTag(byte[] match, boolean perfect)1575 	public static String makeIdentityTag(byte[] match, boolean perfect){
1576 		if(perfect){return "YI:f:100";}
1577 		float f=Read.identity(match);
1578 		return String.format(Locale.ROOT, "YI:f:%.2f", (100*f));
1579 	}
1580 
makeScoreTag(int score)1581 	public static String makeScoreTag(int score){
1582 		return "YR:i:"+score;
1583 	}
1584 
matchTag()1585 	public String matchTag(){
1586 		if(optional==null){return null;}
1587 		for(String s : optional){
1588 			if(s.startsWith("X2:Z:")){
1589 				return s;
1590 			}
1591 		}
1592 		return null;
1593 	}
1594 
makeXSTag(Read r)1595 	private String makeXSTag(Read r){
1596 		if(r.mapped() && cigar!=null && cigar.indexOf('N')>=0){
1597 //			System.err.println("For read "+r.pairnum()+" mapped to strand "+r.strand());
1598 			boolean plus=(r.strand()==Shared.PLUS); //Assumes secondstrand=false
1599 //			System.err.println("plus="+plus);
1600 			if(r.pairnum()!=0){plus=!plus;}
1601 //			System.err.println("plus="+plus);
1602 			if(XS_SECONDSTRAND){plus=!plus;}
1603 //			System.err.println("plus="+plus);
1604 			return (plus ? XSPLUS : XSMINUS);
1605 		}else{
1606 			return null;
1607 		}
1608 	}
1609 
makeMdTag(int chrom, int refstart, byte[] match, byte[] call, int scafloc, int scaflen)1610 	public static String makeMdTag(int chrom, int refstart, byte[] match, byte[] call, int scafloc, int scaflen){
1611 		if(match==null || chrom<0){return null;}
1612 		ByteBuilder md=new ByteBuilder(8);
1613 		md.append("MD:Z:");
1614 
1615 		ChromosomeArray cha=Data.getChromosome(chrom);
1616 
1617 		final int scafstop=scafloc+scaflen;
1618 
1619 		byte prevM='?';
1620 		int count=0;
1621 		int dels=0;
1622 		boolean prevSub=false;
1623 		for(int mpos=0, rpos=refstart, cpos=0; mpos<match.length; mpos++){
1624 			assert(cpos>=0 && cpos<call.length) : "\n"+new String(match)+"\n"+new String(call)+"\n"+mpos+", "+cpos+", "+dels+", "+INTRON_LIMIT;
1625 			final byte c=call[cpos];
1626 			final byte m=match[mpos];
1627 
1628 			if(prevM=='D' && m!='D'){
1629 				if(dels<=INTRON_LIMIT){//Otherwise, ignore it
1630 					md.append(count);
1631 					count=0;
1632 					md.append('^');
1633 					for(int i=rpos-dels; i<rpos; i++){
1634 						md.append((char)cha.get(i));
1635 					}
1636 					dels=0;
1637 				}
1638 			}
1639 
1640 			if(m=='C' || rpos<scafloc || rpos>=scafstop){ //Do nothing for clipped bases
1641 				rpos++;
1642 				if(m!='D'){cpos++;}
1643 			}else if(m=='m' || m=='s'){
1644 				count++;
1645 				rpos++;
1646 				cpos++;
1647 			}else if(m=='S'){
1648 				if(count>0 || !prevSub){md.append(count);}
1649 				md.append((char)cha.get(rpos));
1650 
1651 				count=0;
1652 				rpos++;
1653 				cpos++;
1654 				prevSub=true;
1655 			}else if(m=='N'){
1656 
1657 				final byte r=cha.get(rpos);
1658 
1659 				if(c==r){//Act like match
1660 					count++;
1661 					rpos++;
1662 					cpos++;
1663 				}else{//Act like sub
1664 					if(count>0 || !prevSub){md.append(count);}
1665 					md.append((char)r);
1666 
1667 					count=0;
1668 					rpos++;
1669 					cpos++;
1670 					prevSub=true;
1671 				}
1672 			}else if(m=='I' || m=='X' || m=='Y'){
1673 				cpos++;
1674 //				count++;
1675 			}else if(m=='D'){
1676 //				if(prevM!='D'){
1677 //					md.append(count);
1678 //					count=0;
1679 //					md.append('^');
1680 //				}
1681 //				md.append((char)cha.get(rpos));
1682 
1683 				rpos++;
1684 				dels++;
1685 			}
1686 			prevM=m;
1687 
1688 		}
1689 //		if(count>0){
1690 			md.append(count);
1691 //		}
1692 
1693 		return md.toString();
1694 	}
1695 
calcLeftClip(String cig, String id)1696 	public static int calcLeftClip(String cig, String id){
1697 		if(cig==null){return 0;}
1698 		int len=0;
1699 		for(int i=0; i<cig.length(); i++){
1700 			char c=cig.charAt(i);
1701 			if(Tools.isDigit(c)){
1702 				len=len*10+(c-'0');
1703 			}else{
1704 				assert(c!='S' || i<cig.length()-1);//ban entirely soft-clipped reads
1705 				return (c=='S') ? len : 0;
1706 			}
1707 		}
1708 		return 0;
1709 	}
1710 
calcRightClip(String cig, String id)1711 	public static int calcRightClip(String cig, String id){
1712 		if(cig==null || cig.length()<1 || cig.charAt(cig.length()-1)!='S'){return 0;}
1713 		int pos=cig.length()-2;
1714 		for(; pos>=0 && Tools.isDigit(cig.charAt(pos)); pos--){}
1715 
1716 		assert(pos>0) : cig+", id="+id+", pos="+pos;//ban entirely soft-clipped reads
1717 
1718 		int len=0;
1719 		for(int i=pos+1; i<cig.length(); i++){
1720 			char c=cig.charAt(i);
1721 			if(Tools.isDigit(c)){
1722 				len=len*10+(c-'0');
1723 			}else{
1724 				return (c=='S') ? len : 0;
1725 			}
1726 		}
1727 		return len;
1728 	}
1729 
makeOptionalTags(Read r, Read r2, boolean perfect, int scafloc, int scaflen, boolean inbounds, boolean inbounds2)1730 	public ArrayList<String> makeOptionalTags(Read r, Read r2, boolean perfect, int scafloc, int scaflen, boolean inbounds, boolean inbounds2){
1731 		if(NO_TAGS){return null;}
1732 		final boolean mapped=r.mapped();
1733 		if(!mapped && READGROUP_ID==null && !MAKE_CUSTOM_TAGS && !MAKE_TIME_TAG){return null;}
1734 
1735 		ArrayList<String> optionalTags=new ArrayList<String>(8);
1736 
1737 		if(mapped){
1738 			if(!r.secondary() && r.ambiguous()){optionalTags.add("XT:A:R");} //Not sure what do do for secondary alignments
1739 
1740 //			int nm=r.length();
1741 //			int dels=0;
1742 
1743 			int nm=0;
1744 
1745 //			//Only works for cigar strings in format 1.4+
1746 //			if(perfect){nm=0;}else if(cigar!=null){
1747 //				int len=0;
1748 //				for(int i=0; i<cigar.length(); i++){
1749 //					char c=cigar.charAt(i);
1750 //					if(Tools.isDigit(c)){
1751 //						len=len*10+(c-'0');
1752 //					}else{
1753 //						if(c=='X' || c=='I' || c=='D' || c=='M'){
1754 //							nm+=len;
1755 //						}
1756 //						len=0;
1757 //					}
1758 //				}
1759 ////				System.err.println("\nRead "+r.id+": nm="+nm+"\n"+cigar+"\n"+new String(r.match));
1760 //				System.err.println("\nRead "+r.id+": nm="+nm);
1761 //			}
1762 
1763 			if(perfect){nm=0;}else if(r.match!=null){
1764 				nm=0;
1765 				int leftclip=calcLeftClip(cigar, r.id), rightclip=calcRightClip(cigar, r.id);
1766 				final int from=leftclip, to=r.length()-rightclip;
1767 				int delsCurrent=0;
1768 				for(int i=0, cpos=0; i<r.match.length; i++){
1769 					final byte b=r.match[i];
1770 
1771 //					System.err.println("i="+i+", cpos="+cpos+", from="+from+", ")
1772 
1773 					if(cpos>=from && cpos<to){
1774 						if(b=='I' || b=='S' || b=='N' || b=='X' || b=='Y'){nm++;}
1775 
1776 						if(b=='D'){delsCurrent++;}
1777 						else{
1778 							if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;}
1779 							delsCurrent=0;
1780 						}
1781 					}
1782 					if(b!='D'){cpos++;}
1783 				}
1784 				if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;}
1785 				//				assert(false) : nm+",  "+dels+", "+delsCurrent+", "+r.length()+", "+r.match.length;
1786 
1787 //				assert(false) : "rlen="+r.length()+", nm="+nm+", dels="+delsCurrent+", intron="+INTRON_LIMIT+", inbound1="+inbounds+", ib2="+inbounds2+"\n"+new String(r.match);
1788 
1789 //				System.err.println("\nRead "+r.id+": left="+leftclip+", right="+rightclip+", nm="+nm+"\n"+cigar+"\n"+new String(r.match));
1790 
1791 			}
1792 
1793 			if(MAKE_NM_TAG){
1794 				if(perfect){optionalTags.add("NM:i:0");}
1795 				else if(r.match!=null){optionalTags.add("NM:i:"+(nm));}
1796 			}
1797 			if(MAKE_SM_TAG){optionalTags.add("SM:i:"+mapq);}
1798 			if(MAKE_AM_TAG){optionalTags.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, r2.mapScore/r2.length()) : 0)));}
1799 
1800 			if(MAKE_TOPHAT_TAGS){
1801 				optionalTags.add("AS:i:0");
1802 				if(cigar==null || cigar.indexOf('N')<0){
1803 					optionalTags.add("XN:i:0");
1804 				}else{
1805 				}
1806 				optionalTags.add("XM:i:0");
1807 				optionalTags.add("XO:i:0");
1808 				optionalTags.add("XG:i:0");
1809 				if(cigar==null || cigar.indexOf('N')<0){
1810 					optionalTags.add("YT:Z:UU");
1811 				}else{
1812 				}
1813 				optionalTags.add("NH:i:1");
1814 			}else if(MAKE_XM_TAG){//XM tag.  For bowtie compatibility; unfortunately it is poorly defined.
1815 				int x=0;
1816 				if(r.discarded() || (!r.ambiguous() && !mapped)){
1817 					x=0;//TODO: See if the flag needs to be present in this case.
1818 				}else if(mapped){
1819 					x=1;
1820 					if(r.numSites()>0 && r.numSites()>0){
1821 						int z=r.topSite().score;
1822 						for(int i=1; i<r.sites.size(); i++){
1823 							SiteScore ss=r.sites.get(i);
1824 							if(ss!=null && ss.score==z){x++;}
1825 						}
1826 					}
1827 					if(r.ambiguous()){x=Tools.max(x, 2);}
1828 				}
1829 				if(x>=0){optionalTags.add("XM:i:"+x);}
1830 			}
1831 
1832 			//XS tag
1833 			if(MAKE_XS_TAG){
1834 				String xs=makeXSTag(r);
1835 				if(xs!=null){
1836 					optionalTags.add(xs);
1837 					assert(r2==null || r.pairnum()!=r2.pairnum());
1838 					//					assert(r2==null || !r2.mapped() || r.strand()==r2.strand() || makeXSTag(r2)==xs) :
1839 					//						"XS problem:\n"+r+"\n"+r2+"\n"+xs+"\n"+makeXSTag(r2)+"\n";
1840 				}
1841 			}
1842 
1843 			if(MAKE_MD_TAG){
1844 				String md=makeMdTag(r.chrom, r.start, r.match, r.bases, scafloc, scaflen);
1845 				if(md!=null){optionalTags.add(md);}
1846 			}
1847 
1848 			if(r.mapped() && MAKE_NH_TAG){
1849 				if(ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS && r.numSites()>1){
1850 					optionalTags.add("NH:i:"+r.sites.size());
1851 				}else{
1852 					optionalTags.add("NH:i:1");
1853 				}
1854 			}
1855 
1856 			if(MAKE_STOP_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeStopTag(pos, r.length(), cigar, perfect));}
1857 
1858 			if(MAKE_LENGTH_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeLengthTag(pos, r.length(), cigar, perfect));}
1859 
1860 			if(MAKE_IDENTITY_TAG && (perfect || r.match!=null)){optionalTags.add(makeIdentityTag(r.match, perfect));}
1861 
1862 			if(MAKE_SCORE_TAG && r.mapped()){optionalTags.add(makeScoreTag(r.mapScore));}
1863 
1864 			if(MAKE_INSERT_TAG && r2!=null){
1865 				if((r.mapped() && r.paired()) || r.originalSite!=null){
1866 					optionalTags.add("X8:Z:"+r.insertSizeMapped(false)+(r.originalSite==null ? "" : ","+r.insertSizeOriginalSite()));
1867 				}
1868 			}
1869 			if(MAKE_CORRECTNESS_TAG){
1870 				final SiteScore ss0=r.originalSite;
1871 				if(ss0!=null){
1872 					optionalTags.add("X9:Z:"+(ss0.isCorrect(r.chrom, r.strand(), r.start, r.stop, 0) ? "T" : "F"));
1873 				}
1874 			}
1875 		}
1876 
1877 		if(READGROUP_ID!=null){
1878 			assert(READGROUP_TAG!=null);
1879 			optionalTags.add(READGROUP_TAG);
1880 		}
1881 
1882 		if(MAKE_CUSTOM_TAGS){
1883 			int sites=r.numSites() + (r.originalSite==null ? 0 : 1);
1884 			if(sites>0){
1885 				ByteBuilder sb=new ByteBuilder();
1886 				sb.append("X1:Z:");
1887 				if(r.sites!=null){
1888 					for(SiteScore ss : r.sites){
1889 						sb.append('$');
1890 						sb.append(ss.toText());
1891 					}
1892 				}
1893 				if(r.originalSite!=null){
1894 					sb.append('$');
1895 					sb.append('*');
1896 					sb.append(r.originalSite.toText());
1897 				}
1898 				optionalTags.add(sb.toString());
1899 			}
1900 
1901 			if(mapped){
1902 				if(r.match!=null){
1903 					byte[] match=r.match;
1904 					if(!r.shortmatch()){
1905 						match=Read.toShortMatchString(match);
1906 					}
1907 					optionalTags.add("X2:Z:"+new String(match));
1908 				}
1909 
1910 				optionalTags.add("X3:i:"+r.mapScore);
1911 			}
1912 			optionalTags.add("X5:Z:"+r.numericID);
1913 			optionalTags.add("X6:i:"+(r.flags|(r.match==null ? 0 : Read.SHORTMATCHMASK)));
1914 			if(r.copies>1){optionalTags.add("X7:i:"+r.copies);}
1915 		}
1916 
1917 		if(MAKE_TIME_TAG){
1918 			assert(r.obj!=null && r.obj.getClass()==Long.class) : r.obj;
1919 			optionalTags.add("X0:i:"+(r.obj==null ? 0 : r.obj));
1920 		}
1921 
1922 		if(MAKE_BOUNDS_TAG){
1923 			String a=(r.mapped() ? inbounds ? "I" : "O" : "U");
1924 			if(r2==null){
1925 				optionalTags.add("XB:Z:"+a);
1926 			}else{
1927 				String b=(r2.mapped() ? inbounds2 ? "I" : "O" : "U");
1928 				optionalTags.add("XB:Z:"+a+b);
1929 			}
1930 		}
1931 
1932 		return optionalTags;
1933 	}
1934 
1935 	/*--------------------------------------------------------------*/
1936 	/*----------------            ?            ----------------*/
1937 	/*--------------------------------------------------------------*/
1938 
1939 	/** Length of read bases */
length()1940 	public int length(){
1941 		assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) :
1942 			"This program requires bases or a cigar string for every sam line.  Problem line:\n"+this+"\n";
1943 		return seq==null ? calcCigarBases(cigar, true, false) : seq.length;
1944 	}
1945 
1946 //	public int length(boolean includeSoftClip){
1947 //		assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) :
1948 //			"This program requires bases or a cigar string for every sam line.  Problem line:\n"+this+"\n";
1949 //		return seq==null ? calcCigarBases(cigar, includeSoftClip, false) : seq.length;
1950 //	}
1951 
toMapq(Read r, SiteScore ss)1952 	public static int toMapq(Read r, SiteScore ss){
1953 		assert(r!=null);
1954 		int score=(ss==null ? r.mapScore : ss.slowScore);
1955 		return toMapq(score, r.length(), r.mapped(), r.ambiguous());
1956 	}
1957 
toMapq(int score, int length, boolean mapped, boolean ambig)1958 	public static int toMapq(int score, int length, boolean mapped, boolean ambig){
1959 		if(!mapped || length<1){return 0;}
1960 
1961 		if(ambig && PENALIZE_AMBIG){
1962 			float max=3;
1963 			float adjusted=(score*max)/(100f*length);
1964 			return Tools.max(1, (int)Math.round(adjusted));
1965 		}else{
1966 			float score2=(score-length*40)*1.6f;
1967 			float max=1.5f*((float)Tools.log2(length))+36;
1968 			float adjusted=(score2*max)/(100f*length);
1969 			return Tools.max(4, (int)Math.round(adjusted));
1970 		}
1971 	}
1972 
1973 
parseName()1974 	public Read parseName(){
1975 		try {
1976 			String[] answer=qname.split("_");
1977 			long id=Long.parseLong(answer[0]);
1978 			int trueChrom=Gene.toChromosome(answer[1]);
1979 			byte trueStrand=Byte.parseByte(answer[2]);
1980 			int trueLoc=Integer.parseInt(answer[3]);
1981 			int trueStop=Integer.parseInt(answer[4]);
1982 //			for(int i=0; i<quals.length; i++){quals[i]-=33;}
1983 //			Read r=new Read(seq.getBytes(), trueChrom, trueStrand, trueLoc, trueStop, qname, quals, false, id);
1984 			Read r=new Read(seq, qual, qname, id, trueStrand, trueChrom, trueLoc, trueStop);
1985 			return r;
1986 		} catch (NumberFormatException e) {
1987 			// TODO Auto-generated catch block
1988 			e.printStackTrace();
1989 			return null;
1990 		}
1991 	}
1992 
parseNumericId()1993 	public long parseNumericId(){
1994 //		return Long.parseLong(qname.substring(0, qname.indexOf('_')));
1995 		return Long.parseLong(qname.split("_")[1]);
1996 	}
1997 
toRead(boolean parseCustom)1998 	public Read toRead(boolean parseCustom){
1999 		return toRead(parseCustom, false);
2000 	}
2001 
toRead(boolean parseCustom, boolean includeHardClip)2002 	public Read toRead(boolean parseCustom, boolean includeHardClip){
2003 
2004 		SiteScore originalSite=null;
2005 		long numericId_=0;
2006 		boolean synthetic=false;
2007 
2008 		if(parseCustom){
2009 
2010 
2011 			Header h=new Header(qname, pairnum());
2012 
2013 			numericId_=h.id;
2014 			int trueChrom=h.bbchrom;
2015 			byte trueStrand=(byte)h.strand;
2016 			int trueLoc=h.bbstart;
2017 			int trueStop=h.bbstop();
2018 
2019 			originalSite=new SiteScore(trueChrom, trueStrand, trueLoc, trueStop, 0, 0);
2020 			synthetic=true;
2021 
2022 //			try {
2023 //				String[] answer=qname.split("_");
2024 //				numericId_=Long.parseLong(answer[0]);
2025 //				int trueChrom=Gene.toChromosome(answer[1]);
2026 //				byte trueStrand=Byte.parseByte(answer[2]);
2027 //				int trueLoc=Integer.parseInt(answer[3]);
2028 //				int trueStop=Integer.parseInt(answer[4]);
2029 //
2030 //				originalSite=new SiteScore(trueChrom, trueStrand, trueLoc, trueStop, 0, 0);
2031 //				synthetic=true;
2032 //
2033 //			} catch (NumberFormatException e) {
2034 //				System.err.println("Failed to parse "+qname);
2035 //			} catch (NullPointerException e) {
2036 //				System.err.println("Bad read with no name.");
2037 //				return null;
2038 //			}
2039 		}
2040 //		assert(false) : originalSite;
2041 
2042 
2043 		if(Data.GENOME_BUILD>=0){
2044 
2045 		}
2046 
2047 		int chrom_=-1;
2048 		byte strand_=strand();
2049 		int start_=start(true, includeHardClip);
2050 		int stop_=stop(start_, true, includeHardClip);
2051 		assert(start_<=stop_) : start_+", "+stop_+"\n"+this+"\n";
2052 
2053 		if(Data.GENOME_BUILD>=0){
2054 			ScafLoc sc=null;
2055 			if(RNAME_AS_BYTES){
2056 				if(rname!=null && (rname.length!=1 || rname[0]!='*')){
2057 					sc=Data.getScafLoc(rname);
2058 					assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rname)+"\n"+this;
2059 				}
2060 			}else{
2061 				if(rnameS!=null && (rnameS.length()!=1 || rnameS.charAt(0)!='*')){
2062 					sc=Data.getScafLoc(rnameS);
2063 					assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rnameS)+"\n"+this;
2064 				}
2065 			}
2066 			if(sc!=null){
2067 				chrom_=sc.chrom;
2068 				start_+=sc.loc;
2069 				stop_+=sc.loc;
2070 			}
2071 		}
2072 
2073 ////		byte[] quals=(qual==null || (qual.length()==1 && qual.charAt(0)=='*')) ? null : qual.getBytes();
2074 ////		byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual.clone();
2075 //		byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual;
2076 //		byte[] bases=seq==null ? null : seq.clone();
2077 //		if(strand_==Gene.MINUS){//Minus-mapped SAM lines have bases and quals reversed
2078 //			AminoAcid.reverseComplementBasesInPlace(bases);
2079 //			Tools.reverseInPlace(quals);
2080 //		}
2081 //		Read r=new Read(bases, chrom_, strand_, start_, stop_, qname, quals, cs_, numericId_);
2082 
2083 		final Read r;
2084 		{
2085 			byte[] seqX=(seq==null || (seq.length==1 && seq[0]=='*')) ? null : seq;
2086 			byte[] qualX=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual;
2087 			String qnameX=(qname==null || qname.equals(stringstar)) ? null : qname;
2088 			r=new Read(seqX, qualX, qnameX, numericId_, strand_, chrom_, start_, stop_);
2089 		}
2090 
2091 		r.setMapped(mapped());
2092 		r.setSynthetic(synthetic);
2093 //		r.setPairnum(pairnum()); //TODO:  Enable after fixing assertions that this will break in read input streams.
2094 		if(originalSite!=null){
2095 			r.originalSite=originalSite;
2096 		}
2097 
2098 		r.mapScore=mapq;
2099 		r.setSecondary(!primary());
2100 
2101 //		if(mapped()){
2102 //			r.list=new ArrayList<SiteScore>(1);
2103 //			r.list.add(new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0));
2104 //		}
2105 
2106 //		System.out.println(optional);
2107 		if(optional!=null){
2108 			for(String s : optional){
2109 				if(s.equals("XT:A:R")){
2110 					r.setAmbiguous(true);
2111 				}else if(s.startsWith("X1:Z:")){
2112 //					System.err.println("Found X1 tag!\t"+s);
2113 					String[] split=s.split("\\$");
2114 //					assert(false) : Arrays.toString(split);
2115 					ArrayList<SiteScore> list=new ArrayList<SiteScore>(3);
2116 
2117 					for(int i=1; i<split.length; i++){
2118 //						System.err.println("Processing ss\t"+split[i]);
2119 						String s2=split[i];
2120 						SiteScore ss=SiteScore.fromText(s2);
2121 						if(s2.charAt(0)=='*'){
2122 							r.originalSite=ss;
2123 						}else{
2124 							list.add(ss);
2125 						}
2126 					}
2127 //					System.err.println("List size = "+list.size());
2128 					if(list.size()>0){r.sites=list;}
2129 				}else if(s.startsWith("X2:Z:")){
2130 					String s2=s.substring(5);
2131 					r.match=s2.getBytes();
2132 				}else if(s.startsWith("X3:i:")){
2133 					String s2=s.substring(5);
2134 //					r.mapScore=Integer.parseInt(s2); //Messes up generation of ROC curve
2135 				}else if(s.startsWith("X5:Z:")){
2136 					String s2=s.substring(5);
2137 					r.numericID=Long.parseLong(s2);
2138 				}else if(s.startsWith("X6:i:")){
2139 					String s2=s.substring(5);
2140 					r.flags=Integer.parseInt(s2);
2141 				}else if(s.startsWith("X7:i:")){
2142 					String s2=s.substring(5);
2143 					r.copies=Integer.parseInt(s2);
2144 				}else{
2145 //					System.err.println("Unknown SAM field:"+s);
2146 				}
2147 			}
2148 		}
2149 //		assert(false) : CONVERT_CIGAR_TO_MATCH;
2150 		if(r.match==null && cigar!=null && (CONVERT_CIGAR_TO_MATCH || cigar.indexOf('=')>=0)){
2151 //			r.match=cigarToShortMatch(cigar, true);
2152 			r.match=toShortMatch(false);
2153 
2154 			if(r.match!=null){
2155 				r.setShortMatch(true);
2156 				if(Tools.indexOf(r.match, (byte)'B')>=0){
2157 					boolean success=r.fixMatchB();
2158 //					if(!success){r.match=null;}
2159 //					assert(false) : new String(r.match);
2160 				}
2161 //				assert(false) : new String(r.match);
2162 			}
2163 //			assert(false) : new String(r.match);
2164 //			System.err.println(">\n"+cigar+"\n"+(r.match==null ? "null" : new String(r.match)));
2165 		}
2166 //		assert(false) : new String(r.match);
2167 
2168 //		System.err.println("Resulting read: "+r.toText());
2169 
2170 		return r;
2171 
2172 	}
2173 
2174 	/*--------------------------------------------------------------*/
2175 	/*----------------           toString           ----------------*/
2176 	/*--------------------------------------------------------------*/
2177 
2178 	/** Aproximate length of result of SamLine.toText() */
textLength()2179 	public int textLength(){
2180 		int len=11; //11 tabs
2181 		len+=(3+9+3+9);
2182 		len+=(tlen>999 ? 9 : 3);
2183 
2184 		len+=(qname==null ? 1 : qname.length());
2185 		len+=rnameLen();
2186 		len+=(rnext==null ? 1 : rnext.length);
2187 		len+=(cigar==null ? 1 : cigar.length());
2188 		len+=(seq==null ? 1 : seq.length);
2189 		len+=(qual==null ? 1 : qual.length);
2190 
2191 		if(optional!=null){
2192 			len+=optional.size();
2193 			for(String s : optional){len+=s.length();}
2194 		}
2195 		return len;
2196 	}
2197 
toText()2198 	public ByteBuilder toText(){return toBytes((ByteBuilder)null);}
2199 
toBytes(ByteBuilder bb)2200 	public ByteBuilder toBytes(ByteBuilder bb){
2201 
2202 		final int buflen=Tools.max(rnameLen(), (rnext==null ? 1 : rnext.length), (seq==null ? 1 : seq.length), (qual==null ? 1 : qual.length));
2203 
2204 		if(bb==null){bb=new ByteBuilder(textLength()+4);}
2205 		if(qname==null){bb.append('*').append('\t');}else{bb.append(qname).append('\t');}
2206 		bb.append(flag).append('\t');
2207 		if(RNAME_AS_BYTES){
2208 			assert(!(rname==null && rnameS!=null));
2209 			appendTo(bb, rname).append('\t');
2210 		}else{
2211 			assert(!(rname!=null && rnameS==null));
2212 			appendTo(bb, rnameS).append('\t');
2213 		}
2214 		bb.append(pos).append('\t');
2215 		bb.append(mapq).append('\t');
2216 		if(cigar==null){bb.append('*').append('\t');}else{bb.append(cigar).append('\t');}
2217 		appendTo(bb, rnext).append('\t');
2218 		bb.append(pnext).append('\t');
2219 		bb.append(tlen).append('\t');
2220 
2221 		if(mapped() && strand()==Shared.MINUS){
2222 			appendReverseComplemented(bb, seq).append('\t');
2223 			appendQualReversed(bb, qual);
2224 		}else{
2225 			appendTo(bb, seq).append('\t');
2226 			appendQual(bb, qual);
2227 		}
2228 
2229 //		assert(seq.getClass()==String.class);
2230 //		assert(qual.getClass()==String.class);
2231 //		sb.append(seq).append('\t');
2232 //		sb.append(qual);
2233 
2234 		if(optional!=null){
2235 			for(String s : optional){
2236 				bb.tab().append(s);
2237 			}
2238 		}
2239 		return bb;
2240 	}
2241 
2242 	@Override
toString()2243 	public String toString(){return toBytes(null).toString();}
2244 
appendTo(ByteBuilder sb, byte[] a)2245 	private static ByteBuilder appendTo(ByteBuilder sb, byte[] a){
2246 		if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
2247 		return sb.append(a);
2248 	}
2249 
appendTo(ByteBuilder sb, String a)2250 	private static ByteBuilder appendTo(ByteBuilder sb, String a){
2251 		if(a==null || a==stringstar || (a.length()==1 && a.charAt(0)=='*')){return sb.append('*');}
2252 		return sb.append(a);
2253 	}
2254 
appendReverseComplemented(ByteBuilder sb, byte[] a)2255 	private static ByteBuilder appendReverseComplemented(ByteBuilder sb, byte[] a){
2256 		if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
2257 
2258 		sb.ensureExtra(a.length);
2259 		byte[] buffer=sb.array;
2260 		int i=sb.length;
2261 		for(int j=a.length-1; j>=0; i++, j--){buffer[i]=AminoAcid.baseToComplementExtended[a[j]];}
2262 		sb.length+=a.length;
2263 
2264 		return sb;
2265 	}
2266 
appendQual(ByteBuilder sb, byte[] a)2267 	private static ByteBuilder appendQual(ByteBuilder sb, byte[] a){
2268 		if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
2269 
2270 		sb.ensureExtra(a.length);
2271 		byte[] buffer=sb.array;
2272 		int i=sb.length;
2273 		for(int j=0; j<a.length; i++, j++){buffer[i]=(byte)(a[j]+33);}
2274 		sb.length+=a.length;
2275 
2276 		return sb;
2277 	}
2278 
appendQualReversed(ByteBuilder sb, byte[] a)2279 	private static ByteBuilder appendQualReversed(ByteBuilder sb, byte[] a){
2280 		if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');}
2281 
2282 		sb.ensureExtra(a.length);
2283 		byte[] buffer=sb.array;
2284 		int i=sb.length;
2285 		for(int j=a.length-1; j>=0; i++, j--){buffer[i]=(byte)(a[j]+33);}
2286 		sb.length+=a.length;
2287 
2288 		return sb;
2289 	}
2290 
2291 	/** Assumes a custom name including original location */
originalContig()2292 	public byte[] originalContig(){
2293 //		assert(PARSE_CUSTOM);
2294 		int loc=-1;
2295 		int count=0;
2296 		for(int i=0; i<qname.length() && loc==-1; i++){
2297 			if(qname.charAt(i)=='_'){
2298 				count++;
2299 				if(count==6){loc=i;}
2300 			}
2301 		}
2302 		if(loc==-1){
2303 			return null;
2304 		}
2305 		return qname.substring(loc+1).getBytes();
2306 	}
2307 
2308 
2309 	/*--------------------------------------------------------------*/
2310 	/*----------------             Flag             ----------------*/
2311 	/*--------------------------------------------------------------*/
2312 
2313 //	Bit Description
2314 //	0x1 template having multiple fragments in sequencing
2315 //	0x2 each fragment properly aligned according to the aligner
2316 //	0x4 fragment unmapped
2317 //	0x8 next fragment in the template unmapped
2318 //	0x10 SEQ being reverse complemented
2319 //	0x20 SEQ of the next fragment in the template being reversed
2320 //	0x40 the first fragment in the template
2321 //	0x80 the last fragment in the template
2322 //	0x100 secondary alignment
2323 //	0x200 not passing quality controls
2324 //	0x400 PCR or optical duplicate
2325 //	0x800 supplementary alignment
2326 
2327 
makeFlag(Read r, Read r2, int fragNum, boolean sameScaf)2328 	public static int makeFlag(Read r, Read r2, int fragNum, boolean sameScaf){
2329 		int flag=0;
2330 		if(r2!=null){
2331 			flag|=0x1;
2332 
2333 			if(r.mapped() && r.valid() && r.match!=null &&
2334 					(r2==null || (sameScaf && r.paired() && r2.mapped() && r2.valid() && r2.match!=null))){flag|=0x2;}
2335 			if(fragNum==0){flag|=0x40;}
2336 			if(fragNum>0){flag|=0x80;}
2337 		}
2338 		if(!r.mapped()){flag|=0x4;}
2339 		if(r2!=null && !r2.mapped()){flag|=0x8;}
2340 		if(r.strand()==Shared.MINUS){flag|=0x10;}
2341 		if(r2!=null && r2.strand()==Shared.MINUS){flag|=0x20;}
2342 		if(r.secondary()){flag|=0x100;}
2343 		if(r.discarded()){flag|=0x200;}
2344 		return flag;
2345 	}
2346 
2347 
hasMate()2348 	public boolean hasMate(){
2349 		return (flag&0x1)==0x1;
2350 	}
2351 
properPair()2352 	public boolean properPair(){
2353 		return (flag&0x2)==0x2;
2354 	}
2355 
mapped(int flag)2356 	public static boolean mapped(int flag){
2357 		return (flag&0x4)!=0x4;
2358 	}
2359 
strand(int flag)2360 	public static byte strand(int flag){
2361 		return ((flag&0x10)==0x10 ? (byte)1 : (byte)0);
2362 	}
2363 
mapped()2364 	public boolean mapped(){
2365 		return (flag&0x4)!=0x4;
2366 //		0x4 fragment unmapped
2367 //		0x8 next fragment in the template unmapped
2368 	}
2369 
nextMapped()2370 	public boolean nextMapped(){
2371 		return (flag&0x8)!=0x8;
2372 //		0x4 fragment unmapped
2373 //		0x8 next fragment in the template unmapped
2374 	}
2375 
strand()2376 	public byte strand(){
2377 		return ((flag&0x10)==0x10 ? (byte)1 : (byte)0);
2378 	}
2379 
mateStrand()2380 	public byte mateStrand(){return nextStrand();}
nextStrand()2381 	public byte nextStrand(){
2382 		return ((flag&0x20)==0x20 ? (byte)1 : (byte)0);
2383 	}
2384 
firstFragment()2385 	public boolean firstFragment(){
2386 		return (flag&0x40)==0x40;
2387 	}
2388 
lastFragment()2389 	public boolean lastFragment(){
2390 		return (flag&0x80)==0x80;
2391 	}
2392 
pairnum()2393 	public int pairnum(){
2394 		return firstFragment() ? 0 : lastFragment() ? 1 : 0;
2395 	}
2396 
primary()2397 	public boolean primary(){return (flag&0x100)==0;}
setPrimary(boolean b)2398 	public void setPrimary(boolean b){
2399 		if(b){
2400 			flag=flag|0x100;
2401 		}else{
2402 			flag=flag&~0x100;
2403 		}
2404 	}
2405 
discarded()2406 	public boolean discarded(){
2407 		return (flag&0x200)==0x200;
2408 	}
2409 
duplicate()2410 	public boolean duplicate(){
2411 		return (flag&0x400)==0x400;
2412 	}
2413 
supplementary()2414 	public boolean supplementary(){
2415 		return (flag&0x800)==0x800;
2416 	}
2417 
leftmost()2418 	public boolean leftmost(){
2419 		if(!pairedOnSameChrom() || tlen==0){return true;}
2420 		return tlen>0;
2421 	}
2422 
2423 	/*--------------------------------------------------------------*/
2424 	/*----------------             ?             ----------------*/
2425 	/*--------------------------------------------------------------*/
2426 
2427 //	/** Assumes rname is an integer. */
2428 //	public int chrom(){
2429 //		if(Data.GENOME_BUILD<0){return -1;}
2430 //		HashMap sc
2431 //	}
2432 
2433 	/** Assumes rname is an integer. */
chrom_old()2434 	public int chrom_old(){
2435 		assert(false);
2436 		if(!Tools.isDigit(rname[0]) && !Tools.isDigit(rname[rname.length-1])){
2437 			if(warning){
2438 				warning=false;
2439 				System.err.println("Warning - sam lines need a chrom field.");
2440 			}
2441 			return -1;
2442 		}
2443 		assert(Shared.anomaly || '*'==rname[0] || (Tools.isDigit(rname[0]) && Tools.isDigit(rname[rname.length-1]))) :
2444 			"This is no longer correct, considering that sam lines are named by scaffold.  They need a chrom field.\n"+new String(rname);
2445 		if(rname==null || Arrays.equals(rname, bytestar) || !(Tools.isDigit(rname[0]) && Tools.isDigit(rname[rname.length-1]))){return -1;}
2446 		//return Gene.toChromosome(new String(rname));
2447 		//return Integer.parseInt(new String(rname)));
2448 		final byte z='0';
2449 		int x=rname[0]-z;
2450 		for(int i=1; i<rname.length; i++){
2451 			x=(x*10)+(rname[i]-z);
2452 		}
2453 		return x;
2454 	}
2455 
2456 	/** Returns the zero-based starting location of this read on the sequence. */
start(boolean includeSoftClip, boolean includeHardClip)2457 	public int start(boolean includeSoftClip, boolean includeHardClip){
2458 		int x=countLeadingClip(cigar, includeSoftClip, includeHardClip);
2459 		return pos-1-x;
2460 	}
2461 
2462 	/** Returns the zero-based stop location of this read on the sequence. */
stop(int start, boolean includeSoftClip, boolean includeHardClip)2463 	public int stop(int start, boolean includeSoftClip, boolean includeHardClip){
2464 		if(!mapped() || cigar==null || cigar.charAt(0)=='*'){
2465 //			return -1;
2466 			return start+(seq==null ? 0 : Tools.max(0, seq.length-1));
2467 		}
2468 		int r=start+calcCigarLength(cigar, includeSoftClip, includeHardClip)-1;
2469 
2470 //		assert(false) : start+", "+r+", "+calcCigarLength(cigar, includeHardClip);
2471 //		System.err.println("start= "+start+", stop="+r);
2472 		return r;
2473 	}
2474 
stop2(final int start, final boolean includeSoftClip, final boolean includeHardClip)2475 	public int stop2(final int start, final boolean includeSoftClip, final boolean includeHardClip){
2476 		if(mapped() && cigar!=null && cigar.charAt(0)!='*'){return stop(start, includeSoftClip, includeHardClip);}
2477 //		return (seq==null ? -1 : start()+seq.length());
2478 		return (seq==null ? -1 : start+seq.length);
2479 	}
2480 
numericId()2481 	public long numericId(){
2482 		return 0;
2483 	}
2484 
2485 	/** This includes half-mapped pairs. */
pairedOnSameChrom()2486 	public boolean pairedOnSameChrom(){
2487 //		assert(false) : (rname==null ? "nullX" : new String(rname))+", "+
2488 //		(rnext==null ? "nullX" : new String(rnext))+", "+Tools.equals(rnext, byteequals)+", "+Arrays.equals(rname, rnext)+"\n"+this;
2489 		if(RNAME_AS_BYTES){
2490 			return Tools.equals(rnext, byteequals) || Arrays.equals(rname, rnext) || (/*pairnum()==1 &&*/ Tools.equals(rname, byteequals));
2491 		}else{
2492 			return Tools.equals(rnext, byteequals) || Tools.equals(rnameS, rnext) || (/*pairnum()==1 &&*/ stringequals.equals(rnameS));
2493 		}
2494 	}
2495 
2496 	/** Assumes a custom name including original location */
originalContigStart()2497 	public int originalContigStart(){
2498 //		assert(PARSE_CUSTOM);
2499 		int loc=-1;
2500 		int count=0;
2501 		for(int i=0; i<qname.length() && loc==-1; i++){
2502 			if(qname.charAt(i)=='_'){
2503 				count++;
2504 				if(count==5){loc=i;}
2505 			}
2506 		}
2507 		if(loc==-1){
2508 			return -1;
2509 		}
2510 
2511 		int sum=0;
2512 		int mult=1;
2513 		for(int i=loc+1; i<qname.length(); i++){
2514 			char c=qname.charAt(i);
2515 			if(!Tools.isDigit(c)){
2516 				if(i==loc+1 && c=='-'){mult=-1;}
2517 				else{break;}
2518 			}else{
2519 				sum=(sum*10)+(c-'0');
2520 			}
2521 		}
2522 		return sum*mult;
2523 	}
2524 
2525 	/*--------------------------------------------------------------*/
2526 	/*----------------           Getters            ----------------*/
2527 	/*--------------------------------------------------------------*/
2528 
rnameLen()2529 	public int rnameLen(){
2530 		return (rname==null ? rnameS==null ? 1 : rnameS.length() : rname.length);
2531 	}
2532 
rname()2533 	public byte[] rname(){
2534 		assert(RNAME_AS_BYTES);
2535 		return rname;
2536 	}
rnext()2537 	public byte[] rnext(){return rnext;}
2538 
setRname(byte[] x)2539 	public void setRname(byte[] x){assert(RNAME_AS_BYTES);rname=x;}
setRnext(byte[] x)2540 	public void setRnext(byte[] x){rnext=x;}
2541 
setRname(String x)2542 	public void setRname(String x){assert(!RNAME_AS_BYTES);rnameS=x;}
setRnext(String x)2543 	public void setRnext(String x){rnext=(x==null ? null : x.getBytes());}
2544 
rnameS()2545 	public String rnameS(){return rnameS!=null ? rnameS : rname==null ? null : new String(rname);}
rnextS()2546 	public String rnextS(){return rnext==null ? null : new String(rnext);}
2547 
2548 	/*--------------------------------------------------------------*/
2549 	/*----------------           Fields             ----------------*/
2550 	/*--------------------------------------------------------------*/
2551 
mdTag()2552 	private String mdTag(){
2553 		if(optional==null){return null;}
2554 		for(String s : optional){
2555 			if(s.startsWith("MD:Z:")){return s;}
2556 		}
2557 		return null;
2558 	}
2559 
setScafnum(ScafMap scafMap)2560 	public int setScafnum(ScafMap scafMap) {
2561 		assert(scafnum<0);
2562 
2563 		String name=null;
2564 		if(mapped() || (rname!=null && rname!=byteequals && rname!=bytestar)){
2565 			name=rnameS();
2566 		}else if(nextMapped() && rnext!=null && rnext!=byteequals && rnext!=bytestar){
2567 			name=new String(rnext);
2568 		}
2569 		if(name!=null){scafnum=scafMap.getNumber(name);}
2570 		return scafnum;
2571 	}
2572 
2573 	public long countBytes(){
2574 		long sum=76;
2575 		sum+=(cigar==null ? 0 : cigar.length()*2+16);
2576 		sum+=(optional==null ? 0 : optional.size()*32+16);
2577 		sum+=(rname==null ? 0 : rname.length+16);
2578 		sum+=(rnext==null ? 0 : rnext.length+16);
2579 		return sum;
2580 	}
2581 
2582 	public String qname;
2583 	public int flag;
2584 	public int pos;
2585 	public int mapq;
2586 	public String cigar;
2587 	public int pnext;
2588 	public int tlen;
2589 	public byte[] seq;
2590 	public byte[] qual;
2591 	public ArrayList<String> optional;
2592 
2593 	public Object obj;
2594 	public int scafnum=-1;
2595 
2596 	/*--------------------------------------------------------------*/
2597 	/*----------------        Private Fields        ----------------*/
2598 	/*--------------------------------------------------------------*/
2599 
2600 	private byte[] rname;
2601 	private byte[] rnext;
2602 
2603 	private String rnameS;
2604 
2605 	/*--------------------------------------------------------------*/
2606 	/*----------------         Static Fields        ----------------*/
2607 	/*--------------------------------------------------------------*/
2608 
2609 	private static final String stringstar="*";
2610 	private static final String stringequals="=";
2611 	private static final byte[] bytestar=new byte[] {(byte)'*'};
2612 	private static final byte[] byteequals=new byte[] {(byte)'='};
2613 	private static final String XSPLUS="XS:A:+", XSMINUS="XS:A:-";
2614 //	private static final double inv100=0.01d;
2615 //	private static float minratio=0.4f;
2616 
2617 	private static boolean warning=System.getProperty("user.dir").contains("/bushnell/");
2618 
2619 	/*--------------------------------------------------------------*/
2620 	/*----------------     Public Static Fields     ----------------*/
2621 	/*--------------------------------------------------------------*/
2622 
2623 	public static boolean makeReadgroupTags(){
2624 		return READGROUP_ID!=null || READGROUP_CN!=null || READGROUP_DS!=null || READGROUP_DT!=null ||
2625 				READGROUP_FO!=null || READGROUP_KS!=null || READGROUP_LB!=null || READGROUP_PG!=null ||
2626 				READGROUP_PI!=null || READGROUP_PL!=null || READGROUP_PU!=null || READGROUP_SM!=null ||
2627 				READGROUP_TAG!=null;
2628 	}
2629 
2630 	public static boolean makeOtherTags(){
2631 		if(NO_TAGS){return false;}
2632 		return MAKE_AM_TAG || MAKE_NM_TAG || MAKE_SM_TAG || MAKE_XM_TAG || MAKE_XS_TAG || MAKE_AS_TAG ||
2633 				MAKE_NH_TAG || MAKE_TOPHAT_TAGS || MAKE_IDENTITY_TAG || MAKE_SCORE_TAG || MAKE_STOP_TAG || MAKE_LENGTH_TAG ||
2634 				MAKE_CUSTOM_TAGS || MAKE_INSERT_TAG || MAKE_CORRECTNESS_TAG || MAKE_TIME_TAG || MAKE_BOUNDS_TAG;
2635 	}
2636 
2637 	public static boolean makeAnyTags(){
2638 		return makeReadgroupTags() || makeOtherTags();
2639 	}
2640 
2641 	public static String READGROUP_ID=null;
2642 	public static String READGROUP_CN=null;
2643 	public static String READGROUP_DS=null;
2644 	public static String READGROUP_DT=null;
2645 	public static String READGROUP_FO=null;
2646 	public static String READGROUP_KS=null;
2647 	public static String READGROUP_LB=null;
2648 	public static String READGROUP_PG=null;
2649 	public static String READGROUP_PI=null;
2650 	public static String READGROUP_PL=null;
2651 	public static String READGROUP_PU=null;
2652 	public static String READGROUP_SM=null;
2653 
2654 	public static String READGROUP_TAG=null;
2655 
2656 	/** Turn this off for RNAseq or long indels */
2657 	public static boolean MAKE_MD_TAG=false;
2658 
2659 	public static boolean NO_TAGS=false;
2660 
2661 	public static boolean MAKE_AM_TAG=true;
2662 	public static boolean MAKE_NM_TAG=true;
2663 	public static boolean MAKE_SM_TAG=false;
2664 	public static boolean MAKE_XM_TAG=false;
2665 	public static boolean MAKE_XS_TAG=false;
2666 	public static boolean MAKE_AS_TAG=false; //TODO: Alignment score from aligner
2667 	public static boolean MAKE_NH_TAG=false;
2668 	public static boolean MAKE_TOPHAT_TAGS=false;
2669 	public static boolean XS_SECONDSTRAND=false;
2670 	public static boolean MAKE_IDENTITY_TAG=false;
2671 	public static boolean MAKE_SCORE_TAG=false;
2672 	public static boolean MAKE_STOP_TAG=false;
2673 	public static boolean MAKE_LENGTH_TAG=false;
2674 	public static boolean MAKE_CUSTOM_TAGS=false;
2675 	public static boolean MAKE_INSERT_TAG=false;
2676 	public static boolean MAKE_CORRECTNESS_TAG=false;
2677 	public static boolean MAKE_TIME_TAG=false;
2678 	public static boolean MAKE_BOUNDS_TAG=false;
2679 
2680 	public static boolean PENALIZE_AMBIG=true;
2681 	public static boolean CONVERT_CIGAR_TO_MATCH=true;
2682 	public static boolean SOFT_CLIP=true;
2683 	public static boolean SECONDARY_ALIGNMENT_ASTERISKS=true;
2684 	/** OK to use the "setFrom" function which uses the old SamLine instead of translating the read, if a genome is not loaded. */
2685 	public static boolean SET_FROM_OK=false;
2686 	/** For paired reads, keep original names rather than changing read2's name to match read1 */
2687 	public static boolean KEEP_NAMES=false;
2688 	public static float VERSION=1.4f;
2689 	/** Tells program when to use 'N' rather than 'D' in cigar strings */
2690 	public static int INTRON_LIMIT=Integer.MAX_VALUE;
2691 	public static boolean RNAME_AS_BYTES=true;//Effect on speed is negligible for pileup...
2692 
2693 	/** Prefer MD tag over reference for translating cigar strings to match */
2694 	public static boolean PREFER_MDTAG=false;
2695 	/** Determine whether cigar X means match N or S.
2696 	 * This makes sam loading substantially slower. */
2697 	public static boolean FIX_MATCH_NS=false;
2698 
2699 	public static boolean setxs=false;
2700 	public static boolean setintron=false;
2701 
2702 //	/** SSAHA2 incorrectly calculates the start position of reads with soft-clipped starts, and needs this enabled. */
2703 //	public static boolean SUBTRACT_LEADING_SOFT_CLIP=true;
2704 	/** Sort header scaffolds in alphabetical order to be more compatible with Tophat */
2705 	public static boolean SORT_SCAFFOLDS=false;
2706 
2707 	/** qname */
2708 	public static boolean PARSE_0=true;
2709 	/** rname */
2710 	public static boolean PARSE_2=true;
2711 	/** cigar */
2712 	public static boolean PARSE_5=true;
2713 	/** rnext */
2714 	public static boolean PARSE_6=true;
2715 	/** pnext */
2716 	public static boolean PARSE_7=true;
2717 	/** tlen */
2718 	public static boolean PARSE_8=true;
2719 	/** qual */
2720 	public static boolean PARSE_10=true;
2721 	public static boolean PARSE_OPTIONAL=true;
2722 	public static boolean PARSE_OPTIONAL_MD_ONLY=false;
2723 
2724 	public static boolean FLIP_ON_LOAD=true;
2725 
2726 	public static boolean verbose=false;
2727 
2728 }
2729