1 package stream;
2 
3 import java.io.Serializable;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.HashSet;
7 
8 import align2.GapTools;
9 import align2.QualityTools;
10 import dna.AminoAcid;
11 import dna.ChromosomeArray;
12 import dna.Data;
13 import shared.KillSwitch;
14 import shared.Shared;
15 import shared.Tools;
16 import shared.TrimRead;
17 import structures.ByteBuilder;
18 import ukmer.Kmer;
19 
20 public final class Read implements Comparable<Read>, Cloneable, Serializable{
21 
22 	/**
23 	 *
24 	 */
25 	private static final long serialVersionUID = -1026645233407290096L;
26 
main(String[] args)27 	public static void main(String[] args){
28 		byte[] a=args[0].getBytes();
29 		System.out.println(new String(a));
30 		byte[] b=toShortMatchString(a);
31 		System.out.println(new String(b));
32 		byte[] c=toLongMatchString(b);
33 		System.out.println(new String(c));
34 		byte[] d=toLongMatchString(c);
35 		System.out.println(new String(d));
36 //		byte[] e=toShortMatchString(b);
37 //		System.out.println(new String(e));
38 
39 	}
40 
Read(byte[] bases_, byte[] quals_, long id_)41 	public Read(byte[] bases_, byte[] quals_, long id_){
42 		this(bases_, quals_, Long.toString(id_), id_);
43 	}
44 
Read(byte[] bases_, byte[] quals_, String name_, long id_)45 	public Read(byte[] bases_, byte[] quals_, String name_, long id_){
46 		this(bases_, quals_, name_, id_, 0, -1, -1, -1);
47 	}
48 
Read(byte[] bases_, byte[] quals_, String name_, long id_, int flag_)49 	public Read(byte[] bases_, byte[] quals_, String name_, long id_, int flag_){
50 		this(bases_, quals_, name_, id_, flag_, -1, -1, -1);
51 	}
52 
Read(byte[] s_, byte[] quals_, long id_, int chrom_, int start_, int stop_, byte strand_)53 	public Read(byte[] s_, byte[] quals_, long id_, int chrom_, int start_, int stop_, byte strand_){
54 		this(s_, quals_, Long.toString(id_), id_, (int)strand_, chrom_, start_, stop_);
55 	}
56 
57 //	public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, byte strand_, int chrom_, int start_, int stop_){
58 //		this(bases_, quals_, id_, numericID_, (int)strand_, chrom_, start_, stop_);
59 //		assert(strand_==0 || strand_==1);
60 //		assert(start_<=stop_) : chrom_+", "+start_+", "+stop_+", "+numericID_;
61 //	}
62 
63 	/** Note that strand can be used as flag */
Read(byte[] bases_, byte[] quals_, String id_, long numericID_, int flags_, int chrom_, int start_, int stop_)64 	public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, int flags_, int chrom_, int start_, int stop_){
65 		flags=flags_&~VALIDATEDMASK;
66 		bases=bases_;
67 		quality=quals_;
68 
69 		chrom=chrom_;
70 		start=start_;
71 		stop=stop_;
72 
73 		id=id_;
74 		numericID=numericID_;
75 
76 //		assert(amino()) : Shared.AMINO_IN;//123
77 		if(VALIDATE_IN_CONSTRUCTOR){validate(true);}
78 	}
79 
80 	/*--------------------------------------------------------------*/
81 	/*----------------          Validation          ----------------*/
82 	/*--------------------------------------------------------------*/
83 
validate(final boolean processAssertions)84 	public boolean validate(final boolean processAssertions){
85 		assert(!validated());
86 
87 //		if(false){//This causes problems with error-corrected PacBio reads.
88 //			boolean x=(quality==null || quality.length<1 || quality[0]<=80 || !FASTQ.DETECT_QUALITY || FASTQ.IGNORE_BAD_QUALITY);
89 //			if(!x){
90 //				if(processAssertions){
91 //					KillSwitch.kill("Quality value ("+quality[0]+") appears too high.\n"+Arrays.toString(quality)+
92 //							"\n"+Arrays.toString(bases)+"\n"+numericID+"\n"+id+"\n"+FASTQ.ASCII_OFFSET);
93 //				}
94 //				return false;
95 //			}
96 //		}
97 
98 		if(bases==null){
99 			quality=null; //I could require this to be true
100 			if(FIX_HEADER){fixHeader(processAssertions);}
101 			setValidated(true);
102 			return true;
103 		}
104 
105 		validateQualityLength(processAssertions);
106 
107 		final boolean passesJunk;
108 
109 //		assert(false) : SKIP_SLOW_VALIDATION+","+VALIDATE_BRANCHLESS+","+JUNK_MODE;
110 		if(SKIP_SLOW_VALIDATION){
111 			passesJunk=true;
112 		}else if(!aminoacid()){
113 			if(U_TO_T){uToT();}
114 			if(VALIDATE_BRANCHLESS){
115 				passesJunk=validateCommonCase_branchless(processAssertions);
116 			}else{
117 				passesJunk=validateCommonCase(processAssertions);
118 			}
119 		}else{
120 //			if(U_TO_T){uToT();}  This is amino, so...
121 			fixCase();
122 			passesJunk=validateJunk(processAssertions);
123 			if(CHANGE_QUALITY){fixQuality();}
124 		}
125 
126 		if(FIX_HEADER){fixHeader(processAssertions);}
127 
128 		setValidated(true);
129 
130 		return true;
131 	}
132 
133 
134 
checkQuality()135 	public boolean checkQuality(){
136 		if(quality==null){return true;}
137 		for(int i=0; quality!=null && i<bases.length; i++){
138 			if((bases[i]=='N')!=(quality[i]<1)){
139 				assert(false) : (char)bases[i]+", "+quality[i]+", "+i+", "+CHANGE_QUALITY+", "+MIN_CALLED_QUALITY+"\n"+toFastq();
140 				return false;
141 			}
142 		}
143 		return true;
144 	}
145 
uToT()146 	private void uToT(){
147 		if(bases==null){return;}
148 		for(int i=0; i<bases.length; i++){
149 			bases[i]=AminoAcid.uToT[bases[i]];
150 		}
151 	}
152 
tToU()153 	private void tToU(){
154 		if(bases==null){return;}
155 		for(int i=0; i<bases.length; i++){
156 			bases[i]=AminoAcid.tToU[bases[i]];
157 		}
158 	}
159 
validateJunk(boolean processAssertions)160 	private boolean validateJunk(boolean processAssertions){
161 		assert(bases!=null);
162 		if(JUNK_MODE==IGNORE_JUNK){return true;}
163 		final byte nocall;
164 		final byte[] toNum;
165 		final boolean aa=aminoacid();
166 		if(aa){
167 			nocall='X';
168 			toNum=AminoAcid.acidToNumberExtended;
169 		}else{
170 			nocall='N';
171 			toNum=AminoAcid.baseToNumberExtended;
172 		}
173 		for(int i=0; i<bases.length; i++){
174 			byte b=bases[i];
175 			int num=toNum[b];
176 //			System.err.println(Character.toString(b)+" -> "+num);
177 			if(num<0){
178 				if(JUNK_MODE==FIX_JUNK){
179 					bases[i]=nocall;
180 				}else if(JUNK_MODE==FLAG_JUNK){
181 					setJunk(true);
182 					return false;
183 				}else{
184 					if(processAssertions){
185 						KillSwitch.kill("\nAn input file appears to be misformatted:\n"
186 							+ "The character with ASCII code "+b+" appeared where "+(aa ? "an amino acid" : "a base")+" was expected"
187 									+ (b>31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n")
188 							+ "Sequence #"+numericID+"\n"
189 							+ "Sequence ID '"+id+"'\n"
190 							+ "Sequence: "+Tools.toStringSafe(bases)+"\n"
191 							+ "Flags: "+Long.toBinaryString(flags)+"\n\n"
192 							+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
193 					}
194 					setJunk(true);
195 					return false;
196 				}
197 			}
198 		}
199 		return true;
200 	}
201 
validateQualityLength(boolean processAssertions)202 	private void validateQualityLength(boolean processAssertions){
203 		if(quality==null || quality.length==bases.length){return;}
204 		if(TOSS_BROKEN_QUALITY){
205 			quality=null;
206 			setDiscarded(true);
207 			setJunk(true);
208 		}else if(NULLIFY_BROKEN_QUALITY){
209 			quality=null;
210 			setJunk(true);
211 		}else if(FLAG_BROKEN_QUALITY){
212 			setJunk(true);
213 		}else{
214 			boolean x=false;
215 			assert(x=processAssertions);
216 			if(x){
217 				KillSwitch.kill("\nMismatch between length of bases and qualities for read "+numericID+" (id="+id+").\n"+
218 						"# qualities="+quality.length+", # bases="+bases.length+"\n\n"+
219 						FASTQ.qualToString(quality)+"\n"+new String(bases)+"\n\n"
220 						+ "This can be bypassed with the flag 'tossbrokenreads' or 'nullifybrokenquality'");
221 			}
222 		}
223 	}
224 
fixQuality()225 	private void fixQuality(){
226 		if(quality==null || !CHANGE_QUALITY){return;}
227 
228 		final byte[] toNumber=aminoacid() ? AminoAcid.acidToNumber : AminoAcid.baseToNumber;
229 
230 		for(int i=0; i<quality.length; i++){
231 			byte b=bases[i];
232 			byte q=quality[i];
233 			quality[i]=capQuality(q, b);
234 //			if(toNumber[b]>=0){
235 //				if(q<MIN_CALLED_QUALITY){
236 //					quality[i]=MIN_CALLED_QUALITY;
237 //				}else if(q>MAX_CALLED_QUALITY){
238 //					quality[i]=MAX_CALLED_QUALITY;
239 //				}
240 //			}else{
241 //				quality[i]=0;
242 //			}
243 		}
244 	}
245 
fixCase()246 	private void fixCase(){
247 		if(bases==null || (!DOT_DASH_X_TO_N && !TO_UPPER_CASE && !LOWER_CASE_TO_N)){return;}
248 		final boolean aa=aminoacid();
249 
250 		final byte[] caseMap, ddxMap;
251 		if(!aa){
252 			caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocall : null;
253 			ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocall : null;
254 		}else{
255 			caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocallAA : null;
256 			ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocallAA : null;
257 		}
258 
259 //		assert(false) : (AminoAcid.toUpperCase==caseMap)+", "+ddxMap;
260 
261 		if(DOT_DASH_X_TO_N){
262 			if(TO_UPPER_CASE || LOWER_CASE_TO_N){
263 				for(int i=0; i<bases.length; i++){
264 					byte b=bases[i];
265 					bases[i]=caseMap[ddxMap[b]];
266 				}
267 			}else{
268 				for(int i=0; i<bases.length; i++){
269 					byte b=bases[i];
270 					bases[i]=ddxMap[b];
271 				}
272 			}
273 		}else{
274 			if(TO_UPPER_CASE || LOWER_CASE_TO_N){
275 				for(int i=0; i<bases.length; i++){
276 					byte b=bases[i];
277 					bases[i]=caseMap[b];
278 				}
279 			}else{
280 				assert(false);
281 			}
282 		}
283 	}
284 
validateCommonCase_branchless(boolean processAssertions)285 	private boolean validateCommonCase_branchless(boolean processAssertions){
286 
287 		assert(!aminoacid());
288 		assert(bases!=null);
289 
290 		if(TO_UPPER_CASE || LOWER_CASE_TO_N){fixCase();}
291 
292 		final byte nocall='N';
293 		final byte[] toNum=AminoAcid.baseToNumber;
294 		final byte[] map=(DOT_DASH_X_TO_N && IUPAC_TO_N ? AminoAcid.baseToACGTN :
295 			DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocall : IUPAC_TO_N ? AminoAcid.iupacToNocall : null);
296 
297 		if(JUNK_MODE==IGNORE_JUNK){
298 			if(quality!=null && CHANGE_QUALITY){
299 				if(DOT_DASH_X_TO_N || IUPAC_TO_N){
300 					for(int i=0; i<bases.length; i++){
301 						final byte b=map[bases[i]];
302 						final byte q=quality[i];
303 						final int num=toNum[b];
304 						bases[i]=b;
305 						quality[i]=(num>=0 ? qMap[q] : 0);
306 					}
307 				}else{
308 					for(int i=0; i<bases.length; i++){
309 						final byte b=bases[i];
310 						final byte q=quality[i];
311 						final int num=toNum[b];
312 						quality[i]=(num>=0 ? qMap[q] : 0);
313 					}
314 				}
315 			}else if(DOT_DASH_X_TO_N || IUPAC_TO_N){
316 				for(int i=0; i<bases.length; i++){
317 					byte b=map[bases[i]];
318 					bases[i]=b;
319 				}
320 			}
321 			return true;
322 		}
323 
324 		int junkOr=0;
325 //		int iupacOr=0;
326 		final byte[] toNumE=AminoAcid.baseToNumberExtended;
327 
328 //		asdf
329 
330 		if(DOT_DASH_X_TO_N || IUPAC_TO_N){
331 			if(quality!=null && CHANGE_QUALITY){
332 				for(int i=0; i<bases.length; i++){
333 					final byte b=map[bases[i]];
334 					final byte q=quality[i];
335 					final int numE=toNumE[b];
336 					final int num=toNum[b];
337 					junkOr|=numE;
338 //					iupacOr|=num;
339 					bases[i]=b;
340 					quality[i]=(num>=0 ? qMap[q] : 0);
341 				}
342 			}else{
343 				for(int i=0; i<bases.length; i++){
344 					final byte b=map[bases[i]];
345 					final int numE=toNumE[b];
346 //					final int num=toNum[b];
347 					junkOr|=numE;
348 //					iupacOr|=num;
349 					bases[i]=b;
350 				}
351 			}
352 		}else{
353 			if(quality!=null && CHANGE_QUALITY){
354 				for(int i=0; i<bases.length; i++){
355 					final byte b=bases[i];
356 					final byte q=quality[i];
357 					final int numE=toNumE[b];
358 					final int num=toNum[b];
359 					junkOr|=numE;
360 //					iupacOr|=num;
361 					quality[i]=(num>=0 ? qMap[q] : 0);
362 				}
363 			}else{
364 				for(int i=0; i<bases.length; i++){
365 					final byte b=bases[i];
366 					final int numE=toNumE[b];
367 //					final int num=toNum[b];
368 					junkOr|=numE;
369 //					iupacOr|=num;
370 				}
371 			}
372 		}
373 
374 //		System.err.println(junkOr+", "+JUNK_MODE);
375 
376 		//Common case
377 		if(junkOr>=0){return true;}
378 //		if(junkOr>=0 && (JUNK_MODE!=FIX_JUNK_AND_IUPAC || iupacOr>=0)){return true;}
379 //
380 //		assert(junkOr<0 || (JUNK_MODE==FIX_JUNK_AND_IUPAC && iupacOr<0));
381 
382 		//TODO: I could disable VALIDATE_BRANCHLESS here, if it's not final
383 		//VALIDATE_BRANCHLESS=false;
384 		if(JUNK_MODE==FIX_JUNK){
385 			for(int i=0; i<bases.length; i++){
386 				byte b=bases[i];
387 				final int numE=toNumE[b];
388 
389 				if(numE<0){
390 					bases[i]=nocall;
391 					if(quality!=null){quality[i]=0;}
392 				}
393 			}
394 			return true;
395 		}else if(JUNK_MODE==FLAG_JUNK){
396 			setJunk(true);
397 			return false;
398 		}else if(JUNK_MODE==FIX_JUNK_AND_IUPAC){
399 			for(int i=0; i<bases.length; i++){
400 				byte b=bases[i];
401 				final byte c=AminoAcid.baseToACGTN[b];
402 				bases[i]=c;
403 			}
404 			return true;
405 		}else{
406 			if(processAssertions){
407 				int i=0;
408 				for(; i<bases.length; i++){
409 					if(toNumE[bases[i]]<0){break;}
410 				}
411 				byte b=bases[i];
412 				KillSwitch.kill("\nAn input file appears to be misformatted:\n"
413 						+ "The character with ASCII code "+b+" appeared where a base was expected"
414 							+ (b>31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n")
415 						+ "Sequence #"+numericID+"\n"
416 						+ "Sequence ID: '"+id+"'\n"
417 						+ "Sequence: '"+Tools.toStringSafe(bases)+"'\n\n"
418 						+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
419 			}
420 			setJunk(true);
421 			return false;
422 		}
423 	}
424 
validateCommonCase(boolean processAssertions)425 	private boolean validateCommonCase(boolean processAssertions){
426 
427 		assert(!aminoacid());
428 		assert(bases!=null);
429 
430 		if(TO_UPPER_CASE || LOWER_CASE_TO_N){fixCase();}
431 
432 		final byte nocall='N';
433 		final byte[] toNum=AminoAcid.baseToNumber;
434 		final byte[] ddxMap=AminoAcid.dotDashXToNocall;
435 
436 		if(JUNK_MODE==IGNORE_JUNK){
437 			if(quality!=null && CHANGE_QUALITY){
438 				if(DOT_DASH_X_TO_N){
439 					for(int i=0; i<bases.length; i++){
440 						final byte b=ddxMap[bases[i]];
441 						final byte q=quality[i];
442 						final int num=toNum[b];
443 						bases[i]=b;
444 						quality[i]=(num>=0 ? qMap[q] : 0);
445 					}
446 				}else{
447 					for(int i=0; i<bases.length; i++){
448 						final byte b=bases[i];
449 						final byte q=quality[i];
450 						final int num=toNum[b];
451 						quality[i]=(num>=0 ? qMap[q] : 0);
452 					}
453 				}
454 			}else if(DOT_DASH_X_TO_N){
455 				for(int i=0; i<bases.length; i++){
456 					byte b=ddxMap[bases[i]];
457 					bases[i]=b;
458 				}
459 			}
460 		}else if(DOT_DASH_X_TO_N){
461 			final byte[] toNumE=AminoAcid.baseToNumberExtended;
462 			if(quality!=null && CHANGE_QUALITY){
463 				for(int i=0; i<bases.length; i++){
464 					byte b=ddxMap[bases[i]];
465 					final byte q=quality[i];
466 					final int numE=toNumE[b];
467 
468 					if(numE<0){
469 						if(JUNK_MODE==FIX_JUNK){
470 							b=nocall;
471 						}else if(JUNK_MODE==FLAG_JUNK){
472 							setJunk(true);
473 							return false;
474 						}else{
475 							if(processAssertions){
476 								KillSwitch.kill("\nAn input file appears to be misformatted:\n"
477 										+ "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n"
478 										+ "Sequence #"+numericID+"\n"
479 										+ "Sequence ID '"+id+"'\n"
480 										+ "Sequence: "+Tools.toStringSafe(bases)+"\n\n"
481 										+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
482 							}
483 							setJunk(true);
484 							return false;
485 						}
486 					}
487 
488 					final int num=toNum[b];
489 					bases[i]=b;
490 					quality[i]=(num>=0 ? qMap[q] : 0);
491 				}
492 			}else{
493 				for(int i=0; i<bases.length; i++){
494 					byte b=ddxMap[bases[i]];
495 					final int numE=toNumE[b];
496 
497 					if(numE<0){
498 						if(JUNK_MODE==FIX_JUNK){
499 							b=nocall;
500 						}else if(JUNK_MODE==FLAG_JUNK){
501 							setJunk(true);
502 							return false;
503 						}else{
504 							if(processAssertions){
505 								KillSwitch.kill("\nAn input file appears to be misformatted:\n"
506 										+ "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n"
507 										+ "Sequence #"+numericID+"\n"
508 										+ "Sequence ID '"+id+"'\n"
509 										+ "Sequence: "+Tools.toStringSafe(bases)+"\n\n"
510 										+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
511 							}
512 							setJunk(true);
513 							return false;
514 						}
515 					}
516 
517 					bases[i]=b;
518 				}
519 			}
520 		}else{
521 			final byte[] toNumE=AminoAcid.baseToNumberExtended;
522 			if(quality!=null && CHANGE_QUALITY){
523 				for(int i=0; i<bases.length; i++){
524 					byte b=bases[i];
525 					final byte q=quality[i];
526 					final int numE=toNumE[b];
527 
528 					if(numE<0){
529 						if(JUNK_MODE==FIX_JUNK){
530 							bases[i]=b=nocall;
531 						}else if(JUNK_MODE==FLAG_JUNK){
532 							setJunk(true);
533 							return false;
534 						}else{
535 							if(processAssertions){
536 								KillSwitch.kill("\nAn input file appears to be misformatted:\n"
537 										+ "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n"
538 										+ "Sequence #"+numericID+"\n"
539 										+ "Sequence ID '"+id+"'\n"
540 										+ "Sequence: "+Tools.toStringSafe(bases)+"\n\n"
541 										+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
542 							}
543 							setJunk(true);
544 							return false;
545 						}
546 					}
547 
548 					final int num=toNum[b];
549 					bases[i]=b;
550 					quality[i]=(num>=0 ? qMap[q] : 0);
551 				}
552 			}else{
553 				for(int i=0; i<bases.length; i++){
554 					byte b=bases[i];
555 					final int numE=toNumE[b];
556 
557 					if(numE<0){
558 						if(JUNK_MODE==FIX_JUNK){
559 							bases[i]=b=nocall;
560 						}else if(JUNK_MODE==FLAG_JUNK){
561 							setJunk(true);
562 							return false;
563 						}else{
564 							if(processAssertions){
565 								KillSwitch.kill("\nAn input file appears to be misformatted:\n"
566 										+ "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n"
567 										+ "Sequence #"+numericID+"\n"
568 										+ "Sequence ID '"+id+"'\n"
569 										+ "Sequence: "+Tools.toStringSafe(bases)+"\n\n"
570 										+ "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'");
571 							}
572 							setJunk(true);
573 							return false;
574 						}
575 					}
576 				}
577 			}
578 		}
579 		return true;
580 	}
581 
fixHeader(boolean processAssertions)582 	private final void fixHeader(boolean processAssertions){
583 		id=Tools.fixHeader(id, ALLOW_NULL_HEADER, processAssertions);
584 	}
585 
586 	/*--------------------------------------------------------------*/
587 	/*----------------           Various            ----------------*/
588 	/*--------------------------------------------------------------*/
589 
590 
absdif(int a, int b)591 	private static final int absdif(int a, int b){
592 		return a>b ? a-b : b-a;
593 	}
594 
595 	/** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/
isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch)596 	public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch){
597 		return isDuplicateByBases(r, nmax, mmax, qmax, banSameQualityMismatch, false);
598 	}
599 
600 
601 
602 	/** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/
isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength)603 	public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength){
604 		int n=0, m=0;
605 		assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful.";
606 		if(!allowDifferentLength && r.length()!=bases.length){return false;}
607 		int minLen=Tools.min(bases.length, r.length());
608 		for(int i=0; i<minLen; i++){
609 			byte b1=bases[i];
610 			byte b2=r.bases[i];
611 			if(b1=='N' || b2=='N'){
612 				n++;
613 				if(n>nmax){return false;}
614 			}else if(b1!=b2){
615 				m++;
616 				if(m>mmax){return false;}
617 				if(quality[i]>qmax && r.quality[i]>qmax){return false;}
618 				if(banSameQualityMismatch && quality[i]==r.quality[i]){return false;}
619 			}
620 		}
621 		return true;
622 	}
623 
isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment)624 	public boolean isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment){
625 		if(bases.length!=r.length()){
626 			return isDuplicateByMappingDifferentLength(r, bothEnds, checkAlignment);
627 		}
628 		assert(this!=r && mate!=r);
629 		assert(!bothEnds || bases.length==r.length());
630 		if(!mapped() || !r.mapped()){return false;}
631 //		if(chrom==-1 && start==-1){return false;}
632 		if(chrom<1 && start<1){return false;}
633 
634 //		if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;}
635 ////		if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match
636 //		if(stop!=r.stop){return false;} //For unpaired reads, require both ends match
637 //		return true;
638 
639 		if(chrom!=r.chrom || strand()!=r.strand()){return false;}
640 		if(bothEnds){
641 			if(start!=r.start || stop!=r.stop){return false;}
642 		}else{
643 			if(strand()==Shared.PLUS){
644 				if(start!=r.start){return false;}
645 			}else{
646 				if(stop!=r.stop){return false;}
647 			}
648 		}
649 		if(checkAlignment){
650 			if(perfect() && r.perfect()){return true;}
651 			if(match!=null && r.match!=null){
652 				if(match.length!=r.match.length){return false;}
653 				for(int i=0; i<match.length; i++){
654 					byte a=match[i];
655 					byte b=r.match[i];
656 					if(a!=b){
657 						if((a=='D') != (b=='D')){return false;}
658 						if((a=='I' || a=='X' || a=='Y') != (b=='I' || b=='X' || b=='Y')){return false;}
659 					}
660 				}
661 			}
662 		}
663 		return true;
664 	}
665 
isDuplicateByMappingDifferentLength(Read r, boolean bothEnds, boolean checkAlignment)666 	public boolean isDuplicateByMappingDifferentLength(Read r, boolean bothEnds, boolean checkAlignment){
667 		assert(this!=r && mate!=r);
668 		assert(bases.length!=r.length());
669 		if(bothEnds){return false;}
670 //		assert(!bothEnds || bases.length==r.length());
671 		if(!mapped() || !r.mapped()){return false;}
672 //		if(chrom==-1 && start==-1){return false;}
673 		if(chrom<1 && start<1){return false;}
674 
675 //		if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;}
676 ////		if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match
677 //		if(stop!=r.stop){return false;} //For unpaired reads, require both ends match
678 //		return true;
679 
680 		if(chrom!=r.chrom || strand()!=r.strand()){return false;}
681 
682 		if(strand()==Shared.PLUS){
683 			if(start!=r.start){return false;}
684 		}else{
685 			if(stop!=r.stop){return false;}
686 		}
687 
688 		if(checkAlignment){
689 			if(perfect() && r.perfect()){return true;}
690 			if(match!=null && r.match!=null){
691 				int minLen=Tools.min(match.length, r.match.length);
692 				for(int i=0; i<minLen; i++){
693 					byte a=match[i];
694 					byte b=r.match[i];
695 					if(a!=b){
696 						if((a=='D') != (b=='D')){return false;}
697 						if((a=='I' || a=='X' || a=='Y') != (b=='I' || b=='X' || b=='Y')){return false;}
698 					}
699 				}
700 			}
701 		}
702 		return true;
703 	}
704 
merge(Read r, boolean mergeVectors, boolean mergeN)705 	public void merge(Read r, boolean mergeVectors, boolean mergeN){mergePrivate(r, mergeVectors, mergeN, true);}
706 
mergePrivate(Read r, boolean mergeVectors, boolean mergeN, boolean mergeMate)707 	private void mergePrivate(Read r, boolean mergeVectors, boolean mergeN, boolean mergeMate){
708 		assert(r!=this);
709 		assert(r!=this.mate);
710 		assert(r!=r.mate);
711 		assert(this!=this.mate);
712 		assert(r.mate==null || r.mate.mate==r);
713 		assert(this.mate==null || this.mate.mate==this);
714 		assert(r.mate==null || r.numericID==r.mate.numericID);
715 		assert(mate==null || numericID==mate.numericID);
716 		mergeN=(mergeN||mergeVectors);
717 
718 		assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful.";
719 
720 		if((mergeN || mergeVectors) && bases.length<r.length()){
721 			int oldLenB=bases.length;
722 			start=Tools.min(start, r.start);
723 			stop=Tools.max(stop, r.stop);
724 			mapScore=Tools.max(mapScore, r.mapScore);
725 
726 			bases=KillSwitch.copyOfRange(bases, 0, r.length());
727 			quality=KillSwitch.copyOfRange(quality, 0, r.quality.length);
728 			for(int i=oldLenB; i<bases.length; i++){
729 				bases[i]='N';
730 				quality[i]=0;
731 			}
732 			match=null;
733 			r.match=null;
734 		}
735 
736 		copies+=r.copies;
737 
738 
739 //		if(numericID==11063941 || r.numericID==11063941 || numericID==8715632){
740 //			System.err.println("***************");
741 //			System.err.println(this.toText()+"\n");
742 //			System.err.println(r.toText()+"\n");
743 //			System.err.println(mergeVectors+", "+mergeN+", "+mergeMate+"\n");
744 //		}
745 
746 		boolean pflag1=perfect();
747 		boolean pflag2=r.perfect();
748 
749 		final int minLenB=Tools.min(bases.length, r.length());
750 
751 		if(mergeN){
752 			if(quality==null){
753 				for(int i=0; i<minLenB; i++){
754 					byte b=r.bases[i];
755 					if(bases[i]=='N' && b!='N'){bases[i]=b;}
756 				}
757 			}else{
758 				for(int i=0; i<minLenB; i++){
759 					final byte b1=bases[i];
760 					final byte b2=r.bases[i];
761 					final byte q1=Tools.max((byte)0, quality[i]);
762 					final byte q2=Tools.max((byte)0, r.quality[i]);
763 					if(b1==b2){
764 						if(b1=='N'){
765 							//do nothing
766 						}else if(mergeVectors){
767 							//merge qualities
768 							//						quality[i]=(byte) Tools.min(40, q1+q2);
769 							if(q1>=q2){
770 								quality[i]=(byte) Tools.min(48, q1+1+q2/4);
771 							}else{
772 								quality[i]=(byte) Tools.min(48, q2+1+q1/4);
773 							}
774 						}
775 					}else if(b1=='N'){
776 						bases[i]=b2;
777 						quality[i]=q2;
778 					}else if(b2=='N'){
779 						//do nothing
780 					}else if(mergeVectors){
781 						if(q1<1 && q2<1){
782 							//Special case - e.g. Illumina calls bases at 0 quality.
783 							//Possibly best to keep the matching allele if one matches the ref.
784 							//But for now, do nothing.
785 							//This was causing problems changing perfect match strings into imperfect matches.
786 						}else if(q1==q2){
787 							assert(b1!=b2);
788 							bases[i]='N';
789 							quality[i]=0;
790 						}else if(q1>q2){
791 							bases[i]=b1;
792 							quality[i]=(byte)(q1-q2/2);
793 						}else{
794 							bases[i]=b2;
795 							quality[i]=(byte)(q2-q1/2);
796 						}
797 						assert(quality[i]>=0 && quality[i]<=48);
798 					}
799 				}
800 			}
801 		}
802 
803 		//TODO:
804 		//Note that the read may need to be realigned after merging, so the match string may be rendered incorrect.
805 
806 		if(mergeN && match!=null){
807 			if(r.match==null){match=null;}
808 			else{
809 				if(match.length!=r.match.length){match=null;}
810 				else{
811 					boolean ok=true;
812 					for(int i=0; i<match.length && ok; i++){
813 						byte a=match[i], b=r.match[i];
814 						if(a!=b){
815 							if((a=='m' || a=='S') && b=='N'){
816 								//do nothing;
817 							}else if(a=='N' && (b=='m' || b=='S')){
818 								match[i]=b;
819 							}else{
820 								ok=false;
821 							}
822 						}
823 					}
824 					if(!ok){match=null;}
825 				}
826 			}
827 		}
828 
829 		if(mergeMate && mate!=null){
830 			mate.mergePrivate(r.mate, mergeVectors, mergeN, false);
831 			assert(copies==mate.copies);
832 		}
833 		assert(copies>1);
834 
835 		assert(r!=this);
836 		assert(r!=this.mate);
837 		assert(r!=r.mate);
838 		assert(this!=this.mate);
839 		assert(r.mate==null || r.mate.mate==r);
840 		assert(this.mate==null || this.mate.mate==this);
841 		assert(r.mate==null || r.numericID==r.mate.numericID);
842 		assert(mate==null || numericID==mate.numericID);
843 	}
844 
845 	@Override
toString()846 	public String toString(){return toText(false).toString();}
847 
toSites()848 	public ByteBuilder toSites(){return toSites((ByteBuilder)null);}
849 
toSites(ByteBuilder sb)850 	public ByteBuilder toSites(ByteBuilder sb){
851 		if(numSites()==0){
852 			if(sb==null){sb=new ByteBuilder(2);}
853 			sb.append('.');
854 		}else{
855 			if(sb==null){sb=new ByteBuilder(sites.size()*20);}
856 			int appended=0;
857 			for(SiteScore ss : sites){
858 				if(appended>0){sb.append('\t');}
859 				if(ss!=null){
860 					ss.toBytes(sb);
861 					appended++;
862 				}
863 			}
864 			if(appended==0){sb.append('.');}
865 		}
866 		return sb;
867 	}
868 
toInfo()869 	public ByteBuilder toInfo(){
870 		if(obj==null){return new ByteBuilder();}
871 		if(obj.getClass()==ByteBuilder.class){return (ByteBuilder)obj;}
872 		return new ByteBuilder(obj.toString());
873 	}
874 
toInfo(ByteBuilder bb)875 	public ByteBuilder toInfo(ByteBuilder bb){
876 		if(obj==null){return bb;}
877 		if(obj.getClass()==ByteBuilder.class){return bb.append((ByteBuilder)obj);}
878 		return bb.append(obj.toString());
879 	}
880 
toFastq()881 	public ByteBuilder toFastq(){
882 		return FASTQ.toFASTQ(this, (ByteBuilder)null);
883 	}
884 
toFastq(ByteBuilder bb)885 	public ByteBuilder toFastq(ByteBuilder bb){
886 		return FASTQ.toFASTQ(this, bb);
887 	}
888 
toFasta()889 	public ByteBuilder toFasta(){return toFasta(Shared.FASTA_WRAP);}
toFasta(ByteBuilder bb)890 	public ByteBuilder toFasta(ByteBuilder bb){return toFasta(Shared.FASTA_WRAP, bb);}
891 
toFasta(int wrap)892 	public ByteBuilder toFasta(int wrap){
893 		return toFasta(wrap, (ByteBuilder)null);
894 	}
895 
toFasta(int wrap, ByteBuilder bb)896 	public ByteBuilder toFasta(int wrap, ByteBuilder bb){
897 		if(wrap<1){wrap=Integer.MAX_VALUE;}
898 		int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5;
899 		if(bb==null){bb=new ByteBuilder(len+1);}
900 		bb.append('>');
901 		if(id==null){bb.append(numericID);}
902 		else{bb.append(id);}
903 		if(bases!=null){
904 			int pos=0;
905 			while(pos<bases.length-wrap){
906 				bb.append('\n');
907 				bb.append(bases, pos, wrap);
908 				pos+=wrap;
909 			}
910 			if(pos<bases.length){
911 				bb.append('\n');
912 				bb.append(bases, pos, bases.length-pos);
913 			}
914 		}
915 		return bb;
916 	}
917 
toSam()918 	public ByteBuilder toSam(){
919 		return toSam((ByteBuilder)null);
920 	}
921 
toSam(ByteBuilder bb)922 	public ByteBuilder toSam(ByteBuilder bb){
923 		SamLine sl=new SamLine(this, pairnum());
924 		return sl.toBytes(bb);//Note: This used to have a newline
925 	}
926 
header()927 	public static CharSequence header(){
928 
929 		StringBuilder sb=new StringBuilder();
930 		sb.append("id");
931 		sb.append('\t');
932 		sb.append("numericID");
933 		sb.append('\t');
934 		sb.append("chrom");
935 		sb.append('\t');
936 		sb.append("strand");
937 		sb.append('\t');
938 		sb.append("start");
939 		sb.append('\t');
940 		sb.append("stop");
941 		sb.append('\t');
942 
943 		sb.append("flags");
944 		sb.append('\t');
945 
946 		sb.append("copies");
947 		sb.append('\t');
948 
949 		sb.append("errors,fixed");
950 		sb.append('\t');
951 		sb.append("mapScore");
952 		sb.append('\t');
953 		sb.append("length");
954 		sb.append('\t');
955 
956 		sb.append("bases");
957 		sb.append('\t');
958 		sb.append("quality");
959 		sb.append('\t');
960 
961 		sb.append("insert");
962 		sb.append('\t');
963 		{
964 			//These are not really necessary...
965 			sb.append("avgQual");
966 			sb.append('\t');
967 		}
968 
969 		sb.append("match");
970 		sb.append('\t');
971 		sb.append("SiteScores: "+SiteScore.header());
972 		return sb;
973 	}
974 
toText(boolean okToCompressMatch)975 	public ByteBuilder toText(boolean okToCompressMatch){
976 		return toText(okToCompressMatch, (ByteBuilder)null);
977 	}
978 
toText(boolean okToCompressMatch, ByteBuilder bb)979 	public ByteBuilder toText(boolean okToCompressMatch, ByteBuilder bb){
980 
981 		final byte[] oldmatch=match;
982 		final boolean oldshortmatch=this.shortmatch();
983 		if(COMPRESS_MATCH_BEFORE_WRITING && !shortmatch() && okToCompressMatch){
984 			match=toShortMatchString(match);
985 			setShortMatch(true);
986 		}
987 
988 		if(bb==null){bb=new ByteBuilder();}
989 		bb.append(id);
990 		bb.tab();
991 		bb.append(numericID);
992 		bb.tab();
993 		bb.append(chrom);
994 		bb.tab();
995 		bb.append(Shared.strandCodes2[strand()]);
996 		bb.tab();
997 		bb.append(start);
998 		bb.tab();
999 		bb.append(stop);
1000 		bb.tab();
1001 
1002 		for(int i=maskArray.length-1; i>=0; i--){
1003 			bb.append(flagToNumber(maskArray[i]));
1004 		}
1005 		bb.tab();
1006 
1007 		bb.append(copies);
1008 		bb.tab();
1009 
1010 		bb.append(errors);
1011 		bb.tab();
1012 		bb.append(mapScore);
1013 		bb.tab();
1014 
1015 		if(bases==null){bb.append('.');}
1016 		else{bb.append(bases);}
1017 		bb.tab();
1018 
1019 //		int qualSum=0;
1020 //		int qualMin=99999;
1021 
1022 		if(quality==null){
1023 			bb.append('.');
1024 		}else{
1025 			bb.ensureExtra(quality.length);
1026 			for(int i=0, j=bb.length; i<quality.length; i++, j++){
1027 				byte q=quality[i];
1028 				bb.array[j]=(byte)(q+ASCII_OFFSET);
1029 //				qualSum+=q;
1030 //				qualMin=Tools.min(q, qualMin);
1031 			}
1032 			bb.length+=quality.length;
1033 		}
1034 		bb.tab();
1035 
1036 		if(insert<1){bb.append('.');}else{bb.append(insert);};
1037 		bb.tab();
1038 
1039 		if(true || quality==null){
1040 			bb.append('.');
1041 			bb.tab();
1042 		}else{
1043 //			//These are not really necessary...
1044 //			sb.append(qualSum/quality.length);
1045 //			sb.append('\t');
1046 		}
1047 
1048 		if(match==null){bb.append('.');}
1049 		else{bb.append(match);}
1050 		bb.tab();
1051 
1052 		if(gaps==null){
1053 			bb.append('.');
1054 		}else{
1055 			for(int i=0; i<gaps.length; i++){
1056 				if(i>0){bb.append('~');}
1057 				bb.append(gaps[i]);
1058 			}
1059 		}
1060 
1061 		if(sites!=null && sites.size()>0){
1062 
1063 			assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) :
1064 				"\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n";
1065 
1066 			for(SiteScore ss : sites){
1067 				bb.tab();
1068 				if(ss==null){
1069 					bb.append((byte[])null);
1070 				}else{
1071 					ss.toBytes(bb);
1072 				}
1073 				bb.append(ss==null ? "null" : ss.toText());
1074 			}
1075 		}
1076 
1077 		if(originalSite!=null){
1078 			bb.tab();
1079 			bb.append('*');
1080 			originalSite.toBytes(bb);
1081 		}
1082 
1083 		match=oldmatch;
1084 		setShortMatch(oldshortmatch);
1085 
1086 		return bb;
1087 	}
1088 
1089 	public static Read fromText(String line){
1090 		if(line.length()==1 && line.charAt(0)=='.'){return null;}
1091 
1092 		String[] split=line.split("\t");
1093 
1094 		if(split.length<17){
1095 			throw new RuntimeException("Error parsing read from text.\n\n" +
1096 					"This may be caused be attempting to parse the wrong format.\n" +
1097 					"Please ensure that the file extension is correct:\n" +
1098 					"\tFASTQ should end in .fastq or .fq\n" +
1099 					"\tFASTA should end in .fasta or .fa, .fas, .fna, .ffn, .frn, .seq, .fsa\n" +
1100 					"\tSAM should end in .sam\n" +
1101 					"\tNative format should end in .txt or .bread\n" +
1102 					"If a file is compressed, there must be a compression extension after the format extension:\n" +
1103 					"\tgzipped files should end in .gz or .gzip\n" +
1104 					"\tzipped files should end in .zip and have only 1 file per archive\n" +
1105 					"\tbz2 files should end in .bz2\n");
1106 		}
1107 
1108 		final String id=new String(split[0]);
1109 		long numericID=Long.parseLong(split[1]);
1110 		int chrom=Byte.parseByte(split[2]);
1111 //		byte strand=Byte.parseByte(split[3]);
1112 		int start=Integer.parseInt(split[4]);
1113 		int stop=Integer.parseInt(split[5]);
1114 
1115 		int flags=Integer.parseInt(split[6], 2);
1116 
1117 		int copies=Integer.parseInt(split[7]);
1118 
1119 		int errors;
1120 		int errorsCorrected;
1121 		if(split[8].indexOf(',')>=0){
1122 			String[] estring=split[8].split(",");
1123 			errors=Integer.parseInt(estring[0]);
1124 			errorsCorrected=Integer.parseInt(estring[1]);
1125 		}else{
1126 			errors=Integer.parseInt(split[8]);
1127 			errorsCorrected=0;
1128 		}
1129 
1130 		int mapScore=Integer.parseInt(split[9]);
1131 
1132 		byte[] basesOriginal=split[10].getBytes();
1133 		byte[] qualityOriginal=(split[11].equals(".") ? null : split[11].getBytes());
1134 
1135 		if(qualityOriginal!=null){
1136 			for(int i=0; i<qualityOriginal.length; i++){
1137 				byte b=qualityOriginal[i];
1138 				b=(byte) (b-ASCII_OFFSET);
1139 				assert(b>=-1) : b;
1140 				qualityOriginal[i]=b;
1141 			}
1142 		}
1143 
1144 		int insert=-1;
1145 		if(!split[12].equals(".")){insert=Integer.parseInt(split[12]);}
1146 
1147 		byte[] match=null;
1148 		if(!split[14].equals(".")){match=split[14].getBytes();}
1149 		int[] gaps=null;
1150 		if(!split[15].equals(".")){
1151 
1152 			String[] gstring=split[16].split("~");
1153 			gaps=new int[gstring.length];
1154 			for(int i=0; i<gstring.length; i++){
1155 				gaps[i]=Integer.parseInt(gstring[i]);
1156 			}
1157 		}
1158 
1159 //		assert(false) : split[16];
1160 
1161 		Read r=new Read(basesOriginal, qualityOriginal, id, numericID, flags, chrom, start, stop);
1162 		r.match=match;
1163 		r.errors=errors;
1164 		r.mapScore=mapScore;
1165 		r.copies=copies;
1166 		r.gaps=gaps;
1167 		r.insert=insert;
1168 
1169 		int firstScore=(ADD_BEST_SITE_TO_LIST_FROM_TEXT) ? 17 : 18;
1170 
1171 		int scores=split.length-firstScore;
1172 
1173 		int mSites=0;
1174 		for(int i=firstScore; i<split.length; i++){
1175 			if(split[i].charAt(0)!='*'){mSites++;}
1176 		}
1177 
1178 		//This can be disabled to handle very old text format.
1179 		if(mSites>0){r.sites=new ArrayList<SiteScore>(mSites);}
1180 		for(int i=firstScore; i<split.length; i++){
1181 			SiteScore ss=SiteScore.fromText(split[i]);
1182 			if(split[i].charAt(0)=='*'){r.originalSite=ss;}
1183 			else{r.sites.add(ss);}
1184 		}
1185 
1186 		if(DECOMPRESS_MATCH_ON_LOAD && r.shortmatch()){
1187 			r.toLongMatchString(true);
1188 		}
1189 
1190 		assert(r.numSites()==0 || absdif(r.start, r.stop)<3000 || (r.gaps==null) == (r.topSite().gaps==null)) :
1191 			"\n"+r.numericID+", "+r.chrom+", "+r.strand()+", "+r.start+", "+r.stop+", "+Arrays.toString(r.gaps)+"\n"+r.sites+"\n"+line+"\n";
1192 
1193 		return r;
1194 	}
1195 
1196 	/** Inflates gaps between contigs in a scaffold. */
1197 	public void inflateGaps(int minGapIn, int minGapOut) {
1198 		assert(minGapIn>0);
1199 		if(!containsNocalls()){return;}
1200 		final ByteBuilder bbb=new ByteBuilder();
1201 		final ByteBuilder bbq=(quality==null ? null : new ByteBuilder());
1202 
1203 		int gap=0;
1204 		for(int i=0; i<bases.length; i++){
1205 			byte b=bases[i];
1206 			byte q=(quality==null ? 0 : quality[i]);
1207 			if(b=='N'){
1208 				gap++;
1209 			}else{
1210 				while(gap>=minGapIn && gap<minGapOut){
1211 					gap++;
1212 					bbb.append('N');
1213 					if(bbq!=null){bbq.append(0);}
1214 				}
1215 				gap=0;
1216 			}
1217 			bbb.append(b);
1218 			if(bbq!=null){bbq.append(q);}
1219 		}
1220 
1221 		while(gap>=minGapIn && gap<minGapOut){//Handle trailing bases
1222 			gap++;
1223 			bbb.append('N');
1224 			if(bbq!=null){bbq.append(0);}
1225 		}
1226 
1227 		assert(bbb.length()>=bases.length);
1228 		if(bbb.length()>bases.length){
1229 			bases=bbb.toBytes();
1230 			if(bbq!=null){quality=bbq.toBytes();}
1231 		}
1232 	}
1233 
1234 	public ArrayList<Read> breakAtGaps(final boolean agp, final int minContig){
1235 		ArrayList<Read> list=new ArrayList<Read>();
1236 		byte prev='N';
1237 		int lastN=-1, lastBase=-1;
1238 		int contignum=1;
1239 		long feature=1;
1240 		ByteBuilder bb=(agp ? new ByteBuilder() : null);
1241 		assert(obj==null);
1242 		for(int i=0; i<bases.length; i++){
1243 			final byte b=bases[i];
1244 			if(b=='N'){
1245 				if(prev!='N'){
1246 					final int start=lastN+1, stop=i;
1247 					byte[] b2=KillSwitch.copyOfRange(bases, start, stop);
1248 					byte[] q2=(quality==null ? null : KillSwitch.copyOfRange(quality, start, stop));
1249 					Read r=new Read(b2, q2, id+"_c"+contignum, numericID);
1250 					if(r.length()>=minContig){list.add(r);}
1251 					contignum++;
1252 
1253 					if(bb!=null){
1254 						bb.append(id).append('\t');
1255 						bb.append(start+1).append('\t');
1256 						bb.append(stop).append('\t');
1257 						bb.append(feature).append('\t');
1258 						feature++;
1259 						bb.append('W').append('\t');
1260 						bb.append(r.id).append('\t');
1261 						bb.append(1).append('\t');
1262 						bb.append(r.length()).append('\t');
1263 						bb.append('+').append('\n');
1264 					}
1265 				}
1266 				lastN=i;
1267 			}else{
1268 				if(bb!=null && prev=='N' && lastBase>=0){
1269 					bb.append(id).append('\t');
1270 					bb.append(lastBase+2).append('\t');
1271 					bb.append(i).append('\t');
1272 					bb.append(feature).append('\t');
1273 					feature++;
1274 					bb.append('N').append('\t');
1275 					bb.append((i-lastBase-1)).append('\t');
1276 					bb.append("scaffold").append('\t');
1277 					bb.append("yes").append('\t');
1278 					bb.append("paired-ends").append('\n');
1279 				}
1280 				lastBase=i;
1281 			}
1282 			prev=b;
1283 		}
1284 		if(prev!='N'){
1285 			final int start=lastN+1, stop=bases.length;
1286 			byte[] b2=KillSwitch.copyOfRange(bases, start, stop);
1287 			byte[] q2=(quality==null ? null : KillSwitch.copyOfRange(quality, start, stop));
1288 			Read r=new Read(b2, q2, id+"_c"+contignum, numericID);
1289 			if(r.length()>=minContig){list.add(r);}
1290 			contignum++;
1291 
1292 			if(bb!=null){
1293 				bb.append(id).append('\t');
1294 				bb.append(start+1).append('\t');
1295 				bb.append(stop).append('\t');
1296 				bb.append(feature).append('\t');
1297 				feature++;
1298 				bb.append('W').append('\t');
1299 				bb.append(r.id).append('\t');
1300 				bb.append(1).append('\t');
1301 				bb.append(r.length()).append('\t');
1302 				bb.append('+').append('\n');
1303 			}
1304 		}else{
1305 			if(bb!=null && prev=='N' && lastBase>=0){
1306 				bb.append(id).append('\t');
1307 				bb.append(lastBase+2).append('\t');
1308 				bb.append(bases.length).append('\t');
1309 				bb.append(feature).append('\t');
1310 				feature++;
1311 				bb.append('N').append('\t');
1312 				bb.append((bases.length-lastBase-1)).append('\t');
1313 				bb.append("scaffold").append('\t');
1314 				bb.append("yes").append('\t');
1315 				bb.append("paired-ends").append('\n');
1316 			}
1317 			lastBase=bases.length;
1318 		}
1319 		if(bb!=null){obj=bb.toBytes();}
1320 		return list;
1321 	}
1322 
1323 	/** Reverse-complements the read. */
reverseComplement()1324 	public void reverseComplement() {
1325 		AminoAcid.reverseComplementBasesInPlace(bases);
1326 		Tools.reverseInPlace(quality);
1327 		setStrand(strand()^1);
1328 	}
1329 
1330 	/** Complements the read. */
complement()1331 	public void complement() {
1332 		AminoAcid.reverseComplementBasesInPlace(bases);
1333 	}
1334 
1335 	@Override
compareTo(Read o)1336 	public int compareTo(Read o) {
1337 		if(chrom!=o.chrom){return chrom-o.chrom;}
1338 		if(start!=o.start){return start-o.start;}
1339 		if(stop!=o.stop){return stop-o.stop;}
1340 		if(strand()!=o.strand()){return strand()-o.strand();}
1341 		return 0;
1342 	}
1343 
toSite()1344 	public SiteScore toSite(){
1345 		assert(start<=stop) : this.toText(false);
1346 		SiteScore ss=new SiteScore(chrom, strand(), start, stop, 0, 0, rescued(), perfect());
1347 		if(paired()){
1348 			ss.setSlowPairedScore(mapScore-1, mapScore);
1349 		}else{
1350 			ss.setSlowPairedScore(mapScore, 0);
1351 		}
1352 		ss.setScore(mapScore);
1353 		ss.gaps=gaps;
1354 		ss.match=match;
1355 		originalSite=ss;
1356 		return ss;
1357 	}
1358 
topSite()1359 	public SiteScore topSite(){
1360 		final SiteScore ss=(sites==null || sites.isEmpty()) ? null : sites.get(0);
1361 		assert(sites==null || sites.isEmpty() || ss!=null) : "Top site is null for read "+this;
1362 		return ss;
1363 	}
1364 
numSites()1365 	public int numSites(){
1366 		return (sites==null ? 0 : sites.size());
1367 	}
1368 
makeOriginalSite()1369 	public SiteScore makeOriginalSite(){
1370 		originalSite=toSite();
1371 		return originalSite;
1372 	}
1373 
setFromSite(SiteScore ss)1374 	public void setFromSite(SiteScore ss){
1375 		assert(ss!=null);
1376 		chrom=ss.chrom;
1377 		setStrand(ss.strand);
1378 		start=ss.start;
1379 		stop=ss.stop;
1380 		mapScore=ss.slowScore;
1381 		setRescued(ss.rescued);
1382 		gaps=ss.gaps;
1383 		setPerfect(ss.perfect);
1384 
1385 		match=ss.match;
1386 
1387 		if(gaps!=null){
1388 			gaps=ss.gaps=GapTools.fixGaps(start, stop, gaps, Shared.MINGAP);
1389 //			gaps[0]=Tools.min(gaps[0], start);
1390 //			gaps[gaps.length-1]=Tools.max(gaps[gaps.length-1], stop);
1391 		}
1392 	}
1393 
1394 //	public static int[] fixGaps(int a, int b, int[] gaps, int minGap){
1395 ////		System.err.println("fixGaps input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap);
1396 //		int[] r=GapTools.fixGaps(a, b, gaps, minGap);
1397 ////		System.err.println("fixGaps output: "+Arrays.toString(r));
1398 //		return r;
1399 //	}
1400 
setFromOriginalSite()1401 	public void setFromOriginalSite(){
1402 		setFromSite(originalSite);
1403 	}
setFromTopSite()1404 	public void setFromTopSite(){
1405 		final SiteScore ss=topSite();
1406 		if(ss==null){
1407 			clearSite();
1408 			setMapped(false);
1409 			return;
1410 		}
1411 		setMapped(true);
1412 		setFromSite(ss);
1413 	}
1414 
setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist)1415 	public void setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist){
1416 		final SiteScore ss0=topSite();
1417 		if(ss0==null){
1418 			clearSite();
1419 			setMapped(false);
1420 			return;
1421 		}
1422 		setMapped(true);
1423 
1424 		if(sites.size()==1 || !randomIfAmbiguous || !ambiguous()){
1425 			setFromSite(ss0);
1426 			return;
1427 		}
1428 
1429 		if(primary || mate==null || !mate.mapped() || !mate.paired()){
1430 			int count=1;
1431 			for(int i=1; i<sites.size(); i++){
1432 				SiteScore ss=sites.get(i);
1433 				if(ss.score<ss0.score || (ss0.perfect && !ss.perfect) || (ss0.semiperfect && !ss.semiperfect)){break;}
1434 				count++;
1435 			}
1436 
1437 			int x=(int)(numericID%count);
1438 			if(x>0){
1439 				SiteScore ss=sites.get(x);
1440 				sites.set(0, ss);
1441 				sites.set(x, ss0);
1442 			}
1443 			setFromSite(sites.get(0));
1444 			return;
1445 		}
1446 
1447 //		assert(false) : "TODO: Proper strand orientation, and more.";
1448 		//TODO: Also, this code appears to sometimes duplicate sitescores(?)
1449 //		for(int i=0; i<list.size(); i++){
1450 //			SiteScore ss=list.get(i);
1451 //			if(ss.chrom==mate.chrom && Tools.min(Tools.absdifUnsigned(ss.start, mate.stop), Tools.absdifUnsigned(ss.stop, mate.start))<=maxPairDist){
1452 //				list.set(0, ss);
1453 //				list.set(i, ss0);
1454 //				setFromSite(ss);
1455 //				return;
1456 //			}
1457 //		}
1458 
1459 		//If unsuccessful, recur unpaired.
1460 
1461 		this.setPaired(false);
1462 		mate.setPaired(false);
1463 		setFromTopSite(randomIfAmbiguous, true, maxPairDist);
1464 	}
1465 
clearPairMapping()1466 	public void clearPairMapping(){
1467 		clearMapping();
1468 		if(mate!=null){mate.clearMapping();}
1469 	}
1470 
clearMapping()1471 	public void clearMapping(){
1472 		clearSite();
1473 		match=null;
1474 		sites=null;
1475 		setMapped(false);
1476 		setPaired(false);
1477 		if(mate!=null){mate.setPaired(false);}
1478 	}
1479 
clearSite()1480 	public void clearSite(){
1481 		chrom=-1;
1482 		setStrand(0);
1483 		start=-1;
1484 		stop=-1;
1485 //		errors=0;
1486 		mapScore=0;
1487 		gaps=null;
1488 	}
1489 
1490 
clearAnswers(boolean clearMate)1491 	public void clearAnswers(boolean clearMate) {
1492 //		assert(mate==null || (pairnum()==0 && mate.pairnum()==1)) : pairnum()+", "+mate.pairnum();
1493 		clearSite();
1494 		match=null;
1495 		sites=null;
1496 		flags=(flags&(SYNTHMASK|PAIRNUMMASK|SWAPMASK));
1497 		if(clearMate && mate!=null){
1498 			mate.clearSite();
1499 			mate.match=null;
1500 			mate.sites=null;
1501 			mate.flags=(mate.flags&(SYNTHMASK|PAIRNUMMASK|SWAPMASK));
1502 		}
1503 //		assert(mate==null || (pairnum()==0 && mate.pairnum()==1)) : pairnum()+", "+mate.pairnum();
1504 	}
1505 
1506 
isBadPair(boolean requireCorrectStrands, boolean sameStrandPairs, int maxdist)1507 	public boolean isBadPair(boolean requireCorrectStrands, boolean sameStrandPairs, int maxdist){
1508 		if(mate==null || paired()){return false;}
1509 		if(!mapped() || !mate.mapped()){return false;}
1510 		if(chrom!=mate.chrom){return true;}
1511 
1512 		{
1513 			int inner;
1514 			if(start<=mate.start){inner=mate.start-stop;}
1515 			else{inner=start-mate.stop;}
1516 			if(inner>maxdist){return true;}
1517 		}
1518 //		if(absdif(start, mate.start)>maxdist){return true;}
1519 		if(requireCorrectStrands){
1520 			if((strand()==mate.strand())!=sameStrandPairs){return true;}
1521 		}
1522 		if(!sameStrandPairs){
1523 			if(strand()==Shared.PLUS && mate.strand()==Shared.MINUS){
1524 				if(start>=mate.stop){return true;}
1525 			}else if(strand()==Shared.MINUS && mate.strand()==Shared.PLUS){
1526 				if(mate.start>=stop){return true;}
1527 			}
1528 		}
1529 		return false;
1530 	}
1531 
countMismatches()1532 	public int countMismatches(){
1533 		assert(match!=null);
1534 		int x=0;
1535 		for(byte b : match){
1536 			if(b=='S'){x++;}
1537 		}
1538 		return x;
1539 	}
1540 
1541 	/**
1542 	 * @param k
1543 	 * @return Number of valid kmers
1544 	 */
numValidKmers(int k)1545 	public int numValidKmers(int k) {
1546 		if(bases==null){return 0;}
1547 		int len=0, counted=0;
1548 		for(int i=0; i<bases.length; i++){
1549 			byte b=bases[i];
1550 			long x=AminoAcid.baseToNumber[b];
1551 			if(x<0){len=0;}else{len++;}
1552 			if(len>=k){counted++;}
1553 		}
1554 		return counted;
1555 	}
1556 
1557 	/**
1558 	 * @param match string
1559 	 * @return Total number of match, sub, del, ins, or clip symbols
1560 	 */
matchToMsdicn(byte[] match)1561 	public static final int[] matchToMsdicn(byte[] match) {
1562 		if(match==null || match.length<1){return null;}
1563 		int[] msdicn=KillSwitch.allocInt1D(6);
1564 
1565 		byte mode='0', c='0';
1566 		int current=0;
1567 		for(int i=0; i<match.length; i++){
1568 			c=match[i];
1569 			if(Tools.isDigit(c)){
1570 				current=(current*10)+(c-'0');
1571 			}else{
1572 				if(mode==c){
1573 					current=Tools.max(current+1, 2);
1574 				}else{
1575 					current=Tools.max(current, 1);
1576 
1577 					if(mode=='m'){
1578 						msdicn[0]+=current;
1579 					}else if(mode=='S'){
1580 						msdicn[1]+=current;
1581 					}else if(mode=='D'){
1582 						msdicn[2]+=current;
1583 					}else if(mode=='I'){
1584 						msdicn[3]+=current;
1585 					}else if(mode=='C' || mode=='X' || mode=='Y'){
1586 						msdicn[4]+=current;
1587 					}else if(mode=='N' || mode=='R'){
1588 						msdicn[5]+=current;
1589 					}
1590 					mode=c;
1591 					current=0;
1592 				}
1593 			}
1594 		}
1595 		if(current>0 || !Tools.isDigit(c)){
1596 			current=Tools.max(current, 1);
1597 			if(mode=='m'){
1598 				msdicn[0]+=current;
1599 			}else if(mode=='S'){
1600 				msdicn[1]+=current;
1601 			}else if(mode=='D'){
1602 				msdicn[2]+=current;
1603 			}else if(mode=='I'){
1604 				msdicn[3]+=current;
1605 			}else if(mode=='C' || mode=='X' || mode=='Y'){
1606 				msdicn[4]+=current;
1607 			}else if(mode=='N' || mode=='R'){
1608 				msdicn[5]+=current;
1609 			}
1610 		}
1611 		return msdicn;
1612 	}
1613 
1614 
1615 	/**
1616 	 * @param match string
1617 	 * @return Ref length of match string
1618 	 */
calcMatchLength(byte[] match)1619 	public static final int calcMatchLength(byte[] match) {
1620 		if(match==null || match.length<1){return 0;}
1621 
1622 		byte mode='0', c='0';
1623 		int current=0;
1624 		int len=0;
1625 		for(int i=0; i<match.length; i++){
1626 			c=match[i];
1627 			if(Tools.isDigit(c)){
1628 				current=(current*10)+(c-'0');
1629 			}else{
1630 				if(mode==c){
1631 					current=Tools.max(current+1, 2);
1632 				}else{
1633 					current=Tools.max(current, 1);
1634 
1635 					if(mode=='m'){
1636 						len+=current;
1637 					}else if(mode=='S'){
1638 						len+=current;
1639 					}else if(mode=='D'){
1640 						len+=current;
1641 					}else if(mode=='I'){ //Do nothing
1642 						//len+=current;
1643 					}else if(mode=='C'){
1644 						len+=current;
1645 					}else if(mode=='X'){ //Not sure about this case, but adding seems fine
1646 						len+=current;
1647 //						assert(false) : new String(match);
1648 					}else if(mode=='Y'){ //Do nothing
1649 						//len+=current;
1650 //						assert(false) : new String(match);
1651 					}else if(mode=='N' || mode=='R'){
1652 						len+=current;
1653 					}
1654 					mode=c;
1655 					current=0;
1656 				}
1657 			}
1658 		}
1659 		if(current>0 || !Tools.isDigit(c)){
1660 			current=Tools.max(current, 1);
1661 			if(mode=='m'){
1662 				len+=current;
1663 			}else if(mode=='S'){
1664 				len+=current;
1665 			}else if(mode=='D'){
1666 				len+=current;
1667 			}else if(mode=='I'){ //Do nothing
1668 				//len+=current;
1669 			}else if(mode=='C'){
1670 				len+=current;
1671 			}else if(mode=='X'){ //Not sure about this case, but adding seems fine
1672 				len+=current;
1673 				assert(false) : new String(match);
1674 			}else if(mode=='Y'){ //Do nothing
1675 				//len+=current;
1676 //				assert(false) : new String(match);
1677 			}else if(mode=='N' || mode=='R'){
1678 				len+=current;
1679 			}
1680 		}
1681 		return len;
1682 	}
1683 
identity()1684 	public final float identity() {return identity(match);}
1685 
identity(byte[] match)1686 	public static final float identity(byte[] match) {
1687 		if(FLAT_IDENTITY){
1688 			return identityFlat(match, true);
1689 		}else{
1690 			return identitySkewed(match, true, true, false, false);
1691 		}
1692 	}
1693 
hasLongInsertion(int maxlen)1694 	public final boolean hasLongInsertion(int maxlen){
1695 		return hasLongInsertion(match, maxlen);
1696 	}
1697 
hasLongDeletion(int maxlen)1698 	public final boolean hasLongDeletion(int maxlen){
1699 		return hasLongDeletion(match, maxlen);
1700 	}
1701 
hasLongInsertion(byte[] match, int maxlen)1702 	public static final boolean hasLongInsertion(byte[] match, int maxlen){
1703 		if(match==null || match.length<maxlen){return false;}
1704 		byte prev='0';
1705 		int len=0;
1706 		for(byte b : match){
1707 			if(b=='I' || b=='X' || b=='Y'){
1708 				if(b==prev){len++;}
1709 				else{len=1;}
1710 				if(len>maxlen){return true;}
1711 			}else{
1712 				len=0;
1713 			}
1714 			prev=b;
1715 		}
1716 		return false;
1717 	}
1718 
hasLongDeletion(byte[] match, int maxlen)1719 	public static final boolean hasLongDeletion(byte[] match, int maxlen){
1720 		if(match==null || match.length<maxlen){return false;}
1721 		byte prev='0';
1722 		int len=0;
1723 		for(byte b : match){
1724 			if(b=='D'){
1725 				if(b==prev){len++;}
1726 				else{len=1;}
1727 				if(len>maxlen){return true;}
1728 			}else{
1729 				len=0;
1730 			}
1731 			prev=b;
1732 		}
1733 		return false;
1734 	}
1735 
mappedNonClippedBases()1736 	public int mappedNonClippedBases() {
1737 		if(!mapped() || match==null || bases==null){return 0;}
1738 
1739 		int len=0;
1740 		byte mode='0', c='0';
1741 		int current=0;
1742 		for(int i=0; i<match.length; i++){
1743 			c=match[i];
1744 			if(Tools.isDigit(c)){
1745 				current=(current*10)+(c-'0');
1746 			}else{
1747 				if(mode==c){
1748 					current=Tools.max(current+1, 2);
1749 				}else{
1750 					current=Tools.max(current, 1);
1751 
1752 					if(mode=='D' || mode=='C' || mode=='X' || mode=='Y' || mode=='d'){
1753 
1754 					}else{
1755 						len+=current;
1756 					}
1757 					mode=c;
1758 					current=0;
1759 				}
1760 			}
1761 		}
1762 		if(current>0 || !Tools.isDigit(c)){
1763 			current=Tools.max(current, 1);
1764 			if(mode=='D' || mode=='C' || mode=='X' || mode=='Y' || mode=='d'){
1765 
1766 			}else{
1767 				len+=current;
1768 			}
1769 			mode=c;
1770 			current=0;
1771 		}
1772 		return len;
1773 	}
1774 
1775 	/**
1776 	 * Handles short or long mode.
1777 	 * @param match string
1778 	 * @return Identity based on number of match, sub, del, ins, or N symbols
1779 	 */
identityFlat(byte[] match, boolean penalizeN)1780 	public static final float identityFlat(byte[] match, boolean penalizeN) {
1781 //		assert(false) : new String(match);
1782 		if(match==null || match.length<1){return 0;}
1783 
1784 		int good=0, bad=0, n=0;
1785 
1786 		byte mode='0', c='0';
1787 		int current=0;
1788 		for(int i=0; i<match.length; i++){
1789 			c=match[i];
1790 			if(Tools.isDigit(c)){
1791 				current=(current*10)+(c-'0');
1792 			}else{
1793 				if(mode==c){
1794 					current=Tools.max(current+1, 2);
1795 				}else{
1796 					current=Tools.max(current, 1);
1797 
1798 					if(mode=='m'){
1799 						good+=current;
1800 //						System.out.println("G: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1801 					}else if(mode=='R' || mode=='N'){
1802 						n+=current;
1803 					}else if(mode=='C' || mode=='V'){
1804 						//Do nothing
1805 						//I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
1806 					}else if(mode!='0'){
1807 						assert(mode=='S' || mode=='D' || mode=='I' || mode=='X' || mode=='Y' || mode=='i' || mode=='d') : (char)mode;
1808 						if(mode!='D' || current<SamLine.INTRON_LIMIT){
1809 							bad+=current;
1810 						}
1811 //						System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1812 					}
1813 					mode=c;
1814 					current=0;
1815 				}
1816 			}
1817 		}
1818 		if(current>0 || !Tools.isDigit(c)){
1819 			current=Tools.max(current, 1);
1820 			if(mode=='m'){
1821 				good+=current;
1822 			}else if(mode=='R' || mode=='N'){
1823 				n+=current;
1824 			}else if(mode=='C' || mode=='V'){
1825 				//Do nothing
1826 				//I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
1827 			}else if(mode!='0'){
1828 				assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y') : (char)mode;
1829 				if(mode!='D' || current<SamLine.INTRON_LIMIT){
1830 					bad+=current;
1831 				}
1832 //				System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1833 			}
1834 		}
1835 
1836 
1837 		float good2=good+n*(penalizeN ? 0.25f : 0);
1838 		float bad2=bad+n*(penalizeN ? 0.75f : 0);
1839 		float r=good2/Tools.max(good2+bad2, 1);
1840 //		assert(false) : new String(match)+"\nmode='"+(char)mode+"', current="+current+", good="+good+", bad="+bad;
1841 
1842 //		System.out.println("match="+new String(match)+"\ngood="+good+", bad="+bad+", r="+r);
1843 //		System.out.println(Arrays.toString(matchToMsdicn(match)));
1844 
1845 		return r;
1846 	}
1847 
1848 	/**
1849 	 * Handles short or long mode.
1850 	 * @param match string
1851 	 * @return Identity based on number of match, sub, del, ins, or N symbols
1852 	 */
identitySkewed(byte[] match, boolean penalizeN, boolean sqrt, boolean log, boolean single)1853 	public static final float identitySkewed(byte[] match, boolean penalizeN, boolean sqrt, boolean log, boolean single) {
1854 //		assert(false) : new String(match);
1855 		if(match==null || match.length<1){return 0;}
1856 
1857 		int good=0, bad=0, n=0;
1858 
1859 		byte mode='0', c='0';
1860 		int current=0;
1861 		for(int i=0; i<match.length; i++){
1862 			c=match[i];
1863 			if(Tools.isDigit(c)){
1864 				current=(current*10)+(c-'0');
1865 			}else{
1866 				if(mode==c){
1867 					current=Tools.max(current+1, 2);
1868 				}else{
1869 					current=Tools.max(current, 1);
1870 
1871 					if(mode=='m'){
1872 						good+=current;
1873 //						System.out.println("G: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1874 					}else if(mode=='D'){
1875 						if(current<SamLine.INTRON_LIMIT){
1876 							int x;
1877 
1878 							if(sqrt){x=(int)Math.ceil(Math.sqrt(current));}
1879 							else if(log){x=(int)Math.ceil(Tools.log2(current));}
1880 							else{x=1;}
1881 
1882 							bad+=(Tools.min(x, current));
1883 						}
1884 
1885 //						System.out.println("D: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad+", x="+x);
1886 					}else if(mode=='R' || mode=='N'){
1887 						n+=current;
1888 					}else if(mode=='C' || mode=='V'){
1889 						//Do nothing
1890 						//I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
1891 					}else if(mode!='0'){
1892 						assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y' || mode=='i' || mode=='d') : (char)mode;
1893 						bad+=current;
1894 //						System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1895 					}
1896 					mode=c;
1897 					current=0;
1898 				}
1899 			}
1900 		}
1901 		if(current>0 || !Tools.isDigit(c)){
1902 			current=Tools.max(current, 1);
1903 			if(mode=='m'){
1904 				good+=current;
1905 			}else if(mode=='R' || mode=='N'){
1906 				n+=current;
1907 			}else if(mode=='C' || mode=='V'){
1908 				//Do nothing
1909 				//I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity
1910 			}else if(mode!='0'){
1911 				assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y') : (char)mode;
1912 				if(mode!='D' || current<SamLine.INTRON_LIMIT){
1913 					bad+=current;
1914 				}
1915 //				System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad);
1916 			}
1917 		}
1918 
1919 
1920 		float good2=good+n*(penalizeN ? 0.25f : 0);
1921 		float bad2=bad+n*(penalizeN ? 0.75f : 0);
1922 		float r=good2/Tools.max(good2+bad2, 1);
1923 //		assert(false) : new String(match)+"\nmode='"+(char)mode+"', current="+current+", good="+good+", bad="+bad;
1924 
1925 //		System.out.println("match="+new String(match)+"\ngood="+good+", bad="+bad+", r="+r);
1926 //		System.out.println(Arrays.toString(matchToMsdicn(match)));
1927 
1928 		return r;
1929 	}
1930 
failsChastity()1931 	public boolean failsChastity(){
1932 		return failsChastity(true);
1933 	}
1934 
failsChastity(boolean processAssertions)1935 	public boolean failsChastity(boolean processAssertions){
1936 		if(id==null){return false;}
1937 		int space=id.indexOf(' ');
1938 		if(space<0 || space+5>id.length()){return false;}
1939 		char a=id.charAt(space+1);
1940 		char b=id.charAt(space+2);
1941 		char c=id.charAt(space+3);
1942 		char d=id.charAt(space+4);
1943 
1944 		if(a=='/'){
1945 			if(b<'1' || b>'4' || c!=':'){
1946 				if(!processAssertions){return false;}
1947 				KillSwitch.kill("Strangely formatted read.  Please disable chastityfilter with the flag chastityfilter=f.  id:"+id);
1948 			}
1949 			return d=='Y';
1950 		}else{
1951 			if(processAssertions){
1952 				assert(a=='1' || a=='2' || a=='3' || a=='4') : id;
1953 				assert(b==':') : id;
1954 				assert(d==':');
1955 			}
1956 			if(a<'1' || a>'4' || b!=':' || d!=':'){
1957 				if(!processAssertions){return false;}
1958 				KillSwitch.kill("Strangely formatted read.  Please disable chastityfilter with the flag chastityfilter=f.  id:"+id);
1959 			}
1960 			return c=='Y';
1961 		}
1962 	}
1963 
failsBarcode(HashSet<String> set, boolean failIfNoBarcode)1964 	public boolean failsBarcode(HashSet<String> set, boolean failIfNoBarcode){
1965 		if(id==null){return false;}
1966 
1967 		final int loc=id.lastIndexOf(':');
1968 		final int loc2=Tools.max(id.indexOf(' '), id.indexOf('/'));
1969 		if(loc<0 || loc<=loc2 || loc>=id.length()-1){
1970 			return failIfNoBarcode;
1971 		}
1972 
1973 		if(set==null){
1974 			for(int i=loc+1; i<id.length(); i++){
1975 				char c=id.charAt(i);
1976 				boolean ok=(c=='+' || AminoAcid.isFullyDefined(c));
1977 				if(!ok){return true;}
1978 			}
1979 			return false;
1980 		}else{
1981 			String code=id.substring(loc+1);
1982 			return !set.contains(code);
1983 		}
1984 	}
1985 
1986 	/**
1987 	 * Parse the barcode from an Illumina header.
1988 	 * @param failIfNoBarcode Terminate the JVM if no barcode is present.
1989 	 * @return The barcode
1990 	 */
barcode(boolean failIfNoBarcode)1991 	public String barcode(boolean failIfNoBarcode){
1992 
1993 		if(id==null){
1994 			if(failIfNoBarcode && Shared.EA()){
1995 				KillSwitch.kill("Encountered a read header without a barcode:\n"+id+"\n");
1996 			}
1997 			return null;
1998 		}
1999 
2000 		final int loc=id.lastIndexOf(':');
2001 		final int loc2=Tools.max(id.indexOf(' '), id.indexOf('/'));
2002 		if(loc<0 || loc<=loc2 || loc>=id.length()-1){
2003 			if(failIfNoBarcode && Shared.EA()){
2004 				KillSwitch.kill("Encountered a read header without a barcode:\n"+id+"\n");
2005 			}
2006 			return null;
2007 		}
2008 
2009 		String code=id.substring(loc+1);
2010 		return code;
2011 	}
2012 
2013 	/**
2014 	 * @return The rname of this Read's SamLine, if present and mapped.
2015 	 */
rnameS()2016 	public String rnameS() {
2017 		if(samline==null || !samline.mapped()){return null;}
2018 		return samline.rnameS();
2019 	}
2020 
2021 	/** Average based on summing quality scores */
avgQuality(boolean countUndefined, int maxBases)2022 	public double avgQuality(boolean countUndefined, int maxBases){
2023 		return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityDouble(countUndefined, maxBases) : avgQualityByScoreDouble(maxBases);
2024 	}
2025 
2026 	/** Average based on summing quality scores */
avgQualityInt(boolean countUndefined, int maxBases)2027 	public int avgQualityInt(boolean countUndefined, int maxBases){
2028 		return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityInt(countUndefined, maxBases) : avgQualityByScoreInt(maxBases);
2029 	}
2030 
2031 	/** Average based on summing error probabilities */
avgQualityByProbabilityInt(boolean countUndefined, int maxBases)2032 	public int avgQualityByProbabilityInt(boolean countUndefined, int maxBases){
2033 		if(bases==null || bases.length==0){return 0;}
2034 		return avgQualityByProbabilityInt(bases, quality, countUndefined, maxBases);
2035 	}
2036 
2037 	/** Average based on summing error probabilities */
avgQualityByProbabilityDouble(boolean countUndefined, int maxBases)2038 	public double avgQualityByProbabilityDouble(boolean countUndefined, int maxBases){
2039 		if(bases==null || bases.length==0){return 0;}
2040 		return avgQualityByProbabilityDouble(bases, quality, countUndefined, maxBases);
2041 	}
2042 
2043 	/** Average based on summing error probabilities */
probabilityErrorFree(boolean countUndefined, int maxBases)2044 	public double probabilityErrorFree(boolean countUndefined, int maxBases){
2045 		if(bases==null || bases.length==0){return 0;}
2046 		return probabilityErrorFree(bases, quality, countUndefined, maxBases);
2047 	}
2048 
2049 	/** Average based on summing error probabilities */
avgQualityByProbabilityInt(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2050 	public static int avgQualityByProbabilityInt(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
2051 		if(quality==null){return 40;}
2052 		if(quality.length==0){return 0;}
2053 		float e=expectedErrors(bases, quality, countUndefined, maxBases);
2054 		final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2055 		float p=e/div;
2056 		return QualityTools.probErrorToPhred(p);
2057 	}
2058 
2059 	/** Average based on summing error probabilities */
avgQualityByProbabilityDouble(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2060 	public static double avgQualityByProbabilityDouble(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
2061 		if(quality==null){return 40;}
2062 		if(quality.length==0){return 0;}
2063 		float e=expectedErrors(bases, quality, countUndefined, maxBases);
2064 		final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2065 		float p=e/div;
2066 		return QualityTools.probErrorToPhredDouble(p);
2067 	}
2068 
2069 	/** Average based on summing quality scores */
avgQualityByScoreInt(int maxBases)2070 	public int avgQualityByScoreInt(int maxBases){
2071 		if(bases==null || bases.length==0){return 0;}
2072 		if(quality==null){return 40;}
2073 		int x=0, limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2074 		for(int i=0; i<limit; i++){
2075 			byte b=quality[i];
2076 			x+=(b<0 ? 0 : b);
2077 		}
2078 		return x/limit;
2079 	}
2080 
2081 	/** Average based on summing quality scores */
avgQualityByScoreDouble(int maxBases)2082 	public double avgQualityByScoreDouble(int maxBases){
2083 		if(bases==null || bases.length==0){return 0;}
2084 		if(quality==null){return 40;}
2085 		int x=0, limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2086 		for(int i=0; i<limit; i++){
2087 			byte b=quality[i];
2088 			x+=(b<0 ? 0 : b);
2089 		}
2090 		return x/(double)limit;
2091 	}
2092 
2093 	/** Used by BBMap tipsearch. */
avgQualityFirstNBases(int n)2094 	public int avgQualityFirstNBases(int n){
2095 		if(bases==null || bases.length==0){return 0;}
2096 		if(quality==null || n<1){return 40;}
2097 		assert(quality!=null);
2098 		int x=0;
2099 		if(n>quality.length){return 0;}
2100 		for(int i=0; i<n; i++){
2101 			byte b=quality[i];
2102 			x+=(b<0 ? 0 : b);
2103 		}
2104 		return x/n;
2105 	}
2106 
2107 	/** Used by BBMap tipsearch. */
avgQualityLastNBases(int n)2108 	public int avgQualityLastNBases(int n){
2109 		if(bases==null || bases.length==0){return 0;}
2110 		if(quality==null || n<1){return 40;}
2111 		assert(quality!=null);
2112 		int x=0;
2113 		if(n>quality.length){return 0;}
2114 		for(int i=bases.length-n; i<bases.length; i++){
2115 			byte b=quality[i];
2116 			x+=(b<0 ? 0 : b);
2117 		}
2118 		return x/n;
2119 	}
2120 
2121 	/** Used by BBDuk. */
minQuality()2122 	public int minQuality(){
2123 		byte min=41;
2124 		if(bases!=null && quality!=null){
2125 			for(byte q : quality){
2126 				min=Tools.min(min, q);
2127 			}
2128 		}
2129 		return min;
2130 	}
2131 
2132 	/** Used by BBMap tipsearch. */
minQualityFirstNBases(int n)2133 	public byte minQualityFirstNBases(int n){
2134 		if(bases==null || bases.length==0){return 0;}
2135 		if(quality==null || n<1){return 41;}
2136 		assert(quality!=null && n>0);
2137 		if(n>quality.length){return 0;}
2138 		byte x=quality[0];
2139 		for(int i=1; i<n; i++){
2140 			byte b=quality[i];
2141 			if(b<x){x=b;}
2142 		}
2143 		return x;
2144 	}
2145 
2146 	/** Used by BBMap tipsearch. */
minQualityLastNBases(int n)2147 	public byte minQualityLastNBases(int n){
2148 		if(bases==null || bases.length==0){return 0;}
2149 		if(quality==null || n<1){return 41;}
2150 		assert(quality!=null && n>0);
2151 		if(n>quality.length){return 0;}
2152 		byte x=quality[bases.length-n];
2153 		for(int i=bases.length-n; i<bases.length; i++){
2154 			byte b=quality[i];
2155 			if(b<x){x=b;}
2156 		}
2157 		return x;
2158 	}
2159 
containsNonM()2160 	public boolean containsNonM(){
2161 		assert(match!=null && valid());
2162 		for(int i=0; i<match.length; i++){
2163 			byte b=match[i];
2164 			assert(b!='M');
2165 			if(b>'9' && b!='m'){return true;}
2166 		}
2167 		return false;
2168 	}
2169 
containsNonNM()2170 	public boolean containsNonNM(){
2171 		assert(match!=null && valid());
2172 		for(int i=0; i<match.length; i++){
2173 			byte b=match[i];
2174 			assert(b!='M');
2175 			if(b>'9' && b!='m' && b!='N'){return true;}
2176 		}
2177 		return false;
2178 	}
2179 
containsVariants()2180 	public boolean containsVariants(){
2181 		assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n";
2182 		for(int i=0; i<match.length; i++){
2183 			byte b=match[i];
2184 			assert(b!='M');
2185 			if(b>'9' && b!='m' && b!='N' && b!='C'){return true;}
2186 		}
2187 		return false;
2188 	}
2189 
containsClipping()2190 	public boolean containsClipping(){
2191 		assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n";
2192 		if(match.length<1){return false;}
2193 		if(match[0]=='C'){return true;}
2194 		for(int i=match.length-1; i>0; i--){
2195 			if(match[i]=='C'){return true;}
2196 			if(match[i]>'9'){break;}
2197 		}
2198 		return false;
2199 	}
2200 
2201 	/**
2202 	 * @return {m,S,C,N,I,D};
2203 	 */
countMatchSymbols()2204 	public int[] countMatchSymbols(){
2205 		int m=0, S=0, C=0, N=0, I=0, D=0;
2206 		int current=0;
2207 		byte last='?';
2208 		for(byte b : match){
2209 			if(Tools.isDigit(b)){
2210 				current=current*10+b-'0';
2211 			}else{
2212 				current=Tools.max(current, 1);
2213 				if(last=='m'){
2214 					m+=current;
2215 				}else if(last=='S'){
2216 					S+=current;
2217 				}else if(last=='C'){
2218 					C+=current;
2219 				}else if(last=='N'){
2220 					N+=current;
2221 				}else if(last=='I'){
2222 					I+=current;
2223 				}else if(last=='D'){
2224 					D+=current;
2225 				}else{
2226 					assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match);
2227 				}
2228 				current=0;
2229 				last=b;
2230 			}
2231 		}
2232 		current=Tools.max(current, 1);
2233 		if(last=='m'){
2234 			m+=current;
2235 		}else if(last=='S'){
2236 			S+=current;
2237 		}else if(last=='C'){
2238 			C+=current;
2239 		}else if(last=='N'){
2240 			N+=current;
2241 		}else if(last=='I'){
2242 			I+=current;
2243 		}else if(last=='D'){
2244 			D+=current;
2245 		}else{
2246 			assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match);
2247 		}
2248 		current=0;
2249 		return new int[] {m,S,C,N,I,D};
2250 	}
2251 
containsNonNMXY()2252 	public boolean containsNonNMXY(){
2253 		assert(match!=null && valid());
2254 		for(int i=0; i<match.length; i++){
2255 			byte b=match[i];
2256 			assert(b!='M');
2257 			if(b>'9' && b!='m' && b!='N' && b!='X' && b!='Y'){return true;}
2258 		}
2259 		return false;
2260 	}
2261 
containsSDI()2262 	public boolean containsSDI(){
2263 		assert(match!=null && valid());
2264 		for(int i=0; i<match.length; i++){
2265 			byte b=match[i];
2266 			assert(b!='M');
2267 			if(b=='S' || b=='s' || b=='D' || b=='I'){return true;}
2268 		}
2269 		return false;
2270 	}
2271 
containsNonNMS()2272 	public boolean containsNonNMS(){
2273 		assert(match!=null && valid());
2274 		for(int i=0; i<match.length; i++){
2275 			byte b=match[i];
2276 			assert(b!='M');
2277 			if(b>'9' && b!='m' && b!='s' && b!='N' && b!='S'){return true;}
2278 		}
2279 		return false;
2280 	}
2281 
containsConsecutiveS(int num)2282 	public boolean containsConsecutiveS(int num){
2283 		assert(match!=null && valid() && !shortmatch());
2284 		int cnt=0;
2285 		for(int i=0; i<match.length; i++){
2286 			byte b=match[i];
2287 			assert(b!='M');
2288 			if(b=='S'){
2289 				cnt++;
2290 				if(cnt>=num){return true;}
2291 			}else{
2292 				cnt=0;
2293 			}
2294 		}
2295 		return false;
2296 	}
2297 
containsIndels()2298 	public boolean containsIndels(){
2299 		assert(match!=null && valid());
2300 		for(int i=0; i<match.length; i++){
2301 			byte b=match[i];
2302 			if(b=='I' || b=='D' || b=='X' || b=='Y'){return true;}
2303 		}
2304 		return false;
2305 	}
2306 
countSubs()2307 	public int countSubs(){
2308 		assert(match!=null && valid()) : (match!=null)+", "+valid()+", "+shortmatch();
2309 		return countSubs(match);
2310 	}
2311 
containsInMatch(char c)2312 	public boolean containsInMatch(char c){
2313 		assert(match!=null && valid());
2314 		for(int i=0; i<match.length; i++){
2315 			byte b=match[i];
2316 			if(b==c){return true;}
2317 		}
2318 		return false;
2319 	}
2320 
containsNocalls()2321 	public boolean containsNocalls(){
2322 		for(int i=0; i<bases.length; i++){
2323 			byte b=bases[i];
2324 			if(b=='N'){return true;}
2325 		}
2326 		return false;
2327 	}
2328 
countNocalls()2329 	public int countNocalls(){
2330 		return countNocalls(bases);
2331 	}
2332 
countSubs(byte[] match)2333 	public static int countSubs(byte[] match){
2334 		int S=0;
2335 		int current=0;
2336 		byte last='?';
2337 		for(byte b : match){
2338 			if(Tools.isDigit(b)){
2339 				current=current*10+b-'0';
2340 			}else{
2341 				if(last=='S'){S+=Tools.max(1, current);}
2342 				current=0;
2343 				last=b;
2344 			}
2345 		}
2346 		if(last=='S'){S+=Tools.max(1, current);}
2347 //		assert(S==0) : S+"\t"+new String(match);
2348 		return S;
2349 //		int x=0;
2350 //		assert(match!=null);
2351 //		for(int i=0; i<match.length; i++){
2352 //			byte b=match[i];
2353 //			if(b=='S'){x++;}
2354 //			assert(!Tools.isDigit(b));
2355 //		}
2356 //		return x;
2357 	}
2358 
countVars(byte[] match)2359 	public static int countVars(byte[] match){
2360 		return countVars(match, true, true, true);
2361 	}
2362 
countVars(byte[] match, boolean sub, boolean ins, boolean del)2363 	public static int countVars(byte[] match, boolean sub, boolean ins, boolean del){
2364 		int S=0, I=0, D=0;
2365 		int current=0;
2366 		byte last='?';
2367 		for(byte b : match){
2368 			if(Tools.isDigit(b)){
2369 				current=current*10+b-'0';
2370 			}else{
2371 				if(last=='S'){S+=Tools.max(1, current);}
2372 				else if(last=='I'){I++;}
2373 				else if(last=='D'){D++;}
2374 				current=0;
2375 				last=b;
2376 			}
2377 		}
2378 		if(last=='S'){S+=Tools.max(1, current);}
2379 		else if(last=='I'){I++;}
2380 		else if(last=='D'){D++;}
2381 		return (sub ? S : 0)+(ins ? I : 0)+(del ? D : 0);
2382 	}
2383 
containsSubs(byte[] match)2384 	public static boolean containsSubs(byte[] match){
2385 		int x=0;
2386 		assert(match!=null);
2387 		for(int i=0; i<match.length; i++){
2388 			byte b=match[i];
2389 			if(b=='S'){return true;}
2390 		}
2391 		return false;
2392 	}
2393 
containsVars(byte[] match)2394 	public static boolean containsVars(byte[] match){
2395 		int x=0;
2396 		assert(match!=null);
2397 		for(int i=0; i<match.length; i++){
2398 			byte b=match[i];
2399 			if(b=='S' || b=='I' || b=='D'){return true;}
2400 		}
2401 		return false;
2402 	}
2403 
countNocalls(byte[] match)2404 	public static int countNocalls(byte[] match){
2405 		int n=0;
2406 		for(int i=0; i<match.length; i++){
2407 			byte b=match[i];
2408 			if(b=='N'){n++;}
2409 		}
2410 		return n;
2411 	}
2412 
countInsertions(byte[] match)2413 	public static int countInsertions(byte[] match){
2414 		int n=0;
2415 		for(int i=0; i<match.length; i++){
2416 			byte b=match[i];
2417 			if(b=='I'){n++;}
2418 		}
2419 		return n;
2420 	}
2421 
countDeletions(byte[] match)2422 	public static int countDeletions(byte[] match){
2423 		int n=0;
2424 		for(int i=0; i<match.length; i++){
2425 			byte b=match[i];
2426 			if(b=='D'){n++;}
2427 		}
2428 		return n;
2429 	}
2430 
countInsertionEvents(byte[] match)2431 	public static int countInsertionEvents(byte[] match){
2432 		int n=0;
2433 		byte prev='N';
2434 		for(int i=0; i<match.length; i++){
2435 			byte b=match[i];
2436 			if(b=='I' && prev!=b){n++;}
2437 			prev=b;
2438 		}
2439 		return n;
2440 	}
2441 
countDeletionEvents(byte[] match)2442 	public static int countDeletionEvents(byte[] match){
2443 		int n=0;
2444 		byte prev='N';
2445 		for(int i=0; i<match.length; i++){
2446 			byte b=match[i];
2447 			if(b=='D' && prev!=b){n++;}
2448 			prev=b;
2449 		}
2450 		return n;
2451 	}
2452 
containsNonACGTN()2453 	public boolean containsNonACGTN(){
2454 		if(bases==null){return false;}
2455 		for(byte b : bases){
2456 			if(AminoAcid.baseToNumberACGTN[b]<0){return true;}
2457 		}
2458 		return false;
2459 	}
2460 
containsUndefined()2461 	public boolean containsUndefined(){
2462 		if(bases==null){return false;}
2463 		final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino());
2464 		for(byte b : bases){
2465 			if(symbolToNumber[b]<0){return true;}
2466 		}
2467 		return false;
2468 	}
2469 
containsLowercase()2470 	public boolean containsLowercase(){
2471 		if(bases==null){return false;}
2472 		for(byte b : bases){
2473 			if(Tools.isLowerCase(b)){return true;}
2474 		}
2475 		return false;
2476 	}
2477 
countUndefined()2478 	public int countUndefined(){
2479 		if(bases==null){return 0;}
2480 		final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino());
2481 		int n=0;
2482 		for(byte b : bases){
2483 			if(symbolToNumber[b]<0){n++;}
2484 		}
2485 		return n;
2486 	}
2487 
hasMinConsecutiveBases(final int min)2488 	public boolean hasMinConsecutiveBases(final int min){
2489 		if(bases==null){return min<=0;}
2490 		final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino());
2491 		int len=0;
2492 		for(byte b : bases){
2493 			if(symbolToNumber[b]<0){len=0;}
2494 			else{
2495 				len++;
2496 				if(len>=min){return true;}
2497 			}
2498 		}
2499 		return false;
2500 	}
2501 
2502 
2503 	/**
2504 	 * @return The number of occurrences of the rarest base.
2505 	 */
minBaseCount()2506 	public int minBaseCount(){
2507 		if(bases==null){return 0;}
2508 		int a=0, c=0, g=0, t=0;
2509 		for(byte b : bases){
2510 			if(b=='A'){a++;}
2511 			else if(b=='C'){c++;}
2512 			else if(b=='G'){g++;}
2513 			else if(b=='T'){t++;}
2514 		}
2515 		return Tools.min(a, c, g, t);
2516 	}
2517 
containsXY()2518 	public boolean containsXY(){
2519 		assert(match!=null && valid());
2520 		return containsXY(match);
2521 	}
2522 
containsXY(byte[] match)2523 	public static boolean containsXY(byte[] match){
2524 		if(match==null){return false;}
2525 		for(int i=0; i<match.length; i++){
2526 			byte b=match[i];
2527 			if(b=='X' || b=='Y'){return true;}
2528 		}
2529 		return false;
2530 	}
2531 
containsXY2()2532 	public boolean containsXY2(){
2533 		if(match==null || match.length<1){return false;}
2534 		boolean b=(match[0]=='X' || match[match.length-1]=='Y');
2535 		assert(!valid() || b==containsXY());
2536 		return b;
2537 	}
2538 
containsXYC()2539 	public boolean containsXYC(){
2540 		if(match==null || match.length<1){return false;}
2541 		boolean b=(match[0]=='X' || match[match.length-1]=='Y');
2542 		assert(!valid() || b==containsXY());
2543 		return b || match[0]=='C' || match[match.length-1]=='C';
2544 	}
2545 
2546 	/** Replaces 'B' in match string with 'S', 'm', or 'N' */
fixMatchB()2547 	public boolean fixMatchB(){
2548 		assert(match!=null);
2549 		final ChromosomeArray ca;
2550 		if(Data.GENOME_BUILD>=0){
2551 			ca=Data.getChromosome(chrom);
2552 		}else{
2553 			ca=null;
2554 		}
2555 		boolean originallyShort=shortmatch();
2556 		if(originallyShort){match=toLongMatchString(match);}
2557 		int mloc=0, cloc=0, rloc=start;
2558 		for(; mloc<match.length; mloc++){
2559 			byte m=match[mloc];
2560 
2561 			if(m=='B'){
2562 				byte r=(ca==null ? (byte)'?' : ca.get(rloc));
2563 				byte c=bases[cloc];
2564 				if(r=='N' || c=='N'){
2565 					match[mloc]='N';
2566 				}else if(r==c || Tools.toUpperCase(r)==Tools.toUpperCase(c)){
2567 					match[mloc]='m';
2568 				}else{
2569 					if(ca==null){
2570 						if(originallyShort){
2571 							match=toShortMatchString(match);
2572 						}
2573 						for(int i=0; i<match.length; i++){
2574 							if(match[i]=='B'){match[i]='N';}
2575 						}
2576 						return false;
2577 					}
2578 					match[mloc]='S';
2579 				}
2580 				cloc++;
2581 				rloc++;
2582 			}else if(m=='m' || m=='S' || m=='N' || m=='s' || m=='C'){
2583 				cloc++;
2584 				rloc++;
2585 			}else if(m=='D'){
2586 				rloc++;
2587 			}else if(m=='I' || m=='X' || m=='Y'){
2588 				cloc++;
2589 			}
2590 		}
2591 		if(originallyShort){match=toShortMatchString(match);}
2592 		return true;
2593 	}
2594 
expectedTipErrors(boolean countUndefined, int maxBases)2595 	public float expectedTipErrors(boolean countUndefined, int maxBases){
2596 		return expectedTipErrors(bases, quality, countUndefined, maxBases);
2597 	}
2598 
expectedErrorsIncludingMate(boolean countUndefined)2599 	public float expectedErrorsIncludingMate(boolean countUndefined){
2600 		float a=expectedErrors(countUndefined, length());
2601 		float b=(mate==null ? 0 : mate.expectedErrors(countUndefined, mate.length()));
2602 		return a+b;
2603 	}
2604 
expectedErrors(boolean countUndefined, int maxBases)2605 	public float expectedErrors(boolean countUndefined, int maxBases){
2606 		return expectedErrors(bases, quality, countUndefined, maxBases);
2607 	}
2608 
probabilityErrorFree(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2609 	public static float probabilityErrorFree(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
2610 		if(quality==null){return 0;}
2611 		final int limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2612 		final float[] array=QualityTools.PROB_CORRECT;
2613 		assert(array[0]>0 && array[0]<1);
2614 		float product=1;
2615 		for(int i=0; i<limit; i++){
2616 			byte b=bases[i];
2617 			byte q=quality[i];
2618 			if(AminoAcid.isFullyDefined(b)){
2619 				product*=array[q];
2620 			}else if(countUndefined){
2621 				return 0;
2622 			}
2623 		}
2624 		return product;
2625 	}
2626 
expectedErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2627 	public static float expectedErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
2628 		if(quality==null){return 0;}
2629 		final int limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2630 		final float[] array=QualityTools.PROB_ERROR;
2631 		assert(array[0]>0 && array[0]<1);
2632 		float sum=0;
2633 		for(int i=0; i<limit; i++){
2634 			byte b=bases[i];
2635 			boolean d=AminoAcid.isFullyDefined(b);
2636 			//assert((quality[i]==0)==d) : "Q="+quality[i]+" for base "+(char)b;
2637 			if(d || countUndefined){
2638 				byte q=(d ? quality[i] : 0);
2639 				sum+=array[q];
2640 			}
2641 		}
2642 		return sum;
2643 	}
2644 
2645 	/** Runs backwards instead of forwards */
expectedTipErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2646 	public static float expectedTipErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){
2647 		if(quality==null){return 0;}
2648 		final int limit;
2649 		{
2650 			final int limit0=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length));
2651 			limit=quality.length-limit0;
2652 		}
2653 		final float[] array=QualityTools.PROB_ERROR;
2654 		assert(array[0]>0 && array[0]<1);
2655 		float sum=0;
2656 		for(int i=quality.length-1; i>=limit; i--){
2657 			byte b=bases[i];
2658 			byte q=quality[i];
2659 			if(AminoAcid.isFullyDefined(b)){
2660 				sum+=array[q];
2661 			}else{
2662 				assert(q==0);
2663 				if(countUndefined){sum+=0.75f;}
2664 			}
2665 		}
2666 		return sum;
2667 	}
2668 
estimateErrors()2669 	public int estimateErrors() {
2670 		if(quality==null){return 0;}
2671 		assert(match!=null) : this.toText(false);
2672 
2673 		int count=0;
2674 		for(int ci=0, mi=0; ci<bases.length && mi<match.length; mi++){
2675 
2676 //			byte b=bases[ci];
2677 			byte q=quality[ci];
2678 			byte m=match[mi];
2679 			if(m=='m' || m=='s' || m=='N'){
2680 				ci++;
2681 			}else if(m=='X' || m=='Y'){
2682 				ci++;
2683 				count++;
2684 			}else if(m=='I'){
2685 				ci++;
2686 			}else if(m=='D'){
2687 
2688 			}else if(m=='S'){
2689 				ci++;
2690 				if(q<19){
2691 					count++;
2692 				}
2693 			}
2694 
2695 		}
2696 		return count;
2697 	}
2698 
2699 	/** {M, S, D, I, N, splice} */
countErrors(int minSplice)2700 	public int[] countErrors(int minSplice) {
2701 		assert(match!=null) : this.toText(false);
2702 		int m=0;
2703 		int s=0;
2704 		int d=0;
2705 		int i=0;
2706 		int n=0;
2707 		int splice=0;
2708 
2709 		byte prev=' ';
2710 		int streak=0;
2711 		minSplice=Tools.max(minSplice, 1);
2712 
2713 		for(int pos=0; pos<match.length; pos++){
2714 			final byte b=match[pos];
2715 
2716 			if(b==prev){streak++;}else{streak=1;}
2717 
2718 			if(b=='m'){
2719 				m++;
2720 			}else if(b=='N' || b=='C'){
2721 				n++;
2722 			}else if(b=='X' || b=='Y'){
2723 				i++;
2724 			}else if(b=='I'){
2725 				i++;
2726 			}else if(b=='D'){
2727 				d++;
2728 				if(streak==minSplice){splice++;}
2729 			}else if(b=='S'){
2730 				s++;
2731 			}else{
2732 				if(Tools.isDigit(b) && shortmatch()){
2733 					System.err.println("Warning! Found read in shortmatch form during countErrors():\n"+this); //Usually caused by verbose output.
2734 					if(mate!=null){System.err.println("mate:\n"+mate.id+"\t"+new String(mate.bases));}
2735 					System.err.println("Stack trace: ");
2736 					new Exception().printStackTrace();
2737 					match=toLongMatchString(match);
2738 					setShortMatch(false);
2739 					return countErrors(minSplice);
2740 				}else{
2741 					throw new RuntimeException("\nUnknown symbol "+(char)b+":\n"+new String(match)+"\n"+this+"\nshortmatch="+this.shortmatch());
2742 				}
2743 			}
2744 
2745 			prev=b;
2746 		}
2747 
2748 //		assert(i==0) : i+"\n"+this+"\n"+new String(match)+"\n"+Arrays.toString(new int[] {m, s, d, i, n, splice});
2749 
2750 		return new int[] {m, s, d, i, n, splice};
2751 	}
2752 
isShortMatchString(byte[] match)2753 	public static boolean isShortMatchString(byte[] match){
2754 		byte last=' ';
2755 		int streak=0;
2756 		for(int i=0; i<match.length; i++){
2757 			byte b=match[i];
2758 			if(Tools.isDigit(b)){return true;}
2759 			if(b==last){
2760 				streak++;
2761 				if(streak>3){return true;}
2762 			}else{
2763 				streak=0;
2764 				last=b;
2765 			}
2766 		}
2767 		return false;
2768 	}
2769 
toShortMatchString(boolean doAssertion)2770 	public void toShortMatchString(boolean doAssertion){
2771 		if(shortmatch()){
2772 			assert(!doAssertion);
2773 			return;
2774 		}
2775 		match=toShortMatchString(match);
2776 		setShortMatch(true);
2777 	}
2778 
toShortMatchString(byte[] match)2779 	public static byte[] toShortMatchString(byte[] match){
2780 		if(match==null){return null;}
2781 		assert(match.length>0);
2782 		ByteBuilder sb=new ByteBuilder(10);
2783 
2784 		byte prev=match[0];
2785 		int count=1;
2786 		for(int i=1; i<match.length; i++){
2787 			byte m=match[i];
2788 			assert(Tools.isLetter(m) || m==0) : new String(match);
2789 			if(m==0){System.err.println("Warning! Converting empty match string to short form.");}
2790 			if(m==prev){count++;}
2791 			else{
2792 				sb.append(prev);
2793 				if(count>1){sb.append(count);}
2794 //				else if(count==2){sb.append(prev);}
2795 				prev=m;
2796 				count=1;
2797 			}
2798 		}
2799 		sb.append(prev);
2800 		if(count>1){sb.append(count);}
2801 //		else if(count==2){sb.append(prev);}
2802 
2803 		byte[] r=sb.toBytes();
2804 		return r;
2805 	}
2806 
toLongMatchString(boolean doAssertion)2807 	public void toLongMatchString(boolean doAssertion){
2808 		if(!shortmatch()){
2809 			assert(!doAssertion);
2810 			return;
2811 		}
2812 		match=toLongMatchString(match);
2813 		setShortMatch(false);
2814 	}
2815 
toLongMatchString(byte[] shortmatch)2816 	public static byte[] toLongMatchString(byte[] shortmatch){
2817 		if(shortmatch==null){return null;}
2818 		assert(shortmatch.length>0);
2819 
2820 		int count=0;
2821 		int current=0;
2822 		for(int i=0; i<shortmatch.length; i++){
2823 			byte m=shortmatch[i];
2824 			if(Tools.isLetter(m)){
2825 				count++;
2826 				count+=(current>0 ? current-1 : 0);
2827 				current=0;
2828 			}else{
2829 				assert(Tools.isDigit(m));
2830 				current=(current*10)+(m-48); //48 == '0'
2831 			}
2832 		}
2833 		count+=(current>0 ? current-1 : 0);
2834 
2835 
2836 		byte[] r=new byte[count];
2837 		current=0;
2838 		byte lastLetter='?';
2839 		int j=0;
2840 		for(int i=0; i<shortmatch.length; i++){
2841 			byte m=shortmatch[i];
2842 			if(Tools.isLetter(m)){
2843 				while(current>1){
2844 					r[j]=lastLetter;
2845 					current--;
2846 					j++;
2847 				}
2848 				current=0;
2849 
2850 				r[j]=m;
2851 				j++;
2852 				lastLetter=m;
2853 			}else{
2854 				assert(Tools.isDigit(m));
2855 				current=(current*10)+(m-48); //48 == '0'
2856 			}
2857 		}
2858 		while(current>1){
2859 			r[j]=lastLetter;
2860 			current--;
2861 			j++;
2862 		}
2863 
2864 		assert(r[r.length-1]>0);
2865 		return r;
2866 	}
2867 
parseCustomRname()2868 	public String parseCustomRname(){
2869 		assert(id.startsWith("SYN")) : "Can't parse header "+id;
2870 		return new Header(id, pairnum()).rname;
2871 	}
2872 
2873 	/** Bases of the read. */
2874 	public byte[] bases;
2875 
2876 	/** Quality of the read. */
2877 	public byte[] quality;
2878 
2879 	/** Alignment string.  E.G. mmmmDDDmmm would have 4 matching bases, then a 3-base deletion, then 3 matching bases. */
2880 	public byte[] match;
2881 
2882 	public int[] gaps;
2883 
name()2884 	public String name() {return id;}
2885 	public String id;
2886 	public long numericID;
2887 	public int chrom;
2888 	public int start;
2889 	public int stop;
2890 
2891 	public int copies=1;
2892 
2893 	/** Errors detected */
2894 	public int errors=0;
2895 
2896 	/** Alignment score from BBMap.  Assumed to max at approx 100*bases.length */
2897 	public int mapScore=0;
2898 
2899 	public ArrayList<SiteScore> sites;
2900 	public SiteScore originalSite; //Origin site for synthetic reads
2901 	public Object obj=null;//Don't set this to a SamLine; it's for other things.
2902 	public SamLine samline=null;
2903 	public Read mate;
2904 
2905 	public int flags;
2906 
2907 	/** -1 if invalid.  TODO: Currently not retained through most processes. */
2908 	private int insert=-1;
2909 
2910 	/** A random number for deterministic usage.
2911 	 * May decrease speed in multithreaded applications.
2912 	 */
2913 	public double rand=-1;
2914 
time()2915 	public long time(){
2916 		assert(obj!=null && obj.getClass()==Long.class) : obj;
2917 		return ((Long)obj).longValue();
2918 	}
2919 	/** Number of bases in this pair, including the mate if present. */
pairLength()2920 	public int pairLength(){return length()+mateLength();}
2921 	/** Number of bases in this pair, including the mate if present. */
numPairKmers(int k)2922 	public int numPairKmers(int k){return numKmers(k)+numMateKmers(k);}
2923 	/** Number of reads in this pair.  Returns 1 if the read has no mate, and 2 if it does. */
pairCount()2924 	public int pairCount(){return 1+mateCount();}
length()2925 	public int length(){return bases==null ? 0 : bases.length;}
numKmers(int k)2926 	public int numKmers(int k){return bases==null ? 0 : Tools.max(0, bases.length-k+1);}
qlength()2927 	public int qlength(){return quality==null ? 0 : quality.length;}
mateLength()2928 	public int mateLength(){return mate==null ? 0 : mate.length();}
numMateKmers(int k)2929 	public int numMateKmers(int k){return mate==null ? 0 : mate.numKmers(k);}
mateId()2930 	public String mateId(){return mate==null ? null : mate.id;}
mateCount()2931 	public int mateCount(){return mate==null ? 0 : 1;}
mateMapped()2932 	public boolean mateMapped(){return mate==null ? false : mate.mapped();}
countMateBytes()2933 	public long countMateBytes(){return mate==null ? 0 : mate.countBytes();}
countMateFastqBytes()2934 	public long countMateFastqBytes(){return mate==null ? 0 : mate.countFastqBytes();}
2935 	/** Number of bytes this read pair uses in memory, approximately */
countPairBytes()2936 	public long countPairBytes(){return countBytes()+(mate==null ? 0 : mate.countBytes());}
2937 
2938 	/** Number of bytes this read uses in memory, approximately */
countBytes()2939 	public long countBytes(){
2940 		long sum=129; //Approximate per-read overhead
2941 		sum+=(bases==null ? 0 : bases.length+16);
2942 		sum+=(quality==null ? 0 : quality.length+16);
2943 		sum+=(id==null ? 0 : id.length()*2+16);
2944 		sum+=(match==null ? 0 : match.length+16);
2945 		sum+=(samline==null ? 0 : samline.countBytes());
2946 		return sum;
2947 	}
2948 
2949 	/** Number of bytes this read uses in on disk in Fastq format */
countFastqBytes()2950 	public long countFastqBytes(){
2951 		long sum=6;//4 newlines, +, @
2952 		sum+=(bases==null ? 0 : bases.length);
2953 		sum+=(quality==null ? 0 : quality.length);
2954 		sum+=(id==null ? 0 : id.length());
2955 		return sum;
2956 	}
2957 
countLeft(final char base)2958 	public int countLeft(final char base){return countLeft((byte)base);}
countRight(final char base)2959 	public int countRight(final char base){return countRight((byte)base);}
2960 
countLeft(final byte base)2961 	public int countLeft(final byte base){
2962 		for(int i=0; i<bases.length; i++){
2963 			final byte b=bases[i];
2964 			if(b!=base){return i;}
2965 		}
2966 		return bases.length;
2967 	}
2968 
countRight(final byte base)2969 	public int countRight(final byte base){
2970 		for(int i=bases.length-1; i>=0; i--){
2971 			final byte b=bases[i];
2972 			if(b!=base){return bases.length-i-1;}
2973 		}
2974 		return bases.length;
2975 	}
2976 
untrim()2977 	public boolean untrim(){
2978 		if(obj==null || obj.getClass()!=TrimRead.class){return false;}
2979 		((TrimRead)obj).untrim();
2980 		obj=null;
2981 		return true;
2982 	}
2983 
trailingLowerCase()2984 	public int trailingLowerCase(){
2985 		for(int i=bases.length-1; i>=0;){
2986 			if(Tools.isLowerCase(bases[i])){
2987 				i--;
2988 			}else{
2989 				return bases.length-i-1;
2990 			}
2991 		}
2992 		return bases.length;
2993 	}
leadingLowerCase()2994 	public int leadingLowerCase(){
2995 		for(int i=0; i<bases.length; i++){
2996 			if(!Tools.isLowerCase(bases[i])){return i;}
2997 		}
2998 		return bases.length;
2999 	}
3000 
strandChar()3001 	public char strandChar(){return Shared.strandCodes2[strand()];}
strand()3002 	public byte strand(){return (byte)(flags&1);}
mapped()3003 	public boolean mapped(){return (flags&MAPPEDMASK)==MAPPEDMASK;}
paired()3004 	public boolean paired(){return (flags&PAIREDMASK)==PAIREDMASK;}
synthetic()3005 	public boolean synthetic(){return (flags&SYNTHMASK)==SYNTHMASK;}
ambiguous()3006 	public boolean ambiguous(){return (flags&AMBIMASK)==AMBIMASK;}
perfect()3007 	public boolean perfect(){return (flags&PERFECTMASK)==PERFECTMASK;}
3008 //	public boolean semiperfect(){return perfect() ? true : list!=null && list.size()>0 ? list.get(0).semiperfect : false;} //TODO: This is a hack.  Add a semiperfect flag.
rescued()3009 	public boolean rescued(){return (flags&RESCUEDMASK)==RESCUEDMASK;}
discarded()3010 	public boolean discarded(){return (flags&DISCARDMASK)==DISCARDMASK;}
invalid()3011 	public boolean invalid(){return (flags&INVALIDMASK)==INVALIDMASK;}
swapped()3012 	public boolean swapped(){return (flags&SWAPMASK)==SWAPMASK;}
shortmatch()3013 	public boolean shortmatch(){return (flags&SHORTMATCHMASK)==SHORTMATCHMASK;}
insertvalid()3014 	public boolean insertvalid(){return (flags&INSERTMASK)==INSERTMASK;}
hasAdapter()3015 	public boolean hasAdapter(){return (flags&ADAPTERMASK)==ADAPTERMASK;}
secondary()3016 	public boolean secondary(){return (flags&SECONDARYMASK)==SECONDARYMASK;}
aminoacid()3017 	public boolean aminoacid(){return (flags&AAMASK)==AAMASK;}
amino()3018 	public boolean amino(){return (flags&AAMASK)==AAMASK;}
junk()3019 	public boolean junk(){return (flags&JUNKMASK)==JUNKMASK;}
validated()3020 	public boolean validated(){return (flags&VALIDATEDMASK)==VALIDATEDMASK;}
tested()3021 	public boolean tested(){return (flags&TESTEDMASK)==TESTEDMASK;}
invertedRepeat()3022 	public boolean invertedRepeat(){return (flags&IRMASK)==IRMASK;}
trimmed()3023 	public boolean trimmed(){return (flags&TRIMMEDMASK)==TRIMMEDMASK;}
3024 
3025 	/** For paired ends: 0 for read1, 1 for read2 */
pairnum()3026 	public int pairnum(){return (flags&PAIRNUMMASK)>>PAIRNUMSHIFT;}
valid()3027 	public boolean valid(){return !invalid();}
3028 
getFlag(int mask)3029 	public boolean getFlag(int mask){return (flags&mask)==mask;}
flagToNumber(int mask)3030 	public int flagToNumber(int mask){return (flags&mask)==mask ? 1 : 0;}
3031 
setFlag(int mask, boolean b)3032 	public void setFlag(int mask, boolean b){
3033 		flags=(flags&~mask);
3034 		if(b){flags|=mask;}
3035 	}
3036 
setStrand(int b)3037 	public void setStrand(int b){
3038 		assert(b==1 || b==0);
3039 		flags=(flags&(~1))|b;
3040 	}
3041 
3042 	/** For paired ends: 0 for read1, 1 for read2 */
setPairnum(int b)3043 	public void setPairnum(int b){
3044 //		System.err.println("Setting pairnum to "+b+" for "+id);
3045 //		assert(!id.equals("2_chr1_0_1853883_1853982_1845883_ecoli_K12") || b==1);
3046 		assert(b==1 || b==0);
3047 		flags=(flags&(~PAIRNUMMASK))|(b<<PAIRNUMSHIFT);
3048 //		assert(pairnum()==b);
3049 	}
3050 
setPaired(boolean b)3051 	public void setPaired(boolean b){
3052 		flags=(flags&~PAIREDMASK);
3053 		if(b){flags|=PAIREDMASK;}
3054 	}
3055 
setSynthetic(boolean b)3056 	public void setSynthetic(boolean b){
3057 		flags=(flags&~SYNTHMASK);
3058 		if(b){flags|=SYNTHMASK;}
3059 	}
3060 
setAmbiguous(boolean b)3061 	public void setAmbiguous(boolean b){
3062 		flags=(flags&~AMBIMASK);
3063 		if(b){flags|=AMBIMASK;}
3064 	}
3065 
setPerfectFlag(int maxScore)3066 	public boolean setPerfectFlag(int maxScore){
3067 		final SiteScore ss=topSite();
3068 		if(ss==null){
3069 			setPerfect(false);
3070 		}else{
3071 			assert(ss.slowScore<=maxScore) : maxScore+", "+ss.slowScore+", "+ss.toText();
3072 
3073 			if(ss.slowScore==maxScore || ss.perfect){
3074 				assert(testMatchPerfection(true)) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n";
3075 				setPerfect(true);
3076 			}else{
3077 				boolean flag=testMatchPerfection(false);
3078 				setPerfect(flag);
3079 				assert(flag || !ss.perfect) : "flag="+flag+", ss.perfect="+ss.perfect+"\nmatch="+new String(match)+"\n"+this.toText(false);
3080 				assert(!flag || ss.slowScore>=maxScore) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n";
3081 			}
3082 		}
3083 		return perfect();
3084 	}
3085 
testMatchPerfection(boolean returnIfNoMatch)3086 	private boolean testMatchPerfection(boolean returnIfNoMatch){
3087 		if(match==null){return returnIfNoMatch;}
3088 		boolean flag=(match.length==bases.length);
3089 		if(shortmatch()){
3090 			flag=(match.length==0 || match[0]=='m');
3091 			for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m' || Tools.isDigit(match[i]));}
3092 		}else{
3093 			for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m');}
3094 		}
3095 		for(int i=0; i<bases.length && flag; i++){flag=(bases[i]!='N');}
3096 		return flag;
3097 	}
3098 
3099 	/**
3100 	 * @return GC fraction
3101 	 */
gc()3102 	public float gc() {
3103 		if(bases==null || bases.length<1){return 0;}
3104 		int at=0, gc=0;
3105 		for(byte b : bases){
3106 			int x=AminoAcid.baseToNumber[b];
3107 			if(x>-1){
3108 				if(x==0 || x==3){at++;}
3109 				else{gc++;}
3110 			}
3111 		}
3112 		if(gc<1){return 0;}
3113 		return gc*1f/(at+gc);
3114 	}
3115 
3116 	/**
3117 	 * @param swapFrom
3118 	 * @param swapTo
3119 	 * @return number of swaps
3120 	 */
swapBase(byte swapFrom, byte swapTo)3121 	public int swapBase(byte swapFrom, byte swapTo) {
3122 		if(bases==null){return 0;}
3123 		int swaps=0;
3124 		for(int i=0; i<bases.length; i++){
3125 			if(bases[i]==swapFrom){
3126 				bases[i]=swapTo;
3127 				swaps++;
3128 			}
3129 		}
3130 		return swaps;
3131 	}
3132 
3133 	/**
3134 	 * @param remap Table of new values
3135 	 */
remap(byte[] remap)3136 	public void remap(byte[] remap) {
3137 		if(bases==null){return;}
3138 		for(int i=0; i<bases.length; i++){
3139 			bases[i]=remap[bases[i]];
3140 		}
3141 	}
3142 
3143 	/**
3144 	 * @param remap Table of new values
3145 	 */
remapAndCount(byte[] remap)3146 	public int remapAndCount(byte[] remap) {
3147 		if(bases==null){return 0;}
3148 		int swaps=0;
3149 		for(int i=0; i<bases.length; i++){
3150 			byte a=bases[i];
3151 			byte b=remap[a];
3152 			if(a!=b){
3153 				bases[i]=b;
3154 				swaps++;
3155 			}
3156 		}
3157 		return swaps;
3158 	}
3159 
convertUndefinedTo(byte b)3160 	public int convertUndefinedTo(byte b){
3161 		if(bases==null){return 0;}
3162 		int changed=0;
3163 		for(int i=0; i<bases.length; i++){
3164 			if(b<0 || AminoAcid.baseToNumberACGTN[bases[i]]<0){
3165 				changed++;
3166 				bases[i]=b;
3167 				if(quality!=null){quality[i]=0;}
3168 			}
3169 		}
3170 		return changed;
3171 	}
3172 
swapBasesWithMate()3173 	public void swapBasesWithMate(){
3174 		if(mate==null){
3175 			assert(false);
3176 			return;
3177 		}
3178 		byte[] temp=bases;
3179 		bases=mate.bases;
3180 		mate.bases=temp;
3181 		temp=quality;
3182 		quality=mate.quality;
3183 		mate.quality=temp;
3184 	}
3185 
insert()3186 	public int insert(){
3187 		return insertvalid() ? insert : -1;
3188 	}
3189 
insertSizeMapped(boolean ignoreStrand)3190 	public int insertSizeMapped(boolean ignoreStrand){
3191 		return insertSizeMapped(this, mate, ignoreStrand);
3192 	}
3193 
insertSizeMapped(Read r1, Read r2, boolean ignoreStrand)3194 	public static int insertSizeMapped(Read r1, Read r2, boolean ignoreStrand){
3195 //		assert(false) : ignoreStrand+", "+(r2==null)+", "+(r1.mapped())+", "+(r2.mapped())+", "+(r1.strand()==r2.strand())+", "+r1.strand()+", "+r2.strand();
3196 		if(ignoreStrand || r2==null || !r1.mapped() || !r2.mapped() || r1.strand()==r2.strand()){return insertSizeMapped_Unstranded(r1, r2);}
3197 		return insertSizeMapped_PlusLeft(r1, r2);
3198 	}
3199 
3200 	/** TODO: This is not correct when the insert is shorter than a read's bases with same-strand reads */
insertSizeMapped_PlusLeft(Read r1, Read r2)3201 	public static int insertSizeMapped_PlusLeft(Read r1, Read r2){
3202 		if(r1.strand()>r2.strand()){return insertSizeMapped_PlusLeft(r2, r1);}
3203 		if(r1.strand()==r2.strand() || r1.start>r2.stop){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left.
3204 //		if(!mapped() || !mate.mapped()){return 0;}
3205 		if(r1.chrom!=r2.chrom){return 0;}
3206 		if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //???
3207 
3208 		int a=r1.length();
3209 		int b=r2.length();
3210 		int mid=r2.start-r1.stop-1;
3211 		if(-mid>=a+b){return insertSizeMapped_Unstranded(r1, r2);} //Not properly oriented; plus read is to the right of minus read
3212 		return mid+a+b;
3213 	}
3214 
insertSizeMapped_Unstranded(Read r1, Read r2)3215 	public static int insertSizeMapped_Unstranded(Read r1, Read r2){
3216 		if(r2==null){return r1.start==r1.stop ? 0 : r1.stop-r1.start+1;}
3217 
3218 		if(r1.start>r2.start){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left side.
3219 
3220 //		if(!mapped() || !mate.mapped()){return 0;}
3221 		if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //???
3222 
3223 		if(r1.chrom!=r2.chrom){return 0;}
3224 		int a=r1.length();
3225 		int b=r2.length();
3226 		if(false && Tools.overlap(r1.start, r1.stop, r2.start, r2.stop)){
3227 			//This does not handle very short inserts
3228 			return Tools.max(r1.stop, r2.stop)-Tools.min(r1.start, r2.start)+1;
3229 
3230 		}else{
3231 			if(r1.start<r2.start){
3232 				int mid=r2.start-r1.stop-1;
3233 //				assert(false) : mid+", "+a+", "+b;
3234 //				if(-mid>a && -mid>b){return Tools.min(a, b);} //Strange situation, no way to guess insert size
3235 				if(-mid>=a+b){return 0;} //Strange situation, no way to guess insert size
3236 				return mid+a+b;
3237 			}else{
3238 				assert(r1.start==r2.start);
3239 				return Tools.min(a, b);
3240 			}
3241 		}
3242 	}
3243 
insertSizeOriginalSite()3244 	public int insertSizeOriginalSite(){
3245 		if(mate==null){
3246 //			System.err.println("A: "+(originalSite==null ? "null" : (originalSite.stop-originalSite.start+1)));
3247 			return (originalSite==null ? -1 : originalSite.stop-originalSite.start+1);
3248 		}
3249 
3250 		final SiteScore ssa=originalSite, ssb=mate.originalSite;
3251 		final int x;
3252 		if(ssa==null || ssb==null){
3253 //			System.err.println("B: 0");
3254 			x=0;
3255 		}else{
3256 			x=insertSize(ssa, ssb, bases.length, mate.length());
3257 		}
3258 
3259 		assert(pairnum()>=mate.pairnum() || x==mate.insertSizeOriginalSite());
3260 		return x;
3261 	}
3262 
insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb)3263 	public static int insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb){
3264 		return insertSize(ssa.chrom, ssb.chrom, ssa.start, ssb.start, ssa.stop, ssb.stop, lena, lenb);
3265 	}
3266 
insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb)3267 	public static int insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb){
3268 
3269 		final int x;
3270 
3271 		//		if(mate==null || ){return bases==null ? 0 : bases.length;}
3272 		if(chroma!=chromb){x=0;}
3273 		else{
3274 
3275 			if(Tools.overlap(starta, stopa, startb, stopb)){
3276 				x=Tools.max(stopa, stopb)-Tools.min(starta, startb)+1;
3277 //				System.err.println("C: "+x);
3278 			}else{
3279 				if(starta<=startb){
3280 					int mid=startb-stopa-1;
3281 					//				assert(false) : mid+", "+a+", "+b;
3282 					x=mid+lena+lenb;
3283 //					System.err.println("D: "+x);
3284 				}else{
3285 					int mid=starta-stopb-1;
3286 					//				assert(false) : mid+", "+a+", "+b;
3287 					x=mid+lena+lenb;
3288 //					System.err.println("E: "+x);
3289 				}
3290 			}
3291 		}
3292 		return x;
3293 	}
3294 
subRead(int from, int to)3295 	public Read subRead(int from, int to){
3296 		Read r=this.clone();
3297 		r.bases=KillSwitch.copyOfRange(bases, from, to);
3298 		r.quality=(quality==null ? null : KillSwitch.copyOfRange(quality, from, to));
3299 		r.mate=null;
3300 //		assert(Tools.indexOf(r.bases, (byte)'J')<0);
3301 		return r;
3302 	}
3303 
joinRead()3304 	public Read joinRead(){
3305 		if(insert<1 || mate==null || !insertvalid()){return this;}
3306 		assert(insert>9 || bases.length<20) : "Perhaps old read format is being used?  This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n";
3307 		return joinRead(this, mate, insert);
3308 	}
3309 
joinRead(int x)3310 	public Read joinRead(int x){
3311 		if(x<1 || mate==null){return this;}
3312 		assert(x>9 || bases.length<20) : "Perhaps old read format is being used?  This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n";
3313 		return joinRead(this, mate, x);
3314 	}
3315 
joinRead(Read a, Read b, int insert)3316 	public static Read joinRead(Read a, Read b, int insert){
3317 		assert(a!=null && b!=null && insert>0);
3318 		final int lengthSum=a.length()+b.length();
3319 		final int overlap=Tools.min(insert, lengthSum-insert);
3320 
3321 //		System.err.println(insert);
3322 		final byte[] bases=new byte[insert], abases=a.bases, bbases=b.bases;
3323 		final byte[] aquals=a.quality, bquals=b.quality;
3324 		final byte[] quals=(aquals==null || bquals==null ? null : new byte[insert]);
3325 		assert(aquals==null || (aquals.length==abases.length && bquals.length==bbases.length));
3326 
3327 		int mismatches=0;
3328 
3329 		int start, stop;
3330 
3331 		if(overlap<=0){//Simple join in which there is no overlap
3332 			int lim=insert-b.length();
3333 			if(quals==null){
3334 				for(int i=0; i<a.length(); i++){
3335 					bases[i]=abases[i];
3336 				}
3337 				for(int i=a.length(); i<lim; i++){
3338 					bases[i]='N';
3339 				}
3340 				for(int i=0; i<b.length(); i++){
3341 					bases[i+lim]=bbases[i];
3342 				}
3343 			}else{
3344 				for(int i=0; i<a.length(); i++){
3345 					bases[i]=abases[i];
3346 					quals[i]=aquals[i];
3347 				}
3348 				for(int i=a.length(); i<lim; i++){
3349 					bases[i]='N';
3350 					quals[i]=0;
3351 				}
3352 				for(int i=0; i<b.length(); i++){
3353 					bases[i+lim]=bbases[i];
3354 					quals[i+lim]=bquals[i];
3355 				}
3356 			}
3357 
3358 			start=Tools.min(a.start, b.start);
3359 //			stop=start+insert-1;
3360 			stop=Tools.max(a.stop, b.stop);
3361 
3362 //		}else if(insert>=a.length() && insert>=b.length()){ //Overlapped join, proper orientation
3363 //			final int lim1=a.length()-overlap;
3364 //			final int lim2=a.length();
3365 //			for(int i=0; i<lim1; i++){
3366 //				bases[i]=abases[i];
3367 //				quals[i]=aquals[i];
3368 //			}
3369 //			for(int i=lim1, j=0; i<lim2; i++, j++){
3370 //				assert(false) : "TODO";
3371 //				bases[i]='N';
3372 //				quals[i]=0;
3373 //			}
3374 //			for(int i=lim2, j=overlap; i<bases.length; i++, j++){
3375 //				bases[i]=bbases[j];
3376 //				quals[i]=bquals[j];
3377 //			}
3378 		}else{ //reads go off ends of molecule.
3379 			if(quals==null){
3380 				for(int i=0; i<a.length() && i<bases.length; i++){
3381 					bases[i]=abases[i];
3382 				}
3383 				for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){
3384 					byte ca=bases[i], cb=bbases[j];
3385 					if(ca==0 || ca=='N'){
3386 						bases[i]=cb;
3387 					}else if(ca==cb){
3388 					}else{
3389 						bases[i]=(ca>=cb ? ca : cb);
3390 						if(ca!='N' && cb!='N'){mismatches++;}
3391 					}
3392 				}
3393 			}else{
3394 				for(int i=0; i<a.length() && i<bases.length; i++){
3395 					bases[i]=abases[i];
3396 					quals[i]=aquals[i];
3397 				}
3398 				for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){
3399 					byte ca=bases[i], cb=bbases[j];
3400 					byte qa=quals[i], qb=bquals[j];
3401 					if(ca==0 || ca=='N'){
3402 						bases[i]=cb;
3403 						quals[i]=qb;
3404 					}else if(cb==0 || cb=='N'){
3405 						//do nothing
3406 					}else if(ca==cb){
3407 						quals[i]=(byte)Tools.min((Tools.max(qa, qb)+Tools.min(qa, qb)/4), MAX_MERGE_QUALITY);
3408 					}else{
3409 						bases[i]=(qa>qb ? ca : qa<qb ? cb : (byte)'N');
3410 						quals[i]=(byte)(Tools.max(qa, qb)-Tools.min(qa, qb));
3411 						if(ca!='N' && cb!='N'){mismatches++;}
3412 					}
3413 				}
3414 			}
3415 
3416 			if(a.strand()==0){
3417 				start=a.start;
3418 //				stop=start+insert-1;
3419 				stop=b.stop;
3420 			}else{
3421 				stop=a.stop;
3422 //				start=stop-insert+1;
3423 				start=b.start;
3424 			}
3425 			if(start>stop){
3426 				start=Tools.min(a.start, b.start);
3427 				stop=Tools.max(a.stop, b.stop);
3428 			}
3429 		}
3430 //		assert(mismatches>=countMismatches(a, b, insert, 999));
3431 //		System.err.println(mismatches);
3432 		if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){start=stop=a.chrom=0;}
3433 
3434 //		System.err.println(bases.length+", "+start+", "+stop);
3435 
3436 		Read r=new Read(bases, null, a.id, a.numericID, a.flags, a.chrom, start, stop);
3437 		r.quality=quals; //This prevents quality from getting capped.
3438 		if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){r.setMapped(true);}
3439 		r.setInsert(insert);
3440 		r.setPaired(false);
3441 		r.copies=a.copies;
3442 		r.mapScore=a.mapScore+b.mapScore;
3443 		if(overlap<=0){
3444 			r.mapScore=a.mapScore+b.mapScore;
3445 			r.errors=a.errors+b.errors;
3446 			//TODO r.gaps=?
3447 		}else{//Hard to calculate
3448 			r.mapScore=(int)((a.mapScore*(long)a.length()+b.mapScore*(long)b.length())/insert);
3449 			r.errors=a.errors;
3450 		}
3451 
3452 
3453 		assert(r.insertvalid()) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
3454 		assert(r.insert()==r.length()) : r.insert()+"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
3455 //		assert(false) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
3456 
3457 		//TODO: Triggered by BBMerge in useratio mode for some reason.
3458 //		assert(Shared.anomaly || (a.insertSizeMapped(false)>0 == r.insertSizeMapped(false)>0)) :
3459 //			"\n"+r.length()+"\n"+r.insert()+"\n"+r.insertSizeMapped(false)+"\n"+a.insert()+"\n"+a.insertSizeMapped(false)+
3460 //			"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n";
3461 
3462 		return r;
3463 	}
3464 
3465 	/**
3466 	 * @param minlen
3467 	 * @param maxlen
3468 	 * @return A list of read fragments
3469 	 */
split(int minlen, int maxlen)3470 	public ArrayList<Read> split(int minlen, int maxlen) {
3471 		int len=bases==null ? 0 : bases.length;
3472 		if(len<minlen){return null;}
3473 		int parts=(len+maxlen-1)/maxlen;
3474 		ArrayList<Read> subreads=new ArrayList<Read>(parts);
3475 		if(len<=maxlen){
3476 			subreads.add(this);
3477 		}else{
3478 			float ideal=Tools.max(minlen, len/(float)parts);
3479 			int actual=(int)ideal;
3480 			assert(false) : "TODO"; //Some assertion goes here, I forget what
3481 			for(int i=0; i<parts; i++){
3482 				int a=i*actual;
3483 				int b=(i+1)*actual;
3484 				if(b>bases.length){b=bases.length;}
3485 //				if(b-a<)
3486 				byte[] subbases=KillSwitch.copyOfRange(bases, a, b);
3487 				byte[] subquals=(quality==null ? null : KillSwitch.copyOfRange(quality, a, b+1));
3488 				Read r=new Read(subbases, subquals, id+"_"+i, numericID, flags);
3489 				subreads.add(r);
3490 			}
3491 		}
3492 		return subreads;
3493 	}
3494 
3495 	/** Generate and return an array of canonical kmers for this read */
toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer)3496 	public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) {
3497 		if(gap>0){throw new RuntimeException("Gapped reads: TODO");}
3498 		if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);}
3499 		if(bases==null || bases.length<k+gap){return null;}
3500 
3501 		final int arraylen=bases.length-k+1;
3502 		if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
3503 		Arrays.fill(kmers, -1);
3504 
3505 		final int shift=2*k;
3506 		final int shift2=shift-2;
3507 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
3508 		long kmer=0, rkmer=0;
3509 		int len=0;
3510 
3511 		for(int i=0; i<bases.length; i++){
3512 			byte b=bases[i];
3513 			long x=AminoAcid.baseToNumber[b];
3514 			long x2=AminoAcid.baseToComplementNumber[b];
3515 			kmer=((kmer<<2)|x)&mask;
3516 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
3517 			if(x<0){len=0; rkmer=0;}else{len++;}
3518 			if(len>=k){
3519 				kmers[i-k+1]=makeCanonical ? Tools.max(kmer, rkmer) : kmer;
3520 			}
3521 		}
3522 		return kmers;
3523 	}
3524 
3525 //	/** Generate and return an array of canonical kmers for this read */
3526 //	public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) {
3527 //		if(gap>0){throw new RuntimeException("Gapped reads: TODO");}
3528 //		if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);}
3529 //		if(bases==null || bases.length<k+gap){return null;}
3530 //
3531 //		final int kbits=2*k;
3532 //		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
3533 //
3534 //		int len=0;
3535 //		long kmer=0;
3536 //		final int arraylen=bases.length-k+1;
3537 //		if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
3538 //		Arrays.fill(kmers, -1);
3539 //
3540 //		for(int i=0; i<bases.length; i++){
3541 //			byte b=bases[i];
3542 //			int x=AminoAcid.baseToNumber[b];
3543 //			if(x<0){
3544 //				len=0;
3545 //				kmer=0;
3546 //			}else{
3547 //				kmer=((kmer<<2)|x)&mask;
3548 //				len++;
3549 //
3550 //				if(len>=k){
3551 //					kmers[i-k+1]=kmer;
3552 //				}
3553 //			}
3554 //		}
3555 //
3556 ////		System.out.println(new String(bases));
3557 ////		System.out.println(Arrays.toString(kmers));
3558 //
3559 //		if(makeCanonical){
3560 //			this.reverseComplement();
3561 //			len=0;
3562 //			kmer=0;
3563 //			for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
3564 //				byte b=bases[i];
3565 //				int x=AminoAcid.baseToNumber[b];
3566 //				if(x<0){
3567 //					len=0;
3568 //					kmer=0;
3569 //				}else{
3570 //					kmer=((kmer<<2)|x)&mask;
3571 //					len++;
3572 //
3573 //					if(len>=k){
3574 //						assert(kmer==AminoAcid.reverseComplementBinaryFast(kmers[j], k));
3575 //						kmers[j]=Tools.max(kmers[j], kmer);
3576 //					}
3577 //				}
3578 //			}
3579 //			this.reverseComplement();
3580 //
3581 ////			System.out.println(Arrays.toString(kmers));
3582 //		}
3583 //
3584 //
3585 //		return kmers;
3586 //	}
3587 
3588 	/** Generate and return an array of canonical kmers for this read */
toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer)3589 	public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer) {
3590 		assert(k>31) : k;
3591 		assert(makeCanonical);
3592 		if(bases==null || bases.length<k){return null;}
3593 		kmer.clear();
3594 
3595 		final int arraylen=bases.length-k+1;
3596 		if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
3597 		Arrays.fill(kmers, -1);
3598 
3599 		for(int i=0; i<bases.length; i++){
3600 			byte b=bases[i];
3601 			kmer.addRight(b);
3602 			if(!AminoAcid.isFullyDefined(b)){kmer.clear();}
3603 			if(kmer.len>=k){
3604 				kmers[i-k+1]=kmer.xor();
3605 			}
3606 		}
3607 
3608 		return kmers;
3609 	}
3610 
3611 //	/** Generate and return an array of canonical kmers for this read */
3612 //	public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer longkmer) {
3613 //		assert(k>31) : k;
3614 //		if(bases==null || bases.length<k){return null;}
3615 //
3616 //		final int kbits=2*k;
3617 //		final long mask=Long.MAX_VALUE;
3618 //
3619 //		int len=0;
3620 //		long kmer=0;
3621 //		final int arraylen=bases.length-k+1;
3622 //		if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];}
3623 //		Arrays.fill(kmers, -1);
3624 //
3625 //
3626 //		final int tailshift=k%32;
3627 //		final int tailshiftbits=tailshift*2;
3628 //
3629 //		for(int i=0; i<bases.length; i++){
3630 //			byte b=bases[i];
3631 //			int x=AminoAcid.baseToNumber[b];
3632 //			if(x<0){
3633 //				len=0;
3634 //				kmer=0;
3635 //			}else{
3636 //				kmer=Long.rotateLeft(kmer, 2);
3637 //				kmer=kmer^x;
3638 //				len++;
3639 //
3640 //				if(len>=k){
3641 //					long x2=AminoAcid.baseToNumber[bases[i-k]];
3642 //					kmer=kmer^(x2<<tailshiftbits);
3643 //					kmers[i-k+1]=kmer;
3644 //				}
3645 //			}
3646 //		}
3647 //		if(makeCanonical){
3648 //			this.reverseComplement();
3649 //			len=0;
3650 //			kmer=0;
3651 //			for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
3652 //				byte b=bases[i];
3653 //				int x=AminoAcid.baseToNumber[b];
3654 //				if(x<0){
3655 //					len=0;
3656 //					kmer=0;
3657 //				}else{
3658 //					kmer=Long.rotateLeft(kmer, 2);
3659 //					kmer=kmer^x;
3660 //					len++;
3661 //
3662 //					if(len>=k){
3663 //						long x2=AminoAcid.baseToNumber[bases[i-k]];
3664 //						kmer=kmer^(x2<<tailshiftbits);
3665 //						kmers[j]=mask&(Tools.max(kmers[j], kmer));
3666 //					}
3667 //				}
3668 //			}
3669 //			this.reverseComplement();
3670 //		}else{
3671 //			assert(false) : "Long kmers should be made canonical here because they cannot be canonicized later.";
3672 //		}
3673 //
3674 //		return kmers;
3675 //	}
3676 
CHECKSITES(Read r, byte[] basesM)3677 	public static final boolean CHECKSITES(Read r, byte[] basesM){
3678 		return CHECKSITES(r.sites, r.bases, basesM, r.numericID, true);
3679 	}
3680 
CHECKSITES(Read r, byte[] basesM, boolean verifySorted)3681 	public static final boolean CHECKSITES(Read r, byte[] basesM, boolean verifySorted){
3682 		return CHECKSITES(r.sites, r.bases, basesM, r.numericID, verifySorted);
3683 	}
3684 
CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id)3685 	public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id){
3686 		return CHECKSITES(list, basesP, basesM, id, true);
3687 	}
3688 
CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id, boolean verifySorted)3689 	public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id, boolean verifySorted){
3690 		return true; //Temporarily disabled
3691 //		if(list==null || list.isEmpty()){return true;}
3692 //		SiteScore prev=null;
3693 //		for(int i=0; i<list.size(); i++){
3694 //			SiteScore ss=list.get(i);
3695 //			if(ss.strand==Gene.MINUS && basesM==null && basesP!=null){basesM=AminoAcid.reverseComplementBases(basesP);}
3696 //			byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM);
3697 //			if(verbose){System.err.println("Checking site "+i+": "+ss);}
3698 //			boolean b=CHECKSITE(ss, bases, id);
3699 //			assert(b) : id+"\n"+new String(basesP)+"\n"+ss+"\n";
3700 //			if(verbose){System.err.println("Checked site "+i+" = "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect);}
3701 //			if(!b){
3702 ////				System.err.println("Error at SiteScore "+i+": ss.p="+ss.perfect+", ss.sp="+ss.semiperfect);
3703 //				return false;
3704 //			}
3705 //			if(verifySorted && prev!=null && ss.score>prev.score){
3706 //				if(verbose){System.err.println("verifySorted failed.");}
3707 //				return false;
3708 //			}
3709 //			prev=ss;
3710 //		}
3711 //		return true;
3712 	}
3713 
3714 	/** Makes sure 'bases' is for correct strand. */
CHECKORDER(ArrayList<SiteScore> list)3715 	public static final boolean CHECKORDER(ArrayList<SiteScore> list){
3716 		if(list==null || list.size()<2){return true;}
3717 		SiteScore prev=list.get(0);
3718 		for(int i=0; i<list.size(); i++){
3719 			SiteScore ss=list.get(i);
3720 			if(ss.score>prev.score){return false;}
3721 			prev=ss;
3722 		}
3723 		return true;
3724 	}
3725 
3726 	/** Makes sure 'bases' is for correct strand. */
CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id)3727 	public static final boolean CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id){
3728 		return CHECKSITE(ss, ss.plus() ? basesP : basesM, id);
3729 	}
3730 
3731 	/** Make sure 'bases' is for correct strand! */
CHECKSITE(SiteScore ss, byte[] bases, long id)3732 	public static final boolean CHECKSITE(SiteScore ss, byte[] bases, long id){
3733 		return true; //Temporarily disabled
3734 //		if(ss==null){return true;}
3735 ////		System.err.println("Checking site "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect+", bases="+new String(bases));
3736 //		if(ss.perfect){assert(ss.semiperfect) : ss+"\n"+new String(bases);}
3737 //		if(ss.gaps!=null){
3738 //			if(ss.gaps[0]!=ss.start || ss.gaps[ss.gaps.length-1]!=ss.stop){return false;}
3739 ////			assert(ss.gaps[0]==ss.start && ss.gaps[ss.gaps.length-1]==ss.stop);
3740 //		}
3741 //
3742 //		if(!(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore)){
3743 //			System.err.println("Site paired score violation: "+ss.quickScore+", "+ss.slowScore+", "+ss.pairedScore);
3744 //			return false;
3745 //		}
3746 //
3747 //		final boolean xy=ss.matchContainsXY();
3748 //		if(bases!=null){
3749 //
3750 //			final boolean p0=ss.perfect;
3751 //			final boolean sp0=ss.semiperfect;
3752 //			final boolean p1=ss.isPerfect(bases);
3753 //			final boolean sp1=(p1 ? true : ss.isSemiPerfect(bases));
3754 //
3755 //			assert(p0==p1 || (xy && p1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+
3756 //				"\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n";
3757 //			assert(sp0==sp1 || (xy && sp1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+
3758 //				"\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n";
3759 //
3760 ////			ss.setPerfect(bases, false);
3761 //
3762 //			assert(p0==ss.perfect) :
3763 //				p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+
3764 //				Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n";
3765 //			assert(sp0==ss.semiperfect) :
3766 //				p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+
3767 //				Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n";
3768 //			if(ss.perfect){assert(ss.semiperfect);}
3769 //		}
3770 //		if(ss.match!=null && ss.matchLength()!=ss.mappedLength()){
3771 //			if(verbose){System.err.println("Returning false because matchLength!=mappedLength:\n"+ss.matchLength()+", "+ss.mappedLength()+"\n"+ss);}
3772 //			return false;
3773 //		}
3774 //		return true;
3775 	}
3776 
setPerfect(boolean b)3777 	public void setPerfect(boolean b){
3778 		flags=(flags&~PERFECTMASK);
3779 		if(b){flags|=PERFECTMASK;}
3780 	}
3781 
setRescued(boolean b)3782 	public void setRescued(boolean b){
3783 		flags=(flags&~RESCUEDMASK);
3784 		if(b){flags|=RESCUEDMASK;}
3785 	}
3786 
setMapped(boolean b)3787 	public void setMapped(boolean b){
3788 		flags=(flags&~MAPPEDMASK);
3789 		if(b){flags|=MAPPEDMASK;}
3790 	}
3791 
setDiscarded(boolean b)3792 	public void setDiscarded(boolean b){
3793 		flags=(flags&~DISCARDMASK);
3794 		if(b){flags|=DISCARDMASK;}
3795 	}
3796 
setInvalid(boolean b)3797 	public void setInvalid(boolean b){
3798 		flags=(flags&~INVALIDMASK);
3799 		if(b){flags|=INVALIDMASK;}
3800 	}
3801 
setSwapped(boolean b)3802 	public void setSwapped(boolean b){
3803 		flags=(flags&~SWAPMASK);
3804 		if(b){flags|=SWAPMASK;}
3805 	}
3806 
setShortMatch(boolean b)3807 	public void setShortMatch(boolean b){
3808 		flags=(flags&~SHORTMATCHMASK);
3809 		if(b){flags|=SHORTMATCHMASK;}
3810 	}
3811 
setInsertValid(boolean b)3812 	public void setInsertValid(boolean b){
3813 		flags=(flags&~INSERTMASK);
3814 		if(b){flags|=INSERTMASK;}
3815 	}
3816 
setHasAdapter(boolean b)3817 	public void setHasAdapter(boolean b){
3818 		flags=(flags&~ADAPTERMASK);
3819 		if(b){flags|=ADAPTERMASK;}
3820 	}
3821 
setSecondary(boolean b)3822 	public void setSecondary(boolean b){
3823 		flags=(flags&~SECONDARYMASK);
3824 		if(b){flags|=SECONDARYMASK;}
3825 	}
3826 
setAminoAcid(boolean b)3827 	public void setAminoAcid(boolean b){
3828 		flags=(flags&~AAMASK);
3829 		if(b){flags|=AAMASK;}
3830 	}
3831 
setJunk(boolean b)3832 	public void setJunk(boolean b){
3833 		flags=(flags&~JUNKMASK);
3834 		if(b){flags|=JUNKMASK;}
3835 	}
3836 
setValidated(boolean b)3837 	public void setValidated(boolean b){
3838 		flags=(flags&~VALIDATEDMASK);
3839 		if(b){flags|=VALIDATEDMASK;}
3840 	}
3841 
setTested(boolean b)3842 	public void setTested(boolean b){
3843 		flags=(flags&~TESTEDMASK);
3844 		if(b){flags|=TESTEDMASK;}
3845 	}
3846 
setInvertedRepeat(boolean b)3847 	public void setInvertedRepeat(boolean b){
3848 		flags=(flags&~IRMASK);
3849 		if(b){flags|=IRMASK;}
3850 	}
3851 
setTrimmed(boolean b)3852 	public void setTrimmed(boolean b){
3853 		flags=(flags&~TRIMMEDMASK);
3854 		if(b){flags|=TRIMMEDMASK;}
3855 	}
3856 
setInsert(int x)3857 	public void setInsert(int x){
3858 		if(x<1){x=-1;}
3859 //		assert(x==-1 || x>9 || length()<20) : x+", "+length(); //Invalid assertion for synthetic reads.
3860 		insert=x;
3861 		setInsertValid(x>0);
3862 		if(mate!=null){
3863 			mate.insert=x;
3864 			mate.setInsertValid(x>0);
3865 		}
3866 	}
3867 
makeMaskArray(int max)3868 	private static int[] makeMaskArray(int max) {
3869 		int[] r=new int[max+1];
3870 		for(int i=0; i<r.length; i++){r[i]=(1<<i);}
3871 		return r;
3872 	}
3873 
3874 
3875 
getFakeQuality(int len)3876 	public static byte[] getFakeQuality(int len){
3877 		if(len>=QUALCACHE.length){
3878 			byte[] r=KillSwitch.allocByte1D(len);
3879 			Arrays.fill(r, (byte)30);
3880 			return r;
3881 		}
3882 		if(QUALCACHE[len]==null){
3883 			synchronized(QUALCACHE){
3884 				if(QUALCACHE[len]==null){
3885 					QUALCACHE[len]=KillSwitch.allocByte1D(len);
3886 					Arrays.fill(QUALCACHE[len], (byte)30);
3887 				}
3888 			}
3889 		}
3890 		return QUALCACHE[len];
3891 	}
3892 
getScaffoldName(boolean requireSingleScaffold)3893 	public byte[] getScaffoldName(boolean requireSingleScaffold){
3894 		byte[] name=null;
3895 		if(mapped()){
3896 			if(!requireSingleScaffold || Data.isSingleScaffold(chrom, start, stop)){
3897 				int idx=Data.scaffoldIndex(chrom, (start+stop)/2);
3898 				name=Data.scaffoldNames[chrom][idx];
3899 //				int scaflen=Data.scaffoldLengths[chrom][idx];
3900 //				a1=Data.scaffoldRelativeLoc(chrom, start, idx);
3901 //				b1=a1-start1+stop1;
3902 			}
3903 		}
3904 		return name;
3905 	}
3906 
bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC)3907 	public void bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC){
3908 		for(int i=0; i<bases.length; i++){
3909 			final int x=AminoAcid.baseToNumber[bases[i]];
3910 			if(x==0 && AtoG){bases[i]='G';}
3911 			else if(x==1 && CtoT){bases[i]='T';}
3912 			else if(x==2 && GtoA){bases[i]='A';}
3913 			else if(x==3 && TtoC){bases[i]='C';}
3914 		}
3915 	}
3916 
copy()3917 	public Read copy(){
3918 		Read r=clone();
3919 		r.bases=(r.bases==null ? null : r.bases.clone());
3920 		r.quality=(r.quality==null ? null : r.quality.clone());
3921 		r.match=(r.match==null ? null : r.match.clone());
3922 		r.gaps=(r.gaps==null ? null : r.gaps.clone());
3923 		r.originalSite=(r.originalSite==null ? null : r.originalSite.clone());
3924 		r.sites=(ArrayList<SiteScore>) (r.sites==null ? null : r.sites.clone());
3925 		r.mate=null;
3926 
3927 		if(r.sites!=null){
3928 			for(int i=0; i<r.sites.size(); i++){
3929 				r.sites.set(i, r.sites.get(i).clone());
3930 			}
3931 		}
3932 		return r;
3933 	}
3934 
3935 	@Override
clone()3936 	public Read clone(){
3937 		try {
3938 			return (Read) super.clone();
3939 		} catch (CloneNotSupportedException e) {
3940 			// TODO Auto-generated catch block
3941 			e.printStackTrace();
3942 		}
3943 		throw new RuntimeException();
3944 	}
3945 
3946 	/**
3947 	 * @return This protein in canonical nucleotide space.
3948 	 */
aminoToNucleic()3949 	public Read aminoToNucleic() {
3950 		assert(aminoacid()) : "This read is not flagged as an amino acid sequence.";
3951 		Read r=this.clone();
3952 		r.setAminoAcid(false);
3953 		r.bases=AminoAcid.toNTs(r.bases);
3954 		if(quality!=null){
3955 			byte[] ntquals=new byte[r.quality.length*3];
3956 			for(int i=0; i<quality.length; i++){
3957 				byte q=quality[i];
3958 				byte q2=(byte)Tools.min(q+5, MAX_CALLED_QUALITY);
3959 				ntquals[i]=ntquals[i+1]=ntquals[i+2]=q2;
3960 			}
3961 			r.quality=ntquals;
3962 		}
3963 		return r;
3964 	}
3965 
3966 	private static final byte[][] QUALCACHE=new byte[1000][];
3967 
3968 
3969 	public static final int STRANDMASK=1;
3970 	public static final int MAPPEDMASK=(1<<1);
3971 	public static final int PAIREDMASK=(1<<2);
3972 	public static final int PERFECTMASK=(1<<3);
3973 	public static final int AMBIMASK=(1<<4);
3974 	public static final int RESCUEDMASK=(1<<5);
3975 //	public static final int COLORMASK=(1<<6); //TODO:  Change to semiperfectmask?
3976 	public static final int SYNTHMASK=(1<<7);
3977 	public static final int DISCARDMASK=(1<<8);
3978 	public static final int INVALIDMASK=(1<<9);
3979 	public static final int SWAPMASK=(1<<10);
3980 	public static final int SHORTMATCHMASK=(1<<11);
3981 
3982 	public static final int PAIRNUMSHIFT=12;
3983 	public static final int PAIRNUMMASK=(1<<PAIRNUMSHIFT);
3984 
3985 	public static final int INSERTMASK=(1<<13);
3986 	public static final int ADAPTERMASK=(1<<14);
3987 	public static final int SECONDARYMASK=(1<<15);
3988 	public static final int AAMASK=(1<<16);
3989 	public static final int JUNKMASK=(1<<17);
3990 	public static final int VALIDATEDMASK=(1<<18);
3991 	public static final int TESTEDMASK=(1<<19);
3992 	public static final int IRMASK=(1<<20);
3993 	public static final int TRIMMEDMASK=(1<<21);
3994 
3995 	private static final int[] maskArray=makeMaskArray(22); //Be sure this is big enough for all flags!
3996 
3997 	public static boolean TO_UPPER_CASE=false;
3998 	public static boolean LOWER_CASE_TO_N=false;
3999 	public static boolean DOT_DASH_X_TO_N=false;
4000 	public static boolean AVERAGE_QUALITY_BY_PROBABILITY=true;
4001 	public static boolean FIX_HEADER=false;
4002 	public static boolean ALLOW_NULL_HEADER=false;
4003 	public static boolean SKIP_SLOW_VALIDATION=false;
4004 	public static final boolean VALIDATE_BRANCHLESS=true;
4005 
4006 	public static final int IGNORE_JUNK=0;
4007 	public static final int FLAG_JUNK=1;
4008 	public static final int FIX_JUNK=2;
4009 	public static final int CRASH_JUNK=3;
4010 	public static final int FIX_JUNK_AND_IUPAC=4;
4011 	public static int JUNK_MODE=CRASH_JUNK;
4012 	public static boolean IUPAC_TO_N=false;
4013 
4014 	public static boolean U_TO_T=false;
4015 	public static boolean COMPRESS_MATCH_BEFORE_WRITING=true;
4016 	public static boolean DECOMPRESS_MATCH_ON_LOAD=true; //Set to false for some applications, like sorting, perhaps
4017 
4018 	public static boolean ADD_BEST_SITE_TO_LIST_FROM_TEXT=true;
4019 	public static boolean NULLIFY_BROKEN_QUALITY=false;
4020 	public static boolean TOSS_BROKEN_QUALITY=false;
4021 	public static boolean FLAG_BROKEN_QUALITY=false;
4022 	public static boolean FLAT_IDENTITY=true;
4023 	public static boolean VALIDATE_IN_CONSTRUCTOR=true;
4024 
4025 	public static boolean verbose=false;
4026 
4027 	/*--------------------------------------------------------------*/
4028 
4029 	/** Offset for quality scores, normally 33, or 64 for some old Illumina data */
4030 	private static final byte ASCII_OFFSET=33;
4031 
4032 	/** Allow quality scores to be changed, typically for corrupt Illumina data */
4033 	public static boolean CHANGE_QUALITY=true; //Cap all quality values between MIN_CALLED_QUALITY and MAX_CALLED_QUALITY
4034 
4035 	/** Minimum allowed quality score for called (ACGT) bases */
4036 	private static byte MIN_CALLED_QUALITY=2;
4037 
4038 	/** Maximum allowed quality score for called (ACGT) bases */
4039 	private static byte MAX_CALLED_QUALITY=41;
4040 
4041 	/** Maximum allowed quality score for merged reads, which otherwise would normally be very high */
4042 	public static byte MAX_MERGE_QUALITY=41;
4043 	public static byte[] qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY);
4044 
MIN_CALLED_QUALITY()4045 	public static byte MIN_CALLED_QUALITY(){return MIN_CALLED_QUALITY;}
MAX_CALLED_QUALITY()4046 	public static byte MAX_CALLED_QUALITY(){return MAX_CALLED_QUALITY;}
4047 
setMaxCalledQuality(int x)4048 	public static void setMaxCalledQuality(int x){
4049 		x=Tools.mid(1, x, 93);
4050 		if(x!=MAX_CALLED_QUALITY){
4051 			MAX_CALLED_QUALITY=(byte)x;
4052 			qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY);
4053 		}
4054 	}
4055 
setMinCalledQuality(int x)4056 	public static void setMinCalledQuality(int x){
4057 		x=Tools.mid(0, x, 93);
4058 		if(x!=MIN_CALLED_QUALITY){
4059 			MIN_CALLED_QUALITY=(byte)x;
4060 			qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY);
4061 		}
4062 	}
4063 
capQuality(long q)4064 	public static byte capQuality(long q){
4065 		return (byte)Tools.mid(MIN_CALLED_QUALITY, q, MAX_CALLED_QUALITY);
4066 	}
4067 
capQuality(byte q)4068 	public static byte capQuality(byte q){
4069 		return qMap[q];
4070 	}
4071 
capQuality(byte q, byte b)4072 	public static byte capQuality(byte q, byte b){
4073 		return AminoAcid.isFullyDefined(b) ? qMap[q] : 0;
4074 	}
4075 
makeQmap(byte min, byte max)4076 	private static byte[] makeQmap(byte min, byte max){
4077 		byte[] r=(qMap==null ? new byte[128] : qMap);
4078 		for(int i=0; i<r.length; i++){
4079 			r[i]=(byte) Tools.mid(min, i, max);
4080 		}
4081 		return r;
4082 	}
4083 }
4084