package stream; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import align2.GapTools; import align2.QualityTools; import dna.AminoAcid; import dna.ChromosomeArray; import dna.Data; import shared.KillSwitch; import shared.Shared; import shared.Tools; import shared.TrimRead; import structures.ByteBuilder; import ukmer.Kmer; public final class Read implements Comparable, Cloneable, Serializable{ /** * */ private static final long serialVersionUID = -1026645233407290096L; public static void main(String[] args){ byte[] a=args[0].getBytes(); System.out.println(new String(a)); byte[] b=toShortMatchString(a); System.out.println(new String(b)); byte[] c=toLongMatchString(b); System.out.println(new String(c)); byte[] d=toLongMatchString(c); System.out.println(new String(d)); // byte[] e=toShortMatchString(b); // System.out.println(new String(e)); } public Read(byte[] bases_, byte[] quals_, long id_){ this(bases_, quals_, Long.toString(id_), id_); } public Read(byte[] bases_, byte[] quals_, String name_, long id_){ this(bases_, quals_, name_, id_, 0, -1, -1, -1); } public Read(byte[] bases_, byte[] quals_, String name_, long id_, int flag_){ this(bases_, quals_, name_, id_, flag_, -1, -1, -1); } public Read(byte[] s_, byte[] quals_, long id_, int chrom_, int start_, int stop_, byte strand_){ this(s_, quals_, Long.toString(id_), id_, (int)strand_, chrom_, start_, stop_); } // public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, byte strand_, int chrom_, int start_, int stop_){ // this(bases_, quals_, id_, numericID_, (int)strand_, chrom_, start_, stop_); // assert(strand_==0 || strand_==1); // assert(start_<=stop_) : chrom_+", "+start_+", "+stop_+", "+numericID_; // } /** Note that strand can be used as flag */ public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, int flags_, int chrom_, int start_, int stop_){ flags=flags_&~VALIDATEDMASK; bases=bases_; quality=quals_; chrom=chrom_; start=start_; stop=stop_; id=id_; numericID=numericID_; // assert(amino()) : Shared.AMINO_IN;//123 if(VALIDATE_IN_CONSTRUCTOR){validate(true);} } /*--------------------------------------------------------------*/ /*---------------- Validation ----------------*/ /*--------------------------------------------------------------*/ public boolean validate(final boolean processAssertions){ assert(!validated()); // if(false){//This causes problems with error-corrected PacBio reads. // boolean x=(quality==null || quality.length<1 || quality[0]<=80 || !FASTQ.DETECT_QUALITY || FASTQ.IGNORE_BAD_QUALITY); // if(!x){ // if(processAssertions){ // KillSwitch.kill("Quality value ("+quality[0]+") appears too high.\n"+Arrays.toString(quality)+ // "\n"+Arrays.toString(bases)+"\n"+numericID+"\n"+id+"\n"+FASTQ.ASCII_OFFSET); // } // return false; // } // } if(bases==null){ quality=null; //I could require this to be true if(FIX_HEADER){fixHeader(processAssertions);} setValidated(true); return true; } validateQualityLength(processAssertions); final boolean passesJunk; // assert(false) : SKIP_SLOW_VALIDATION+","+VALIDATE_BRANCHLESS+","+JUNK_MODE; if(SKIP_SLOW_VALIDATION){ passesJunk=true; }else if(!aminoacid()){ if(U_TO_T){uToT();} if(VALIDATE_BRANCHLESS){ passesJunk=validateCommonCase_branchless(processAssertions); }else{ passesJunk=validateCommonCase(processAssertions); } }else{ // if(U_TO_T){uToT();} This is amino, so... fixCase(); passesJunk=validateJunk(processAssertions); if(CHANGE_QUALITY){fixQuality();} } if(FIX_HEADER){fixHeader(processAssertions);} setValidated(true); return true; } public boolean checkQuality(){ if(quality==null){return true;} for(int i=0; quality!=null && i "+num); if(num<0){ if(JUNK_MODE==FIX_JUNK){ bases[i]=nocall; }else if(JUNK_MODE==FLAG_JUNK){ setJunk(true); return false; }else{ if(processAssertions){ KillSwitch.kill("\nAn input file appears to be misformatted:\n" + "The character with ASCII code "+b+" appeared where "+(aa ? "an amino acid" : "a base")+" was expected" + (b>31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n") + "Sequence #"+numericID+"\n" + "Sequence ID '"+id+"'\n" + "Sequence: "+Tools.toStringSafe(bases)+"\n" + "Flags: "+Long.toBinaryString(flags)+"\n\n" + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); } setJunk(true); return false; } } } return true; } private void validateQualityLength(boolean processAssertions){ if(quality==null || quality.length==bases.length){return;} if(TOSS_BROKEN_QUALITY){ quality=null; setDiscarded(true); setJunk(true); }else if(NULLIFY_BROKEN_QUALITY){ quality=null; setJunk(true); }else if(FLAG_BROKEN_QUALITY){ setJunk(true); }else{ boolean x=false; assert(x=processAssertions); if(x){ KillSwitch.kill("\nMismatch between length of bases and qualities for read "+numericID+" (id="+id+").\n"+ "# qualities="+quality.length+", # bases="+bases.length+"\n\n"+ FASTQ.qualToString(quality)+"\n"+new String(bases)+"\n\n" + "This can be bypassed with the flag 'tossbrokenreads' or 'nullifybrokenquality'"); } } } private void fixQuality(){ if(quality==null || !CHANGE_QUALITY){return;} final byte[] toNumber=aminoacid() ? AminoAcid.acidToNumber : AminoAcid.baseToNumber; for(int i=0; i=0){ // if(qMAX_CALLED_QUALITY){ // quality[i]=MAX_CALLED_QUALITY; // } // }else{ // quality[i]=0; // } } } private void fixCase(){ if(bases==null || (!DOT_DASH_X_TO_N && !TO_UPPER_CASE && !LOWER_CASE_TO_N)){return;} final boolean aa=aminoacid(); final byte[] caseMap, ddxMap; if(!aa){ caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocall : null; ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocall : null; }else{ caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocallAA : null; ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocallAA : null; } // assert(false) : (AminoAcid.toUpperCase==caseMap)+", "+ddxMap; if(DOT_DASH_X_TO_N){ if(TO_UPPER_CASE || LOWER_CASE_TO_N){ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; i=0 ? qMap[q] : 0); } } }else if(DOT_DASH_X_TO_N || IUPAC_TO_N){ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; i=0){return true;} // if(junkOr>=0 && (JUNK_MODE!=FIX_JUNK_AND_IUPAC || iupacOr>=0)){return true;} // // assert(junkOr<0 || (JUNK_MODE==FIX_JUNK_AND_IUPAC && iupacOr<0)); //TODO: I could disable VALIDATE_BRANCHLESS here, if it's not final //VALIDATE_BRANCHLESS=false; if(JUNK_MODE==FIX_JUNK){ for(int i=0; i31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n") + "Sequence #"+numericID+"\n" + "Sequence ID: '"+id+"'\n" + "Sequence: '"+Tools.toStringSafe(bases)+"'\n\n" + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); } setJunk(true); return false; } } private boolean validateCommonCase(boolean processAssertions){ assert(!aminoacid()); assert(bases!=null); if(TO_UPPER_CASE || LOWER_CASE_TO_N){fixCase();} final byte nocall='N'; final byte[] toNum=AminoAcid.baseToNumber; final byte[] ddxMap=AminoAcid.dotDashXToNocall; if(JUNK_MODE==IGNORE_JUNK){ if(quality!=null && CHANGE_QUALITY){ if(DOT_DASH_X_TO_N){ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; i=0 ? qMap[q] : 0); } } }else if(DOT_DASH_X_TO_N){ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; i=0 ? qMap[q] : 0); } }else{ for(int i=0; ib ? a-b : b-a; } /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch){ return isDuplicateByBases(r, nmax, mmax, qmax, banSameQualityMismatch, false); } /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength){ int n=0, m=0; assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful."; if(!allowDifferentLength && r.length()!=bases.length){return false;} int minLen=Tools.min(bases.length, r.length()); for(int i=0; inmax){return false;} }else if(b1!=b2){ m++; if(m>mmax){return false;} if(quality[i]>qmax && r.quality[i]>qmax){return false;} if(banSameQualityMismatch && quality[i]==r.quality[i]){return false;} } } return true; } public boolean isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment){ if(bases.length!=r.length()){ return isDuplicateByMappingDifferentLength(r, bothEnds, checkAlignment); } assert(this!=r && mate!=r); assert(!bothEnds || bases.length==r.length()); if(!mapped() || !r.mapped()){return false;} // if(chrom==-1 && start==-1){return false;} if(chrom<1 && start<1){return false;} // if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;} //// if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match // if(stop!=r.stop){return false;} //For unpaired reads, require both ends match // return true; if(chrom!=r.chrom || strand()!=r.strand()){return false;} if(bothEnds){ if(start!=r.start || stop!=r.stop){return false;} }else{ if(strand()==Shared.PLUS){ if(start!=r.start){return false;} }else{ if(stop!=r.stop){return false;} } } if(checkAlignment){ if(perfect() && r.perfect()){return true;} if(match!=null && r.match!=null){ if(match.length!=r.match.length){return false;} for(int i=0; i=q2){ quality[i]=(byte) Tools.min(48, q1+1+q2/4); }else{ quality[i]=(byte) Tools.min(48, q2+1+q1/4); } } }else if(b1=='N'){ bases[i]=b2; quality[i]=q2; }else if(b2=='N'){ //do nothing }else if(mergeVectors){ if(q1<1 && q2<1){ //Special case - e.g. Illumina calls bases at 0 quality. //Possibly best to keep the matching allele if one matches the ref. //But for now, do nothing. //This was causing problems changing perfect match strings into imperfect matches. }else if(q1==q2){ assert(b1!=b2); bases[i]='N'; quality[i]=0; }else if(q1>q2){ bases[i]=b1; quality[i]=(byte)(q1-q2/2); }else{ bases[i]=b2; quality[i]=(byte)(q2-q1/2); } assert(quality[i]>=0 && quality[i]<=48); } } } } //TODO: //Note that the read may need to be realigned after merging, so the match string may be rendered incorrect. if(mergeN && match!=null){ if(r.match==null){match=null;} else{ if(match.length!=r.match.length){match=null;} else{ boolean ok=true; for(int i=0; i1); assert(r!=this); assert(r!=this.mate); assert(r!=r.mate); assert(this!=this.mate); assert(r.mate==null || r.mate.mate==r); assert(this.mate==null || this.mate.mate==this); assert(r.mate==null || r.numericID==r.mate.numericID); assert(mate==null || numericID==mate.numericID); } @Override public String toString(){return toText(false).toString();} public ByteBuilder toSites(){return toSites((ByteBuilder)null);} public ByteBuilder toSites(ByteBuilder sb){ if(numSites()==0){ if(sb==null){sb=new ByteBuilder(2);} sb.append('.'); }else{ if(sb==null){sb=new ByteBuilder(sites.size()*20);} int appended=0; for(SiteScore ss : sites){ if(appended>0){sb.append('\t');} if(ss!=null){ ss.toBytes(sb); appended++; } } if(appended==0){sb.append('.');} } return sb; } public ByteBuilder toInfo(){ if(obj==null){return new ByteBuilder();} if(obj.getClass()==ByteBuilder.class){return (ByteBuilder)obj;} return new ByteBuilder(obj.toString()); } public ByteBuilder toInfo(ByteBuilder bb){ if(obj==null){return bb;} if(obj.getClass()==ByteBuilder.class){return bb.append((ByteBuilder)obj);} return bb.append(obj.toString()); } public ByteBuilder toFastq(){ return FASTQ.toFASTQ(this, (ByteBuilder)null); } public ByteBuilder toFastq(ByteBuilder bb){ return FASTQ.toFASTQ(this, bb); } public ByteBuilder toFasta(){return toFasta(Shared.FASTA_WRAP);} public ByteBuilder toFasta(ByteBuilder bb){return toFasta(Shared.FASTA_WRAP, bb);} public ByteBuilder toFasta(int wrap){ return toFasta(wrap, (ByteBuilder)null); } public ByteBuilder toFasta(int wrap, ByteBuilder bb){ if(wrap<1){wrap=Integer.MAX_VALUE;} int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5; if(bb==null){bb=new ByteBuilder(len+1);} bb.append('>'); if(id==null){bb.append(numericID);} else{bb.append(id);} if(bases!=null){ int pos=0; while(pos=0; i--){ bb.append(flagToNumber(maskArray[i])); } bb.tab(); bb.append(copies); bb.tab(); bb.append(errors); bb.tab(); bb.append(mapScore); bb.tab(); if(bases==null){bb.append('.');} else{bb.append(bases);} bb.tab(); // int qualSum=0; // int qualMin=99999; if(quality==null){ bb.append('.'); }else{ bb.ensureExtra(quality.length); for(int i=0, j=bb.length; i0){bb.append('~');} bb.append(gaps[i]); } } if(sites!=null && sites.size()>0){ assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) : "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n"; for(SiteScore ss : sites){ bb.tab(); if(ss==null){ bb.append((byte[])null); }else{ ss.toBytes(bb); } bb.append(ss==null ? "null" : ss.toText()); } } if(originalSite!=null){ bb.tab(); bb.append('*'); originalSite.toBytes(bb); } match=oldmatch; setShortMatch(oldshortmatch); return bb; } public static Read fromText(String line){ if(line.length()==1 && line.charAt(0)=='.'){return null;} String[] split=line.split("\t"); if(split.length<17){ throw new RuntimeException("Error parsing read from text.\n\n" + "This may be caused be attempting to parse the wrong format.\n" + "Please ensure that the file extension is correct:\n" + "\tFASTQ should end in .fastq or .fq\n" + "\tFASTA should end in .fasta or .fa, .fas, .fna, .ffn, .frn, .seq, .fsa\n" + "\tSAM should end in .sam\n" + "\tNative format should end in .txt or .bread\n" + "If a file is compressed, there must be a compression extension after the format extension:\n" + "\tgzipped files should end in .gz or .gzip\n" + "\tzipped files should end in .zip and have only 1 file per archive\n" + "\tbz2 files should end in .bz2\n"); } final String id=new String(split[0]); long numericID=Long.parseLong(split[1]); int chrom=Byte.parseByte(split[2]); // byte strand=Byte.parseByte(split[3]); int start=Integer.parseInt(split[4]); int stop=Integer.parseInt(split[5]); int flags=Integer.parseInt(split[6], 2); int copies=Integer.parseInt(split[7]); int errors; int errorsCorrected; if(split[8].indexOf(',')>=0){ String[] estring=split[8].split(","); errors=Integer.parseInt(estring[0]); errorsCorrected=Integer.parseInt(estring[1]); }else{ errors=Integer.parseInt(split[8]); errorsCorrected=0; } int mapScore=Integer.parseInt(split[9]); byte[] basesOriginal=split[10].getBytes(); byte[] qualityOriginal=(split[11].equals(".") ? null : split[11].getBytes()); if(qualityOriginal!=null){ for(int i=0; i=-1) : b; qualityOriginal[i]=b; } } int insert=-1; if(!split[12].equals(".")){insert=Integer.parseInt(split[12]);} byte[] match=null; if(!split[14].equals(".")){match=split[14].getBytes();} int[] gaps=null; if(!split[15].equals(".")){ String[] gstring=split[16].split("~"); gaps=new int[gstring.length]; for(int i=0; i0){r.sites=new ArrayList(mSites);} for(int i=firstScore; i0); if(!containsNocalls()){return;} final ByteBuilder bbb=new ByteBuilder(); final ByteBuilder bbq=(quality==null ? null : new ByteBuilder()); int gap=0; for(int i=0; i=minGapIn && gap=minGapIn && gap=bases.length); if(bbb.length()>bases.length){ bases=bbb.toBytes(); if(bbq!=null){quality=bbq.toBytes();} } } public ArrayList breakAtGaps(final boolean agp, final int minContig){ ArrayList list=new ArrayList(); byte prev='N'; int lastN=-1, lastBase=-1; int contignum=1; long feature=1; ByteBuilder bb=(agp ? new ByteBuilder() : null); assert(obj==null); for(int i=0; i=minContig){list.add(r);} contignum++; if(bb!=null){ bb.append(id).append('\t'); bb.append(start+1).append('\t'); bb.append(stop).append('\t'); bb.append(feature).append('\t'); feature++; bb.append('W').append('\t'); bb.append(r.id).append('\t'); bb.append(1).append('\t'); bb.append(r.length()).append('\t'); bb.append('+').append('\n'); } } lastN=i; }else{ if(bb!=null && prev=='N' && lastBase>=0){ bb.append(id).append('\t'); bb.append(lastBase+2).append('\t'); bb.append(i).append('\t'); bb.append(feature).append('\t'); feature++; bb.append('N').append('\t'); bb.append((i-lastBase-1)).append('\t'); bb.append("scaffold").append('\t'); bb.append("yes").append('\t'); bb.append("paired-ends").append('\n'); } lastBase=i; } prev=b; } if(prev!='N'){ final int start=lastN+1, stop=bases.length; byte[] b2=KillSwitch.copyOfRange(bases, start, stop); byte[] q2=(quality==null ? null : KillSwitch.copyOfRange(quality, start, stop)); Read r=new Read(b2, q2, id+"_c"+contignum, numericID); if(r.length()>=minContig){list.add(r);} contignum++; if(bb!=null){ bb.append(id).append('\t'); bb.append(start+1).append('\t'); bb.append(stop).append('\t'); bb.append(feature).append('\t'); feature++; bb.append('W').append('\t'); bb.append(r.id).append('\t'); bb.append(1).append('\t'); bb.append(r.length()).append('\t'); bb.append('+').append('\n'); } }else{ if(bb!=null && prev=='N' && lastBase>=0){ bb.append(id).append('\t'); bb.append(lastBase+2).append('\t'); bb.append(bases.length).append('\t'); bb.append(feature).append('\t'); feature++; bb.append('N').append('\t'); bb.append((bases.length-lastBase-1)).append('\t'); bb.append("scaffold").append('\t'); bb.append("yes").append('\t'); bb.append("paired-ends").append('\n'); } lastBase=bases.length; } if(bb!=null){obj=bb.toBytes();} return list; } /** Reverse-complements the read. */ public void reverseComplement() { AminoAcid.reverseComplementBasesInPlace(bases); Tools.reverseInPlace(quality); setStrand(strand()^1); } /** Complements the read. */ public void complement() { AminoAcid.reverseComplementBasesInPlace(bases); } @Override public int compareTo(Read o) { if(chrom!=o.chrom){return chrom-o.chrom;} if(start!=o.start){return start-o.start;} if(stop!=o.stop){return stop-o.stop;} if(strand()!=o.strand()){return strand()-o.strand();} return 0; } public SiteScore toSite(){ assert(start<=stop) : this.toText(false); SiteScore ss=new SiteScore(chrom, strand(), start, stop, 0, 0, rescued(), perfect()); if(paired()){ ss.setSlowPairedScore(mapScore-1, mapScore); }else{ ss.setSlowPairedScore(mapScore, 0); } ss.setScore(mapScore); ss.gaps=gaps; ss.match=match; originalSite=ss; return ss; } public SiteScore topSite(){ final SiteScore ss=(sites==null || sites.isEmpty()) ? null : sites.get(0); assert(sites==null || sites.isEmpty() || ss!=null) : "Top site is null for read "+this; return ss; } public int numSites(){ return (sites==null ? 0 : sites.size()); } public SiteScore makeOriginalSite(){ originalSite=toSite(); return originalSite; } public void setFromSite(SiteScore ss){ assert(ss!=null); chrom=ss.chrom; setStrand(ss.strand); start=ss.start; stop=ss.stop; mapScore=ss.slowScore; setRescued(ss.rescued); gaps=ss.gaps; setPerfect(ss.perfect); match=ss.match; if(gaps!=null){ gaps=ss.gaps=GapTools.fixGaps(start, stop, gaps, Shared.MINGAP); // gaps[0]=Tools.min(gaps[0], start); // gaps[gaps.length-1]=Tools.max(gaps[gaps.length-1], stop); } } // public static int[] fixGaps(int a, int b, int[] gaps, int minGap){ //// System.err.println("fixGaps input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap); // int[] r=GapTools.fixGaps(a, b, gaps, minGap); //// System.err.println("fixGaps output: "+Arrays.toString(r)); // return r; // } public void setFromOriginalSite(){ setFromSite(originalSite); } public void setFromTopSite(){ final SiteScore ss=topSite(); if(ss==null){ clearSite(); setMapped(false); return; } setMapped(true); setFromSite(ss); } public void setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist){ final SiteScore ss0=topSite(); if(ss0==null){ clearSite(); setMapped(false); return; } setMapped(true); if(sites.size()==1 || !randomIfAmbiguous || !ambiguous()){ setFromSite(ss0); return; } if(primary || mate==null || !mate.mapped() || !mate.paired()){ int count=1; for(int i=1; i0){ SiteScore ss=sites.get(x); sites.set(0, ss); sites.set(x, ss0); } setFromSite(sites.get(0)); return; } // assert(false) : "TODO: Proper strand orientation, and more."; //TODO: Also, this code appears to sometimes duplicate sitescores(?) // for(int i=0; imaxdist){return true;} } // if(absdif(start, mate.start)>maxdist){return true;} if(requireCorrectStrands){ if((strand()==mate.strand())!=sameStrandPairs){return true;} } if(!sameStrandPairs){ if(strand()==Shared.PLUS && mate.strand()==Shared.MINUS){ if(start>=mate.stop){return true;} }else if(strand()==Shared.MINUS && mate.strand()==Shared.PLUS){ if(mate.start>=stop){return true;} } } return false; } public int countMismatches(){ assert(match!=null); int x=0; for(byte b : match){ if(b=='S'){x++;} } return x; } /** * @param k * @return Number of valid kmers */ public int numValidKmers(int k) { if(bases==null){return 0;} int len=0, counted=0; for(int i=0; i=k){counted++;} } return counted; } /** * @param match string * @return Total number of match, sub, del, ins, or clip symbols */ public static final int[] matchToMsdicn(byte[] match) { if(match==null || match.length<1){return null;} int[] msdicn=KillSwitch.allocInt1D(6); byte mode='0', c='0'; int current=0; for(int i=0; i0 || !Tools.isDigit(c)){ current=Tools.max(current, 1); if(mode=='m'){ msdicn[0]+=current; }else if(mode=='S'){ msdicn[1]+=current; }else if(mode=='D'){ msdicn[2]+=current; }else if(mode=='I'){ msdicn[3]+=current; }else if(mode=='C' || mode=='X' || mode=='Y'){ msdicn[4]+=current; }else if(mode=='N' || mode=='R'){ msdicn[5]+=current; } } return msdicn; } /** * @param match string * @return Ref length of match string */ public static final int calcMatchLength(byte[] match) { if(match==null || match.length<1){return 0;} byte mode='0', c='0'; int current=0; int len=0; for(int i=0; i0 || !Tools.isDigit(c)){ current=Tools.max(current, 1); if(mode=='m'){ len+=current; }else if(mode=='S'){ len+=current; }else if(mode=='D'){ len+=current; }else if(mode=='I'){ //Do nothing //len+=current; }else if(mode=='C'){ len+=current; }else if(mode=='X'){ //Not sure about this case, but adding seems fine len+=current; assert(false) : new String(match); }else if(mode=='Y'){ //Do nothing //len+=current; // assert(false) : new String(match); }else if(mode=='N' || mode=='R'){ len+=current; } } return len; } public final float identity() {return identity(match);} public static final float identity(byte[] match) { if(FLAT_IDENTITY){ return identityFlat(match, true); }else{ return identitySkewed(match, true, true, false, false); } } public final boolean hasLongInsertion(int maxlen){ return hasLongInsertion(match, maxlen); } public final boolean hasLongDeletion(int maxlen){ return hasLongDeletion(match, maxlen); } public static final boolean hasLongInsertion(byte[] match, int maxlen){ if(match==null || match.lengthmaxlen){return true;} }else{ len=0; } prev=b; } return false; } public static final boolean hasLongDeletion(byte[] match, int maxlen){ if(match==null || match.lengthmaxlen){return true;} }else{ len=0; } prev=b; } return false; } public int mappedNonClippedBases() { if(!mapped() || match==null || bases==null){return 0;} int len=0; byte mode='0', c='0'; int current=0; for(int i=0; i0 || !Tools.isDigit(c)){ current=Tools.max(current, 1); if(mode=='D' || mode=='C' || mode=='X' || mode=='Y' || mode=='d'){ }else{ len+=current; } mode=c; current=0; } return len; } /** * Handles short or long mode. * @param match string * @return Identity based on number of match, sub, del, ins, or N symbols */ public static final float identityFlat(byte[] match, boolean penalizeN) { // assert(false) : new String(match); if(match==null || match.length<1){return 0;} int good=0, bad=0, n=0; byte mode='0', c='0'; int current=0; for(int i=0; i'4' || b!=':' || d!=':'){ if(!processAssertions){return false;} KillSwitch.kill("Strangely formatted read. Please disable chastityfilter with the flag chastityfilter=f. id:"+id); } return c=='Y'; } } public boolean failsBarcode(HashSet set, boolean failIfNoBarcode){ if(id==null){return false;} final int loc=id.lastIndexOf(':'); final int loc2=Tools.max(id.indexOf(' '), id.indexOf('/')); if(loc<0 || loc<=loc2 || loc>=id.length()-1){ return failIfNoBarcode; } if(set==null){ for(int i=loc+1; i=id.length()-1){ if(failIfNoBarcode && Shared.EA()){ KillSwitch.kill("Encountered a read header without a barcode:\n"+id+"\n"); } return null; } String code=id.substring(loc+1); return code; } /** * @return The rname of this Read's SamLine, if present and mapped. */ public String rnameS() { if(samline==null || !samline.mapped()){return null;} return samline.rnameS(); } /** Average based on summing quality scores */ public double avgQuality(boolean countUndefined, int maxBases){ return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityDouble(countUndefined, maxBases) : avgQualityByScoreDouble(maxBases); } /** Average based on summing quality scores */ public int avgQualityInt(boolean countUndefined, int maxBases){ return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityInt(countUndefined, maxBases) : avgQualityByScoreInt(maxBases); } /** Average based on summing error probabilities */ public int avgQualityByProbabilityInt(boolean countUndefined, int maxBases){ if(bases==null || bases.length==0){return 0;} return avgQualityByProbabilityInt(bases, quality, countUndefined, maxBases); } /** Average based on summing error probabilities */ public double avgQualityByProbabilityDouble(boolean countUndefined, int maxBases){ if(bases==null || bases.length==0){return 0;} return avgQualityByProbabilityDouble(bases, quality, countUndefined, maxBases); } /** Average based on summing error probabilities */ public double probabilityErrorFree(boolean countUndefined, int maxBases){ if(bases==null || bases.length==0){return 0;} return probabilityErrorFree(bases, quality, countUndefined, maxBases); } /** Average based on summing error probabilities */ public static int avgQualityByProbabilityInt(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ if(quality==null){return 40;} if(quality.length==0){return 0;} float e=expectedErrors(bases, quality, countUndefined, maxBases); final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); float p=e/div; return QualityTools.probErrorToPhred(p); } /** Average based on summing error probabilities */ public static double avgQualityByProbabilityDouble(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ if(quality==null){return 40;} if(quality.length==0){return 0;} float e=expectedErrors(bases, quality, countUndefined, maxBases); final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); float p=e/div; return QualityTools.probErrorToPhredDouble(p); } /** Average based on summing quality scores */ public int avgQualityByScoreInt(int maxBases){ if(bases==null || bases.length==0){return 0;} if(quality==null){return 40;} int x=0, limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); for(int i=0; iquality.length){return 0;} for(int i=0; iquality.length){return 0;} for(int i=bases.length-n; i0); if(n>quality.length){return 0;} byte x=quality[0]; for(int i=1; i0); if(n>quality.length){return 0;} byte x=quality[bases.length-n]; for(int i=bases.length-n; i'9' && b!='m'){return true;} } return false; } public boolean containsNonNM(){ assert(match!=null && valid()); for(int i=0; i'9' && b!='m' && b!='N'){return true;} } return false; } public boolean containsVariants(){ assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n"; for(int i=0; i'9' && b!='m' && b!='N' && b!='C'){return true;} } return false; } public boolean containsClipping(){ assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n"; if(match.length<1){return false;} if(match[0]=='C'){return true;} for(int i=match.length-1; i>0; i--){ if(match[i]=='C'){return true;} if(match[i]>'9'){break;} } return false; } /** * @return {m,S,C,N,I,D}; */ public int[] countMatchSymbols(){ int m=0, S=0, C=0, N=0, I=0, D=0; int current=0; byte last='?'; for(byte b : match){ if(Tools.isDigit(b)){ current=current*10+b-'0'; }else{ current=Tools.max(current, 1); if(last=='m'){ m+=current; }else if(last=='S'){ S+=current; }else if(last=='C'){ C+=current; }else if(last=='N'){ N+=current; }else if(last=='I'){ I+=current; }else if(last=='D'){ D+=current; }else{ assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match); } current=0; last=b; } } current=Tools.max(current, 1); if(last=='m'){ m+=current; }else if(last=='S'){ S+=current; }else if(last=='C'){ C+=current; }else if(last=='N'){ N+=current; }else if(last=='I'){ I+=current; }else if(last=='D'){ D+=current; }else{ assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match); } current=0; return new int[] {m,S,C,N,I,D}; } public boolean containsNonNMXY(){ assert(match!=null && valid()); for(int i=0; i'9' && b!='m' && b!='N' && b!='X' && b!='Y'){return true;} } return false; } public boolean containsSDI(){ assert(match!=null && valid()); for(int i=0; i'9' && b!='m' && b!='s' && b!='N' && b!='S'){return true;} } return false; } public boolean containsConsecutiveS(int num){ assert(match!=null && valid() && !shortmatch()); int cnt=0; for(int i=0; i=num){return true;} }else{ cnt=0; } } return false; } public boolean containsIndels(){ assert(match!=null && valid()); for(int i=0; i=min){return true;} } } return false; } /** * @return The number of occurrences of the rarest base. */ public int minBaseCount(){ if(bases==null){return 0;} int a=0, c=0, g=0, t=0; for(byte b : bases){ if(b=='A'){a++;} else if(b=='C'){c++;} else if(b=='G'){g++;} else if(b=='T'){t++;} } return Tools.min(a, c, g, t); } public boolean containsXY(){ assert(match!=null && valid()); return containsXY(match); } public static boolean containsXY(byte[] match){ if(match==null){return false;} for(int i=0; i=0){ ca=Data.getChromosome(chrom); }else{ ca=null; } boolean originallyShort=shortmatch(); if(originallyShort){match=toLongMatchString(match);} int mloc=0, cloc=0, rloc=start; for(; mloc0 && array[0]<1); float product=1; for(int i=0; i0 && array[0]<1); float sum=0; for(int i=0; i0 && array[0]<1); float sum=0; for(int i=quality.length-1; i>=limit; i--){ byte b=bases[i]; byte q=quality[i]; if(AminoAcid.isFullyDefined(b)){ sum+=array[q]; }else{ assert(q==0); if(countUndefined){sum+=0.75f;} } } return sum; } public int estimateErrors() { if(quality==null){return 0;} assert(match!=null) : this.toText(false); int count=0; for(int ci=0, mi=0; ci3){return true;} }else{ streak=0; last=b; } } return false; } public void toShortMatchString(boolean doAssertion){ if(shortmatch()){ assert(!doAssertion); return; } match=toShortMatchString(match); setShortMatch(true); } public static byte[] toShortMatchString(byte[] match){ if(match==null){return null;} assert(match.length>0); ByteBuilder sb=new ByteBuilder(10); byte prev=match[0]; int count=1; for(int i=1; i1){sb.append(count);} // else if(count==2){sb.append(prev);} prev=m; count=1; } } sb.append(prev); if(count>1){sb.append(count);} // else if(count==2){sb.append(prev);} byte[] r=sb.toBytes(); return r; } public void toLongMatchString(boolean doAssertion){ if(!shortmatch()){ assert(!doAssertion); return; } match=toLongMatchString(match); setShortMatch(false); } public static byte[] toLongMatchString(byte[] shortmatch){ if(shortmatch==null){return null;} assert(shortmatch.length>0); int count=0; int current=0; for(int i=0; i0 ? current-1 : 0); current=0; }else{ assert(Tools.isDigit(m)); current=(current*10)+(m-48); //48 == '0' } } count+=(current>0 ? current-1 : 0); byte[] r=new byte[count]; current=0; byte lastLetter='?'; int j=0; for(int i=0; i1){ r[j]=lastLetter; current--; j++; } current=0; r[j]=m; j++; lastLetter=m; }else{ assert(Tools.isDigit(m)); current=(current*10)+(m-48); //48 == '0' } } while(current>1){ r[j]=lastLetter; current--; j++; } assert(r[r.length-1]>0); return r; } public String parseCustomRname(){ assert(id.startsWith("SYN")) : "Can't parse header "+id; return new Header(id, pairnum()).rname; } /** Bases of the read. */ public byte[] bases; /** Quality of the read. */ public byte[] quality; /** Alignment string. E.G. mmmmDDDmmm would have 4 matching bases, then a 3-base deletion, then 3 matching bases. */ public byte[] match; public int[] gaps; public String name() {return id;} public String id; public long numericID; public int chrom; public int start; public int stop; public int copies=1; /** Errors detected */ public int errors=0; /** Alignment score from BBMap. Assumed to max at approx 100*bases.length */ public int mapScore=0; public ArrayList sites; public SiteScore originalSite; //Origin site for synthetic reads public Object obj=null;//Don't set this to a SamLine; it's for other things. public SamLine samline=null; public Read mate; public int flags; /** -1 if invalid. TODO: Currently not retained through most processes. */ private int insert=-1; /** A random number for deterministic usage. * May decrease speed in multithreaded applications. */ public double rand=-1; public long time(){ assert(obj!=null && obj.getClass()==Long.class) : obj; return ((Long)obj).longValue(); } /** Number of bases in this pair, including the mate if present. */ public int pairLength(){return length()+mateLength();} /** Number of bases in this pair, including the mate if present. */ public int numPairKmers(int k){return numKmers(k)+numMateKmers(k);} /** Number of reads in this pair. Returns 1 if the read has no mate, and 2 if it does. */ public int pairCount(){return 1+mateCount();} public int length(){return bases==null ? 0 : bases.length;} public int numKmers(int k){return bases==null ? 0 : Tools.max(0, bases.length-k+1);} public int qlength(){return quality==null ? 0 : quality.length;} public int mateLength(){return mate==null ? 0 : mate.length();} public int numMateKmers(int k){return mate==null ? 0 : mate.numKmers(k);} public String mateId(){return mate==null ? null : mate.id;} public int mateCount(){return mate==null ? 0 : 1;} public boolean mateMapped(){return mate==null ? false : mate.mapped();} public long countMateBytes(){return mate==null ? 0 : mate.countBytes();} public long countMateFastqBytes(){return mate==null ? 0 : mate.countFastqBytes();} /** Number of bytes this read pair uses in memory, approximately */ public long countPairBytes(){return countBytes()+(mate==null ? 0 : mate.countBytes());} /** Number of bytes this read uses in memory, approximately */ public long countBytes(){ long sum=129; //Approximate per-read overhead sum+=(bases==null ? 0 : bases.length+16); sum+=(quality==null ? 0 : quality.length+16); sum+=(id==null ? 0 : id.length()*2+16); sum+=(match==null ? 0 : match.length+16); sum+=(samline==null ? 0 : samline.countBytes()); return sum; } /** Number of bytes this read uses in on disk in Fastq format */ public long countFastqBytes(){ long sum=6;//4 newlines, +, @ sum+=(bases==null ? 0 : bases.length); sum+=(quality==null ? 0 : quality.length); sum+=(id==null ? 0 : id.length()); return sum; } public int countLeft(final char base){return countLeft((byte)base);} public int countRight(final char base){return countRight((byte)base);} public int countLeft(final byte base){ for(int i=0; i=0; i--){ final byte b=bases[i]; if(b!=base){return bases.length-i-1;} } return bases.length; } public boolean untrim(){ if(obj==null || obj.getClass()!=TrimRead.class){return false;} ((TrimRead)obj).untrim(); obj=null; return true; } public int trailingLowerCase(){ for(int i=bases.length-1; i>=0;){ if(Tools.isLowerCase(bases[i])){ i--; }else{ return bases.length-i-1; } } return bases.length; } public int leadingLowerCase(){ for(int i=0; i0 ? list.get(0).semiperfect : false;} //TODO: This is a hack. Add a semiperfect flag. public boolean rescued(){return (flags&RESCUEDMASK)==RESCUEDMASK;} public boolean discarded(){return (flags&DISCARDMASK)==DISCARDMASK;} public boolean invalid(){return (flags&INVALIDMASK)==INVALIDMASK;} public boolean swapped(){return (flags&SWAPMASK)==SWAPMASK;} public boolean shortmatch(){return (flags&SHORTMATCHMASK)==SHORTMATCHMASK;} public boolean insertvalid(){return (flags&INSERTMASK)==INSERTMASK;} public boolean hasAdapter(){return (flags&ADAPTERMASK)==ADAPTERMASK;} public boolean secondary(){return (flags&SECONDARYMASK)==SECONDARYMASK;} public boolean aminoacid(){return (flags&AAMASK)==AAMASK;} public boolean amino(){return (flags&AAMASK)==AAMASK;} public boolean junk(){return (flags&JUNKMASK)==JUNKMASK;} public boolean validated(){return (flags&VALIDATEDMASK)==VALIDATEDMASK;} public boolean tested(){return (flags&TESTEDMASK)==TESTEDMASK;} public boolean invertedRepeat(){return (flags&IRMASK)==IRMASK;} public boolean trimmed(){return (flags&TRIMMEDMASK)==TRIMMEDMASK;} /** For paired ends: 0 for read1, 1 for read2 */ public int pairnum(){return (flags&PAIRNUMMASK)>>PAIRNUMSHIFT;} public boolean valid(){return !invalid();} public boolean getFlag(int mask){return (flags&mask)==mask;} public int flagToNumber(int mask){return (flags&mask)==mask ? 1 : 0;} public void setFlag(int mask, boolean b){ flags=(flags&~mask); if(b){flags|=mask;} } public void setStrand(int b){ assert(b==1 || b==0); flags=(flags&(~1))|b; } /** For paired ends: 0 for read1, 1 for read2 */ public void setPairnum(int b){ // System.err.println("Setting pairnum to "+b+" for "+id); // assert(!id.equals("2_chr1_0_1853883_1853982_1845883_ecoli_K12") || b==1); assert(b==1 || b==0); flags=(flags&(~PAIRNUMMASK))|(b<=maxScore) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n"; } } return perfect(); } private boolean testMatchPerfection(boolean returnIfNoMatch){ if(match==null){return returnIfNoMatch;} boolean flag=(match.length==bases.length); if(shortmatch()){ flag=(match.length==0 || match[0]=='m'); for(int i=0; i-1){ if(x==0 || x==3){at++;} else{gc++;} } } if(gc<1){return 0;} return gc*1f/(at+gc); } /** * @param swapFrom * @param swapTo * @return number of swaps */ public int swapBase(byte swapFrom, byte swapTo) { if(bases==null){return 0;} int swaps=0; for(int i=0; ir2.strand()){return insertSizeMapped_PlusLeft(r2, r1);} if(r1.strand()==r2.strand() || r1.start>r2.stop){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left. // if(!mapped() || !mate.mapped()){return 0;} if(r1.chrom!=r2.chrom){return 0;} if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? int a=r1.length(); int b=r2.length(); int mid=r2.start-r1.stop-1; if(-mid>=a+b){return insertSizeMapped_Unstranded(r1, r2);} //Not properly oriented; plus read is to the right of minus read return mid+a+b; } public static int insertSizeMapped_Unstranded(Read r1, Read r2){ if(r2==null){return r1.start==r1.stop ? 0 : r1.stop-r1.start+1;} if(r1.start>r2.start){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left side. // if(!mapped() || !mate.mapped()){return 0;} if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? if(r1.chrom!=r2.chrom){return 0;} int a=r1.length(); int b=r2.length(); if(false && Tools.overlap(r1.start, r1.stop, r2.start, r2.stop)){ //This does not handle very short inserts return Tools.max(r1.stop, r2.stop)-Tools.min(r1.start, r2.start)+1; }else{ if(r1.starta && -mid>b){return Tools.min(a, b);} //Strange situation, no way to guess insert size if(-mid>=a+b){return 0;} //Strange situation, no way to guess insert size return mid+a+b; }else{ assert(r1.start==r2.start); return Tools.min(a, b); } } } public int insertSizeOriginalSite(){ if(mate==null){ // System.err.println("A: "+(originalSite==null ? "null" : (originalSite.stop-originalSite.start+1))); return (originalSite==null ? -1 : originalSite.stop-originalSite.start+1); } final SiteScore ssa=originalSite, ssb=mate.originalSite; final int x; if(ssa==null || ssb==null){ // System.err.println("B: 0"); x=0; }else{ x=insertSize(ssa, ssb, bases.length, mate.length()); } assert(pairnum()>=mate.pairnum() || x==mate.insertSizeOriginalSite()); return x; } public static int insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb){ return insertSize(ssa.chrom, ssb.chrom, ssa.start, ssb.start, ssa.stop, ssb.stop, lena, lenb); } public static int insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb){ final int x; // if(mate==null || ){return bases==null ? 0 : bases.length;} if(chroma!=chromb){x=0;} else{ if(Tools.overlap(starta, stopa, startb, stopb)){ x=Tools.max(stopa, stopb)-Tools.min(starta, startb)+1; // System.err.println("C: "+x); }else{ if(starta<=startb){ int mid=startb-stopa-1; // assert(false) : mid+", "+a+", "+b; x=mid+lena+lenb; // System.err.println("D: "+x); }else{ int mid=starta-stopb-1; // assert(false) : mid+", "+a+", "+b; x=mid+lena+lenb; // System.err.println("E: "+x); } } } return x; } public Read subRead(int from, int to){ Read r=this.clone(); r.bases=KillSwitch.copyOfRange(bases, from, to); r.quality=(quality==null ? null : KillSwitch.copyOfRange(quality, from, to)); r.mate=null; // assert(Tools.indexOf(r.bases, (byte)'J')<0); return r; } public Read joinRead(){ if(insert<1 || mate==null || !insertvalid()){return this;} assert(insert>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; return joinRead(this, mate, insert); } public Read joinRead(int x){ if(x<1 || mate==null){return this;} assert(x>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; return joinRead(this, mate, x); } public static Read joinRead(Read a, Read b, int insert){ assert(a!=null && b!=null && insert>0); final int lengthSum=a.length()+b.length(); final int overlap=Tools.min(insert, lengthSum-insert); // System.err.println(insert); final byte[] bases=new byte[insert], abases=a.bases, bbases=b.bases; final byte[] aquals=a.quality, bquals=b.quality; final byte[] quals=(aquals==null || bquals==null ? null : new byte[insert]); assert(aquals==null || (aquals.length==abases.length && bquals.length==bbases.length)); int mismatches=0; int start, stop; if(overlap<=0){//Simple join in which there is no overlap int lim=insert-b.length(); if(quals==null){ for(int i=0; i=a.length() && insert>=b.length()){ //Overlapped join, proper orientation // final int lim1=a.length()-overlap; // final int lim2=a.length(); // for(int i=0; i=0 && j>=0; i--, j--){ byte ca=bases[i], cb=bbases[j]; if(ca==0 || ca=='N'){ bases[i]=cb; }else if(ca==cb){ }else{ bases[i]=(ca>=cb ? ca : cb); if(ca!='N' && cb!='N'){mismatches++;} } } }else{ for(int i=0; i=0 && j>=0; i--, j--){ byte ca=bases[i], cb=bbases[j]; byte qa=quals[i], qb=bquals[j]; if(ca==0 || ca=='N'){ bases[i]=cb; quals[i]=qb; }else if(cb==0 || cb=='N'){ //do nothing }else if(ca==cb){ quals[i]=(byte)Tools.min((Tools.max(qa, qb)+Tools.min(qa, qb)/4), MAX_MERGE_QUALITY); }else{ bases[i]=(qa>qb ? ca : qastop){ start=Tools.min(a.start, b.start); stop=Tools.max(a.stop, b.stop); } } // assert(mismatches>=countMismatches(a, b, insert, 999)); // System.err.println(mismatches); if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){start=stop=a.chrom=0;} // System.err.println(bases.length+", "+start+", "+stop); Read r=new Read(bases, null, a.id, a.numericID, a.flags, a.chrom, start, stop); r.quality=quals; //This prevents quality from getting capped. if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){r.setMapped(true);} r.setInsert(insert); r.setPaired(false); r.copies=a.copies; r.mapScore=a.mapScore+b.mapScore; if(overlap<=0){ r.mapScore=a.mapScore+b.mapScore; r.errors=a.errors+b.errors; //TODO r.gaps=? }else{//Hard to calculate r.mapScore=(int)((a.mapScore*(long)a.length()+b.mapScore*(long)b.length())/insert); r.errors=a.errors; } assert(r.insertvalid()) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; assert(r.insert()==r.length()) : r.insert()+"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; // assert(false) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; //TODO: Triggered by BBMerge in useratio mode for some reason. // assert(Shared.anomaly || (a.insertSizeMapped(false)>0 == r.insertSizeMapped(false)>0)) : // "\n"+r.length()+"\n"+r.insert()+"\n"+r.insertSizeMapped(false)+"\n"+a.insert()+"\n"+a.insertSizeMapped(false)+ // "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; return r; } /** * @param minlen * @param maxlen * @return A list of read fragments */ public ArrayList split(int minlen, int maxlen) { int len=bases==null ? 0 : bases.length; if(len subreads=new ArrayList(parts); if(len<=maxlen){ subreads.add(this); }else{ float ideal=Tools.max(minlen, len/(float)parts); int actual=(int)ideal; assert(false) : "TODO"; //Some assertion goes here, I forget what for(int i=0; ibases.length){b=bases.length;} // if(b-a<) byte[] subbases=KillSwitch.copyOfRange(bases, a, b); byte[] subquals=(quality==null ? null : KillSwitch.copyOfRange(quality, a, b+1)); Read r=new Read(subbases, subquals, id+"_"+i, numericID, flags); subreads.add(r); } } return subreads; } /** Generate and return an array of canonical kmers for this read */ public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) { if(gap>0){throw new RuntimeException("Gapped reads: TODO");} if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);} if(bases==null || bases.length63 ? -1L : ~((-1L)<>>2)|(x2<=k){ kmers[i-k+1]=makeCanonical ? Tools.max(kmer, rkmer) : kmer; } } return kmers; } // /** Generate and return an array of canonical kmers for this read */ // public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) { // if(gap>0){throw new RuntimeException("Gapped reads: TODO");} // if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);} // if(bases==null || bases.length63 ? -1L : ~((-1L)<=k){ // kmers[i-k+1]=kmer; // } // } // } // //// System.out.println(new String(bases)); //// System.out.println(Arrays.toString(kmers)); // // if(makeCanonical){ // this.reverseComplement(); // len=0; // kmer=0; // for(int i=0, j=bases.length-1; i=k){ // assert(kmer==AminoAcid.reverseComplementBinaryFast(kmers[j], k)); // kmers[j]=Tools.max(kmers[j], kmer); // } // } // } // this.reverseComplement(); // //// System.out.println(Arrays.toString(kmers)); // } // // // return kmers; // } /** Generate and return an array of canonical kmers for this read */ public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer) { assert(k>31) : k; assert(makeCanonical); if(bases==null || bases.length=k){ kmers[i-k+1]=kmer.xor(); } } return kmers; } // /** Generate and return an array of canonical kmers for this read */ // public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer longkmer) { // assert(k>31) : k; // if(bases==null || bases.length=k){ // long x2=AminoAcid.baseToNumber[bases[i-k]]; // kmer=kmer^(x2<=k){ // long x2=AminoAcid.baseToNumber[bases[i-k]]; // kmer=kmer^(x2< list, byte[] basesP, byte[] basesM, long id){ return CHECKSITES(list, basesP, basesM, id, true); } public static final boolean CHECKSITES(ArrayList list, byte[] basesP, byte[] basesM, long id, boolean verifySorted){ return true; //Temporarily disabled // if(list==null || list.isEmpty()){return true;} // SiteScore prev=null; // for(int i=0; i list){ if(list==null || list.size()<2){return true;} SiteScore prev=list.get(0); for(int i=0; iprev.score){return false;} prev=ss; } return true; } /** Makes sure 'bases' is for correct strand. */ public static final boolean CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id){ return CHECKSITE(ss, ss.plus() ? basesP : basesM, id); } /** Make sure 'bases' is for correct strand! */ public static final boolean CHECKSITE(SiteScore ss, byte[] bases, long id){ return true; //Temporarily disabled // if(ss==null){return true;} //// System.err.println("Checking site "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect+", bases="+new String(bases)); // if(ss.perfect){assert(ss.semiperfect) : ss+"\n"+new String(bases);} // if(ss.gaps!=null){ // if(ss.gaps[0]!=ss.start || ss.gaps[ss.gaps.length-1]!=ss.stop){return false;} //// assert(ss.gaps[0]==ss.start && ss.gaps[ss.gaps.length-1]==ss.stop); // } // // if(!(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore)){ // System.err.println("Site paired score violation: "+ss.quickScore+", "+ss.slowScore+", "+ss.pairedScore); // return false; // } // // final boolean xy=ss.matchContainsXY(); // if(bases!=null){ // // final boolean p0=ss.perfect; // final boolean sp0=ss.semiperfect; // final boolean p1=ss.isPerfect(bases); // final boolean sp1=(p1 ? true : ss.isSemiPerfect(bases)); // // assert(p0==p1 || (xy && p1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+ // "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n"; // assert(sp0==sp1 || (xy && sp1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+ // "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n"; // //// ss.setPerfect(bases, false); // // assert(p0==ss.perfect) : // p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+ // Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n"; // assert(sp0==ss.semiperfect) : // p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+ // Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n"; // if(ss.perfect){assert(ss.semiperfect);} // } // if(ss.match!=null && ss.matchLength()!=ss.mappedLength()){ // if(verbose){System.err.println("Returning false because matchLength!=mappedLength:\n"+ss.matchLength()+", "+ss.mappedLength()+"\n"+ss);} // return false; // } // return true; } public void setPerfect(boolean b){ flags=(flags&~PERFECTMASK); if(b){flags|=PERFECTMASK;} } public void setRescued(boolean b){ flags=(flags&~RESCUEDMASK); if(b){flags|=RESCUEDMASK;} } public void setMapped(boolean b){ flags=(flags&~MAPPEDMASK); if(b){flags|=MAPPEDMASK;} } public void setDiscarded(boolean b){ flags=(flags&~DISCARDMASK); if(b){flags|=DISCARDMASK;} } public void setInvalid(boolean b){ flags=(flags&~INVALIDMASK); if(b){flags|=INVALIDMASK;} } public void setSwapped(boolean b){ flags=(flags&~SWAPMASK); if(b){flags|=SWAPMASK;} } public void setShortMatch(boolean b){ flags=(flags&~SHORTMATCHMASK); if(b){flags|=SHORTMATCHMASK;} } public void setInsertValid(boolean b){ flags=(flags&~INSERTMASK); if(b){flags|=INSERTMASK;} } public void setHasAdapter(boolean b){ flags=(flags&~ADAPTERMASK); if(b){flags|=ADAPTERMASK;} } public void setSecondary(boolean b){ flags=(flags&~SECONDARYMASK); if(b){flags|=SECONDARYMASK;} } public void setAminoAcid(boolean b){ flags=(flags&~AAMASK); if(b){flags|=AAMASK;} } public void setJunk(boolean b){ flags=(flags&~JUNKMASK); if(b){flags|=JUNKMASK;} } public void setValidated(boolean b){ flags=(flags&~VALIDATEDMASK); if(b){flags|=VALIDATEDMASK;} } public void setTested(boolean b){ flags=(flags&~TESTEDMASK); if(b){flags|=TESTEDMASK;} } public void setInvertedRepeat(boolean b){ flags=(flags&~IRMASK); if(b){flags|=IRMASK;} } public void setTrimmed(boolean b){ flags=(flags&~TRIMMEDMASK); if(b){flags|=TRIMMEDMASK;} } public void setInsert(int x){ if(x<1){x=-1;} // assert(x==-1 || x>9 || length()<20) : x+", "+length(); //Invalid assertion for synthetic reads. insert=x; setInsertValid(x>0); if(mate!=null){ mate.insert=x; mate.setInsertValid(x>0); } } private static int[] makeMaskArray(int max) { int[] r=new int[max+1]; for(int i=0; i=QUALCACHE.length){ byte[] r=KillSwitch.allocByte1D(len); Arrays.fill(r, (byte)30); return r; } if(QUALCACHE[len]==null){ synchronized(QUALCACHE){ if(QUALCACHE[len]==null){ QUALCACHE[len]=KillSwitch.allocByte1D(len); Arrays.fill(QUALCACHE[len], (byte)30); } } } return QUALCACHE[len]; } public byte[] getScaffoldName(boolean requireSingleScaffold){ byte[] name=null; if(mapped()){ if(!requireSingleScaffold || Data.isSingleScaffold(chrom, start, stop)){ int idx=Data.scaffoldIndex(chrom, (start+stop)/2); name=Data.scaffoldNames[chrom][idx]; // int scaflen=Data.scaffoldLengths[chrom][idx]; // a1=Data.scaffoldRelativeLoc(chrom, start, idx); // b1=a1-start1+stop1; } } return name; } public void bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC){ for(int i=0; i) (r.sites==null ? null : r.sites.clone()); r.mate=null; if(r.sites!=null){ for(int i=0; i