1 package stream; 2 3 import java.io.Serializable; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Locale; 7 8 import dna.AminoAcid; 9 import dna.ChromosomeArray; 10 import dna.Data; 11 import dna.Gene; 12 import dna.ScafLoc; 13 import shared.KillSwitch; 14 import shared.Parse; 15 import shared.Shared; 16 import shared.Tools; 17 import structures.ByteBuilder; 18 import var2.ScafMap; 19 import var2.Scaffold; 20 21 22 public class SamLine implements Serializable { 23 24 // 426_647_582 161 chr1 10159 0 26M9H chr3 170711991 0 TCCCTAACCCTAACCCTAACCTAACC IIFIIIIIIIIIIIIIIIIIICH2<> RG:Z:20110708003021394 NH:i:3 CM:i:2 SM:i:1 CQ:Z:A9?(BB?:<A?>=>B67=:7A);.%8'%))/%*%' CS:Z:G12002301002301002301023010200000003 XS:A:+ 25 26 // 1 QNAME String [!-?A-~]f1,255g Query template NAME 27 // 2 FLAG Int [0,216-1] bitwise FLAG 28 // 3 RNAME String \*|[!-()+-<>-~][!-~]* Reference sequence NAME 29 // 4 POS Int [0,229-1] 1-based leftmost mapping POSition 30 // 5 MAPQ Int [0,28-1] MAPping Quality 31 // 6 CIGAR String \*|([0-9]+[MIDNSHPX=])+ CIGAR string 32 // 7 RNEXT String \*|=|[!-()+-<>-~][!-~]* Ref. name of the mate/next fragment 33 // 8 PNEXT Int [0,229-1] Position of the mate/next fragment 34 // 9 TLEN Int [-229+1,229-1] observed Template LENgth 35 // 10 SEQ String \*|[A-Za-z=.]+ fragment SEQuence 36 // 11 QUAL String [!-~]+ ASCII of Phred-scaled base QUALity+33 37 38 39 // FCB062MABXX:1:1101:1177:2115#GGCTACAA 147 chr11 47765857 29 90M = 47765579 -368 CCTCTGTGGCCCGGGTTGGAGTGCAGTGTCATGATCATGGCTCGCTGTAGCTACACCCTTCTGAGCTCAAGCAATCCTCCCACCTCTCCC ############################################################A@@><D<AAAB<=A2BD/BC<7:<4<%679 XT:A:M NM:i:5 SM:i:29 AM:i:29 XM:i:5 XO:i:0 XG:i:0 MD:Z:7T4A15G26A30A3 40 // FCB062MABXX:1:1101:1193:2122#GGCTACAA 77 * 0 0 * * 0 0 TATATATGTGCTATGTACAGCATTGGAATTCACACCCTACACTTTCAAAAGNGAGCCCTAAATAAATGTTAGATCGGAAGAGCACACGTC FCFCFDDDADDEDEBDAEDFEDEFFGGFGGHEEFHHHHHHEDDDEDFFEFB#CBBA@B8BGGFGEEEC>DGGGDFBGGGGHHHHH9<@## 41 42 43 /*--------------------------------------------------------------*/ 44 /*---------------- Initialization ----------------*/ 45 /*--------------------------------------------------------------*/ 46 47 /** 48 * 49 */ 50 private static final long serialVersionUID = -4180486051387471116L; 51 SamLine(String s)52 public SamLine(String s){ 53 this(s.split("\t")); 54 } 55 SamLine(SamLine sl)56 public SamLine(SamLine sl){ 57 setFrom(sl); 58 } 59 60 /** Prevents references to original string, in case of e.g. very long MD tags. */ toSamLine(String s)61 public SamLine toSamLine(String s){ 62 String[] split=s.split("\t"); 63 split[0]=new String(split[0]); 64 split[5]=new String(split[5]); 65 split[9]=new String(split[9]); 66 split[10]=new String(split[10]); 67 for(int i=11; i<split.length; i++){ 68 split[i]=new String(split[i]); 69 } 70 return new SamLine(split); 71 } 72 setFrom(SamLine sl)73 private void setFrom(SamLine sl){ 74 qname=sl.qname; 75 flag=sl.flag; 76 rname=sl.rname; 77 rnameS=sl.rnameS; 78 pos=sl.pos; 79 mapq=sl.mapq; 80 cigar=sl.cigar; 81 rnext=sl.rnext; 82 pnext=sl.pnext; 83 tlen=sl.tlen; 84 seq=sl.seq; 85 qual=sl.qual; 86 optional=sl.optional; 87 } 88 89 SamLine(Read r1, int fragNum)90 public SamLine(Read r1, int fragNum){ 91 92 if(verbose){ 93 System.err.println("new SamLine for read with match "+(r1.match==null ? "null" : new String(r1.match))); 94 } 95 96 Read r2=r1.mate; 97 final boolean perfect=r1.perfect(); 98 99 if(Data.scaffoldLocs==null && r1.samline!=null){ 100 assert(SET_FROM_OK) : "Sam format cannot be used as input to this program when no genome build is loaded.\n" + 101 "Please index the reference first and rerun with e.g. 'build=1', or use a different input format."; 102 setFrom(r1.samline); 103 return; 104 } 105 106 // qname=r.id.replace(' ', '_').replace('\t', '_'); 107 // qname=r.id.split("\\s+")[0]; 108 qname=r1.id.replace('\t', '_'); 109 // if(!KEEP_NAMES && qname.length()>2 && r2!=null){ 110 // if(qname.endsWith("/1") || qname.endsWith("/2") || qname.endsWith(" 1") || qname.endsWith(" 2")){} 111 // } 112 113 if(!KEEP_NAMES && qname.length()>2 && r2!=null){ 114 char c=qname.charAt(qname.length()-2); 115 int num=(qname.charAt(qname.length()-1))-'1'; 116 if((num==0 || num==1) && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);} 117 // if(r.pairnum()==num && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);} 118 } 119 // flag=Integer.parseInt(s[1]); 120 121 int idx1=-1, idx2=-1; 122 int chrom1=-1, chrom2=-1; 123 int start1=-1, start2=-1, a1=0, a2=0; 124 int stop1=-1, stop2=-1, b1=0, b2=0; 125 int scaflen=0, scafloc=0, scaflen2=0; 126 byte[] name1=bytestar, name2=bytestar; 127 if(r1.mapped()){ 128 assert(r1.chrom>=0); 129 chrom1=r1.chrom; 130 start1=r1.start; 131 stop1=r1.stop; 132 if(Data.isSingleScaffold(chrom1, start1, stop1)){ 133 assert(Data.scaffoldLocs!=null) : "\n\n"+r1+"\n\n"+r1.obj+"\n\n"; 134 idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2); 135 name1=Data.scaffoldNames[chrom1][idx1]; 136 scaflen=Data.scaffoldLengths[chrom1][idx1]; 137 scafloc=Data.scaffoldLocs[chrom1][idx1]; 138 a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1); 139 b1=a1-start1+stop1; 140 }else{ 141 if(verbose){System.err.println("------------- Found multi-scaffold alignment! -------------");} 142 r1.setMapped(false); 143 r1.setPaired(false); 144 r1.match=null; 145 if(r2!=null){r2.setPaired(false);} 146 } 147 } 148 if(r2!=null && r2.mapped()){ 149 chrom2=r2.chrom; 150 start2=r2.start; 151 stop2=r2.stop; 152 if(Data.isSingleScaffold(chrom2, start2, stop2)){ 153 idx2=Data.scaffoldIndex(chrom2, (start2+stop2)/2); 154 name2=Data.scaffoldNames[chrom2][idx2]; 155 scaflen2=Data.scaffoldLengths[chrom2][idx2]; 156 a2=Data.scaffoldRelativeLoc(chrom2, start2, idx2); 157 b2=a2-start2+stop2; 158 }else{ 159 if(verbose){System.err.println("------------- Found multi-scaffold alignment for r2! -------------");} 160 r2.setMapped(false); 161 r2.setPaired(false); 162 r2.match=null; 163 if(r1!=null){r1.setPaired(false);} 164 } 165 } 166 167 final boolean sameScaf=(r2!=null && idx1>-1 && idx1==idx2 && r1.chrom==r2.chrom); 168 flag=makeFlag(r1, r2, fragNum, sameScaf); 169 170 rname=r1.mapped() ? name1 : ((r2!=null && r2.mapped()) ? name2 : null); 171 172 { 173 int pos0, pos0_mate; //start pos 174 int pos1, pos1_mate; //stop pos 175 176 if(r1.mapped()){ 177 // int leadingClip=countLeadingClip(cigar); 178 int clip=countLeadingClip(r1.match); 179 int clippedIndels=countLeadingIndels(a1, r1.match); 180 int tclip=countTrailingClip(r1.match); 181 int tclippedIndels=countTrailingIndels(b1, scaflen, r1.match); 182 183 if(verbose){ 184 System.err.println("leadingClip="+clip); 185 System.err.println("clippedDels="+clippedIndels); 186 } 187 pos0=(a1+1)+clip+clippedIndels; 188 pos1=(b1+1)-tclip-tclippedIndels; 189 if(pos1>scaflen){pos1=scaflen;} 190 191 if(pos0<1){ 192 //This is necessary to prevent mapped reads from having POS less than 1. 193 pos0=1; 194 } 195 assert(pos1>=pos0) : pos0+", "+pos1+"\n"+r1+"\n"+r2+"\n"; 196 197 }else{ 198 pos0=0; 199 pos1=0; 200 } 201 202 if(r2!=null && r2.mapped()){ 203 int clip=countLeadingClip(r2.match); 204 int clippedIndels=countLeadingIndels(a2, r2.match); 205 int tclip=countTrailingClip(r2.match); 206 int tclippedIndels=countTrailingIndels(b2, scaflen, r2.match); 207 if(verbose){ 208 System.err.println("leadingClip="+clip); 209 System.err.println("clippedDels="+clippedIndels); 210 } 211 pos0_mate=(a2+1)+clip+clippedIndels; 212 pos1_mate=(b2+1)-tclip-tclippedIndels; 213 if(pos1_mate>scaflen){pos1=scaflen;} 214 215 if(pos0_mate<1){ 216 //This is necessary to prevent mapped reads from having POS less than 1. 217 pos0_mate=1; 218 } 219 assert(!sameScaf || pos1_mate>=pos0_mate) : pos0_mate+", "+pos1_mate+", "+scaflen+"\n"+r1+"\n"+r2+"\n"; 220 221 }else{ 222 pos0_mate=0; 223 pos1_mate=0; 224 } 225 226 if(r2==null){ 227 pos=pos0; 228 pnext=pos0_mate; 229 tlen=0; 230 assert(((pos>0 && r1.mapped()) || (pos==0 && !r1.mapped())) && pnext==0); 231 }else{ 232 if(r1.mapped() && r2.mapped()){ 233 pos=pos0; 234 pnext=pos0_mate; 235 if(sameScaf){ 236 // tlen=1+(Data.max(r.stop, r2.stop)-Data.min(r.start, r2.start)); 237 tlen=1+(Data.max(pos1, pos1_mate)-Data.min(pos0, pos0_mate)); 238 }else{ 239 tlen=0; 240 } 241 assert(pos>0) : pos+"\n"+r1+"\n"+r2; 242 assert(pnext>0) : pnext+"\n"+r1+"\n"+r2; 243 }else if(r1.mapped() && !r2.mapped()){ 244 pos=pos0; 245 pnext=pos0; 246 tlen=0; 247 assert(pos>0 && pnext>0); 248 }else if(!r1.mapped() && r2.mapped()){ 249 pos=pos0_mate; 250 pnext=pos0_mate; 251 tlen=0; 252 assert(pos>0 && pnext>0); 253 }else if(!r1.mapped() && !r2.mapped()){ 254 pos=pos0; 255 pnext=pos0_mate; 256 tlen=0; 257 assert(pos==0 && pnext==0); 258 }else{assert(false);} 259 } 260 261 assert(pos>=0) : "Negative coordinate "+pos+" for read:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+ 262 ", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false)+", clipM="+countLeadingClip(r1.match); 263 assert(pnext>=0) : "Negative coordinate "+pnext+" for mate:\n\n"+r1+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+ 264 ", pos0="+pos0+", pos0_mate="+pos0_mate+", clip="+countLeadingClip(cigar, true, false); 265 } 266 267 mapq=toMapq(r1, null); 268 269 if(verbose){ 270 System.err.println("Making cigar for "+(r1.match==null ? "null" : new String(r1.match))); 271 } 272 273 final boolean inbounds=!r1.mapped() ? false : (a1>=0 && b1<scaflen); 274 final boolean inbounds2=(r2==null ? true : !r2.mapped() ? false : (a2>=0 && b2<scaflen2)); 275 if(r1.bases!=null && r1.mapped() && r1.match!=null){ 276 if(VERSION>1.3f){ 277 if(inbounds && perfect && !r1.containsNonM()){//r.containsNonM() should be unnecessary... it's there in case of clipping... 278 cigar=(r1.length()+"="); 279 // System.err.println("SETTING cigar14="+cigar); 280 // 281 // byte[] match=r.match; 282 // if(r.shortmatch()){match=Read.toLongMatchString(match);} 283 // cigar=toCigar13(match, a1, b1, scaflen, r.bases); 284 // System.err.println("RESETTING cigar14="+cigar+" from toCigar14("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); 285 }else{ 286 byte[] match=r1.match; 287 if(r1.shortmatch()){match=Read.toLongMatchString(match);} 288 cigar=toCigar14(match, a1, b1, scaflen, r1.bases); 289 // System.err.println("CALLING toCigar14("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); 290 } 291 }else{ 292 if(inbounds && (perfect || !r1.containsNonNMS())){ 293 cigar=(r1.length()+"M"); 294 // System.err.println("SETTING cigar13="+cigar); 295 // 296 // byte[] match=r.match; 297 // if(r.shortmatch()){match=Read.toLongMatchString(match);} 298 // cigar=toCigar13(match, a1, b1, scaflen, r.bases); 299 // System.err.println("RESETTING cigar13="+cigar+" from toCigar13("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); 300 }else{ 301 byte[] match=r1.match; 302 if(r1.shortmatch()){match=Read.toLongMatchString(match);} 303 cigar=toCigar13(match, a1, b1, scaflen, r1.bases); 304 // System.err.println("CALLING toCigar13("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); 305 } 306 } 307 } 308 309 if(verbose){ 310 System.err.println("cigar="+cigar); 311 } 312 313 // assert(false); 314 315 // assert(primary() || cigar.equals(stringstar)) : cigar; 316 // if(pos<0){pos=0;cigar=null;rname=bytestar;mapq=0;flag|=0x4;} 317 318 // assert(false) : "\npos="+pos+"\ncigar='"+cigar+"'\nVERSION="+VERSION+"\na1="+a1+", b1="+b1+"\n\n"+r.toString(); 319 320 // rnext=(r2==null ? stringstar : (r.mapped() && !r2.mapped()) ? "chr"+Gene.chromCodes[r.chrom] : "chr"+Gene.chromCodes[r2.chrom]); 321 rnext=((r2==null || (!r1.mapped() && !r2.mapped())) ? bytestar : (r1.mapped() && r2.mapped()) ? (sameScaf ? byteequals : name2) : byteequals); 322 323 assert(rnext!=byteequals || name1==name2 || name1==bytestar || name2==bytestar) : 324 new String(rname)+", "+new String(rnext)+", "+new String(name1)+", "+new String(name2)+"\n"+r1+"\n"+r2; 325 326 // assert(r1.pairnum()==0) : r1.mapped()+", "+r2.mapped()+"fragNum="+fragNum+ 327 // "\nname1="+new String(name1)+"\nname2="+new String(name2)+"\nrname="+new String(rname)+"\nrnext="+new String(rnext)+ 328 // "\nname1="+name1+"\nname2="+name2+"\nrname="+rname+"\nrnext="+rnext+"\nidx1="+idx1+"\nidx2="+idx2; 329 330 if(Data.scaffoldPrefixes){ 331 if(rname!=null && rname!=bytestar){ 332 int k=Tools.indexOf(rname, (byte)'$'); 333 rname=KillSwitch.copyOfRange(rname, k+1, rname.length); 334 } 335 if(rnext!=null && rnext!=bytestar){ 336 int k=Tools.indexOf(rnext, (byte)'$'); 337 rnext=KillSwitch.copyOfRange(rnext, k+1, rnext.length); 338 } 339 } 340 341 // if(r2==null || r.stop<=r2.start){ 342 // //plus sign 343 // }else if(r2.stop<=r.start){ 344 // //minus sign 345 // tlen=-tlen; 346 // }else{ 347 // //They overlap... a lot. Physically shorter than read length. 348 // if(r.start<=r2.start){ 349 // 350 // }else{ 351 // tlen=-tlen; 352 // } 353 // } 354 //This version is less technically correct (does not account for very short insert reads) but probably more in line with what is expected 355 if(r2==null || r1.start<r2.start || (r1.start==r2.start && r1.pairnum()==0)){ 356 //plus sign 357 }else{ 358 //minus sign 359 tlen=-tlen; 360 } 361 362 // if(r.secondary()){ 363 //// seq=qual=stringstar; 364 // seq=qual=bytestar; 365 // }else{ 366 // if(r.strand()==Gene.PLUS){ 367 //// seq=new String(r.bases); 368 // seq=r.bases.clone(); 369 // if(r.quality==null){ 370 //// qual=stringstar; 371 // qual=bytestar; 372 // }else{ 373 //// StringBuilder q=new StringBuilder(r.quality.length); 374 //// for(byte b : r.quality){ 375 //// q.append((char)(b+33)); 376 //// } 377 //// qual=q.toString(); 378 // qual=new byte[r.quality.length]; 379 // for(int i=0, j=qual.length-1; i<qual.length; i++, j--){ 380 // qual[i]=(byte)(r.quality[j]+33); 381 // } 382 // } 383 // }else{ 384 //// seq=new String(AminoAcid.reverseComplementBases(r.bases)); 385 // seq=AminoAcid.reverseComplementBases(r.bases); 386 // if(r.quality==null){ 387 //// qual=stringstar; 388 // qual=bytestar; 389 // }else{ 390 //// StringBuilder q=new StringBuilder(r.quality.length); 391 //// for(int i=r.quality.length-1; i>=0; i--){ 392 //// q.append((char)(r.quality[i]+33)); 393 //// } 394 //// qual=q.toString(); 395 // qual=new byte[r.quality.length]; 396 // for(int i=0, j=qual.length-1; i<qual.length; i++, j--){ 397 // qual[i]=(byte)(r.quality[j]+33); 398 // } 399 // } 400 // } 401 // } 402 403 if(r1.secondary() && SECONDARY_ALIGNMENT_ASTERISKS){ 404 // seq=qual=bytestar; 405 seq=qual=null; 406 }else{ 407 seq=r1.bases; 408 if(r1.quality==null){ 409 // qual=bytestar; 410 qual=null; 411 }else{ 412 qual=r1.quality; 413 } 414 } 415 416 trimNames(); 417 optional=makeOptionalTags(r1, r2, perfect, scafloc, scaflen, inbounds, inbounds2); 418 // assert(r.pairnum()==1) : "\n"+r.toText(false)+"\n"+this+"\n"+r2; 419 } 420 SamLine(String[] s)421 public SamLine(String[] s){ 422 assert(!s[0].startsWith("@")) : "Tried to make a SamLine from a header: "+s[0]; 423 assert(s.length>=11) : "\nNot all required fields are present: "+s.length+"\nline='"+Arrays.toString(s)+"'\n"; 424 if(s.length<11){ 425 System.err.println("Invalid SamLine: "+Arrays.toString(s)); 426 return; 427 } 428 qname=s[0]; 429 flag=Integer.parseInt(s[1]); 430 if(RNAME_AS_BYTES){ 431 rname=s[2].getBytes(); 432 }else{ 433 rnameS=s[2]; 434 } 435 pos=Integer.parseInt(s[3]); 436 // try { 437 // Integer.parseInt(s[4]); 438 // } catch (NumberFormatException e) { 439 // System.err.println(Arrays.toString(s)); 440 // } 441 mapq=Tools.isDigit(s[4].charAt(0)) ? Integer.parseInt(s[4]) : 99; //Added for non-compliant mappers that put * here 442 cigar=s[5]; 443 rnext=s[6].getBytes(); 444 pnext=(s[7].charAt(0)=='*' ? 0 : Integer.parseInt(s[7])); 445 tlen=Tools.isDigit(s[8].charAt(0)) ? Integer.parseInt(s[8]) : 0; //Added for non-compliant mappers that put * here 446 // seq=s[9]; 447 // qual=s[10]; 448 seq=(s[9].equals(stringstar) ? null : s[9].getBytes()); 449 qual=(s[10].equals(stringstar) ? null : s[10].getBytes()); 450 451 if(mapped() && strand()==Shared.MINUS){ 452 if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);} 453 if(qual!=bytestar){Tools.reverseInPlace(qual);} 454 } 455 456 if(qual!=null && qual!=bytestar){ 457 for(int i=0; i<qual.length; i++){qual[i]-=33;} 458 } 459 460 if(!PARSE_OPTIONAL || s.length<=11){return;} 461 462 if(PARSE_OPTIONAL_MD_ONLY){ 463 optional=new ArrayList<String>(1); 464 for(int i=11; i<s.length; i++){ 465 if(s[i].startsWith("MD:")){ 466 optional.add(s[i]); 467 break; 468 } 469 } 470 }else{ 471 optional=new ArrayList<String>(s.length-11); 472 for(int i=11; i<s.length; i++){ 473 optional.add(s[i]); 474 } 475 } 476 477 trimNames(); 478 } 479 SamLine(byte[] s)480 public SamLine(byte[] s){ 481 assert(s[0]!='@') : "Tried to make a SamLine from a header: "+new String(s); 482 483 int a=0, b=0; 484 485 while(b<s.length && s[b]!='\t'){b++;} 486 assert(b>a) : "Missing field 0: "+new String(s); 487 if(PARSE_0){qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));} 488 b++; 489 a=b; 490 491 while(b<s.length && s[b]!='\t'){b++;} 492 assert(b>a) : "Missing field 1: "+new String(s); 493 flag=Parse.parseInt(s, a, b); 494 b++; 495 a=b; 496 497 while(b<s.length && s[b]!='\t'){b++;} 498 assert(b>a) : "Missing field 2: "+new String(s); 499 if(PARSE_2){ 500 if(RNAME_AS_BYTES){ 501 rname=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b)); 502 }else{ 503 rnameS=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a)); 504 } 505 } 506 b++; 507 a=b; 508 509 while(b<s.length && s[b]!='\t'){b++;} 510 assert(b>a) : "Missing field 3: "+new String(s); 511 pos=Parse.parseInt(s, a, b); 512 b++; 513 a=b; 514 515 while(b<s.length && s[b]!='\t'){b++;} 516 assert(b>a) : "Missing field 4: "+new String(s); 517 mapq=Parse.parseInt(s, a, b); 518 b++; 519 a=b; 520 521 while(b<s.length && s[b]!='\t'){b++;} 522 assert(b>a) : "Missing field 5: "+new String(s); 523 if(PARSE_5){cigar=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a));} 524 b++; 525 a=b; 526 527 while(b<s.length && s[b]!='\t'){b++;} 528 assert(b>a) : "Missing field 6: "+new String(s); 529 if(PARSE_6){rnext=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));} 530 b++; 531 a=b; 532 533 while(b<s.length && s[b]!='\t'){b++;} 534 assert(b>a) : "Missing field 7: "+new String(s); 535 if(PARSE_7){pnext=(b==a+1 && s[a]=='*' ? 0 :Parse.parseInt(s, a, b));} 536 b++; 537 a=b; 538 539 while(b<s.length && s[b]!='\t'){b++;} 540 assert(b>a) : "Missing field 8: "+new String(s); 541 if(PARSE_8){tlen=Parse.parseInt(s, a, b);} 542 b++; 543 a=b; 544 545 while(b<s.length && s[b]!='\t'){b++;} 546 assert(b>a) : "Missing field 9: "+new String(s); 547 // seq=new String(s, a, b-a); 548 seq=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b)); 549 b++; 550 a=b; 551 552 while(b<s.length && s[b]!='\t'){b++;} 553 assert(b>a) : "Missing field 10: "+new String(s); 554 // qual=new String(s, a, b-a); 555 if(PARSE_10){qual=(b==a+1 && s[a]=='*' ? null : KillSwitch.copyOfRange(s, a, b));} 556 b++; 557 a=b; 558 559 assert((seq==bytestar)==(Tools.equals(seq, bytestar))); 560 assert((qual==bytestar)==(Tools.equals(qual, bytestar))); 561 562 if(mapped() && strand()==Shared.MINUS && FLIP_ON_LOAD){ 563 if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);} 564 if(qual!=bytestar){Tools.reverseInPlace(qual);} 565 } 566 567 if(qual!=null && qual!=bytestar){ 568 for(int i=0; i<qual.length; i++){qual[i]-=33;} 569 } 570 571 if(!PARSE_OPTIONAL || b>=s.length){return;} 572 573 if(PARSE_OPTIONAL_MD_ONLY){ 574 optional=new ArrayList<String>(1); 575 while(b<s.length){ 576 while(b<s.length && s[b]!='\t'){b++;} 577 if(b>a){ 578 if(b>=a+3 && s[a]=='M' && s[a+1]=='D' && s[a+2]==':'){ 579 String x=new String(s, a, b-a); 580 optional.add(x); 581 return; 582 } 583 }else{ 584 //Empty field 585 } 586 b++; 587 a=b; 588 } 589 }else{ 590 optional=new ArrayList<String>(4); 591 while(b<s.length){ 592 while(b<s.length && s[b]!='\t'){b++;} 593 if(b>a){ 594 String x=new String(s, a, b-a); 595 optional.add(x); 596 }else{ 597 //Empty field 598 } 599 b++; 600 a=b; 601 } 602 } 603 604 trimNames(); 605 } 606 trimNames()607 public void trimNames(){ 608 // System.err.println(); 609 // System.err.println("rname= "+new String(rname)); 610 // System.err.println("qname= "+new String(qname)); 611 if(Shared.TRIM_RNAME){ 612 if(RNAME_AS_BYTES){ 613 setRname(Tools.trimToWhitespace(rname())); 614 setRnext(Tools.trimToWhitespace(rnext())); 615 }else{ 616 setRname(Tools.trimToWhitespace(rnameS())); 617 setRnext(Tools.trimToWhitespace(rnext())); 618 } 619 } 620 if(Shared.TRIM_READ_COMMENTS){ 621 qname=(Tools.trimToWhitespace(qname)); 622 } 623 // System.err.println("rname2="+new String(rname)); 624 // System.err.println("qname2="+new String(qname)); 625 // assert(false) : Shared.TRIM_RNAME+", "+Shared.TRIM_READ_COMMENTS+", "+new String(rname)+", "+qname; 626 } 627 parseFlagOnly(byte[] s)628 public static final int parseFlagOnly(byte[] s){ 629 assert(s!=null && s.length>0) : "Blank line."; 630 if(s[0]=='@'){return -1;} 631 632 int a=0, b=0; 633 634 while(b<s.length && s[b]!='\t'){b++;} 635 assert(b>a) : "Missing field 0: "+new String(s); 636 b++; 637 a=b; 638 639 while(b<s.length && s[b]!='\t'){b++;} 640 assert(b>a) : "Missing field 1: "+new String(s); 641 int flag=Parse.parseInt(s, a, b); 642 return flag; 643 } 644 parseNameOnly(byte[] s)645 public static final String parseNameOnly(byte[] s){ 646 assert(s!=null && s.length>0) : "Blank line."; 647 if(s[0]=='@'){return null;} 648 649 int a=0, b=0; 650 651 while(b<s.length && s[b]!='\t'){b++;} 652 assert(b>a) : "Missing field 0: "+new String(s); 653 String qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a)); 654 return qname; 655 } 656 657 /*--------------------------------------------------------------*/ 658 /*---------------- Cigar ----------------*/ 659 /*--------------------------------------------------------------*/ 660 toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases)661 public static String toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases){ 662 if(match==null || readStart==readStop){return null;} 663 ByteBuilder sb=new ByteBuilder(8); 664 int count=0; 665 char mode='='; 666 char lastMode='='; 667 668 int refloc=readStart; 669 670 int cigarlen=0; //for debugging 671 int opcount=0; //for debugging 672 673 for(int mpos=0; mpos<match.length; mpos++){ 674 675 byte m=match[mpos]; 676 677 boolean sfdflag=false; 678 if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){ 679 mode='S'; //soft-clip out-of-bounds 680 if(m!='I'){refloc++;} 681 if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions! 682 }else if(m=='m' || m=='s' || m=='S' || m=='N' || m=='B'){//Little 's' is for a match classified as a sub to improve the affine score. 683 mode='M'; 684 refloc++; 685 }else if(m=='I' || m=='X' || m=='Y'){ 686 mode='I'; 687 }else if(m=='D'){ 688 mode='D'; 689 refloc++; 690 }else if(m=='C'){ 691 mode='S'; 692 refloc++; 693 }else{ 694 throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). " + 695 "Match string should be in long format here."); 696 } 697 698 if(mode!=lastMode){ 699 if(count>0){//Prevents an initial length-0 match 700 sb.append(count); 701 // sb.append(lastMode); 702 if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');} 703 else{sb.append(lastMode);} 704 if(lastMode!='D'){cigarlen+=count;} 705 opcount+=count; 706 } 707 count=0; 708 lastMode=mode; 709 } 710 711 count++; 712 if(sfdflag){count--;} 713 } 714 sb.append(count); 715 if(mode=='D' && count>INTRON_LIMIT){sb.append('N');} 716 else{sb.append(mode);} 717 if(mode!='D'){cigarlen+=count;} 718 opcount+=count; 719 720 assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" + 721 "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n"; 722 723 return sb.toString(); 724 } 725 toCigar13(String cigar14)726 public static String toCigar13(String cigar14) { 727 if(cigar14==null){return null;} 728 final int len=cigar14.length(); 729 730 int current=0; 731 int mcount=0; 732 ByteBuilder sb=new ByteBuilder(len); 733 734 for(int i=0; i<len; i++){ 735 char b=cigar14.charAt(i); 736 if(Tools.isDigit(b)){ 737 current=(10*current)+(b-'0'); 738 }else{ 739 if(b=='X' || b=='=' || b=='M'){ 740 mcount+=current; 741 }else{ 742 if(mcount>0){ 743 sb.append(mcount); 744 sb.append('M'); 745 mcount=0; 746 } 747 sb.append(current); 748 sb.append(b); 749 } 750 current=0; 751 } 752 } 753 assert(current==0); 754 if(mcount>0){ 755 sb.append(mcount); 756 sb.append('M'); 757 mcount=0; 758 } 759 return sb.toString(); 760 } 761 762 toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases)763 public static String toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases){ 764 // assert(false) : readStart+", "+readStop+", "+reflen; 765 if(match==null || readStart==readStop){return null;} 766 ByteBuilder sb=new ByteBuilder(8); 767 int count=0; 768 char mode='='; 769 char lastMode='='; 770 771 int refloc=readStart; 772 773 int cigarlen=0; //for debugging 774 int opcount=0; //for debugging 775 776 for(int mpos=0; mpos<match.length; mpos++){ 777 778 byte m=match[mpos]; 779 780 boolean sfdflag=false; 781 if(SOFT_CLIP && (refloc<0 || refloc>=reflen)){ 782 mode='S'; //soft-clip out-of-bounds 783 if(m!='I'){refloc++;} 784 if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions! 785 }else if(m=='m' || m=='s'){//Little 's' is for a match classified as a sub to improve the affine score. 786 mode='='; 787 refloc++; 788 }else if(m=='S' || m=='V'){ 789 mode='X'; 790 refloc++; 791 }else if(m=='I' || m=='X' || m=='Y'){ 792 mode='I'; 793 }else if(m=='D'){ 794 mode='D'; 795 refloc++; 796 }else if(m=='C'){ 797 mode='S'; 798 refloc++; 799 }else if(m=='N' || m=='B'){ 800 mode='M'; 801 refloc++; 802 }else{ 803 throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). " + 804 "Match string should be in long format here."); 805 } 806 807 if(mode!=lastMode){ 808 if(count>0){//Prevents an initial length-0 match 809 sb.append(count); 810 if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');} 811 else{sb.append(lastMode);} 812 if(lastMode!='D'){cigarlen+=count;} 813 opcount+=count; 814 } 815 count=0; 816 lastMode=mode; 817 } 818 819 count++; 820 if(sfdflag){count--;} 821 } 822 sb.append(count); 823 if(mode=='D' && count>INTRON_LIMIT){ 824 sb.append('N'); 825 }else{ 826 sb.append(mode); 827 } 828 if(mode!='D'){cigarlen+=count;} 829 opcount+=count; 830 831 assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" + 832 "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n"; 833 834 return sb.toString(); 835 } 836 calcCigarLength(boolean includeSoftClip, boolean includeHardClip)837 public int calcCigarLength(boolean includeSoftClip, boolean includeHardClip){ 838 return calcCigarLength(cigar, includeSoftClip, includeHardClip); 839 } 840 calcCigarReadLength(boolean includeSoftClip, boolean includeHardClip)841 public int calcCigarReadLength(boolean includeSoftClip, boolean includeHardClip){ 842 return calcCigarReadLength(cigar, includeSoftClip, includeHardClip); 843 } 844 845 /** Reference length of cigar string */ calcCigarLength(String cigar, boolean includeSoftClip, boolean includeHardClip)846 public static int calcCigarLength(String cigar, boolean includeSoftClip, boolean includeHardClip){ 847 if(cigar==null){return 0;} 848 int len=0; 849 int current=0; 850 for(int i=0; i<cigar.length(); i++){ 851 char c=cigar.charAt(i); 852 if(Tools.isDigit(c)){ 853 current=(current*10)+(c-'0'); 854 }else{ 855 if(c=='M' || c=='=' || c=='X' || c=='D' || c=='N'){ 856 len+=current; 857 }else if(c=='S'){ 858 if(includeSoftClip){len+=current;} 859 }else if (c=='H'){ 860 //In this case, the base string is the wrong length since letters were truncated. 861 //Therefore, the bases cannot be used for calling variations after mapping. 862 //Hard clipping messes up original location verification. 863 //Therefore... len+=current would be best in practice, but for GRADING purposes, leaving it disabled is best. 864 865 if(includeHardClip){len+=current;} 866 }else if(c=='I'){ 867 //do nothing 868 }else if(c=='P'){ 869 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 870 //'P' is currently poorly defined 871 }else{ 872 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 873 } 874 current=0; 875 } 876 } 877 return len; 878 } 879 880 /** Reference length of cigar string */ calcCigarReadLength(String cigar, boolean includeSoftClip, boolean includeHardClip)881 public static int calcCigarReadLength(String cigar, boolean includeSoftClip, boolean includeHardClip){ 882 if(cigar==null){return 0;} 883 int len=0; 884 int current=0; 885 for(int i=0; i<cigar.length(); i++){ 886 char c=cigar.charAt(i); 887 if(Tools.isDigit(c)){ 888 current=(current*10)+(c-'0'); 889 }else{ 890 if(c=='M' || c=='=' || c=='X' || c=='I'){ 891 len+=current; 892 }else if(c=='S'){ 893 if(includeSoftClip){len+=current;} 894 }else if (c=='H'){ 895 //In this case, the base string is the wrong length since letters were truncated. 896 //Therefore, the bases cannot be used for calling variations after mapping. 897 //Hard clipping messes up original location verification. 898 //Therefore... len+=current would be best in practice, but for GRADING purposes, leaving it disabled is best. 899 900 if(includeHardClip){len+=current;} 901 }else if(c=='D' || c=='N'){ 902 //do nothing 903 }else if(c=='P'){ 904 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 905 //'P' is currently poorly defined 906 }else{ 907 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 908 } 909 current=0; 910 } 911 } 912 return len; 913 } 914 915 /** Number of query bases in cigar string */ calcCigarBases(String cigar, boolean includeSoftClip, boolean includeHardClip)916 public static int calcCigarBases(String cigar, boolean includeSoftClip, boolean includeHardClip){ 917 if(cigar==null){return 0;} 918 int len=0; 919 int current=0; 920 for(int i=0; i<cigar.length(); i++){ 921 char c=cigar.charAt(i); 922 if(Tools.isDigit(c)){ 923 current=(current*10)+(c-'0'); 924 }else{ 925 if(c=='M' || c=='=' || c=='X' || c=='I'){ 926 len+=current; 927 }else if(c=='D' || c=='N'){ 928 //do nothing 929 }else if (c=='H'){ 930 if(includeHardClip){len+=current;} 931 }else if(c=='S'){ 932 if(includeSoftClip){len+=current;} 933 }else if(c=='P'){ 934 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 935 //'P' is currently poorly defined 936 }else{ 937 throw new RuntimeException("Unhandled cigar symbol: "+c+"\n"+cigar+"\n"); 938 } 939 current=0; 940 } 941 } 942 return len; 943 } 944 945 /** Length of clipped initial bases. Used to calculate correct start location of clipped reads. */ countLeadingClip(String cigar, boolean includeSoftClip, boolean includeHardClip)946 public static int countLeadingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){ 947 if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;} 948 int len=0; 949 int current=0; 950 for(int i=0; i<cigar.length(); i++){ 951 char c=cigar.charAt(i); 952 if(Tools.isLetter(c) || c=='='){ 953 if(c=='H'){ 954 if(includeHardClip){ 955 len+=current; 956 } 957 }else if(c=='S'){ 958 if(includeSoftClip){ 959 len+=current; 960 } 961 }else{ 962 break; 963 } 964 current=0; 965 }else{ 966 current=(current*10)+(c-'0'); 967 } 968 } 969 return len; 970 } 971 972 /** Length of clipped final bases. Used to calculate correct stop location of clipped reads. */ countTrailingClip(String cigar, boolean includeSoftClip, boolean includeHardClip)973 public static int countTrailingClip(String cigar, boolean includeSoftClip, boolean includeHardClip){ 974 if(cigar==null || (!includeSoftClip && !includeHardClip)){return 0;} 975 int len=0; 976 if(includeHardClip){len+=countTrailingHardClip(cigar);} 977 int last=cigar.lastIndexOf('S'); 978 979 int mult=1; 980 int i; 981 for(i=last-1; i>=0; i--){ 982 char c=cigar.charAt(i); 983 if(Tools.isLetter(c) || c=='='){ 984 break; 985 } 986 len+=(len+(c-'0')*mult); 987 mult*=10; 988 } 989 if(i<0){return 0;} 990 return len; 991 } 992 993 /** Length of clipped final bases. Used to calculate correct stop location of clipped reads. */ countTrailingHardClip(String cigar)994 public static int countTrailingHardClip(String cigar){ 995 if(cigar==null){return 0;} 996 int last=cigar.lastIndexOf('H'); 997 998 int mult=1, len=0; 999 int i; 1000 for(i=last-1; i>=0; i--){ 1001 char c=cigar.charAt(i); 1002 if(Tools.isLetter(c) || c=='='){ 1003 break; 1004 } 1005 len+=(len+(c-'0')*mult); 1006 mult*=10; 1007 } 1008 if(i<0){return 0;} 1009 return len; 1010 } 1011 countMdSubs(String mdTag)1012 public static int countMdSubs(String mdTag){ 1013 assert(mdTag!=null); 1014 1015 final int NORMAL=0, SUB=1, DEL=2; 1016 int dels=0, subs=0, normals=0; 1017 1018 if(mdTag!=null){ 1019 int current=0; 1020 int mode=NORMAL; 1021 int i=0; 1022 if(mdTag.startsWith("MD:Z:")){i=5;} 1023 for(final int max=mdTag.length(); i<max; i++){ 1024 char c=mdTag.charAt(i); 1025 if(Tools.isDigit(c)){ 1026 current=(current*10)+(c-'0'); 1027 mode=NORMAL; 1028 }else{ 1029 if(current>0){ 1030 if(mode==NORMAL){normals+=current;} 1031 else{assert(false) : mode+", "+current;} 1032 current=0; 1033 } 1034 if(c=='^'){mode=DEL;} 1035 else if(mode==DEL){ 1036 dels++; 1037 }else if(mode==NORMAL || mode==SUB){ 1038 mode=SUB; 1039 subs++; 1040 } 1041 } 1042 } 1043 } 1044 return subs; 1045 } 1046 1047 /** Length of clipped initial bases. */ countLeadingClip(byte[] match)1048 public static int countLeadingClip(byte[] match){ 1049 if(match==null || match.length<1 || match[0]!='C'){return 0;} 1050 int clips=0; 1051 int current=0; 1052 for(int mloc=0; mloc<match.length; mloc++){ 1053 byte b=match[mloc]; 1054 if(Tools.isDigit(b)){ 1055 current=current*10+(b-'0'); 1056 }else{ 1057 if(current>0){ 1058 clips=clips+current-1; 1059 } 1060 current=0; 1061 if(b!='C'){break;} 1062 clips++; 1063 } 1064 } 1065 if(current>0){ 1066 clips=clips+current-1; 1067 } 1068 return clips; 1069 } 1070 1071 /** Length of match string portion describing clipped initial bases. */ countLeadingClip2(byte[] match)1072 public static int countLeadingClip2(byte[] match){ 1073 if(match==null || match.length<1 || match[0]!='C'){return 0;} 1074 int mloc=0; 1075 for(; mloc<match.length; mloc++){ 1076 byte b=match[mloc]; 1077 if(b!='C' && !Tools.isDigit(b)){return mloc;} 1078 } 1079 return match.length; 1080 } 1081 1082 /** Length of clipped trailing bases. */ countTrailingClip(byte[] match)1083 public static int countTrailingClip(byte[] match){ 1084 if(match==null){return 0;} 1085 int clips=0; 1086 for(int mloc=match.length-1; mloc>=0; mloc--){ 1087 byte b=match[mloc]; 1088 assert(!Tools.isDigit(b)) : new String(match); 1089 if(b=='C'){ 1090 clips++; 1091 }else{ 1092 break; 1093 } 1094 } 1095 return clips; 1096 } 1097 1098 /** Length of clipped (out of bounds) initial insertions and deletions. */ countLeadingIndels(int rloc, byte[] match)1099 public static int countLeadingIndels(int rloc, byte[] match){ 1100 if(match==null || rloc>=0){return 0;} 1101 int dels=0; 1102 int inss=0; 1103 int cloc=0; 1104 for(int mloc=0; mloc<match.length && rloc<0; mloc++){ 1105 byte b=match[mloc]; 1106 assert(!Tools.isDigit(b)); 1107 if(b=='D'){ 1108 dels++; 1109 rloc++; 1110 }else if(b=='I'){ 1111 inss++; 1112 cloc++; 1113 }else{ 1114 rloc++; 1115 cloc++; 1116 } 1117 } 1118 return dels-inss; 1119 } 1120 1121 /** Length of clipped (out of bounds) trialing insertions and deletions. */ countTrailingIndels(int rloc, int rlen, byte[] match)1122 public static int countTrailingIndels(int rloc, int rlen, byte[] match){ 1123 if(match==null || rloc>=0){return 0;} 1124 int dels=0; 1125 int inss=0; 1126 int cloc=0; 1127 for(int mloc=match.length; mloc>=0 && rloc>=rlen; mloc--){ 1128 byte b=match[mloc]; 1129 assert(!Tools.isDigit(b)); 1130 if(b=='D'){ 1131 dels++; 1132 rloc--; 1133 }else if(b=='I'){ 1134 inss++; 1135 cloc--; 1136 }else{ 1137 rloc--; 1138 cloc--; 1139 } 1140 } 1141 return dels-inss; 1142 } 1143 mappedNonClippedBases()1144 public int mappedNonClippedBases() { 1145 if(!mapped() || cigar==null || seq==null){return 0;} 1146 1147 int len=0; 1148 int current=0; 1149 for(int i=0; i<cigar.length(); i++){ 1150 char c=cigar.charAt(i); 1151 if(Tools.isDigit(c)){ 1152 current=(current*10)+(c-'0'); 1153 }else{ 1154 if(c=='M' || c=='='){ 1155 len+=current; 1156 }else if(c=='X'){ 1157 len+=current; 1158 }else if(c=='D' || c=='N'){ 1159 1160 }else if(c=='I'){ 1161 len+=current; 1162 }else if(c=='S' || c=='H' || c=='P'){ 1163 1164 } 1165 current=0; 1166 } 1167 } 1168 return len; 1169 } 1170 1171 /** 1172 * @param cigar 1173 * @return Max consecutive match, sub, del, ins, or clip symbols 1174 */ cigarToMdsiMax(String cigar)1175 public static final int[] cigarToMdsiMax(String cigar) { 1176 if(cigar==null){return null;} 1177 int[] msdic=KillSwitch.allocInt1D(5); 1178 1179 int current=0; 1180 for(int i=0; i<cigar.length(); i++){ 1181 char c=cigar.charAt(i); 1182 if(Tools.isDigit(c)){ 1183 current=(current*10)+(c-'0'); 1184 }else{ 1185 if(c=='M' || c=='='){ 1186 msdic[0]=Tools.max(msdic[0], current); 1187 }else if(c=='X'){ 1188 msdic[1]=Tools.max(msdic[1], current); 1189 }else if(c=='D' || c=='N'){ 1190 msdic[2]=Tools.max(msdic[2], current); 1191 }else if(c=='I'){ 1192 msdic[3]=Tools.max(msdic[3], current); 1193 }else if(c=='S' || c=='H' || c=='P'){ 1194 msdic[4]=Tools.max(msdic[4], current); 1195 } 1196 current=0; 1197 } 1198 } 1199 return msdic; 1200 } 1201 calcIdentity()1202 public float calcIdentity() { 1203 assert(cigar!=null); 1204 int match=0, other=0; 1205 1206 int current=0; 1207 for(int i=0; i<cigar.length(); i++){ 1208 char c=cigar.charAt(i); 1209 if(Tools.isDigit(c)){ 1210 current=(current*10)+(c-'0'); 1211 }else{ 1212 if(c=='='){ 1213 match+=current; 1214 }else if(c=='M'){ 1215 1216 }else if(c=='X'){ 1217 other+=current; 1218 }else if(c=='D'){ 1219 other+=current; 1220 }else if(c=='N'){ 1221 1222 }else if(c=='I'){ 1223 other+=current; 1224 }else if(c=='S' || c=='H' || c=='P'){ 1225 1226 } 1227 current=0; 1228 } 1229 } 1230 return match/(float)Tools.max(match+other, 1); 1231 } 1232 1233 /** 1234 * @param cigar 1235 * @return Total number of match, sub, del, ins, or clip symbols 1236 */ cigarToMsdic(String cigar)1237 public static final int[] cigarToMsdic(String cigar) { 1238 if(cigar==null){return null;} 1239 int[] msdic=KillSwitch.allocInt1D(5); 1240 1241 int current=0; 1242 for(int i=0; i<cigar.length(); i++){ 1243 char c=cigar.charAt(i); 1244 if(Tools.isDigit(c)){ 1245 current=(current*10)+(c-'0'); 1246 }else{ 1247 if(c=='M' || c=='='){ 1248 msdic[0]+=current; 1249 }else if(c=='X'){ 1250 msdic[1]+=current; 1251 }else if(c=='D' || c=='N'){ 1252 msdic[2]+=current; 1253 }else if(c=='I'){ 1254 msdic[3]+=current; 1255 }else if(c=='S' || c=='H' || c=='P'){ 1256 msdic[4]+=current; 1257 } 1258 current=0; 1259 } 1260 } 1261 return msdic; 1262 } 1263 1264 /** 1265 * @param allowM Allow M symbols in the cigar string 1266 * @return Match string of this cigar string when possible, otherwise null. 1267 * Takes into account MD tag and bases, but not reference (other than in MD tag). 1268 */ toShortMatch(boolean allowM)1269 public final byte[] toShortMatch(boolean allowM) { 1270 if(cigar==null || cigar.equals(stringstar)){return null;} 1271 if(allowM){return cigarToShortMatch_old(cigar, allowM);} 1272 1273 // System.err.println("\nInput: cigar="+cigar+", MD="+mdTag());//123 1274 1275 final boolean fixMatchSubs; 1276 final boolean fixMatchNs; 1277 boolean foundE=false; 1278 boolean foundX=false; 1279 boolean foundM=false; 1280 // System.err.println("Block 1.");//123 1281 { 1282 int current=0, total=0; 1283 for(int i=0; i<cigar.length(); i++){ 1284 char c=cigar.charAt(i); 1285 1286 if(Tools.isDigit(c)){ 1287 current=(current*10)+(c-'0'); 1288 }else{ 1289 1290 if(c=='H'){ 1291 current=0; //Information destroyed 1292 }else if(c=='P'){ 1293 return null; //Undefined symbol 1294 } 1295 foundE|=(c=='='); 1296 foundX|=(c=='X'); 1297 foundM|=(c=='M'); 1298 1299 total+=current; 1300 current=0; 1301 } 1302 } 1303 if(total<1){return null;} 1304 fixMatchSubs=(!allowM && foundM && !foundX && !foundE);//Note: allowM already exited. 1305 fixMatchNs=(FIX_MATCH_NS && foundX && !foundM); //Means no-calls are possibly marked as X, which is technically OK. 1306 1307 // System.err.println("allowM="+allowM);//123 1308 // System.err.println("foundE="+foundE);//123 1309 // System.err.println("foundX="+foundX);//123 1310 // System.err.println("foundM="+foundM);//123 1311 } 1312 1313 // System.err.println("Block 2.");//123 1314 final String mdTag; 1315 final int mdSubs; 1316 final byte[] refBases; 1317 1318 //1) if fixMatch, grab MD tag 1319 //2) if MD and no subs, return 1320 //3) grab ref bases 1321 1322 if(fixMatchSubs || fixMatchNs){ 1323 final String md0=mdTag(); 1324 mdSubs=(md0==null ? -1 : countMdSubs(md0)); 1325 if(mdSubs==0 && !fixMatchNs){ 1326 refBases=null; 1327 mdTag=null; 1328 }else{ 1329 mdTag=md0; 1330 if(mdTag!=null && PREFER_MDTAG){ 1331 refBases=null; 1332 }else{ 1333 ScafMap map=ScafMap.defaultScafMap(); 1334 assert(!fixMatchSubs || mdTag!=null || map!=null) : "TODO: Encountered a read with 'M' in cigar string but no MD tag and no ScafMap loaded.\n" 1335 + "This can normally be resolved by adding the flag ref=file, where file is the fasta file to which the reads were mapped.\n\n"+this; 1336 if(map==null){ 1337 refBases=null; 1338 }else{ 1339 Scaffold scaf=map.getScaffold(rnameS()); 1340 assert(!fixMatchSubs || mdTag!=null || scaf!=null) : "Encountered a read with 'M' in cigar string but no scaffold loaded for "+rnameS(); 1341 if(scaf==null){ 1342 refBases=null; 1343 }else{ 1344 refBases=scaf.bases; 1345 assert(!fixMatchSubs || mdTag!=null || refBases!=null) : "Encountered a read with 'M' in cigar string but no ref bases loaded for "+rnameS(); 1346 if(fixMatchSubs && refBases==null && mdTag==null){ 1347 return null; 1348 } 1349 } 1350 } 1351 } 1352 } 1353 }else{ 1354 mdSubs=-1; 1355 mdTag=null; 1356 refBases=null; 1357 } 1358 1359 // System.err.println("mdTag="+mdTag);//123 1360 // System.err.println("mdSubs="+mdSubs);//123 1361 1362 char mSymbol=((foundX || foundE || !foundM) ? 'N' : mdSubs>=0 ? 'm' : 'N'); 1363 1364 // System.err.println("Block 3.");//123 1365 final byte[] match0; 1366 { 1367 ByteBuilder sb=new ByteBuilder(cigar.length()); 1368 int current=0; 1369 for(int cpos=0, max=cigar.length(); cpos<max; cpos++){ 1370 char c=cigar.charAt(cpos); 1371 if(Tools.isDigit(c)){ 1372 current=(current*10)+(c-'0'); 1373 }else{ 1374 if(c=='='){ 1375 sb.append('m'); 1376 if(current>1){sb.append(current);} 1377 }else if(c=='X'){ 1378 sb.append('S'); 1379 if(current>1){sb.append(current);} 1380 }else if(c=='D' || c=='N'){ 1381 sb.append('D'); 1382 if(current>1){sb.append(current);} 1383 }else if(c=='I'){ 1384 sb.append('I'); 1385 if(current>1){sb.append(current);} 1386 }else if(c=='S'){ 1387 sb.append('C'); 1388 if(current>1){sb.append(current);} 1389 }else if(c=='M'){ 1390 sb.append(mSymbol); 1391 if(current>1){sb.append(current);} 1392 } 1393 current=0; 1394 } 1395 } 1396 match0=(sb.array.length==sb.length() ? sb.array : sb.toBytes()); 1397 // System.err.println("match="+new String(match0));//123 1398 } 1399 1400 // System.err.println("Block 4.");//123 1401 1402 if((!fixMatchSubs || mdSubs==0) && (!fixMatchNs || seq==null || refBases==null)){return match0;} 1403 assert(((mdTag!=null || refBases!=null) && fixMatchSubs && mdSubs!=0) || fixMatchNs /*|| noCalls>0*/) : 1404 (mdTag!=null)+", "+(refBases!=null)+", "+(fixMatchSubs)+", "+(mdSubs!=0)+", "+fixMatchNs/*+", "+(noCalls)*/; 1405 1406 // assert(false) : mdTag+", "+refBases+", "+processMD+", "+mdSubs+"\n"+this; 1407 1408 // System.err.println("processMD="+processMD+", mdSubs="+mdSubs+", mdTag="+mdTag);//123 1409 1410 final byte[] bases; 1411 if(refBases!=null){ 1412 // bases=(strand()==1) ? AminoAcid.reverseComplementBases(seq) : seq;//Why not reverse in place? 1413 if(strand()==1){AminoAcid.reverseComplementBasesInPlace(seq);} 1414 bases=seq; 1415 }else{bases=null;} 1416 1417 // System.err.println("Block 5.");//123 1418 final byte[] longmatch=Read.toLongMatchString(match0); 1419 1420 // final int noCalls=((foundE || foundX) && foundM ? 1 : seq==null ? -1 : Read.countNocalls(seq)); 1421 1422 // System.err.println("Block 6");//123 1423 1424 if(mdTag!=null && (refBases==null || PREFER_MDTAG)){ 1425 // System.err.println("match="+new String(longmatch));//123 1426 1427 final int noCalls=seq==null ? -1 : Read.countNocalls(seq); 1428 if(noCalls>0 && bases!=null && refBases==null){//Not necessary if ref sequence is present 1429 int bpos=0; 1430 for(int mpos=0; mpos<longmatch.length; mpos++){ 1431 final byte m=longmatch[mpos]; 1432 if(m=='C'){ 1433 bpos++; 1434 }else if(m=='m' || m=='s' || m=='S' || m=='N'){ 1435 if(!AminoAcid.isFullyDefined(bases[bpos])){longmatch[mpos]='N';} 1436 bpos++; 1437 }else if(m=='I' || m=='X' || m=='Y'){ 1438 bpos++; 1439 }else if(m=='D'){ 1440 //do nothing 1441 }else{ 1442 assert(false) : m; 1443 } 1444 } 1445 } 1446 1447 final MDWalker walker=new MDWalker(mdTag, cigar, longmatch, this); 1448 1449 walker.fixMatch(bases); 1450 }else 1451 if(refBases!=null){ 1452 final int refStart=start(true, false); 1453 fixMatch(bases, refBases, longmatch, refStart, false); 1454 } 1455 1456 // else if(refBases!=null){ 1457 // final int refStart=start(true, false); 1458 // fixMatch(bases, refBases, longmatch, refStart, false); 1459 // } 1460 else{ 1461 assert(false) : "Fallthorugh."; 1462 } 1463 1464 final byte[] match=Read.toShortMatchString(longmatch); 1465 1466 if(bases!=null && strand()==1){AminoAcid.reverseComplementBasesInPlace(seq);} 1467 1468 // System.err.println("Block 7.");//123 1469 // System.err.println("Returning "+new String(match));//123 1470 return match; 1471 } 1472 1473 /** Requires longmatch. 1474 * Replaces M */ fixMatch(byte[] call, byte[] ref, byte[] match, int refstart, boolean unClip)1475 public static void fixMatch(byte[] call, byte[] ref, byte[] match, int refstart, boolean unClip){ 1476 for(int mpos=0, rpos=refstart, cpos=0; mpos<match.length; mpos++){ 1477 assert(cpos>=0 && cpos<call.length) : "\n"+new String(match)+"\n"+new String(call)+"\n"+mpos+", "+cpos; 1478 final byte m=match[mpos]; 1479 1480 if(rpos<0 || rpos>=ref.length){ 1481 if(m=='I'){ 1482 assert(false) : "Insertion off scaffold end: "+refstart+", "+ref.length+"\n"+new String(call)+"\n"+new String(match); 1483 cpos++; 1484 }else if(m=='D'){ 1485 assert(false) : "Deletion off scaffold end: "+refstart+", "+ref.length+"\n"+new String(call)+"\n"+new String(match); 1486 rpos++; 1487 }else{ 1488 match[mpos]='C'; 1489 rpos++; 1490 cpos++; 1491 } 1492 }else if(m=='m' || m=='S' || m=='N' || m=='s' || (m=='C' && unClip)){ 1493 final byte c=Tools.toUpperCase(call[cpos]); 1494 final byte r=Tools.toUpperCase(ref[rpos]); 1495 final boolean defined=(AminoAcid.isFullyDefined(c) && AminoAcid.isFullyDefined(r)); 1496 if(!defined){ 1497 match[mpos]='N'; 1498 }else if(c==r){ 1499 match[mpos]='m'; 1500 }else{ 1501 match[mpos]='S'; 1502 } 1503 rpos++; 1504 cpos++; 1505 }else if(m=='C'){ //Do nothing for clipped call 1506 rpos++; 1507 cpos++; 1508 }else if(m=='I' || m=='X' || m=='Y'){ 1509 cpos++; 1510 }else if(m=='D'){ 1511 rpos++; 1512 }else{ 1513 assert(false) : Character.toString((char)m); 1514 } 1515 } 1516 } 1517 1518 /** 1519 * @param cigar 1520 * @return Match string of this cigar string when possible, otherwise null 1521 */ cigarToShortMatch_old(String cigar, boolean allowM)1522 public static final byte[] cigarToShortMatch_old(String cigar, boolean allowM) { 1523 1524 int current=0; 1525 ByteBuilder sb=new ByteBuilder(cigar.length()); 1526 1527 for(int i=0; i<cigar.length(); i++){ 1528 char c=cigar.charAt(i); 1529 if(Tools.isDigit(c)){ 1530 current=(current*10)+(c-'0'); 1531 }else{ 1532 if(c=='='){ 1533 sb.append('m'); 1534 if(current>1){sb.append(current);} 1535 }else if(c=='X'){ 1536 sb.append('S'); 1537 if(current>1){sb.append(current);} 1538 }else if(c=='D' || c=='N'){ 1539 sb.append('D'); 1540 if(current>1){sb.append(current);} 1541 }else if(c=='I'){ 1542 sb.append('I'); 1543 if(current>1){sb.append(current);} 1544 }else if(c=='S'){ 1545 sb.append('C'); 1546 if(current>1){sb.append(current);} 1547 }else if(c=='M'){ 1548 if(!allowM){return null;} 1549 // sb.append('B'); 1550 sb.append('N'); 1551 if(current>1){sb.append(current);} 1552 } 1553 current=0; 1554 } 1555 } 1556 1557 if(sb.array.length==sb.length()){return sb.array;} 1558 return sb.toBytes(); 1559 } 1560 1561 /*--------------------------------------------------------------*/ 1562 /*---------------- Tags ----------------*/ 1563 /*--------------------------------------------------------------*/ 1564 makeStopTag(int pos, int seqLength, String cigar, boolean perfect)1565 public static String makeStopTag(int pos, int seqLength, String cigar, boolean perfect){ 1566 // return "YS:i:"+(pos+((cigar==null || perfect) ? seqLength : -countLeadingClip(cigar, false)+calcCigarLength(cigar, false))-1); //123456789 1567 return "YS:i:"+(pos+((cigar==null || perfect) ? seqLength : calcCigarLength(cigar, true, false))-1); 1568 } 1569 makeLengthTag(int pos, int seqLength, String cigar, boolean perfect)1570 public static String makeLengthTag(int pos, int seqLength, String cigar, boolean perfect){ 1571 if(cigar==null || perfect){return "YL:Z:"+seqLength+","+seqLength;} 1572 return "YL:Z:"+(seqLength-countLeadingClip(cigar, true, false))+","+calcCigarLength(cigar, false, false); 1573 } 1574 makeIdentityTag(byte[] match, boolean perfect)1575 public static String makeIdentityTag(byte[] match, boolean perfect){ 1576 if(perfect){return "YI:f:100";} 1577 float f=Read.identity(match); 1578 return String.format(Locale.ROOT, "YI:f:%.2f", (100*f)); 1579 } 1580 makeScoreTag(int score)1581 public static String makeScoreTag(int score){ 1582 return "YR:i:"+score; 1583 } 1584 matchTag()1585 public String matchTag(){ 1586 if(optional==null){return null;} 1587 for(String s : optional){ 1588 if(s.startsWith("X2:Z:")){ 1589 return s; 1590 } 1591 } 1592 return null; 1593 } 1594 makeXSTag(Read r)1595 private String makeXSTag(Read r){ 1596 if(r.mapped() && cigar!=null && cigar.indexOf('N')>=0){ 1597 // System.err.println("For read "+r.pairnum()+" mapped to strand "+r.strand()); 1598 boolean plus=(r.strand()==Shared.PLUS); //Assumes secondstrand=false 1599 // System.err.println("plus="+plus); 1600 if(r.pairnum()!=0){plus=!plus;} 1601 // System.err.println("plus="+plus); 1602 if(XS_SECONDSTRAND){plus=!plus;} 1603 // System.err.println("plus="+plus); 1604 return (plus ? XSPLUS : XSMINUS); 1605 }else{ 1606 return null; 1607 } 1608 } 1609 makeMdTag(int chrom, int refstart, byte[] match, byte[] call, int scafloc, int scaflen)1610 public static String makeMdTag(int chrom, int refstart, byte[] match, byte[] call, int scafloc, int scaflen){ 1611 if(match==null || chrom<0){return null;} 1612 ByteBuilder md=new ByteBuilder(8); 1613 md.append("MD:Z:"); 1614 1615 ChromosomeArray cha=Data.getChromosome(chrom); 1616 1617 final int scafstop=scafloc+scaflen; 1618 1619 byte prevM='?'; 1620 int count=0; 1621 int dels=0; 1622 boolean prevSub=false; 1623 for(int mpos=0, rpos=refstart, cpos=0; mpos<match.length; mpos++){ 1624 assert(cpos>=0 && cpos<call.length) : "\n"+new String(match)+"\n"+new String(call)+"\n"+mpos+", "+cpos+", "+dels+", "+INTRON_LIMIT; 1625 final byte c=call[cpos]; 1626 final byte m=match[mpos]; 1627 1628 if(prevM=='D' && m!='D'){ 1629 if(dels<=INTRON_LIMIT){//Otherwise, ignore it 1630 md.append(count); 1631 count=0; 1632 md.append('^'); 1633 for(int i=rpos-dels; i<rpos; i++){ 1634 md.append((char)cha.get(i)); 1635 } 1636 dels=0; 1637 } 1638 } 1639 1640 if(m=='C' || rpos<scafloc || rpos>=scafstop){ //Do nothing for clipped bases 1641 rpos++; 1642 if(m!='D'){cpos++;} 1643 }else if(m=='m' || m=='s'){ 1644 count++; 1645 rpos++; 1646 cpos++; 1647 }else if(m=='S'){ 1648 if(count>0 || !prevSub){md.append(count);} 1649 md.append((char)cha.get(rpos)); 1650 1651 count=0; 1652 rpos++; 1653 cpos++; 1654 prevSub=true; 1655 }else if(m=='N'){ 1656 1657 final byte r=cha.get(rpos); 1658 1659 if(c==r){//Act like match 1660 count++; 1661 rpos++; 1662 cpos++; 1663 }else{//Act like sub 1664 if(count>0 || !prevSub){md.append(count);} 1665 md.append((char)r); 1666 1667 count=0; 1668 rpos++; 1669 cpos++; 1670 prevSub=true; 1671 } 1672 }else if(m=='I' || m=='X' || m=='Y'){ 1673 cpos++; 1674 // count++; 1675 }else if(m=='D'){ 1676 // if(prevM!='D'){ 1677 // md.append(count); 1678 // count=0; 1679 // md.append('^'); 1680 // } 1681 // md.append((char)cha.get(rpos)); 1682 1683 rpos++; 1684 dels++; 1685 } 1686 prevM=m; 1687 1688 } 1689 // if(count>0){ 1690 md.append(count); 1691 // } 1692 1693 return md.toString(); 1694 } 1695 calcLeftClip(String cig, String id)1696 public static int calcLeftClip(String cig, String id){ 1697 if(cig==null){return 0;} 1698 int len=0; 1699 for(int i=0; i<cig.length(); i++){ 1700 char c=cig.charAt(i); 1701 if(Tools.isDigit(c)){ 1702 len=len*10+(c-'0'); 1703 }else{ 1704 assert(c!='S' || i<cig.length()-1);//ban entirely soft-clipped reads 1705 return (c=='S') ? len : 0; 1706 } 1707 } 1708 return 0; 1709 } 1710 calcRightClip(String cig, String id)1711 public static int calcRightClip(String cig, String id){ 1712 if(cig==null || cig.length()<1 || cig.charAt(cig.length()-1)!='S'){return 0;} 1713 int pos=cig.length()-2; 1714 for(; pos>=0 && Tools.isDigit(cig.charAt(pos)); pos--){} 1715 1716 assert(pos>0) : cig+", id="+id+", pos="+pos;//ban entirely soft-clipped reads 1717 1718 int len=0; 1719 for(int i=pos+1; i<cig.length(); i++){ 1720 char c=cig.charAt(i); 1721 if(Tools.isDigit(c)){ 1722 len=len*10+(c-'0'); 1723 }else{ 1724 return (c=='S') ? len : 0; 1725 } 1726 } 1727 return len; 1728 } 1729 makeOptionalTags(Read r, Read r2, boolean perfect, int scafloc, int scaflen, boolean inbounds, boolean inbounds2)1730 public ArrayList<String> makeOptionalTags(Read r, Read r2, boolean perfect, int scafloc, int scaflen, boolean inbounds, boolean inbounds2){ 1731 if(NO_TAGS){return null;} 1732 final boolean mapped=r.mapped(); 1733 if(!mapped && READGROUP_ID==null && !MAKE_CUSTOM_TAGS && !MAKE_TIME_TAG){return null;} 1734 1735 ArrayList<String> optionalTags=new ArrayList<String>(8); 1736 1737 if(mapped){ 1738 if(!r.secondary() && r.ambiguous()){optionalTags.add("XT:A:R");} //Not sure what do do for secondary alignments 1739 1740 // int nm=r.length(); 1741 // int dels=0; 1742 1743 int nm=0; 1744 1745 // //Only works for cigar strings in format 1.4+ 1746 // if(perfect){nm=0;}else if(cigar!=null){ 1747 // int len=0; 1748 // for(int i=0; i<cigar.length(); i++){ 1749 // char c=cigar.charAt(i); 1750 // if(Tools.isDigit(c)){ 1751 // len=len*10+(c-'0'); 1752 // }else{ 1753 // if(c=='X' || c=='I' || c=='D' || c=='M'){ 1754 // nm+=len; 1755 // } 1756 // len=0; 1757 // } 1758 // } 1759 //// System.err.println("\nRead "+r.id+": nm="+nm+"\n"+cigar+"\n"+new String(r.match)); 1760 // System.err.println("\nRead "+r.id+": nm="+nm); 1761 // } 1762 1763 if(perfect){nm=0;}else if(r.match!=null){ 1764 nm=0; 1765 int leftclip=calcLeftClip(cigar, r.id), rightclip=calcRightClip(cigar, r.id); 1766 final int from=leftclip, to=r.length()-rightclip; 1767 int delsCurrent=0; 1768 for(int i=0, cpos=0; i<r.match.length; i++){ 1769 final byte b=r.match[i]; 1770 1771 // System.err.println("i="+i+", cpos="+cpos+", from="+from+", ") 1772 1773 if(cpos>=from && cpos<to){ 1774 if(b=='I' || b=='S' || b=='N' || b=='X' || b=='Y'){nm++;} 1775 1776 if(b=='D'){delsCurrent++;} 1777 else{ 1778 if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;} 1779 delsCurrent=0; 1780 } 1781 } 1782 if(b!='D'){cpos++;} 1783 } 1784 if(delsCurrent<=INTRON_LIMIT){nm+=delsCurrent;} 1785 // assert(false) : nm+", "+dels+", "+delsCurrent+", "+r.length()+", "+r.match.length; 1786 1787 // assert(false) : "rlen="+r.length()+", nm="+nm+", dels="+delsCurrent+", intron="+INTRON_LIMIT+", inbound1="+inbounds+", ib2="+inbounds2+"\n"+new String(r.match); 1788 1789 // System.err.println("\nRead "+r.id+": left="+leftclip+", right="+rightclip+", nm="+nm+"\n"+cigar+"\n"+new String(r.match)); 1790 1791 } 1792 1793 if(MAKE_NM_TAG){ 1794 if(perfect){optionalTags.add("NM:i:0");} 1795 else if(r.match!=null){optionalTags.add("NM:i:"+(nm));} 1796 } 1797 if(MAKE_SM_TAG){optionalTags.add("SM:i:"+mapq);} 1798 if(MAKE_AM_TAG){optionalTags.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, r2.mapScore/r2.length()) : 0)));} 1799 1800 if(MAKE_TOPHAT_TAGS){ 1801 optionalTags.add("AS:i:0"); 1802 if(cigar==null || cigar.indexOf('N')<0){ 1803 optionalTags.add("XN:i:0"); 1804 }else{ 1805 } 1806 optionalTags.add("XM:i:0"); 1807 optionalTags.add("XO:i:0"); 1808 optionalTags.add("XG:i:0"); 1809 if(cigar==null || cigar.indexOf('N')<0){ 1810 optionalTags.add("YT:Z:UU"); 1811 }else{ 1812 } 1813 optionalTags.add("NH:i:1"); 1814 }else if(MAKE_XM_TAG){//XM tag. For bowtie compatibility; unfortunately it is poorly defined. 1815 int x=0; 1816 if(r.discarded() || (!r.ambiguous() && !mapped)){ 1817 x=0;//TODO: See if the flag needs to be present in this case. 1818 }else if(mapped){ 1819 x=1; 1820 if(r.numSites()>0 && r.numSites()>0){ 1821 int z=r.topSite().score; 1822 for(int i=1; i<r.sites.size(); i++){ 1823 SiteScore ss=r.sites.get(i); 1824 if(ss!=null && ss.score==z){x++;} 1825 } 1826 } 1827 if(r.ambiguous()){x=Tools.max(x, 2);} 1828 } 1829 if(x>=0){optionalTags.add("XM:i:"+x);} 1830 } 1831 1832 //XS tag 1833 if(MAKE_XS_TAG){ 1834 String xs=makeXSTag(r); 1835 if(xs!=null){ 1836 optionalTags.add(xs); 1837 assert(r2==null || r.pairnum()!=r2.pairnum()); 1838 // assert(r2==null || !r2.mapped() || r.strand()==r2.strand() || makeXSTag(r2)==xs) : 1839 // "XS problem:\n"+r+"\n"+r2+"\n"+xs+"\n"+makeXSTag(r2)+"\n"; 1840 } 1841 } 1842 1843 if(MAKE_MD_TAG){ 1844 String md=makeMdTag(r.chrom, r.start, r.match, r.bases, scafloc, scaflen); 1845 if(md!=null){optionalTags.add(md);} 1846 } 1847 1848 if(r.mapped() && MAKE_NH_TAG){ 1849 if(ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS && r.numSites()>1){ 1850 optionalTags.add("NH:i:"+r.sites.size()); 1851 }else{ 1852 optionalTags.add("NH:i:1"); 1853 } 1854 } 1855 1856 if(MAKE_STOP_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeStopTag(pos, r.length(), cigar, perfect));} 1857 1858 if(MAKE_LENGTH_TAG && (perfect || (r.match!=null && r.bases!=null))){optionalTags.add(makeLengthTag(pos, r.length(), cigar, perfect));} 1859 1860 if(MAKE_IDENTITY_TAG && (perfect || r.match!=null)){optionalTags.add(makeIdentityTag(r.match, perfect));} 1861 1862 if(MAKE_SCORE_TAG && r.mapped()){optionalTags.add(makeScoreTag(r.mapScore));} 1863 1864 if(MAKE_INSERT_TAG && r2!=null){ 1865 if((r.mapped() && r.paired()) || r.originalSite!=null){ 1866 optionalTags.add("X8:Z:"+r.insertSizeMapped(false)+(r.originalSite==null ? "" : ","+r.insertSizeOriginalSite())); 1867 } 1868 } 1869 if(MAKE_CORRECTNESS_TAG){ 1870 final SiteScore ss0=r.originalSite; 1871 if(ss0!=null){ 1872 optionalTags.add("X9:Z:"+(ss0.isCorrect(r.chrom, r.strand(), r.start, r.stop, 0) ? "T" : "F")); 1873 } 1874 } 1875 } 1876 1877 if(READGROUP_ID!=null){ 1878 assert(READGROUP_TAG!=null); 1879 optionalTags.add(READGROUP_TAG); 1880 } 1881 1882 if(MAKE_CUSTOM_TAGS){ 1883 int sites=r.numSites() + (r.originalSite==null ? 0 : 1); 1884 if(sites>0){ 1885 ByteBuilder sb=new ByteBuilder(); 1886 sb.append("X1:Z:"); 1887 if(r.sites!=null){ 1888 for(SiteScore ss : r.sites){ 1889 sb.append('$'); 1890 sb.append(ss.toText()); 1891 } 1892 } 1893 if(r.originalSite!=null){ 1894 sb.append('$'); 1895 sb.append('*'); 1896 sb.append(r.originalSite.toText()); 1897 } 1898 optionalTags.add(sb.toString()); 1899 } 1900 1901 if(mapped){ 1902 if(r.match!=null){ 1903 byte[] match=r.match; 1904 if(!r.shortmatch()){ 1905 match=Read.toShortMatchString(match); 1906 } 1907 optionalTags.add("X2:Z:"+new String(match)); 1908 } 1909 1910 optionalTags.add("X3:i:"+r.mapScore); 1911 } 1912 optionalTags.add("X5:Z:"+r.numericID); 1913 optionalTags.add("X6:i:"+(r.flags|(r.match==null ? 0 : Read.SHORTMATCHMASK))); 1914 if(r.copies>1){optionalTags.add("X7:i:"+r.copies);} 1915 } 1916 1917 if(MAKE_TIME_TAG){ 1918 assert(r.obj!=null && r.obj.getClass()==Long.class) : r.obj; 1919 optionalTags.add("X0:i:"+(r.obj==null ? 0 : r.obj)); 1920 } 1921 1922 if(MAKE_BOUNDS_TAG){ 1923 String a=(r.mapped() ? inbounds ? "I" : "O" : "U"); 1924 if(r2==null){ 1925 optionalTags.add("XB:Z:"+a); 1926 }else{ 1927 String b=(r2.mapped() ? inbounds2 ? "I" : "O" : "U"); 1928 optionalTags.add("XB:Z:"+a+b); 1929 } 1930 } 1931 1932 return optionalTags; 1933 } 1934 1935 /*--------------------------------------------------------------*/ 1936 /*---------------- ? ----------------*/ 1937 /*--------------------------------------------------------------*/ 1938 1939 /** Length of read bases */ length()1940 public int length(){ 1941 assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) : 1942 "This program requires bases or a cigar string for every sam line. Problem line:\n"+this+"\n"; 1943 return seq==null ? calcCigarBases(cigar, true, false) : seq.length; 1944 } 1945 1946 // public int length(boolean includeSoftClip){ 1947 // assert((seq!=null && (seq.length!=1 || seq[0]!='*')) || cigar!=null) : 1948 // "This program requires bases or a cigar string for every sam line. Problem line:\n"+this+"\n"; 1949 // return seq==null ? calcCigarBases(cigar, includeSoftClip, false) : seq.length; 1950 // } 1951 toMapq(Read r, SiteScore ss)1952 public static int toMapq(Read r, SiteScore ss){ 1953 assert(r!=null); 1954 int score=(ss==null ? r.mapScore : ss.slowScore); 1955 return toMapq(score, r.length(), r.mapped(), r.ambiguous()); 1956 } 1957 toMapq(int score, int length, boolean mapped, boolean ambig)1958 public static int toMapq(int score, int length, boolean mapped, boolean ambig){ 1959 if(!mapped || length<1){return 0;} 1960 1961 if(ambig && PENALIZE_AMBIG){ 1962 float max=3; 1963 float adjusted=(score*max)/(100f*length); 1964 return Tools.max(1, (int)Math.round(adjusted)); 1965 }else{ 1966 float score2=(score-length*40)*1.6f; 1967 float max=1.5f*((float)Tools.log2(length))+36; 1968 float adjusted=(score2*max)/(100f*length); 1969 return Tools.max(4, (int)Math.round(adjusted)); 1970 } 1971 } 1972 1973 parseName()1974 public Read parseName(){ 1975 try { 1976 String[] answer=qname.split("_"); 1977 long id=Long.parseLong(answer[0]); 1978 int trueChrom=Gene.toChromosome(answer[1]); 1979 byte trueStrand=Byte.parseByte(answer[2]); 1980 int trueLoc=Integer.parseInt(answer[3]); 1981 int trueStop=Integer.parseInt(answer[4]); 1982 // for(int i=0; i<quals.length; i++){quals[i]-=33;} 1983 // Read r=new Read(seq.getBytes(), trueChrom, trueStrand, trueLoc, trueStop, qname, quals, false, id); 1984 Read r=new Read(seq, qual, qname, id, trueStrand, trueChrom, trueLoc, trueStop); 1985 return r; 1986 } catch (NumberFormatException e) { 1987 // TODO Auto-generated catch block 1988 e.printStackTrace(); 1989 return null; 1990 } 1991 } 1992 parseNumericId()1993 public long parseNumericId(){ 1994 // return Long.parseLong(qname.substring(0, qname.indexOf('_'))); 1995 return Long.parseLong(qname.split("_")[1]); 1996 } 1997 toRead(boolean parseCustom)1998 public Read toRead(boolean parseCustom){ 1999 return toRead(parseCustom, false); 2000 } 2001 toRead(boolean parseCustom, boolean includeHardClip)2002 public Read toRead(boolean parseCustom, boolean includeHardClip){ 2003 2004 SiteScore originalSite=null; 2005 long numericId_=0; 2006 boolean synthetic=false; 2007 2008 if(parseCustom){ 2009 2010 2011 Header h=new Header(qname, pairnum()); 2012 2013 numericId_=h.id; 2014 int trueChrom=h.bbchrom; 2015 byte trueStrand=(byte)h.strand; 2016 int trueLoc=h.bbstart; 2017 int trueStop=h.bbstop(); 2018 2019 originalSite=new SiteScore(trueChrom, trueStrand, trueLoc, trueStop, 0, 0); 2020 synthetic=true; 2021 2022 // try { 2023 // String[] answer=qname.split("_"); 2024 // numericId_=Long.parseLong(answer[0]); 2025 // int trueChrom=Gene.toChromosome(answer[1]); 2026 // byte trueStrand=Byte.parseByte(answer[2]); 2027 // int trueLoc=Integer.parseInt(answer[3]); 2028 // int trueStop=Integer.parseInt(answer[4]); 2029 // 2030 // originalSite=new SiteScore(trueChrom, trueStrand, trueLoc, trueStop, 0, 0); 2031 // synthetic=true; 2032 // 2033 // } catch (NumberFormatException e) { 2034 // System.err.println("Failed to parse "+qname); 2035 // } catch (NullPointerException e) { 2036 // System.err.println("Bad read with no name."); 2037 // return null; 2038 // } 2039 } 2040 // assert(false) : originalSite; 2041 2042 2043 if(Data.GENOME_BUILD>=0){ 2044 2045 } 2046 2047 int chrom_=-1; 2048 byte strand_=strand(); 2049 int start_=start(true, includeHardClip); 2050 int stop_=stop(start_, true, includeHardClip); 2051 assert(start_<=stop_) : start_+", "+stop_+"\n"+this+"\n"; 2052 2053 if(Data.GENOME_BUILD>=0){ 2054 ScafLoc sc=null; 2055 if(RNAME_AS_BYTES){ 2056 if(rname!=null && (rname.length!=1 || rname[0]!='*')){ 2057 sc=Data.getScafLoc(rname); 2058 assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rname)+"\n"+this; 2059 } 2060 }else{ 2061 if(rnameS!=null && (rnameS.length()!=1 || rnameS.charAt(0)!='*')){ 2062 sc=Data.getScafLoc(rnameS); 2063 assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rnameS)+"\n"+this; 2064 } 2065 } 2066 if(sc!=null){ 2067 chrom_=sc.chrom; 2068 start_+=sc.loc; 2069 stop_+=sc.loc; 2070 } 2071 } 2072 2073 //// byte[] quals=(qual==null || (qual.length()==1 && qual.charAt(0)=='*')) ? null : qual.getBytes(); 2074 //// byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual.clone(); 2075 // byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual; 2076 // byte[] bases=seq==null ? null : seq.clone(); 2077 // if(strand_==Gene.MINUS){//Minus-mapped SAM lines have bases and quals reversed 2078 // AminoAcid.reverseComplementBasesInPlace(bases); 2079 // Tools.reverseInPlace(quals); 2080 // } 2081 // Read r=new Read(bases, chrom_, strand_, start_, stop_, qname, quals, cs_, numericId_); 2082 2083 final Read r; 2084 { 2085 byte[] seqX=(seq==null || (seq.length==1 && seq[0]=='*')) ? null : seq; 2086 byte[] qualX=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual; 2087 String qnameX=(qname==null || qname.equals(stringstar)) ? null : qname; 2088 r=new Read(seqX, qualX, qnameX, numericId_, strand_, chrom_, start_, stop_); 2089 } 2090 2091 r.setMapped(mapped()); 2092 r.setSynthetic(synthetic); 2093 // r.setPairnum(pairnum()); //TODO: Enable after fixing assertions that this will break in read input streams. 2094 if(originalSite!=null){ 2095 r.originalSite=originalSite; 2096 } 2097 2098 r.mapScore=mapq; 2099 r.setSecondary(!primary()); 2100 2101 // if(mapped()){ 2102 // r.list=new ArrayList<SiteScore>(1); 2103 // r.list.add(new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0)); 2104 // } 2105 2106 // System.out.println(optional); 2107 if(optional!=null){ 2108 for(String s : optional){ 2109 if(s.equals("XT:A:R")){ 2110 r.setAmbiguous(true); 2111 }else if(s.startsWith("X1:Z:")){ 2112 // System.err.println("Found X1 tag!\t"+s); 2113 String[] split=s.split("\\$"); 2114 // assert(false) : Arrays.toString(split); 2115 ArrayList<SiteScore> list=new ArrayList<SiteScore>(3); 2116 2117 for(int i=1; i<split.length; i++){ 2118 // System.err.println("Processing ss\t"+split[i]); 2119 String s2=split[i]; 2120 SiteScore ss=SiteScore.fromText(s2); 2121 if(s2.charAt(0)=='*'){ 2122 r.originalSite=ss; 2123 }else{ 2124 list.add(ss); 2125 } 2126 } 2127 // System.err.println("List size = "+list.size()); 2128 if(list.size()>0){r.sites=list;} 2129 }else if(s.startsWith("X2:Z:")){ 2130 String s2=s.substring(5); 2131 r.match=s2.getBytes(); 2132 }else if(s.startsWith("X3:i:")){ 2133 String s2=s.substring(5); 2134 // r.mapScore=Integer.parseInt(s2); //Messes up generation of ROC curve 2135 }else if(s.startsWith("X5:Z:")){ 2136 String s2=s.substring(5); 2137 r.numericID=Long.parseLong(s2); 2138 }else if(s.startsWith("X6:i:")){ 2139 String s2=s.substring(5); 2140 r.flags=Integer.parseInt(s2); 2141 }else if(s.startsWith("X7:i:")){ 2142 String s2=s.substring(5); 2143 r.copies=Integer.parseInt(s2); 2144 }else{ 2145 // System.err.println("Unknown SAM field:"+s); 2146 } 2147 } 2148 } 2149 // assert(false) : CONVERT_CIGAR_TO_MATCH; 2150 if(r.match==null && cigar!=null && (CONVERT_CIGAR_TO_MATCH || cigar.indexOf('=')>=0)){ 2151 // r.match=cigarToShortMatch(cigar, true); 2152 r.match=toShortMatch(false); 2153 2154 if(r.match!=null){ 2155 r.setShortMatch(true); 2156 if(Tools.indexOf(r.match, (byte)'B')>=0){ 2157 boolean success=r.fixMatchB(); 2158 // if(!success){r.match=null;} 2159 // assert(false) : new String(r.match); 2160 } 2161 // assert(false) : new String(r.match); 2162 } 2163 // assert(false) : new String(r.match); 2164 // System.err.println(">\n"+cigar+"\n"+(r.match==null ? "null" : new String(r.match))); 2165 } 2166 // assert(false) : new String(r.match); 2167 2168 // System.err.println("Resulting read: "+r.toText()); 2169 2170 return r; 2171 2172 } 2173 2174 /*--------------------------------------------------------------*/ 2175 /*---------------- toString ----------------*/ 2176 /*--------------------------------------------------------------*/ 2177 2178 /** Aproximate length of result of SamLine.toText() */ textLength()2179 public int textLength(){ 2180 int len=11; //11 tabs 2181 len+=(3+9+3+9); 2182 len+=(tlen>999 ? 9 : 3); 2183 2184 len+=(qname==null ? 1 : qname.length()); 2185 len+=rnameLen(); 2186 len+=(rnext==null ? 1 : rnext.length); 2187 len+=(cigar==null ? 1 : cigar.length()); 2188 len+=(seq==null ? 1 : seq.length); 2189 len+=(qual==null ? 1 : qual.length); 2190 2191 if(optional!=null){ 2192 len+=optional.size(); 2193 for(String s : optional){len+=s.length();} 2194 } 2195 return len; 2196 } 2197 toText()2198 public ByteBuilder toText(){return toBytes((ByteBuilder)null);} 2199 toBytes(ByteBuilder bb)2200 public ByteBuilder toBytes(ByteBuilder bb){ 2201 2202 final int buflen=Tools.max(rnameLen(), (rnext==null ? 1 : rnext.length), (seq==null ? 1 : seq.length), (qual==null ? 1 : qual.length)); 2203 2204 if(bb==null){bb=new ByteBuilder(textLength()+4);} 2205 if(qname==null){bb.append('*').append('\t');}else{bb.append(qname).append('\t');} 2206 bb.append(flag).append('\t'); 2207 if(RNAME_AS_BYTES){ 2208 assert(!(rname==null && rnameS!=null)); 2209 appendTo(bb, rname).append('\t'); 2210 }else{ 2211 assert(!(rname!=null && rnameS==null)); 2212 appendTo(bb, rnameS).append('\t'); 2213 } 2214 bb.append(pos).append('\t'); 2215 bb.append(mapq).append('\t'); 2216 if(cigar==null){bb.append('*').append('\t');}else{bb.append(cigar).append('\t');} 2217 appendTo(bb, rnext).append('\t'); 2218 bb.append(pnext).append('\t'); 2219 bb.append(tlen).append('\t'); 2220 2221 if(mapped() && strand()==Shared.MINUS){ 2222 appendReverseComplemented(bb, seq).append('\t'); 2223 appendQualReversed(bb, qual); 2224 }else{ 2225 appendTo(bb, seq).append('\t'); 2226 appendQual(bb, qual); 2227 } 2228 2229 // assert(seq.getClass()==String.class); 2230 // assert(qual.getClass()==String.class); 2231 // sb.append(seq).append('\t'); 2232 // sb.append(qual); 2233 2234 if(optional!=null){ 2235 for(String s : optional){ 2236 bb.tab().append(s); 2237 } 2238 } 2239 return bb; 2240 } 2241 2242 @Override toString()2243 public String toString(){return toBytes(null).toString();} 2244 appendTo(ByteBuilder sb, byte[] a)2245 private static ByteBuilder appendTo(ByteBuilder sb, byte[] a){ 2246 if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} 2247 return sb.append(a); 2248 } 2249 appendTo(ByteBuilder sb, String a)2250 private static ByteBuilder appendTo(ByteBuilder sb, String a){ 2251 if(a==null || a==stringstar || (a.length()==1 && a.charAt(0)=='*')){return sb.append('*');} 2252 return sb.append(a); 2253 } 2254 appendReverseComplemented(ByteBuilder sb, byte[] a)2255 private static ByteBuilder appendReverseComplemented(ByteBuilder sb, byte[] a){ 2256 if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} 2257 2258 sb.ensureExtra(a.length); 2259 byte[] buffer=sb.array; 2260 int i=sb.length; 2261 for(int j=a.length-1; j>=0; i++, j--){buffer[i]=AminoAcid.baseToComplementExtended[a[j]];} 2262 sb.length+=a.length; 2263 2264 return sb; 2265 } 2266 appendQual(ByteBuilder sb, byte[] a)2267 private static ByteBuilder appendQual(ByteBuilder sb, byte[] a){ 2268 if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} 2269 2270 sb.ensureExtra(a.length); 2271 byte[] buffer=sb.array; 2272 int i=sb.length; 2273 for(int j=0; j<a.length; i++, j++){buffer[i]=(byte)(a[j]+33);} 2274 sb.length+=a.length; 2275 2276 return sb; 2277 } 2278 appendQualReversed(ByteBuilder sb, byte[] a)2279 private static ByteBuilder appendQualReversed(ByteBuilder sb, byte[] a){ 2280 if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} 2281 2282 sb.ensureExtra(a.length); 2283 byte[] buffer=sb.array; 2284 int i=sb.length; 2285 for(int j=a.length-1; j>=0; i++, j--){buffer[i]=(byte)(a[j]+33);} 2286 sb.length+=a.length; 2287 2288 return sb; 2289 } 2290 2291 /** Assumes a custom name including original location */ originalContig()2292 public byte[] originalContig(){ 2293 // assert(PARSE_CUSTOM); 2294 int loc=-1; 2295 int count=0; 2296 for(int i=0; i<qname.length() && loc==-1; i++){ 2297 if(qname.charAt(i)=='_'){ 2298 count++; 2299 if(count==6){loc=i;} 2300 } 2301 } 2302 if(loc==-1){ 2303 return null; 2304 } 2305 return qname.substring(loc+1).getBytes(); 2306 } 2307 2308 2309 /*--------------------------------------------------------------*/ 2310 /*---------------- Flag ----------------*/ 2311 /*--------------------------------------------------------------*/ 2312 2313 // Bit Description 2314 // 0x1 template having multiple fragments in sequencing 2315 // 0x2 each fragment properly aligned according to the aligner 2316 // 0x4 fragment unmapped 2317 // 0x8 next fragment in the template unmapped 2318 // 0x10 SEQ being reverse complemented 2319 // 0x20 SEQ of the next fragment in the template being reversed 2320 // 0x40 the first fragment in the template 2321 // 0x80 the last fragment in the template 2322 // 0x100 secondary alignment 2323 // 0x200 not passing quality controls 2324 // 0x400 PCR or optical duplicate 2325 // 0x800 supplementary alignment 2326 2327 makeFlag(Read r, Read r2, int fragNum, boolean sameScaf)2328 public static int makeFlag(Read r, Read r2, int fragNum, boolean sameScaf){ 2329 int flag=0; 2330 if(r2!=null){ 2331 flag|=0x1; 2332 2333 if(r.mapped() && r.valid() && r.match!=null && 2334 (r2==null || (sameScaf && r.paired() && r2.mapped() && r2.valid() && r2.match!=null))){flag|=0x2;} 2335 if(fragNum==0){flag|=0x40;} 2336 if(fragNum>0){flag|=0x80;} 2337 } 2338 if(!r.mapped()){flag|=0x4;} 2339 if(r2!=null && !r2.mapped()){flag|=0x8;} 2340 if(r.strand()==Shared.MINUS){flag|=0x10;} 2341 if(r2!=null && r2.strand()==Shared.MINUS){flag|=0x20;} 2342 if(r.secondary()){flag|=0x100;} 2343 if(r.discarded()){flag|=0x200;} 2344 return flag; 2345 } 2346 2347 hasMate()2348 public boolean hasMate(){ 2349 return (flag&0x1)==0x1; 2350 } 2351 properPair()2352 public boolean properPair(){ 2353 return (flag&0x2)==0x2; 2354 } 2355 mapped(int flag)2356 public static boolean mapped(int flag){ 2357 return (flag&0x4)!=0x4; 2358 } 2359 strand(int flag)2360 public static byte strand(int flag){ 2361 return ((flag&0x10)==0x10 ? (byte)1 : (byte)0); 2362 } 2363 mapped()2364 public boolean mapped(){ 2365 return (flag&0x4)!=0x4; 2366 // 0x4 fragment unmapped 2367 // 0x8 next fragment in the template unmapped 2368 } 2369 nextMapped()2370 public boolean nextMapped(){ 2371 return (flag&0x8)!=0x8; 2372 // 0x4 fragment unmapped 2373 // 0x8 next fragment in the template unmapped 2374 } 2375 strand()2376 public byte strand(){ 2377 return ((flag&0x10)==0x10 ? (byte)1 : (byte)0); 2378 } 2379 mateStrand()2380 public byte mateStrand(){return nextStrand();} nextStrand()2381 public byte nextStrand(){ 2382 return ((flag&0x20)==0x20 ? (byte)1 : (byte)0); 2383 } 2384 firstFragment()2385 public boolean firstFragment(){ 2386 return (flag&0x40)==0x40; 2387 } 2388 lastFragment()2389 public boolean lastFragment(){ 2390 return (flag&0x80)==0x80; 2391 } 2392 pairnum()2393 public int pairnum(){ 2394 return firstFragment() ? 0 : lastFragment() ? 1 : 0; 2395 } 2396 primary()2397 public boolean primary(){return (flag&0x100)==0;} setPrimary(boolean b)2398 public void setPrimary(boolean b){ 2399 if(b){ 2400 flag=flag|0x100; 2401 }else{ 2402 flag=flag&~0x100; 2403 } 2404 } 2405 discarded()2406 public boolean discarded(){ 2407 return (flag&0x200)==0x200; 2408 } 2409 duplicate()2410 public boolean duplicate(){ 2411 return (flag&0x400)==0x400; 2412 } 2413 supplementary()2414 public boolean supplementary(){ 2415 return (flag&0x800)==0x800; 2416 } 2417 leftmost()2418 public boolean leftmost(){ 2419 if(!pairedOnSameChrom() || tlen==0){return true;} 2420 return tlen>0; 2421 } 2422 2423 /*--------------------------------------------------------------*/ 2424 /*---------------- ? ----------------*/ 2425 /*--------------------------------------------------------------*/ 2426 2427 // /** Assumes rname is an integer. */ 2428 // public int chrom(){ 2429 // if(Data.GENOME_BUILD<0){return -1;} 2430 // HashMap sc 2431 // } 2432 2433 /** Assumes rname is an integer. */ chrom_old()2434 public int chrom_old(){ 2435 assert(false); 2436 if(!Tools.isDigit(rname[0]) && !Tools.isDigit(rname[rname.length-1])){ 2437 if(warning){ 2438 warning=false; 2439 System.err.println("Warning - sam lines need a chrom field."); 2440 } 2441 return -1; 2442 } 2443 assert(Shared.anomaly || '*'==rname[0] || (Tools.isDigit(rname[0]) && Tools.isDigit(rname[rname.length-1]))) : 2444 "This is no longer correct, considering that sam lines are named by scaffold. They need a chrom field.\n"+new String(rname); 2445 if(rname==null || Arrays.equals(rname, bytestar) || !(Tools.isDigit(rname[0]) && Tools.isDigit(rname[rname.length-1]))){return -1;} 2446 //return Gene.toChromosome(new String(rname)); 2447 //return Integer.parseInt(new String(rname))); 2448 final byte z='0'; 2449 int x=rname[0]-z; 2450 for(int i=1; i<rname.length; i++){ 2451 x=(x*10)+(rname[i]-z); 2452 } 2453 return x; 2454 } 2455 2456 /** Returns the zero-based starting location of this read on the sequence. */ start(boolean includeSoftClip, boolean includeHardClip)2457 public int start(boolean includeSoftClip, boolean includeHardClip){ 2458 int x=countLeadingClip(cigar, includeSoftClip, includeHardClip); 2459 return pos-1-x; 2460 } 2461 2462 /** Returns the zero-based stop location of this read on the sequence. */ stop(int start, boolean includeSoftClip, boolean includeHardClip)2463 public int stop(int start, boolean includeSoftClip, boolean includeHardClip){ 2464 if(!mapped() || cigar==null || cigar.charAt(0)=='*'){ 2465 // return -1; 2466 return start+(seq==null ? 0 : Tools.max(0, seq.length-1)); 2467 } 2468 int r=start+calcCigarLength(cigar, includeSoftClip, includeHardClip)-1; 2469 2470 // assert(false) : start+", "+r+", "+calcCigarLength(cigar, includeHardClip); 2471 // System.err.println("start= "+start+", stop="+r); 2472 return r; 2473 } 2474 stop2(final int start, final boolean includeSoftClip, final boolean includeHardClip)2475 public int stop2(final int start, final boolean includeSoftClip, final boolean includeHardClip){ 2476 if(mapped() && cigar!=null && cigar.charAt(0)!='*'){return stop(start, includeSoftClip, includeHardClip);} 2477 // return (seq==null ? -1 : start()+seq.length()); 2478 return (seq==null ? -1 : start+seq.length); 2479 } 2480 numericId()2481 public long numericId(){ 2482 return 0; 2483 } 2484 2485 /** This includes half-mapped pairs. */ pairedOnSameChrom()2486 public boolean pairedOnSameChrom(){ 2487 // assert(false) : (rname==null ? "nullX" : new String(rname))+", "+ 2488 // (rnext==null ? "nullX" : new String(rnext))+", "+Tools.equals(rnext, byteequals)+", "+Arrays.equals(rname, rnext)+"\n"+this; 2489 if(RNAME_AS_BYTES){ 2490 return Tools.equals(rnext, byteequals) || Arrays.equals(rname, rnext) || (/*pairnum()==1 &&*/ Tools.equals(rname, byteequals)); 2491 }else{ 2492 return Tools.equals(rnext, byteequals) || Tools.equals(rnameS, rnext) || (/*pairnum()==1 &&*/ stringequals.equals(rnameS)); 2493 } 2494 } 2495 2496 /** Assumes a custom name including original location */ originalContigStart()2497 public int originalContigStart(){ 2498 // assert(PARSE_CUSTOM); 2499 int loc=-1; 2500 int count=0; 2501 for(int i=0; i<qname.length() && loc==-1; i++){ 2502 if(qname.charAt(i)=='_'){ 2503 count++; 2504 if(count==5){loc=i;} 2505 } 2506 } 2507 if(loc==-1){ 2508 return -1; 2509 } 2510 2511 int sum=0; 2512 int mult=1; 2513 for(int i=loc+1; i<qname.length(); i++){ 2514 char c=qname.charAt(i); 2515 if(!Tools.isDigit(c)){ 2516 if(i==loc+1 && c=='-'){mult=-1;} 2517 else{break;} 2518 }else{ 2519 sum=(sum*10)+(c-'0'); 2520 } 2521 } 2522 return sum*mult; 2523 } 2524 2525 /*--------------------------------------------------------------*/ 2526 /*---------------- Getters ----------------*/ 2527 /*--------------------------------------------------------------*/ 2528 rnameLen()2529 public int rnameLen(){ 2530 return (rname==null ? rnameS==null ? 1 : rnameS.length() : rname.length); 2531 } 2532 rname()2533 public byte[] rname(){ 2534 assert(RNAME_AS_BYTES); 2535 return rname; 2536 } rnext()2537 public byte[] rnext(){return rnext;} 2538 setRname(byte[] x)2539 public void setRname(byte[] x){assert(RNAME_AS_BYTES);rname=x;} setRnext(byte[] x)2540 public void setRnext(byte[] x){rnext=x;} 2541 setRname(String x)2542 public void setRname(String x){assert(!RNAME_AS_BYTES);rnameS=x;} setRnext(String x)2543 public void setRnext(String x){rnext=(x==null ? null : x.getBytes());} 2544 rnameS()2545 public String rnameS(){return rnameS!=null ? rnameS : rname==null ? null : new String(rname);} rnextS()2546 public String rnextS(){return rnext==null ? null : new String(rnext);} 2547 2548 /*--------------------------------------------------------------*/ 2549 /*---------------- Fields ----------------*/ 2550 /*--------------------------------------------------------------*/ 2551 mdTag()2552 private String mdTag(){ 2553 if(optional==null){return null;} 2554 for(String s : optional){ 2555 if(s.startsWith("MD:Z:")){return s;} 2556 } 2557 return null; 2558 } 2559 setScafnum(ScafMap scafMap)2560 public int setScafnum(ScafMap scafMap) { 2561 assert(scafnum<0); 2562 2563 String name=null; 2564 if(mapped() || (rname!=null && rname!=byteequals && rname!=bytestar)){ 2565 name=rnameS(); 2566 }else if(nextMapped() && rnext!=null && rnext!=byteequals && rnext!=bytestar){ 2567 name=new String(rnext); 2568 } 2569 if(name!=null){scafnum=scafMap.getNumber(name);} 2570 return scafnum; 2571 } 2572 2573 public long countBytes(){ 2574 long sum=76; 2575 sum+=(cigar==null ? 0 : cigar.length()*2+16); 2576 sum+=(optional==null ? 0 : optional.size()*32+16); 2577 sum+=(rname==null ? 0 : rname.length+16); 2578 sum+=(rnext==null ? 0 : rnext.length+16); 2579 return sum; 2580 } 2581 2582 public String qname; 2583 public int flag; 2584 public int pos; 2585 public int mapq; 2586 public String cigar; 2587 public int pnext; 2588 public int tlen; 2589 public byte[] seq; 2590 public byte[] qual; 2591 public ArrayList<String> optional; 2592 2593 public Object obj; 2594 public int scafnum=-1; 2595 2596 /*--------------------------------------------------------------*/ 2597 /*---------------- Private Fields ----------------*/ 2598 /*--------------------------------------------------------------*/ 2599 2600 private byte[] rname; 2601 private byte[] rnext; 2602 2603 private String rnameS; 2604 2605 /*--------------------------------------------------------------*/ 2606 /*---------------- Static Fields ----------------*/ 2607 /*--------------------------------------------------------------*/ 2608 2609 private static final String stringstar="*"; 2610 private static final String stringequals="="; 2611 private static final byte[] bytestar=new byte[] {(byte)'*'}; 2612 private static final byte[] byteequals=new byte[] {(byte)'='}; 2613 private static final String XSPLUS="XS:A:+", XSMINUS="XS:A:-"; 2614 // private static final double inv100=0.01d; 2615 // private static float minratio=0.4f; 2616 2617 private static boolean warning=System.getProperty("user.dir").contains("/bushnell/"); 2618 2619 /*--------------------------------------------------------------*/ 2620 /*---------------- Public Static Fields ----------------*/ 2621 /*--------------------------------------------------------------*/ 2622 2623 public static boolean makeReadgroupTags(){ 2624 return READGROUP_ID!=null || READGROUP_CN!=null || READGROUP_DS!=null || READGROUP_DT!=null || 2625 READGROUP_FO!=null || READGROUP_KS!=null || READGROUP_LB!=null || READGROUP_PG!=null || 2626 READGROUP_PI!=null || READGROUP_PL!=null || READGROUP_PU!=null || READGROUP_SM!=null || 2627 READGROUP_TAG!=null; 2628 } 2629 2630 public static boolean makeOtherTags(){ 2631 if(NO_TAGS){return false;} 2632 return MAKE_AM_TAG || MAKE_NM_TAG || MAKE_SM_TAG || MAKE_XM_TAG || MAKE_XS_TAG || MAKE_AS_TAG || 2633 MAKE_NH_TAG || MAKE_TOPHAT_TAGS || MAKE_IDENTITY_TAG || MAKE_SCORE_TAG || MAKE_STOP_TAG || MAKE_LENGTH_TAG || 2634 MAKE_CUSTOM_TAGS || MAKE_INSERT_TAG || MAKE_CORRECTNESS_TAG || MAKE_TIME_TAG || MAKE_BOUNDS_TAG; 2635 } 2636 2637 public static boolean makeAnyTags(){ 2638 return makeReadgroupTags() || makeOtherTags(); 2639 } 2640 2641 public static String READGROUP_ID=null; 2642 public static String READGROUP_CN=null; 2643 public static String READGROUP_DS=null; 2644 public static String READGROUP_DT=null; 2645 public static String READGROUP_FO=null; 2646 public static String READGROUP_KS=null; 2647 public static String READGROUP_LB=null; 2648 public static String READGROUP_PG=null; 2649 public static String READGROUP_PI=null; 2650 public static String READGROUP_PL=null; 2651 public static String READGROUP_PU=null; 2652 public static String READGROUP_SM=null; 2653 2654 public static String READGROUP_TAG=null; 2655 2656 /** Turn this off for RNAseq or long indels */ 2657 public static boolean MAKE_MD_TAG=false; 2658 2659 public static boolean NO_TAGS=false; 2660 2661 public static boolean MAKE_AM_TAG=true; 2662 public static boolean MAKE_NM_TAG=true; 2663 public static boolean MAKE_SM_TAG=false; 2664 public static boolean MAKE_XM_TAG=false; 2665 public static boolean MAKE_XS_TAG=false; 2666 public static boolean MAKE_AS_TAG=false; //TODO: Alignment score from aligner 2667 public static boolean MAKE_NH_TAG=false; 2668 public static boolean MAKE_TOPHAT_TAGS=false; 2669 public static boolean XS_SECONDSTRAND=false; 2670 public static boolean MAKE_IDENTITY_TAG=false; 2671 public static boolean MAKE_SCORE_TAG=false; 2672 public static boolean MAKE_STOP_TAG=false; 2673 public static boolean MAKE_LENGTH_TAG=false; 2674 public static boolean MAKE_CUSTOM_TAGS=false; 2675 public static boolean MAKE_INSERT_TAG=false; 2676 public static boolean MAKE_CORRECTNESS_TAG=false; 2677 public static boolean MAKE_TIME_TAG=false; 2678 public static boolean MAKE_BOUNDS_TAG=false; 2679 2680 public static boolean PENALIZE_AMBIG=true; 2681 public static boolean CONVERT_CIGAR_TO_MATCH=true; 2682 public static boolean SOFT_CLIP=true; 2683 public static boolean SECONDARY_ALIGNMENT_ASTERISKS=true; 2684 /** OK to use the "setFrom" function which uses the old SamLine instead of translating the read, if a genome is not loaded. */ 2685 public static boolean SET_FROM_OK=false; 2686 /** For paired reads, keep original names rather than changing read2's name to match read1 */ 2687 public static boolean KEEP_NAMES=false; 2688 public static float VERSION=1.4f; 2689 /** Tells program when to use 'N' rather than 'D' in cigar strings */ 2690 public static int INTRON_LIMIT=Integer.MAX_VALUE; 2691 public static boolean RNAME_AS_BYTES=true;//Effect on speed is negligible for pileup... 2692 2693 /** Prefer MD tag over reference for translating cigar strings to match */ 2694 public static boolean PREFER_MDTAG=false; 2695 /** Determine whether cigar X means match N or S. 2696 * This makes sam loading substantially slower. */ 2697 public static boolean FIX_MATCH_NS=false; 2698 2699 public static boolean setxs=false; 2700 public static boolean setintron=false; 2701 2702 // /** SSAHA2 incorrectly calculates the start position of reads with soft-clipped starts, and needs this enabled. */ 2703 // public static boolean SUBTRACT_LEADING_SOFT_CLIP=true; 2704 /** Sort header scaffolds in alphabetical order to be more compatible with Tophat */ 2705 public static boolean SORT_SCAFFOLDS=false; 2706 2707 /** qname */ 2708 public static boolean PARSE_0=true; 2709 /** rname */ 2710 public static boolean PARSE_2=true; 2711 /** cigar */ 2712 public static boolean PARSE_5=true; 2713 /** rnext */ 2714 public static boolean PARSE_6=true; 2715 /** pnext */ 2716 public static boolean PARSE_7=true; 2717 /** tlen */ 2718 public static boolean PARSE_8=true; 2719 /** qual */ 2720 public static boolean PARSE_10=true; 2721 public static boolean PARSE_OPTIONAL=true; 2722 public static boolean PARSE_OPTIONAL_MD_ONLY=false; 2723 2724 public static boolean FLIP_ON_LOAD=true; 2725 2726 public static boolean verbose=false; 2727 2728 } 2729