1 package org.broadinstitute.hellbender.utils.read; 2 3 import htsjdk.samtools.*; 4 import htsjdk.samtools.util.SequenceUtil; 5 import htsjdk.variant.variantcontext.Allele; 6 import org.apache.commons.lang3.ArrayUtils; 7 import org.broadinstitute.hellbender.exceptions.GATKException; 8 import org.broadinstitute.hellbender.utils.Utils; 9 import org.broadinstitute.hellbender.utils.param.ParamUtils; 10 import org.broadinstitute.hellbender.utils.pileup.PileupElement; 11 12 import java.util.*; 13 14 public final class ArtificialReadUtils { 15 public static final int DEFAULT_READ_LENGTH = 50; 16 17 private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup"; 18 private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane"; 19 private static final String DEFAULT_PLATFORM_PREFIX = "Platform"; 20 private static final String DEFAULT_SAMPLE_NAME = "SampleX"; 21 private static final String DEFAULT_PROGRAM_NAME = "Program"; 22 public static final String READ_GROUP_ID = "x"; 23 24 /** 25 * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc 26 * 27 * @param numberOfChromosomes the number of chromosomes to create 28 * @param startingChromosome the starting number for the chromosome (most likely set to 1) 29 * @param chromosomeSize the length of each chromosome 30 * @return 31 */ createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize)32 public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { 33 SAMFileHeader header = new SAMFileHeader(); 34 header.setSortOrder(SAMFileHeader.SortOrder.coordinate); 35 SAMSequenceDictionary dict = new SAMSequenceDictionary(); 36 // make up some sequence records 37 for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { 38 SAMSequenceRecord rec = new SAMSequenceRecord( Integer.toString(x), chromosomeSize /* size */); 39 rec.setSequenceLength(chromosomeSize); 40 dict.addSequence(rec); 41 } 42 header.setSequenceDictionary(dict); 43 return header; 44 } 45 46 /** 47 * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc 48 * It also adds read groups. 49 * 50 * @param numberOfChromosomes the number of chromosomes to create 51 * @param startingChromosome the starting number for the chromosome (most likely set to 1) 52 * @param chromosomeSize the length of each chromosome 53 * @param groupCount the number of groups to make 54 */ createArtificialSamHeaderWithGroups(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int groupCount)55 public static SAMFileHeader createArtificialSamHeaderWithGroups(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int groupCount) { 56 final SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); 57 58 final List<SAMReadGroupRecord> readGroups = new ArrayList<>(); 59 for (int i = 0; i < groupCount; i++) { 60 SAMReadGroupRecord rec = new SAMReadGroupRecord(DEFAULT_READ_GROUP_PREFIX + i); 61 rec.setSample(DEFAULT_SAMPLE_NAME); 62 readGroups.add(rec); 63 } 64 header.setReadGroups(readGroups); 65 66 for (int i = 0; i < groupCount; i++) { 67 final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroups.get(i).getId()); 68 groupRecord.setPlatform(DEFAULT_PLATFORM_PREFIX + ((i % 2)+1)); 69 groupRecord.setPlatformUnit(DEFAULT_PLATFORM_UNIT_PREFIX + ((i % 3)+1)); 70 } 71 return header; 72 } 73 74 /** 75 * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc 76 * It also adds program records. 77 * 78 * @param numberOfChromosomes the number of chromosomes to create 79 * @param startingChromosome the starting number for the chromosome (most likely set to 1) 80 * @param chromosomeSize the length of each chromosome 81 * @param programCount the number of programs to make 82 * @return 83 */ createArtificialSamHeaderWithPrograms(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int programCount)84 public static SAMFileHeader createArtificialSamHeaderWithPrograms(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int programCount) { 85 final SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); 86 87 final List<SAMProgramRecord> programRecords = new ArrayList<>(); 88 for (int i = 0; i < programCount; i++) { 89 final SAMProgramRecord rec = new SAMProgramRecord(Integer.toString(i)); 90 rec.setCommandLine("run " + Integer.toString(i)); 91 rec.setProgramVersion("1.0"); 92 if (i > 0) { 93 rec.setPreviousProgramGroupId(Integer.toString(i-1)); 94 } 95 rec.setProgramName(DEFAULT_PROGRAM_NAME + i); 96 programRecords.add(rec); 97 } 98 header.setProgramRecords(programRecords); 99 100 return header; 101 } 102 103 /** 104 * Creates an artificial SAM header with standard test parameters 105 * 106 * @return the SAM header 107 */ createArtificialSamHeader()108 public static SAMFileHeader createArtificialSamHeader() { 109 return createArtificialSamHeader(22, 1, 1000000); 110 } 111 112 /** 113 * Creates an artificial SAM header with standard test parameters and a Read Group 114 * 115 * @param readGroup read group 116 * @return the SAM header 117 */ createArtificialSamHeaderWithReadGroup( final SAMReadGroupRecord readGroup )118 public static SAMFileHeader createArtificialSamHeaderWithReadGroup( final SAMReadGroupRecord readGroup ) { 119 final SAMFileHeader header = createArtificialSamHeader(); 120 header.addReadGroup(readGroup); 121 return header; 122 } 123 124 /** 125 * Create an artificial GATKRead based on the parameters. The cigar string will be *M, where * is the length of the read 126 * 127 * @param header the SAM header to associate the read with 128 * @param name the name of the read 129 * @param refIndex the reference index, i.e. what chromosome to associate it with 130 * @param alignmentStart where to start the alignment 131 * @param length the length of the read 132 * @return the artificial GATKRead 133 */ createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length)134 public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { 135 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, length)); 136 } 137 138 /** 139 * Create an artificial GATKRead based on the parameters. The cigar string will be *M, where * is the length of the read 140 * 141 * @param header the SAM header to associate the read with 142 * @param name the name of the read 143 * @param refIndex the reference index, i.e. what chromosome to associate it with 144 * @param alignmentStart where to start the alignment 145 * @param bases the sequence of the read 146 * @param qual the qualities of the read 147 * @return the artificial GATKRead 148 */ createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual)149 public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { 150 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual)); 151 } 152 153 /** 154 * Create an artificial GATKRead based on the parameters. The cigar string will be *M, where * is the length of the read 155 * 156 * @param header the SAM header to associate the read with 157 * @param name the name of the read 158 * @param contig the contig to which the read is aligned 159 * @param alignmentStart where to start the alignment 160 * @param bases the sequence of the read 161 * @param qual the qualities of the read 162 * @return the artificial GATKRead 163 */ createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual)164 public static GATKRead createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual) { 165 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, header.getSequenceIndex(contig), alignmentStart, bases, qual)); 166 } 167 168 /** 169 * Create an artificial GATKRead based on the parameters 170 * 171 * @param header the SAM header to associate the read with 172 * @param name the name of the read 173 * @param refIndex the reference index, i.e. what chromosome to associate it with 174 * @param alignmentStart where to start the alignment 175 * @param bases the sequence of the read 176 * @param qual the qualities of the read 177 * @param cigar the cigar string of the read 178 * @return the artificial GATKRead 179 */ createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar)180 public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { 181 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual, cigar)); 182 } 183 184 185 /** 186 * Create an artificial GATKRead based on the parameters 187 * 188 * @param header the SAM header to associate the read with 189 * @param name the name of the read 190 * @param contig the contig to which the read is aligned 191 * @param alignmentStart where to start the alignment 192 * @param bases the sequence of the read 193 * @param qual the qualities of the read 194 * @param cigar the cigar string of the read 195 * @return the artificial GATKRead 196 */ createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual, String cigar)197 public static GATKRead createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual, String cigar) { 198 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, header.getSequenceIndex(contig), alignmentStart, bases, qual, cigar)); 199 } 200 201 /** 202 * Create an artificial GATKRead with the following default parameters : 203 * header: 204 * numberOfChromosomes = 1 205 * startingChromosome = 1 206 * chromosomeSize = 1000000 207 * read: 208 * name = "default_read" 209 * refIndex = 0 210 * alignmentStart = 10000 211 * 212 * @param header SAM header for the read 213 * @param bases the sequence of the read 214 * @param qual the qualities of the read 215 * @param cigar the cigar string of the read 216 * @return the artificial GATKRead 217 */ createArtificialRead(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar)218 public static GATKRead createArtificialRead(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar) { 219 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar)); 220 } 221 createArtificialRead(final byte[] bases, final byte[] qual, final String cigar)222 public static GATKRead createArtificialRead(final byte[] bases, final byte[] qual, final String cigar) { 223 SAMFileHeader header = createArtificialSamHeader(); 224 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar)); 225 } 226 createArtificialRead(final SAMFileHeader header, final String cigarString)227 public static GATKRead createArtificialRead(final SAMFileHeader header, final String cigarString) { 228 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, TextCigarCodec.decode(cigarString))); 229 } 230 createArtificialRead(final SAMFileHeader header, final Cigar cigar)231 public static GATKRead createArtificialRead(final SAMFileHeader header, final Cigar cigar) { 232 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, cigar)); 233 } 234 createArtificialRead(final Cigar cigar)235 public static GATKRead createArtificialRead(final Cigar cigar) { 236 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(cigar)); 237 } 238 239 /** 240 * Makes a new read with a name that is unique (so that it will return false to equals(otherRead) 241 */ createUniqueArtificialRead(final Cigar cigar)242 public static GATKRead createUniqueArtificialRead(final Cigar cigar) { 243 return new SAMRecordToGATKReadAdapter(createUniqueArtificialSAMRecord(cigar)); 244 } 245 createArtificialRead(final Cigar cigar, final String name)246 public static GATKRead createArtificialRead(final Cigar cigar, final String name) { 247 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(createArtificialSamHeader(), cigar, name)); 248 } 249 createArtificialRead(final String cigarString)250 public static GATKRead createArtificialRead(final String cigarString) { 251 return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(TextCigarCodec.decode(cigarString))); 252 } 253 createArtificialUnmappedRead(final SAMFileHeader header, final byte[] bases, final byte[] qual)254 public static GATKRead createArtificialUnmappedRead(final SAMFileHeader header, final byte[] bases, final byte[] qual) { 255 final SAMRecord read = new SAMRecord(header); 256 read.setReadUnmappedFlag(true); 257 read.setReadBases(bases); 258 read.setBaseQualities(qual); 259 260 return new SAMRecordToGATKReadAdapter(read); 261 } 262 createArtificialUnmappedReadWithAssignedPosition(final SAMFileHeader header, final String contig, final int alignmentStart, final byte[] bases, final byte[] qual)263 public static GATKRead createArtificialUnmappedReadWithAssignedPosition(final SAMFileHeader header, final String contig, final int alignmentStart, final byte[] bases, final byte[] qual) { 264 final SAMRecord read = new SAMRecord(header); 265 read.setReferenceName(contig); 266 read.setAlignmentStart(alignmentStart); 267 read.setReadUnmappedFlag(true); 268 read.setReadBases(bases); 269 read.setBaseQualities(qual); 270 271 return new SAMRecordToGATKReadAdapter(read); 272 } 273 274 /** 275 * Makes a new read with a name that is unique (so that it will return false to equals(otherRead) 276 */ createUniqueArtificialRead(final String cigarString)277 public static GATKRead createUniqueArtificialRead(final String cigarString) { 278 return new SAMRecordToGATKReadAdapter(createUniqueArtificialSAMRecord(TextCigarCodec.decode(cigarString))); 279 } 280 281 /** 282 * Creates an artificial GATKRead backed by a SAMRecord. 283 * 284 * The read will consist of the specified number of Q30 'A' bases, and will be 285 * mapped to contig "1" at the specified start position. 286 * 287 * @param name name of the new read 288 * @param start start position of the new read 289 * @param length number of bases in the new read 290 * @return an artificial GATKRead backed by a SAMRecord. 291 */ createSamBackedRead( final String name, final int start, final int length )292 public static GATKRead createSamBackedRead( final String name, final int start, final int length ) { 293 return createSamBackedRead(name, "1", start, length); 294 } 295 296 /** 297 * Creates an artificial GATKRead backed by a SAMRecord. 298 * 299 * The read will consist of the specified number of Q30 'A' bases, and will be 300 * mapped to the specified contig at the specified start position. 301 * 302 * @param name name of the new read 303 * @param contig contig the new read is mapped to 304 * @param start start position of the new read 305 * @param length number of bases in the new read 306 * @return an artificial GATKRead backed by a SAMRecord. 307 */ createSamBackedRead( final String name, final String contig, final int start, final int length )308 public static GATKRead createSamBackedRead( final String name, final String contig, final int start, final int length ) { 309 final SAMFileHeader header = createArtificialSamHeader(); 310 final byte[] bases = Utils.dupBytes((byte)'A', length); 311 final byte[] quals = Utils.dupBytes((byte) 30, length); 312 313 final SAMRecord sam = createArtificialSAMRecord(header, bases, quals, length + "M"); 314 sam.setReadName(name); 315 sam.setReferenceName(contig); 316 sam.setAlignmentStart(start); 317 return new SAMRecordToGATKReadAdapter(sam); 318 } 319 320 /** 321 * Creates an artificial GATKRead backed by a SAMRecord with no header. 322 * 323 * The read will consist of the specified number of Q30 'A' bases, and will be 324 * mapped to the specified contig at the specified start position. 325 * 326 * @param name name of the new read 327 * @param contig contig the new read is mapped to 328 * @param start start position of the new read 329 * @param length number of bases in the new read 330 * @return an artificial GATKRead backed by a SAMRecord. 331 */ createHeaderlessSamBackedRead( final String name, final String contig, final int start, final int length )332 public static GATKRead createHeaderlessSamBackedRead( final String name, final String contig, final int start, final int length ) { 333 GATKRead read = createSamBackedRead(name, contig, start, length); 334 ((SAMRecordToGATKReadAdapter) read).getEncapsulatedSamRecord().setHeaderStrict(null); 335 return read; 336 } 337 338 /** 339 * Create an artificial SAMRecord based on the parameters. The cigar string will be *M, where * is the length of the read 340 * 341 * @param header the SAM header to associate the read with 342 * @param name the name of the read 343 * @param refIndex the reference index, i.e. what chromosome to associate it with 344 * @param alignmentStart where to start the alignment 345 * @param length the length of the read 346 * @return the artificial SAMRecord 347 */ createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length)348 public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { 349 if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || 350 (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) 351 throw new IllegalArgumentException("Invalid alignment start for artificial read, start = " + alignmentStart); 352 SAMRecord record = new SAMRecord(header); 353 record.setReadName(name); 354 record.setReferenceIndex(refIndex); 355 record.setAlignmentStart(alignmentStart); 356 List<CigarElement> elements = new ArrayList<>(); 357 elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); 358 record.setCigar(new Cigar(elements)); 359 record.setProperPairFlag(false); 360 361 // our reads and quals are all 'A's by default 362 byte[] c = new byte[length]; 363 byte[] q = new byte[length]; 364 for (int x = 0; x < length; x++) 365 c[x] = q[x] = 'A'; 366 record.setReadBases(c); 367 record.setBaseQualities(q); 368 369 if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { 370 record.setReadUnmappedFlag(true); 371 } 372 373 return record; 374 } 375 376 /** 377 * Create an artificial SAMRecord based on the parameters. The cigar string will be *M, where * is the length of the read 378 * 379 * @param header the SAM header to associate the read with 380 * @param name the name of the read 381 * @param refIndex the reference index, i.e. what chromosome to associate it with 382 * @param alignmentStart where to start the alignment 383 * @param bases the sequence of the read 384 * @param qual the qualities of the read 385 * @return the artificial SAMRecord 386 */ createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual)387 public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { 388 if (bases.length != qual.length) { 389 throw new IllegalArgumentException("Passed in read string is different length then the quality array"); 390 } 391 SAMRecord rec = createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases.length); 392 rec.setReadBases(Arrays.copyOf(bases, bases.length)); 393 rec.setBaseQualities(Arrays.copyOf(qual, qual.length)); 394 rec.setAttribute(SAMTag.RG.name(), new SAMReadGroupRecord(READ_GROUP_ID).getId()); 395 if (refIndex == -1) { 396 rec.setReadUnmappedFlag(true); 397 } 398 399 return rec; 400 } 401 402 /** 403 * Create an artificial SAMRecord based on the parameters 404 * 405 * @param header the SAM header to associate the read with 406 * @param name the name of the read 407 * @param refIndex the reference index, i.e. what chromosome to associate it with 408 * @param alignmentStart where to start the alignment 409 * @param bases the sequence of the read 410 * @param qual the qualities of the read 411 * @param cigar the cigar string of the read 412 * @return the artificial SAMRecord 413 */ createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar)414 public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { 415 SAMRecord rec = createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual); 416 rec.setCigarString(cigar); 417 return rec; 418 } 419 420 /** 421 * Create an artificial SAMRecord with the following default parameters : 422 * header: 423 * numberOfChromosomes = 1 424 * startingChromosome = 1 425 * chromosomeSize = 1000000 426 * read: 427 * name = "default_read" 428 * refIndex = 0 429 * alignmentStart = 10000 430 * 431 * @param header SAM header for the read 432 * @param bases the sequence of the read 433 * @param qual the qualities of the read 434 * @param cigar the cigar string of the read 435 * @return the artificial SAMRecord 436 */ createArtificialSAMRecord(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar)437 public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar) { 438 return createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar); 439 } 440 createArtificialSAMRecord(final byte[] bases, final byte[] qual, final String cigar)441 public static SAMRecord createArtificialSAMRecord(final byte[] bases, final byte[] qual, final String cigar) { 442 final SAMFileHeader header = createArtificialSamHeader(); 443 return createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar); 444 } 445 createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar, final String name)446 public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar, final String name) { 447 final int length = cigar.getReadLength(); 448 final byte base = 'A'; 449 final byte qual = 30; 450 final byte [] bases = Utils.dupBytes(base, length); 451 final byte [] quals = Utils.dupBytes(qual, length); 452 return createArtificialSAMRecord(header, name, 0, 10000, bases, quals, cigar.toString()); 453 } 454 createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar)455 public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar) { 456 return createArtificialSAMRecord(header, cigar, "default_read"); 457 } 458 createArtificialSAMRecord(final Cigar cigar)459 public static SAMRecord createArtificialSAMRecord(final Cigar cigar) { 460 final SAMFileHeader header = createArtificialSamHeader(); 461 return createArtificialSAMRecord(header, cigar, "default_read"); 462 } 463 464 /** 465 * Makes a new read with a name that is unique (so that it will return false to equals(otherRead) 466 */ createUniqueArtificialSAMRecord(final Cigar cigar)467 public static SAMRecord createUniqueArtificialSAMRecord(final Cigar cigar) { 468 final SAMFileHeader header = createArtificialSamHeader(); 469 return createArtificialSAMRecord(header, cigar, UUID.randomUUID().toString()); 470 } 471 createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative)472 public static List<GATKRead> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { 473 return createPair(header, name, readLen, 0, leftStart, rightStart, leftIsFirst, leftIsNegative); 474 } 475 createPair(SAMFileHeader header, String name, int readLen, int refIndex, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative)476 public static List<GATKRead> createPair(SAMFileHeader header, String name, int readLen, int refIndex, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { 477 GATKRead left = createArtificialRead(header, name, refIndex, leftStart, readLen); 478 GATKRead right = createArtificialRead(header, name, refIndex, rightStart, readLen); 479 480 left.setIsPaired(true); 481 right.setIsPaired(true); 482 483 left.setIsProperlyPaired(true); 484 right.setIsProperlyPaired(true); 485 486 if ( leftIsFirst ) { 487 left.setIsFirstOfPair(); 488 right.setIsSecondOfPair(); 489 } 490 else { 491 left.setIsSecondOfPair(); 492 right.setIsFirstOfPair(); 493 } 494 495 left.setIsReverseStrand(leftIsNegative); 496 left.setMateIsReverseStrand(!leftIsNegative); 497 right.setIsReverseStrand(!leftIsNegative); 498 right.setMateIsReverseStrand(leftIsNegative); 499 500 left.setMatePosition(header.getSequence(refIndex).getSequenceName(), right.getStart()); 501 right.setMatePosition(header.getSequence(refIndex).getSequenceName(), left.getStart()); 502 503 int isize = rightStart + readLen - leftStart; 504 left.setFragmentLength(isize); 505 right.setFragmentLength(-isize); 506 507 return Arrays.asList(left, right); 508 } 509 510 /** 511 * Create a collection of identical artificial reads based on the parameters. The cigar string for each 512 * read will be *M, where * is the length of the read. 513 * 514 * Useful for testing things like positional downsampling where you care only about the position and 515 * number of reads, and not the other attributes. 516 * 517 * @param size number of identical reads to create 518 * @param header the SAM header to associate each read with 519 * @param name name associated with each read 520 * @param refIndex the reference index, i.e. what chromosome to associate them with 521 * @param alignmentStart where to start each alignment 522 * @param length the length of each read 523 * 524 * @return a collection of stackSize reads all sharing the above properties 525 */ createIdenticalArtificialReads(final int size, final SAMFileHeader header, final String name, final int refIndex, final int alignmentStart, final int length )526 public static Collection<GATKRead> createIdenticalArtificialReads(final int size, final SAMFileHeader header, final String name, final int refIndex, final int alignmentStart, final int length ) { 527 Utils.validateArg(size >= 0, "size must be non-negative"); 528 final Collection<GATKRead> coll = new ArrayList<>(size); 529 for ( int i = 1; i <= size; i++ ) { 530 coll.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); 531 } 532 return coll; 533 } 534 createRandomRead(SAMFileHeader header, int length)535 public static GATKRead createRandomRead(SAMFileHeader header, int length) { 536 List<CigarElement> cigarElements = new LinkedList<>(); 537 cigarElements.add(new CigarElement(length, CigarOperator.M)); 538 Cigar cigar = new Cigar(cigarElements); 539 return createArtificialRead(header, cigar); 540 } 541 createRandomRead(int length)542 public static GATKRead createRandomRead(int length) { 543 SAMFileHeader header = createArtificialSamHeader(); 544 return createRandomRead(header, length); 545 } 546 createRandomRead(SAMFileHeader header, int length, boolean allowNs)547 public static GATKRead createRandomRead(SAMFileHeader header, int length, boolean allowNs) { 548 byte[] quals = createRandomReadQuals(length); 549 byte[] bbases = createRandomReadBases(length, allowNs); 550 return createArtificialRead(bbases, quals, bbases.length + "M"); 551 } 552 createRandomRead(int length, boolean allowNs)553 public static GATKRead createRandomRead(int length, boolean allowNs) { 554 SAMFileHeader header = createArtificialSamHeader(); 555 return createRandomRead(header, length, allowNs); 556 } 557 558 /** 559 * Create random read qualities 560 * 561 * @param length the length of the read 562 * @return an array with randomized base qualities between 0 and 50 563 */ createRandomReadQuals(int length)564 public static byte[] createRandomReadQuals(int length) { 565 Random random = Utils.getRandomGenerator(); 566 byte[] quals = new byte[length]; 567 for (int i = 0; i < length; i++) 568 quals[i] = (byte) random.nextInt(50); 569 return quals; 570 } 571 572 /** 573 * Create random read qualities 574 * 575 * @param length the length of the read 576 * @param allowNs whether or not to allow N's in the read 577 * @return an array with randomized bases (A-N) with equal probability 578 */ createRandomReadBases(int length, boolean allowNs)579 public static byte[] createRandomReadBases(int length, boolean allowNs) { 580 Random random = Utils.getRandomGenerator(); 581 int numberOfBases = allowNs ? 5 : 4; 582 byte[] bases = new byte[length]; 583 for (int i = 0; i < length; i++) { 584 switch (random.nextInt(numberOfBases)) { 585 case 0: 586 bases[i] = 'A'; 587 break; 588 case 1: 589 bases[i] = 'C'; 590 break; 591 case 2: 592 bases[i] = 'G'; 593 break; 594 case 3: 595 bases[i] = 'T'; 596 break; 597 case 4: 598 bases[i] = 'N'; 599 break; 600 default: 601 throw new GATKException("Something went wrong, this is just impossible"); 602 } 603 } 604 return bases; 605 } 606 /** 607 * create an iterator containing the specified read piles 608 * 609 * @param startingChr the chromosome (reference ID) to start from 610 * @param endingChr the id to end with 611 * @param readCount the number of reads per chromosome 612 * @return iterator representing the specified amount of fake data 613 */ mappedReadIterator(int startingChr, int endingChr, int readCount)614 public static ArtificialReadQueryIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { 615 SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); 616 617 return new ArtificialReadQueryIterator(startingChr, endingChr, readCount, 0, header); 618 } 619 620 /** 621 * create an iterator containing the specified read piles 622 * 623 * @param startingChr the chromosome (reference ID) to start from 624 * @param endingChr the id to end with 625 * @param readCount the number of reads per chromosome 626 * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file 627 * @return iterator representing the specified amount of fake data 628 */ mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount)629 public static ArtificialReadQueryIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { 630 SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); 631 632 return new ArtificialReadQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); 633 } 634 635 /** 636 * Creates an artificial SAM header based on the sequence dictionary dict 637 * 638 * @return a new SAM header 639 */ createArtificialSamHeader(final SAMSequenceDictionary dict)640 public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { 641 SAMFileHeader header = new SAMFileHeader(); 642 header.setSortOrder(SAMFileHeader.SortOrder.coordinate); 643 header.setSequenceDictionary(dict); 644 return header; 645 } 646 647 /** 648 * Create a pileupElement with the given insertion added to the bases. 649 * 650 * Assumes the insertion is prepended with one "reference" base. 651 * 652 * @param offsetIntoRead the offset into the read where the insertion Allele should appear. As a reminder, the 653 * insertion allele should have a single ref base prepend. Must be 0 - (lengthOfRead-1) 654 * @param insertionAllele the allele as you would see in a VCF for the insertion. So, it is prepended with a ref base. Never {@code null} 655 * @param lengthOfRead the length of the artificial read. Does not include any length differences due to the spliced indel. Must be greater than zero. 656 * @return pileupElement with an artificial read containing the insertion. 657 */ createSplicedInsertionPileupElement(int offsetIntoRead, final Allele insertionAllele, final int lengthOfRead)658 public static PileupElement createSplicedInsertionPileupElement(int offsetIntoRead, final Allele insertionAllele, final int lengthOfRead) { 659 660 ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0."); 661 ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + "."); 662 Utils.nonNull(insertionAllele); 663 664 int remainingReadLength = lengthOfRead - ((offsetIntoRead + 1) + (insertionAllele.getBases().length - 1)); 665 String cigarString = (offsetIntoRead + 1) + "M" + (insertionAllele.getBases().length - 1) + "I"; 666 if (remainingReadLength > 0) { 667 cigarString += (remainingReadLength + "M"); 668 } 669 670 final Cigar cigar = TextCigarCodec.decode(cigarString); 671 final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar); 672 final PileupElement pileupElement = PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead); 673 674 // Splice in that insertion. 675 final byte[] bases = gatkRead.getBases(); 676 final int newReadLength = lengthOfRead + insertionAllele.getBases().length - 1; 677 final byte[] destBases = new byte[newReadLength]; 678 final byte[] basesToInsert = ArrayUtils.subarray(insertionAllele.getBases(), 1, insertionAllele.getBases().length); 679 System.arraycopy(bases, 0, destBases, 0, offsetIntoRead); 680 681 // Make sure that the one prepended "reference base" matches the input. 682 destBases[offsetIntoRead] = insertionAllele.getBases()[0]; 683 684 System.arraycopy(basesToInsert, 0, destBases, offsetIntoRead+1, basesToInsert.length); 685 686 if ((offsetIntoRead + 1) < lengthOfRead) { 687 System.arraycopy(bases, offsetIntoRead + 1, destBases, offsetIntoRead + basesToInsert.length + 1, bases.length - 1 - offsetIntoRead); 688 } 689 690 gatkRead.setBases(destBases); 691 692 return pileupElement; 693 } 694 695 /** See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}, except that this method returns a 696 * pileup element containing the specified deletion. 697 * 698 * @param offsetIntoRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement} 699 * @param referenceAllele the reference allele as you would see in a VCF for the deletion. 700 * In other words, it is the deletion prepended with a single ref base. Never {@code null} 701 * @param lengthOfRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement} 702 * @return pileupElement with an artificial read containing the deletion. 703 */ createSplicedDeletionPileupElement(int offsetIntoRead, final Allele referenceAllele, final int lengthOfRead)704 public static PileupElement createSplicedDeletionPileupElement(int offsetIntoRead, final Allele referenceAllele, final int lengthOfRead) { 705 ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0."); 706 ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + "."); 707 Utils.nonNull(referenceAllele); 708 709 // Do not include the prepended "ref" 710 final int numberOfSpecifiedBasesToDelete = referenceAllele.getBases().length - 1; 711 final int numberOfBasesToActuallyDelete = Math.min(numberOfSpecifiedBasesToDelete, lengthOfRead - offsetIntoRead - 1); 712 713 final int newReadLength = lengthOfRead - numberOfBasesToActuallyDelete; 714 715 String cigarString = (offsetIntoRead + 1) + "M"; 716 717 if (numberOfBasesToActuallyDelete > 0) { 718 cigarString += numberOfBasesToActuallyDelete + "D"; 719 } 720 final int remainingBases = lengthOfRead - (offsetIntoRead + 1) - numberOfBasesToActuallyDelete; 721 if (remainingBases > 0) { 722 cigarString += remainingBases + "M"; 723 } 724 725 final Cigar cigar = TextCigarCodec.decode(cigarString); 726 final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar); 727 final PileupElement pileupElement = PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead); 728 729 // The Cigar string has basically already told the initial generation of a read to delete bases. 730 final byte[] bases = gatkRead.getBases(); 731 732 // Make sure that the one prepended "reference base" matches the input. 733 bases[offsetIntoRead] = referenceAllele.getBases()[0]; 734 735 gatkRead.setBases(bases); 736 737 return pileupElement; 738 } 739 740 /** 741 * See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}, except that this method returns a 742 * pileup element containing base-by-base replacement. As a result, the length of the read will not change. 743 * 744 * @param offsetIntoRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement} 745 * @param newAllele The new bases that should be in the read at the specified position. If this allele causes the 746 * replacement to extend beyond the end of the read 747 * (i.e. offsetIntoRead + length(newAllele) is greater than length of read), 748 * the replacement will be truncated. 749 * @param lengthOfRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement} 750 * @return pileupElement with an artificial read containing the new bases specified by te given allele. 751 */ createNonIndelPileupElement(final int offsetIntoRead, final Allele newAllele, final int lengthOfRead)752 public static PileupElement createNonIndelPileupElement(final int offsetIntoRead, final Allele newAllele, final int lengthOfRead) { 753 ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0."); 754 ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + "."); 755 Utils.nonNull(newAllele); 756 757 final String cigarString = lengthOfRead + "M"; 758 759 final Cigar cigar = TextCigarCodec.decode(cigarString); 760 final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar); 761 final byte[] newBases = gatkRead.getBases(); 762 final int upperBound = Math.min(offsetIntoRead + newAllele.getBases().length, lengthOfRead); 763 for (int i = offsetIntoRead; i < upperBound; i++) { 764 newBases[i] = newAllele.getBases()[i - offsetIntoRead]; 765 } 766 gatkRead.setBases(newBases); 767 return PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead); 768 } 769 } 770