1 package org.broadinstitute.hellbender.utils.read;
2 
3 import htsjdk.samtools.*;
4 import htsjdk.samtools.util.SequenceUtil;
5 import htsjdk.variant.variantcontext.Allele;
6 import org.apache.commons.lang3.ArrayUtils;
7 import org.broadinstitute.hellbender.exceptions.GATKException;
8 import org.broadinstitute.hellbender.utils.Utils;
9 import org.broadinstitute.hellbender.utils.param.ParamUtils;
10 import org.broadinstitute.hellbender.utils.pileup.PileupElement;
11 
12 import java.util.*;
13 
14 public final class ArtificialReadUtils {
15     public static final int DEFAULT_READ_LENGTH = 50;
16 
17     private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup";
18     private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane";
19     private static final String DEFAULT_PLATFORM_PREFIX = "Platform";
20     private static final String DEFAULT_SAMPLE_NAME = "SampleX";
21     private static final String DEFAULT_PROGRAM_NAME = "Program";
22     public static final String READ_GROUP_ID = "x";
23 
24     /**
25      * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc
26      *
27      * @param numberOfChromosomes the number of chromosomes to create
28      * @param startingChromosome  the starting number for the chromosome (most likely set to 1)
29      * @param chromosomeSize      the length of each chromosome
30      * @return
31      */
createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize)32     public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) {
33         SAMFileHeader header = new SAMFileHeader();
34         header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
35         SAMSequenceDictionary dict = new SAMSequenceDictionary();
36         // make up some sequence records
37         for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
38             SAMSequenceRecord rec = new SAMSequenceRecord( Integer.toString(x), chromosomeSize /* size */);
39             rec.setSequenceLength(chromosomeSize);
40             dict.addSequence(rec);
41         }
42         header.setSequenceDictionary(dict);
43         return header;
44     }
45 
46     /**
47      * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc
48      * It also adds read groups.
49      *
50      * @param numberOfChromosomes the number of chromosomes to create
51      * @param startingChromosome  the starting number for the chromosome (most likely set to 1)
52      * @param chromosomeSize      the length of each chromosome
53      * @param groupCount          the number of groups to make
54      */
createArtificialSamHeaderWithGroups(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int groupCount)55     public static SAMFileHeader createArtificialSamHeaderWithGroups(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int groupCount) {
56         final SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
57 
58         final List<SAMReadGroupRecord> readGroups = new ArrayList<>();
59         for (int i = 0; i < groupCount; i++) {
60             SAMReadGroupRecord rec = new SAMReadGroupRecord(DEFAULT_READ_GROUP_PREFIX + i);
61             rec.setSample(DEFAULT_SAMPLE_NAME);
62             readGroups.add(rec);
63         }
64         header.setReadGroups(readGroups);
65 
66         for (int i = 0; i < groupCount; i++) {
67             final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroups.get(i).getId());
68             groupRecord.setPlatform(DEFAULT_PLATFORM_PREFIX + ((i % 2)+1));
69             groupRecord.setPlatformUnit(DEFAULT_PLATFORM_UNIT_PREFIX + ((i % 3)+1));
70         }
71         return header;
72     }
73 
74     /**
75      * Creates an artificial SAM header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc
76      * It also adds program records.
77      *
78      * @param numberOfChromosomes   the number of chromosomes to create
79      * @param startingChromosome    the starting number for the chromosome (most likely set to 1)
80      * @param chromosomeSize        the length of each chromosome
81      * @param programCount          the number of programs to make
82      * @return
83      */
createArtificialSamHeaderWithPrograms(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int programCount)84     public static SAMFileHeader createArtificialSamHeaderWithPrograms(int numberOfChromosomes, int startingChromosome, int chromosomeSize, int programCount) {
85         final SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
86 
87         final List<SAMProgramRecord> programRecords = new ArrayList<>();
88         for (int i = 0; i < programCount; i++) {
89             final SAMProgramRecord rec  = new SAMProgramRecord(Integer.toString(i));
90             rec.setCommandLine("run " + Integer.toString(i));
91             rec.setProgramVersion("1.0");
92             if (i > 0) {
93                 rec.setPreviousProgramGroupId(Integer.toString(i-1));
94             }
95             rec.setProgramName(DEFAULT_PROGRAM_NAME + i);
96             programRecords.add(rec);
97         }
98         header.setProgramRecords(programRecords);
99 
100         return header;
101     }
102 
103     /**
104      * Creates an artificial SAM header with standard test parameters
105      *
106      * @return the SAM header
107      */
createArtificialSamHeader()108     public static SAMFileHeader createArtificialSamHeader() {
109         return createArtificialSamHeader(22, 1, 1000000);
110     }
111 
112     /**
113      * Creates an artificial SAM header with standard test parameters and a Read Group
114      *
115      * @param readGroup read group
116      * @return the SAM header
117      */
createArtificialSamHeaderWithReadGroup( final SAMReadGroupRecord readGroup )118     public static SAMFileHeader createArtificialSamHeaderWithReadGroup( final SAMReadGroupRecord readGroup ) {
119         final SAMFileHeader header = createArtificialSamHeader();
120         header.addReadGroup(readGroup);
121         return header;
122     }
123 
124     /**
125      * Create an artificial GATKRead based on the parameters.  The cigar string will be *M, where * is the length of the read
126      *
127      * @param header         the SAM header to associate the read with
128      * @param name           the name of the read
129      * @param refIndex       the reference index, i.e. what chromosome to associate it with
130      * @param alignmentStart where to start the alignment
131      * @param length         the length of the read
132      * @return the artificial GATKRead
133      */
createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length)134     public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
135         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, length));
136     }
137 
138     /**
139      * Create an artificial GATKRead based on the parameters.  The cigar string will be *M, where * is the length of the read
140      *
141      * @param header         the SAM header to associate the read with
142      * @param name           the name of the read
143      * @param refIndex       the reference index, i.e. what chromosome to associate it with
144      * @param alignmentStart where to start the alignment
145      * @param bases          the sequence of the read
146      * @param qual           the qualities of the read
147      * @return the artificial GATKRead
148      */
createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual)149     public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) {
150         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual));
151     }
152 
153     /**
154      * Create an artificial GATKRead based on the parameters.  The cigar string will be *M, where * is the length of the read
155      *
156      * @param header         the SAM header to associate the read with
157      * @param name           the name of the read
158      * @param contig         the contig to which the read is aligned
159      * @param alignmentStart where to start the alignment
160      * @param bases          the sequence of the read
161      * @param qual           the qualities of the read
162      * @return the artificial GATKRead
163      */
createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual)164     public static GATKRead createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual) {
165         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, header.getSequenceIndex(contig), alignmentStart, bases, qual));
166     }
167 
168     /**
169      * Create an artificial GATKRead based on the parameters
170      *
171      * @param header         the SAM header to associate the read with
172      * @param name           the name of the read
173      * @param refIndex       the reference index, i.e. what chromosome to associate it with
174      * @param alignmentStart where to start the alignment
175      * @param bases          the sequence of the read
176      * @param qual           the qualities of the read
177      * @param cigar          the cigar string of the read
178      * @return the artificial GATKRead
179      */
createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar)180     public static GATKRead createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) {
181         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual, cigar));
182     }
183 
184 
185     /**
186      * Create an artificial GATKRead based on the parameters
187      *
188      * @param header         the SAM header to associate the read with
189      * @param name           the name of the read
190      * @param contig         the contig to which the read is aligned
191      * @param alignmentStart where to start the alignment
192      * @param bases          the sequence of the read
193      * @param qual           the qualities of the read
194      * @param cigar          the cigar string of the read
195      * @return the artificial GATKRead
196      */
createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual, String cigar)197     public static GATKRead createArtificialRead(SAMFileHeader header, String name, String contig, int alignmentStart, byte[] bases, byte[] qual, String cigar) {
198         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, name, header.getSequenceIndex(contig), alignmentStart, bases, qual, cigar));
199     }
200 
201     /**
202      * Create an artificial GATKRead with the following default parameters :
203      * header:
204      * numberOfChromosomes = 1
205      * startingChromosome = 1
206      * chromosomeSize = 1000000
207      * read:
208      * name = "default_read"
209      * refIndex = 0
210      * alignmentStart = 10000
211      *
212      * @param header SAM header for the read
213      * @param bases the sequence of the read
214      * @param qual  the qualities of the read
215      * @param cigar the cigar string of the read
216      * @return the artificial GATKRead
217      */
createArtificialRead(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar)218     public static GATKRead createArtificialRead(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar) {
219         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar));
220     }
221 
createArtificialRead(final byte[] bases, final byte[] qual, final String cigar)222     public static GATKRead createArtificialRead(final byte[] bases, final byte[] qual, final String cigar) {
223         SAMFileHeader header = createArtificialSamHeader();
224         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar));
225     }
226 
createArtificialRead(final SAMFileHeader header, final String cigarString)227     public static GATKRead createArtificialRead(final SAMFileHeader header, final String cigarString) {
228         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, TextCigarCodec.decode(cigarString)));
229     }
230 
createArtificialRead(final SAMFileHeader header, final Cigar cigar)231     public static GATKRead createArtificialRead(final SAMFileHeader header, final Cigar cigar) {
232         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(header, cigar));
233     }
234 
createArtificialRead(final Cigar cigar)235     public static GATKRead createArtificialRead(final Cigar cigar) {
236         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(cigar));
237     }
238 
239     /**
240      * Makes a new read with a name that is unique (so that it will return false to equals(otherRead)
241      */
createUniqueArtificialRead(final Cigar cigar)242     public static GATKRead createUniqueArtificialRead(final Cigar cigar) {
243         return new SAMRecordToGATKReadAdapter(createUniqueArtificialSAMRecord(cigar));
244     }
245 
createArtificialRead(final Cigar cigar, final String name)246     public static GATKRead createArtificialRead(final Cigar cigar, final String name) {
247         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(createArtificialSamHeader(), cigar, name));
248     }
249 
createArtificialRead(final String cigarString)250     public static GATKRead createArtificialRead(final String cigarString) {
251         return new SAMRecordToGATKReadAdapter(createArtificialSAMRecord(TextCigarCodec.decode(cigarString)));
252     }
253 
createArtificialUnmappedRead(final SAMFileHeader header, final byte[] bases, final byte[] qual)254     public static GATKRead createArtificialUnmappedRead(final SAMFileHeader header, final byte[] bases, final byte[] qual) {
255         final SAMRecord read = new SAMRecord(header);
256         read.setReadUnmappedFlag(true);
257         read.setReadBases(bases);
258         read.setBaseQualities(qual);
259 
260         return new SAMRecordToGATKReadAdapter(read);
261     }
262 
createArtificialUnmappedReadWithAssignedPosition(final SAMFileHeader header, final String contig, final int alignmentStart, final byte[] bases, final byte[] qual)263     public static GATKRead createArtificialUnmappedReadWithAssignedPosition(final SAMFileHeader header, final String contig, final int alignmentStart, final byte[] bases, final byte[] qual) {
264         final SAMRecord read = new SAMRecord(header);
265         read.setReferenceName(contig);
266         read.setAlignmentStart(alignmentStart);
267         read.setReadUnmappedFlag(true);
268         read.setReadBases(bases);
269         read.setBaseQualities(qual);
270 
271         return new SAMRecordToGATKReadAdapter(read);
272     }
273 
274     /**
275      * Makes a new read with a name that is unique (so that it will return false to equals(otherRead)
276      */
createUniqueArtificialRead(final String cigarString)277     public static GATKRead createUniqueArtificialRead(final String cigarString) {
278         return new SAMRecordToGATKReadAdapter(createUniqueArtificialSAMRecord(TextCigarCodec.decode(cigarString)));
279     }
280 
281     /**
282      * Creates an artificial GATKRead backed by a SAMRecord.
283      *
284      * The read will consist of the specified number of Q30 'A' bases, and will be
285      * mapped to contig "1" at the specified start position.
286      *
287      * @param name name of the new read
288      * @param start start position of the new read
289      * @param length number of bases in the new read
290      * @return an artificial GATKRead backed by a SAMRecord.
291      */
createSamBackedRead( final String name, final int start, final int length )292     public static GATKRead createSamBackedRead( final String name, final int start, final int length ) {
293         return createSamBackedRead(name, "1", start, length);
294     }
295 
296     /**
297      * Creates an artificial GATKRead backed by a SAMRecord.
298      *
299      * The read will consist of the specified number of Q30 'A' bases, and will be
300      * mapped to the specified contig at the specified start position.
301      *
302      * @param name name of the new read
303      * @param contig contig the new read is mapped to
304      * @param start start position of the new read
305      * @param length number of bases in the new read
306      * @return an artificial GATKRead backed by a SAMRecord.
307      */
createSamBackedRead( final String name, final String contig, final int start, final int length )308     public static GATKRead createSamBackedRead( final String name, final String contig, final int start, final int length ) {
309         final SAMFileHeader header = createArtificialSamHeader();
310         final byte[] bases = Utils.dupBytes((byte)'A', length);
311         final byte[] quals = Utils.dupBytes((byte) 30, length);
312 
313         final SAMRecord sam = createArtificialSAMRecord(header, bases, quals, length + "M");
314         sam.setReadName(name);
315         sam.setReferenceName(contig);
316         sam.setAlignmentStart(start);
317         return new SAMRecordToGATKReadAdapter(sam);
318     }
319 
320     /**
321      * Creates an artificial GATKRead backed by a SAMRecord with no header.
322      *
323      * The read will consist of the specified number of Q30 'A' bases, and will be
324      * mapped to the specified contig at the specified start position.
325      *
326      * @param name name of the new read
327      * @param contig contig the new read is mapped to
328      * @param start start position of the new read
329      * @param length number of bases in the new read
330      * @return an artificial GATKRead backed by a SAMRecord.
331      */
createHeaderlessSamBackedRead( final String name, final String contig, final int start, final int length )332     public static GATKRead createHeaderlessSamBackedRead( final String name, final String contig, final int start, final int length ) {
333         GATKRead read = createSamBackedRead(name, contig, start, length);
334         ((SAMRecordToGATKReadAdapter) read).getEncapsulatedSamRecord().setHeaderStrict(null);
335         return read;
336     }
337 
338     /**
339      * Create an artificial SAMRecord based on the parameters.  The cigar string will be *M, where * is the length of the read
340      *
341      * @param header         the SAM header to associate the read with
342      * @param name           the name of the read
343      * @param refIndex       the reference index, i.e. what chromosome to associate it with
344      * @param alignmentStart where to start the alignment
345      * @param length         the length of the read
346      * @return the artificial SAMRecord
347      */
createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length)348     public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) {
349         if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) ||
350                 (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START))
351             throw new IllegalArgumentException("Invalid alignment start for artificial read, start = " + alignmentStart);
352         SAMRecord record = new SAMRecord(header);
353         record.setReadName(name);
354         record.setReferenceIndex(refIndex);
355         record.setAlignmentStart(alignmentStart);
356         List<CigarElement> elements = new ArrayList<>();
357         elements.add(new CigarElement(length, CigarOperator.characterToEnum('M')));
358         record.setCigar(new Cigar(elements));
359         record.setProperPairFlag(false);
360 
361         // our reads and quals are all 'A's by default
362         byte[] c = new byte[length];
363         byte[] q = new byte[length];
364         for (int x = 0; x < length; x++)
365             c[x] = q[x] = 'A';
366         record.setReadBases(c);
367         record.setBaseQualities(q);
368 
369         if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
370             record.setReadUnmappedFlag(true);
371         }
372 
373         return record;
374     }
375 
376     /**
377      * Create an artificial SAMRecord based on the parameters.  The cigar string will be *M, where * is the length of the read
378      *
379      * @param header         the SAM header to associate the read with
380      * @param name           the name of the read
381      * @param refIndex       the reference index, i.e. what chromosome to associate it with
382      * @param alignmentStart where to start the alignment
383      * @param bases          the sequence of the read
384      * @param qual           the qualities of the read
385      * @return the artificial SAMRecord
386      */
createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual)387     public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) {
388         if (bases.length != qual.length) {
389             throw new IllegalArgumentException("Passed in read string is different length then the quality array");
390         }
391         SAMRecord rec = createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases.length);
392         rec.setReadBases(Arrays.copyOf(bases, bases.length));
393         rec.setBaseQualities(Arrays.copyOf(qual, qual.length));
394         rec.setAttribute(SAMTag.RG.name(), new SAMReadGroupRecord(READ_GROUP_ID).getId());
395         if (refIndex == -1) {
396             rec.setReadUnmappedFlag(true);
397         }
398 
399         return rec;
400     }
401 
402     /**
403      * Create an artificial SAMRecord based on the parameters
404      *
405      * @param header         the SAM header to associate the read with
406      * @param name           the name of the read
407      * @param refIndex       the reference index, i.e. what chromosome to associate it with
408      * @param alignmentStart where to start the alignment
409      * @param bases          the sequence of the read
410      * @param qual           the qualities of the read
411      * @param cigar          the cigar string of the read
412      * @return the artificial SAMRecord
413      */
createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar)414     public static SAMRecord createArtificialSAMRecord(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) {
415         SAMRecord rec = createArtificialSAMRecord(header, name, refIndex, alignmentStart, bases, qual);
416         rec.setCigarString(cigar);
417         return rec;
418     }
419 
420     /**
421      * Create an artificial SAMRecord with the following default parameters :
422      * header:
423      * numberOfChromosomes = 1
424      * startingChromosome = 1
425      * chromosomeSize = 1000000
426      * read:
427      * name = "default_read"
428      * refIndex = 0
429      * alignmentStart = 10000
430      *
431      * @param header SAM header for the read
432      * @param bases the sequence of the read
433      * @param qual  the qualities of the read
434      * @param cigar the cigar string of the read
435      * @return the artificial SAMRecord
436      */
createArtificialSAMRecord(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar)437     public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final byte[] bases, final byte[] qual, final String cigar) {
438         return createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar);
439     }
440 
createArtificialSAMRecord(final byte[] bases, final byte[] qual, final String cigar)441     public static SAMRecord createArtificialSAMRecord(final byte[] bases, final byte[] qual, final String cigar) {
442         final SAMFileHeader header = createArtificialSamHeader();
443         return createArtificialSAMRecord(header, "default_read", 0, 10000, bases, qual, cigar);
444     }
445 
createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar, final String name)446     public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar, final String name) {
447         final int length = cigar.getReadLength();
448         final byte base = 'A';
449         final byte qual = 30;
450         final byte [] bases = Utils.dupBytes(base, length);
451         final byte [] quals = Utils.dupBytes(qual, length);
452         return createArtificialSAMRecord(header, name, 0, 10000, bases, quals, cigar.toString());
453     }
454 
createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar)455     public static SAMRecord createArtificialSAMRecord(final SAMFileHeader header, final Cigar cigar) {
456         return createArtificialSAMRecord(header, cigar, "default_read");
457     }
458 
createArtificialSAMRecord(final Cigar cigar)459     public static SAMRecord createArtificialSAMRecord(final Cigar cigar) {
460         final SAMFileHeader header = createArtificialSamHeader();
461         return createArtificialSAMRecord(header, cigar, "default_read");
462     }
463 
464     /**
465      * Makes a new read with a name that is unique (so that it will return false to equals(otherRead)
466      */
createUniqueArtificialSAMRecord(final Cigar cigar)467     public static SAMRecord createUniqueArtificialSAMRecord(final Cigar cigar) {
468         final SAMFileHeader header = createArtificialSamHeader();
469         return createArtificialSAMRecord(header, cigar, UUID.randomUUID().toString());
470     }
471 
createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative)472     public static List<GATKRead> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
473         return createPair(header, name, readLen, 0, leftStart, rightStart, leftIsFirst, leftIsNegative);
474     }
475 
createPair(SAMFileHeader header, String name, int readLen, int refIndex, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative)476     public static List<GATKRead> createPair(SAMFileHeader header, String name, int readLen, int refIndex, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
477         GATKRead left = createArtificialRead(header, name, refIndex, leftStart, readLen);
478         GATKRead right = createArtificialRead(header, name, refIndex, rightStart, readLen);
479 
480         left.setIsPaired(true);
481         right.setIsPaired(true);
482 
483         left.setIsProperlyPaired(true);
484         right.setIsProperlyPaired(true);
485 
486         if ( leftIsFirst ) {
487             left.setIsFirstOfPair();
488             right.setIsSecondOfPair();
489         }
490         else {
491             left.setIsSecondOfPair();
492             right.setIsFirstOfPair();
493         }
494 
495         left.setIsReverseStrand(leftIsNegative);
496         left.setMateIsReverseStrand(!leftIsNegative);
497         right.setIsReverseStrand(!leftIsNegative);
498         right.setMateIsReverseStrand(leftIsNegative);
499 
500         left.setMatePosition(header.getSequence(refIndex).getSequenceName(), right.getStart());
501         right.setMatePosition(header.getSequence(refIndex).getSequenceName(), left.getStart());
502 
503         int isize = rightStart + readLen - leftStart;
504         left.setFragmentLength(isize);
505         right.setFragmentLength(-isize);
506 
507         return Arrays.asList(left, right);
508     }
509 
510     /**
511      * Create a collection of identical artificial reads based on the parameters.  The cigar string for each
512      * read will be *M, where * is the length of the read.
513      *
514      * Useful for testing things like positional downsampling where you care only about the position and
515      * number of reads, and not the other attributes.
516      *
517      * @param size           number of identical reads to create
518      * @param header         the SAM header to associate each read with
519      * @param name           name associated with each read
520      * @param refIndex       the reference index, i.e. what chromosome to associate them with
521      * @param alignmentStart where to start each alignment
522      * @param length         the length of each read
523      *
524      * @return a collection of stackSize reads all sharing the above properties
525      */
createIdenticalArtificialReads(final int size, final SAMFileHeader header, final String name, final int refIndex, final int alignmentStart, final int length )526     public static Collection<GATKRead> createIdenticalArtificialReads(final int size, final SAMFileHeader header, final String name, final int refIndex, final int alignmentStart, final int length ) {
527         Utils.validateArg(size >= 0, "size must be non-negative");
528         final Collection<GATKRead> coll = new ArrayList<>(size);
529         for ( int i = 1; i <= size; i++ ) {
530             coll.add(createArtificialRead(header, name, refIndex, alignmentStart, length));
531         }
532         return coll;
533     }
534 
createRandomRead(SAMFileHeader header, int length)535     public static GATKRead createRandomRead(SAMFileHeader header, int length) {
536         List<CigarElement> cigarElements = new LinkedList<>();
537         cigarElements.add(new CigarElement(length, CigarOperator.M));
538         Cigar cigar = new Cigar(cigarElements);
539         return createArtificialRead(header, cigar);
540     }
541 
createRandomRead(int length)542     public static GATKRead createRandomRead(int length) {
543         SAMFileHeader header = createArtificialSamHeader();
544         return createRandomRead(header, length);
545     }
546 
createRandomRead(SAMFileHeader header, int length, boolean allowNs)547     public static GATKRead createRandomRead(SAMFileHeader header, int length, boolean allowNs) {
548         byte[] quals = createRandomReadQuals(length);
549         byte[] bbases = createRandomReadBases(length, allowNs);
550         return createArtificialRead(bbases, quals, bbases.length + "M");
551     }
552 
createRandomRead(int length, boolean allowNs)553     public static GATKRead createRandomRead(int length, boolean allowNs) {
554         SAMFileHeader header = createArtificialSamHeader();
555         return createRandomRead(header, length, allowNs);
556     }
557 
558     /**
559      * Create random read qualities
560      *
561      * @param length the length of the read
562      * @return an array with randomized base qualities between 0 and 50
563      */
createRandomReadQuals(int length)564     public static byte[] createRandomReadQuals(int length) {
565         Random random = Utils.getRandomGenerator();
566         byte[] quals = new byte[length];
567         for (int i = 0; i < length; i++)
568             quals[i] = (byte) random.nextInt(50);
569         return quals;
570     }
571 
572     /**
573      * Create random read qualities
574      *
575      * @param length  the length of the read
576      * @param allowNs whether or not to allow N's in the read
577      * @return an array with randomized bases (A-N) with equal probability
578      */
createRandomReadBases(int length, boolean allowNs)579     public static byte[] createRandomReadBases(int length, boolean allowNs) {
580         Random random = Utils.getRandomGenerator();
581         int numberOfBases = allowNs ? 5 : 4;
582         byte[] bases = new byte[length];
583         for (int i = 0; i < length; i++) {
584             switch (random.nextInt(numberOfBases)) {
585                 case 0:
586                     bases[i] = 'A';
587                     break;
588                 case 1:
589                     bases[i] = 'C';
590                     break;
591                 case 2:
592                     bases[i] = 'G';
593                     break;
594                 case 3:
595                     bases[i] = 'T';
596                     break;
597                 case 4:
598                     bases[i] = 'N';
599                     break;
600                 default:
601                     throw new GATKException("Something went wrong, this is just impossible");
602             }
603         }
604         return bases;
605     }
606     /**
607      * create an iterator containing the specified read piles
608      *
609      * @param startingChr the chromosome (reference ID) to start from
610      * @param endingChr   the id to end with
611      * @param readCount   the number of reads per chromosome
612      * @return iterator representing the specified amount of fake data
613      */
mappedReadIterator(int startingChr, int endingChr, int readCount)614     public static ArtificialReadQueryIterator mappedReadIterator(int startingChr, int endingChr, int readCount) {
615         SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
616 
617         return new ArtificialReadQueryIterator(startingChr, endingChr, readCount, 0, header);
618     }
619 
620     /**
621      * create an iterator containing the specified read piles
622      *
623      * @param startingChr       the chromosome (reference ID) to start from
624      * @param endingChr         the id to end with
625      * @param readCount         the number of reads per chromosome
626      * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
627      * @return iterator representing the specified amount of fake data
628      */
mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount)629     public static ArtificialReadQueryIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) {
630         SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
631 
632         return new ArtificialReadQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
633     }
634 
635     /**
636      * Creates an artificial SAM header based on the sequence dictionary dict
637      *
638      * @return a new SAM header
639      */
createArtificialSamHeader(final SAMSequenceDictionary dict)640     public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) {
641         SAMFileHeader header = new SAMFileHeader();
642         header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
643         header.setSequenceDictionary(dict);
644         return header;
645     }
646 
647     /**
648      * Create a pileupElement with the given insertion added to the bases.
649      *
650      * Assumes the insertion is prepended with one "reference" base.
651      *
652      * @param offsetIntoRead the offset into the read where the insertion Allele should appear.  As a reminder, the
653      *                       insertion allele should have a single ref base prepend.  Must be 0 - (lengthOfRead-1)
654      * @param insertionAllele the allele as you would see in a VCF for the insertion.  So, it is prepended with a ref base.  Never {@code null}
655      * @param lengthOfRead the length of the artificial read.  Does not include any length differences due to the spliced indel.  Must be greater than zero.
656      * @return pileupElement with an artificial read containing the insertion.
657      */
createSplicedInsertionPileupElement(int offsetIntoRead, final Allele insertionAllele, final int lengthOfRead)658     public static PileupElement createSplicedInsertionPileupElement(int offsetIntoRead, final Allele insertionAllele, final int lengthOfRead) {
659 
660         ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0.");
661         ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + ".");
662         Utils.nonNull(insertionAllele);
663 
664         int remainingReadLength = lengthOfRead - ((offsetIntoRead + 1) + (insertionAllele.getBases().length - 1));
665         String cigarString = (offsetIntoRead + 1) + "M" + (insertionAllele.getBases().length - 1) + "I";
666         if (remainingReadLength > 0) {
667             cigarString += (remainingReadLength + "M");
668         }
669 
670         final Cigar cigar = TextCigarCodec.decode(cigarString);
671         final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar);
672         final PileupElement pileupElement = PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead);
673 
674         // Splice in that insertion.
675         final byte[] bases = gatkRead.getBases();
676         final int newReadLength = lengthOfRead + insertionAllele.getBases().length - 1;
677         final byte[] destBases = new byte[newReadLength];
678         final byte[] basesToInsert = ArrayUtils.subarray(insertionAllele.getBases(), 1, insertionAllele.getBases().length);
679         System.arraycopy(bases, 0, destBases, 0, offsetIntoRead);
680 
681         // Make sure that the one prepended "reference base" matches the input.
682         destBases[offsetIntoRead] = insertionAllele.getBases()[0];
683 
684         System.arraycopy(basesToInsert, 0, destBases, offsetIntoRead+1, basesToInsert.length);
685 
686         if ((offsetIntoRead + 1) < lengthOfRead) {
687             System.arraycopy(bases, offsetIntoRead + 1, destBases, offsetIntoRead + basesToInsert.length + 1, bases.length - 1 - offsetIntoRead);
688         }
689 
690         gatkRead.setBases(destBases);
691 
692         return pileupElement;
693     }
694 
695     /** See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}, except that this method returns a
696      * pileup element containing the specified deletion.
697      *
698      * @param offsetIntoRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}
699      * @param referenceAllele  the reference allele as you would see in a VCF for the deletion.
700      *                         In other words, it is the deletion prepended with a single ref base.  Never {@code null}
701      * @param lengthOfRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}
702      * @return pileupElement with an artificial read containing the deletion.
703      */
createSplicedDeletionPileupElement(int offsetIntoRead, final Allele referenceAllele, final int lengthOfRead)704     public static PileupElement createSplicedDeletionPileupElement(int offsetIntoRead, final Allele referenceAllele, final int lengthOfRead) {
705         ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0.");
706         ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + ".");
707         Utils.nonNull(referenceAllele);
708 
709         // Do not include the prepended "ref"
710         final int numberOfSpecifiedBasesToDelete = referenceAllele.getBases().length - 1;
711         final int numberOfBasesToActuallyDelete = Math.min(numberOfSpecifiedBasesToDelete, lengthOfRead - offsetIntoRead - 1);
712 
713         final int newReadLength = lengthOfRead - numberOfBasesToActuallyDelete;
714 
715         String cigarString = (offsetIntoRead + 1) + "M";
716 
717         if (numberOfBasesToActuallyDelete > 0) {
718             cigarString += numberOfBasesToActuallyDelete + "D";
719         }
720         final int remainingBases = lengthOfRead - (offsetIntoRead + 1) - numberOfBasesToActuallyDelete;
721         if (remainingBases > 0) {
722             cigarString += remainingBases + "M";
723         }
724 
725         final Cigar cigar = TextCigarCodec.decode(cigarString);
726         final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar);
727         final PileupElement pileupElement = PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead);
728 
729         // The Cigar string has basically already told the initial generation of a read to delete bases.
730         final byte[] bases = gatkRead.getBases();
731 
732         // Make sure that the one prepended "reference base" matches the input.
733         bases[offsetIntoRead] = referenceAllele.getBases()[0];
734 
735         gatkRead.setBases(bases);
736 
737         return pileupElement;
738     }
739 
740     /**
741      * See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}, except that this method returns a
742      * pileup element containing base-by-base replacement.  As a result, the length of the read will not change.
743      *
744      * @param offsetIntoRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}
745      * @param newAllele The new bases that should be in the read at the specified position.  If this allele causes the
746      *                  replacement to extend beyond the end of the read
747      *                  (i.e. offsetIntoRead + length(newAllele) is greater than length of read),
748      *                  the replacement will be truncated.
749      * @param lengthOfRead See {@link ArtificialReadUtils#createSplicedInsertionPileupElement}
750      * @return pileupElement with an artificial read containing the new bases specified by te given allele.
751      */
createNonIndelPileupElement(final int offsetIntoRead, final Allele newAllele, final int lengthOfRead)752     public static PileupElement createNonIndelPileupElement(final int offsetIntoRead, final Allele newAllele, final int lengthOfRead) {
753         ParamUtils.isPositive(lengthOfRead, "length of read is invalid for creating an artificial read, must be greater than 0.");
754         ParamUtils.inRange(offsetIntoRead, 0, lengthOfRead-1, "offset into read is invalid for creating an artificial read, must be 0-" + (lengthOfRead-1) + ".");
755         Utils.nonNull(newAllele);
756 
757         final String cigarString = lengthOfRead + "M";
758 
759         final Cigar cigar = TextCigarCodec.decode(cigarString);
760         final GATKRead gatkRead = ArtificialReadUtils.createArtificialRead(cigar);
761         final byte[] newBases = gatkRead.getBases();
762         final int upperBound = Math.min(offsetIntoRead + newAllele.getBases().length, lengthOfRead);
763         for (int i = offsetIntoRead; i < upperBound; i++) {
764             newBases[i] = newAllele.getBases()[i - offsetIntoRead];
765         }
766         gatkRead.setBases(newBases);
767         return PileupElement.createPileupForReadAndOffset(gatkRead, offsetIntoRead);
768     }
769 }
770