1 package org.broadinstitute.hellbender.tools.spark;
2 
3 import htsjdk.samtools.*;
4 import htsjdk.samtools.BAMSBIIndexer;
5 import htsjdk.samtools.seekablestream.SeekableFileStream;
6 import htsjdk.samtools.seekablestream.SeekableStream;
7 import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
8 import htsjdk.samtools.util.FileExtensions;
9 import org.apache.logging.log4j.LogManager;
10 import org.apache.logging.log4j.Logger;
11 import org.broadinstitute.barclay.argparser.Argument;
12 import org.broadinstitute.barclay.argparser.BetaFeature;
13 import org.broadinstitute.barclay.argparser.CommandLineException;
14 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
15 import org.broadinstitute.barclay.help.DocumentedFeature;
16 import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
17 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
18 import org.broadinstitute.hellbender.exceptions.UserException;
19 import org.broadinstitute.hellbender.utils.io.IOUtils;
20 import org.broadinstitute.hellbender.utils.read.ReadConstants;
21 import org.codehaus.plexus.util.FileUtils;
22 import picard.cmdline.programgroups.OtherProgramGroup;
23 
24 import java.io.*;
25 
26 /**
27  * Create a Hadoop BAM splitting index and optionally a BAM index from a BAM file.
28  *
29  * <h3>Input</h3>
30  *
31  * <ul>
32  *     <li>A BAM file</li>
33  * </ul>
34  *
35  * <h3>Output</h3>
36  *
37  * <ul>
38  *     <li>BAM splitting index file</li>
39  *     <li>BAM bai index (optional)</li>
40  * </ul>
41  *
42  * <h3>Usage example</h3>
43  * <pre>
44  *     gatk CreateHadoopBamSplittingIndex \
45  *         -I input_reads.bam \
46  *         -O input_reads.bam.sbi
47  * </pre>
48  * or if one wants to generate bai as well
49  * <pre>
50  *     gatk CreateHadoopBamSplittingIndex \
51  *         -I input_reads.bam \
52  *         -O input_reads.bam.sbi \
53  *         --create-bai
54  * </pre>
55  */
56 @CommandLineProgramProperties(summary = "Create a Hadoop BAM splitting index and optionally a BAM index from a BAM file",
57         oneLineSummary = "Create a Hadoop BAM splitting index" ,
58         programGroup = OtherProgramGroup.class)
59 @DocumentedFeature
60 @BetaFeature
61 public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
62     private static final Logger logger = LogManager.getLogger(CreateHadoopBamSplittingIndex.class);
63 
64     public static final String SPLITTING_INDEX_GRANULARITY_LONG_NAME = "splitting-index-granularity";
65     public static final String CREATE_BAI_LONG_NAME = "create-bai";
66 
67     @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
68              shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
69              doc = "BAM file to create a HadoopBam splitting index for",
70              optional = false )
71     public File inputBam;
72 
73     @Argument(fullName = StandardArgumentDefinitions.READ_VALIDATION_STRINGENCY_LONG_NAME,
74             shortName = StandardArgumentDefinitions.READ_VALIDATION_STRINGENCY_SHORT_NAME,
75             doc = "Validation stringency for all SAM/BAM/CRAM/SRA files read by this program.  The default stringency value SILENT " +
76                     "can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) " +
77                     "do not otherwise need to be decoded.",
78             common=true,
79             optional=true)
80     public ValidationStringency readValidationStringency = ReadConstants.DEFAULT_READ_VALIDATION_STRINGENCY;
81 
82     @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
83             shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
84             doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
85                     "the input file but with the additional extension " + FileExtensions.SBI,
86             optional = true)
87     public File output;
88 
89     @Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
90             doc = "Splitting index granularity, an entry is created in the index every this many reads.",
91             optional = true)
92     public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;
93 
94     @Argument(fullName = CREATE_BAI_LONG_NAME,
95             doc = "Set this to create a bai index at the same time as creating a splitting index",
96             optional = true)
97     public boolean createBai = false;
98 
99 
100     @Override
doWork()101     public Object doWork() {
102         if( granularity <= 0) {
103             throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
104         }
105         final File index = getOutputFile(output, inputBam);
106         if(createBai){
107             createBaiAndSplittingIndex(inputBam, index, granularity, readValidationStringency);
108         } else {
109             createOnlySplittingIndex(inputBam, index, granularity);
110         }
111 
112         return 0;
113     }
114 
createOnlySplittingIndex(final File inputBam, final File index, final long granularity)115     private static void createOnlySplittingIndex(final File inputBam, final File index, final long granularity) {
116         assertIsBam(inputBam);
117         try(SeekableStream in = new SeekableFileStream(inputBam);
118             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
119                 BAMSBIIndexer.createIndex(in, out, granularity);
120         } catch (final IOException e) {
121             throw new UserException("Couldn't create splitting index", e);
122         }
123     }
124 
createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency)125     private static void createBaiAndSplittingIndex(final File inputBam, final File index, final long granularity, final ValidationStringency readValidationStringency) {
126         assertIsBam(inputBam);
127         try(SamReader reader = SamReaderFactory.makeDefault()
128                 .validationStringency(readValidationStringency)
129                 .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true)
130                 .open(inputBam);
131             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(index))) {
132                 final SAMFileHeader header = reader.getFileHeader();
133                 assertBamIsCoordinateSorted(header);
134                 final SBIIndexWriter indexer = new SBIIndexWriter(out, granularity);
135 
136                 final BAMIndexer bamIndexer = new BAMIndexer(IOUtils.replaceExtension(index, FileExtensions.BAI_INDEX), header);
137                 BAMFileSpan lastFilePointer = null;
138                 for(final SAMRecord read : reader){
139                     BAMFileSpan filePointer = (BAMFileSpan) read.getFileSource().getFilePointer();
140                     indexer.processRecord(filePointer.getFirstOffset());
141                     bamIndexer.processAlignment(read);
142                     lastFilePointer = filePointer;
143                 }
144                 long nextStart = 0;
145                 if (lastFilePointer != null && !lastFilePointer.getChunks().isEmpty()) {
146                     nextStart = lastFilePointer.getChunks().get(0).getChunkEnd();
147                 }
148                 if (nextStart == 0) {
149                     nextStart = BlockCompressedFilePointerUtil.makeFilePointer(inputBam.length()); // default to file length (in case of no reads)
150                 }
151                 indexer.finish(nextStart, inputBam.length()); // nextStart is start of next record that would be added
152                 bamIndexer.finish();
153         } catch (final IOException e) {
154             throw new UserException("Couldn't create splitting index", e);
155         }
156     }
157 
assertBamIsCoordinateSorted(final SAMFileHeader header)158     private static void assertBamIsCoordinateSorted(final SAMFileHeader header) {
159         if( header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
160             throw new UserException.BadInput("Cannot create a " + FileExtensions.BAI_INDEX + " index for a file " +
161                     "that isn't coordinate sorted.");
162         }
163     }
164 
165 
assertIsBam(final File inputBam)166     private static void assertIsBam(final File inputBam) {
167         if(!BamFileIoUtils.isBamFile(inputBam)) {
168             throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a "
169                     + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified.");
170         }
171     }
172 
getOutputFile(final File suggestedOutput, final File input)173     private static File getOutputFile(final File suggestedOutput, final File input) {
174         if(suggestedOutput == null){
175             return new File(input.getPath() + FileExtensions.SBI);
176         } else {
177             if (!suggestedOutput.getAbsolutePath().endsWith("bam" + FileExtensions.SBI)){
178                 logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
179                         + "bam"+ FileExtensions.SBI + ".  Output file: "+suggestedOutput);
180             }
181             return suggestedOutput;
182         }
183     }
184 }
185