package org.broadinstitute.hellbender.tools.spark;

import htsjdk.samtools.*;
import htsjdk.samtools.BAMSBIIndexer;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedFilePointerUtil;
import htsjdk.samtools.util.FileExtensions;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.ReadConstants;
import org.codehaus.plexus.util.FileUtils;
import picard.cmdline.programgroups.OtherProgramGroup;

import java.io.*;

/**
 * Create a Hadoop BAM splitting index and optionally a BAM index from a BAM file.
 *
 * <h3>Input</h3>
 *
 * <ul>
 *     <li>A BAM file</li>
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <ul>
 *     <li>BAM splitting index file</li>
 *     <li>BAM bai index (optional)</li>
 * </ul>
 *
 * <h3>Usage example</h3>
 * <pre>
 *     gatk CreateHadoopBamSplittingIndex \
 *         -I input_reads.bam \
 *         -O input_reads.bam.sbi
 * </pre>
 * or if one wants to generate bai as well
 * <pre>
 *     gatk CreateHadoopBamSplittingIndex \
 *         -I input_reads.bam \
 *         -O input_reads.bam.sbi \
 *         --create-bai
 * </pre>
 */
@CommandLineProgramProperties(summary = "Create a Hadoop BAM splitting index and optionally a BAM index from a BAM file",
        oneLineSummary = "Create a Hadoop BAM splitting index" ,
        programGroup = OtherProgramGroup.class)
@DocumentedFeature
@BetaFeature
public final class CreateHadoopBamSplittingIndex extends CommandLineProgram {
    private static final Logger logger = LogManager.getLogger(CreateHadoopBamSplittingIndex.class);

    public static final String SPLITTING_INDEX_GRANULARITY_LONG_NAME = "splitting-index-granularity";
    public static final String CREATE_BAI_LONG_NAME = "create-bai";

    @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
            doc = "BAM file to create a HadoopBam splitting index for",
            optional = false )
    public File inputBam;

    @Argument(fullName = StandardArgumentDefinitions.READ_VALIDATION_STRINGENCY_LONG_NAME,
            shortName = StandardArgumentDefinitions.READ_VALIDATION_STRINGENCY_SHORT_NAME,
            doc = "Validation stringency for all SAM/BAM/CRAM/SRA files read by this program. The default stringency value SILENT " +
                    "can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) " +
                    "do not otherwise need to be decoded.",
            common=true,
            optional=true)
    public ValidationStringency readValidationStringency = ReadConstants.DEFAULT_READ_VALIDATION_STRINGENCY;

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc = "The splitting index (SBI) file. If this is unspecified an index will be created with the same name as " +
                    "the input file but with the additional extension " + FileExtensions.SBI,
            optional = true)
    public File output;

    @Argument(fullName = SPLITTING_INDEX_GRANULARITY_LONG_NAME,
            doc = "Splitting index granularity, an entry is created in the index every this many reads.",
            optional = true)
    public long granularity = SBIIndexWriter.DEFAULT_GRANULARITY;

    @Argument(fullName = CREATE_BAI_LONG_NAME,
            doc = "Set this to create a bai index at the same time as creating a splitting index",
            optional = true)
    public boolean createBai = false;

    @Override
    public Object doWork() {
        // Reject non-positive granularity up front, before touching any files.
        if (granularity <= 0) {
            throw new CommandLineException.BadArgumentValue(SPLITTING_INDEX_GRANULARITY_LONG_NAME, Long.toString(granularity), "Granularity must be > 0");
        }
        final File indexFile = getOutputFile(output, inputBam);
        if (createBai) {
            createBaiAndSplittingIndex(inputBam, indexFile, granularity, readValidationStringency);
        } else {
            createOnlySplittingIndex(inputBam, indexFile, granularity);
        }

        return 0;
    }

    /**
     * Writes an SBI splitting index for {@code bam} to {@code indexFile} without
     * producing a bai. Streams the raw BGZF file directly via htsjdk's
     * {@link BAMSBIIndexer}, so records are never decoded.
     */
    private static void createOnlySplittingIndex(final File bam, final File indexFile, final long granularity) {
        assertIsBam(bam);
        // Both streams are closed by try-with-resources, even on failure.
        try (SeekableStream bamStream = new SeekableFileStream(bam);
             BufferedOutputStream indexStream = new BufferedOutputStream(new FileOutputStream(indexFile))) {
            BAMSBIIndexer.createIndex(bamStream, indexStream, granularity);
        } catch (final IOException e) {
            throw new UserException("Couldn't create splitting index", e);
        }
    }

    /**
     * Writes both an SBI splitting index and a bai index in a single pass over
     * {@code bam}. Requires the input to be coordinate sorted (bai precondition).
     * The reader is opened with {@code INCLUDE_SOURCE_IN_RECORDS} so each record
     * carries the virtual file offset needed by the splitting index.
     */
    private static void createBaiAndSplittingIndex(final File bam, final File indexFile, final long granularity, final ValidationStringency stringency) {
        assertIsBam(bam);
        try (SamReader reader = SamReaderFactory.makeDefault()
                .validationStringency(stringency)
                .setOption(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, true)
                .open(bam);
             BufferedOutputStream indexStream = new BufferedOutputStream(new FileOutputStream(indexFile))) {
            final SAMFileHeader header = reader.getFileHeader();
            assertBamIsCoordinateSorted(header);
            final SBIIndexWriter sbiWriter = new SBIIndexWriter(indexStream, granularity);

            // The bai is written next to the SBI, swapping in the .bai extension.
            final BAMIndexer baiWriter = new BAMIndexer(IOUtils.replaceExtension(indexFile, FileExtensions.BAI_INDEX), header);
            BAMFileSpan lastSpan = null;
            for (final SAMRecord record : reader) {
                final BAMFileSpan span = (BAMFileSpan) record.getFileSource().getFilePointer();
                sbiWriter.processRecord(span.getFirstOffset());
                baiWriter.processAlignment(record);
                lastSpan = span;
            }
            // Virtual offset just past the last record; the SBI footer needs it.
            long nextStart = (lastSpan == null || lastSpan.getChunks().isEmpty())
                    ? 0
                    : lastSpan.getChunks().get(0).getChunkEnd();
            if (nextStart == 0) {
                nextStart = BlockCompressedFilePointerUtil.makeFilePointer(bam.length()); // default to file length (in case of no reads)
            }
            sbiWriter.finish(nextStart, bam.length()); // nextStart is start of next record that would be added
            baiWriter.finish();
        } catch (final IOException e) {
            throw new UserException("Couldn't create splitting index", e);
        }
    }

    /** Fails with {@link UserException.BadInput} unless the header says coordinate sorted. */
    private static void assertBamIsCoordinateSorted(final SAMFileHeader header) {
        if (header.getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("Cannot create a " + FileExtensions.BAI_INDEX + " index for a file " +
                    "that isn't coordinate sorted.");
        }
    }

    /** Fails with {@link UserException.BadInput} unless the file is recognizably a BAM. */
    private static void assertIsBam(final File inputBam) {
        if (!BamFileIoUtils.isBamFile(inputBam)) {
            throw new UserException.BadInput("A splitting index is only relevant for a bam file, but a "
                    + "file with extension "+ FileUtils.getExtension(inputBam.getName()) + " was specified.");
        }
    }

    /**
     * Resolves the SBI output path: when the user gave none, derive it by
     * appending {@link FileExtensions#SBI} to the input path; otherwise honor
     * the user's choice, warning if it doesn't end with the conventional
     * {@code bam.sbi} suffix.
     */
    private static File getOutputFile(final File suggestedOutput, final File input) {
        if (suggestedOutput == null) {
            return new File(input.getPath() + FileExtensions.SBI);
        }
        if (!suggestedOutput.getAbsolutePath().endsWith("bam" + FileExtensions.SBI)) {
            logger.warn("Creating a splitting index (SBI) with an extension that doesn't match "
                    + "bam"+ FileExtensions.SBI + ". Output file: "+suggestedOutput);
        }
        return suggestedOutput;
    }
}