1 /*=========================================================================== 2 * 3 * PUBLIC DOMAIN NOTICE 4 * National Center for Biotechnology Information 5 * 6 * This software/database is a "United States Government Work" under the 7 * terms of the United States Copyright Act. It was written as part of 8 * the author's official duties as a United States Government employee and 9 * thus cannot be copyrighted. This software/database is freely available 10 * to the public for use. The National Library of Medicine and the U.S. 11 * Government have not placed any restriction on its use or reproduction. 12 * 13 * Although all reasonable efforts have been taken to ensure the accuracy 14 * and reliability of the software and data, the NLM and the U.S. 15 * Government do not and cannot warrant the performance or results that 16 * may be obtained by using this software or data. The NLM and the U.S. 17 * Government disclaim all warranties, express or implied, including 18 * warranties of performance, merchantability or fitness for any particular 19 * purpose. 20 * 21 * Please cite the author in any work or product based on this material. 22 * 23 * =========================================================================== 24 * 25 */ 26 27 #ifndef _h_sam_dump_opts_ 28 #define _h_sam_dump_opts_ 29 30 #ifdef __cplusplus 31 extern "C" { 32 #endif 33 #if 0 34 } 35 #endif 36 37 #include <klib/container.h> 38 #include <klib/vector.h> 39 #include <klib/out.h> 40 #include <klib/text.h> 41 #include <klib/rc.h> 42 #include <klib/log.h> 43 #include <klib/namelist.h> 44 45 #include <kapp/args.h> 46 #include "perf_log.h" 47 #include "rna_splice_log.h" 48 49 #include <stdio.h> 50 #include <stdlib.h> 51 #include <string.h> 52 #include <assert.h> 53 #include <strtol.h> 54 55 56 #define OPT_UNALIGNED "unaligned" 57 #define OPT_PRIM_ONLY "primary" 58 #define OPT_CIGAR_LONG "cigar-long" 59 #define OPT_CG_SAM "CG-SAM" 60 #define OPT_CG_EVIDENCE "CG-evidence" 61 #define OPT_CG_EV_DNB "CG-ev-dnb" 62 #define OPT_CG_MAPP "CG-mappings" 63 #define OPT_REGION "aligned-region" 64 #define OPT_RECAL_HDR "header" 65 #define OPT_HDR_FILE "header-file" 66 #define OPT_NO_HDR "no-header" 67 #define OPT_USE_SEQID "seqid" 68 #define OPT_HIDE_IDENT "hide-identical" 69 #define OPT_CIGAR_CG "cigar-CG" 70 #define OPT_CIGAR_CG_M "cigar-CG-merge" 71 #define OPT_PREFIX "prefix" 72 #define OPT_REVERSE "reverse" 73 #define OPT_SPOTGRP "spot-group" 74 #define OPT_MATE_GAP "mate-cache-row-gap" 75 #define OPT_XI_DEBUG "XI" 76 #define OPT_Q_QUANT "qual-quant" 77 #define OPT_GZIP "gzip" 78 #define OPT_BZIP2 "bzip2" 79 #define OPT_FASTQ "fastq" 80 #define OPT_FASTA "fasta" 81 #define OPT_HDR_COMMENT "header-comment" 82 #define OPT_MATE_DIST "matepair-distance" 83 #define OPT_OUTPUTFILE "output-file" 84 #define OPT_OUTBUFSIZE "output-buffer-size" 85 #define OPT_REPORT "report" 86 #define OPT_CACHEREPORT "cachereport" 87 #define OPT_UNALIGNED_ONLY "unaligned-spots-only" 88 #define OPT_CG_NAMES "CG-names" 89 #define OPT_CIGAR_TEST "cigar-test" 90 #define OPT_CURSOR_CACHE "cursor-cache" 91 #define OPT_DUMP_MODE "dump-mode" 92 #define OPT_MIN_MAPQ "min-mapq" 93 #define OPT_NO_MATE_CACHE "no-mate-cache" 94 #define OPT_LEGACY "legacy" 95 #define OPT_NEW "new" 96 #define OPT_RNA_SPLICE "rna-splicing" 97 #define OPT_RNA_SPLICEL "rna-splice-level" 98 #define OPT_RNA_SPLICE_LOG "rna-splice-log" 99 #define OPT_NO_MT "disable-multithreading" 100 #define OPT_TIMING "timing" 101 #define OPT_MD_FLAG "with-md-flag" 102 #define OPT_NGC "ngc" 103 104 typedef struct range 105 { 106 uint64_t start; 107 uint64_t end; 108 } range; 109 110 111 typedef struct reference_region 112 { 113 BSTNode node; 114 const char * name; /* the name of the reference */ 115 Vector ranges; /* what regions on this reference */ 116 } reference_region; 117 118 119 enum header_mode 120 { 121 hm_none = 0, /* do not dump the headers at all */ 122 hm_recalc, /* recalculate the headers */ 123 hm_dump, /* dump the header found in metadata */ 124 hm_file /* take the complete header part from a file */ 125 }; 126 127 enum output_format 128 { 129 of_sam = 0, /* use sam-tools format */ 130 of_fasta, /* use fasta-format */ 131 of_fastq /* use fastq-format */ 132 }; 133 134 enum output_compression 135 { 136 oc_none = 0, /* do not compress output */ 137 oc_gzip, /* compress output with gzip */ 138 oc_bzip2 /* compress output with bzip2 */ 139 }; 140 141 enum cigar_treatment 142 { 143 ct_unchanged = 0, /* use the cigar-string as it is stored */ 144 ct_cg_style, /* transform cigar into cg-style ( has B/N ) */ 145 ct_cg_merge /* transform cg-data(length of read/patterns in cigar) into valid SAM (cigar/READ/QUALITY) */ 146 }; 147 148 149 enum dump_mode 150 { 151 /* in case of: aligned reads requested + no regions given */ 152 dm_one_ref_at_a_time = 0, /* create a set-iter each for every reference sequentially, put only one reference into it */ 153 dm_prepare_all_refs /* create only ONE set-iter, put ALL references into it */ 154 }; 155 156 157 typedef struct samdump_opts 158 { 159 /* tree with regions, each node has a sorted vector of ranges, can be empty ... */ 160 BSTree regions; /* contains reference_region structs */ 161 162 /* vector with header-comments, can be empty... */ 163 VNamelist * hdr_comments; 164 165 /* vector input files/accessions/url's */ 166 VNamelist * input_files; 167 168 /* vector with metapair-distances... */ 169 Vector mp_dist; 170 171 /* prepend qname with this prefix */ 172 const char * qname_prefix; 173 174 /* the quality quantization string */ 175 const char * qual_quant; 176 177 /* optional outputfile */ 178 const char * outputfile; 179 180 /* optional header-file */ 181 const char * header_file; 182 183 /* cigar-test >>> not advertized! */ 184 const char * cigar_test; 185 186 /* timing-file >>> not advertized! */ 187 const char * timing_file; 188 189 /* log file for rna-splicing-events */ 190 const char * rna_splice_log_file; 191 192 /* timing-performane-log, created if timing_file given */ 193 struct perf_log * perf_log; 194 195 /* logging of rna-splicing on reqest */ 196 struct rna_splice_log * rna_splice_log; 197 198 uint32_t region_count; 199 uint32_t input_file_count; 200 uint32_t rna_splice_level; /* can be 0 || 1 || 2 */ 201 202 int32_t min_mapq; 203 204 /* how much buffering on the output-buffer, of OFF if zero */ 205 uint32_t output_buffer_size; 206 207 /* mate's farther apart than this are not cached */ 208 uint32_t mape_gap_cache_limit; 209 210 size_t cursor_cache_size; 211 212 /* how the sam-headers are treated */ 213 enum header_mode header_mode; 214 215 /* how the cigar-string is treated */ 216 enum cigar_treatment cigar_treatment; 217 218 /* in which format should the output be created */ 219 enum output_format output_format; 220 221 /* should the output be compressed / in which format */ 222 enum output_compression output_compression; 223 224 /* how to process in case of: aligned reads requested + no regions given */ 225 enum dump_mode dump_mode; 226 227 /* use a mate-cache to dump aligned and half-aligned reads */ 228 bool use_mate_cache; 229 bool force_legacy; 230 bool force_new; 231 232 /* which tables have to be processed/dumped */ 233 bool dump_primary_alignments; 234 bool dump_secondary_alignments; 235 bool dump_cg_evidence; 236 bool dump_cg_sam; 237 bool dump_cg_ev_dnb; 238 bool merge_cg_cigar; 239 240 bool dump_unaligned_reads; 241 bool dump_unaligned_only; 242 bool dump_cga_tools_mode; 243 244 /* what alignment/unaligned reads should be dumped */ 245 bool print_half_unaligned_reads; 246 bool print_fully_unaligned_reads; 247 248 /* flag that shows if we need to filter by matepair-distance */ 249 bool use_matepair_filter; 250 bool use_min_mapq; 251 252 /* options changing the output-format */ 253 bool use_seqid_as_refname; 254 bool use_long_cigar; 255 bool print_matches_as_equal_sign; 256 bool print_spot_group_in_name; 257 bool reverse_unaligned_reads; 258 bool print_alignment_id_in_column_xi; 259 bool report_options; 260 bool report_cache; 261 bool print_cg_names; 262 bool rna_splicing; 263 264 /* option to disable multi-threading */ 265 bool no_mt; 266 267 bool with_md_flag; 268 269 uint8_t qual_quant_matrix[ 256 ]; 270 } samdump_opts; 271 272 273 typedef struct foreach_reference_func 274 { 275 rc_t ( CC * on_reference ) ( const char * name, Vector *ranges, void *data ); 276 const char * name; 277 void * data; 278 rc_t rc; 279 } foreach_reference_func; 280 281 282 rc_t foreach_reference( BSTree * regions, 283 rc_t ( CC * on_reference ) ( const char * name, Vector *ranges, void *data ), 284 void *data ); 285 286 int cmp_pchar( const char * a, const char * b ); 287 288 rc_t gather_options( Args * args, samdump_opts * opts ); 289 void report_options( const samdump_opts * opts ); 290 void release_options( samdump_opts * opts ); 291 292 bool filter_by_matepair_dist( const samdump_opts * opts, int32_t tlen ); 293 294 bool is_this_alignment_requested( const samdump_opts * opts, const char *refname, uint32_t refname_len, 295 uint64_t start, uint64_t len ); 296 297 rc_t dump_name( const samdump_opts * opts, int64_t seq_spot_id, 298 const char * spot_group, uint32_t spot_group_len ); 299 300 rc_t dump_name_legacy( const samdump_opts * opts, const char * name, size_t name_len, 301 const char * spot_group, uint32_t spot_group_len ); 302 303 rc_t dump_quality( const samdump_opts * opts, char const *quality, uint32_t qual_len, bool reverse ); 304 305 rc_t dump_quality_33( const samdump_opts * opts, char const *quality, uint32_t qual_len, bool reverse ); 306 307 #endif 308