1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #ifndef _h_sam_dump_opts_
28 #define _h_sam_dump_opts_
29 
30 #ifdef __cplusplus
31 extern "C" {
32 #endif
33 #if 0
34 }
35 #endif
36 
37 #include <klib/container.h>
38 #include <klib/vector.h>
39 #include <klib/out.h>
40 #include <klib/text.h>
41 #include <klib/rc.h>
42 #include <klib/log.h>
43 #include <klib/namelist.h>
44 
45 #include <kapp/args.h>
46 #include "perf_log.h"
47 #include "rna_splice_log.h"
48 
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <assert.h>
53 #include <strtol.h>
54 
55 
/*
 * Option-name string constants.
 * NOTE(review): presumably the long option names handed to the kapp Args
 * parser by the implementation of gather_options(); the registration itself
 * is not part of this header - confirm there.
 */
#define OPT_UNALIGNED       "unaligned"
#define OPT_PRIM_ONLY       "primary"
#define OPT_CIGAR_LONG      "cigar-long"
#define OPT_CG_SAM          "CG-SAM"
#define OPT_CG_EVIDENCE     "CG-evidence"
#define OPT_CG_EV_DNB       "CG-ev-dnb"
#define OPT_CG_MAPP         "CG-mappings"
#define OPT_REGION          "aligned-region"
#define OPT_RECAL_HDR       "header"
#define OPT_HDR_FILE        "header-file"
#define OPT_NO_HDR          "no-header"
#define OPT_USE_SEQID       "seqid"
#define OPT_HIDE_IDENT      "hide-identical"
#define OPT_CIGAR_CG        "cigar-CG"
#define OPT_CIGAR_CG_M      "cigar-CG-merge"
#define OPT_PREFIX          "prefix"
#define OPT_REVERSE         "reverse"
#define OPT_SPOTGRP         "spot-group"
#define OPT_MATE_GAP        "mate-cache-row-gap"
#define OPT_XI_DEBUG        "XI"
#define OPT_Q_QUANT         "qual-quant"
#define OPT_GZIP            "gzip"
#define OPT_BZIP2           "bzip2"
#define OPT_FASTQ           "fastq"
#define OPT_FASTA           "fasta"
#define OPT_HDR_COMMENT     "header-comment"
#define OPT_MATE_DIST       "matepair-distance"
#define OPT_OUTPUTFILE      "output-file"
#define OPT_OUTBUFSIZE      "output-buffer-size"
#define OPT_REPORT          "report"
#define OPT_CACHEREPORT     "cachereport"
#define OPT_UNALIGNED_ONLY  "unaligned-spots-only"
#define OPT_CG_NAMES        "CG-names"
#define OPT_CIGAR_TEST      "cigar-test"
#define OPT_CURSOR_CACHE    "cursor-cache"
#define OPT_DUMP_MODE       "dump-mode"
#define OPT_MIN_MAPQ        "min-mapq"
#define OPT_NO_MATE_CACHE   "no-mate-cache"
#define OPT_LEGACY          "legacy"
#define OPT_NEW             "new"
#define OPT_RNA_SPLICE      "rna-splicing"
#define OPT_RNA_SPLICEL     "rna-splice-level"
#define OPT_RNA_SPLICE_LOG  "rna-splice-log"
#define OPT_NO_MT           "disable-multithreading"
#define OPT_TIMING          "timing"
#define OPT_MD_FLAG         "with-md-flag"
#define OPT_NGC             "ngc"
103 
/* A pair of 64-bit unsigned bounds: 'start' and 'end'.
 * NOTE(review): whether 'end' is inclusive is decided by the code that
 * fills and queries these ranges, not by this header - confirm there. */
struct range
{
    uint64_t start;     /* first bound of the range */
    uint64_t end;       /* second bound of the range */
};
typedef struct range range;
109 
110 
111 typedef struct reference_region
112 {
113     BSTNode node;
114     const char * name;      /* the name of the reference */
115     Vector ranges;          /* what regions on this reference */
116 } reference_region;
117 
118 
/* Selects where the SAM header comes from, if it is written at all. */
enum header_mode
{
    hm_none   = 0,  /* no header output */
    hm_recalc = 1,  /* header is recalculated */
    hm_dump   = 2,  /* header stored in the metadata is dumped */
    hm_file   = 3   /* whole header section is taken from a file */
};
126 
/* Selects the textual format of the produced output. */
enum output_format
{
    of_sam   = 0,   /* sam-tools format */
    of_fasta = 1,   /* fasta format */
    of_fastq = 2    /* fastq format */
};
133 
/* Selects the compression applied to the output stream. */
enum output_compression
{
    oc_none  = 0,   /* output is left uncompressed */
    oc_gzip  = 1,   /* output is compressed with gzip */
    oc_bzip2 = 2    /* output is compressed with bzip2 */
};
140 
/* Selects what is done with the stored cigar string before it is written. */
enum cigar_treatment
{
    ct_unchanged = 0,   /* cigar string is used exactly as stored */
    ct_cg_style  = 1,   /* cigar is transformed into cg-style ( has B/N ) */
    ct_cg_merge  = 2    /* cg-data ( length of read / patterns in cigar ) is
                           transformed into valid SAM ( cigar/READ/QUALITY ) */
};
147 
148 
/* Iteration strategy for the case: aligned reads requested + no regions given. */
enum dump_mode
{
    dm_one_ref_at_a_time = 0,   /* one set-iter per reference, created sequentially,
                                   each holding a single reference */
    dm_prepare_all_refs  = 1    /* a single set-iter holding ALL references */
};
155 
156 
157 typedef struct samdump_opts
158 {
159     /* tree with regions, each node has a sorted vector of ranges, can be empty ... */
160     BSTree regions;     /* contains reference_region structs */
161 
162     /* vector with header-comments, can be empty... */
163     VNamelist * hdr_comments;
164 
165     /* vector input files/accessions/url's */
166     VNamelist * input_files;
167 
168     /* vector with metapair-distances... */
169     Vector mp_dist;
170 
171     /* prepend qname with this prefix */
172     const char * qname_prefix;
173 
174     /* the quality quantization string */
175     const char * qual_quant;
176 
177     /* optional outputfile */
178     const char * outputfile;
179 
180     /* optional header-file */
181     const char * header_file;
182 
183     /* cigar-test >>> not advertized! */
184     const char * cigar_test;
185 
186     /* timing-file >>> not advertized! */
187     const char * timing_file;
188 
189     /* log file for rna-splicing-events */
190     const char * rna_splice_log_file;
191 
192     /* timing-performane-log, created if timing_file given */
193     struct perf_log * perf_log;
194 
195     /* logging of rna-splicing on reqest */
196     struct rna_splice_log * rna_splice_log;
197 
198     uint32_t region_count;
199     uint32_t input_file_count;
200     uint32_t rna_splice_level;  /* can be 0 || 1 || 2 */
201 
202     int32_t min_mapq;
203 
204     /* how much buffering on the output-buffer, of OFF if zero */
205     uint32_t output_buffer_size;
206 
207     /* mate's farther apart than this are not cached */
208     uint32_t mape_gap_cache_limit;
209 
210     size_t cursor_cache_size;
211 
212     /* how the sam-headers are treated */
213     enum header_mode header_mode;
214 
215     /* how the cigar-string is treated */
216     enum cigar_treatment cigar_treatment;
217 
218     /* in which format should the output be created */
219     enum output_format output_format;
220 
221     /* should the output be compressed / in which format */
222     enum output_compression output_compression;
223 
224     /* how to process in case of: aligned reads requested + no regions given */
225     enum dump_mode dump_mode;
226 
227     /* use a mate-cache to dump aligned and half-aligned reads */
228     bool use_mate_cache;
229     bool force_legacy;
230     bool force_new;
231 
232     /* which tables have to be processed/dumped */
233     bool dump_primary_alignments;
234     bool dump_secondary_alignments;
235     bool dump_cg_evidence;
236     bool dump_cg_sam;
237     bool dump_cg_ev_dnb;
238     bool merge_cg_cigar;
239 
240     bool dump_unaligned_reads;
241     bool dump_unaligned_only;
242     bool dump_cga_tools_mode;
243 
244     /* what alignment/unaligned reads should be dumped */
245     bool print_half_unaligned_reads;
246     bool print_fully_unaligned_reads;
247 
248     /* flag that shows if we need to filter by matepair-distance */
249     bool use_matepair_filter;
250     bool use_min_mapq;
251 
252     /* options changing the output-format */
253     bool use_seqid_as_refname;
254     bool use_long_cigar;
255     bool print_matches_as_equal_sign;
256     bool print_spot_group_in_name;
257     bool reverse_unaligned_reads;
258     bool print_alignment_id_in_column_xi;
259     bool report_options;
260     bool report_cache;
261     bool print_cg_names;
262     bool rna_splicing;
263 
264     /* option to disable multi-threading */
265     bool no_mt;
266 
267 	bool with_md_flag;
268 
269     uint8_t qual_quant_matrix[ 256 ];
270 } samdump_opts;
271 
272 
273 typedef struct foreach_reference_func
274 {
275     rc_t ( CC * on_reference ) ( const char * name, Vector *ranges, void *data );
276     const char * name;
277     void * data;
278     rc_t rc;
279 } foreach_reference_func;
280 
281 
282 rc_t foreach_reference( BSTree * regions,
283     rc_t ( CC * on_reference ) ( const char * name, Vector *ranges, void *data ),
284     void *data );
285 
/* Compares the two strings 'a' and 'b' and yields an int.
 * NOTE(review): sign convention and NULL handling are defined by the
 * implementation, which is not visible in this header. */
int cmp_pchar(const char *a, const char *b);
287 
288 rc_t gather_options( Args * args, samdump_opts * opts );
289 void report_options( const samdump_opts * opts );
290 void release_options( samdump_opts * opts );
291 
292 bool filter_by_matepair_dist( const samdump_opts * opts, int32_t tlen );
293 
294 bool is_this_alignment_requested( const samdump_opts * opts, const char *refname, uint32_t refname_len,
295                                   uint64_t start, uint64_t len );
296 
297 rc_t dump_name( const samdump_opts * opts, int64_t seq_spot_id,
298                 const char * spot_group, uint32_t spot_group_len );
299 
300 rc_t dump_name_legacy( const samdump_opts * opts, const char * name, size_t name_len,
301                        const char * spot_group, uint32_t spot_group_len );
302 
303 rc_t dump_quality( const samdump_opts * opts, char const *quality, uint32_t qual_len, bool reverse );
304 
305 rc_t dump_quality_33( const samdump_opts * opts, char const *quality, uint32_t qual_len, bool reverse );
306 
307 #endif
308