1 #ifndef MINIMAP2_H
2 #define MINIMAP2_H
3 
4 #include <stdint.h>
5 #include <stdio.h>
6 #include <sys/types.h>
7 
8 #define MM_F_NO_DIAG       0x001 // no exact diagonal hit
9 #define MM_F_NO_DUAL       0x002 // skip pairs where query name is lexicographically larger than target name
10 #define MM_F_CIGAR         0x004
11 #define MM_F_OUT_SAM       0x008
12 #define MM_F_NO_QUAL       0x010
13 #define MM_F_OUT_CG        0x020
14 #define MM_F_OUT_CS        0x040
15 #define MM_F_SPLICE        0x080 // splice mode
16 #define MM_F_SPLICE_FOR    0x100 // match GT-AG
17 #define MM_F_SPLICE_REV    0x200 // match CT-AC, the reverse complement of GT-AG
18 #define MM_F_NO_LJOIN      0x400
19 #define MM_F_OUT_CS_LONG   0x800
20 #define MM_F_SR            0x1000
21 #define MM_F_FRAG_MODE     0x2000
22 #define MM_F_NO_PRINT_2ND  0x4000
23 #define MM_F_2_IO_THREADS  0x8000
24 #define MM_F_LONG_CIGAR    0x10000
25 #define MM_F_INDEPEND_SEG  0x20000
26 #define MM_F_SPLICE_FLANK  0x40000
27 #define MM_F_SOFTCLIP      0x80000
28 #define MM_F_FOR_ONLY      0x100000
29 #define MM_F_REV_ONLY      0x200000
30 #define MM_F_HEAP_SORT     0x400000
31 #define MM_F_ALL_CHAINS    0x800000
32 #define MM_F_OUT_MD        0x1000000
33 #define MM_F_COPY_COMMENT  0x2000000
34 #define MM_F_EQX           0x4000000 // use =/X instead of M
35 #define MM_F_PAF_NO_HIT    0x8000000 // output unmapped reads to PAF
36 #define MM_F_NO_END_FLT    0x10000000
37 #define MM_F_HARD_MLEVEL   0x20000000
38 #define MM_F_SAM_HIT_ONLY  0x40000000
39 #define MM_F_RMQ           (0x80000000LL)
40 #define MM_F_QSTRAND       (0x100000000LL)
41 #define MM_F_NO_INV        (0x200000000LL)
42 #define MM_F_NO_HASH_NAME  (0x400000000LL)
43 
44 #define MM_I_HPC          0x1
45 #define MM_I_NO_SEQ       0x2
46 #define MM_I_NO_NAME      0x4
47 
48 #define MM_IDX_MAGIC   "MMI\2"
49 
50 #define MM_MAX_SEG       255
51 
52 #define MM_CIGAR_MATCH      0
53 #define MM_CIGAR_INS        1
54 #define MM_CIGAR_DEL        2
55 #define MM_CIGAR_N_SKIP     3
56 #define MM_CIGAR_SOFTCLIP   4
57 #define MM_CIGAR_HARDCLIP   5
58 #define MM_CIGAR_PADDING    6
59 #define MM_CIGAR_EQ_MATCH   7
60 #define MM_CIGAR_X_MISMATCH 8
61 
62 #define MM_CIGAR_STR  "MIDNSHP=XB"
63 
64 #ifdef __cplusplus
65 extern "C" {
66 #endif
67 
68 // emulate 128-bit integers and arrays
69 typedef struct { uint64_t x, y; } mm128_t;
70 typedef struct { size_t n, m; mm128_t *a; } mm128_v;
71 
72 // minimap2 index
73 typedef struct {
74 	char *name;      // name of the db sequence
75 	uint64_t offset; // offset in mm_idx_t::S
76 	uint32_t len;    // length
77 	uint32_t is_alt;
78 } mm_idx_seq_t;
79 
80 typedef struct {
81 	int32_t b, w, k, flag;
82 	uint32_t n_seq;            // number of reference sequences
83 	int32_t index;
84 	int32_t n_alt;
85 	mm_idx_seq_t *seq;         // sequence name, length and offset
86 	uint32_t *S;               // 4-bit packed sequence
87 	struct mm_idx_bucket_s *B; // index (hidden)
88 	struct mm_idx_intv_s *I;   // intervals (hidden)
89 	void *km, *h;
90 } mm_idx_t;
91 
92 // minimap2 alignment
93 typedef struct {
94 	uint32_t capacity;                  // the capacity of cigar[]
95 	int32_t dp_score, dp_max, dp_max2;  // DP score; score of the max-scoring segment; score of the best alternate mappings
96 	uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
97 	uint32_t n_cigar;                   // number of cigar operations in cigar[]
98 	uint32_t cigar[];
99 } mm_extra_t;
100 
101 typedef struct {
102 	int32_t id;             // ID for internal uses (see also parent below)
103 	int32_t cnt;            // number of minimizers; if on the reverse strand
104 	int32_t rid;            // reference index; if this is an alignment from inversion rescue
105 	int32_t score;          // DP alignment score
106 	int32_t qs, qe, rs, re; // query start and end; reference start and end
107 	int32_t parent, subsc;  // parent==id if primary; best alternate mapping score
108 	int32_t as;             // offset in the a[] array (for internal uses only)
109 	int32_t mlen, blen;     // seeded exact match length; seeded alignment block length
110 	int32_t n_sub;          // number of suboptimal mappings
111 	int32_t score0;         // initial chaining score (before chain merging/spliting)
112 	uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
113 	uint32_t hash;
114 	float div;
115 	mm_extra_t *p;
116 } mm_reg1_t;
117 
118 // indexing and mapping options
119 typedef struct {
120 	short k, w, flag, bucket_bits;
121 	int64_t mini_batch_size;
122 	uint64_t batch_size;
123 } mm_idxopt_t;
124 
125 typedef struct {
126 	int64_t flag;    // see MM_F_* macros
127 	int seed;
128 	int sdust_thres; // score threshold for SDUST; 0 to disable
129 
130 	int max_qlen;    // max query length
131 
132 	int bw, bw_long; // bandwidth
133 	int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
134 	int max_frag_len;
135 	int max_chain_skip, max_chain_iter;
136 	int min_cnt;         // min number of minimizers on each chain
137 	int min_chain_score; // min chaining score
138 	float chain_gap_scale;
139 	float chain_skip_scale;
140 	int rmq_size_cap, rmq_inner_dist;
141 	int rmq_rescue_size;
142 	float rmq_rescue_ratio;
143 
144 	float mask_level;
145 	int mask_len;
146 	float pri_ratio;
147 	int best_n;      // top best_n chains are subjected to DP alignment
148 
149 	float alt_drop;
150 
151 	int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
152 	int sc_ambi; // score when one or both bases are "N"
153 	int noncan;      // cost of non-canonical splicing sites
154 	int junc_bonus;
155 	int zdrop, zdrop_inv;   // break alignment if alignment score drops too fast along the diagonal
156 	int end_bonus;
157 	int min_dp_max;  // drop an alignment if the score of the max scoring segment is below this threshold
158 	int min_ksw_len;
159 	int anchor_ext_len, anchor_ext_shift;
160 	float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
161 
162 	int rank_min_len;
163 	float rank_frac;
164 
165 	int pe_ori, pe_bonus;
166 
167 	float mid_occ_frac;  // only used by mm_mapopt_update(); see below
168 	float q_occ_frac;
169 	int32_t min_mid_occ, max_mid_occ;
170 	int32_t mid_occ;     // ignore seeds with occurrences above this threshold
171 	int32_t max_occ, max_max_occ, occ_dist;
172 	int64_t mini_batch_size; // size of a batch of query bases to process in parallel
173 	int64_t max_sw_mat;
174 	int64_t cap_kalloc;
175 
176 	const char *split_prefix;
177 } mm_mapopt_t;
178 
179 // index reader
180 typedef struct {
181 	int is_idx, n_parts;
182 	int64_t idx_size;
183 	mm_idxopt_t opt;
184 	FILE *fp_out;
185 	union {
186 		struct mm_bseq_file_s *seq;
187 		FILE *idx;
188 	} fp;
189 } mm_idx_reader_t;
190 
191 // memory buffer for thread-local storage during mapping
192 typedef struct mm_tbuf_s mm_tbuf_t;
193 
194 // global variables
195 extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
196 extern double mm_realtime0; // wall-clock timer
197 
198 /**
199  * Set default or preset parameters
200  *
201  * @param preset     NULL to set all parameters as default; otherwise apply preset to affected parameters
202  * @param io         pointer to indexing parameters
203  * @param mo         pointer to mapping parameters
204  *
205  * @return 0 if success; -1 if _present_ unknown
206  */
207 int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
208 int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
209 
210 /**
211  * Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
212  *
213  * If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
214  * more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
215  * occurrence.
216  *
217  * @param opt        mapping parameters
218  * @param mi         minimap2 index
219  */
220 void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
221 
222 void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
223 
224 /**
225  * Initialize an index reader
226  *
227  * @param fn         index or fasta/fastq file name (this function tests the file type)
228  * @param opt        indexing parameters
229  * @param fn_out     if not NULL, write built index to this file
230  *
231  * @return an index reader on success; NULL if fail to open _fn_
232  */
233 mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
234 
235 /**
236  * Read/build an index
237  *
238  * If the input file is an index file, this function reads one part of the
239  * index and returns. If the input file is a sequence file (fasta or fastq),
240  * this function constructs the index for about mm_idxopt_t::batch_size bases.
241  * Importantly, for a huge collection of sequences, this function may only
242  * return an index for part of sequences. It needs to be repeatedly called
243  * to traverse the entire index/sequence file.
244  *
245  * @param r          index reader
246  * @param n_threads  number of threads for constructing index
247  *
248  * @return an index on success; NULL if reaching the end of the input file
249  */
250 mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
251 
252 /**
253  * Destroy/deallocate an index reader
254  *
255  * @param r          index reader
256  */
257 void mm_idx_reader_close(mm_idx_reader_t *r);
258 
259 int mm_idx_reader_eof(const mm_idx_reader_t *r);
260 
261 /**
262  * Check whether the file contains a minimap2 index
263  *
264  * @param fn         file name
265  *
266  * @return the file size if fn is an index file; 0 if fn is not.
267  */
268 int64_t mm_idx_is_idx(const char *fn);
269 
270 /**
271  * Load a part of an index
272  *
273  * Given a uni-part index, this function loads the entire index into memory.
274  * Given a multi-part index, it loads one part only and places the file pointer
275  * at the end of that part.
276  *
277  * @param fp         pointer to FILE object
278  *
279  * @return minimap2 index read from fp
280  */
281 mm_idx_t *mm_idx_load(FILE *fp);
282 
283 /**
284  * Append an index (or one part of a full index) to file
285  *
286  * @param fp         pointer to FILE object
287  * @param mi         minimap2 index
288  */
289 void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
290 
291 /**
292  * Create an index from strings in memory
293  *
294  * @param w            minimizer window size
295  * @param k            minimizer k-mer size
296  * @param is_hpc       use HPC k-mer if true
297  * @param bucket_bits  number of bits for the first level of the hash table
298  * @param n            number of sequences
299  * @param seq          sequences in A/C/G/T
300  * @param name         sequence names; could be NULL
301  *
302  * @return minimap2 index
303  */
304 mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
305 
306 /**
307  * Print index statistics to stderr
308  *
309  * @param mi         minimap2 index
310  */
311 void mm_idx_stat(const mm_idx_t *idx);
312 
313 /**
314  * Destroy/deallocate an index
315  *
316  * @param r          minimap2 index
317  */
318 void mm_idx_destroy(mm_idx_t *mi);
319 
320 /**
321  * Initialize a thread-local buffer for mapping
322  *
323  * Each mapping thread requires a buffer specific to the thread (see mm_map()
324  * below). The primary purpose of this buffer is to reduce frequent heap
325  * allocations across threads. A buffer shall not be used by two or more
326  * threads.
327  *
328  * @return pointer to a thread-local buffer
329  */
330 mm_tbuf_t *mm_tbuf_init(void);
331 
332 /**
333  * Destroy/deallocate a thread-local buffer for mapping
334  *
335  * @param b          the buffer
336  */
337 void mm_tbuf_destroy(mm_tbuf_t *b);
338 
339 void *mm_tbuf_get_km(mm_tbuf_t *b);
340 
341 /**
342  * Align a query sequence against an index
343  *
344  * This function possibly finds multiple alignments of the query sequence.
345  * The returned array and the mm_reg1_t::p field of each element are allocated
346  * with malloc().
347  *
348  * @param mi         minimap2 index
349  * @param l_seq      length of the query sequence
350  * @param seq        the query sequence
351  * @param n_regs     number of hits (out)
352  * @param b          thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
353  * @param opt        mapping parameters
354  * @param name       query name, used for all-vs-all overlapping and debugging
355  *
356  * @return an array of hits which need to be deallocated with free() together
357  *         with mm_reg1_t::p of each element. The size is written to _n_regs_.
358  */
359 mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
360 
361 void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
362 
363 /**
364  * Align a fasta/fastq file and print alignments to stdout
365  *
366  * @param idx        minimap2 index
367  * @param fn         fasta/fastq file name
368  * @param opt        mapping parameters
369  * @param n_threads  number of threads
370  *
371  * @return 0 on success; -1 if _fn_ can't be read
372  */
373 int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
374 
375 int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
376 
377 /**
378  * Generate the cs tag (new in 2.12)
379  *
380  * @param km         memory blocks; set to NULL if unsure
381  * @param buf        buffer to write the cs/MD tag; typicall NULL on the first call
382  * @param max_len    max length of the buffer; typically set to 0 on the first call
383  * @param mi         index
384  * @param r          alignment
385  * @param seq        query sequence
386  * @param no_iden    true to use : instead of =
387  *
388  * @return the length of cs
389  */
390 int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
391 int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
392 
393 // query sequence name and sequence in the minimap2 index
394 int mm_idx_index_name(mm_idx_t *mi);
395 int mm_idx_name2id(const mm_idx_t *mi, const char *name);
396 int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
397 
398 int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
399 int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
400 int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
401 
402 // deprecated APIs for backward compatibility
403 void mm_mapopt_init(mm_mapopt_t *opt);
404 mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
405 
406 #ifdef __cplusplus
407 }
408 #endif
409 
410 #endif // MINIMAP2_H
411