1 #ifndef MINIMAP2_H 2 #define MINIMAP2_H 3 4 #include <stdint.h> 5 #include <stdio.h> 6 #include <sys/types.h> 7 8 #define MM_F_NO_DIAG 0x001 // no exact diagonal hit 9 #define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name 10 #define MM_F_CIGAR 0x004 11 #define MM_F_OUT_SAM 0x008 12 #define MM_F_NO_QUAL 0x010 13 #define MM_F_OUT_CG 0x020 14 #define MM_F_OUT_CS 0x040 15 #define MM_F_SPLICE 0x080 // splice mode 16 #define MM_F_SPLICE_FOR 0x100 // match GT-AG 17 #define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG 18 #define MM_F_NO_LJOIN 0x400 19 #define MM_F_OUT_CS_LONG 0x800 20 #define MM_F_SR 0x1000 21 #define MM_F_FRAG_MODE 0x2000 22 #define MM_F_NO_PRINT_2ND 0x4000 23 #define MM_F_2_IO_THREADS 0x8000 24 #define MM_F_LONG_CIGAR 0x10000 25 #define MM_F_INDEPEND_SEG 0x20000 26 #define MM_F_SPLICE_FLANK 0x40000 27 #define MM_F_SOFTCLIP 0x80000 28 #define MM_F_FOR_ONLY 0x100000 29 #define MM_F_REV_ONLY 0x200000 30 #define MM_F_HEAP_SORT 0x400000 31 #define MM_F_ALL_CHAINS 0x800000 32 #define MM_F_OUT_MD 0x1000000 33 #define MM_F_COPY_COMMENT 0x2000000 34 #define MM_F_EQX 0x4000000 // use =/X instead of M 35 #define MM_F_PAF_NO_HIT 0x8000000 // output unmapped reads to PAF 36 #define MM_F_NO_END_FLT 0x10000000 37 #define MM_F_HARD_MLEVEL 0x20000000 38 #define MM_F_SAM_HIT_ONLY 0x40000000 39 #define MM_F_RMQ (0x80000000LL) 40 #define MM_F_QSTRAND (0x100000000LL) 41 #define MM_F_NO_INV (0x200000000LL) 42 #define MM_F_NO_HASH_NAME (0x400000000LL) 43 44 #define MM_I_HPC 0x1 45 #define MM_I_NO_SEQ 0x2 46 #define MM_I_NO_NAME 0x4 47 48 #define MM_IDX_MAGIC "MMI\2" 49 50 #define MM_MAX_SEG 255 51 52 #define MM_CIGAR_MATCH 0 53 #define MM_CIGAR_INS 1 54 #define MM_CIGAR_DEL 2 55 #define MM_CIGAR_N_SKIP 3 56 #define MM_CIGAR_SOFTCLIP 4 57 #define MM_CIGAR_HARDCLIP 5 58 #define MM_CIGAR_PADDING 6 59 #define MM_CIGAR_EQ_MATCH 7 60 #define MM_CIGAR_X_MISMATCH 8 61 62 #define MM_CIGAR_STR "MIDNSHP=XB" 63 64 #ifdef __cplusplus 65 extern "C" { 66 #endif 67 68 // emulate 128-bit integers and arrays 69 typedef struct { uint64_t x, y; } mm128_t; 70 typedef struct { size_t n, m; mm128_t *a; } mm128_v; 71 72 // minimap2 index 73 typedef struct { 74 char *name; // name of the db sequence 75 uint64_t offset; // offset in mm_idx_t::S 76 uint32_t len; // length 77 uint32_t is_alt; 78 } mm_idx_seq_t; 79 80 typedef struct { 81 int32_t b, w, k, flag; 82 uint32_t n_seq; // number of reference sequences 83 int32_t index; 84 int32_t n_alt; 85 mm_idx_seq_t *seq; // sequence name, length and offset 86 uint32_t *S; // 4-bit packed sequence 87 struct mm_idx_bucket_s *B; // index (hidden) 88 struct mm_idx_intv_s *I; // intervals (hidden) 89 void *km, *h; 90 } mm_idx_t; 91 92 // minimap2 alignment 93 typedef struct { 94 uint32_t capacity; // the capacity of cigar[] 95 int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings 96 uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for - 97 uint32_t n_cigar; // number of cigar operations in cigar[] 98 uint32_t cigar[]; 99 } mm_extra_t; 100 101 typedef struct { 102 int32_t id; // ID for internal uses (see also parent below) 103 int32_t cnt; // number of minimizers; if on the reverse strand 104 int32_t rid; // reference index; if this is an alignment from inversion rescue 105 int32_t score; // DP alignment score 106 int32_t qs, qe, rs, re; // query start and end; reference start and end 107 int32_t parent, subsc; // parent==id if primary; best alternate mapping score 108 int32_t as; // offset in the a[] array (for internal uses only) 109 int32_t mlen, blen; // seeded exact match length; seeded alignment block length 110 int32_t n_sub; // number of suboptimal mappings 111 int32_t score0; // initial chaining score (before chain merging/spliting) 112 uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5; 113 uint32_t hash; 114 float div; 115 mm_extra_t *p; 116 } mm_reg1_t; 117 118 // indexing and mapping options 119 typedef struct { 120 short k, w, flag, bucket_bits; 121 int64_t mini_batch_size; 122 uint64_t batch_size; 123 } mm_idxopt_t; 124 125 typedef struct { 126 int64_t flag; // see MM_F_* macros 127 int seed; 128 int sdust_thres; // score threshold for SDUST; 0 to disable 129 130 int max_qlen; // max query length 131 132 int bw, bw_long; // bandwidth 133 int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window 134 int max_frag_len; 135 int max_chain_skip, max_chain_iter; 136 int min_cnt; // min number of minimizers on each chain 137 int min_chain_score; // min chaining score 138 float chain_gap_scale; 139 float chain_skip_scale; 140 int rmq_size_cap, rmq_inner_dist; 141 int rmq_rescue_size; 142 float rmq_rescue_ratio; 143 144 float mask_level; 145 int mask_len; 146 float pri_ratio; 147 int best_n; // top best_n chains are subjected to DP alignment 148 149 float alt_drop; 150 151 int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties 152 int sc_ambi; // score when one or both bases are "N" 153 int noncan; // cost of non-canonical splicing sites 154 int junc_bonus; 155 int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal 156 int end_bonus; 157 int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold 158 int min_ksw_len; 159 int anchor_ext_len, anchor_ext_shift; 160 float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio 161 162 int rank_min_len; 163 float rank_frac; 164 165 int pe_ori, pe_bonus; 166 167 float mid_occ_frac; // only used by mm_mapopt_update(); see below 168 float q_occ_frac; 169 int32_t min_mid_occ, max_mid_occ; 170 int32_t mid_occ; // ignore seeds with occurrences above this threshold 171 int32_t max_occ, max_max_occ, occ_dist; 172 int64_t mini_batch_size; // size of a batch of query bases to process in parallel 173 int64_t max_sw_mat; 174 int64_t cap_kalloc; 175 176 const char *split_prefix; 177 } mm_mapopt_t; 178 179 // index reader 180 typedef struct { 181 int is_idx, n_parts; 182 int64_t idx_size; 183 mm_idxopt_t opt; 184 FILE *fp_out; 185 union { 186 struct mm_bseq_file_s *seq; 187 FILE *idx; 188 } fp; 189 } mm_idx_reader_t; 190 191 // memory buffer for thread-local storage during mapping 192 typedef struct mm_tbuf_s mm_tbuf_t; 193 194 // global variables 195 extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag 196 extern double mm_realtime0; // wall-clock timer 197 198 /** 199 * Set default or preset parameters 200 * 201 * @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters 202 * @param io pointer to indexing parameters 203 * @param mo pointer to mapping parameters 204 * 205 * @return 0 if success; -1 if _present_ unknown 206 */ 207 int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo); 208 int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo); 209 210 /** 211 * Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac 212 * 213 * If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no 214 * more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher 215 * occurrence. 216 * 217 * @param opt mapping parameters 218 * @param mi minimap2 index 219 */ 220 void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi); 221 222 void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len); 223 224 /** 225 * Initialize an index reader 226 * 227 * @param fn index or fasta/fastq file name (this function tests the file type) 228 * @param opt indexing parameters 229 * @param fn_out if not NULL, write built index to this file 230 * 231 * @return an index reader on success; NULL if fail to open _fn_ 232 */ 233 mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out); 234 235 /** 236 * Read/build an index 237 * 238 * If the input file is an index file, this function reads one part of the 239 * index and returns. If the input file is a sequence file (fasta or fastq), 240 * this function constructs the index for about mm_idxopt_t::batch_size bases. 241 * Importantly, for a huge collection of sequences, this function may only 242 * return an index for part of sequences. It needs to be repeatedly called 243 * to traverse the entire index/sequence file. 244 * 245 * @param r index reader 246 * @param n_threads number of threads for constructing index 247 * 248 * @return an index on success; NULL if reaching the end of the input file 249 */ 250 mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads); 251 252 /** 253 * Destroy/deallocate an index reader 254 * 255 * @param r index reader 256 */ 257 void mm_idx_reader_close(mm_idx_reader_t *r); 258 259 int mm_idx_reader_eof(const mm_idx_reader_t *r); 260 261 /** 262 * Check whether the file contains a minimap2 index 263 * 264 * @param fn file name 265 * 266 * @return the file size if fn is an index file; 0 if fn is not. 267 */ 268 int64_t mm_idx_is_idx(const char *fn); 269 270 /** 271 * Load a part of an index 272 * 273 * Given a uni-part index, this function loads the entire index into memory. 274 * Given a multi-part index, it loads one part only and places the file pointer 275 * at the end of that part. 276 * 277 * @param fp pointer to FILE object 278 * 279 * @return minimap2 index read from fp 280 */ 281 mm_idx_t *mm_idx_load(FILE *fp); 282 283 /** 284 * Append an index (or one part of a full index) to file 285 * 286 * @param fp pointer to FILE object 287 * @param mi minimap2 index 288 */ 289 void mm_idx_dump(FILE *fp, const mm_idx_t *mi); 290 291 /** 292 * Create an index from strings in memory 293 * 294 * @param w minimizer window size 295 * @param k minimizer k-mer size 296 * @param is_hpc use HPC k-mer if true 297 * @param bucket_bits number of bits for the first level of the hash table 298 * @param n number of sequences 299 * @param seq sequences in A/C/G/T 300 * @param name sequence names; could be NULL 301 * 302 * @return minimap2 index 303 */ 304 mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name); 305 306 /** 307 * Print index statistics to stderr 308 * 309 * @param mi minimap2 index 310 */ 311 void mm_idx_stat(const mm_idx_t *idx); 312 313 /** 314 * Destroy/deallocate an index 315 * 316 * @param r minimap2 index 317 */ 318 void mm_idx_destroy(mm_idx_t *mi); 319 320 /** 321 * Initialize a thread-local buffer for mapping 322 * 323 * Each mapping thread requires a buffer specific to the thread (see mm_map() 324 * below). The primary purpose of this buffer is to reduce frequent heap 325 * allocations across threads. A buffer shall not be used by two or more 326 * threads. 327 * 328 * @return pointer to a thread-local buffer 329 */ 330 mm_tbuf_t *mm_tbuf_init(void); 331 332 /** 333 * Destroy/deallocate a thread-local buffer for mapping 334 * 335 * @param b the buffer 336 */ 337 void mm_tbuf_destroy(mm_tbuf_t *b); 338 339 void *mm_tbuf_get_km(mm_tbuf_t *b); 340 341 /** 342 * Align a query sequence against an index 343 * 344 * This function possibly finds multiple alignments of the query sequence. 345 * The returned array and the mm_reg1_t::p field of each element are allocated 346 * with malloc(). 347 * 348 * @param mi minimap2 index 349 * @param l_seq length of the query sequence 350 * @param seq the query sequence 351 * @param n_regs number of hits (out) 352 * @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time! 353 * @param opt mapping parameters 354 * @param name query name, used for all-vs-all overlapping and debugging 355 * 356 * @return an array of hits which need to be deallocated with free() together 357 * with mm_reg1_t::p of each element. The size is written to _n_regs_. 358 */ 359 mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name); 360 361 void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname); 362 363 /** 364 * Align a fasta/fastq file and print alignments to stdout 365 * 366 * @param idx minimap2 index 367 * @param fn fasta/fastq file name 368 * @param opt mapping parameters 369 * @param n_threads number of threads 370 * 371 * @return 0 on success; -1 if _fn_ can't be read 372 */ 373 int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads); 374 375 int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads); 376 377 /** 378 * Generate the cs tag (new in 2.12) 379 * 380 * @param km memory blocks; set to NULL if unsure 381 * @param buf buffer to write the cs/MD tag; typicall NULL on the first call 382 * @param max_len max length of the buffer; typically set to 0 on the first call 383 * @param mi index 384 * @param r alignment 385 * @param seq query sequence 386 * @param no_iden true to use : instead of = 387 * 388 * @return the length of cs 389 */ 390 int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden); 391 int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq); 392 393 // query sequence name and sequence in the minimap2 index 394 int mm_idx_index_name(mm_idx_t *mi); 395 int mm_idx_name2id(const mm_idx_t *mi, const char *name); 396 int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq); 397 398 int mm_idx_alt_read(mm_idx_t *mi, const char *fn); 399 int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc); 400 int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s); 401 402 // deprecated APIs for backward compatibility 403 void mm_mapopt_init(mm_mapopt_t *opt); 404 mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads); 405 406 #ifdef __cplusplus 407 } 408 #endif 409 410 #endif // MINIMAP2_H 411