1 /* $Id: dynprog.h 221582 2020-01-30 21:28:29Z twu $ */
2 #ifndef DYNPROG_INCLUDED
3 #define DYNPROG_INCLUDED
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>		/* For HAVE_SSE2, HAVE_SSE4_1 */
6 #endif
7 
8 
9 /* #define DEBUG_SIMD 1 */
10 /* #define DEBUG_AVX2 1 */
11 
12 /* Used by dynprog_simd.c and dynprog.c when comparing using DEBUG_SIMD or DEBUG_AVX2 */
13 /* #define ZERO_INITIAL_GAP_PENALTY 1 */  /* Needed for bam_indelfix */
14 #define INFINITE_INITIAL_GAP_PENALTY 1   /* Prevents any gaps on row 0 or column 0 */
15 
16 /* Note: If we select neither ZERO_INITIAL_GAP_PENALTY or
17    INFINITE_INITIAL_GAP_PENALTY, we have standard behavior, which
18    allows open+extend on row 0 or column 0 */
19 
20 
21 
22 /* BEST_LOCAL is a local alignment, whereas QUERYEND_INDELS and
23    QUERYEND_NOGAPS are global.  QUERYEND_GAP allows an intron at the
24    end */
25 typedef enum {QUERYEND_GAP, QUERYEND_INDELS, QUERYEND_NOGAPS, BEST_LOCAL} Endalign_T;
26 typedef struct Dynprog_T *Dynprog_T; /* Needed before header files below */
27 
28 #include "bool.h"
29 #include "list.h"
30 #include "genomicpos.h"
31 #include "pairpool.h"
32 #include "chrnum.h"
33 #include "iit-read.h"
34 #include "splicetrie_build.h"	/* For splicetype */
35 #include "genome.h"
36 #include "mode.h"
37 
38 #ifdef GSNAP
39 #include "compress.h"
40 #endif
41 
42 
43 #define FULLMATCH 3
44 #define HALFMATCH 1
45 #define AMBIGUOUS 3		/* Previously set to 0, but then strings of N's cause problems */
46 
47 /* These values were set to -5, -4, -3, but this led to chopped ends
48    in GMAP alignments, and failure to find chimeras */
49 #define MISMATCH_HIGHQ -3
50 #define MISMATCH_MEDQ -2
51 #define MISMATCH_LOWQ -1
52 
53 
54 typedef enum {HIGHQ, MEDQ, LOWQ, ENDQ} Mismatchtype_T;
55 #define NMISMATCHTYPES 4
56 
57 /* Note: HIGHQ, MEDQ, and LOWQ indicate parameters for high, medium,
58    and low sequence quality, respectively */
59 #define DEFECT_HIGHQ 0.003
60 #define DEFECT_MEDQ 0.014
61 
62 #define SINGLE_OPEN_HIGHQ -8
63 #define SINGLE_OPEN_MEDQ -7
64 #define SINGLE_OPEN_LOWQ -6
65 
66 #define SINGLE_EXTEND_HIGHQ -3
67 #define SINGLE_EXTEND_MEDQ -2
68 #define SINGLE_EXTEND_LOWQ -1
69 
70 #define PAIRED_OPEN_HIGHQ -8
71 #define PAIRED_OPEN_MEDQ -7
72 #define PAIRED_OPEN_LOWQ -6
73 
74 #define PAIRED_EXTEND_HIGHQ -3
75 #define PAIRED_EXTEND_MEDQ -2
76 #define PAIRED_EXTEND_LOWQ -1
77 
78 
79 #define UNKNOWNJUMP -1000000
80 
81 typedef char Score8_T;
82 typedef char Direction8_T;
83 
84 typedef short Score16_T;
85 typedef short Direction16_T;
86 
87 typedef short Pairdistance_T;
88 
89 /* For standard dynamic programming.  Use ints, so NEG_INFINITY_32 works. */
90 typedef int Score32_T;
91 typedef int Direction32_T;
92 
93 /* Genome is on the horizontal axis.  Query sequence is on the vertical axis.  Dynamic programming fills in matrices column by column */
94 /* The following values are for directions_nogap.  For directions_Egap, the choices are DIAG or not DIAG (meaning HORIZ). */
95 /*  For directions_Fgap, the choices are DIAG or not DIAG (meaning VERT) */
96 #define VERT -2			/* or VERT == -3 in SIMD code.  Don't check for dir == VERT.  Check instead if dir == DIAG or dir == HORIZ */
97 #define HORIZ -1
98 #define DIAG 0			/* Pre-dominant case.  Directions_alloc clears to this value. */
99 
100 #define NEG_INFINITY_8 (-128)
101 #define POS_INFINITY_8 (127)
102 #define MAX_CHAR (127)
103 
104 #define NEG_INFINITY_16 (-32768)
105 #define POS_INFINITY_16 (32767)
106 #define MAX_SHORT (32767)
107 
108 /* We can allow -128 and -32768 for NEG_INFINITY in SIMD procedures,
109    because we are using saturation */
110 #ifdef HAVE_AVX2
111 #define ALIGN_SIZE 32
112 #define ONE_CHAR 1
113 #define MID_CHAR_INSERT 16
114 #define LAST_CHAR_INSERT 0
115 #define SIMD_NCHARS 32		/* 32 8-bit chars in 256 bits */
116 
117 #define ONE_SHORT 2
118 #define MID_SHORT_INSERT 8
119 #define LAST_SHORT_INSERT 0
120 #define SIMD_NSHORTS 16		/* 16 16-bit shorts in 256 bits */
121 
122 #define LAST_CHAR_NONAVX2 15	/* For DEBUG_AVX2 */
123 #define LAST_SHORT_NONAVX2 14	/* For DEBUG_AVX2.  (8 - 1) * 2 */
124 
125 #ifdef DEBUG_AVX2
126 #define SIMD_NCHARS_NONAVX2 16
127 #define SIMD_NSHORTS_NONAVX2 8
128 #endif
129 
130 #elif defined(HAVE_SSE4_1) || defined(HAVE_SSE2)
131 #define ALIGN_SIZE 16
132 #define ONE_CHAR 1
133 #define LAST_CHAR_SHIFT 15
134 #define SIMD_NCHARS 16		/* 16 8-bit chars in 128 bits */
135 
136 #define ONE_SHORT 2
137 #define LAST_SHORT_SHIFT 14	/* (8 - 1) * 2 */
138 #define SIMD_NSHORTS 8		/* 8 16-bit shorts in 128 bits */
139 
140 #define LAST_CHAR_NONAVX2 15	/* For DEBUG_SIMD */
141 #define LAST_SHORT_NONAVX2 14	/* For DEBUG_SIMD */
142 
143 #endif
144 
145 /* Can allow -32768 in non-SIMD procedures, because we are using ints */
146 #define NEG_INFINITY_32 (-32768)
147 #define NEG_INFINITY_INT (-32768)
148 
149 
150 /* This is still needed, because sequences passed into compute_scores
151    might be lower-case */
152 #define PREUC 1
153 
154 extern int use8p_size[NMISMATCHTYPES];
155 extern Pairdistance_T **pairdistance_array[NMISMATCHTYPES];
156 #ifndef HAVE_SSE4_1
157 extern Pairdistance_T **pairdistance_array_plus_128[NMISMATCHTYPES];
158 #endif
159 extern bool **consistent_array[3];
160 extern int *nt_to_int_array;
161 
162 
163 struct Space_single_T {
164   void **matrix_ptrs, *matrix_space;
165   void **directions_ptrs_0, *directions_space_0;
166   void **directions_ptrs_1, *directions_space_1;
167   void **directions_ptrs_2, *directions_space_2;
168 };
169 
170 struct Space_double_T {
171   void **upper_matrix_ptrs, *upper_matrix_space;
172   void **upper_directions_ptrs_0, *upper_directions_space_0;
173   void **upper_directions_ptrs_1, *upper_directions_space_1;
174   void **lower_matrix_ptrs, *lower_matrix_space;
175   void **lower_directions_ptrs_0, *lower_directions_space_0;
176   void **lower_directions_ptrs_1, *lower_directions_space_1;
177 };
178 
179 
180 #define T Dynprog_T
181 struct T {
182   int max_rlength;
183   int max_glength;
184 
185 #ifdef DEBUG12
186   struct Int3_T **matrix3_ptrs, *matrix3_space;
187 #endif
188 
189 #if !defined(HAVE_SSE2) || defined(DEBUG_SIMD)
190   Score32_T **matrix_ptrs, *matrix_space;
191   Direction32_T **directions_ptrs_0, *directions_space_0;
192   Direction32_T **directions_ptrs_1, *directions_space_1;
193   Direction32_T **directions_ptrs_2, *directions_space_2;
194 #endif
195 #ifdef HAVE_SSE2
196   union {
197     struct Space_single_T one;
198     struct Space_double_T two;
199   } aligned;
200 #ifdef DEBUG_AVX2
201   union {
202     struct Space_single_T one;
203     struct Space_double_T two;
204   } aligned_std;
205 #endif
206 #endif
207   int nspaces;
208 };
209 
210 
211 extern char *
212 Dynprog_endalign_string (Endalign_T endalign);
213 
214 extern int
215 Dynprog_score (int matches, int mismatches, int qopens, int qindels, int topens, int tindels,
216 	       double defect_rate);
217 
218 extern T
219 Dynprog_new (int maxlookback, int extraquerygap, int maxpeelback,
220 	     int extramaterial_end, int extramaterial_paired,
221 	     bool doublep);
222 extern void
223 Dynprog_free (T *old);
224 
225 extern bool
226 Dynprog_consistent_p (int c, int g, int g_alt, int genestrand);
227 
228 extern void
229 Dynprog_term (Mode_T mode);
230 extern void
231 Dynprog_init (Mode_T mode);
232 
233 extern void
234 Dynprog_compute_bands (int *lband, int *uband, int rlength, int glength, int extraband, bool widebandp);
235 
236 
237 extern void
238 Dynprog_Matrix32_print (Score32_T **matrix, int rlength, int glength, char *rsequence,
239 			char *gsequence, char *gsequencealt,
240 			int goffset, Univcoord_T chroffset, Univcoord_T chrhigh,
241 			bool watsonp, bool revp, int lband, int uband);
242 
243 extern void
244 Dynprog_Directions32_print (Direction32_T **directions_nogap, Direction32_T **directions_Egap, Direction32_T **directions_Fgap,
245 			    int rlength, int glength, char *rsequence, char *gsequence, char *gsequence_alt,
246 			    int goffset, Univcoord_T chroffset, Univcoord_T chrhigh,
247 			    bool watsonp, bool revp, int lband, int uband);
248 
249 extern Score32_T **
250 Dynprog_standard (Direction32_T ***directions_nogap, Direction32_T ***directions_Egap, Direction32_T ***directions_Fgap,
251 		  T this, char *rsequence, char *gsequence, char *gsequence_alt,
252 		  int rlength, int glength,
253 		  int goffset, Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
254 		  Mismatchtype_T mismatchtype, Score32_T open, Score32_T extend,
255 		  int lband, int uband, bool jump_late_p, bool revp, int saturation,
256 		  bool upperp, bool lowerp);
257 
258 extern List_T
259 Dynprog_traceback_std (List_T pairs, int *nmatches, int *nmismatches, int *nopens, int *nindels,
260 		       Direction32_T **directions_nogap, Direction32_T **directions_Egap, Direction32_T **directions_Fgap,
261 		       int r, int c, char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_alt,
262 		       int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
263 		       Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
264 		       int dynprogindex);
265 
266 
267 #undef T
268 #endif
269