1 /* $Id: dropfz2.c 1280 2014-08-21 00:47:55Z wrp $ */
2 /* $Revision: 1280 $ */
3
4 /* copyright (c) 1998, 1999, 2014 by William R. Pearson and The Rector &
5 Vistors of the University of Virginia */
6
7 /* Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under this License is distributed on an "AS
15 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16 express or implied. See the License for the specific language
17 governing permissions and limitations under the License.
18 */
19
20 /* 18-Sept-2006 - removed static global variables for alignment */
21
22 /* 2002/06/23 finally correctly implement fix to translate 'N' to 'X' */
23
24 /* 1999/11/29 modification by Z. Zhang to translate DNA 'N' as 'X' */
25
26 /* implements an improved version of the fasty algorithm, see:
27
28 W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of
29 DNA sequences with protein sequences" Genomics 46:24-36
30
31 */
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <math.h>
36 #include <ctype.h>
37
38 #include "defs.h"
39 #include "param.h"
40 #define XTERNAL
41 #include "upam.h"
42 #include "uascii.h"
43
44 #define NT_N 16
45
46 /* globals for fasta */
47 #define MAXWINDOW 64
48
49 #ifndef MAXSAV
50 #define MAXSAV 10
51 #endif
52
53 #ifndef ALLOCN0
54 static char *verstr="3.8 June 2014";
55 #else
56 static char *verstr="3.8an0 Jul 2014";
57 #endif
58
59 struct dstruct /* diagonal structure for saving current run */
60 {
61 int score; /* hash score of current match */
62 int start; /* start of current match */
63 int stop; /* end of current match */
64 struct savestr *dmax; /* location in vmax[] where best score data saved */
65 };
66
67 struct savestr
68 {
69 int score; /* pam score with segment optimization */
70 int score0; /* pam score of best single segment */
71 int gscore; /* score from global match */
72 int dp; /* diagonal of match */
73 int start; /* start of match in lib seq */
74 int stop; /* end of match in lib seq */
75 };
76
77 struct update_code_str {
78 int p_op_idx;
79 int p_op_cnt;
80 int show_code;
81 int cigar_order;
82 int show_ext;
83 char *op_map;
84 };
85
86 #ifndef TFAST
87 static char *ori_code = "-x/=\\+*"; /* FASTX */
88 static char *cigar_code = "DXFMRI*";
89 #else
90 static char *ori_code = "+x/=\\-*"; /* TFASTX */
91 static char *cigar_code = "IXFMRD*";
92 #endif
93
94 static struct update_code_str *
95 init_update_data(int show_code);
96
97 static void
98 sprintf_code(char *tmp_str, struct update_code_str *, int op_idx, int op_cnt);
99
100 static void
101 update_code(char *al_str, int al_str_max,
102 struct update_code_str *update_data, int op,
103 int sim_code, unsigned char sp0, unsigned char sp1);
104
105 static void
106 close_update_data(char *al_str, int al_str_max,
107 struct update_code_str *update_data);
108
109 void kpsort (struct savestr **v, int n);
110 extern void *init_stack(int, int);
111 extern void push_stack(void *, void *);
112 extern void *pop_stack(void *);
113 extern void *free_stack(void *);
114
115 #define SGW1 100
116 #define SGW2 300
117 struct smgl_str {
118 int C[SGW1+1][SGW2+1];
119 int st[SGW1+1][SGW2+1];
120 int D[SGW2+7], I[SGW2+1];
121 };
122
123 struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };
124
125 struct wgt { int iii, ii, iv;};
126 struct wgtc {char c2, c3, c4, c5;};
127
128 typedef struct st_s { int C, I, D;} *st_ptr;
129
130 struct f_struct {
131 struct dstruct *diag;
132 int frame;
133 int ndo;
134 int noff;
135 int hmask; /* hash constants */
136 int *pamh1; /* pam based array */
137 int *pamh2; /* pam based kfact array */
138 int *link, *harr; /* hash arrays */
139 int kshft; /* shift width */
140 int nsav, lowscor; /* number of saved runs, worst saved run */
141 #ifndef TFAST
142 unsigned char *aa0x, *aa0v; /* aa0x - 111122223333 */
143 #else
144 unsigned char *aa1x, *aa1v; /* aa1x - 111122223333 */
145 #endif /* aa1v - computed codons */
146 struct sx_s *cur;
147 int cur_sp_size;
148 struct wgt **weight0;
149 struct wgt **weight1;
150 struct wgtc **weight_c;
151 int *waa;
152 int *res;
153 int max_res;
154 st_ptr up, down, tp;
155 struct smgl_str smgl_s;
156 };
157
158 #define DROP_INTERN
159 #include "drop_func.h"
160
161 static int dmatchz(const unsigned char *aa0, int n0,
162 const unsigned char *aa1, int n1,
163 const unsigned char *aa1v,
164 int hoff, int window,
165 int **pam2, int gdelval, int ggapval, int gshift,
166 struct f_struct *f_str);
167
168 int shscore(unsigned char *aa0, int n0, int **pam2);
169 int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);
170 extern int ELK_to_s(double E_join, int n0, int n1, double Lambda, double K, double H);
171
172 int savemax (struct dstruct *, int,
173 struct savestr *vmax, struct savestr **lowmax);
174
175 int spam (const unsigned char *aa0, const unsigned char *aa1,
176 struct savestr *dmax, int **pam2,
177 struct f_struct *f_str);
178 int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);
179 int lx_band(const unsigned char *prot_seq, int len_prot,
180 const unsigned char *dna_prot_seq, int len_dna_prot,
181 int **pam_matrix, int gopen, int gext,
182 int gshift, int start_diag, int width, struct f_struct *f_str);
183
184 struct a_res_str *
185 merge_ares_chains(struct a_res_str *cur_ares,
186 struct a_res_str *tmpl_ares,
187 int score_ix, const char *msg);
188
189 extern void w_abort (char *p, char *p1);
190 extern void aagetmap(char *to, int n);
191
192 /* initialize for fasta */
193 /* modified 30-August-1999 by Zheng Zhang to work with an extended alphabet */
194 /* Assume naa=47, and wgts[47][23] matches both upper and lower case
195 amoino acids with another amino acid. And also assume the DNA letter
196 does not have upper/lower case difference. If you also allow DNA
197 sequence to be upper/lower case letters, more needs be changed. Not
198 only here, but also in the alignment code, the way that pack a codon
199 into a number between 0-63 need be changed. */
200
201 /* modified so that if **weightci==NULL, do not fiddle with characters */
202
203 /* modified 3-Aug-2010 for NCBIstdaa alphabet, which requires MAXUC
204 28, MAXLC 56, so we must have 58, not 47, entries */
205
206 void
init_weights(struct wgt *** weighti,struct wgtc *** weightci,int ** wgts,int gshift,int gsubs,int naa)207 init_weights(struct wgt ***weighti, struct wgtc ***weightci,
208 int **wgts, int gshift, int gsubs, int naa)
209 {
210 int i, j, do_wgtc=0;
211 int aa, b, a, x, y, z;
212 int *wwt, e;
213 struct wgt **weight;
214 struct wgtc **weightc;
215 char aacmap[64];
216 int temp[MAXLC+1][64]; /*change*/
217 char le[MAXLC+1][64];
218
219 if (naa > MAXLC) {
220 fprintf(stderr,"*** dropfz2.c compilation problem naa(%d) > MAXLX(%d) ***\n",
221 naa, MAXLC);
222 }
223
224 if ((*weighti=(struct wgt **)calloc((size_t)(naa+1),sizeof(struct wgt *)))
225 ==NULL) {
226 fprintf(stderr," cannot allocate weights array: %d\n",naa);
227 exit(1);
228 }
229
230 weight = *weighti;
231 /* allocate weight[aa 0..MAXLC] */
232 for (aa=0; aa <= naa; aa++) {
233 if ((weight[aa]=(struct wgt *)calloc((size_t)256,sizeof(struct wgt)))
234 ==NULL) {
235 fprintf(stderr," cannot allocate weight[]: %d/%d\n",aa,naa);
236 exit(1);
237 }
238 }
239
240 /* allocate weightci[aa 0..MAXLC] */
241 if (weightci !=NULL) {
242 if ((*weightci=(struct wgtc **)calloc((size_t)(naa+1),
243 sizeof(struct wgtc *)))==NULL) {
244 fprintf(stderr," cannot allocate weight_c array: %d\n",naa);
245 exit(1);
246 }
247 weightc = *weightci;
248
249 for (aa=0; aa <= naa; aa++) {
250 if ((weightc[aa]=(struct wgtc *)calloc((size_t)256,sizeof(struct wgtc)))
251 ==NULL) {
252 fprintf(stderr," cannot allocate weightc[]: %d/%d\n",aa,naa);
253 exit(1);
254 }
255 }
256 do_wgtc = 1;
257 }
258 else do_wgtc = 0;
259
260 aagetmap(aacmap,64);
261
262 for (aa = 0; aa < naa; aa++) { /* change*/
263 wwt = wgts[aa]; /* pam matrix */
264 for (i = 0; i < 64; i++) { /* i iterates through the codons */
265 x = -10000; /* large negative */
266 y = i;
267 for (j = 0; j < 64; j++) { /* j iterates through the codons */
268 z = ((~i & j) | (i & ~j));
269 b = 0; /* score = 0 */
270 if (z % 4) b-= gsubs;
271 if (z /16) b-= gsubs;
272 if ((z /4) % 4) b -= gsubs;
273 b += wwt[aascii[aacmap[j]]]; /* add the match score for char j*/
274 if (b > x) {
275 x = b; /* x has the score */
276 y = j; /* y has the character (codon index)*/
277 }
278 }
279 #ifdef DEBUG
280 if (y < 0 || y > 63) printf("%d %d %d %d ",aa, i, x, y);
281 #endif
282 temp[aa][i] = x;
283 le[aa][i] = y;
284 }
285 /* printf("\n"); */
286 }
287
288 for (aa= 0; aa < naa; aa++) {
289 wwt = temp[aa];
290 for (i = 0; i < 256; i++) {
291 for (x=-100,b = 0; b < 4; b++) {
292 z = (i/ (1 << ((b+1)*2)))*(1<<(b*2))+(i%(1<<(b*2)));
293 if (x < (e=wwt[z])) {
294 x = e;
295 if (do_wgtc) weightc[aa][i].c4 = aacmap[le[aa][z]];
296 }
297 }
298 weight[aa][i].iv=x-gshift;
299 weight[aa][i].iii = wwt[i%64];
300
301 if (do_wgtc) {
302 weightc[aa][i].c5 = aacmap[le[aa][i%64]];
303 weightc[aa][i].c3 = aacmap[i%64];
304 }
305 x = i %16;
306 for (y = -100, b = 0; b < 3; b++) {
307 z = ((x >> (b*2)) << (b*2+2)) + (x % (1 << (b*2)));
308 for (a = 0; a < 4; a++) {
309 if ((e =wwt[z+(a<<(b*2))]) > y) {
310 y = e;
311 if (do_wgtc)
312 weightc[aa][i].c2 = aacmap[le[aa][z+(a<<(b*2))]];
313 }
314 }
315 }
316 weight[aa][i].ii = y-gshift;
317 }
318 }
319 /*106=CGGG*/
320 for (aa = 0; aa < naa; aa++) {
321 weight[aa][106].iii = wgts[aa][23]; /* is 23 the code for 'X'?*/
322 weight[aa][106].iv = weight[aa][106].ii = weight[aa][106].iii-gshift;
323 if (do_wgtc) {
324 weightc[aa][106].c5 = weightc[aa][106].c4 = weightc[aa][106].c3
325 = weightc[aa][106].c2 = 'X';
326 }
327 }
328 }
329
330 void
free_weights(struct wgt *** weighti0,struct wgt *** weighti1,struct wgtc *** weightci,int naa)331 free_weights(struct wgt ***weighti0, struct wgt ***weighti1,
332 struct wgtc ***weightci, int naa)
333 {
334 int aa;
335 struct wgt **weight0;
336 struct wgt **weight1;
337 struct wgtc **weightc;
338
339 weight0 = *weighti0;
340 weight1 = *weighti1;
341 weightc = *weightci;
342
343 for (aa=0; aa < naa; aa++) {free(weight0[aa]);}
344 for (aa=0; aa < naa; aa++) {free(weight1[aa]);}
345 for (aa=0; aa < naa; aa++) {free(weightc[aa]);}
346
347 free(weight0);
348 free(weight1);
349 free(weightc);
350 }
351
352 static void
pre_com(const unsigned char * aa0,int n0,unsigned char * aa0v)353 pre_com(const unsigned char *aa0, int n0, unsigned char *aa0v) {
354 int dnav, i;
355 dnav = (hnt[aa0[0]]<<2) + hnt[aa0[1]];
356 for (i=2; i<n0; i++) {
357 dnav = ((dnav<<2)+hnt[aa0[i]])&255;
358 if (aa0[i] == NT_N || aa0[i-1]==NT_N || aa0[i-2] == NT_N) {
359 aa0v[i-2] = 106;
360 }
361 else {
362 if (dnav == 106/*CGGG*/) {dnav = 42/*AGGG*/;}
363 aa0v[i-2]=dnav;
364 }
365 }
366 }
367
368 static void
pre_com_r(const unsigned char * aa0,int n0,unsigned char * aa0v)369 pre_com_r(const unsigned char *aa0, int n0, unsigned char *aa0v) {
370 int dnav, i, ir;
371 dnav = ((3-hnt[aa0[n0-1]])<<2) + 3-hnt[aa0[n0-2]];
372 for (i=2, ir=n0-3; i<n0; i++,ir--) {
373 dnav = ((dnav<<2)+3-hnt[aa0[ir]])&255;
374 if (aa0[ir] == NT_N || aa0[ir+1]==NT_N || aa0[ir+2] == NT_N) {
375 aa0v[i-2] = 106;
376 }
377 else {
378 if (dnav == 106) dnav = 42;
379 aa0v[i-2]=dnav;
380 }
381 }
382 }
383
384 void
init_work(unsigned char * aa0,int n0,struct pstruct * ppst,struct f_struct ** f_arg)385 init_work (unsigned char *aa0, int n0,
386 struct pstruct *ppst,
387 struct f_struct **f_arg)
388 {
389 int mhv, phv;
390 int hmax;
391 int i0, hv;
392 int pamfact;
393 int btemp;
394 struct f_struct *f_str;
395 struct bdstr *bss;
396 /* these used to be globals, but do not need to be */
397 int ktup, fact, kt1, lkt;
398
399 int maxn0;
400 int *pwaa;
401 int i, j, q;
402 struct swstr *ss, *r_ss;
403 int *waa;
404 int *res;
405 int nsq, ip, *hsq, naat;
406 #ifndef TFAST
407 int last_n0, itemp, dnav;
408 unsigned char *fd, *fs, *aa0x, *aa0v;
409 int n0x, n0x3;
410 #endif
411
412 if (nt[NT_N] != 'N') {
413 fprintf(stderr," nt[NT_N] (%d) != 'X' (%c) - recompile\n",NT_N,nt[NT_N]);
414 exit(1);
415 }
416
417 if (ppst->ext_sq_set) {
418 nsq = ppst->nsqx; ip = 1;
419 hsq = ppst->hsqx;
420 }
421 else {
422 nsq = ppst->nsqx; ip = 0;
423 hsq = ppst->hsq;
424 }
425
426 f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct));
427
428 if (!ppst->param_u.fa.use_E_thresholds) {
429 btemp = 2 * ppst->param_u.fa.bestoff / 3 +
430 n0 / ppst->param_u.fa.bestscale +
431 ppst->param_u.fa.bkfact *
432 (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup);
433 btemp = min (btemp, ppst->param_u.fa.bestmax);
434 if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5;
435
436 ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3;
437 if (ppst->param_u.fa.optcut_set != 1) {
438 #ifndef TFAST
439 ppst->param_u.fa.optcut = (btemp*5)/4;
440 #else
441 ppst->param_u.fa.optcut = (btemp*4)/3;
442 #endif
443 }
444 }
445
446 #ifdef OLD_FASTA_GAP
447 ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;
448 #else
449 ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;
450 #endif
451 pamfact = ppst->param_u.fa.pamfact;
452 ktup = ppst->param_u.fa.ktup;
453 fact = ppst->param_u.fa.scfact * ktup;
454
455 #ifndef TFAST
456 /* before hashing, we must set up some space and translate the sequence */
457
458 maxn0 = n0 + 2;
459 if ((aa0x =(unsigned char *)calloc((size_t)maxn0,
460 sizeof(unsigned char)))
461 == NULL) {
462 fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0);
463 exit (1);
464 }
465 aa0x++;
466 f_str->aa0x = aa0x;
467
468
469 if ((aa0v =(unsigned char *)calloc((size_t)maxn0,
470 sizeof(unsigned char)))
471 == NULL) {
472 fprintf (stderr, "cannot allocate aa0v array %d\n", maxn0);
473 exit (1);
474 }
475 aa0v++;
476 f_str->aa0v = aa0v;
477
478 /* make a precomputed codon number series */
479 pre_com(aa0, n0, aa0v);
480
481 last_n0 = 0;
482 for (itemp=0; itemp<3; itemp++) {
483 n0x=saatran(aa0,&aa0x[last_n0],n0,itemp);
484 /* for (i=0; i<n0x; i++) {
485 fprintf(stderr,"%c",aa[aa0x[last_n0+i]]);
486 if ((i%60)==59) fprintf(stderr,"\n");
487 }
488 fprintf(stderr,"\n");
489 */
490 last_n0 += n0x+1;
491 }
492
493 /* fprintf(stderr,"\n"); */
494 n0x = n0;
495 n0x3 = n0x/3;
496
497 /* now switch aa0 and aa0x for hashing functions */
498 fs = aa0;
499 aa0 = aa0x;
500 aa0x = fs;
501 #endif
502
503 /* naat must always be MAXLC because library can have LC aa residues */
504 /*
505 if (ppst->ext_sq_set) naat = MAXLC;
506 else naat = MAXUC;
507 */
508 naat = MAXLC;
509
510 init_weights(&f_str->weight0, NULL,
511 ppst->pam2[ip],-ppst->gshift,-ppst->gsubs,naat);
512 init_weights(&f_str->weight1, &f_str->weight_c,
513 ppst->pam2[0],-ppst->gshift,-ppst->gsubs,naat);
514
515 if (pamfact == -1)
516 pamfact = 0;
517 else if (pamfact == -2)
518 pamfact = 1;
519
520 for (i0 = 1, mhv = -1; i0 < ppst->nsq; i0++)
521 if (hsq[i0] < NMAP && hsq[i0] > mhv)
522 mhv = ppst->hsq[i0];
523
524 if (mhv <= 0)
525 {
526 fprintf (stderr, " maximum hsq <=0 %d\n", mhv);
527 exit (1);
528 }
529
530 for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;
531
532 /* kshft = 2; */
533 kt1 = ktup - 1;
534 hv = 1;
535 for (i0 = 0; i0 < ktup; i0++)
536 hv = hv << f_str->kshft;
537 hmax = hv;
538 f_str->hmask = (hmax >> f_str->kshft) - 1;
539
540 if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) {
541 fprintf (stderr, " cannot allocate hash array\n");
542 exit (1);
543 }
544 if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) {
545 fprintf (stderr, " cannot allocate pamh1 array\n");
546 exit (1);
547 }
548 if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) {
549 fprintf (stderr, " cannot allocate pamh2 array\n");
550 exit (1);
551 }
552 if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) {
553 fprintf (stderr, " cannot allocate hash link array");
554 exit (1);
555 }
556
557 for (i0 = 0; i0 < hmax; i0++)
558 f_str->harr[i0] = -1;
559 for (i0 = 0; i0 < n0; i0++)
560 f_str->link[i0] = -1;
561
562 /* encode the aa0 array */
563 phv = hv = 0;
564 lkt = kt1;
565 for (i0 = 0; i0 < min(n0,lkt); i0++) {
566 if (hsq[aa0[i0]] >= NMAP) {
567 hv=phv=0; lkt = i0+ktup; continue;
568 }
569 hv = (hv << f_str->kshft) + ppst->hsq[aa0[i0]];
570 phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup;
571 }
572
573 for (; i0 < n0; i0++) {
574 if (hsq[aa0[i0]] >= NMAP) {
575 hv=phv=0;
576 /* restart hv, phv calculation */
577 for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) {
578 if (hsq[aa0[i0]] >= NMAP) {
579 hv=phv=0;
580 lkt = i0+ktup;
581 continue;
582 }
583 hv = (hv << f_str->kshft) + hsq[aa0[i0]];
584 phv += ppst->pam2[ip][aa0[i0]][aa0[i0]]*ktup;
585 }
586 }
587 if (i0 >= n0) break;
588 hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]];
589 f_str->link[i0] = f_str->harr[hv];
590 f_str->harr[hv] = i0;
591 if (pamfact) {
592 f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup);
593 if (hsq[aa0[i0-kt1]] < NMAP)
594 phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup;
595 }
596 else f_str->pamh2[hv] = fact * ktup;
597 }
598
599 /* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the
600 pam2[0][0] is now undefined for consistency with blast
601 */
602
603 if (pamfact)
604 for (i0 = 1; i0 < ppst->nsq; i0++)
605 f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup;
606 else
607 for (i0 = 1; i0 < ppst->nsq; i0++)
608 f_str->pamh1[i0] = fact;
609
610 f_str->ndo = 0; /* used to save time on diagonals with long queries */
611
612
613 #ifndef ALLOCN0
614 if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG,
615 sizeof (struct dstruct)))==NULL) {
616 fprintf (stderr," cannot allocate diagonal arrays: %lu\n",
617 MAXDIAG *sizeof (struct dstruct));
618 exit (1);
619 };
620 #else
621 if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0,
622 sizeof (struct dstruct)))==NULL) {
623 fprintf (stderr," cannot allocate diagonal arrays: %ld\n",
624 (long)n0*sizeof (struct dstruct));
625 exit (1);
626 };
627 #endif
628
629 #ifndef TFAST
630 /* done hashing, now switch aa0, aa0x back */
631 fs = aa0;
632 aa0 = aa0x;
633 aa0x = fs;
634 #else
635 if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+4,
636 sizeof(unsigned char)))
637 == NULL) {
638 fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+4);
639 exit (1);
640 }
641 f_str->aa1x++;
642
643 if ((f_str->aa1v =(unsigned char *)calloc((size_t)ppst->maxlen+4,
644 sizeof(unsigned char))) == NULL) {
645 fprintf (stderr, "cannot allocate aa1v array %d\n", ppst->maxlen+4);
646 exit (1);
647 }
648 f_str->aa1v++;
649
650 #endif
651
652 if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) {
653 fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0);
654 exit(1);
655 }
656
657 pwaa = waa;
658 for (i=0; i<nsq; i++) {
659 for (j=0;j<n0; j++) {
660 *pwaa = ppst->pam2[ip][i][aa0[j]];
661 pwaa++;
662 }
663 }
664 f_str->waa = waa;
665
666 #ifndef TFAST
667 maxn0 = max(2*n0,MIN_RES);
668 #else
669 maxn0 = max(4*n0,MIN_RES);
670 #endif
671 if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) {
672 fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0);
673 exit(1);
674 }
675 f_str->res = res;
676 f_str->max_res = maxn0;
677
678 *f_arg = f_str;
679 }
680
681 /* pstring1 is a message to the manager, currently 512 */
682 /* pstring2 is the same information, but in a markx==10 format */
683 void
get_param(const struct pstruct * ppst,char ** pstring1,char * pstring2,struct score_count_s * s_info)684 get_param (const struct pstruct *ppst,
685 char **pstring1, char *pstring2,
686 struct score_count_s *s_info)
687 {
688 char options_str1[128];
689 char options_str2[128];
690 #ifndef TFAST
691 char *pg_str="FASTY";
692 #else
693 char *pg_str="TFASTY";
694 #endif
695
696 if (!ppst->param_u.fa.use_E_thresholds) {
697 sprintf(options_str1,"join: %d (%.3g), opt: %d (%.3g)",
698 ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
699 ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
700 sprintf(options_str2,"pg_join: %d (%.3g)\n; pg_optcut: %d (%.3g)",
701 ppst->param_u.fa.cgap, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
702 ppst->param_u.fa.optcut, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
703 }
704 else {
705 sprintf(options_str1,"E-join: %.2g (%.3g), E-opt: %.2g (%.3g)",
706 ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
707 ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
708 sprintf(options_str2,"pg_join_E(): %.2g (%.3g)\n; pg_optcut_E(): %.2g (%.3g)",
709 ppst->param_u.fa.E_join, (double)s_info->s_cnt[0]/(double)s_info->tot_scores,
710 ppst->param_u.fa.E_band_opt, (double)s_info->s_cnt[2]/(double)s_info->tot_scores);
711 }
712
713 if (!ppst->param_u.fa.optflag)
714 sprintf (pstring1[0], "%s (%s)",pg_str, verstr);
715 else
716 sprintf (pstring1[0], "%s (%s) [optimized]",pg_str, verstr);
717
718 sprintf (pstring1[1],
719 #ifdef OLD_FASTA_GAP
720 "%s matrix (%d:%d)%s, gap-pen: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
721 #else
722 "%s matrix (%d:%d)%s, open/ext: %3d/%3d, shift: %3d, subs: %3d\n ktup: %d, %s, width: %3d",
723 #endif
724 ppst->pam_name, ppst->pam_h,ppst->pam_l,
725 (ppst->ext_sq_set) ? "xS":"\0",
726 ppst->gdelval, ppst->ggapval,
727 ppst->gshift,ppst->gsubs,
728 ppst->param_u.fa.ktup, options_str1, ppst->param_u.fa.optwid);
729
730 if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1");
731
732 if (pstring2 != NULL) {
733 #ifdef OLD_FASTA_GAP
734 sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
735 ; pg_gap-pen: %d %d\n; pg_ktup: %d\n; %s\n",
736 #else
737 sprintf (pstring2, "; pg_name_alg: %s\n; pg_ver_rel: %s\n; pg_matrix: %s (%d:%d)%s\n\
738 ; pg_open-ext: %d %d\n; pg_ktup: %d\n; %s\n",
739 #endif
740 pg_str,verstr,ppst->pam_name, ppst->pam_h,ppst->pam_l,
741 (ppst->ext_sq_set) ? "xS":"\0", ppst->gdelval,
742 ppst->ggapval,ppst->param_u.fa.ktup,options_str2);
743 }
744 }
745
746 void
close_work(const unsigned char * aa0,int n0,struct pstruct * ppst,struct f_struct ** f_arg)747 close_work (const unsigned char *aa0, int n0,
748 struct pstruct *ppst,
749 struct f_struct **f_arg)
750 {
751 struct f_struct *f_str;
752 int naat;
753
754 f_str = *f_arg;
755
756 if (f_str != NULL) {
757 if (ppst->ext_sq_set) naat = MAXLC;
758 else naat = MAXUC;
759 free_weights(&f_str->weight0,&f_str->weight1,&f_str->weight_c,naat);
760 free(f_str->cur);
761 #ifndef TFAST
762 f_str->aa0v--;
763 free(f_str->aa0v);
764 f_str->aa0x--;
765 free(f_str->aa0x);
766 #else /* TFAST */
767 f_str->aa1x--;
768 free(f_str->aa1x);
769 f_str->aa1v--;
770 free(f_str->aa1v);
771 #endif
772 free(f_str->res);
773 free(f_str->waa);
774 free(f_str->diag);
775 free(f_str->link);
776 free(f_str->pamh2);
777 free(f_str->pamh1);
778 free(f_str->harr);
779 free(f_str);
780 *f_arg = NULL;
781 }
782 }
783
do_fastz(const unsigned char * aa0,int n0,const unsigned char * aa1,int n1,const unsigned char * aa1v,const struct pstruct * ppst,struct f_struct * f_str,struct rstruct * rst,int * hoff,int shuff_flg,struct score_count_s * s_info)784 void do_fastz (const unsigned char *aa0, int n0,
785 const unsigned char *aa1, int n1,
786 const unsigned char *aa1v,
787 const struct pstruct *ppst, struct f_struct *f_str,
788 struct rstruct *rst, int *hoff, int shuff_flg,
789 struct score_count_s *s_info)
790 {
791 int nd; /* diagonal array size */
792 int lhval;
793 int kfact;
794 int i;
795 register struct dstruct *dptr;
796 struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */
797 struct savestr *vptr[MAXSAV];
798 struct savestr *lowmax;
799 int lowscor;
800 register int tscor;
801 int xdebug = 0;
802
803 #ifndef ALLOCN0
804 register struct dstruct *diagp;
805 #else
806 register int dpos;
807 int lposn0;
808 #endif
809 struct dstruct *dpmax;
810 register int lpos;
811 int tpos;
812 struct savestr *vmptr;
813 int scor, tmp;
814 int im, ib, nsave;
815 int ktup, kt1, ip, lkt, ktup_sq;
816 const int *hsq;
817 int c_gap, opt_cut;
818 #ifndef TFAST
819 int n0x31, n0x32;
820 n0x31 = (n0-2)/3;
821 n0x32 = n0x31+1+(n0-n0x31-1)/2;
822 #else
823 unsigned char *fs, *fd;
824 int n1x31, n1x32, last_n1, itemp;
825 n1x31 = (n1-2)/3;
826 n1x32 = n1x31+1+(n1-n1x31-1)/2;
827 #endif
828
829 if (ppst->ext_sq_set) {
830 ip = 1;
831 hsq = ppst->hsqx;
832 }
833 else {
834 ip = 0;
835 hsq = ppst->hsq;
836 }
837
838 ktup = ppst->param_u.fa.ktup;
839 ktup_sq = ktup*ktup;
840 if (ktup == 1) ktup_sq *= 2;
841
842 kt1 = ktup-1;
843
844 if (n1 < ktup) {
845 rst->score[0] = rst->score[1] = rst->score[2] = 0;
846 return;
847 }
848
849 if (n0+n1+1 >= MAXDIAG) {
850 fprintf(stderr,"n0,n1 too large: %d, %d\n",n0,n1);
851 rst->score[0] = rst->score[1] = rst->score[2] = -1;
852 return;
853 }
854
855 if (ppst->param_u.fa.use_E_thresholds) {
856 c_gap = ELK_to_s(ppst->param_u.fa.E_join*ktup_sq*2.5, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
857 opt_cut = ELK_to_s(ppst->param_u.fa.E_band_opt*ktup_sq*2.0, n0, n1, ppst->pLambda, ppst->pK, ppst->pH);
858 rst->valid_stat = 0;
859 }
860 else {
861 c_gap = ppst->param_u.fa.cgap;
862 opt_cut = ppst->param_u.fa.optcut;
863 rst->valid_stat = 1;
864 }
865
866 f_str->noff = n0 - 1;
867
868 #ifdef ALLOCN0
869 nd = n0;
870 #endif
871
872 #ifndef ALLOCN0
873 nd = n0 + n1;
874 #endif
875
876 dpmax = &f_str->diag[nd];
877 for (dptr = &f_str->diag[f_str->ndo]; dptr < dpmax;)
878 {
879 dptr->stop = -1;
880 dptr->dmax = NULL;
881 dptr++->score = 0;
882 }
883
884 for (vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++)
885 vmptr->score = 0;
886 lowmax = vmax;
887 lowscor = 0;
888
889 if (n1 > 1000 && aa1[0]==23 && aa1[100]==23 &&
890 aa1[1400]==23 && aa1[1401]!=23) {
891 xdebug = 1;
892 }
893 else xdebug = 0;
894
895 /* start hashing */
896 lhval = 0;
897 lkt = kt1;
898 for (lpos = 0; (lpos < lkt || hsq[aa1[lpos]]>=NMAP) && lpos<n1; lpos++) {
899 /* restart lhval calculation */
900 if (hsq[aa1[lpos]]>=NMAP) {
901 lhval = 0; lkt=lpos+ktup;
902 continue;
903 }
904 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
905 }
906
907 #ifndef ALLOCN0
908 diagp = &f_str->diag[f_str->noff + lkt];
909 for (; lpos < n1; lpos++, diagp++) {
910 if (hsq[aa1[lpos]]>=NMAP) {
911 lpos++ ; diagp++;
912 while (lpos < n1 && hsq[aa1[lpos]]>=NMAP) {lpos++; diagp++;}
913 if (lpos >= n1) break;
914 lhval = 0;
915 }
916 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
917 for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
918 if ((tscor = (dptr = &diagp[-tpos])->stop) >= 0) {
919 #else
920 lposn0 = f_str->noff + lpos;
921 for (; lpos < n1; lpos++, lposn0++) {
922 if (hsq[aa1[lpos]]>=NMAP) {lhval = 0; goto loopl;}
923 lhval = ((lhval & f_str->hmask) << f_str->kshft) + ppst->hsq[aa1[lpos]];
924 for (tpos = f_str->harr[lhval]; tpos >= 0; tpos = f_str->link[tpos]) {
925 dpos = lposn0 - tpos;
926 if ((tscor = (dptr = &f_str->diag[dpos % nd])->stop) >= 0) {
927 #endif
928 tscor += ktup;
929 if ((tscor -= lpos) <= 0) {
930 scor = dptr->score;
931 if ((tscor += (kfact = f_str->pamh2[lhval])) < 0 && lowscor < scor) {
932 #ifdef ALLOCN0
933 lowscor = savemax (dptr, dpos, vmax, &lowmax);
934 #else
935 lowscor = savemax (dptr, dptr- f_str->diag, vmax, &lowmax);
936 #endif
937 }
938 if ((tscor += scor) >= kfact) {
939 dptr->score = tscor;
940 dptr->stop = lpos;
941 }
942 else {
943 dptr->score = kfact;
944 dptr->start = (dptr->stop = lpos) - kt1;
945 }
946 }
947 else {
948 dptr->score += f_str->pamh1[aa0[tpos]];
949 dptr->stop = lpos;
950 }
951 }
952 else {
953 dptr->score = f_str->pamh2[lhval];
954 dptr->start = (dptr->stop = lpos) - kt1;
955 }
956 } /* end tpos */
957
958 #ifdef ALLOCN0
959 /* reinitialize diag structure */
960 loopl:
961 if ((dptr = &f_str->diag[lpos % nd])->score > lowscor)
962 lowscor = savemax (dptr, lpos, vmax, &lowmax);
963 dptr->stop = -1;
964 dptr->dmax = NULL;
965 dptr->score = 0;
966 #endif
967 } /* end lpos */
968
969 #ifdef ALLOCN0
970 for (tpos = 0, dpos = f_str->noff + n1 - 1; tpos < n0; tpos++, dpos--) {
971 if ((dptr = &f_str->diag[dpos % nd])->score > lowscor)
972 lowscor = savemax (dptr, dpos, vmax, &lowmax, f_str);
973 }
974 #else
975 for (dptr = f_str->diag; dptr < dpmax;) {
976 if (dptr->score > lowscor) savemax (dptr, dptr - f_str->diag, vmax, &lowmax);
977 dptr->stop = -1;
978 dptr->dmax = NULL;
979 dptr++->score = 0;
980 }
981 f_str->ndo = nd;
982 #endif
983
984 for (nsave = 0, vmptr = vmax; vmptr < &vmax[MAXSAV]; vmptr++) {
985 if (vmptr->score > 0) {
986 vmptr->score = spam (aa0, aa1, vmptr, ppst->pam2[ip], f_str);
987 vptr[nsave++] = vmptr;
988 }
989 }
990
991 if (nsave <= 0) {
992 rst->score[0] = rst->score[1] = rst->score[2] = 0;
993 return;
994 }
995
996 #ifndef TFAST
997 /* FASTX code here to modify the start, stop points for
998 the three phases of the translated protein sequence
999 */
1000
1001 for (ib=0; ib<nsave; ib++) {
1002 if (f_str->noff-vptr[ib]->dp+vptr[ib]->start >= n0x32)
1003 vptr[ib]->dp += n0x32;
1004 if (f_str->noff-vptr[ib]->dp +vptr[ib]->start >= n0x31)
1005 vptr[ib]->dp += n0x31;
1006 }
1007 #else
1008 /* TFAST code here to modify the start, stop points for
1009 the three phases of the translated protein sequence
1010 TFAST modifies library start points, rather than
1011 query start points
1012 */
1013
1014 for (ib=0; ib<nsave; ib++) {
1015 if (vptr[ib]->start >= n1x32) {
1016 vptr[ib]->start -= n1x32;
1017 vptr[ib]->stop -= n1x32;
1018 vptr[ib]->dp -= n1x32;
1019 }
1020 if (vptr[ib]->start >= n1x31) {
1021 vptr[ib]->start -= n1x31;
1022 vptr[ib]->stop -= n1x31;
1023 vptr[ib]->dp -= n1x31;
1024 }
1025 }
1026 #endif /* TFAST */
1027
1028 scor = sconn (vptr, nsave, c_gap,
1029 ppst->param_u.fa.pgap, f_str);
1030
1031 for (vmptr=vptr[0],ib=1; ib<nsave; ib++)
1032 if (vptr[ib]->score > vmptr->score) vmptr=vptr[ib];
1033
1034 /* kssort (vptr, nsave); */
1035
1036 rst->score[1] = vmptr->score;
1037 rst->score[0] = max (scor, vmptr->score);
1038 rst->score[2] = rst->score[0]; /* initn */
1039
1040 s_info->tot_scores++;
1041 if (rst->score[0] > c_gap) { s_info->s_cnt[0]++;}
1042 #ifndef TFAST
1043 *hoff=f_str->noff - vmptr->dp;
1044 #else /* TFAST */
1045 *hoff=vmptr->dp-f_str->noff;
1046 #endif /* TFAST */
1047 if (ppst->param_u.fa.optflag) {
1048 if (/* shuff_flg || */ rst->score[0] > opt_cut) {
1049 s_info->s_cnt[2]++;
1050 rst->valid_stat = 1;
1051 rst->score[2] = dmatchz(aa0, n0,aa1,n1, aa1v,
1052 *hoff,ppst->param_u.fa.optwid,
1053 ppst->pam2[ip],
1054 ppst->gdelval,ppst->ggapval,ppst->gshift,
1055 f_str);
1056 }
1057 }
1058 }
1059
1060 void do_work (const unsigned char *aa0, int n0,
1061 const unsigned char *aa1, int n1,
1062 int frame,
1063 const struct pstruct *ppst,
1064 struct f_struct *f_str,
1065 int qr_flg, int shuff_flg, struct rstruct *rst,
1066 struct score_count_s *s_info)
1067 {
1068 int hoff;
1069 int last_n1, itx, dnav, n10, i, ir;
1070 unsigned char *aa1x;
1071
1072 rst->escore = 1.0;
1073 rst->segnum = rst->seglen = 1;
1074 rst->valid_stat = 0;
1075
1076 if (n1 < ppst->param_u.fa.ktup) {
1077 rst->score[0] = rst->score[1] = rst->score[2] = 0;
1078 return;
1079 }
1080
1081 #ifndef TFAST
1082 do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
1083 #else
1084 /* make a precomputed codon number series */
1085
1086 if (frame == 0) {
1087 pre_com(aa1, n1, f_str->aa1v);
1088 }
1089 else {
1090 pre_com_r(aa1, n1, f_str->aa1v);
1091 }
1092
1093 /* make translated sequence */
1094 last_n1 = 0;
1095 aa1x = f_str->aa1x;
1096 #ifdef DEBUG
1097 if (frame > 1) {
1098 fprintf(stderr, "*** fz_walign - frame: %d - out of range [0,1]\n",frame);
1099 }
1100 #endif
1101
1102 for (itx= frame*3; itx< frame*3+3; itx++) {
1103 n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
1104 last_n1 += n10+1;
1105 }
1106 n10 = last_n1-1;
1107
1108 do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, rst, &hoff, shuff_flg, s_info);
1109 #endif
1110
1111 rst->comp = rst->H = -1.0;
1112 }
1113
1114 void do_opt (const unsigned char *aa0, int n0,
1115 const unsigned char *aa1, int n1,
1116 int frame,
1117 struct pstruct *ppst,
1118 struct f_struct *f_str,
1119 struct rstruct *rst)
1120 {
1121 int optflag, tscore, hoff;
1122 int last_n1, itx, n10, i, ir;
1123 unsigned char *aa1x;
1124 struct score_count_s s_info = {0, 0, 0, 0};
1125
1126 optflag = ppst->param_u.fa.optflag;
1127 ppst->param_u.fa.optflag = 1;
1128
1129 #ifndef TFAST
1130 do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, rst, &hoff, 0, &s_info);
1131 #else
1132 /* make a precomputed codon number series */
1133
1134 if (frame == 0) {
1135 pre_com(aa1, n1, f_str->aa1v);
1136 }
1137 else {
1138 pre_com_r(aa1, n1, f_str->aa1v);
1139 }
1140
1141 /* make translated sequence */
1142 last_n1 = 0;
1143 aa1x = f_str->aa1x;
1144 for (itx= frame*3; itx< frame*3+3; itx++) {
1145 n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
1146 last_n1 += n10+1;
1147 }
1148 n10 = last_n1-1;
1149
1150 do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, rst, &hoff, 0, &s_info );
1151 #endif
1152
1153 ppst->param_u.fa.optflag = optflag;
1154 }
1155
1156 int
1157 savemax (struct dstruct *dptr, int dpos,
1158 struct savestr *vmax, struct savestr **lowmax)
1159 {
1160 struct savestr *vmptr;
1161 int i;
1162
1163 /* check to see if this is the continuation of a run that is already saved */
1164
1165 if ((vmptr = dptr->dmax) != NULL && vmptr->dp == dpos &&
1166 vmptr->start == dptr->start) {
1167 vmptr->stop = dptr->stop;
1168 if ((i = dptr->score) <= vmptr->score) return (*lowmax)->score;
1169 vmptr->score = i;
1170 if (vmptr != *lowmax) return (*lowmax)->score;
1171 }
1172 else {
1173 i = (*lowmax)->score = dptr->score;
1174 (*lowmax)->dp = dpos;
1175 (*lowmax)->start = dptr->start;
1176 (*lowmax)->stop = dptr->stop;
1177 dptr->dmax = *lowmax;
1178 }
1179
1180 for (vmptr = vmax; vmptr < vmax+MAXSAV; vmptr++) {
1181 if (vmptr->score < i) {
1182 i = vmptr->score;
1183 *lowmax = vmptr;
1184 }
1185 }
1186 return i;
1187 }
1188
1189 int spam (const unsigned char *aa0,
1190 const unsigned char *aa1,
1191 struct savestr *dmax, int **pam2,
1192 struct f_struct *f_str)
1193 {
1194 int lpos;
1195 int tot, mtot;
1196 struct {
1197 int start, stop, score;
1198 } curv, maxv;
1199 const unsigned char *aa0p, *aa1p;
1200
1201 aa1p = &aa1[lpos = dmax->start];
1202 aa0p = &aa0[lpos - dmax->dp + f_str->noff];
1203 curv.start = lpos;
1204
1205 tot = curv.score = maxv.score = 0;
1206 for (; lpos <= dmax->stop; lpos++) {
1207 tot += pam2[*aa0p++][*aa1p++];
1208 if (tot > curv.score) {
1209 curv.stop = lpos;
1210 curv.score = tot;
1211 }
1212 else if (tot < 0) {
1213 if (curv.score > maxv.score) {
1214 maxv.start = curv.start;
1215 maxv.stop = curv.stop;
1216 maxv.score = curv.score;
1217 }
1218 tot = curv.score = 0;
1219 curv.start = lpos+1;
1220 }
1221 }
1222
1223 if (curv.score > maxv.score) {
1224 maxv.start = curv.start;
1225 maxv.stop = curv.stop;
1226 maxv.score = curv.score;
1227 }
1228
1229 /* if (maxv.start != dmax->start || maxv.stop != dmax->stop)
1230 printf(" new region: %3d %3d %3d %3d\n",maxv.start,
1231 dmax->start,maxv.stop,dmax->stop);
1232 */
1233 dmax->start = maxv.start;
1234 dmax->stop = maxv.stop;
1235
1236 return maxv.score;
1237 }
1238
1239 #define XFACT 10
1240
1241 int sconn (struct savestr **v, int n,
1242 int cgap, int pgap, struct f_struct *f_str)
1243 {
1244 int i, si;
1245 struct slink {
1246 int score;
1247 struct savestr *vp;
1248 struct slink *next;
1249 } *start, *sl, *sj, *so, sarr[MAXSAV];
1250 int lstart, tstart, plstop, ptstop;
1251
1252 /* sort the score left to right in lib pos */
1253
1254 kpsort (v, n);
1255
1256 start = NULL;
1257
1258 /* for the remaining runs, see if they fit */
1259
1260 for (i = 0, si = 0; i < n; i++)
1261 {
1262
1263 /* if the score is less than the gap penalty, it never helps */
1264 if (v[i]->score < cgap)
1265 continue;
1266 lstart = v[i]->start;
1267 tstart = lstart - v[i]->dp + f_str->noff;
1268
1269 /* put the run in the group */
1270 sarr[si].vp = v[i];
1271 sarr[si].score = v[i]->score;
1272 sarr[si].next = NULL;
1273
1274 /* if it fits, then increase the score */
1275 for (sl = start; sl != NULL; sl = sl->next)
1276 {
1277 plstop = sl->vp->stop;
1278 ptstop = plstop - sl->vp->dp + f_str->noff;
1279 if (plstop < lstart+XFACT && ptstop < tstart+XFACT) {
1280 sarr[si].score = sl->score + v[i]->score + pgap;
1281 break;
1282 }
1283 }
1284
1285 /* now recalculate where the score fits */
1286 if (start == NULL)
1287 start = &sarr[si];
1288 else
1289 for (sj = start, so = NULL; sj != NULL; sj = sj->next)
1290 {
1291 if (sarr[si].score > sj->score)
1292 {
1293 sarr[si].next = sj;
1294 if (so != NULL)
1295 so->next = &sarr[si];
1296 else
1297 start = &sarr[si];
1298 break;
1299 }
1300 so = sj;
1301 }
1302 si++;
1303 }
1304
1305 if (start != NULL)
1306 return (start->score);
1307 else
1308 return (0);
1309 }
1310
1311 void
1312 kssort (v, n)
1313 struct savestr *v[];
1314 int n;
1315 {
1316 int gap, i, j;
1317 struct savestr *tmp;
1318
1319 for (gap = n / 2; gap > 0; gap /= 2)
1320 for (i = gap; i < n; i++)
1321 for (j = i - gap; j >= 0; j -= gap)
1322 {
1323 if (v[j]->score >= v[j + gap]->score)
1324 break;
1325 tmp = v[j];
1326 v[j] = v[j + gap];
1327 v[j + gap] = tmp;
1328 }
1329 }
1330
1331 void
1332 kpsort (struct savestr **v, int n) {
1333 int gap, i, j, k;
1334 int incs[4] = { 21, 7, 3, 1 };
1335 struct savestr *tmp;
1336 int v_start;
1337
1338 for ( k = 0; k < 4; k++) {
1339 gap = incs[k];
1340 for (i = gap; i < n; i++) {
1341 tmp = v[i];
1342 j = i;
1343 v_start = v[i]->start;
1344 while (j >= gap && v[j - gap]->start > v_start) {
1345 v[j] = v[j - gap];
1346 j -= gap;
1347 }
1348 v[j] = tmp;
1349 }
1350 }
1351 }
1352
1353 static int
1354 dmatchz(const unsigned char *aa0, int n0,
1355 const unsigned char *aa1, int n1,
1356 const unsigned char *aa1v,
1357 int hoff, int window,
1358 int **pam2, int gdelval, int ggapval, int gshift,
1359 struct f_struct *f_str)
1360 {
1361
1362 hoff -= window/2;
1363
1364 #ifndef TFAST
1365 return lx_band(aa1,n1,f_str->aa0v,n0-2,
1366 pam2,
1367 #ifdef OLD_FASTA_GAP
1368 -(gdelval - ggapval),
1369 #else
1370 -gdelval,
1371 #endif
1372 -ggapval,-gshift,
1373 hoff,window,f_str);
1374 #else
1375 return lx_band(aa0,n0,aa1v,n1-2,
1376 pam2,
1377 #ifdef OLD_FASTA_GAP
1378 -(gdelval - ggapval),
1379 #else
1380 -gdelval,
1381 #endif
1382 -ggapval,-gshift,
1383 hoff,window,f_str);
1384 #endif
1385 }
1386
1387 static void
1388 init_row(struct sx_s *row, int sp) {
1389 int i;
1390 for (i = 0; i < sp; i++) {
1391 row[i].C1 = row[i].I1 = 0;
1392 row[i].C2 = row[i].I2 = 0;
1393 row[i].C3 = row[i].I3 = 0;
1394 row[i].flag = 0;
1395 }
1396 }
1397
1398 int lx_band(const unsigned char *prot_seq, /* array with protein sequence numbers*/
1399 int len_prot, /* length of prot. seq */
1400 const unsigned char *dna_prot_seq, /* translated DNA sequence numbers*/
1401 int len_dna_prot, /* length trans. seq. */
1402 int **pam_matrix, /* scoring matrix */
1403 int gopen, int gext, /* gap open, gap extend penalties */
1404 int gshift, /* frame-shift penalty */
1405 int start_diag, /* start diagonal of band */
1406 int width, /* width for band alignment */
1407 struct f_struct *f_str)
1408 {
1409 void *ckalloc();
1410 int i, j, bd, bd1, x1, x2, sp, p1=0, p2=0, end_prot;
1411 struct sx_s *last, *tmp;
1412 int sc, del, best = 0, cd,ci, e1, e2, e3, cd1, cd2, cd3, f, gg;
1413 const unsigned char *dp;
1414 register struct sx_s *ap, *aq;
1415 struct wgt *wt, *ww;
1416 int aa, b, a,x,y,z;
1417
1418 sp = width+7;
1419 gg = gopen+gext;
1420 /* sp = sp/3+1; */
1421
1422 if (f_str->cur == NULL ) {
1423 f_str->cur_sp_size = sp;
1424 f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
1425 }
1426 else if (f_str->cur_sp_size != sp) {
1427 free(f_str->cur);
1428 f_str->cur = (struct sx_s *) ckalloc(sizeof(struct sx_s)*sp);
1429 f_str->cur_sp_size = sp;
1430 }
1431
1432 init_row(f_str->cur, sp);
1433
1434 /*
1435 if (start_diag %3 !=0) start_diag = start_diag/3-1;
1436 else start_diag = start_diag/3;
1437 if (width % 3 != 0) width = width/3+1;
1438 else width = width /3;
1439 */
1440
1441 x1 = start_diag; /* x1 = lower bound of DNA */
1442 x2 = 1; /* the amount of position shift from last row*/
1443
1444 end_prot = max(0,-width-start_diag) + (len_dna_prot+5)/3 + width;
1445 end_prot = min(end_prot,len_prot);
1446
1447 /* i counts through protein sequence, x1 through DNAp */
1448
1449 for (i = max(0, -width-start_diag), x1+=i; i < len_prot; i++, x1++) {
1450 bd = min(x1+width, (len_dna_prot+2)/3); /* upper bound of band */
1451 bd1 = max(0,x1); /* lower bound of band */
1452 wt = f_str->weight0[prot_seq[i]];
1453 del = 1-x1; /*adjustment*/
1454 bd += del;
1455 bd1 +=del;
1456
1457 ap = &f_str->cur[bd1]; aq = ap+1;
1458 e1 = f_str->cur[bd1-1].C3; e2 = ap->C1; cd1 = cd2= cd3= 0;
1459 for (dp = &dna_prot_seq[(bd1-del)*3]; ap < &f_str->cur[bd]; ap++) {
1460 ww = &wt[(unsigned char) *dp++];
1461 sc = max(max(e1+ww->iv, (e3=ap->C2)+ww->ii), e2+ww->iii);
1462 if (cd1 > sc) sc = cd1;
1463 cd1 -= gext;
1464 if ((ci = aq->I1) > 0) {
1465 if (sc < ci) { ap->C1 = ci; ap->I1 = ci-gext;}
1466 else {
1467 ap->C1 = sc;
1468 sc -= gg;
1469 if (sc > 0) {
1470 if (sc > best) best =sc;
1471 if (cd1 < sc) cd1 = sc;
1472 ap->I1 = max(ci-gext, sc);
1473 } else ap->I1 = ci-gext;
1474 }
1475 } else {
1476 if (sc <= 0) {
1477 ap->I1 = ap->C1 = 0;
1478 } else {
1479 ap->C1 = sc; sc-=gg;
1480 if (sc >0) {
1481 if (sc > best) best =sc;
1482 if (cd1 < sc) cd1 = sc;
1483 ap->I1 = sc;
1484 } else ap->I1 = 0;
1485 }
1486 }
1487 ww = &wt[(unsigned char) *dp++];
1488 sc = max(max(e2+ww->iv, (e1=ap->C3)+ww->ii), e3+ww->iii);
1489 if (cd2 > sc) sc = cd2;
1490 cd2 -= gext;
1491 if ((ci = aq->I2) > 0) {
1492 if (sc < ci) { ap->C2 = ci; ap->I2 = ci-gext;}
1493 else {
1494 ap->C2 = sc;
1495 sc -= gg;
1496 if (sc > 0) {
1497 if (sc > best) best =sc;
1498 if (cd2 < sc) cd2 = sc;
1499 ap->I2 = max(ci-gext, sc);
1500 }
1501 }
1502 } else {
1503 if (sc <= 0) {
1504 ap->I2 = ap->C2 = 0;
1505 } else {
1506 ap->C2 = sc; sc-=gg;
1507 if (sc >0) {
1508 if (sc > best) best =sc;
1509 if (cd2 < sc) cd2 = sc;
1510 ap->I2 = sc;
1511 } else ap->I2 = 0;
1512 }
1513 }
1514 ww = &wt[(unsigned char)*dp++];
1515 sc = max(max(e3+ww->iv, (e2=aq->C1)+ww->ii), e1+ww->iii);
1516 if (cd3 > sc) sc = cd3;
1517 cd3 -= gext;
1518 if ((ci = aq++->I3) > 0) {
1519 if (sc < ci) { ap->C3 = ci; ap->I3 = ci-gext;}
1520 else {
1521 ap->C3 = sc;
1522 sc -= gg;
1523 if (sc > 0) {
1524 if (sc > best) best =sc;
1525 if (cd3 < sc) cd3 = sc;
1526 ap->I3 = max(ci-gext, sc);
1527 }
1528 }
1529 } else {
1530 if (sc <= 0) {
1531 ap->I3 = ap->C3 = 0;
1532 } else {
1533 ap->C3 = sc; sc-=gg;
1534 if (sc >0) {
1535 if (sc > best) best =sc;
1536 if (cd3 < sc) cd3 = sc;
1537 ap->I3 = sc;
1538 } else ap->I3 = 0;
1539 }
1540 }
1541 }
1542 }
1543 /* printf("The best score is %d\n", best); */
1544 return best+gg;
1545 }
1546
1547 /* ckalloc - allocate space; check for success */
1548 void *ckalloc(size_t amount)
1549 {
1550 void *p;
1551
1552 if ((p = (void *)malloc( (size_t)amount)) == NULL)
1553 w_abort("Ran out of memory.","");
1554 return(p);
1555 }
1556
1557 /* calculate the 100% identical score */
1558 int
1559 shscore(unsigned char *aa0, int n0, int **pam2)
1560 {
1561 int i, sum;
1562 for (i=0,sum=0; i<n0; i++)
1563 sum += pam2[aa0[i]][aa0[i]];
1564 return sum;
1565 }
1566
1567 #define WIDTH 60
1568
1569 typedef struct mat *match_ptr;
1570
1571 typedef struct mat {
1572 int i, j, l;
1573 match_ptr next;
1574 } match_node;
1575
1576 typedef struct { int i,j;} state;
1577 typedef state *state_ptr;
1578
1579
1580 void *ckalloc();
1581 static match_ptr small_global(), global();
1582 static int local_align(), find_best();
1583 static void init_row2(), init_ROW();
1584
1585 int
1586 pro_dna(const unsigned char *prot_seq, /* array with prot. seq. numbers*/
1587 int len_prot, /* length of prot. seq */
1588 const unsigned char *dna_prot_seq, /* trans. DNA seq. numbers*/
1589 int len_dna_prot, /* length trans. seq. */
1590 int **pam_matrix, /* scoring matrix */
1591 int gopen, int gext, /* gap open, gap extend penalties */
1592 int gshift, /* frame-shift penalty */
1593 struct f_struct *f_str,
1594 int max_res,
1595 struct a_res_str *a_res) /* alignment info */
1596 {
1597 match_ptr align, ap, aq;
1598 int x, y, ex, ey, i, score;
1599 int *alignment;
1600
1601 f_str->up = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1602 f_str->down = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1603 f_str->tp = (st_ptr) ckalloc(sizeof(struct st_s)*(len_dna_prot+10));
1604
1605 /*local alignment find the best local alignment x and y
1606 is the starting position of the best local alignment
1607 and ex ey is the ending position */
1608
1609 score= local_align(&x, &y, &ex, &ey,
1610 pam_matrix, gopen, gext,
1611 dna_prot_seq, len_dna_prot,
1612 prot_seq, len_prot, f_str);
1613
1614 f_str->up += 3; f_str->down += 3; f_str->tp += 3;
1615
1616 /* x, y - start in prot, dna_prot */
1617 a_res->min0 = x; /* prot */
1618 a_res->min1 = y; /* DNA */
1619 a_res->max0 = ex; /* prot */
1620 a_res->max1 = ey; /* DNA */
1621
1622 align = global(x, y, ex, ey,
1623 pam_matrix, gopen, gext,
1624 dna_prot_seq, prot_seq,
1625 0, 0, f_str);
1626
1627 alignment = a_res->res;
1628
1629 for (ap = align, i= 0; ap; i++) {
1630 if (i < max_res) alignment[i] = ap->l;
1631 aq = ap->next; free(ap); ap = aq;
1632 }
1633 if (i >= max_res)
1634 fprintf(stderr,"***alignment truncated: %d/%d***\n", max_res,i);
1635
1636 /* up = &up[-3]; down = &down[-3]; tp = &tp[-3]; */
1637 free(&f_str->up[-3]); free(&f_str->tp[-3]); free(&f_str->down[-3]);
1638
1639 a_res->nres = i;
1640 return score;
1641 }
1642
1643 static void
1644 swap(void **a, void **b)
1645 {
1646 void *t = *a;
1647 *a = *b; *b = t;
1648 }
1649
1650 /*
1651 local alignment find the best local alignment x and y
1652 is the starting position of the best local alignment
1653 and ex ey is the ending position
1654 */
1655 static int
1656 local_align(int *x, int *y, int *ex, int *ey,
1657 int **wgts, int gop, int gext,
1658 const unsigned char *dnap, int ld,
1659 const unsigned char *pro, int lp,
1660 struct f_struct *f_str)
1661 {
1662 int i, j, score, x1,x2,x3,x4, e1 = 0, e2 = 0, e3,
1663 sc, del, e, best = 0, cd, ci, c;
1664 struct wgt *wt, *ww;
1665 state_ptr cur_st, last_st, cur_i_st;
1666 st_ptr cur, last;
1667 const unsigned char *dp;
1668 int *cur_d_st, *st_up;
1669
1670 /*
1671 Array rowiC stores the best scores of alignment ending at a position
1672 Arrays rowiD and rowiI store the best scores of alignment ending
1673 at a position with a deletion or insrtion
1674 Arrays sti stores the starting position of the best alignment whose
1675 score stored in the corresponding row array.
1676 The program stores two rows to complete the computation, same is
1677 for the global alignment routine.
1678 */
1679
1680
1681 st_up = (int *) ckalloc(sizeof(int)*(ld+10));
1682 init_row2(st_up, ld+5);
1683
1684 ld += 2;
1685
1686 init_ROW(f_str->up, ld+1);
1687 init_ROW(f_str->down, ld+1);
1688 cur = f_str->up+1;
1689 last = f_str->down+1;
1690
1691 cur_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1692 last_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1693 cur_i_st = (state_ptr) ckalloc(sizeof(state)*(ld+1));
1694 cur_d_st = st_up;
1695 dp = dnap-2;
1696 for (i = 0; i < lp; i++) {
1697 wt = f_str->weight1[pro[i]]; e2 =0; e1 = last[0].C;
1698 for (j = 0; j < 2; j++) {
1699 cur_st[j].i = i+1;
1700 cur_st[j].j = j+1;
1701 }
1702 for (j = 2; j < ld; j++) {
1703 ww = &wt[(unsigned char) dp[j]];
1704 del = -1;
1705 if (j >= 3) {
1706 sc = 0;
1707 e3 = e2; e2 = e1;
1708 e1 = last[j-2].C;
1709 if ((e=e2+ww->iii) > sc) {sc = e; del = 3;}
1710 if ((e=e1+ww->ii) > sc) {sc = e; del = 2;}
1711 if ((e = e3+ww->iv) > sc) {sc = e; del = 4;}
1712 } else {
1713 sc = e2 = 0;
1714 if (ww->iii > 0) {sc = ww->iii; del = 3;}
1715 }
1716 if (sc < (ci=last[j].I)) {
1717 sc = ci; del = 0;
1718 }
1719 if (sc < (cd=cur[j].D)) {
1720 sc = cd; del = 5;
1721 }
1722 cur[j].C = sc;
1723 e = sc - gop;
1724 if (e > cd) {
1725 cur[j+3].D = e-gext;
1726 cur_d_st[j+3] = 3;
1727 } else {
1728 cur[j+3].D = cd-gext;
1729 cur_d_st[j+3] = cur_d_st[j]+3;
1730 }
1731 switch(del) {
1732 case 5:
1733 c = cur_d_st[j];
1734 cur_st[j].i = cur_st[j-c].i;
1735 cur_st[j].j = cur_st[j-c].j;
1736 break;
1737 case 0:
1738 cur_st[j].i = cur_i_st[j].i;
1739 cur_st[j].j = cur_i_st[j].j;
1740 break;
1741 case 2:
1742 case 3:
1743 case 4:
1744 if (i) {
1745 if (j-del >= 0) {
1746 cur_st[j].i = last_st[j-del].i;
1747 cur_st[j].j = last_st[j-del].j;
1748 } else {
1749 cur_st[j].i = i;
1750 cur_st[j].j = 0;
1751 }
1752 } else {
1753 cur_st[j].i = 0;
1754 cur_st[j].j = max(0, j-del+1);
1755 }
1756 break;
1757 case -1:
1758 cur_st[j].i = i+1;
1759 cur_st[j].j = j+1;
1760 break;
1761 }
1762 if (e > ci) {
1763 cur[j].I = e -gext;
1764 cur_i_st[j].i = cur_st[j].i;
1765 cur_i_st[j].j = cur_st[j].j;
1766 } else {
1767 cur[j].I = ci- gext;
1768 }
1769 if (sc > best) {
1770 x1 = cur_st[j].i;
1771 x2 = cur_st[j].j;
1772 best =sc;
1773 x3 = i;
1774 x4 = j;
1775 }
1776 }
1777 swap((void *)&last, (void *)&cur);
1778 swap((void *)&cur_st, (void *)&last_st);
1779 }
1780 /* printf("The best score is %d\n", best);*/
1781 *x = x1; *y = x2; *ex = x3; *ey = x4;
1782 free(cur_st); free(last_st); free(cur_i_st);
1783 free(st_up);
1784 return best;
1785 }
1786
1787 /*
1788 Both global_up and global_down do linear space score only global
1789 alignments on subsequence pro[x]...pro[ex], and dna[y]...dna[ey].
1790 global_up do the algorithm upwards, from row x towards row y.
1791 global_down do the algorithm downwards, from row y towards x.
1792 */
1793
1794 static void
1795 global_up(st_ptr *row1, st_ptr *row2,
1796 int x, int y, int ex, int ey,
1797 int **wgts, int gop, int gext,
1798 unsigned char *dnap, unsigned char *pro,
1799 int N, struct f_struct *f_str)
1800 {
1801 int i, j, k, sc, e, e1, e2, e3, t, ci, cd, score;
1802 struct wgt *wt, *ww;
1803 st_ptr cur, last;
1804
1805 cur = *row1; last = *row2;
1806 sc = -gop;
1807 for (j = 0; j <= ey-y+1; j++) {
1808 if (j % 3 == 0) {last[j].C = sc; sc -= gext; last[j].I = sc-gop;}
1809 else { last[j].I = last[j].C = -10000;}
1810 }
1811 last[0].C = 0; cur[0].D = cur[1].D = cur[2].D = -10000;
1812 last[0].D = last[1].D = last[2].D = -10000;
1813 if (N) last[0].I = -gext;
1814 for (i = 1; i <= ex-x+1; i++) {
1815 wt = f_str->weight1[pro[i+x-1]]; e1 = -10000; e2 = last[0].C;
1816 for (j = 0; j <= ey-y+1; j++) {
1817 t = j+y;
1818 sc = -10000;
1819 ww = &wt[(unsigned char) dnap[t-3]];
1820 if (j < 4) {
1821 if (j == 3) {
1822 sc = e2+ww->iii;
1823 } else if (j == 2) {
1824 sc = e2 + ww->ii;
1825 }
1826 } else {
1827 e3 = e2; e2 = e1;
1828 e1 = last[j-2].C;
1829 sc = max(e2+ww->iii, max(e1+ww->ii, e3+ww->iv));
1830 }
1831 sc = max(sc, max(ci=last[j].I, cd = cur[j].D));
1832 cur[j].C = sc;
1833 cur[j+3].D = max(cd, sc-gop)-gext;
1834 cur[j].I = max(ci, sc-gop)-gext;
1835 }
1836 swap((void *)&last, (void *)&cur);
1837 }
1838 /*printf("global up score =%d\n", last[ey-y+1].C);*/
1839 for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
1840 if (*row1 != last) swap((void *)row1, (void *)row2);
1841 }
1842
1843 static void
1844 global_down(st_ptr *row1, st_ptr *row2,
1845 int x, int y, int ex, int ey,
1846 int **wgts, int gop, int gext,
1847 unsigned char *dnap, unsigned char *pro,
1848 int N, struct f_struct *f_str)
1849 {
1850 int i, j, k, sc, del, *tmp, e, t, e1,e2,e3, ci,cd, score;
1851 struct wgt *wt, *w1, *w2, *w3;
1852 st_ptr cur, last;
1853
1854 cur = (*row1); last = *row2;
1855 sc = -gop;
1856 for (j = ey-y+1; j >= 0; j--) {
1857 if ((ey-y+1-j) % 3) {last[j].C = sc; sc-=gext; last[j].I = sc-gop;}
1858 else last[j].I = last[j].C = -10000;
1859 cur[j].I = -10000;
1860 }
1861 last[ey-y+1].C = 0;
1862 if (N) last[ey-y+1].I = -gext;
1863 cur[ey-y+1].D = cur[ey-y].D = cur[ey-y-1].D = -10000;
1864 last[ey-y+1].D = last[ey-y].D = last[ey-y-1].D = -10000;
1865 for (i = ex-x; i >= 0; i--) {
1866 wt = f_str->weight1[pro[i+x]]; e2 = last[ey-y+1].C;
1867 e1 = -10000;
1868 w3 = &wt[(unsigned char) dnap[ey]];
1869 w2 = &wt[(unsigned char) dnap[ey-1]];
1870 for (j = ey-y+1; j >= 0; j--) {
1871 t = j+y;
1872 w1 = &wt[(unsigned char) dnap[t-1]];
1873 sc = -10000;
1874 if (t+3 > ey) {
1875 if (t+2 == ey) {
1876 sc = e2+w2->iii;
1877 } else if (t+1 == ey) {
1878 sc = e2+w1->ii;
1879 }
1880 } else {
1881 e3 = e2; e2 = e1;
1882 e1 = last[j+2].C;
1883 sc = max(e2+w2->iii, max(e1+w1->ii,e3+w3->iv)) ;
1884 }
1885 if (sc < (cd= cur[j].D)) {
1886 sc = cd;
1887 cur[j-3].D = cd-gext;
1888 } else cur[j-3].D =max(cd, sc-gop)-gext;
1889 if (sc < (ci= last[j].I)) {
1890 sc = ci;
1891 cur[j].I = ci - gext;
1892 } else cur[j].I = max(sc-gop,ci)-gext;
1893 cur[j].C = sc;
1894 w3 = w2; w2 = w1;
1895 }
1896 swap((void *)&last, (void *)&cur);
1897 }
1898 for (i = 0; i <= ey-y+1; i++) last[i].I = cur[i].I;
1899 if (*row1 != last) swap((void *)row1, (void *)row2);
1900 }
1901
1902 static void
1903 init_row2(int *row, int ld) {
1904 int i;
1905 for (i = 0; i < ld; i++) row[i] = 0;
1906 }
1907
1908 static void init_ROW(st_ptr row, int ld) {
1909 int i;
1910 for (i = 0; i < ld; i++) row[i].I = row[i].D = row[i].C = 0;
1911 }
1912
1913 static match_ptr
1914 combine(match_ptr x1, match_ptr x2, int st) {
1915 match_ptr x;
1916
1917 if (x1 == NULL) return x2;
1918 for (x = x1; x->next; x = x->next);
1919 x->next = x2;
1920 if (st) {
1921 for (x = x2; x; x = x->next) {
1922 x->j++;
1923 if (x->l == 3 || x->l == 4) break;
1924 }
1925 x->l--;
1926 }
1927 return x1;
1928 }
1929
1930 /*
1931 global use the two upwards and downwards score only linear
1932 space global alignment subroutine to recursively build the
1933 alignment.
1934 */
1935
1936 match_ptr
1937 global(int x, int y, int ex, int ey,
1938 int **wgts, int gop, int gext,
1939 unsigned char *dnap, unsigned char *pro, int N1, int N2,
1940 struct f_struct *f_str)
1941 {
1942 int m;
1943 int m1, m2;
1944 match_ptr x1, x2, mm1, mm2;
1945
1946 /*printf("%d %d %d %d %d %d\n", x,y, ex, ey, N1, N2);*/
1947 /*
1948 if the space required is limited, we can do a quadratic space
1949 algorithm to find the alignment.
1950 */
1951
1952 if (ex <= x) {
1953 mm1 = NULL;
1954 for (m = y+3; m <= ey; m+=3) {
1955 x1 = (match_ptr) ckalloc(sizeof(match_node));
1956 x1->l = 5; x1->next = mm1;
1957 if (mm1== NULL) mm2 = x1;
1958 mm1 = x1;
1959 }
1960 if (ex == x) {
1961 if ((ey-y) % 3 != 0) {
1962 x1 = (match_ptr) ckalloc(sizeof(match_node));
1963 x1->l = ((ey-y) % 3) +1; x1->next = NULL;
1964 if (mm1) mm2->next = x1; else mm1 = x1;
1965 } else mm2->l = 4;
1966 }
1967 return mm1;
1968 }
1969 if (ey <= y) {
1970 mm1 = NULL;
1971 for (m = x; m <= ex; m++) {
1972 x1 = (match_ptr) ckalloc(sizeof(match_node));
1973 x1->l = 0; x1->next = mm1; mm1 = x1;
1974 }
1975 return mm1;
1976 }
1977 if (ex -x < SGW1 && ey-y < SGW2)
1978 return small_global(x,y,ex,ey,wgts, gop, gext, dnap, pro, N1, N2,f_str);
1979 m = (x+ex)/2;
1980 /*
1981 Do the score only global alignment from row x to row m, m is
1982 the middle row of x and ex. Store the information of row m in
1983 upC, upD, and upI.
1984 */
1985 global_up(&f_str->up, &f_str->tp, x, y, m, ey,
1986 wgts, gop, gext,
1987 dnap, pro, N1, f_str);
1988 /*
1989 Do the score only global alignment downwards from row ex
1990 to row m+1, store information of row m+1 in downC downI and downD
1991 */
1992 global_down(&f_str->down, &f_str->tp, m+1, y, ex, ey,
1993 wgts, gop, gext,
1994 dnap, pro, N2, f_str);
1995
1996 /*
1997 Use this information for row m and m+1 to find the crossing
1998 point of the best alignment with the middle row. The crossing
1999 point is given by m1 and m2. Then we recursively call global
2000 itself to compute alignments in two smaller regions found by
2001 the crossing point and combine the two alignments to form a
2002 whole alignment. Return that alignment.
2003 */
2004 if (find_best(f_str->up, f_str->down, &m1, &m2, ey-y+1, y, gop)) {
2005 x1 = global(x, y, m, m1, wgts, gop, gext, dnap, pro, N1, 0, f_str);
2006 x2 = global(m+1, m2, ex, ey, wgts, gop, gext, dnap, pro, 0, N2, f_str);
2007 if (m1 == m2) x1 = combine(x1,x2,1);
2008 else x1 = combine(x1, x2,0);
2009 } else {
2010 x1 = global(x, y, m-1, m1, wgts, gop, gext, dnap, pro, N1, 1, f_str);
2011 x2 = global(m+2, m2, ex, ey, wgts, gop, gext, dnap, pro, 1, N2, f_str);
2012 mm1 = (match_ptr) ckalloc(sizeof(match_node));
2013 mm1->i = m; mm1->l = 0; mm1->j = m1;
2014 mm2 = (match_ptr) ckalloc(sizeof(match_node));
2015 mm2->i = m+1; mm2->l = 0; mm2->j = m1;
2016 mm1->next = mm2; mm2->next = x2;
2017 x1 = combine(x1, mm1, 0);
2018 }
2019 return x1;
2020 }
2021
2022 static int
2023 find_best(st_ptr up, st_ptr down, int *m1, int *m2, int ld, int y, int gop) {
2024
2025 int i, best = -1000, j = 0, s1, s2, s3, s4, st;
2026
2027 for (i = 1; i < ld; i++) {
2028 s2 = up[i].C + down[i].C;
2029 s4 = up[i].I + down[i].I + gop;
2030 if (best < s2) {
2031 best = s2; j = i; st = 1;
2032 }
2033 if (best < s4) {
2034 best = s4; j = i; st = 0;
2035 }
2036 }
2037 *m1 = j-1+y;
2038 *m2 = j+y;
2039 /*printf("score=%d\n", best);*/
2040 return st;
2041 }
2042
2043 /*
2044 An alignment is represented as a linked list whose element
2045 is of type match_node. Each element represent an edge in the
2046 path of the alignment graph. The fields of match_node are
2047 l --- gives the type of the edge.
2048 i, j --- give the end position.
2049 */
2050
2051 static match_ptr
2052 small_global(int x, int y, int ex, int ey,
2053 int **wgts, int gop, int gext,
2054 unsigned char *dnap, unsigned char *pro,
2055 int N1, int N2, struct f_struct *f_str) {
2056
2057 /* int C[SGW1+1][SGW2+1], st[SGW1+1][SGW2+1], D[SGW2+7], I[SGW2+1]; */
2058 int i, j, e, sc, score, del, k, t, ci, cd;
2059 int *cI, *cD, *cC, *lC, *cst, e2, e3, e4;
2060 match_ptr mp, first;
2061 struct wgt *wt, *ww;
2062
2063 /*printf("small_global %d %d %d %d\n", x, y, ex, ey);*/
2064 sc = -gop-gext; f_str->smgl_s.C[0][0] = 0;
2065
2066 cI = f_str->smgl_s.I;
2067 if (N1) cI[0] = -gext; else cI[0] = sc;
2068
2069 for (j = 1; j <= ey-y+1; j++) {
2070 if (j % 3== 0) {
2071 f_str->smgl_s.C[0][j] = sc;
2072 sc -= gext;
2073 cI[j] = sc-gop;
2074 }
2075 else {
2076 cI[j] = f_str->smgl_s.C[0][j] = -10000;
2077 }
2078 f_str->smgl_s.st[0][j] = 5;
2079 }
2080
2081 lC = &f_str->smgl_s.C[0][0];
2082 cD = f_str->smgl_s.D; cD[0] = cD[1] = cD[2] = -10000;
2083 for (i = 1; i <= ex-x+1; i++) {
2084 cC = &f_str->smgl_s.C[i][0];
2085 wt = f_str->weight1[pro[i+x-1]]; cst = &f_str->smgl_s.st[i][0];
2086 for (j = 0; j <=ey-y+1; j++) {
2087 ci = cI[j];
2088 cd= cD[j];
2089 t = j+y;
2090 ww = &wt[(unsigned char) dnap[t-3]];
2091 if (j >= 4) {
2092 sc = lC[j-3]+ww->iii; e2 = lC[j-2]+ww->ii;
2093 e4 = lC[j-4]+ww->iv; del = 3;
2094 if (e2 > sc) { sc = e2; del = 2;}
2095 if (e4 >= sc) { sc = e4; del = 4;}
2096 } else {
2097 if (j == 3) {
2098 sc = lC[0]+ww->iii; del =3;
2099 } else if (j == 2) {
2100 sc = lC[0]+ww->ii; del = 2;
2101 } else {sc = -10000; del = 0;}
2102 }
2103 if (sc < ci) {
2104 sc = ci; del = 0;
2105 }
2106 if (sc <= cd) {
2107 sc = cd;
2108 del = 5;
2109 }
2110 cC[j] = sc;
2111 sc -= gop;
2112 if (sc <= cd) {
2113 del += 10;
2114 cD[j+3] = cd - gext;
2115 } else cD[j+3] = sc -gext;
2116 if (sc < ci) {
2117 del += 20;
2118 cI[j] = ci-gext;
2119 } else cI[j] = sc-gext;
2120 *(cst++) = del;
2121 }
2122 lC = cC;
2123 }
2124 /*printf("small global score =%d\n", f_str->smgl_s.C[ex-x+1][ey-y+1]);*/
2125 if (N2 && cC[ey-y+1] < ci+gop) f_str->smgl_s.st[ex-x+1][ey-y+1] =0;
2126 first = NULL; e = 1;
2127 for (i = ex+1, j = ey+1; i > x || j > y; i--) {
2128 mp = (match_ptr) ckalloc(sizeof(match_node));
2129 mp->i = i-1;
2130 k = (t=f_str->smgl_s.st[i-x][j-y])%10;
2131 mp->j = j-1;
2132 if (e == 5 && (t/10)%2 == 1) k = 5;
2133 if (e == 0 && (t/20)== 1) k = 0;
2134 if (k == 5) { j -= 3; i++; e=5;}
2135 else {j -= k;if (k==0) e= 0; else e = 1;}
2136 mp->l = k;
2137 mp->next = first;
2138 first = mp;
2139 }
2140
2141 /* for (i = 0; i <= ex-x; i++) {
2142 for (j = 0; j <= ey-y; j++)
2143 printf("%d ", C[i][j]);
2144 printf("\n");
2145 }
2146 */
2147 return first;
2148 }
2149
2150 #define XTERNAL
2151 #include "upam.h"
2152
2153 void
2154 display_alig(int *a, unsigned char *dna, unsigned char *pro,
2155 int length, int ld, struct f_struct *f_str)
2156 {
2157 int len = 0, i, j, x, y, lines, k, iaa;
2158 static char line1[100], line2[100], line3[100],
2159 tmp[10] = " ", *st;
2160 char *dna1, c1, c2, c3;
2161
2162 line1[0] = line2[0] = line3[0] = '\0'; x= a[0]; y = a[1]-3;
2163
2164 printf("\n%5d\n%5d", y+3, x);
2165 for (len = 0, j = 2, lines = 0; j < length; j++) {
2166 i = a[j];
2167 line3[len] = ' ';
2168 switch (i) {
2169 case 3:
2170 y += 3;
2171 line2[len] = NCBIstdaa[iaa=pro[x++]];
2172 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c5;
2173 if (line1[len] != f_str->weight_c[iaa][(unsigned char) dna[y]].c3)
2174 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2175 break;
2176 case 2:
2177 y += 2;
2178 line1[len] = '\\';
2179 line2[len++] = ' ';
2180 line2[len] = NCBIstdaa[iaa=pro[x++]];
2181 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c2;
2182 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2183 break;
2184 case 4:
2185 y += 4;
2186 line1[len] = '/';
2187 line2[len++] = ' ';
2188 line2[len] = NCBIstdaa[iaa=pro[x++]];
2189 line1[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c4;
2190 line3[len] = f_str->weight_c[iaa][(unsigned char) dna[y]].c3;
2191 break;
2192 case 5:
2193 y += 3;
2194 line1[len] = f_str->weight_c[0][(unsigned char) dna[y]].c3;
2195 line2[len] = '-';
2196 break;
2197 case 0:
2198 line1[len] = '-';
2199 line2[len] = NCBIstdaa[pro[x++]];
2200 break;
2201 }
2202 len++;
2203 line1[len] = line2[len] = line3[len] = '\0';
2204 if (len >= WIDTH) {
2205 for (k = 10; k <= WIDTH; k+=10)
2206 printf(" . :");
2207 if (k-5 < WIDTH) printf(" .");
2208 c1 = line1[WIDTH]; c2 = line2[WIDTH]; c3 = line3[WIDTH];
2209 line1[WIDTH] = line2[WIDTH] = line3[WIDTH] = '\0';
2210 printf("\n %s\n %s\n %s\n", line1, line3, line2);
2211 line1[WIDTH] = c1; line2[WIDTH] = c2;
2212 strncpy(line1, &line1[WIDTH], sizeof(line1)-1);
2213 strncpy(line2, &line2[WIDTH], sizeof(line2)-1);
2214 strncpy(line3, &line3[WIDTH], sizeof(line3)-1);
2215 len = len - WIDTH;
2216 printf("\n%5d\n%5d", y+3, x);
2217 }
2218 }
2219 for (k = 10; k < len; k+=10)
2220 printf(" . :");
2221 if (k-5 < len) printf(" .");
2222 printf("\n %s\n %s\n %s\n", line1, line3, line2);
2223 }
2224
2225
2226 /* alignment store the operation that align the protein and dna sequence.
2227 The code of the number in the array is as follows:
2228 0: delete of an amino acid.
2229 2: frame shift, 2 nucleotides match with an amino acid
2230 3: match an amino acid with a codon
2231 4: the other type of frame shift
2232 5: delete of a codon
2233
2234
2235 Also the first two element of the array stores the starting point
2236 in the protein and dna sequences in the local alignment.
2237
2238 Display looks like where WIDTH is assumed to be divisible by 10.
2239
2240 0 . : . : . : . : . : . :
2241 AACE/N\PLK\G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ
2242 I S G S V F N R Q L A G S V F N R Q L A
2243 AACE P P-- G HK Y TWA A C E P P---- G HK Y TWA A C E P P----
2244
2245 60 . : . : . : . : . : . :
2246 /G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LWA\S\C\E/P\PRIRZ/G\HK\Y/LW
2247 G S V F N R Q L A G S V F N R Q L A G S V F
2248 G HK Y TWA A C E P P---- G HK Y TWA A C E P P---- G HK Y TW
2249
2250 For frame shift, the middle row show the letter in the original sequence,
2251 and the letter in the top row is the amino acid that is chose by the
2252 alignment (translated codon chosen from 4 nucleotides, or 2+1).
2253 */
2254
2255 /* fatal - print message and die */
2256 void
2257 fatal(msg)
2258 char *msg;
2259 {
2260 fprintf(stderr, "%s\n", msg);
2261 exit(1);
2262 }
2263
2264 /* 10-Feb-2010 - fz_walign modified to ensure that the final alignment
2265 overlaps the initial lz_band() region. In earlier versions, the
2266 final alignment (using pam2p[0]) might have been outside the band
2267 region */
2268
2269 void
2270 fz_walign (const unsigned char *aa0, int n0,
2271 const unsigned char *aa1, int n1,
2272 int frame, int max_res,
2273 struct pstruct *ppst,
2274 struct f_struct *f_str,
2275 struct a_res_str *a_res,
2276 int score_thresh)
2277 {
2278 int score;
2279 int i, last_n1, itemp, n10;
2280 int hoff, nt_min, nt_max, n_nt, n_aa, w_fact;
2281 int l_min, l_max, window;
2282 unsigned char *fs, *fd;
2283 /*
2284 unsigned char *aa1_min_s, aa1_max_s;
2285 */
2286 unsigned char *local_aa1;
2287 int optflag_s;
2288 int itx;
2289 unsigned char *aa1x;
2290 struct score_count_s s_info = {0,0,0,0};;
2291
2292 #ifndef TFAST
2293 do_fastz (f_str->aa0x, n0, aa1, n1, f_str->aa0v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
2294 #else
2295 /* make translated sequence */
2296 last_n1 = 0;
2297 aa1x = f_str->aa1x;
2298 for (itx= frame*3; itx< frame*3+3; itx++) {
2299 n10 = saatran(aa1,&aa1x[last_n1],n1,itx);
2300 last_n1 += n10+1;
2301 }
2302 n10 = last_n1-1;
2303
2304 /* do_fastz (lz_band) also needs a pre-computed number series */
2305 if (frame == 0) {
2306 pre_com(aa1, n1, f_str->aa1v);
2307 }
2308 else {
2309 pre_com_r(aa1, n1, f_str->aa1v);
2310 }
2311
2312 do_fastz (aa0, n0, f_str->aa1x, n10, f_str->aa1v, ppst, f_str, &a_res->rst, &hoff, 1, &s_info);
2313 #endif
2314
2315 if (a_res->rst.score[ppst->score_ix] < score_thresh) {
2316 a_res->sw_score = 0;
2317 a_res->n1 = n1;
2318 return;
2319 }
2320
2321 #ifndef TFAST
2322 window = min(n1, ppst->param_u.fa.optwid);
2323 l_min = max(0, -window - hoff);
2324 l_max = min(n1, n0-hoff+window);
2325 local_aa1 = (unsigned char *)aa1;
2326 if (l_min > 0 || l_max < n1 -1 ) {
2327 local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
2328 local_aa1++;
2329 memcpy(local_aa1,aa1+l_min, l_max - l_min);
2330 }
2331
2332 /*
2333 if (l_min > 0) {
2334 aa1_min_s = aa1[l_min-1];
2335 aa1[l_min-1] = '\0';
2336 }
2337 if (l_max < n1 - 1) {
2338 aa1_max_s = aa1[l_max];
2339 aa1[l_max] = '\0';
2340 }
2341 */
2342 #else
2343 window = min(n0, ppst->param_u.fa.optwid);
2344 if (frame==0) {
2345 l_min = max(0,(hoff-window)*3);
2346 l_max = min((hoff+window+n0)*3,n1);
2347
2348 local_aa1 = (unsigned char *)aa1;
2349 if (l_min > 0 || l_max < n1 -1 ) {
2350 local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
2351 local_aa1++;
2352 memcpy(local_aa1,aa1+l_min, l_max - l_min);
2353 }
2354
2355 /*
2356 if (l_min > 0) {
2357 aa1_min_s = aa1[l_min-1];
2358 aa1[l_min-1] = '\0';
2359 }
2360 if (l_max < n1-1) {
2361 aa1_max_s = aa1[l_max];
2362 aa1[l_max] = '\0';
2363 }
2364 */
2365 /* re-do precomputed codon number series for limited region */
2366 pre_com(local_aa1, l_max - l_min, f_str->aa1v);
2367 }
2368 else {
2369 /* things are more complicated here because the mapping of hoff is
2370 with respect to the reversed aa1 */
2371
2372 l_max = n1 - max(0,(hoff-window)*3);
2373 l_min = n1 - min((hoff+window+n0)*3,n1);
2374
2375 local_aa1 = (unsigned char *)aa1;
2376 if (l_min > 0 || l_max < n1 -1 ) {
2377 local_aa1 = (unsigned char *)calloc(l_max - l_min + 2,sizeof(unsigned char));
2378 local_aa1++;
2379 memcpy(local_aa1,aa1+l_min, l_max - l_min);
2380 }
2381
2382 /*
2383 if (l_min > 0) {
2384 aa1_min_s = aa1[l_min-1];
2385 aa1[l_min-1] = '\0';
2386 }
2387 if (l_max < n1-1) {
2388 aa1_max_s = aa1[l_max];
2389 aa1[l_max] = '\0';
2390 }
2391 */
2392
2393 pre_com_r(local_aa1, l_max - l_min, f_str->aa1v);
2394 }
2395 #endif
2396
2397 a_res->sw_score =
2398 pro_dna(
2399 #ifndef TFAST
2400 aa1+l_min, l_max - l_min,
2401 f_str->aa0v, n0-2,
2402 #else
2403 aa0, n0,
2404 f_str->aa1v, l_max - l_min-2,
2405 #endif
2406 ppst->pam2[0],
2407 -ppst->gdelval, -ppst->ggapval, -ppst->gshift,
2408 f_str, f_str->max_res, a_res);
2409
2410 if (l_min > 0 || l_max < n1 - 1) { free(--local_aa1); }
2411 /*
2412 if (l_min > 0) {
2413 aa1[l_min-1] = aa1_min_s;
2414 }
2415 if (l_max < n1 - 1) {
2416 aa1[l_max] = aa1_max_s;
2417 }
2418 */
2419 #ifndef TFAST
2420 a_res->min0 += l_min;
2421 a_res->max0 += l_min;
2422 #else
2423 if (frame==1) {
2424 a_res->min1 += n1 - l_max;
2425 a_res->max1 += n1 - l_max;
2426 }
2427 else {
2428 a_res->min1 += l_min;
2429 a_res->max1 += l_min;
2430 }
2431 #endif
2432
2433 /* display_alig(f_str->res,f_str->aa0v+2,aa1,*nres,n0-2,f_str); */
2434 }
2435
2436 /*
2437 fz_malign is a recursive interface to fz_walign() that is called
2438 from do_walign(). fz_malign() first does an alignment, then checks
2439 to see if the score is greater than the threshold. If so, it tries
2440 doing a left and right alignment.
2441
2442 In this implementation, the DNA sequence is preserved as DNA for
2443 TFAST, so that it can be sub-setted and translated correctly. Thus,
2444 the translation required for f_str->aa1x and f_str->aa1v is done at
2445 each recursive level (in fz_walign).
2446 */
2447
2448 struct a_res_str *
2449 fz_malign (const unsigned char *aa0, int n0,
2450 const unsigned char *aa1, int n1,
2451 int frame,
2452 int score_thresh, int max_res,
2453 struct pstruct *ppst,
2454 struct f_struct *f_str,
2455 struct a_res_str *cur_ares,
2456 int first_align)
2457 {
2458 struct a_res_str *tmpl_ares, *tmpr_ares, *this_ares;
2459 struct a_res_str *mtmpl_ares, *mtmpr_ares, *mt_next;
2460 int sq_start, sq_end, sq_save;
2461 int hoff, score_ix;
2462 int min_alen;
2463 struct rstruct rst;
2464 unsigned char *local_aa1;
2465 /* char save_res; */
2466 int iphase, i;
2467 unsigned char *fd;
2468 int max_sub_score = -1;
2469
2470 score_ix = ppst->score_ix;
2471
2472 #ifdef TFAST
2473 min_alen = min(n0,MIN_LOCAL_LEN)*3; /* n0 in aa, min_alen in nt */
2474 #else
2475 min_alen = min(n0/3,MIN_LOCAL_LEN); /* no in nt, min_alen in aa */
2476 #endif
2477
2478 /* now we need alignment storage - get it */
2479 if ((cur_ares->res = (int *)calloc((size_t)max_res,sizeof(int)))==NULL) {
2480 fprintf(stderr," *** cannot allocate alignment results array %d\n",max_res);
2481 exit(1);
2482 }
2483
2484 cur_ares->next = NULL;
2485
2486 fz_walign(aa0, n0, aa1, n1, frame, max_res, ppst, f_str, cur_ares, (first_align ? 1 : score_thresh));
2487
2488 /* in cur_ares, min0,max0 are always protein, min1,max1 are always
2489 DNA, but n0 could be protein or DNA, depending on
2490 FASTY/TFASTY */
2491
2492 if (!ppst->do_rep || cur_ares->rst.score[ppst->score_ix] < score_thresh) {
2493 return cur_ares;
2494 }
2495
2496 /* have a score >= threshold - try left and right */
2497
2498 /* in code below, cur_ares->min0/max0 always refers to aa
2499 cur_ares->min1/max1 always refers to nt
2500
2501 however, things are more complex because if frame==1, then
2502 offsets are from the end (n1), not the beginning. There is no
2503 frame==1 for fasty, only for TFASTY
2504 */
2505 cur_ares->v_start = sq_start = 0;
2506 #ifdef TFAST
2507 if (frame == 0) {sq_end = cur_ares->min1-1;} /* aa1[sq_start --> sq_end] */
2508 else {sq_end = n1 - cur_ares->max1;}
2509 sq_save = sq_end;
2510 #else
2511 sq_save = sq_end = cur_ares->min0;
2512 #endif
2513 cur_ares->v_len = sq_end - sq_start;
2514
2515 if (cur_ares->v_len >= min_alen) { /* try the left */
2516 /* allocate a_res */
2517 tmpl_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
2518 local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
2519 local_aa1++;
2520 memcpy(local_aa1, aa1, cur_ares->v_len);
2521
2522 /*
2523 save_res = aa1[sq_save];
2524 aa1[sq_save] = '\0';
2525 */
2526 tmpl_ares = fz_malign(aa0, n0, local_aa1, cur_ares->v_len,
2527 frame, score_thresh, max_res,
2528 ppst, f_str, tmpl_ares,0);
2529
2530 free(--local_aa1);
2531 /* aa1[sq_save] = save_res; */
2532
2533 if (tmpl_ares->rst.score[ppst->score_ix] > score_thresh) {
2534 max_sub_score = tmpl_ares->rst.score[ppst->score_ix];
2535 #ifdef TFAST
2536 if (frame == 1) {
2537 for (this_ares = tmpl_ares; this_ares; this_ares = this_ares->next) {
2538 this_ares->v_start += n1 - sq_end;
2539 this_ares->min1 += n1 - sq_end;
2540 this_ares->max1 += n1 - sq_end;
2541 }
2542 }
2543 #endif
2544 }
2545 else {
2546 if (tmpl_ares->res) free(tmpl_ares->res);
2547 free(tmpl_ares);
2548 tmpl_ares=NULL;
2549 }
2550 }
2551 else {tmpl_ares = NULL;}
2552
2553 /* now the right end */
2554 /* for fasty -- max positions refer to the aa,codon, not the next
2555 residue, so they must be incremented */
2556
2557 sq_end = n1;
2558 #if TFAST
2559 if (frame == 0) {sq_start = cur_ares->max1+1;}
2560 else {sq_start = n1 - cur_ares->min1;}
2561 #else
2562 sq_start = cur_ares->max0+1;
2563 #endif
2564 sq_save = sq_start-1;
2565 cur_ares->v_len = sq_end - sq_start;
2566
2567 if (cur_ares->v_len >= min_alen) { /* try the right */
2568 /* allocate a_res */
2569 tmpr_ares = (struct a_res_str *)calloc(1, sizeof(struct a_res_str));
2570
2571 /* find boundaries */
2572 local_aa1 = (unsigned char *)calloc(cur_ares->v_len+2,sizeof(unsigned char));
2573 local_aa1++;
2574 memcpy(local_aa1,aa1+sq_start,cur_ares->v_len);
2575 /*
2576 save_res = aa1[sq_save];
2577 aa1[sq_save] = '\0';
2578 */
2579
2580 tmpr_ares = fz_malign(aa0, n0,
2581 local_aa1, cur_ares->v_len,
2582 frame,
2583 score_thresh, max_res,
2584 ppst, f_str, tmpr_ares,0);
2585 free(--local_aa1);
2586 /*
2587 aa1[sq_save] = save_res;
2588 */
2589
2590 if (tmpr_ares->rst.score[ppst->score_ix] > score_thresh) {
2591 /* adjust the left boundary */
2592 for (this_ares = tmpr_ares; this_ares; this_ares = this_ares->next) {
2593 #ifndef TFAST
2594 this_ares->min0 += sq_start;
2595 this_ares->max0 += sq_start;
2596 #else
2597 if (frame == 0) {
2598 this_ares->v_start += sq_start;
2599 this_ares->min1 += sq_start;
2600 this_ares->max1 += sq_start;
2601 }
2602 #endif
2603 }
2604 if (tmpr_ares->rst.score[ppst->score_ix] > max_sub_score) {
2605 max_sub_score = tmpr_ares->rst.score[ppst->score_ix];
2606 }
2607 }
2608 else {
2609 if (tmpr_ares->res) free(tmpr_ares->res);
2610 free(tmpr_ares);
2611 tmpr_ares=NULL;
2612 }
2613 }
2614 else {tmpr_ares = NULL;}
2615
2616 if (max_sub_score < score_thresh) return cur_ares;
2617
2618 cur_ares = merge_ares_chains(cur_ares, tmpl_ares, score_ix, "left");
2619 cur_ares = merge_ares_chains(cur_ares, tmpr_ares, score_ix, "right");
2620
2621 return cur_ares;
2622 }
2623
2624 /* do_walign() can be called with aa0,n0 as nt (FASTY) or
2625 aa0,n0 as aa (TFASTY). if aa0 is nt, then f_str->aa0x,y have the
2626 translations already. if aa0 is aa, then f_str->aa1x,y must be
2627 generated.
2628 */
2629
2630 struct a_res_str *
2631 do_walign (const unsigned char *aa0, int n0,
2632 const unsigned char *aa1, int n1,
2633 int frame, int repeat_thresh,
2634 struct pstruct *ppst,
2635 struct f_struct *f_str,
2636 int *have_ares)
2637 {
2638 struct a_res_str *a_res, *tmp_a_res;
2639 int a_res_index;
2640 int hoff, use_E_thresholds_s, optflag_s, optcut_s, optwid_s, score;
2641 int last_n1, itx, itt, n10, iphase;
2642 unsigned char *fs, *fd;
2643 struct rstruct rst;
2644 #ifdef DEBUG
2645 unsigned long adler32_crc;
2646 #endif
2647
2648 *have_ares = 0x3; /* set 0x2 bit to indicate local copy */
2649
2650 if ((a_res = (struct a_res_str *)calloc(1, sizeof(struct a_res_str)))==NULL) {
2651 fprintf(stderr," [do_walign] Cannot allocate a_res");
2652 return NULL;
2653 }
2654
2655 #ifdef DEBUG
2656 adler32_crc = adler32(1L,aa1,n1);
2657 #endif
2658
2659 f_str->frame = frame; /* need frame for later pre_cons() in calcons() */
2660
2661 use_E_thresholds_s = ppst->param_u.fa.use_E_thresholds;
2662 optflag_s = ppst->param_u.fa.optflag;
2663 optcut_s = ppst->param_u.fa.optcut;
2664 optwid_s = ppst->param_u.fa.optwid;
2665 ppst->param_u.fa.use_E_thresholds = 0;
2666 ppst->param_u.fa.optflag = 1;
2667 if (!ppst->param_u.fa.optwid_set) {
2668 ppst->param_u.fa.optwid *= 2;
2669 }
2670
2671 a_res = fz_malign(aa0, n0, aa1, n1, frame,
2672 repeat_thresh, f_str->max_res,
2673 ppst, f_str, a_res, 1);
2674
2675 #ifdef DEBUG
2676 if (adler32(1L,aa1,n1) != adler32_crc) {
2677 fprintf(stderr,"*** error [%s:%d] adler32_crc mismatch n1: %d\n",__FILE__, __LINE__, n1);
2678 }
2679 #endif
2680
2681 a_res_index = 0;
2682 for (tmp_a_res=a_res; tmp_a_res; tmp_a_res = tmp_a_res->next) {
2683 tmp_a_res->index = a_res_index++;
2684 }
2685
2686 ppst->param_u.fa.use_E_thresholds = use_E_thresholds_s;
2687 ppst->param_u.fa.optflag = optflag_s;
2688 ppst->param_u.fa.optwid = optwid_s;
2689 return a_res;
2690 }
2691
2692 void
2693 pre_cons(const unsigned char *aa1, int n1, int frame, struct f_struct *f_str) {
2694
2695 #ifdef TFAST
2696 int i, last_n1, itemp, n10;
2697 unsigned char *fs, *fd;
2698 int itx;
2699
2700 /* make a precomputed codon number series */
2701 if (frame==0) {
2702 pre_com(aa1, n1, f_str->aa1v);
2703 }
2704 else { /* must do things backwards */
2705 pre_com_r(aa1, n1, f_str->aa1v);
2706 }
2707 #endif
2708 }
2709
2710 /* aln_func_vals - set up aln.qlfact, qlrev, llfact, llmult, frame, llrev */
2711 /* call from calcons, calc_id, calc_code */
2712 void
2713 aln_func_vals(int frame, struct a_struct *aln) {
2714
2715 #ifndef TFAST
2716 aln->llrev = 0;
2717 aln->llfact = 1;
2718 aln->llmult = 1;
2719 aln->qlfact = 3;
2720 aln->frame = frame;
2721 if (frame > 0) aln->qlrev = 1;
2722 else aln->qlrev = 0;
2723 aln->llrev = 0;
2724 #else /* TFASTX */
2725 aln->qlfact = 1;
2726 aln->qlrev = 0;
2727 aln->llfact = 3;
2728 aln->llmult = 1;
2729 aln->frame = frame;
2730 if (frame > 0) aln->llrev = 1;
2731 else aln->llrev = 0;
2732 aln->qlrev=0;
2733 #endif /* TFASTX */
2734 }
2735
2736 #include "structs.h"
2737 #include "a_mark.h"
2738
2739 extern int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim);
2740
2741 extern int
2742 next_annot_match(int *itmp, int *pam2aa0v, long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
2743 int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
2744 void *annot_stack, int *have_push_features, int *v_delta,
2745 struct annot_entry **region_p, struct annot_entry *tmp_region_p, int init_score);
2746
2747 extern void
2748 comment_var(long i0, char sp0, long i1, char sp1, char o_sp1, char sim_char,
2749 const char *ann_comment, struct dyn_string_str *annot_var_dyn, int target, int d_type);
2750
2751 void
2752 display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
2753 long i0, char sp0, long i1, char sp1, char sym,
2754 struct annot_entry **region0_p,
2755 struct annot_entry **region1_p,
2756 int score, double comp, int n0, int n1,
2757 void *pstat_void, int d_type);
2758
2759 #define DP_FULL_FMT 1 /* Region: score: bits: id: ... */
2760 #define Q_TARGET 0
2761 #define L_TARGET 1
2762
2763 int seq_pos(int pos, int rev,int off);
2764
2765 int
2766 calc_cons_a(const unsigned char *aa0, int n0,
2767 const unsigned char *aa1, int n1,
2768 int *nc,
2769 struct a_struct *aln,
2770 struct a_res_str *a_res,
2771 struct pstruct *ppst,
2772 char *seqc0, char *seqc1, char *seqca, int *cumm_seq_score,
2773 const unsigned char *ann_arr,
2774 const unsigned char *aa0a, const struct annot_str *annot0_p, char *seqc0a,
2775 const unsigned char *aa1a, const struct annot_str *annot1_p, char *seqc1a,
2776 int *score_delta,
2777 struct dyn_string_str *annot_var_dyn,
2778 struct f_struct *f_str,
2779 void *pstat_void
2780 )
2781 {
2782 int i0, i1;
2783 int lenc, not_c, itmp, ngap_p, ngap_d, nfs;
2784 char *sp0, *sp0a, *sp1, *sp1a, *spa, t_spa;
2785 int *i_spa;
2786 const unsigned char *sq;
2787 unsigned char aap;
2788
2789 const unsigned char *ap0, *ap1;
2790 const unsigned char *ap1a; /* ap1 always points to protein, and
2791 only protein has annotations */
2792 int *rp, *rpmax;
2793 int have_ann = 0;
2794
2795 /* variables for variant changes */
2796 char tmp_str[MAX_LSTR];
2797 int *annot_stack, annot_stack_n, annot_top=0;
2798 char *sim_sym = aln_map_sym[MX_ACC];
2799 struct annot_entry **s_annot1_arr_p, *region1_p, pre_annot1;
2800 struct annot_entry **s_annot0_arr_p, *region0_p, pre_annot0;
2801 struct annot_entry *this_annot_p;
2802 int i1_annot, v_delta, v_tmp;
2803 int have_push_features, prev_match;
2804 long i0_offset, i1_offset;
2805
2806 char *ann_comment;
2807
2808 *score_delta = 0;
2809
2810 if (ppst->ext_sq_set) {sq = ppst->sqx;}
2811 else {sq = ppst->sq;}
2812
2813 /* res[0] has start of protein sequence */
2814 /* res[1] has start of translated DNA sequence */
2815
2816 #ifndef TFAST
2817 aln->amin1 = aln->smin1 = a_res->min0; /* start in protein sequence */
2818 aln->amin0 = aln->smin0 = a_res->min1; /* start in DNA/codon sequence */
2819
2820 i0_offset = aln->q_offset;
2821 i1_offset = aln->l_offset;
2822
2823 ap0 = f_str->aa0v; /* computed codons -> ap0*/
2824 ap1 = aa1; /* protein sequence -> ap1 */
2825
2826 sp0 = seqc0; /* translated DNA */
2827 sp1 = seqc1; /* protein */
2828
2829 have_ann = (seqc0a != NULL && aa1a != NULL);
2830 ap1a = aa1a;
2831 sp1a = seqc1a; /* protein library can have annotation */
2832 sp0a = seqc0a; /* sp0a is always ' ' - no translated
2833 annotation */
2834 #else /* TFASTYZ */
2835 if (aln->frame == 0) {
2836 pre_com(aa1, n1, f_str->aa1v);
2837 }
2838 else {
2839 pre_com_r(aa1, n1, f_str->aa1v);
2840 }
2841
2842 aln->amin0 = aln->smin0 = a_res->min0; /* start in protein sequence */
2843 aln->amin1 = aln->smin1 = a_res->min1; /* start in codon sequence */
2844
2845 i1_offset = aln->q_offset;
2846 i0_offset = aln->l_offset;
2847
2848 ap1 = aa0; /* protein sequence */
2849 ap0 = f_str->aa1v; /* computed codons -> ap0*/
2850
2851 sp0 = seqc1; /* protein */
2852 sp1 = seqc0; /* translated DNA */
2853
2854 have_ann = (seqc0a != NULL && aa0a != NULL);
2855 ap1a = aa0a;
2856 sp1a = seqc0a; /* protein query can have annotation */
2857 sp0a = seqc1a; /* sp0a is always ' ' - no translated
2858 annotation */
2859 #endif
2860 spa = seqca;
2861 i_spa = cumm_seq_score;
2862
2863 rp = a_res->res; /* start of alignment info */
2864 rpmax = &a_res->res[a_res->nres]; /* end of alignment info */
2865
2866 lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
2867 i0 = a_res->min1-3; /* start of codon sequence */
2868 i1 = a_res->min0; /* start of protein sequence */
2869
2870 v_delta = 0;
2871 i1_annot = 0;
2872 region0_p = region1_p = NULL;
2873 s_annot0_arr_p = s_annot1_arr_p = NULL;
2874 annot_stack = NULL;
2875 have_push_features = prev_match = 0;
2876 if (have_ann) {
2877 if (annot1_p && annot1_p->n_annot > 0) annot_stack = init_stack(64,64);
2878 if (annot1_p && annot1_p->n_annot > 0) {
2879 s_annot1_arr_p = annot1_p->s_annot_arr_p;
2880 while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1) {
2881 if (s_annot1_arr_p[i1_annot]->label == '[') {
2882 memcpy(&pre_annot1,s_annot1_arr_p[i1_annot], sizeof(struct annot_entry));
2883 pre_annot1.pos = aln->amin1 + i1_offset;
2884 pre_annot1.a_pos = aln->amin0 + i0_offset;
2885 region1_p = &pre_annot1;
2886 region1_p->score = region1_p->n_aln = region1_p->n_ident = 0;
2887 }
2888 else if (s_annot1_arr_p[i1_annot]->label == ']') {
2889 region1_p = NULL;
2890 }
2891 i1_annot++;
2892 }
2893 }
2894 }
2895
2896 while (rp < rpmax ) {
2897 switch (*rp++) {
2898 case 3: /* match */
2899 i0 += 3;
2900
2901 *sp1 = sq[aap=ap1[i1]];
2902 *sp0 = f_str->weight_c[aap][ap0[i0]].c5;
2903 itmp = ppst->pam2[0][aap][pascii[*sp0]];
2904
2905
2906 if (have_ann) {
2907 *sp1a = ann_arr[ap1a[i1]];
2908 *sp0a = ' ';
2909 if (s_annot1_arr_p) {
2910 if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
2911 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
2912 i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
2913 i1_annot, annot1_p->n_annot, s_annot1_arr_p,
2914 &ann_comment, annot_stack, &have_push_features, &v_delta,
2915 ®ion1_p, &pre_annot1, 0);
2916
2917 /* must be out of the loop to capture the last value */
2918 if (ppst->sq[ap1[i1]] != *sp1) {
2919 t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
2920
2921 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
2922 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
2923 sq[ap1[i1]], sim_sym[t_spa], ann_comment,
2924 annot_var_dyn,1,1);
2925 }
2926 }
2927 prev_match = 1;
2928 if (region1_p) {region1_p->score += itmp;}
2929 }
2930 sp0a++; sp1a++;
2931 }
2932
2933 *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
2934
2935 if (region1_p) {
2936 region1_p->n_aln++;
2937 if (*spa == M_IDENT) {region1_p->n_ident++;}
2938 }
2939
2940 if (cumm_seq_score) *i_spa++ = itmp;
2941
2942 if (have_ann && have_push_features) {
2943 display_push_features(annot_stack, annot_var_dyn,
2944 i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
2945 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
2946 sim_sym[*spa], ®ion0_p, ®ion1_p,
2947 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
2948 have_push_features = 0;
2949 }
2950
2951 i1++;
2952 sp0++; sp1++; spa++;
2953 lenc++;
2954 break;
2955 case 2: /* frame shift +2, then match */
2956 nfs++;
2957 i0 += 2;
2958 *sp0++ = '/';
2959 *sp1++ = '-';
2960 *spa++ = M_DEL;
2961 if (cumm_seq_score) *i_spa++ = ppst->gshift;
2962
2963 if (have_ann) {*sp0a++ = *sp1a++ = ' ';}
2964 not_c++;
2965
2966 *sp1 = sq[aap=ap1[i1]];
2967 *sp0 = f_str->weight_c[aap][ap0[i0]].c2;
2968 itmp = ppst->pam2[0][pascii[*sp0]][aap];
2969
2970 if (have_ann) {
2971 *sp1a = ann_arr[ap1a[i1]];
2972 *sp0a = ' ';
2973 if (s_annot1_arr_p) {
2974 if (i1 + i1_offset == s_annot1_arr_p[i1_annot]->pos) {
2975 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
2976 i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
2977 i1_annot, annot1_p->n_annot, s_annot1_arr_p,
2978 &ann_comment, annot_stack, &have_push_features, &v_delta,
2979 ®ion1_p, &pre_annot1, 0);
2980
2981 /* must be out of the loop to capture the last value */
2982 if (ppst->sq[ap1[i1]] != *sp1) {
2983 t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
2984
2985 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
2986 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
2987 sq[ap1[i1]], sim_sym[t_spa], ann_comment,
2988 annot_var_dyn,1,DP_FULL_FMT);
2989 }
2990 }
2991 if (region1_p) {
2992 region1_p->score += ppst->gshift;
2993 region1_p->score += itmp;
2994 }
2995 prev_match = 1;
2996 }
2997 sp0a++; sp1a++;
2998 }
2999
3000 *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
3001
3002 if (region1_p) {
3003 region1_p->n_aln++;
3004 if (*spa == M_IDENT) {region1_p->n_ident++;}
3005 }
3006
3007 if (have_ann && have_push_features) {
3008 display_push_features(annot_stack, annot_var_dyn,
3009 i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
3010 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
3011 sim_sym[*spa], ®ion0_p, ®ion1_p,
3012 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
3013 have_push_features = 0;
3014 }
3015
3016 if (cumm_seq_score) *i_spa++ = itmp;
3017
3018 i1++;
3019 sp0++; sp1++; spa++;
3020 lenc++;
3021 break;
3022 case 4: /* frame shift, -1, then match */
3023 nfs++;
3024 i0 += 4;
3025 if (have_ann) {
3026 *sp1a++ = *sp0a++ = ' ';
3027 }
3028 *sp0++ = '\\';
3029 *sp1++ = '-';
3030 *spa++ = M_DEL;
3031 if (cumm_seq_score) *i_spa++ = ppst->gshift;
3032
3033 not_c++;
3034
3035 *sp1 = sq[aap=ap1[i1]];
3036 *sp0 = f_str->weight_c[aap][ap0[i0]].c4;
3037 itmp = ppst->pam2[0][pascii[*sp0]][aap];
3038
3039 if (have_ann) {
3040 *sp1a = ann_arr[ap1a[i1]];
3041 *sp0a = ' ';
3042 if (s_annot1_arr_p && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3043 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[*sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
3044 i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
3045 i1_annot, annot1_p->n_annot, s_annot1_arr_p, &ann_comment,
3046 annot_stack, &have_push_features, &v_delta, ®ion1_p, &pre_annot1, 0);
3047
3048 /* must be out of the loop to capture the last value */
3049 if (ppst->sq[ap1[i1]] != *sp1) {
3050 t_spa = align_type(itmp, *sp0, *sp1, 0, NULL, ppst->pam_x_id_sim);
3051 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
3052 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
3053 sq[ap1[i1]], sim_sym[t_spa], ann_comment,
3054 annot_var_dyn,1,DP_FULL_FMT);
3055 }
3056 prev_match = 1;
3057 if (region1_p) {region1_p->score += itmp;}
3058 }
3059 sp0a++; sp1a++;
3060 }
3061
3062 *spa = align_type(itmp, *sp0, *sp1, 0, aln, ppst->pam_x_id_sim);
3063 if (region1_p) {
3064 region1_p->n_aln++;
3065 if (*spa == M_IDENT) {region1_p->n_ident++;}
3066 }
3067
3068 if (cumm_seq_score) *i_spa++ = itmp;
3069
3070 if (have_ann && have_push_features) {
3071 display_push_features(annot_stack, annot_var_dyn,
3072 i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
3073 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
3074 sim_sym[*spa], ®ion0_p, ®ion1_p,
3075 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
3076 have_push_features = 0;
3077 }
3078
3079 i1++;
3080 sp0++; sp1++; spa++;
3081 lenc++;
3082 break;
3083 case 5: /* insertion in 0 */
3084 if (have_ann) {
3085 *sp1a++ = *sp0a++ = ' ';
3086 }
3087 i0 += 3;
3088 *sp0++ = f_str->weight_c[0][ap0[i0]].c3;
3089 *sp1++ = '-';
3090 *spa++ = M_DEL;
3091 lenc++;
3092 ngap_p++;
3093 if (cumm_seq_score) *i_spa++ = ppst->gdelval;
3094 break;
3095 case 0: /* insertion in 1 */
3096 *sp0++ = '-';
3097 *sp1++ = sq[ap1[i1]];
3098 *spa++ = M_DEL;
3099 if (cumm_seq_score) {
3100 if (prev_match) *i_spa = ppst->gdelval;
3101 *i_spa++ += ppst->gdelval;
3102 }
3103
3104 if (have_ann) {
3105 *sp0a = ' ';
3106 *sp1a = ann_arr[ap1a[i1]];
3107
3108 if (s_annot1_arr_p) {
3109 /* coordiates are much more complex for next_annot_match,
3110 and comment_var, because they may need to be reversed */
3111
3112 if (i1 + i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3113 i1_annot = next_annot_match(&itmp, ppst->pam2[0][ap0[i0]], i1_offset+seq_pos(i1,aln->llrev,0),
3114 i0_offset+seq_pos(i0,aln->qlrev,0), sp1, sp1a, sq,
3115 i1_annot, annot1_p->n_annot, s_annot1_arr_p,
3116 &ann_comment, annot_stack, &have_push_features, &v_delta,
3117 ®ion1_p, &pre_annot1, 0);
3118 }
3119
3120 if (region1_p) {
3121 if (prev_match) region1_p->score += ppst->gdelval;
3122 region1_p->score += ppst->ggapval;
3123 region1_p->n_aln++;
3124 }
3125 prev_match = 0;
3126 }
3127 sp0a++; sp1a++;
3128 }
3129
3130 if (have_ann && have_push_features) {
3131 display_push_features(annot_stack, annot_var_dyn,
3132 i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
3133 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
3134 sim_sym[*spa], ®ion0_p, ®ion1_p,
3135 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
3136 have_push_features = 0;
3137 }
3138
3139 i1++;
3140 lenc++;
3141 ngap_d++;
3142 break;
3143 }
3144 }
3145
3146 if (have_ann) {
3147 *sp0a = '\0';
3148 if (s_annot1_arr_p) {
3149 have_push_features = 0;
3150 while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < n1) {
3151 if (s_annot1_arr_p[i1_annot]->label == '[') break;
3152 if (s_annot1_arr_p[i1_annot]->label == ']') {
3153 push_stack(annot_stack, s_annot1_arr_p[i1_annot]);
3154 have_push_features = 1;
3155 }
3156 i1_annot++;
3157 }
3158 if (have_push_features) {
3159 display_push_features(annot_stack, annot_var_dyn,
3160 i0_offset+seq_pos(i0,aln->qlrev,0), *sp0,
3161 i1_offset+seq_pos(i1,aln->llrev,0), *sp1,
3162 sim_sym[*spa],®ion0_p, ®ion1_p,
3163 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, DP_FULL_FMT);
3164 have_push_features = 0;
3165 }
3166 }
3167 }
3168 *spa = '\0';
3169
3170 #ifndef TFAST
3171 aln->amax0 = i0+3; /* end of codon sequence */
3172 aln->amax1 = i1; /* end of protein sequence */
3173 aln->ngap_q = ngap_d;
3174 aln->ngap_l = ngap_p;
3175 #else
3176 aln->amax1 = i0+3; /* end of codon sequence */
3177 aln->amax0 = i1; /* end of protein sequence */
3178 aln->ngap_q = ngap_p;
3179 aln->ngap_l = ngap_d;
3180 #endif
3181 aln->calc_last_set = 1;
3182 aln->nfs = nfs;
3183 aln->amin0 = aln->smin0;
3184 aln->amin1 = aln->smin1;
3185
3186 *score_delta = v_delta;
3187
3188 free_stack(annot_stack);
3189
3190 if (lenc < 0) lenc = 1;
3191
3192 *nc = lenc;
3193 /* now we have the middle, get the right end */
3194
3195 return lenc+not_c;
3196 }
3197
3198 void
3199 calc_astruct(struct a_struct *aln_p, struct a_res_str *a_res_p, struct f_struct *f_str) {
3200
3201 aln_p->calc_last_set = 0;
3202
3203 #ifndef TFAST /* FASTX */
3204 aln_p->amin1 = a_res_p->min0; /* prot */
3205 aln_p->amin0 = a_res_p->min1; /* DNA */
3206 aln_p->amax1 = a_res_p->max0; /* prot */
3207 aln_p->amax0 = a_res_p->max1; /* DNA */
3208 #else /* TFASTX */
3209 aln_p->amin0 = a_res_p->min0; /* DNA */
3210 aln_p->amin1 = a_res_p->min1; /* prot */
3211 aln_p->amax0 = a_res_p->max0; /* DNA */
3212 aln_p->amax1 = a_res_p->max1; /* prot */
3213 #endif
3214 }
3215
3216 /* build an array of match/ins/del - length strings */
3217
3218 /* modified 10-June-2014 to distinguish matches from mismatches, op=1
3219 (previously unused) indicates an aligned non-identity */
3220
3221 /* op_codes are: 0 - aa insertion
3222 1 - (now) aligned non-identity
3223 2 - -1 frameshift
3224 3 - aligned identity
3225 4 - +1 frameshift
3226 5 - codon insertion
3227 */
3228
3229 static struct update_code_str *
3230 init_update_data(show_code) {
3231
3232 struct update_code_str *update_data_p;
3233
3234 if ((update_data_p = (struct update_code_str *)calloc(1,sizeof(struct update_code_str)))==NULL) {
3235 fprintf(stderr,"*** error [%s:%d] - init_update_data(): cannot allocate update_code_str\n",
3236 __FILE__, __LINE__);
3237 return NULL;
3238 }
3239
3240 update_data_p->p_op_cnt = 0;
3241 update_data_p->show_code = show_code;
3242
3243 if ((show_code & SHOW_CODE_MASK) == SHOW_CODE_CIGAR) {
3244 update_data_p->op_map = cigar_code;
3245 update_data_p->cigar_order = 1;
3246 }
3247 else {
3248 update_data_p->op_map = ori_code;
3249 update_data_p->cigar_order = 0;
3250 }
3251
3252 if ((show_code & SHOW_CODE_EXT) == SHOW_CODE_EXT) {
3253 update_data_p->show_ext = 1;
3254 }
3255 else {
3256 update_data_p->show_ext = 0;
3257 }
3258
3259 return update_data_p;
3260 }
3261
3262 static void
3263 close_update_data(char *al_str, int al_str_max,
3264 struct update_code_str *up_dp) {
3265 char tmp_cnt[MAX_SSTR];
3266
3267 if (!up_dp) return;
3268 sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx, up_dp->p_op_cnt);
3269 strncat(al_str,tmp_cnt,al_str_max);
3270
3271 free(up_dp);
3272 }
3273
3274 /* update_indel_code() has been modified to work more correctly with
3275 ggsearch/glsearch, which, because alignments can start with either
3276 insertions or deletions, can produce an initial code of "0=". When
3277 that happens, it is ignored and no code is added.
3278
3279 *al_str - alignment string [al_str_max] - not dynamic
3280 op -- encoded operation, currently 0=match, 1-delete, 2-insert, 3-term-match, 4-mismatch
3281 op_cnt -- length of run
3282 show_code -- SHOW_CODE_CIGAR uses cigar_code, otherwise legacy
3283 */
3284
3285 /* update_indel_code() is called for insertions and deletions
3286 update_match_code() is called for every match
3287 */
3288
3289 static void
3290 sprintf_code(char *tmp_str, struct update_code_str *up_dp, int op_idx, int op_cnt) {
3291
3292 if (op_cnt == 0) return;
3293
3294 if (up_dp->cigar_order) {
3295 sprintf(tmp_str,"%d%c",op_cnt,up_dp->op_map[op_idx]);
3296 }
3297 else {
3298 sprintf(tmp_str,"%c%d",up_dp->op_map[op_idx],op_cnt);
3299 }
3300 }
3301
3302 static void
3303 update_code(char *al_str, int al_str_max,
3304 struct update_code_str *up_dp, int op,
3305 int sim_code, unsigned char sp0, unsigned char sp1)
3306 {
3307 char tmp_cnt[MAX_SSTR];
3308
3309 /* there are two kinds of "op's", one time and accumulating */
3310 /* op == 2, 4 are one-time: */
3311
3312 switch (op) {
3313 case 2:
3314 case 4:
3315 sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
3316 strncat(al_str,tmp_cnt,al_str_max);
3317 sprintf_code(tmp_cnt,up_dp, op, 1);
3318 strncat(al_str,tmp_cnt,al_str_max);
3319 up_dp->p_op_cnt = 0;
3320 break;
3321 case 0:
3322 case 5:
3323 if (op == up_dp->p_op_idx) {
3324 up_dp->p_op_cnt++;
3325 }
3326 else {
3327 sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
3328 strncat(al_str,tmp_cnt,al_str_max);
3329 up_dp->p_op_idx = op;
3330 up_dp->p_op_cnt = 1;
3331 }
3332 break;
3333 case 1:
3334 case 3:
3335 if (sp0 != '*' && sp1 != '*') { /* default case, not termination */
3336 if (up_dp->show_ext) {
3337 if (sim_code != M_IDENT) { op = 1;}
3338 }
3339 }
3340 else { /* have a termination codon, output for !SHOW_CODE_CIGAR */
3341 if (!up_dp->cigar_order) {
3342 if (sp0 == '*' || sp1 == '*') { op = 6;}
3343 }
3344 else if (up_dp->show_ext && (sp0 != sp1)) { op = 1;}
3345 }
3346
3347 if (up_dp->p_op_cnt == 0) {
3348 up_dp->p_op_idx = op;
3349 up_dp->p_op_cnt = 1;
3350 }
3351 else if (op != up_dp->p_op_idx) {
3352 sprintf_code(tmp_cnt,up_dp, up_dp->p_op_idx,up_dp->p_op_cnt);
3353 strncat(al_str,tmp_cnt,al_str_max);
3354 up_dp->p_op_idx = op;
3355 up_dp->p_op_cnt = 1;
3356 }
3357 else {
3358 up_dp->p_op_cnt++;
3359 }
3360 break;
3361 }
3362 return;
3363 }
3364
3365 int calc_code(const unsigned char *aa0, int n0,
3366 const unsigned char *aa1, int n1,
3367 struct a_struct *aln,
3368 struct a_res_str *a_res,
3369 struct pstruct *ppst,
3370 char *al_str, int al_str_n,
3371 const unsigned char *ann_arr,
3372 const unsigned char *aa0a,
3373 const struct annot_str *annot0_p,
3374 const unsigned char *aa1a,
3375 const struct annot_str *annot1_p,
3376 struct dyn_string_str *annot_code_dyn,
3377 int *score_delta,
3378 struct f_struct *f_str,
3379 void *pstat_void,
3380 int display_code)
3381 {
3382 int i0, i1;
3383 int lenc, not_c, ngap_d, ngap_p, nfs;
3384 char sp0, sp1;
3385 struct update_code_str *update_data_p;
3386 char op_char[10], ann_ch0, ann_ch1;
3387 unsigned char aap;
3388 const unsigned char *ap0, *ap1, *ap1a;
3389 int *rp, *rpmax;
3390 const unsigned char *sq;
3391 int have_ann = 0;
3392 char tmp_astr[MAX_STR];
3393 int sim_code, t_spa;
3394 int show_code, annot_fmt;
3395 char *sim_sym= aln_map_sym[MX_ACC];
3396 /* variables for variant changes */
3397 void *annot_stack;
3398 struct annot_entry **s_annot1_arr_p, *region1_p, pre_annot1;
3399 struct annot_entry **s_annot0_arr_p, *region0_p, pre_annot0;
3400 int itmp, i1_annot, v_delta, v_tmp;
3401 int have_push_features, prev_match;
3402 long i0_offset, i1_offset;
3403
3404 *score_delta = 0;
3405
3406 show_code = (display_code & SHOW_CODE_MASK + SHOW_CODE_EXT);
3407 annot_fmt = 2;
3408 if (display_code & SHOW_ANNOT_FULL) {
3409 annot_fmt = 1;
3410 }
3411
3412 if (ppst->ext_sq_set) {sq = ppst->sqx;}
3413 else {sq = ppst->sq;}
3414
3415 /* don't fill in the ends */
3416 #ifndef TFAST
3417 ap0 = f_str->aa0v; /* computed codons -> ap0*/
3418 ap1 = aa1; /* protein sequence -> ap1 */
3419 aln->smin1 = a_res->min0; /* start in protein sequence */
3420 aln->smin0 = a_res->min1; /* start in DNA/codon sequence */
3421
3422 i0_offset = aln->q_offset;
3423 i1_offset = aln->l_offset;
3424
3425 have_ann = (ann_arr[0] != '\0' && aa1a != NULL);
3426 ap1a = aa1a;
3427 #else /* TFASTYZ */
3428 if (aln->frame == 0) { pre_com(aa1, n1, f_str->aa1v);}
3429 else { pre_com_r(aa1, n1, f_str->aa1v);}
3430
3431 ap0 = f_str->aa1v; /* computed codons -> ap0*/
3432 ap1 = aa0; /* protein sequence */
3433 aln->smin0 = a_res->min0; /* start in protein sequence */
3434 aln->smin1 = a_res->min1; /* start in codon sequence */
3435
3436 i1_offset = aln->q_offset;
3437 i0_offset = aln->l_offset;
3438
3439 have_ann = (ann_arr[0] != '\0' && aa0a != NULL);
3440 ap1a = aa0a;
3441 #endif
3442
3443 rp = a_res->res; /* start of alignment info */
3444 rpmax = &a_res->res[a_res->nres]; /* end of alignment info */
3445
3446 /* now get the middle */
3447
3448 lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
3449 update_data_p = init_update_data(show_code);
3450
3451 i0 = a_res->min1-3; /* start of codon sequence */
3452 i1 = a_res->min0; /* start of protein sequence */
3453
3454 v_delta = 0;
3455 i1_annot = 0;
3456 have_push_features = prev_match = 0;
3457 region0_p = region1_p = NULL;
3458 s_annot0_arr_p = s_annot1_arr_p = NULL;
3459 annot_stack = NULL;
3460 if (have_ann) {
3461 if (annot1_p && annot1_p->n_annot > 0) annot_stack = init_stack(64,64);
3462 if (annot1_p && annot1_p->n_annot > 0) {
3463 s_annot1_arr_p = annot1_p->s_annot_arr_p;
3464 while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1) {
3465 if (s_annot1_arr_p[i1_annot]->label == '[') {
3466 memcpy(&pre_annot1,s_annot1_arr_p[i1_annot], sizeof(struct annot_entry));
3467 pre_annot1.pos = aln->amin1 + i1_offset;
3468 pre_annot1.a_pos = aln->amin0 + i0_offset;
3469 region1_p = &pre_annot1;
3470 region1_p->score = region1_p->n_aln = region1_p->n_ident = 0;
3471 }
3472 else if (s_annot1_arr_p[i1_annot]->label == ']') {
3473 region1_p = NULL;
3474 }
3475 i1_annot++;
3476 }
3477 }
3478 }
3479
3480 while (rp < rpmax ) {
3481 switch (*rp++) {
3482 case 0: /* insert in 0 */
3483 sim_code = 5; /* indel code */
3484 update_code(al_str, al_str_n-strlen(al_str), update_data_p, 0, sim_code,'-','-');
3485
3486 if (s_annot1_arr_p) {
3487 if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3488 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
3489 i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3490 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3491 annot_stack, &have_push_features, &v_delta,
3492 ®ion1_p, &pre_annot1, 0);
3493 }
3494
3495 if (region1_p) {
3496 if (prev_match) region1_p->score += ppst->gdelval;
3497 region1_p->score += ppst->ggapval;
3498 region1_p->n_aln++;
3499 }
3500 prev_match = 0;
3501
3502 if (have_push_features) {
3503 display_push_features(annot_stack, annot_code_dyn,
3504 i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3505 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3506 sim_sym[sim_code], ®ion0_p, ®ion1_p,
3507 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
3508 have_push_features = 0;
3509 }
3510 }
3511
3512 i1++;
3513 lenc++;
3514 ngap_d++;
3515 break;
3516
3517 case 2: /* -1 frame shift */
3518 update_code(al_str, al_str_n-strlen(al_str), update_data_p, 2, sim_code,'-','-');
3519
3520 nfs++;
3521 i0 += 2;
3522 not_c++;
3523
3524 sp1 = ppst->sq[aap=ap1[i1]];
3525 sp0 = f_str->weight_c[aap][ap0[i0]].c2;
3526 itmp = ppst->pam2[0][pascii[sp0]][aap];
3527
3528 if (s_annot1_arr_p) {
3529 if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3530 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
3531 i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3532 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3533 annot_stack, &have_push_features, &v_delta,
3534 ®ion1_p, &pre_annot1, 0);
3535 }
3536
3537 if (sq[aap] != sp1) {
3538 t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
3539
3540 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3541 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3542 sq[aap], sim_sym[t_spa], NULL,
3543 annot_code_dyn,1,annot_fmt);
3544 }
3545
3546 if (region1_p) {
3547 region1_p->score += ppst->gshift;
3548 region1_p->score += itmp;
3549 }
3550 prev_match = 1;
3551 }
3552
3553 sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3554 if (region1_p) {
3555 region1_p->n_aln++;
3556 if (sim_code == M_IDENT) {region1_p->n_ident++;}
3557 }
3558
3559 /* check for an annotation */
3560 if (have_ann && !(ann_arr[aa1a[i1]] == ' ' || ann_arr[aa1a[i1]] == '[' || ann_arr[aa1a[i1]] == ']')) {
3561 #ifndef TFAST
3562 sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
3563 ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
3564 #else
3565 sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
3566 ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
3567
3568 #endif
3569 /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
3570 dyn_strcat(annot_code_dyn, tmp_astr);
3571 }
3572
3573 if (s_annot1_arr_p && have_push_features) {
3574 display_push_features(annot_stack, annot_code_dyn,
3575 i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3576 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3577 sim_sym[sim_code], ®ion0_p, ®ion1_p,
3578 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
3579 have_push_features = 0;
3580 }
3581
3582 i1++;
3583 lenc++;
3584 break;
3585
3586 case 3: /* match */
3587 i0 += 3;
3588
3589 sp1 = ppst->sq[aap=ap1[i1]];
3590 sp0 = f_str->weight_c[aap][ap0[i0]].c5;
3591 itmp = ppst->pam2[0][aap][pascii[sp0]];
3592
3593 if (s_annot1_arr_p) {
3594 if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3595 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
3596 i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3597 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3598 annot_stack, &have_push_features, &v_delta,
3599 ®ion1_p, &pre_annot1, 0);
3600 }
3601
3602 if (sq[aap] != sp1) {
3603 t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
3604
3605 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3606 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3607 sq[aap], sim_sym[t_spa], NULL, annot_code_dyn,
3608 1,annot_fmt);
3609 }
3610
3611 if (region1_p) {region1_p->score += itmp;}
3612 prev_match = 1;
3613 }
3614
3615 sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3616 if (region1_p) {
3617 region1_p->n_aln++;
3618 if (sim_code == M_IDENT) {region1_p->n_ident++;}
3619 }
3620
3621 /* check for an annotation */
3622 if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
3623 #ifndef TFAST
3624 sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
3625 ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
3626 #else
3627 sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
3628 ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
3629
3630 #endif
3631 /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
3632 dyn_strcat(annot_code_dyn, tmp_astr);
3633 }
3634
3635 if (s_annot1_arr_p && have_push_features) {
3636 display_push_features(annot_stack, annot_code_dyn,
3637 i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3638 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3639 sim_sym[sim_code], ®ion0_p, ®ion1_p,
3640 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
3641 have_push_features = 0;
3642 }
3643
3644 update_code(al_str, al_str_n-strlen(al_str), update_data_p, 3, sim_code,sp0,sp1);
3645
3646 i1++;
3647 lenc++;
3648 break;
3649
3650 case 4: /* +1 frame shift */
3651 /* finish previous run */
3652 update_code(al_str, al_str_n-strlen(al_str), update_data_p, 4, sim_code,'-','-');
3653 /* mark frameshift */
3654
3655 nfs++;
3656 i0 += 4;
3657 not_c++;
3658
3659 sp1 = ppst->sq[aap=ap1[i1]];
3660 sp0 = f_str->weight_c[aap][ap0[i0]].c2;
3661 itmp = ppst->pam2[0][aap][pascii[sp0]];
3662
3663 if (s_annot1_arr_p) {
3664 if (i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3665 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0),
3666 i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3667 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3668 annot_stack, &have_push_features, &v_delta, ®ion1_p, &pre_annot1, 0);
3669 }
3670
3671 if (sq[aap] != sp1) {
3672 t_spa = align_type(itmp, sp0, sp1, 0, NULL, ppst->pam_x_id_sim);
3673
3674 comment_var(i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3675 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3676 sq[aap], sim_sym[t_spa], NULL, annot_code_dyn,
3677 1,annot_fmt);
3678 }
3679
3680 if (region1_p) {
3681 region1_p->score += ppst->gshift;
3682 region1_p->score += itmp;
3683 }
3684 prev_match = 1;
3685 }
3686
3687 sim_code = align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3688 if (region1_p) {
3689 region1_p->n_aln++;
3690 if (sim_code == M_IDENT) {region1_p->n_ident++;}
3691 }
3692
3693 if (have_ann && !(ann_arr[ap1a[i1]] == ' ' || ann_arr[ap1a[i1]]=='[' || ann_arr[ap1a[i1]]==']')) {
3694 #ifndef TFAST
3695 sprintf(tmp_astr, "|X%c:%ld%c%c%ld%c",
3696 ann_arr[ap1a[i1]],i0_offset+seq_pos(i0,aln->qlrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i1,aln->llrev,0)+1,sp1);
3697 #else
3698 sprintf(tmp_astr, "|%cX:%ld%c%c%ld%c",
3699 ann_arr[ap1a[i1]],i0_offset+seq_pos(i1,aln->llrev,0)+1,sp0,sim_sym[sim_code],i1_offset+seq_pos(i0,aln->qlrev,0)+1,sp1);
3700
3701 #endif
3702 /* SAFE_STRNCAT(annot_code_s, tmp_astr, n_annot_code_s); */
3703 dyn_strcat(annot_code_dyn, tmp_astr);
3704 }
3705
3706 if (s_annot1_arr_p && have_push_features) {
3707 display_push_features(annot_stack, annot_code_dyn,
3708 i0_offset+seq_pos(i0,aln->qlrev,0), sp0,
3709 i1_offset+seq_pos(i1,aln->llrev,0), sp1,
3710 sim_sym[sim_code], ®ion0_p, ®ion1_p,
3711 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
3712 have_push_features = 0;
3713 }
3714
3715 i1++;
3716 lenc++;
3717 break;
3718
3719 case 5: /* insert in 1 */
3720 sim_code = 5;
3721 update_code(al_str, al_str_n-strlen(al_str), update_data_p, 5, sim_code,'-','-');
3722
3723 i0 += 3;
3724 lenc++;
3725 ngap_p++;
3726 break;
3727 }
3728 }
3729
3730 close_update_data(al_str, al_str_n-strlen(al_str), update_data_p);
3731
3732 #ifndef TFAST
3733 aln->amax0 = i0+3; /* end of codon sequence */
3734 aln->amax1 = i1; /* end of protein sequence */
3735 aln->ngap_q = ngap_d;
3736 aln->ngap_l = ngap_p;
3737 #else
3738 aln->amax1 = i0+3; /* end of codon sequence */
3739 aln->amax0 = i1; /* end of protein sequence */
3740 aln->ngap_q = ngap_p;
3741 aln->ngap_l = ngap_d;
3742 #endif
3743 aln->calc_last_set = 1;
3744 aln->nfs = nfs;
3745 aln->amin0 = aln->smin0;
3746 aln->amin1 = aln->smin1;
3747
3748 *score_delta = v_delta;
3749
3750 if (have_ann) {
3751 have_push_features = 0;
3752 if (s_annot1_arr_p) {
3753 /* also check for regions after alignment */
3754 while (i1_annot < annot1_p->n_annot && s_annot1_arr_p[i1_annot]->pos < i1_offset+n1) {
3755 if (s_annot1_arr_p[i1_annot]->label == '[') break;
3756 if (s_annot1_arr_p[i1_annot]->label == ']') {
3757 push_stack(annot_stack, s_annot1_arr_p[i1_annot]);
3758 have_push_features = 1;
3759 }
3760 i1_annot++;
3761 }
3762 }
3763
3764 if (have_push_features) {
3765 display_push_features(annot_stack, annot_code_dyn,
3766 i0_offset+a_res->max0-1, sp0,
3767 i1_offset+a_res->max1-1, sp1,
3768 sim_sym[sim_code], ®ion0_p, ®ion1_p,
3769 a_res->rst.score[ppst->score_ix], a_res->rst.comp, n0, n1, pstat_void, annot_fmt);
3770 }
3771
3772 if (!annot_stack) free_stack(annot_stack);
3773 }
3774
3775
3776 if (lenc < 0) lenc = 1;
3777
3778 /* now we have the middle, get the right end */
3779
3780 return lenc;
3781 }
3782
3783 int calc_id(const unsigned char *aa0, int n0,
3784 const unsigned char *aa1, int n1,
3785 struct a_struct *aln,
3786 struct a_res_str *a_res,
3787 struct pstruct *ppst,
3788 const struct annot_str *annot0_p,
3789 const struct annot_str *annot1_p,
3790 int *score_delta,
3791 struct dyn_string_str *annot_var_dyn,
3792 struct f_struct *f_str)
3793 {
3794 int i0, i1;
3795 int lenc, not_c, ngap_d, ngap_p, nfs;
3796 char sp0, sp1;
3797 unsigned char aap;
3798 const unsigned char *ap0, *ap1;
3799 int *rp, *rpmax;
3800
3801 int aa1c;
3802 /* variables for variant changes */
3803 struct annot_entry **s_annot1_arr_p;
3804 int itmp, i1_annot, v_delta, v_tmp;
3805 long i0_offset, i1_offset;
3806
3807 char tmp_str[MAX_SSTR];
3808 const unsigned char *sq;
3809
3810 *score_delta = 0;
3811 NULL_dyn_string(annot_var_dyn);
3812
3813 if (ppst->ext_sq_set) {sq = ppst->sqx;}
3814 else {sq = ppst->sq;}
3815
3816 /* don't fill in the ends */
3817 #ifndef TFAST /* FASTYZ */
3818 ap0 = f_str->aa0v; /* computed codons -> ap0*/
3819 ap1 = aa1; /* protein sequence -> ap1 */
3820 aln->smin1 = a_res->min0; /* start in protein sequence */
3821 aln->smin0 = a_res->min1; /* start in DNA/codon sequence */
3822
3823 i0_offset = aln->q_offset;
3824 i1_offset = aln->l_offset;
3825 #else /* TFASTYZ */
3826
3827 if (aln->frame == 0) {
3828 pre_com(aa1, n1, f_str->aa1v);
3829 }
3830 else {
3831 pre_com_r(aa1, n1, f_str->aa1v);
3832 }
3833
3834 ap0 = f_str->aa1v; /* computed codons -> ap0*/
3835 ap1 = aa0; /* protein sequence */
3836 aln->smin0 = a_res->min0; /* start in protein sequence */
3837 aln->smin1 = a_res->min1; /* start in codon sequence */
3838
3839 i1_offset = aln->q_offset;
3840 i0_offset = aln->l_offset;
3841 #endif
3842
3843 rp = a_res->res; /* start of alignment info */
3844 rpmax = &a_res->res[a_res->nres]; /* end of alignment info */
3845
3846 /* now get the middle */
3847
3848 lenc = not_c = aln->nident = aln->nmismatch = aln->nsim = aln->npos = ngap_d = ngap_p = nfs = 0;
3849 i0 = a_res->min1-3; /* start of codon sequence */
3850 i1 = a_res->min0; /* start of protein sequence */
3851
3852 v_delta = 0;
3853 i1_annot = 0;
3854 s_annot1_arr_p = NULL;
3855 if (annot1_p && annot1_p->n_annot > 0) s_annot1_arr_p = annot1_p->s_annot_arr_p;
3856
3857 while (rp < rpmax ) {
3858 switch (*rp++) {
3859 case 3: /* match */
3860 i0 += 3;
3861 sp1 = ppst->sq[aap=ap1[i1]];
3862 sp0 = f_str->weight_c[aap][ap0[i0]].c5;
3863
3864 itmp = ppst->pam2[0][pascii[sp0]][aap];
3865
3866 if (s_annot1_arr_p && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3867 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3868 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3869 NULL, NULL, &v_delta,NULL, NULL, 0);
3870
3871 if (ppst->sq[aap] != sp1) {
3872 sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
3873 /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
3874 dyn_strcat(annot_var_dyn, tmp_str);
3875 }
3876 }
3877
3878 align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3879
3880 i1++;
3881 lenc++;
3882 break;
3883 case 2:
3884 nfs++;
3885 i0 += 2;
3886 not_c++;
3887 sp1 = ppst->sq[aap=ap1[i1]];
3888 sp0 = f_str->weight_c[aap][ap0[i0]].c2;
3889
3890 itmp = ppst->pam2[0][aap][pascii[sp0]];
3891
3892 if (s_annot1_arr_p && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3893 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3894 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3895 NULL, NULL, &v_delta,NULL, NULL, 0);
3896
3897 if (ppst->sq[aap] != sp1) {
3898 sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
3899 /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
3900 dyn_strcat(annot_var_dyn, tmp_str);
3901 }
3902 }
3903
3904 align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3905
3906 i1++;
3907 lenc++;
3908 break;
3909 case 4:
3910 nfs++;
3911 i0 += 4;
3912 not_c++;
3913 sp1 = ppst->sq[aap=ap1[i1]];
3914 sp0 = f_str->weight_c[aap][ap0[i0]].c4;
3915 itmp = ppst->pam2[0][pascii[sp0]][aap];
3916
3917 if (s_annot1_arr_p && i1+i1_offset == s_annot1_arr_p[i1_annot]->pos) {
3918 i1_annot = next_annot_match(&itmp, ppst->pam2[0][pascii[sp0]], i1_offset+seq_pos(i1,aln->llrev,0), i0_offset+seq_pos(i0,aln->qlrev,0), &sp1, NULL, sq,
3919 i1_annot, annot1_p->n_annot, s_annot1_arr_p, NULL,
3920 NULL, NULL, &v_delta,NULL, NULL, 0);
3921
3922 if (ppst->sq[aap] != sp1) {
3923 sprintf(tmp_str,"%c%d%c;",ppst->sq[aap],i1+1,sp1);
3924 /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
3925 dyn_strcat(annot_var_dyn, tmp_str);
3926 }
3927 }
3928
3929 align_type(itmp, sp0, sp1, 0, aln, ppst->pam_x_id_sim);
3930
3931 i1++;
3932 lenc++;
3933 break;
3934 case 5:
3935 i0 += 3;
3936 lenc++;
3937 ngap_p++;
3938 break;
3939 case 0:
3940 i1++;
3941 lenc++;
3942 ngap_d++;
3943 break;
3944 }
3945 }
3946
3947 #ifndef TFAST
3948 aln->amax0 = i0+3; /* end of codon sequence */
3949 aln->amax1 = i1; /* end of protein sequence */
3950 aln->ngap_q = ngap_d;
3951 aln->ngap_l = ngap_p;
3952 #else
3953 aln->amax1 = i0+3; /* end of codon sequence */
3954 aln->amax0 = i1; /* end of protein sequence */
3955 aln->ngap_q = ngap_p;
3956 aln->ngap_l = ngap_d;
3957 #endif
3958 aln->calc_last_set = 1;
3959 aln->nfs = nfs;
3960 aln->amin0 = aln->smin0;
3961 aln->amin1 = aln->smin1;
3962
3963 if (lenc < 0) lenc = 1;
3964
3965 /* now we have the middle, get the right end */
3966
3967 return lenc;
3968 }
3969