1 /* $Id: compacc2.c 1280 2014-08-21 00:47:55Z wrp $ */
2
3 /* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
4 The Rector & Visitors of the University of Virginia */
5
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing,
13 software distributed under this License is distributed on an "AS
14 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
15 express or implied. See the License for the specific language
16 governing permissions and limitations under the License.
17 */
18
19 /* Concurrent read version */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #if defined(UNIX)
24 #include <unistd.h>
25 #endif
26 #if defined(UNIX) || defined(WIN32)
27 #include <sys/types.h>
28 #endif
29
30 #include <limits.h>
31 #include <ctype.h>
32 #include <float.h>
33
34 #include <string.h>
35 #include <time.h>
36 #include <math.h>
37
38 #include "defs.h"
39 #include "param.h"
40 #include "structs.h"
41
42 #include "mm_file.h"
43 #include "best_stats.h"
44
45 #define XTERNAL
46 #include "uascii.h"
47 #include "upam.h"
48 #undef XTERNAL
49
50 #ifdef DEBUG
51 extern char ext_qtitle[];
52 #endif
53
54 extern void abort ();
55
56 #include "drop_func.h" /* get init_work() */
57 /* drop_func.h includes dyn_string.h */
58
59 void revcomp(unsigned char *seq, int n, int *c_nt);
60 extern void qshuffle(unsigned char *aa0, int n0, int nm0, void *);
61 #ifdef DEBUG
62 unsigned long adler32(unsigned long, const unsigned char *, unsigned int);
63 #endif
64
65 void
66 s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line);
67
68 extern void add_annot_def(struct mngmsg *m_msp, char *line, int qa_flag);
69 int add_annot_char(unsigned char *ann_arr, char ctmp_label);
70
71 int get_annot(char *sname, struct mngmsg *, char *bline, long offset, int n1,
72 struct annot_str **annot_p,int target, int debug);
73 int
74 get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr,
75 int nbest,int target, int debug);
76 void
77 print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long mem_use);
78 int
79 check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str);
80 /* print timing information */
81 extern void ptime (FILE *, long);
82
83 /* this function consolidates code in comp_lib4.c for non-threaded, and in
84 work_thr2.c (threads) and work_comp2.c (worker nodes)
85 */
86
87 void
init_aa0(unsigned char ** aa0,int n0,int nm0,unsigned char ** aa0s,unsigned char ** aa1s,int qframe,int qshuffle_flg,int max_tot,struct pstruct * ppst,void ** f_str,void ** qf_str,void * my_rand_state)88 init_aa0(unsigned char **aa0, int n0, int nm0,
89 unsigned char **aa0s, unsigned char **aa1s,
90 int qframe, int qshuffle_flg, int max_tot,
91 struct pstruct *ppst, void **f_str, void **qf_str,
92 void *my_rand_state) {
93 int id;
94
95 /* note that aa[5,4,3,2] are never used, but are provided so that frame
96 can range from 0 .. 5; likewise for f_str[5..2] */
97
98 aa0[5] = aa0[4] = aa0[3] = aa0[2] = aa0[1] = aa0[0];
99
100 /* zero out for SSE2/ALTIVEC -- make sure this is ALWAYS done */
101 for (id=0; id < SEQ_PAD; id++) aa0[0][n0+id] = '\0';
102
103 init_work (aa0[0], n0, ppst, &f_str[0]);
104 f_str[5] = f_str[4] = f_str[3] = f_str[2] = f_str[1] = f_str[0];
105
106 if (qframe == 2) {
107 if ((aa0[1]=(unsigned char *)calloc((size_t)n0+2+SEQ_PAD,sizeof(unsigned char)))==NULL) {
108 fprintf(stderr,"*** error [%s:%d] - cannot allocate aa01[%d]\n", __FILE__, __LINE__, n0);
109 }
110 *aa0[1]='\0';
111 aa0[1]++;
112 memcpy(aa0[1],aa0[0],n0+1);
113 /* for ALTIVEC/SSE2, must pad with 16 NULL's */
114 for (id=0; id<SEQ_PAD; id++) {aa0[1][n0+id]=0;}
115 revcomp(aa0[1],n0,ppst->c_nt);
116 init_work (aa0[1], n0, ppst, &f_str[1]);
117 }
118
119 if (qshuffle_flg) {
120 if ((*aa0s=(unsigned char *)calloc(n0+2+SEQ_PAD,sizeof(char)))==NULL) {
121 fprintf(stderr,"*** error [%s:%d] - cannot allocate aa0s[%d]\n",__FILE__, __LINE__, n0+2);
122 exit(1);
123 }
124 **aa0s='\0';
125 (*aa0s)++;
126 memcpy(*aa0s,aa0[0],n0);
127 qshuffle(*aa0s,n0,nm0, my_rand_state);
128 /* for SSE2/ALTIVEC, must pad with 16 NULL's */
129 for (id=0; id<SEQ_PAD; id++) {(*aa0s)[n0+id]=0;}
130 init_work (*aa0s, n0, ppst, qf_str);
131 }
132
133 /* always allocate shuffle space */
134 if((*aa1s=calloc(max_tot+1,sizeof(char))) == NULL) {
135 fprintf(stderr,"*** error [%s:%d] - unable to allocate shuffled library sequence [%d]\n", __FILE__, __LINE__, max_tot);
136 exit(1);
137 }
138 else {
139 **aa1s=0;
140 (*aa1s)++;
141 }
142 }
143
144 /* because it is used to pre-allocate space, maxn has various
145 constraints. For "simple" comparisons, it is simply the length of
146 the longest library sequence. But for translated comparisons, it
147 must be 3 or 6X the length of the query sequence.
148
149 In addition, however, it can be reduced to make certain that
150 sequences are read in smaller chunks. And, maxn affect how large
151 overlaps must be when sequences are read in chunks.
152 */
153
154 int
reset_maxn(struct mngmsg * m_msp,int over_len,int maxn)155 reset_maxn(struct mngmsg *m_msp, int over_len, int maxn) {
156
157 /* reduce maxn if requested */
158 if (m_msp->ldb_info.maxn > 0 && m_msp->ldb_info.maxn < maxn) maxn = m_msp->ldb_info.maxn;
159
160 if (m_msp->qdnaseq==m_msp->ldb_info.ldnaseq || m_msp->qdnaseq==SEQT_DNA ||
161 m_msp->qdnaseq == SEQT_RNA) {/* !TFAST - either FASTA or FASTX */
162
163 if (m_msp->n0 > m_msp->max_tot - m_msp->ldb_info.dupn) {
164 fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d > %d - %d %s\n",
165 __FILE__, __LINE__,
166 m_msp->n0,
167 m_msp->max_tot, m_msp->ldb_info.dupn,
168 m_msp->sqnam);
169 exit(1);
170 }
171
172 m_msp->ldb_info.l_overlap = over_len;
173 m_msp->ldb_info.maxt3 = maxn-m_msp->ldb_info.l_overlap;
174 }
175 else { /* is TFAST */
176 if (m_msp->n0 > MAXTST) {
177 fprintf(stderr,"*** error [%s:%d] - query sequence is too long %d %s\n",
178 __FILE__, __LINE__, m_msp->n0,m_msp->sqnam);
179 exit(1);
180 }
181
182 if (m_msp->n0*3 > maxn ) { /* n0*3 for the three frames - this
183 will only happen if maxn has been
184 set low manually */
185
186 if (m_msp->n0*4+2 < m_msp->max_tot) { /* m_msg0*3 + m_msg0 */
187 fprintf(stderr,
188 "*** error [%s:%d] - query sequence too long for library segment: %d - resetting to %d\n",
189 __FILE__, __LINE__,
190 maxn,m_msp->n0*3);
191 maxn = m_msp->ldb_info.maxn = m_msp->n0*3;
192 }
193 else {
194 fprintf(stderr,"*** error [%s:%d] - query sequence too long for translated search: %d * 4 > %d %s\n",
195 __FILE__, __LINE__, m_msp->n0,maxn, m_msp->sqnam);
196 exit(1);
197 }
198 }
199
200 /* set up some constants for overlaps */
201 m_msp->ldb_info.l_overlap = 3*over_len;
202 m_msp->ldb_info.maxt3 = maxn-m_msp->ldb_info.l_overlap-3;
203 m_msp->ldb_info.maxt3 -= m_msp->ldb_info.maxt3%3;
204 m_msp->ldb_info.maxt3++;
205
206 maxn = maxn - 3; maxn -= maxn%3; maxn++;
207 }
208 return maxn;
209 }
210
211
212 int
scanseq(unsigned char * seq,int n,char * str)213 scanseq(unsigned char *seq, int n, char *str) {
214 int tot,i;
215 char aaray[128]; /* this must be set > nsq */
216
217 for (i=0; i<128; i++) aaray[i]=0;
218 for (i=0; i < (int)strlen(str); i++) aaray[qascii[str[i]]]=1;
219 for (i=tot=0; i<n; i++) tot += aaray[seq[i]];
220 return tot;
221 }
222
223 /* subs_env takes a string, possibly with ${ENV}, and looks up all the
224 potential environment variables and substitutes them into the
225 string */
226
subs_env(char * dest,char * src,int dest_size)227 void subs_env(char *dest, char *src, int dest_size) {
228 char *last_src, *bp, *bp1;
229
230 last_src = src;
231
232 if ((bp = strchr(src,'$'))==NULL) {
233 strncpy(dest, src, dest_size);
234 dest[dest_size-1] = '\0';
235 }
236 else {
237 *dest = '\0';
238 while (strlen(dest) < dest_size-1 && bp != NULL ) {
239 /* copy stuff before ${*/
240 *bp = '\0';
241 strncpy(dest, last_src, dest_size);
242 *bp = '$';
243
244 /* copy ENV */
245 if (*(bp+1) != '{') {
246 strncat(dest, "$", dest_size - strlen(dest) -1);
247 dest[dest_size-1] = '\0';
248 bp += 1;
249 }
250 else { /* have ${ENV} - put it in */
251 if ((bp1 = strchr(bp+2,'}'))==NULL) {
252 fprintf(stderr, "*** error [%s:%d] - Unterminated ENV: %s\n",
253 __FILE__, __LINE__, src);
254 break;
255 }
256 else {
257 *bp1 = '\0';
258 if (getenv(bp+2)!=NULL) {
259 strncat(dest, getenv(bp+2), dest_size - strlen(dest) - 1);
260 dest[dest_size-1] = '\0';
261 *bp1 = '}';
262 }
263 bp = bp1+1; /* bump bp even if getenv == NULL */
264 }
265 }
266 last_src = bp;
267
268 /* now get the next ${ENV} if present */
269 bp = strchr(last_src,'$');
270 }
271 /* now copy the last stuff */
272 strncat(dest, last_src, dest_size - strlen(dest) - 1);
273 dest[dest_size-1]='\0';
274 }
275 }
276
277
278 void
selectbest(struct beststr ** bptr,int k,int n)279 selectbest(struct beststr **bptr, int k, int n) /* k is rank in array */
280 {
281 int v, i, j, l, r;
282 struct beststr *tmptr;
283
284 l=0; r=n-1;
285
286 while ( r > l ) {
287 v = bptr[r]->rst.score[0];
288 i = l-1;
289 j = r;
290 do {
291 while (bptr[++i]->rst.score[0] > v) ;
292 while (bptr[--j]->rst.score[0] < v) ;
293 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
294 } while (j > i);
295 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
296 if (i>=k) r = i-1;
297 if (i<=k) l = i+1;
298 }
299 }
300
301 void
selectbestz(struct beststr ** bptr,int k,int n)302 selectbestz(struct beststr **bptr, int k, int n) /* k is rank in array */
303 {
304 int i, j, l, r;
305 struct beststr *tmptr;
306 double v;
307
308 l=0; r=n-1;
309
310 while ( r > l ) {
311 v = bptr[r]->zscore;
312 i = l-1;
313 j = r;
314 do {
315 while (bptr[++i]->zscore > v) ;
316 while (bptr[--j]->zscore < v) ;
317 tmptr = bptr[i]; bptr[i]=bptr[j]; bptr[j]=tmptr;
318 } while (j > i);
319 bptr[j]=bptr[i]; bptr[i]=bptr[r]; bptr[r]=tmptr;
320 if (i>=k) r = i-1;
321 if (i<=k) l = i+1;
322 }
323 }
324
325 /* improved shellsort with high-performance increments */
326 /*
327 shellsort(itemType a[], int l, int r)
328 { int i, j, k, h; itemType v;
329 int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
330 13776, 4592, 1968, 861, 336,
331 112, 48, 21, 7, 3, 1 };
332 for ( k = 0; k < 16; k++)
333 for (h = incs[k], i = l+h; i <= r; i++) {
334 v = a[i]; j = i;
335 while (j > h && a[j-h] > v) {
336 a[j] = a[j-h]; j -= h;
337 }
338 a[j] = v;
339 }
340 }
341 */
342
343 /* ?improved? version of sortbestz using optimal increments and fewer
344 exchanges */
sortbestz(struct beststr ** bptr,int nbest)345 void sortbestz(struct beststr **bptr, int nbest)
346 {
347 int gap, i, j, k;
348 struct beststr *tmp;
349 double v;
350 int incs[14] = { 198768, 86961, 33936,
351 13776, 4592, 1968, 861, 336,
352 112, 48, 21, 7, 3, 1 };
353
354 for ( k = 0; k < 14; k++) {
355 gap = incs[k];
356 for (i=gap; i < nbest; i++) {
357 tmp = bptr[i];
358 j = i;
359 v = bptr[i]->zscore;
360 while ( j >= gap && bptr[j-gap]->zscore < v) {
361 bptr[j] = bptr[j - gap];
362 j -= gap;
363 }
364 bptr[j] = tmp;
365 }
366 }
367 }
368
369
370 /* sort based on sequence index */
sortbesti(struct beststr ** bptr,int nbest)371 void sortbesti(struct beststr **bptr, int nbest)
372 {
373 int gap, i, j, k;
374 struct beststr *tmp;
375 double v;
376 int incs[12] = { 33936, 13776, 4592, 1968, 861, 336,
377 112, 48, 21, 7, 3, 1 };
378
379 for ( k = 0; k < 12; k++) {
380 gap = incs[k];
381 for (i=gap; i < nbest; i++) {
382 tmp = bptr[i];
383 j = i;
384 v = bptr[i]->seq->index;
385 while ( j >= gap && bptr[j-gap]->seq->index < v) {
386 bptr[j] = bptr[j - gap];
387 j -= gap;
388 }
389 bptr[j] = tmp;
390 }
391 }
392 }
393
394 void
sortbeste(struct beststr ** bptr,int nbest)395 sortbeste(struct beststr **bptr, int nbest)
396 {
397 int gap, i, j, k;
398 struct beststr *tmp;
399 double v;
400 int incs[14] = { 198768, 86961, 33936,
401 13776, 4592, 1968, 861, 336,
402 112, 48, 21, 7, 3, 1 };
403
404 for ( k = 0; k < 14; k++) {
405 gap = incs[k];
406 for (i=gap; i < nbest; i++) {
407 j = i;
408 tmp = bptr[i];
409 v = tmp->rst.escore;
410 while ( j >= gap && bptr[j-gap]->rst.escore > v) {
411 bptr[j] = bptr[j - gap];
412 j -= gap;
413 }
414 bptr[j] = tmp;
415 }
416 }
417
418 /* sometimes there are many high scores with E()==0.0, sort
419 those by z() score */
420
421 j = 0;
422 while (j < nbest && bptr[j]->rst.escore <= 2.0*DBL_MIN ) {j++;}
423 if (j > 1) sortbestz(bptr,j);
424 }
425
426 extern char *prog_func;
427 extern char *verstr, *iprompt0, *refstr, *mp_verstr;
428 extern long tstart, tscan, tprev, tdone; /* Timing */
429 #ifdef COMP_MLIB
430 extern long ttscan, ttdisp;
431 #endif
432 extern time_t tdstart, tddone;
433
434 /* ****************************************************************
435 print command line arguments (argv_line)
436 possibly HTML header
437 !BLAST
438 please cite
439 version
440 BLAST
441 Reference version
442 **************************************************************** */
443 void
print_header1(FILE * fd,const char * argv_line,const struct mngmsg * m_msp,const struct pstruct * ppst)444 print_header1(FILE *fd, const char *argv_line,
445 const struct mngmsg *m_msp, const struct pstruct *ppst) {
446 int i;
447
448 #ifdef PGM_DOC
449 if (!(m_msp->markx & (MX_M8OUT+MX_MBLAST2))) fprintf(fd, "#%s\n",argv_line);
450 #endif
451
452 if (m_msp->markx & MX_M11OUT) {
453 fprintf(fd, "#:lav\n\nd {\n \"%s\"\n}\n",argv_line+1);
454 }
455
456 if (m_msp->markx & MX_HTML) {
457 #ifdef HTML_HEAD
458 fprintf(fd,"<html>\n<head>\n<title>%s Results</title>\n</head>\n<body>\n",prog_func);
459 #endif
460 fprintf(fd,"<pre>\n");
461 }
462
463 if (m_msp->std_output) {
464 fprintf(fd,"%s\n",iprompt0);
465 if (refstr != NULL && refstr[0] != '\0') {
466 fprintf(fd," version %s%s\nPlease cite:\n %s\n",verstr,mp_verstr,refstr);
467 }
468 else {
469 fprintf(fd," version %s%s\n",verstr,mp_verstr);
470 }
471 }
472
473 if (m_msp->markx & MX_MBLAST2) {
474 if (refstr != NULL && refstr[0] != '\0') {
475 fprintf(fd,"%s %s%s\n\nReference: %s\n\n", prog_func, verstr, mp_verstr, refstr);
476 }
477 else {
478 fprintf(fd,"%s %s%s\n\n", prog_func, verstr, mp_verstr);
479 }
480 }
481
482 fflush(fd);
483 }
484
485 /* ****************************************************************
486 MX_HTML: <pre>
487 Query:
488 1>>>accession description # aa
489 Annotation:
490 Library:
491 **************************************************************** */
492 void
print_header2(FILE * fd,int qlib,char * info_qlabel,unsigned char ** aa0,const struct mngmsg * m_msp,const struct pstruct * ppst,const char * info_lib_range_p)493 print_header2(FILE *fd, int qlib, char *info_qlabel, unsigned char **aa0,
494 const struct mngmsg *m_msp, const struct pstruct *ppst,
495 const char * info_lib_range_p) {
496 int j;
497 char tmp_str[MAX_STR];
498 double db_tt;
499
500 /* if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd); */
501
502 if (m_msp->std_output) {
503 if (qlib==1) {
504 fprintf(fd,"Query: %s\n", m_msp->tname);
505 }
506
507 if (m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA) {
508 strncpy(tmp_str,(m_msp->qframe==1)? " (forward-only)" : "\0",sizeof(tmp_str));
509 tmp_str[sizeof(tmp_str)-1]='\0';
510 }
511 else tmp_str[0]='\0';
512
513 fprintf(fd,"%3d>>>%s%s\n", qlib,
514 m_msp->qtitle,
515 (m_msp->revcomp ? " (reverse complement)" : tmp_str));
516
517 /* check for annotation */
518 if (m_msp->ann_flg && m_msp->aa0a != NULL) {
519 fprintf(fd,"Annotation: ");
520 for (j=0; j<m_msp->n0; j++) {
521 if (m_msp->aa0a[j] && m_msp->ann_arr[m_msp->aa0a[j]] != ' ' ) {
522 fprintf(fd,"|%ld:%c%c",
523 j+m_msp->q_off,m_msp->ann_arr[m_msp->aa0a[j]],ppst->sq[aa0[0][j]]);
524 }
525 }
526 fprintf(fd,"\n");
527 }
528
529 fprintf(fd,"Library: %s%s\n", m_msp->ltitle,info_lib_range_p);
530
531 if (m_msp->db.carry==0) {
532 fprintf(fd, " %7ld residues in %5ld sequences\n", m_msp->db.length, m_msp->db.entries);
533 }
534 else {
535 db_tt = (double)m_msp->db.carry*(double)LONG_MAX + (double)m_msp->db.length;
536 fprintf(fd, " %.0f residues in %5ld library sequences\n", db_tt, m_msp->db.entries);
537 }
538
539 }
540 else {
541 if ((m_msp->markx & (MX_M8OUT + MX_M8COMMENT)) == (MX_M8OUT+MX_M8COMMENT)) {
542 fprintf(fd,"# %s %s%s\n",prog_func,verstr,mp_verstr);
543 fprintf(fd,"# Query: %s\n",m_msp->qtitle);
544 fprintf(fd,"# Database: %s\n",m_msp->ltitle);
545 }
546 }
547 if (m_msp->markx & MX_HTML) fputs("</pre>\n",fd);
548 fflush(fd);
549 }
550
551 /* **************************************************************** */
552 /* before showbest */
553 /* **************************************************************** */
print_header3(FILE * fd,int qlib,struct mngmsg * m_msp,struct pstruct * ppst)554 void print_header3(FILE *fd, int qlib, struct mngmsg *m_msp, struct pstruct *ppst) {
555
556 if (m_msp->markx & MX_MBLAST2) {
557 if (qlib == 1) {
558 fprintf(fd, "\nDatabase: %s\n %12ld sequences; %ld total letters\n\n\n",
559 m_msp->ltitle, m_msp->db.entries, m_msp->db.length);
560 }
561 fprintf(fd, "\nQuery= %s\nLength=%d\n", m_msp->qtitle, m_msp->n0);
562 }
563 }
564
565
566 /* **************************************************************** */
567 /* alignment tranistion */
568 /* **************************************************************** */
print_header4(FILE * fd,char * info_qlabel,char * argv_line,char * info_gstring3,char * info_hstring_p[2],struct mngmsg * m_msp,struct pstruct * ppst)569 void print_header4(FILE *fd, char *info_qlabel, char *argv_line, char *info_gstring3, char *info_hstring_p[2],
570 struct mngmsg *m_msp, struct pstruct *ppst) {
571
572 if (m_msp->std_output && (m_msp->markx & (MX_AMAP+ MX_HTML + MX_M9SUMM)) && !(m_msp->markx & MX_M10FORM)) {
573 fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
574 info_qlabel,(m_msp->revcomp ? "_rev":"\0"), m_msp->n0,
575 m_msp->sqnam,m_msp->lname);
576 }
577
578 if (m_msp->markx & MX_M10FORM) {
579 fprintf(fd,"\n>>>%s%s, %d %s vs %s library\n",
580 info_qlabel,(m_msp->revcomp ? "-":"\0"), m_msp->n0, m_msp->sqnam,
581 m_msp->lname);
582 fprintf(fd,"; pg_name: %s\n",m_msp->pgm_name);
583 fprintf(fd,"; pg_ver: %s%s\n",verstr,mp_verstr);
584 fprintf(fd,"; pg_argv: %s",argv_line);
585 fputs(info_gstring3,fd);
586 fputs(info_hstring_p[0],fd);
587 fputs(info_hstring_p[1],fd);
588 }
589 }
590
print_header4a(FILE * outfd,struct mngmsg * m_msp)591 void print_header4a(FILE *outfd, struct mngmsg *m_msp) {
592 if (!(m_msp->markx & MX_M8OUT) && (m_msp->markx & (MX_M10FORM+MX_M9SUMM)) && m_msp->show_code != SHOW_CODE_ID) {
593 fprintf(outfd,">>><<<\n");
594 }
595 }
596
print_header5(FILE * fd,int qlib,struct db_str * qtt,struct mngmsg * m_msp,struct pstruct * ppst,int in_mem,long tot_memK)597 void print_header5(FILE *fd, int qlib, struct db_str *qtt,
598 struct mngmsg *m_msp, struct pstruct *ppst,
599 int in_mem, long tot_memK) {
600
601 /* for MX_MBLAST2, show some statistics results */
602 if (m_msp->markx & MX_MBLAST2) {
603 fprintf(fd,"\n\nLambda K H\n");
604 fprintf(fd," %6.3f %6.3f %6.3f\n\n",ppst->pLambda,ppst->pK,ppst->pH);
605 fprintf(fd,"\nGapped\nLambda\n");
606 fprintf(fd," %6.3f %6.3f %6.3f\n",ppst->pLambda,ppst->pK,ppst->pH);
607 fprintf(fd,"\nEffective search space used: %ld\n\n",m_msp->db.entries);
608 }
609
610 if (m_msp->markx & MX_M8COMMENT) {
611 fprintf(fd, "# %s processed %d queries\n",prog_func,qlib);
612 }
613
614 if ( !((m_msp->markx & MX_M8OUT) || (m_msp->markx & MX_HTML))
615 && (m_msp->markx & (MX_M10FORM+MX_M9SUMM))) {
616 fprintf(fd,">>>///\n");
617 }
618
619 if ( m_msp->markx & MX_HTML) fputs("<pre>",fd);
620 if (m_msp->std_output) {
621 print_sum(fd, qtt, &m_msp->db, in_mem, tot_memK);}
622 if ( m_msp->markx & MX_HTML) fputs("</pre>\n",fd);
623 #ifdef HTML_HEAD
624 if (m_msp->markx & MX_HTML) fprintf(fd,"</body>\n</html>\n");
625 #endif
626
627 if (m_msp->markx & MX_MBLAST2) {
628 fprintf(fd,"\n Database: %s\n",m_msp->ltitle);
629 fprintf(fd," Number of letters in database: %ld\n",m_msp->db.length);
630 fprintf(fd," Number of sequences in database: %ld\n",m_msp->db.entries);
631 fprintf(fd,"\n\n\nMatrix: %s\n",ppst->pam_name);
632 fprintf(fd,"Gap Penalties: Existence: %d, Extension: %d\n",ppst->gdelval, ppst->ggapval);
633 }
634 }
635
636 void
print_annot_header(FILE * fd,struct mngmsg * m_msp)637 print_annot_header(FILE *fd, struct mngmsg *m_msp) {
638 int i;
639
640 if (m_msp->ann_arr_def[1]) {
641 if (m_msp->markx & MX_HTML) {fprintf(fd,"<pre>");}
642 fprintf(fd, "Annotation symbols:\n");
643 for (i=1; m_msp->ann_arr[i]; i++) {
644 if (m_msp->ann_arr_def[i]) {
645 fprintf(fd, " %c : %s\n",m_msp->ann_arr[i], m_msp->ann_arr_def[i]);
646 }
647 }
648 if (m_msp->markx & MX_HTML) {fputs("</pre><hr />\n",fd);}
649 }
650 }
651
652 extern int fa_max_workers;
653
654 void
print_sum(FILE * fd,struct db_str * qtt,struct db_str * ntt,int in_mem,long tot_memK)655 print_sum(FILE *fd, struct db_str *qtt, struct db_str *ntt, int in_mem, long tot_memK)
656 {
657 double db_tt;
658 char tstr1[26], tstr2[26];
659 char memstr[256];
660
661 strncpy(tstr1,ctime(&tdstart),sizeof(tstr1));
662 strncpy(tstr2,ctime(&tddone),sizeof(tstr1));
663 tstr1[24]=tstr2[24]='\0';
664
665 /* Print timing to output file as well */
666
667 fprintf(fd, "\n%ld residues in %ld query sequences\n", qtt->length, qtt->entries);
668 if (ntt->carry == 0)
669 fprintf(fd, "%ld residues in %ld library sequences\n", ntt->length, ntt->entries);
670 else {
671 db_tt = (double)ntt->carry*(double)LONG_MAX + (double)ntt->length;
672 fprintf(fd, "%.0f residues in %ld library sequences\n", db_tt, ntt->entries);
673 }
674
675 memstr[0]='\0';
676 if (tot_memK && in_mem != 0) {
677 sprintf(memstr," in memory [%ldG]",(tot_memK >> 20));
678 }
679
680 #if !defined(COMP_THR) && !defined(PCOMPLIB)
681 fprintf(fd," Scomplib [%s%s]\n start: %s done: %s\n",verstr,mp_verstr,tstr1,tstr2);
682 #endif
683 #if defined(COMP_THR)
684 fprintf(fd," Tcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
685 fa_max_workers, memstr, tstr1,tstr2);
686 #endif
687 #if defined(PCOMPLIB)
688 fprintf(fd," Pcomplib [%s%s] (%d proc%s)\n start: %s done: %s\n", verstr, mp_verstr,
689 fa_max_workers, memstr, tstr1,tstr2);
690 #endif
691 #ifndef COMP_MLIB
692 fprintf(fd," Scan time: ");
693 ptime(fd, tscan - tprev);
694 fprintf (fd," Display time: ");
695 ptime (fd, tdone - tscan);
696 #else
697 fprintf(fd," Total Scan time: ");
698 ptime(fd, ttscan);
699 fprintf (fd," Total Display time: ");
700 ptime (fd, ttdisp);
701 #endif
702 fprintf (fd,"\n");
703 fprintf (fd, "\nFunction used was %s [%s%s]\n", prog_func,verstr,mp_verstr);
704 }
705
706 extern double zs_to_Ec(double zs, long entries);
707 extern double zs_to_bit(double zs, int n0, int n1);
708 extern double zs_to_p(double zs);
709
710 #include "aln_structs.h"
711
712 void
prhist(FILE * fd,const struct mngmsg * m_msp,struct pstruct * ppst,struct hist_str hist,int nstats,int sstats,struct db_str ntt,char * stat_info2,char * lib_range,char ** info_gstring2,char ** info_hstring,long tscan)713 prhist(FILE *fd, const struct mngmsg *m_msp,
714 struct pstruct *ppst,
715 struct hist_str hist,
716 int nstats, int sstats,
717 struct db_str ntt,
718 char *stat_info2,
719 char *lib_range,
720 char **info_gstring2,
721 char **info_hstring,
722 long tscan)
723 {
724 int i,j,hl,hll, el, ell, ev;
725 char hline[80], pch, *bp;
726 int mh1, mht;
727 int maxval, maxvalt, dotsiz, ddotsiz,doinset;
728 double cur_e, prev_e, f_int;
729 double max_dev, x_tmp;
730 double db_tt;
731 int n_chi_sq, cum_hl=0, max_i=0, max_dev_i;
732 double zs10_off;
733
734
735 if (m_msp->markx & MX_HTML) fputs("<pre>\n",fd);
736 else {fprintf(fd,"\n");}
737
738 if (ppst->zsflag_f < 0) {
739 if (!m_msp->nohist) {
740 fprintf(fd, " %7ld residues in %5ld sequences", ntt.length,ntt.entries);
741 fprintf(fd, "%s\n",lib_range);
742 }
743 fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
744 return;
745 }
746
747 if (nstats > 20) {
748 zs10_off = ppst->zs_off * 10.0;
749
750 max_dev = 0.0;
751 mh1 = hist.maxh-1; /* max value for histogram */
752 mht = (3*hist.maxh-3)/4 - 1; /* x-coordinate for expansion */
753 n_chi_sq = 0;
754
755 if (!m_msp->nohist && mh1 > 0) {
756 for (i=0,maxval=0,maxvalt=0; i<hist.maxh; i++) {
757 if (hist.hist_a[i] > maxval) maxval = hist.hist_a[i];
758 if (i >= mht && hist.hist_a[i]>maxvalt) maxvalt = hist.hist_a[i];
759 }
760 cum_hl = -hist.hist_a[0];
761 dotsiz = (maxval-1)/60+1;
762 ddotsiz = (maxvalt-1)/50+1;
763 doinset = (ddotsiz < dotsiz && dotsiz > 2);
764
765 if (ppst->zsflag_f>=0)
766 fprintf(fd," opt E()\n");
767 else
768 fprintf(fd," opt\n");
769
770 prev_e = zs_to_Ec((double)(hist.min_hist-hist.histint/2)-zs10_off,hist.entries);
771 for (i=0; i<=mh1; i++) {
772 pch = (i==mh1) ? '>' : ' ';
773 pch = (i==0) ? '<' : pch;
774 hll = hl = hist.hist_a[i];
775 if (ppst->zsflag_f>=0) {
776 cum_hl += hl;
777 f_int = (double)(i*hist.histint+hist.min_hist)+(double)hist.histint/2.0;
778 cur_e = zs_to_Ec(f_int-zs10_off,hist.entries);
779 ev = el = ell = (int)(cur_e - prev_e + 0.5);
780 if (hl > 0 && i > 5 && i < (90-hist.min_hist)/hist.histint) {
781 x_tmp = fabs(cum_hl - cur_e);
782 if ( x_tmp > max_dev) {
783 max_dev = x_tmp;
784 max_i = i;
785 }
786 n_chi_sq++;
787 }
788 if ((el=(el+dotsiz-1)/dotsiz) > 60) el = 60;
789 if ((ell=(ell+ddotsiz-1)/ddotsiz) > 40) ell = 40;
790 fprintf(fd,"%c%3d %5d %5d:",
791 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
792 mh1*hist.histint+hist.min_hist,hl,ev);
793 }
794 else fprintf(fd,"%c%3d %5d :",
795 pch,(i<mh1)?(i)*hist.histint+hist.min_hist :
796 mh1*hist.histint+hist.min_hist,hl);
797
798 if ((hl=(hl+dotsiz-1)/dotsiz) > 60) hl = 60;
799 if ((hll=(hll+ddotsiz-1)/ddotsiz) > 40) hll = 40;
800 for (j=0; j<hl; j++) hline[j]='=';
801 if (ppst->zsflag_f>=0) {
802 if (el <= hl ) {
803 if (el > 0) hline[el-1]='*';
804 hline[hl]='\0';
805 }
806 else {
807 for (j = hl; j < el; j++) hline[j]=' ';
808 hline[el-1]='*';
809 hline[hl=el]='\0';
810 }
811 }
812 else hline[hl] = 0;
813 if (i==1) {
814 for (j=hl; j<10; j++) hline[j]=' ';
815 sprintf(&hline[10]," one = represents %d library sequences",dotsiz);
816 }
817 if (doinset && i == mht-2) {
818 for (j = hl; j < 10; j++) hline[j]=' ';
819 sprintf(&hline[10]," inset = represents %d library sequences",ddotsiz);
820 }
821 if (i >= mht&& doinset ) {
822 for (j = hl; j < 10; j++) hline[j]=' ';
823 hline[10]=':';
824 for (j = 11; j<11+hll; j++) hline[j]='=';
825 hline[11+hll]='\0';
826 if (ppst->zsflag_f>=0) {
827 if (ell <= hll) hline[10+ell]='*';
828 else {
829 for (j = 11+hll; j < 10+ell; j++) hline[j]=' ';
830 hline[10+ell] = '*';
831 hline[11+ell] = '\0';
832 }
833 }
834 }
835
836 fprintf(fd,"%s\n",hline);
837 prev_e = cur_e;
838 }
839 }
840 max_dev_i = max_i*hist.histint+hist.min_hist;
841 }
842 else {
843 max_dev = 0.0;
844 n_chi_sq = 0;
845 max_i = 0;
846 max_dev_i = 0;
847 }
848
849 if (ppst->zsflag_f >=0 ) {
850 if (!m_msp->nohist) {
851 if (ntt.carry==0) {
852 fprintf(fd, " %7ld residues in %5ld sequences", ntt.length, ntt.entries);
853 }
854 else {
855 db_tt = (double)ntt.carry*(double)LONG_MAX + (double)ntt.length;
856 fprintf(fd, " %.0f residues in %5ld library sequences", db_tt, ntt.entries);
857 }
858 fprintf(fd, "%s\n",lib_range);
859 }
860 fprintf(fd,"Statistics: %s\n",hist.stat_info);
861 if (stat_info2) {
862 fprintf(fd," Statistics E2: %s\n",stat_info2);
863 }
864
865 #ifdef SAMP_STATS
866 fprintf(fd," statistics sampled from %ld (%d) to %ld sequences\n",
867 (hist.entries > nstats ? nstats : hist.entries),sstats, hist.entries);
868 #else
869 fprintf(fd," statistics extrapolated from %ld to %ld sequences\n",
870 (hist.entries > nstats ? nstats : hist.entries),hist.entries);
871 #endif
872
873 if (!m_msp->nohist && cum_hl > 0) {
874 fprintf(fd," Kolmogorov-Smirnov statistic: %6.4f (N=%d) at %3d\n",
875 max_dev/(float)cum_hl, n_chi_sq,max_dev_i);
876 }
877 if (m_msp->markx & MX_M10FORM) {
878 while ((bp=strchr(hist.stat_info,'\n'))!=NULL) *bp=' ';
879 if (cum_hl <= 0) cum_hl = -1;
880 sprintf(info_hstring[0],"; mp_extrap: %d %ld\n; mp_stats: %s\n; mp_KS: %6.4f (N=%d) at %3d\n",
881 MAX_STATS,hist.entries,hist.stat_info,max_dev/(float)cum_hl,
882 n_chi_sq,max_dev_i);
883 }
884 }
885
886 if (m_msp->markx & MX_M10FORM) {
887 if ((bp = strchr(info_gstring2[1],'\n'))!=NULL) *bp = ' ';
888 sprintf(info_hstring[1],"; mp_Algorithm: %s\n; mp_Parameters: %s\n",info_gstring2[0],info_gstring2[1]);
889 if (bp != NULL ) *bp = '\n';
890 }
891
892 if (ppst->other_info != NULL) {
893 fputs(ppst->other_info, fd);
894 }
895
896 fprintf(fd,"Algorithm: %s\nParameters: %s\n",info_gstring2[0],info_gstring2[1]);
897
898 fprintf (fd," Scan time: ");
899 ptime(fd,tscan);
900 fprintf(fd,"\n");
901 if (!m_msp->annot1_sname[0] && m_msp->markx & MX_HTML) {
902 fputs("</pre>\n<hr />\n",fd);
903 }
904
905 fflush(fd);
906 }
907
908 extern char prog_name[], *verstr;
909
910 #ifdef PCOMPLIB
911 #include "mpi.h"
912 #endif
913
s_abort(char * p,char * p1)914 void s_abort (char *p, char *p1)
915 {
916 int i;
917
918 fprintf (stderr, "\n***[%s] %s%s***\n", prog_name, p, p1);
919 #ifdef PCOMPLIB
920 MPI_Abort(MPI_COMM_WORLD,1);
921 MPI_Finalize();
922 #endif
923 exit (1);
924 }
925
w_abort(char * p,char * p1)926 void w_abort (char *p, char *p1)
927 {
928 fprintf (stderr, "\n***[%s] %s%s***\n\n", prog_name, p, p1);
929 exit (1);
930 }
931
932 extern struct a_res_str *
933 build_ares_code(unsigned char *aa0, int n0,
934 unsigned char *aa1, struct seq_record *seq,
935 int frame, int *have_ares, int repeat_thresh,
936 const struct mngmsg *m_msp, struct pstruct *ppst,
937 void *f_str
938 );
939
940 extern struct lmf_str *
941 re_openlib(struct lmf_str *, int outtty);
942
943 #define MAX_BLINE 2048
944 #define RANLIB (m_fptr->ranlib)
945
946 extern int
947 re_getlib(unsigned char *, struct annot_str **,
948 int, int, int, int, int, long *, long *,
949 struct lmf_str *m_fptr);
950
951 /*
952 pre_load_best loads a set of sequences using re_getlib
953
954 it should be used for getting sequences for shuffling, and for showbest() if m_msg->quiet
955
956 it both opens the m_file_p buffer, gets the bline[] descriptions,
957 and reads the actual sequences. In reading the sequences, it
958 should first allocate one large buffer so that individual buffers do not need to be freed.
959 */
960
961 void
pre_load_best(unsigned char * aa1save,int maxn,struct beststr ** bbp_arr,int nbest,struct mngmsg * m_msp,int debug)962 pre_load_best(unsigned char *aa1save, int maxn,
963 struct beststr **bbp_arr, int nbest,
964 struct mngmsg *m_msp, int debug)
965 {
966 int i, n1, bl_len, tmp_bline_len, l_llen;
967 int seq_buf_len;
968 char bline[MAX_BLINE];
969 unsigned char *seq_buf_p;
970 char *bline_buf_p;
971
972 struct beststr *bbp;
973 struct lmf_str *m_fptr;
974
975 /*
976 calculate how much room we need for sequences and blines
977 */
978
979 if (m_msp->pre_load_done) return;
980
981 seq_buf_len = 1;
982 for (i=0; i<nbest; i++) {
983 /* we are not (currently) allocating more than n1+1, because alignment is not vectorized,
984 if it were vectorized, we would need n+16
985 */
986 #ifdef DEBUG
987 if (bbp_arr[i]->n1 != bbp_arr[i]->seq->n1) {
988 fprintf(stderr,"*** error [%s:%d] - n1 (%d) != seq->n1 (%d)\n",
989 __FILE__, __LINE__, bbp_arr[i]->n1, bbp_arr[i]->seq->n1);
990 }
991 #endif
992
993 if (bbp_arr[i]->seq->aa1b == NULL) {
994 seq_buf_len += bbp_arr[i]->seq->n1 + 1;
995 }
996 }
997
998 /* have required sequence space (seq_buf_len), allocate it */
999
1000 if ((m_msp->aa1save_buf_b=(unsigned char *)calloc(seq_buf_len, sizeof(char)))==NULL) {
1001 fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for sequence encoding\n",
1002 __FILE__, __LINE__, seq_buf_len);
1003 exit(1);
1004 }
1005 else {
1006 seq_buf_p = m_msp->aa1save_buf_b+1; /* ensure there is an initial '\0' */
1007 }
1008
1009 /* adjust description line length */
1010 l_llen = m_msp->aln.llen;
1011 if ((m_msp->markx & MX_M9SUMM) && m_msp->show_code != SHOW_CODE_ID) {
1012 l_llen += 40;
1013 if (l_llen > 200) l_llen=200;
1014 }
1015
1016 tmp_bline_len = sizeof(bline)-1;
1017 if (!(m_msp->markx & MX_M10FORM) && !m_msp->long_info) {tmp_bline_len = l_llen-5;}
1018
1019 /* allocate more bline than we need for simplicity */
1020 if ((bline_buf_p=m_msp->bline_buf_b=(char *)calloc(nbest*tmp_bline_len, sizeof(char)))==NULL) {
1021 fprintf(stderr, "*** error [%s:%d] - cannot allocate space[%d] for bline descriptions\n",
1022 __FILE__, __LINE__, nbest*tmp_bline_len);
1023 exit(1);
1024 }
1025
1026 for (i=0; i<nbest; i++) {
1027 bbp = bbp_arr[i];
1028
1029 if ((m_fptr=re_openlib(bbp->mseq->m_file_p,!m_msp->quiet))==NULL) {
1030 fprintf(stderr,"*** error [%s:%d] - cannot re-open %s\n",
1031 __FILE__, __LINE__, bbp->mseq->m_file_p->lb_name);
1032 exit(1);
1033 }
1034 RANLIB(bline,tmp_bline_len,bbp->mseq->lseek,bbp->mseq->libstr,m_fptr);
1035 bl_len = strlen(bline);
1036 bbp->mseq->bline = bline_buf_p;
1037 bbp->mseq->bline_max = m_msp->aln.llen;
1038 strncpy(bbp->mseq->bline, bline, bl_len);
1039 bline_buf_p += bl_len+1;
1040
1041 /* make sure we get annotation if present, and sequence if necessary */
1042 if (bbp->seq->aa1b==NULL || (m_msp->ann_flg==1 && bbp->seq->annot_p==NULL)) {
1043 n1 = re_getlib(aa1save, (m_msp->ann_flg==1) ? &(bbp->seq->annot_p) : NULL,
1044 maxn,m_msp->ldb_info.maxt3, m_msp->ldb_info.l_overlap,
1045 bbp->mseq->cont,m_msp->ldb_info.term_code,
1046 &bbp->seq->l_offset,&bbp->seq->l_off,bbp->mseq->m_file_p);
1047 if (n1 != bbp->seq->n1) {
1048 fprintf(stderr,"*** error [%s:%d] - n1[%d/%d] != n1[%d] from re_getlib() at %s [maxn:%d/maxt3:%d]\n",
1049 __FILE__, __LINE__,
1050 bbp->n1, bbp->seq->n1, n1, bbp->mseq->libstr, maxn, m_msp->ldb_info.maxt3);
1051 }
1052
1053 #ifdef DEBUG
1054 if (adler32(1L,aa1save,n1)!=bbp->adler32_crc) {
1055 fprintf(stderr,"*** error [%s:%d] - adler32_crc from re_getlib() at %d(%d): %s\n",
1056 __FILE__, __LINE__,
1057 bbp->mseq->index,bbp->n1, bline);
1058 }
1059 #endif
1060
1061 /* if we don't have the sequence in the aa1b buffer, copy it from re_getlib */
1062 if (bbp->seq->aa1b == NULL) {
1063 bbp->seq->aa1b = seq_buf_p;
1064 memcpy(bbp->seq->aa1b, aa1save, bbp->seq->n1+1);
1065 seq_buf_p += bbp->seq->n1+1;
1066 }
1067 }
1068 }
1069
1070 /* here, we are getting query annots after all the bptr[]s have been processed */
1071 /* moved to comp_lib9.c */
1072 /*
1073 if (m_msp->annot0_sname[0]) {
1074 if (get_annot(m_msp->annot0_sname, m_msp, m_msp->qtitle, m_msp->q_offset+m_msp->q_off-1,m_msp->n0, &m_msp->annot_p, 0, debug) < 0) {
1075 fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations\n",__FILE__, __LINE__, m_msp->annot0_sname);
1076 m_msp->annot0_sname[0] = '\0';
1077 }
1078 if (m_msp->annot_p && m_msp->annot_p->n_annot > 0) {
1079 m_msp->aa0a = m_msp->annot_p->aa1_ann;
1080 }
1081 if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
1082 }
1083 */
1084
1085 /* if we have an variant annotation script, execute it and capture the output */
1086 /* must do after bline is set */
1087 if (m_msp->annot1_sname[0]) {
1088 if (get_annot_list(m_msp->annot1_sname, m_msp, bbp_arr, nbest, 1, debug)< 0) {
1089 fprintf(stderr,"*** error [%s:%d] - %s did not produce annotations for %s\n",__FILE__, __LINE__, m_msp->annot1_sname,m_msp->qtitle);
1090 m_msp->annot1_sname[0] = '\0';
1091 };
1092 if (!m_msp->ann_arr[0]) {m_msp->ann_arr[0] = ' '; m_msp->ann_arr[1] = '\0';}
1093 }
1094
1095 m_msp->pre_load_done = 1;
1096 }
1097
1098 /* merge_ares_chains()
1099
1100 seeks to merge two ares chains, producing a single chain that is
1101 sorted by sw_score.
1102
1103 Strategy -- choose the chain with the highest score, and go down
1104 it until the head of the other chain has higher score, then link
1105 the other chain to the main chain, breaking the first, and
1106 continue the process.
1107
1108 The two pointers, max_next and alt_next, keep track of the best
1109 and the alternate chain
1110 */
1111
1112
1113 #undef SHOW_MERGE_CHAIN
1114
1115 struct a_res_str *
merge_ares_chains(struct a_res_str * cur_ares,struct a_res_str * tmp_ares,int score_ix,const char * msg)1116 merge_ares_chains(struct a_res_str *cur_ares,
1117 struct a_res_str *tmp_ares,
1118 int score_ix,
1119 const char *msg)
1120 {
1121 struct a_res_str *max_next, *max_ares, *alt_ares, *prev_next;
1122
1123 if (!tmp_ares) return cur_ares;
1124
1125 #ifdef SHOW_MERGE_CHAIN
1126 fprintf(stderr,"cur_ares->");
1127 for (max_next = cur_ares; max_next; max_next = max_next->next) {
1128 fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
1129 }
1130
1131 fprintf(stderr,"||\n");
1132 fprintf(stderr,"tmp_ares->");
1133 for (max_next = tmp_ares; max_next; max_next = max_next->next) {
1134 fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
1135 }
1136 fprintf(stderr,"||\n");
1137 #endif
1138
1139 /* start with the maximum score */
1140
1141 if (cur_ares->rst.score[score_ix] >= tmp_ares->rst.score[score_ix]) {
1142 max_ares = max_next = prev_next = cur_ares;
1143 alt_ares = tmp_ares;
1144 }
1145 else {
1146 max_ares = max_next = prev_next = tmp_ares;
1147 alt_ares = cur_ares;
1148 }
1149
1150 while (max_next && alt_ares) {
1151 /* this is guaranteed true for the first iteration */
1152 while (max_next && max_next->rst.score[score_ix] >= alt_ares->rst.score[score_ix]) {
1153 prev_next = max_next;
1154 max_next = max_next->next;
1155 }
1156 if (max_next==NULL) break;
1157 else { /* max_next->rst.score[score_ix] no longer greater, switch
1158 pointers */
1159 prev_next->next = alt_ares;
1160 alt_ares = max_next;
1161 max_next = prev_next->next;
1162 }
1163 }
1164
1165 /* we quit whenever max_next or alt_ares == NULL; if
1166 (max_next==NULL), then continue adding the rest of alt_ares */
1167
1168 if (max_next==NULL) {
1169 prev_next->next = alt_ares;
1170 }
1171
1172
1173 #ifdef SHOW_MERGE_CHAIN
1174 fprintf(stderr,"[%s] merge_ares->",msg);
1175 for (max_next = max_ares; max_next; max_next = max_next->next) {
1176 fprintf(stderr,"%d->",max_next->rst.score[score_ix]);
1177 }
1178 fprintf(stderr,"||\n\n");
1179 #endif
1180
1181 return max_ares;
1182 }
1183
1184 /* copies from from to to shuffling */
1185
1186 extern int my_nrand(int, void *);
1187
1188 void
shuffle(unsigned char * from,unsigned char * to,int n,void * rand_state)1189 shuffle(unsigned char *from, unsigned char *to, int n, void *rand_state)
1190 {
1191 int i,j; unsigned char tmp;
1192
1193 if (from != to) memcpy((void *)to,(void *)from,n);
1194
1195 for (i=n; i>0; i--) {
1196 j = my_nrand(i, rand_state);
1197 tmp = to[j];
1198 to[j] = to[i-1];
1199 to[i-1] = tmp;
1200 }
1201 to[n] = 0;
1202 }
1203
1204 /* shuffles DNA sequences as codons */
1205 void
shuffle3(unsigned char * from,unsigned char * to,int n,void * rand_state)1206 shuffle3(unsigned char *from, unsigned char *to, int n, void *rand_state)
1207 {
1208 int i, j, i3,j3; unsigned char tmp;
1209 int n3;
1210
1211 if (from != to) memcpy((void *)to,(void *)from,n);
1212
1213 n3 = n/3;
1214
1215 for (i=n3; i>0; i--) {
1216 j = my_nrand(i, rand_state);
1217 i3 = i*3;
1218 j3 = j*3;
1219 tmp = to[j3];
1220 to[j3] = to[i3-1];
1221 to[i3-1] = tmp;
1222 tmp = to[j3+1];
1223 to[j3+1] = to[i3];
1224 to[i3] = tmp;
1225 tmp = to[j3+2];
1226 to[j3+2] = to[i3+1];
1227 to[i3+1] = tmp;
1228 }
1229 to[n] = 0;
1230 }
1231
1232 /* "shuffles" by reversing the sequence */
1233 void
rshuffle(unsigned char * from,unsigned char * to,int n)1234 rshuffle(unsigned char *from, unsigned char *to, int n)
1235 {
1236 unsigned char *ptr = from + n;
1237
1238 while (n-- > 0) {
1239 *to++ = *ptr--;
1240 }
1241 *to = '\0';
1242 }
1243
1244 static int ieven = 0;
1245 /* copies from from to from shuffling, ieven changed for threads */
1246 void
wshuffle(unsigned char * from,unsigned char * to,int n,int wsiz,void * rand_state)1247 wshuffle(unsigned char *from, unsigned char *to, int n, int wsiz, void *rand_state)
1248 {
1249 int i,j, k, mm;
1250 unsigned char tmp, *top;
1251
1252 memcpy((void *)to,(void *)from,n);
1253
1254 mm = n%wsiz;
1255
1256 if (ieven) {
1257 for (k=0; k<(n-wsiz); k += wsiz) {
1258 top = &to[k];
1259 for (i=wsiz; i>0; i--) {
1260 j = my_nrand(i, rand_state);
1261 tmp = top[j];
1262 top[j] = top[i-1];
1263 top[i-1] = tmp;
1264 }
1265 }
1266 top = &to[n-mm];
1267 for (i=mm; i>0; i--) {
1268 j = my_nrand(i, rand_state);
1269 tmp = top[j];
1270 top[j] = top[i-1];
1271 top[i-1] = tmp;
1272 }
1273 ieven = 0;
1274 }
1275 else {
1276 for (k=n; k>=wsiz; k -= wsiz) {
1277 top = &to[k-wsiz];
1278 for (i=wsiz; i>0; i--) {
1279 j = my_nrand(i, rand_state);
1280 tmp = top[j];
1281 top[j] = top[i-1];
1282 top[i-1] = tmp;
1283 }
1284 }
1285 top = &to[0];
1286 for (i=mm; i>0; i--) {
1287 j = my_nrand(i, rand_state);
1288 tmp = top[j];
1289 top[j] = top[i-1];
1290 top[i-1] = tmp;
1291 }
1292 ieven = 1;
1293 }
1294 to[n] = 0;
1295 }
1296
1297 int
sfn_cmp(int * q,int * s)1298 sfn_cmp(int *q, int *s)
1299 {
1300 if (*q == *s) return *q;
1301 while (*q && *s) {
1302 if (*q == *s) return *q;
1303 else if (*q < *s) q++;
1304 else if (*q > *s) s++;
1305 }
1306 return 0;
1307 }
1308
1309 #ifndef MPI_SRC
1310 void
revcomp(unsigned char * seq,int n,int * c_nt)1311 revcomp(unsigned char *seq, int n, int *c_nt)
1312 {
1313 unsigned char tmp;
1314 int i, ni;
1315
1316 for (i=0, ni = n-1; i< n/2; i++,ni--) {
1317 tmp = c_nt[seq[i]];
1318 seq[i] = c_nt[seq[ni]];
1319 seq[ni] = tmp;
1320 }
1321 if ((n%2)==1) {
1322 i = n/2;
1323 seq[i] = c_nt[seq[i]];
1324 }
1325 seq[n]=0;
1326 }
1327 #endif
1328
1329 /* check to see whether this score (or a shuff score) should
1330 be included in statistics */
samp_stats_idx(int * pre_nstats,int nstats,void * rand_state)1331 int samp_stats_idx (int *pre_nstats, int nstats, void *rand_state) {
1332 int jstats = -1;
1333
1334 /* this code works when every score can be used for statistics
1335 estimates, but fails for fasta/[t]fast[xy] where only a fraction
1336 of scores are used */
1337
1338 if (*pre_nstats < MAX_STATS) {
1339 jstats = (*pre_nstats)++;
1340 }
1341
1342 /* here, the problem is that while we may have pre_nstats
1343 possible samplings, in some cases (-M subsets, fasta,
1344 [t]fast[xy] we don't have MAX_STATS samples yet. Until we
1345 have MAX_STATS, we want more. But the stats_idx strategy
1346 means that there may be additional samples in the buffers
1347 that are not reflected in nstats.
1348 */
1349
1350 else {
1351 #ifdef SAMP_STATS_LESS
1352 /* now we have MAX_STATS samples
1353 we want to sample 1/2 of 60K - 120K, 1/3 of 120K - 180K, etc */
1354 /* check every 15K to see if we have gone past the next threshold */
1355
1356 /* pre_nstats cannot be incremented before the % to ensure
1357 that stats_inc is incremented exactly at 60000, 120000, etc.
1358 use ">=" in case increment comes later
1359 tests suggest the first 60K are sampled about 25% more
1360 than the rest
1361 */
1362 if (nstats < MAX_STATS) {
1363 jstats = MAX_STATS - my_nrand(MAX_STATS - nstats, rand_state)-1;
1364 }
1365 else if (((*pre_nstats)++ % (MAX_STATS/4)) == 0 &&
1366 *pre_nstats >= stats_inc * MAX_STATS) {
1367 stats_inc = (*pre_nstats / MAX_STATS) + 1;
1368 }
1369 if ((*pre_nstats % stats_inc) == 0) {
1370 jstats = my_nrand(MAX_STATS, rand_state);
1371 }
1372 #else
1373 /* this sampling strategy calls my_nrand() for every
1374 sequence > 60K, but provides a very uniform sampling */
1375 jstats = my_nrand(++(*pre_nstats), rand_state);
1376 if (jstats >= MAX_STATS) { jstats = -1;}
1377 #endif
1378 }
1379 return jstats;
1380 }
1381
1382 /* **************************************************************** */
1383 /* build_link_data -- produces fasta file from m_msp->
1384 (1) generate a temporary file name
1385 (2) write out accessions \t expects to the temporary file
1386 (3) run script against temporary file, producing fasta_file_expansion_file
1387 (4) return fasta expansion filename for standard fasta openlib().
1388
1389 returns: the expansion library file name
1390 **link_link_file_p is the name of the file with the data
1391 that will be removed.
1392 */
1393 /* **************************************************************** */
1394 char *
build_link_data(char ** link_lib_file_p,struct mngmsg * m_msp,struct beststr ** bestp_arr,int debug)1395 build_link_data(char **link_lib_file_p,
1396 struct mngmsg *m_msp, struct beststr **bestp_arr,
1397 int debug) {
1398 int i, status;
1399 char tmp_line[MAX_SSTR];
1400 char link_acc_file[MAX_STR];
1401 int link_acc_fd;
1402 char *link_lib_file;
1403 char *link_lib_str;
1404 char link_script[MAX_LSTR];
1405 int link_lib_type;
1406 char *bp, *link_bp;
1407 FILE *link_fd=NULL; /* file for link accessions */
1408
1409 #ifndef UNIX
1410 return NULL;
1411 #else
1412 /* get two tmpfiles, one for accessions, one for library */
1413 link_acc_file[0] = '\0';
1414
1415 if ((link_lib_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
1416 fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_file",
1417 __FILE__, __LINE__);
1418 }
1419 link_lib_file[0] = '\0';
1420
1421 if ((bp=getenv("TMP_DIR"))!=NULL) {
1422 strncpy(link_acc_file,bp,sizeof(link_acc_file));
1423 link_acc_file[sizeof(link_acc_file)-1] = '\0';
1424 SAFE_STRNCAT(link_acc_file,"/",sizeof(link_acc_file));
1425 }
1426
1427 SAFE_STRNCAT(link_acc_file,"link_acc_XXXXXX",sizeof(link_acc_file));
1428 link_acc_fd = mkstemp(link_acc_file);
1429 strncpy(link_lib_file,link_acc_file,MAX_STR);
1430 link_acc_file[sizeof(link_acc_file)-1] = '\0';
1431 SAFE_STRNCAT(link_lib_file,".lib",MAX_STR);
1432
1433 /* write out accessions to link_acc_file */
1434 if ((link_fd =fdopen(link_acc_fd,"w"))==NULL) {
1435 fprintf(stderr,"*** error [%s:%d] - Cannot open link_acc_file: %s\n",
1436 __FILE__, __LINE__, link_acc_file);
1437 goto no_links;
1438 }
1439
1440 for (i=0; i<m_msp->nskip + m_msp->nshow; i++) {
1441 if ((bp=strchr(bestp_arr[i]->mseq->bline,' '))!=NULL) {
1442 *bp = '\0';
1443 }
1444 fprintf(link_fd,"%s\t%.3g\n",bestp_arr[i]->mseq->bline,bestp_arr[i]->rst.escore);
1445 if (bp != NULL) *bp=' ';
1446 }
1447 fclose(link_fd);
1448
1449 /* build link_script link_acc_file > link_lib_file */
1450 /* check for indirect */
1451 link_bp = &m_msp->link_lname[0];
1452 if (*link_bp == '!') {
1453 link_bp++;
1454 }
1455 if (*link_bp == '@') {
1456 link_bp++;
1457 }
1458
1459 /* remove library type */
1460 if ((bp=strchr(link_bp,' '))!=NULL) {
1461 *bp = '\0';
1462 sscanf(bp+1,"%d",&link_lib_type);
1463 }
1464 else {
1465 link_lib_type = 0;
1466 }
1467
1468 strncpy(link_script,link_bp,sizeof(link_script));
1469 link_script[sizeof(link_script)-1] = '\0';
1470 SAFE_STRNCAT(link_script," ",sizeof(link_script));
1471 SAFE_STRNCAT(link_script,link_acc_file,sizeof(link_script));
1472 SAFE_STRNCAT(link_script," >",sizeof(link_script));
1473 SAFE_STRNCAT(link_script,link_lib_file,sizeof(link_script));
1474
1475 /* un-edit m_msp->link_lname */
1476 if (bp != NULL) *bp = ' ';
1477
1478 /* run link_script link_acc_file > link_lib_file */
1479 status = system(link_script);
1480 if (!debug) {
1481 #ifdef UNIX
1482 unlink(link_acc_file);
1483 #else
1484 _unlink(link_acc_file);
1485 #endif
1486 }
1487
1488 if (status == NO_FILE_EXIT) { /* my specific return for no links */
1489 goto no_links;
1490 }
1491
1492 if (status < 0 || status == 127) {
1493 fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
1494 __FILE__, __LINE__,link_script);
1495 goto no_links;
1496 }
1497
1498 if ((link_fd=fopen(link_lib_file,"r"))==NULL) {
1499 goto no_links;
1500 }
1501 else fclose(link_fd);
1502
1503 if ((link_lib_str=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
1504 fprintf(stderr,"*** error [%s:%d] - [build_link_data] Cannot allocate link_lib_str",
1505 __FILE__, __LINE__);
1506 }
1507
1508 /* build the file string (possibly @link_lib_file libtype) */
1509 link_lib_str[0]='\0';
1510 if (m_msp->link_lname[0] == '@') {
1511 SAFE_STRNCAT(link_lib_str,"@",MAX_STR);
1512 }
1513 SAFE_STRNCAT(link_lib_str,link_lib_file,MAX_STR);
1514 if (link_lib_type > 0) {
1515 sprintf(tmp_line," %d",link_lib_type);
1516 SAFE_STRNCAT(link_lib_str,tmp_line,MAX_STR);
1517 }
1518
1519 *link_lib_file_p = link_lib_file;
1520 return link_lib_str;
1521
1522 no_links:
1523 free(link_lib_file);
1524 *link_lib_file_p = NULL;
1525 return NULL;
1526 #endif
1527 }
1528
1529 /* **************************************************************** */
1530 /* build_lib_db -- produces fasta file from script
1531 (1) generate a temporary file name lib_db_file
1532 (2) run script producing data in lib_db_file
1533
1534 returns: the expansion library file name
1535 **db_str_file_p is the name of the file with the data
1536 that will be removed.
1537 */
1538 /* **************************************************************** */
1539 char *
build_lib_db(char * script_file)1540 build_lib_db(char *script_file) {
1541 int i, status;
1542 char tmp_line[MAX_SSTR];
1543 char *lib_db_file, *lib_db_str;
1544 char lib_db_script[MAX_LSTR];
1545 int lib_db_indirect;
1546 int lib_db_type;
1547 int lib_db_str_len;
1548 char *bp, *lib_bp;
1549
1550 if ((lib_db_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
1551 fprintf(stderr,"*** error [%s:%d] - [build_lib_db] Cannot allocate lib_db_file",
1552 __FILE__, __LINE__);
1553 goto no_lib;
1554 }
1555
1556 if ((bp=getenv("TMP_DIR"))!=NULL) {
1557 strncpy(lib_db_file,bp,MAX_STR);
1558 lib_db_file[sizeof(lib_db_file)-1] = '\0';
1559 SAFE_STRNCAT(lib_db_file,"/",sizeof(lib_db_file));
1560 }
1561
1562 SAFE_STRNCAT(lib_db_file,"lib_db_XXXXXX",MAX_STR);
1563 mktemp(lib_db_file);
1564 lib_db_str_len = strlen(lib_db_file)+1;
1565
1566 /* check for indirect */
1567 lib_bp = script_file;
1568 if (*lib_bp == '@') {
1569 lib_bp++;
1570 lib_db_str_len++;
1571 }
1572 /* remove library type */
1573 if ((bp=strchr(lib_bp,' '))!=NULL) {
1574 *bp = '\0';
1575 sscanf(bp+1,"%d",&lib_db_type);
1576 lib_db_str_len += (strlen(bp+1)+1);
1577 }
1578 else {
1579 lib_db_type = 0;
1580 }
1581
1582 strncpy(lib_db_script,lib_bp,sizeof(lib_db_script));
1583 lib_db_script[sizeof(lib_db_script)-1] = '\0';
1584 SAFE_STRNCAT(lib_db_script," >",sizeof(lib_db_script));
1585 SAFE_STRNCAT(lib_db_script,lib_db_file,sizeof(lib_db_script));
1586
1587 if (bp != NULL) *bp = ' ';
1588
1589 /* run lib_db_script link_acc_file > lib_db_file */
1590 status = system(lib_db_script);
1591
1592 if (status == NO_FILE_EXIT) { /* my specific return for no links */
1593 goto no_lib;
1594 }
1595
1596 if (status < 0 || status == 127) {
1597 fprintf(stderr,"*** error [%s:%d] - [build_lib_db] script: %s failed\n",
1598 __FILE__, __LINE__, lib_db_script);
1599 goto no_lib;
1600 }
1601
1602 /* build the file string (possibly @lib_db_str libtype) */
1603 if ((lib_db_str=calloc(lib_db_str_len+1,sizeof(char)))==NULL) {
1604 fprintf(stderr,"*** error [%s:%d] - [build_lib_db] cannot allocate lib_db_str[%d]\n",
1605 __FILE__, __LINE__, lib_db_str_len+1);
1606 goto no_lib;
1607 }
1608 lib_db_str[0]='\0';
1609 if (*script_file == '@') {
1610 SAFE_STRNCAT(lib_db_str,"@",MAX_STR);
1611 }
1612 SAFE_STRNCAT(lib_db_str,lib_db_file,MAX_STR);
1613 if (lib_db_type > 0) {
1614 sprintf(tmp_line," %d",lib_db_type);
1615 SAFE_STRNCAT(lib_db_str,tmp_line,MAX_STR);
1616 }
1617
1618 return lib_db_str;
1619
1620 no_lib:
1621 return NULL;
1622 }
1623
1624 /* used to temporarily allocate annotation array in next_annot_entry()*/
1625 struct annot_mstr {
1626 int max_annot;
1627 struct annot_entry *tmp_arr_p;
1628 };
1629
1630 /* init_tmp_annot_mstr(size) intializes the structure used to track annots */
1631 int
init_tmp_annot(struct annot_mstr * this,int size)1632 init_tmp_annot(struct annot_mstr *this, int size) {
1633 struct annot_entry *tmp_ann_astr;
1634
1635 /* only reset if array is NULL */
1636 if (this->tmp_arr_p == NULL || this->max_annot <= 0) {
1637 this->max_annot = 32;
1638 if ((this->tmp_arr_p=(struct annot_entry *)calloc(this->max_annot, sizeof(struct annot_entry)))==NULL) {
1639 fprintf(stderr,"*** error [%s:%d] - cannot allocate annot_entry[%d]\n",
1640 __FILE__,__LINE__,this->max_annot);
1641 return 0;
1642 }
1643 }
1644 return 1;
1645 }
1646
1647 int
update_tmp_annot(struct annot_mstr * this)1648 update_tmp_annot(struct annot_mstr *this) {
1649
1650 this->max_annot += (this->max_annot/2);
1651 if ((this->tmp_arr_p= (struct annot_entry *)realloc(this->tmp_arr_p, this->max_annot*sizeof(struct annot_entry)))==NULL) {
1652 fprintf(stderr,"[*** error [%s:%d] - cannot reallocate tmp_ann_astr[%d]\n",
1653 __FILE__, __LINE__, this->max_annot);
1654 return 0;
1655 }
1656 return 1;
1657 }
1658
1659 struct annot_str *
1660 next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line,
1661 struct annot_str *annot_p,
1662 struct annot_mstr *mtmp_annot_p,
1663 struct mngmsg *m_msp, int target);
1664
1665 /* **************************************************************** */
1666 /* get_annot_list -- produces fasta file from sname
1667 if sname[0]=='<', read the file directly, goto (4)
1668 if sname[0]=='!', run a script
1669 (1) generate a temporary file name
1670 (2) write out list of blines
1671 (3) run m_msp->annot1_sname[] script against temporary file, producing table of annotations
1672
1673 (4) read in the annotations and merge them into beststr
1674 (5) return number of annotations
1675 */
1676 /* **************************************************************** */
1677
1678 int
get_annot_list(char * sname,struct mngmsg * m_msp,struct beststr ** bestp_arr,int nbest,int target,int debug)1679 get_annot_list(char *sname, struct mngmsg *m_msp, struct beststr **bestp_arr, int nbest,
1680 int target, int debug) {
1681 int i, status;
1682 long l_offset;
1683 char tmp_line[MAX_STR];
1684 char annot_bline_file[MAX_STR];
1685 int annot_bline_fd;
1686 char *annot_descr_file;
1687 char annot_script[MAX_LSTR];
1688 struct annot_str *annot_p;
1689 char *bp;
1690 int annot_seq_cnt;
1691 FILE *annot_fd=NULL; /* file for annot accessions */
1692 struct annot_mstr mtmp_annot;
1693
1694 #ifndef UNIX
1695 return 0;
1696 #else
1697
1698 if (sname[0] == '!') {
1699
1700 /* get two tmpfiles, one for bline, one for returned annotations
1701 (but it would make more sense to use popen() to get the
1702 annotations back
1703 */
1704
1705 annot_bline_file[0] = '\0';
1706
1707 if ((annot_descr_file=(char *)calloc(MAX_STR,sizeof(char)))==NULL) {
1708 fprintf(stderr,"*** error [%s:%d] - [get_annot_list] Cannot allocate annot_file",
1709 __FILE__, __LINE__);
1710 }
1711 annot_descr_file[0] = '\0';
1712
1713 if ((bp=getenv("TMP_DIR"))!=NULL) {
1714 strncpy(annot_bline_file,bp,sizeof(annot_bline_file));
1715 annot_bline_file[sizeof(annot_bline_file)-1] = '\0';
1716 SAFE_STRNCAT(annot_bline_file,"/",sizeof(annot_bline_file));
1717 }
1718
1719 SAFE_STRNCAT(annot_bline_file,"annot_bline_XXXXXX",sizeof(annot_bline_file));
1720 annot_bline_fd = mkstemp(annot_bline_file);
1721 strncpy(annot_descr_file,annot_bline_file,MAX_STR);
1722 annot_bline_file[sizeof(annot_bline_file)-1] = '\0';
1723 SAFE_STRNCAT(annot_descr_file,".annot",MAX_STR);
1724
1725 /* write out accessions to annot_bline_file */
1726 if ((annot_fd =fdopen(annot_bline_fd,"w"))==NULL) {
1727 fprintf(stderr,"*** error [%s:%d] - Cannot open annot_bline_file: %s\n",__FILE__, __LINE__, annot_bline_file);
1728 goto no_annots;
1729 }
1730
1731 for (i=0; i<nbest; i++) {
1732 if (bestp_arr[i]->mseq->annot_req_flag) { continue; }
1733 if ((strlen(bestp_arr[i]->mseq->bline) > DESCR_OFFSET) &&
1734 (bp=strchr(bestp_arr[i]->mseq->bline+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
1735 else {bp = NULL;}
1736 /* provide sequence length with offset, but only if offset is positive */
1737 l_offset = bestp_arr[i]->seq->l_offset+bestp_arr[i]->seq->l_off -1;
1738 if (l_offset < 0) { l_offset = 0;}
1739 fprintf(annot_fd,"%s\t%ld\n",bestp_arr[i]->mseq->bline,
1740 l_offset + bestp_arr[i]->seq->n1);
1741 if (bp != NULL) *bp=' ';
1742 bestp_arr[i]->mseq->annot_req_flag = 1;
1743 }
1744 fclose(annot_fd);
1745
1746 subs_env(annot_script, sname+1, sizeof(annot_script));
1747 annot_script[sizeof(annot_script)-1] = '\0';
1748 SAFE_STRNCAT(annot_script," ",sizeof(annot_script));
1749 SAFE_STRNCAT(annot_script,annot_bline_file,sizeof(annot_script));
1750 SAFE_STRNCAT(annot_script," >",sizeof(annot_script));
1751 SAFE_STRNCAT(annot_script,annot_descr_file,sizeof(annot_script));
1752
1753 /* run annot_script annot_bline_file > annot_descr_file */
1754 status = system(annot_script);
1755 if (!debug) {
1756 #ifdef UNIX
1757 unlink(annot_bline_file);
1758 #else
1759 _unlink(annot_bline_file);
1760 #endif
1761 }
1762
1763 if (status == NO_FILE_EXIT) { /* my specific return for no annots */
1764 goto no_annots;
1765 }
1766
1767 if (status < 0 || status == 127) {
1768 fprintf(stderr,"*** error [%s:%d] - script: %s failed\n",
1769 __FILE__, __LINE__, annot_script);
1770 goto no_annots;
1771 }
1772 }
1773 else if (sname[0] == '<') {
1774 annot_descr_file = sname+1;
1775 }
1776 else {
1777 fprintf(stderr,"*** error [%s:%d] - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
1778 goto no_annots;
1779 }
1780
1781 /* read annotation file */
1782
1783 if ((annot_fd=fopen(annot_descr_file,"r"))==NULL) {
1784 goto no_annots;
1785 }
1786
1787 /* be sure to ask for annotation once */
1788 for (i=0; i<nbest; i++) {
1789 bestp_arr[i]->mseq->annot_req_flag = 1;
1790 }
1791 /* we have some annotations */
1792 /* the annotation script MUST return the annotations ordered as in annot_descr_file,
1793 in "fasta" form:
1794
1795 >bline_descr
1796 pos<tab>label<tab>value?<tab>comment (which is not read in this version)
1797 1 *
1798 11 V N
1799 */
1800
1801 /* now read the annotation/variant file */
1802
1803 /* read #comments, =annot_defs at beginning of file */
1804 tmp_line[0] = '#';
1805 while (tmp_line[0] == '#' || tmp_line[0] == '=') {
1806 if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
1807 if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
1808 fprintf(stderr,"*** error [%s:%d] - premature annotation file end (%s)\n",
1809 __FILE__,__LINE__, annot_descr_file);
1810 goto no_annots;
1811 }
1812 }
1813
1814 /* set mtmp_annot to be initialized */
1815 mtmp_annot.tmp_arr_p = NULL;
1816 mtmp_annot.max_annot = 0;
1817
1818 annot_seq_cnt = 0;
1819
1820 /* now read in the annotations, but only the first time if asked multiple times */
1821 for (i=0; i<nbest; i++) {
1822 if (!bestp_arr[i]->mseq->annot_req_flag) {
1823 continue;
1824 }
1825 bestp_arr[i]->mseq->annot_req_flag = 0;
1826
1827 if ((bp=strchr(tmp_line,'\n'))!=NULL) *bp = '\0';
1828 if ((bp=strchr(tmp_line,'\t'))!=NULL) *bp = '\0';
1829 if (tmp_line[0] != '>' || strncmp(&tmp_line[1], bestp_arr[i]->mseq->bline, strlen(&tmp_line[1])) != 0) {
1830 fprintf(stderr,"*** error [%s:%d] - %s description mismatch (%s:%s)\n",
1831 __FILE__,__LINE__,annot_descr_file, tmp_line, bestp_arr[i]->mseq->bline);
1832 goto no_annots;
1833 }
1834
1835 annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), bestp_arr[i]->seq->annot_p, &mtmp_annot, m_msp, target);
1836
1837 if (annot_p) {
1838 bestp_arr[i]->seq->annot_p = annot_p;
1839 s_annot_to_aa1a(bestp_arr[i]->seq->l_offset + bestp_arr[i]->seq->l_off - 1,
1840 bestp_arr[i]->seq->n1, annot_p,m_msp->ann_arr, bestp_arr[i]->mseq->libstr);
1841 annot_seq_cnt++;
1842 mtmp_annot.tmp_arr_p = NULL;
1843 }
1844 else {
1845 if (bestp_arr[i]->seq->annot_p) {
1846 bestp_arr[i]->seq->annot_p->n_annot = 0;
1847 }
1848 }
1849 }
1850
1851 if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
1852
1853 fclose(annot_fd);
1854 if (sname[0]=='!') {
1855 if (!debug) {
1856 #ifdef UNIX
1857 unlink(annot_descr_file);
1858 #else
1859 _unlink(annot_descr_file);
1860 #endif
1861 }
1862 free(annot_descr_file);
1863 }
1864 return annot_seq_cnt;
1865
1866 no_annots:
1867 for (i=0; i<nbest; i++) {
1868 if (bestp_arr[i]->seq->annot_p) {
1869 if (bestp_arr[i]->seq->annot_p->n_annot > 0) {
1870 bestp_arr[i]->seq->annot_p->n_annot = 0;
1871 }
1872 }
1873 }
1874 if (sname[0] == '!') free(annot_descr_file);
1875 return -1;
1876 #endif
1877 }
1878
sort_annots(struct annot_entry ** s_annot,int n_annot)1879 void sort_annots(struct annot_entry **s_annot, int n_annot)
1880 {
1881 int gap, i, j, k;
1882 struct annot_entry *tmp;
1883 int v;
1884 int incs[6] = { 112, 48, 21, 7, 3, 1 };
1885
1886 for ( k = 0; k < 6; k++) {
1887 gap = incs[k];
1888 for (i=gap; i < n_annot; i++) {
1889 tmp = s_annot[i];
1890 j = i;
1891 v = s_annot[i]->pos;
1892 while ( j >= gap && s_annot[j-gap]->pos > v) {
1893 s_annot[j] = s_annot[j - gap];
1894 j -= gap;
1895 }
1896 s_annot[j] = tmp;
1897 }
1898 }
1899 }
1900
1901 struct annot_str *
next_annot_entry(FILE * annot_fd,char * tmp_line,int n_tmp_line,struct annot_str * annot_p,struct annot_mstr * mtmp_annot_p,struct mngmsg * m_msp,int target)1902 next_annot_entry(FILE *annot_fd, char *tmp_line, int n_tmp_line, struct annot_str *annot_p,
1903 struct annot_mstr *mtmp_annot_p, struct mngmsg *m_msp, int target) {
1904
1905 char ctmp_label, ctmp_value, tmp_comment[MAX_STR], annot_acc[MAX_STR];
1906 char *bp;
1907 int f_pos, f_end;
1908 int i_ann, l_doms, r_doms;
1909 int n_annot = 0;
1910
1911 struct annot_entry *tmp_ann_entry_arr, **s_tmp_ann_entry_arr;
1912
1913 SAFE_STRNCPY(annot_acc, tmp_line, sizeof(annot_acc));
1914
1915 if (init_tmp_annot(mtmp_annot_p, 32)==0) return NULL;
1916
1917 tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
1918
1919 /* read through each annotation in file */
1920 while (fgets(tmp_line, n_tmp_line, annot_fd)!=NULL ) {
1921 if (tmp_line[0] == '>') goto next_bline; /* start of new annotation */
1922 if (tmp_line[0] == '#') continue; /* ignore comments */
1923 if (tmp_line[0] == '=') { /* symbol definition */
1924 add_annot_def(m_msp, tmp_line+1,1);
1925 continue;
1926 }
1927
1928 if (n_annot >= mtmp_annot_p->max_annot - 1) {
1929 /* try to expand annotation array */
1930 if (update_tmp_annot(mtmp_annot_p)==0) {
1931 return NULL;
1932 }
1933 tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
1934 }
1935
1936 /* sscanf cannot give strings with blanks */
1937 /* sscanf(tmp_line,"%d %c %c %s", &f_pos, &ctmp_label, &ctmp_value, tmp_comment); */
1938 tmp_comment[0] = '\0';
1939 if ((bp=strchr(tmp_line,'\r')) || (bp=strchr(tmp_line,'\n'))) *bp='\0'; /* clean up EOL */
1940 if ((bp=strchr(tmp_line,'\t'))!=NULL) { /* fields MUST be tab delimited */
1941 f_pos=atoi(tmp_line) - 1; /* get first field -- f_pos, converted to 0-offset */
1942 ctmp_label = bp[1]; /* get second field -- ctmp_label */
1943 if ((bp=strchr(bp+1,'\t'))!=NULL) { /* next field could be f_end or ctmp_value */
1944 if (ctmp_label == '-') { f_end = atoi(bp+1) -1; ctmp_value = '\0';}
1945 else {ctmp_value = bp[1]; f_end = f_pos;} /* have variant, not coordinate */
1946 if ((bp=strchr(bp+1,'\t'))!=NULL) { /* if last <tab>, get comment */
1947 strncpy(tmp_comment,bp+1,sizeof(tmp_comment));
1948 }
1949 }
1950 }
1951 else { /* no tab */
1952 continue;
1953 }
1954
1955 tmp_ann_entry_arr[n_annot].pos = f_pos;
1956 tmp_ann_entry_arr[n_annot].end = f_end;
1957 tmp_ann_entry_arr[n_annot].label=ctmp_label;
1958 tmp_ann_entry_arr[n_annot].value=ctmp_value;
1959 tmp_ann_entry_arr[n_annot].comment = NULL;
1960 tmp_ann_entry_arr[n_annot].target = target;
1961 tmp_ann_entry_arr[n_annot].start = NULL;
1962
1963 if (tmp_comment[0]) {
1964 if ((tmp_ann_entry_arr[n_annot].comment=(char *)calloc(strlen(tmp_comment)+1,sizeof(char)))!=NULL) {
1965 strncpy(tmp_ann_entry_arr[n_annot].comment,tmp_comment,strlen(tmp_comment));
1966 }
1967 }
1968 if (ctmp_label== 'V') {
1969 /* map the .value from ascii to encoded residue */
1970 /* this must be lascii, not qascii, because the script
1971 describes the library, not the query, and for FASTX/TFASTX,
1972 those are different */
1973 tmp_ann_entry_arr[n_annot].value = lascii[tmp_ann_entry_arr[n_annot].value];
1974 }
1975 else if (ctmp_label == '[') {
1976 i_ann = add_annot_char(m_msp->ann_arr, ctmp_label);
1977 if (i_ann > 0) {
1978 qascii[ctmp_label] = NANN + i_ann;
1979 m_msp->ann_arr_def[i_ann] = NULL;
1980 }
1981 }
1982 else if (ctmp_label == ']') {
1983 i_ann = add_annot_char(m_msp->ann_arr, ctmp_label);
1984 if (i_ann > 0) {
1985 qascii[ctmp_label] = NANN + i_ann;
1986 m_msp->ann_arr_def[i_ann] = NULL;
1987 }
1988 }
1989 else if (ctmp_label == '-') { /* if ctmp_label == '-', have f_end, which must be added with ']' */
1990 /* make sure start/stop characters are in annotation alphabet */
1991 i_ann = add_annot_char(m_msp->ann_arr, '[');
1992 if (i_ann > 0) {
1993 qascii['['] = NANN + i_ann;
1994 m_msp->ann_arr_def[i_ann] = NULL;
1995 }
1996
1997 tmp_ann_entry_arr[n_annot].label='[';
1998 n_annot++;
1999
2000 if (n_annot >= mtmp_annot_p->max_annot - 1) {
2001 if (update_tmp_annot(mtmp_annot_p)==0) {
2002 return NULL;
2003 }
2004 tmp_ann_entry_arr = mtmp_annot_p->tmp_arr_p;
2005 }
2006
2007 /* only required for start - stop annotations; requires sort
2008 later (which is not currently used */
2009 tmp_ann_entry_arr[n_annot].pos = f_end;
2010 tmp_ann_entry_arr[n_annot].end = f_end;
2011 tmp_ann_entry_arr[n_annot].label=']';
2012 tmp_ann_entry_arr[n_annot].value=ctmp_value;
2013 tmp_ann_entry_arr[n_annot].comment = NULL;
2014 tmp_ann_entry_arr[n_annot].target = target;
2015 tmp_ann_entry_arr[n_annot].start = &tmp_ann_entry_arr[n_annot-1];
2016
2017 i_ann = add_annot_char(m_msp->ann_arr, ']');
2018 if (i_ann > 0) {
2019 qascii[']'] = NANN + i_ann;
2020 m_msp->ann_arr_def[i_ann] = NULL;
2021 }
2022 }
2023 else if ((i_ann = add_annot_char(m_msp->ann_arr, ctmp_label)) > 0) {
2024 m_msp->ann_arr_def[i_ann] = NULL;
2025 qascii[ctmp_label] = NANN + i_ann;
2026 }
2027 n_annot++;
2028 }
2029
2030 next_bline:
2031 if (n_annot) { /* if we have annotations, save them and set tmp_ann_entry_arr = NULL */
2032 tmp_ann_entry_arr = (struct annot_entry *)realloc(tmp_ann_entry_arr, (n_annot+1)*sizeof(struct annot_entry));
2033
2034 if ((s_tmp_ann_entry_arr = (struct annot_entry **)calloc((n_annot+1),sizeof(struct annot_entry *)))==NULL) {
2035 fprintf(stderr,"*** error [%s:%d] - cannot alloc s_tmp_ann_entry_arr[%d]",
2036 __FILE__,__LINE__, n_annot+1);
2037 exit(1);
2038 }
2039
2040 /* pair every domain start/stop */
2041 /* (1) count number of domains/check for consistency */
2042 l_doms = r_doms = 0;
2043 for (i_ann=0; i_ann < n_annot; i_ann++) {
2044 if (tmp_ann_entry_arr[i_ann].label == '[') l_doms++;
2045 if (tmp_ann_entry_arr[i_ann].label == ']') r_doms++;
2046 }
2047 if (l_doms != r_doms) {
2048 fprintf(stderr,"*** error [%s:%d] - unpaired domains: %s %d != %d\n",
2049 __FILE__,__LINE__, annot_acc, l_doms, r_doms);
2050 #ifdef DEBUG
2051 for (i_ann=0; i_ann < n_annot; i_ann++) {
2052 if (tmp_ann_entry_arr[i_ann].label == '[')
2053 fprintf(stderr, "%ld %c %s\n",tmp_ann_entry_arr[i_ann].pos,tmp_ann_entry_arr[i_ann].label,tmp_ann_entry_arr[i_ann].comment);
2054 if (tmp_ann_entry_arr[i_ann].label == ']')
2055 fprintf(stderr, "%ld %c\n",tmp_ann_entry_arr[i_ann].pos,tmp_ann_entry_arr[i_ann].label);
2056 }
2057 #endif
2058 }
2059 else if (l_doms > 0) {
2060 if ((tmp_domain_entry_arr = (struct annot_entry *)calloc((l_doms+1),sizeof(struct annot_entry)))==NULL) {
2061 fprintf(stderr,"*** error [%s:%d] - cannot alloc s_tmp_ann_entry_arr[%d]",
2062 __FILE__,__LINE__, l_doms+1);
2063 }
2064 else {
2065 l_doms = 0;
2066 for (i_ann=0; i_ann < n_annot+1; i_ann++) {
2067 if (tmp_ann_entry_arr[i_ann].label == '[') {
2068 tmp_domain_entry_arr[l_doms].pos = tmp_ann_entry_arr[i_ann].pos;
2069 tmp_domain_entry_arr[l_doms].label = '-';
2070 tmp_domain_entry_arr[l_doms].comment = tmp_ann_entry_arr[i_ann].comment;
2071 }
2072 else if (tmp_ann_entry_arr[i_ann].label == ']') {
2073 tmp_domain_entry_arr[l_doms].end = tmp_ann_entry_arr[i_ann].pos;
2074 l_doms++;
2075 }
2076 }
2077 }
2078 }
2079
2080 for (i_ann=0; i_ann < n_annot+1; i_ann++) {
2081 s_tmp_ann_entry_arr[i_ann] = &tmp_ann_entry_arr[i_ann];
2082 }
2083
2084 sort_annots(s_tmp_ann_entry_arr,n_annot);
2085
2086 /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
2087 if (annot_p || (annot_p = calloc(1,sizeof(struct annot_str)))!=NULL) {
2088 annot_p->annot_arr_p = tmp_ann_entry_arr;
2089 annot_p->s_annot_arr_p = s_tmp_ann_entry_arr;
2090 annot_p->n_annot = n_annot;
2091 annot_p->n_domains = l_doms;
2092 /* set to NULL to re-initialize */
2093 }
2094 }
2095 else {
2096 annot_p = NULL;
2097 }
2098 return annot_p;
2099 }
2100
2101
2102 /* **************************************************************** */
2103 /* add_annot_char(ann_arr, ctmp_label) --
2104
2105 (1) add annotation character to ann_arr if not present
2106 (2) return i_ann if added
2107 */
2108 /* **************************************************************** */
2109
2110 int
add_annot_char(unsigned char * ann_arr,char ctmp_label)2111 add_annot_char(unsigned char *ann_arr, char ctmp_label) {
2112 int i_ann;
2113
2114 if (ann_arr[0] == '\0') {
2115 ann_arr[0] = ' '; ann_arr[1] = '\0';
2116 }
2117
2118 /* check to see if already there? */
2119 if (strchr((char *)ann_arr,ctmp_label)==NULL) {
2120 /* check for room for another character */
2121 if (strlen((char *)ann_arr) >= MAX_FN) {
2122 fprintf(stderr,"*** error [%s:%d] - too many annotation characters: len(%s) + %c > %d\n",
2123 __FILE__, __LINE__, ann_arr, ctmp_label, MAX_FN-1);
2124 return 0;
2125 }
2126 else {
2127 ann_arr[i_ann=strlen((char *)ann_arr)] = ctmp_label; /* add the character */
2128 ann_arr[i_ann+1] = '\0'; /* guarantee null termination */
2129 return i_ann;
2130 }
2131 }
2132 else {
2133 return 0;
2134 }
2135 }
2136
2137 /* **************************************************************** */
2138 /* get_annot -- produces fasta file from m_msp->sname script
2139 (modified 20-Sept-2012 to not use intermediate file)
2140
2141 # (1) generate a temporary file name
2142 # (2) write out one bline (or portion that include accession)
2143 # (3) run sname[] script against temporary file, producing table of annotations
2144 (1) run script bline_id
2145 (4) read in the annotations and put them in struct annot_entry;
2146 (5) modify *annot_p to point to structure
2147 (6) return number of annotations
2148 */
2149 /* **************************************************************** */
2150
2151 int
get_annot(char * sname,struct mngmsg * m_msp,char * bline,long offset,int n1,struct annot_str ** annot_p,int target,int debug)2152 get_annot(char *sname, struct mngmsg *m_msp, char *bline, long offset, int n1, struct annot_str **annot_p,
2153 int target, int debug) {
2154
2155 char tmp_line[MAX_STR];
2156 FILE *annot_data_fd;
2157 char bline_descr[MAX_STR];
2158 char annot_data_file[MAX_LSTR];
2159 char annot_script[MAX_LSTR];
2160 long q_offset;
2161
2162 char *bp;
2163 FILE *annot_fd=NULL; /* file for annot accessions */
2164 struct annot_mstr mtmp_annot;
2165
2166 #ifndef UNIX
2167 return 0;
2168 #else
2169
2170 if (sname[0] == '!') {
2171 /* popen implementation */
2172
2173 annot_data_file[0] = '\0';
2174
2175 if (bline[0] == '>') {
2176 SAFE_STRNCPY(bline_descr, bline+1,sizeof(bline_descr));
2177 }
2178 else {
2179 SAFE_STRNCPY(bline_descr, bline,sizeof(bline_descr));
2180 }
2181 if ((strlen(bline_descr) > DESCR_OFFSET) &&
2182 (bp=strchr(bline_descr+DESCR_OFFSET,' '))!=NULL) {*bp = '\0';}
2183 else {bp = NULL;}
2184
2185 q_offset = m_msp->q_offset + m_msp->q_off - 1;
2186 if (q_offset < 0) { q_offset = 0;}
2187 sprintf(annot_script,"%s \"%s\" %ld",sname+1, bline_descr,q_offset+m_msp->n0);
2188 annot_script[sizeof(annot_script)-1] = '\0';
2189
2190 annot_fd = popen(annot_script,"r");
2191 }
2192 else if (sname[0] == '<') {
2193 SAFE_STRNCPY(annot_data_file,sname+1,sizeof(annot_data_file));
2194 annot_fd=fopen(annot_data_file,"r");
2195 }
2196 else {
2197 fprintf(stderr,"*** error [%s:%d] - %s not script (!) or file (<)\n",__FILE__, __LINE__, sname);
2198 goto no_annots;
2199 }
2200
2201 if (!annot_fd) {
2202 goto no_annots;
2203 }
2204 else { /* read the annotations into the array */
2205
2206 /* read #comments, =annot_defs at beginning of file */
2207 tmp_line[0] = '#';
2208 while (tmp_line[0] == '#' || tmp_line[0] == '=') {
2209 if (tmp_line[0] == '=') add_annot_def(m_msp, tmp_line+1,1);
2210 if (fgets(tmp_line, sizeof(tmp_line), annot_fd)==NULL) {
2211 fprintf(stderr,"*** error [%s:%d] - premature annotation file end (%s)\n",
2212 __FILE__,__LINE__, annot_data_file);
2213 goto no_annots;
2214 }
2215 }
2216
2217 /* set mtmp_annot to be initialized */
2218 mtmp_annot.tmp_arr_p = NULL;
2219 mtmp_annot.max_annot = 0;
2220
2221 /* strlen(&tmp_line[1])-1 to remove '>' and beginning and '\n' at end */
2222 if (tmp_line[0] != '>') {
2223 fprintf(stderr,"*** error [%s:%d] - no %s description: [%s]\n",
2224 __FILE__,__LINE__,annot_data_file, tmp_line);
2225 goto no_annots;
2226 }
2227
2228 *annot_p = next_annot_entry(annot_fd, tmp_line, sizeof(tmp_line), *annot_p, &mtmp_annot, m_msp, target);
2229
2230 if (sname[0] == '!') {
2231 pclose(annot_fd);
2232 }
2233 else {
2234 fclose(annot_fd);
2235 }
2236
2237 /* now allocate an annot_p if necessary, and link tmp_ann_entry_arr to it */
2238 if (*annot_p) {
2239 s_annot_to_aa1a(offset, n1, (*annot_p),m_msp->ann_arr,"get_annot");
2240 return (*annot_p)->n_annot;
2241 }
2242 else {
2243 if (mtmp_annot.tmp_arr_p) free(mtmp_annot.tmp_arr_p);
2244 return 0;
2245 }
2246 }
2247
2248 no_annots:
2249 return -1;
2250 #endif
2251 }
2252
2253 /* s_annot_to_aa1a -- takes an annot_entry[] and converts it to an *aa1_ann
2254 */
2255 void
s_annot_to_aa1a(long offset,int n1,struct annot_str * annot_p,unsigned char * ann_arr,char * tmp_line)2256 s_annot_to_aa1a(long offset, int n1, struct annot_str *annot_p, unsigned char *ann_arr, char *tmp_line) {
2257 unsigned char *aa1a_tmp;
2258 int i, ic, n_annot;
2259 struct annot_entry *this_annot;
2260 char *bp;
2261
2262 if ((aa1a_tmp = (unsigned char *)calloc(n1+2,sizeof(char)))==NULL) {
2263 fprintf(stderr,"*** error [%s:%d] - cannot allocate aa1a_ann[%d] array\n",
2264 __FILE__, __LINE__, n1);
2265 return;
2266 }
2267
2268 if (offset < 0) offset++;
2269
2270 for (i=0; i < annot_p->n_annot; i++) {
2271 this_annot = &annot_p->annot_arr_p[i];
2272 /* skip VAR labels */
2273 if (this_annot->label == 'V') { continue; }
2274 if (this_annot->label == '-') {
2275 aa1a_tmp[this_annot->pos]=qascii['['] - NANN;
2276 aa1a_tmp[this_annot->end]=qascii[']'] - NANN;
2277 continue;
2278 }
2279 if (strchr((char *)ann_arr, this_annot->label)==NULL) {continue;}
2280 if (this_annot->pos - offset < n1) {
2281 if (this_annot->pos >= offset) { /* not an error, but annotation must be in range */
2282 aa1a_tmp[this_annot->pos - offset]=qascii[this_annot->label] - NANN;
2283 }
2284 }
2285 else {
2286 fprintf(stderr, "*** error [%s:%d] - this_annot->pos:[%ld - %ld] out of range: %d : %s\n",
2287 __FILE__, __LINE__, this_annot->pos,offset, n1, tmp_line);
2288 }
2289 }
2290 annot_p->aa1_ann = aa1a_tmp;
2291 }
2292
2293 /* save_best captures much of the complexity of saving the best scores
2294 and appropriately sampling the scores for statistical analysis. It
2295 does the following:
2296
2297 (1) update s_info counts for functions like fasta/x/y that don't
2298 optimize every score
2299
2300 (2) for every result in the buffer:
2301 (a) decide if it should be used for statistical sampling
2302 (b) if the number of samples > MAX_STATS, then run
2303 process_hist() and update all the zscores
2304 (c) reset everything for next sequence
2305
2306 (3) must ensure that -BIGNUM are never in best[]
2307
2308 */
2309
2310 #include "thr_buf_structs.h"
2311 #ifndef PCOMPLIB
2312 #define RESULTS_BUF reader_buf
2313 #define XTERNAL
2314 #include "thr_bufs2.h"
2315 #else
2316 #define RESULTS_BUF worker_buf
2317 #include "pcomp_bufs.h"
2318 #endif
2319
2320 extern char *prog_func; /* function label */
2321 extern int fa_max_workers;
2322 extern struct buf_head *lib_buf2_list;
2323 #ifdef DEBUG
2324 void check_rbuf(struct buf_head *cur_buf);
2325 #endif
2326 extern void get_rbuf(struct buf_head **lib_buf, int max_work_buf);
2327 extern void put_rbuf(struct buf_head *lib_buf, int max_work_buf);
2328 extern void wait_rbuf(int max_work_buf);
2329 extern void rbuf_done(int nthreads);
2330 extern void put_rbuf_done(int nthreads, struct buf_head *lib_buf,
2331 int max_work_buf);
2332 extern int
2333 process_hist(struct stat_str *sptr, int nstats,
2334 const struct mngmsg *m_msg,
2335 struct pstruct *ppst,
2336 struct hist_str *hist, void **pstat_void, struct score_count_s *s_info, int do_hist);
2337
2338 extern void addhistz(double, struct hist_str *); /* scaleswn.c */
2339 void selectbestz(struct beststr **, int, int );
2340 extern double find_z(int score, double escore, int length, double comp, void *);
2341 extern double zs_to_E(double zs,int n1, int dnaseq, long entries, struct db_str db);
2342 extern struct beststr **bestp_arr; /* array of pointers */
2343 extern int nbest;
2344 extern int nstats, nqstats, nrstats, pre_nstats, kstats, shuff_tot, sstats;
2345 extern double zbestcut; /* cut off for best z-score */
2346 extern int bestfull; /* index for selectbest() */
2347 extern int stats_done; /* flag for z-value processing */
2348 extern void *rand_state;
2349 extern struct stat_str *stats; /* array of scores for statistics from real
2350 (or shuffled) sequences*/
2351 extern struct stat_str *qstats; /* array of scores for shuffled query stats */
2352 extern struct stat_str *rstats; /* array of scores from shuffled library */
2353
2354 /* in the current version (fasta_35_01) save_best is used by both
2355 threaded and unthreaded versions */
2356
2357 #define COPY_RST_P(d,s) \
2358 { d->rst.score[0] = s->rst.score[0]; \
2359 d->rst.score[1] = s->rst.score[1]; \
2360 d->rst.score[2] = s->rst.score[2]; \
2361 d->rst.valid_stat = s->rst.valid_stat; \
2362 d->rst.comp = s->rst.comp; \
2363 d->rst.H = s->rst.H; \
2364 d->rst.escore = s->rst.escore; \
2365 d->rst.segnum = s->rst.segnum; \
2366 d->rst.seglen = s->rst.seglen; \
2367 }
2368
2369 void
save_best(struct buf_head * lib_bhead_p,const struct mngmsg * m_msp,struct pstruct * ppst,struct db_str * ldb,FILE * fdata,struct hist_str * histp,void ** pstat_voidp,struct score_count_s * s_info)2370 save_best(struct buf_head *lib_bhead_p,
2371 const struct mngmsg *m_msp, struct pstruct *ppst,
2372 struct db_str *ldb, FILE *fdata,
2373 struct hist_str *histp, void **pstat_voidp,
2374 struct score_count_s *s_info)
2375 {
2376 double zscore;
2377 int i_score;
2378 struct beststr *bbp;
2379 struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
2380 struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
2381 int i, t_best, t_rbest, t_qrbest, tm_best, t_n1, sc_ix;
2382 int t_valid_stat, tr_valid_stat, use_shuff, zsflag_save;
2383 double e_score, tm_escore, t_rescore, t_qrescore;
2384 int buf2_cnt;
2385
2386 if (!lib_bhead_p->hdr.have_results) return;
2387 if ((buf2_cnt=lib_bhead_p->hdr.buf2_cnt) <= 0) return;
2388 lib_buf2_dp = lib_bhead_p->buf2_data;
2389 lib_buf2_rp = lib_bhead_p->buf2_res;
2390
2391 shuff_tot += lib_bhead_p->hdr.shuff_cnt;
2392 s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
2393 s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
2394 s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
2395 s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
2396
2397 sc_ix = ppst->score_ix;
2398
2399 t_best = t_rbest = t_qrbest = -BIGNUM;
2400 tm_escore = t_rescore = t_qrescore = FLT_MAX;
2401 t_valid_stat = tr_valid_stat = 0;
2402 if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;}
2403 else { use_shuff = 0;}
2404
2405 #ifdef DEBUG
2406 if (fdata) {
2407 fprintf(fdata,">save_best: %d\n",buf2_cnt);
2408 }
2409 #endif
2410
2411 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2412 while (buf2_cnt--) { /* count down the number of results */
2413 rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
2414 rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
2415
2416 /* perhaps should use explicit flag to indicate no score */
2417 if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
2418
2419 /* i_score: current raw sorting score */
2420 i_score = rbuf_rp->rst.score[sc_ix];
2421 /* e_score, current escore */
2422 e_score = rbuf_rp->rst.escore;
2423
2424 /* this should be done in the thread, and a sorted set of indexes
2425 should be produced by the thread, so we just go down the list
2426 to the zscore threshold */
2427 zscore = (double)i_score;
2428 if (stats_done) {
2429 zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
2430 *pstat_voidp);
2431 }
2432
2433 /* we have complex logic to decide:
2434 (a) for multiframe results, which is the best
2435 (b) information about valid stats
2436 we should simply return a stats array where all this is figured
2437 out in the thread.
2438 */
2439 t_n1 = rbuf_dp->seq->n1;
2440 if (i_score > t_best) tm_best = t_best = i_score;
2441 if (e_score < tm_escore) tm_escore = e_score;
2442 if (rbuf_rp->rst.valid_stat > t_valid_stat) {
2443 t_valid_stat = 1;
2444 }
2445
2446 /* this stuff happens only for fasts/fastm/fastf
2447 again, the t_qrbest stuff should be done in the thread
2448 rather than check for every hit, run through the loop
2449 only if necessary.
2450 */
2451 if (m_msp->qshuffle) {
2452 if (rbuf_rp->qr_score > t_qrbest)
2453 t_qrbest = rbuf_rp->qr_score;
2454 if (rbuf_rp->qr_escore < t_qrescore)
2455 t_qrescore = rbuf_rp->qr_escore;
2456
2457 if (rbuf_dp->frame == m_msp->nitt1 && t_qrbest > 0 && nqstats < m_msp->shuff_max) {
2458 qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
2459 qstats[nqstats].comp = rbuf_rp->rst.comp;
2460 qstats[nqstats].H = rbuf_rp->rst.H;
2461 qstats[nqstats].escore = t_qrescore;
2462 qstats[nqstats++].score = t_qrbest;
2463 t_qrbest = -BIGNUM; /* reset t_qrbest, t_qrescore */
2464 t_qrescore = FLT_MAX;
2465 }
2466 } /* m_msp->qshuffle */
2467
2468 if (use_shuff) {
2469 /* this check is required because some sequences scheduled to be
2470 used for statistics may not in fact be returning a score (if
2471 they are outside the -M range, for example.
2472 */
2473 if (rbuf_rp->r_rst.score[0] == -BIGNUM) { tr_valid_stat = 0; }
2474 if (rbuf_rp->r_rst.valid_stat > tr_valid_stat) {
2475 tr_valid_stat = 1;
2476 }
2477 if (rbuf_rp->r_rst.score[sc_ix] > t_rbest) {
2478 t_rbest = rbuf_rp->r_rst.score[sc_ix];
2479 t_rescore = rbuf_rp->r_rst.escore;
2480 }
2481 }
2482
2483 /* need to look for frame 0 if TFASTA, then save stats at frame 6 */
2484 if (fdata) {
2485 fprintf(fdata,
2486 "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
2487 rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
2488 rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
2489 t_valid_stat, rbuf_rp->rst.alg_info,
2490 (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
2491 (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
2492 (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
2493 tr_valid_stat, rbuf_rp->r_rst.alg_info,
2494 rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
2495 }
2496
2497 /* statistics done for best score of set */
2498
2499 if (rbuf_dp->frame == m_msp->nitt1) {
2500 ldb->entries++;
2501 ldb->length += t_n1;
2502 if (ldb->length > LONG_MAX) {
2503 ldb->length -= LONG_MAX; ldb->carry++;
2504 }
2505 }
2506
2507 if (rbuf_dp->frame == m_msp->nitt1 && ppst->zsflag >= 0) {
2508 /* if this sample should be used for statistics */
2509 if (use_shuff) t_valid_stat = tr_valid_stat;
2510 if (t_valid_stat) {
2511 /* we've got our initial MAX_STATS values */
2512 if (nstats >= MAX_STATS) {
2513 if (!stats_done) {
2514 zsflag_save = ppst->zsflag;
2515 if (ppst->zsflag > 20) {
2516 ppst->zsflag -= 20;
2517 }
2518 ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
2519 histp, pstat_voidp,s_info, 0);
2520 ppst->zsflag = zsflag_save;
2521 kstats = nstats;
2522 if (ppst->zsflag >= 0) { /* this is redundant, but rare */
2523 stats_done = 1;
2524 for (i=0; i< nstats; i++) {
2525 bestp_arr[i]->zscore =
2526 find_z(bestp_arr[i]->rst.score[ppst->score_ix],
2527 bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
2528 bestp_arr[i]->rst.comp, *pstat_voidp);
2529 }
2530 }
2531 }
2532 }
2533 else {
2534 /* this logic allows stats_idx to be over-ruled for searches
2535 where every query does not generate a score */
2536 rbuf_dp->stats_idx = nstats;
2537 nstats++;
2538 }
2539 }
2540
2541 if (rbuf_dp->stats_idx >= 0 && t_valid_stat) {
2542 if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
2543 fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
2544 __FILE__, __LINE__,
2545 rbuf_dp->stats_idx, nstats,MAX_STATS);
2546 }
2547 else { /* stats_idx is in range */
2548 sstats++;
2549 stats[rbuf_dp->stats_idx].n1 = t_n1;
2550 stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
2551 stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
2552 if (use_shuff) { /* use shuffled score */
2553 stats[rbuf_dp->stats_idx].escore = t_rescore;
2554 stats[rbuf_dp->stats_idx].score = t_rbest;
2555 }
2556 else { /* real score, not shuffled */
2557 stats[rbuf_dp->stats_idx].escore = tm_escore;
2558 stats[rbuf_dp->stats_idx].score = tm_best;
2559 }
2560 } /* end stats_idx in range */
2561 } /* end have valid stats_idx */
2562
2563 if (t_valid_stat && stats_done && histp) {
2564 addhistz(find_z(t_best, tm_escore, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
2565 *pstat_voidp), histp);
2566 }
2567 /* reset best scores */
2568 t_best = t_rbest = -BIGNUM;
2569 tm_escore = t_rescore = FLT_MAX;
2570 t_valid_stat = tr_valid_stat = 0;
2571 }
2572
2573 /*
2574 if (rbuf_rp->rst.score[ppst->score_ix] > 200) {
2575 fprintf(stderr, "high score[%d]: %s %d: %d\n", rbuf_dp->seq->index,
2576 rbuf_dp->mseq->libstr, rbuf_dp->seq->n1, rbuf_rp->rst.score[ppst->score_ix]);
2577 }
2578 */
2579
2580 if (zscore > zbestcut) {
2581 if (nbest >= MAX_BEST) {
2582 bestfull = nbest-MAX_BEST/4;
2583 selectbestz(bestp_arr,bestfull-1,nbest);
2584 zbestcut = bestp_arr[bestfull-1]->zscore;
2585 nbest = bestfull;
2586 }
2587 bbp = bestp_arr[nbest++];
2588
2589 COPY_RST_P(bbp, rbuf_rp);
2590
2591 bbp->seq = rbuf_dp->seq;
2592 bbp->mseq = rbuf_dp->mseq;
2593 bbp->n1 = rbuf_dp->seq->n1;
2594 #ifdef DEBUG
2595 bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
2596 #endif
2597 /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
2598 if (rbuf_dp->best_save) {
2599 /* a previous rbuf_dp->seq is in best_str at best_save */
2600 if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
2601 /* the best_save->seq matches the rbuf_dp->seq */
2602 bbp->bbp_link = rbuf_dp->best_save;
2603 /* bbp_link tells where this ->seq can be found */
2604 }
2605 else {
2606 bbp->bbp_link = NULL;
2607 }
2608 }
2609 rbuf_dp->best_save = bbp;
2610 lib_bhead_p->hdr.have_best_save = 1;
2611 bbp->zscore = zscore;
2612 bbp->frame = rbuf_dp->frame;
2613 }
2614 }
2615 }
2616
2617 void
save_best2(struct buf_head * lib_bhead_p,const struct mngmsg * m_msp,struct pstruct * ppst,struct db_str * ldb,FILE * fdata,struct hist_str * histp,void ** pstat_voidp,struct score_count_s * s_info)2618 save_best2(struct buf_head *lib_bhead_p,
2619 const struct mngmsg *m_msp, struct pstruct *ppst,
2620 struct db_str *ldb, FILE *fdata,
2621 struct hist_str *histp, void **pstat_voidp,
2622 struct score_count_s *s_info)
2623 {
2624 double zscore;
2625 int i_score;
2626 struct beststr *bbp;
2627 struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
2628 struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
2629 int i, sc_ix;
2630 int t_valid_stat, use_shuff, zsflag_save;
2631 double e_score;
2632 int buf2_cnt;
2633
2634 if (!lib_bhead_p->hdr.have_results) return;
2635 if ((buf2_cnt = lib_bhead_p->hdr.buf2_cnt) <= 0) return;
2636
2637 /*
2638 #ifdef DEBUG
2639 fprintf(stderr," save_best2: lib_bhead_p->buf2_data[0]->mseq->index/lseek: %d,%lld\n",
2640 lib_bhead_p->buf2_data[0].mseq->index,lib_bhead_p->buf2_data[0].mseq->lseek);
2641 #endif
2642 */
2643 if (ppst->zsflag >= 10 && ppst->zsflag < 20) { use_shuff = 1;}
2644 else {use_shuff = 0;}
2645
2646 shuff_tot += lib_bhead_p->hdr.shuff_cnt;
2647 s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
2648 s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
2649 s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
2650 s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;;
2651 sc_ix = ppst->score_ix;
2652
2653 /* save the raw data if requested */
2654 if (fdata) {
2655 #ifdef DEBUG
2656 fprintf(fdata,">save_best: %d\n",buf2_cnt);
2657 #endif
2658 lib_buf2_dp = lib_bhead_p->buf2_data;
2659 lib_buf2_rp = lib_bhead_p->buf2_res;
2660 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2661 while (buf2_cnt--) { /* count down the number of results */
2662
2663 rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
2664 rbuf_dp = lib_buf2_dp++; /* step through the data buffer */
2665
2666 /* perhaps should use explicit flag to indicate no score */
2667 if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
2668
2669 fprintf(fdata,
2670 "%-12s %6d %d %.5f %.5f %4d %4d %4d %2d %2d %4d %4d %4d %2d %2d %5d %8lld\n",
2671 rbuf_dp->mseq->libstr, rbuf_dp->seq->n1,rbuf_dp->frame,rbuf_rp->rst.comp,rbuf_rp->rst.H,
2672 rbuf_rp->rst.score[0],rbuf_rp->rst.score[1],rbuf_rp->rst.score[2],
2673 rbuf_rp->is_valid_stat, rbuf_rp->rst.alg_info,
2674 (rbuf_rp->r_rst.score[0]<0 ? -1 : rbuf_rp->r_rst.score[0]),
2675 (rbuf_rp->r_rst.score[1]<0 ? -1 : rbuf_rp->r_rst.score[1]),
2676 (rbuf_rp->r_rst.score[2]<0 ? -1 : rbuf_rp->r_rst.score[2]),
2677 rbuf_rp->is_valid_stat, rbuf_rp->r_rst.alg_info,
2678 rbuf_dp->stats_idx, rbuf_dp->mseq->lseek);
2679 }
2680 }
2681
2682 /* save the high-scoring data */
2683 lib_buf2_rp = lib_bhead_p->buf2_res;
2684 lib_buf2_dp = lib_bhead_p->buf2_data;
2685 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2686 while (buf2_cnt--) {
2687 rbuf_rp = lib_buf2_rp++;
2688 rbuf_dp = lib_buf2_dp++;
2689
2690 /* perhaps should use explicit flag to indicate no score */
2691 if (rbuf_rp->rst.score[0] == -BIGNUM) continue;
2692
2693 /* i_score: current raw sorting score */
2694 i_score = rbuf_rp->rst.score[sc_ix];
2695 /* e_score, current escore */
2696 e_score = rbuf_rp->rst.escore;
2697 /* this should be done in the thread, and a sorted set of indexes
2698 should be produced by the thread, so we just go down the list
2699 to the zscore threshold */
2700 zscore = (double)i_score;
2701 if (stats_done) {
2702 zscore=find_z(i_score, e_score, rbuf_dp->seq->n1,(double)rbuf_rp->rst.comp,
2703 *pstat_voidp);
2704 }
2705
2706 if (rbuf_dp->frame == m_msp->nitt1) {
2707 ldb->entries++;
2708 ldb->length += rbuf_dp->seq->n1;
2709 if (ldb->length > LONG_MAX) {
2710 ldb->length -= LONG_MAX; ldb->carry++;
2711 }
2712 }
2713
2714 if (zscore > zbestcut) {
2715 if (nbest >= MAX_BEST) {
2716 bestfull = nbest-MAX_BEST/4;
2717 selectbestz(bestp_arr,bestfull-1,nbest);
2718 zbestcut = bestp_arr[bestfull-1]->zscore;
2719 nbest = bestfull;
2720 }
2721 bbp = bestp_arr[nbest++];
2722
2723 COPY_RST_P(bbp, rbuf_rp);
2724
2725 bbp->seq = rbuf_dp->seq;
2726 bbp->mseq = rbuf_dp->mseq;
2727 bbp->n1 = rbuf_dp->seq->n1;
2728 #ifdef DEBUG
2729 bbp->adler32_crc = rbuf_dp->seq->adler32_crc;
2730 #endif
2731 /* rbuf_dp->best_save is set after a rbuf_dp is entered into best_str */
2732 if (rbuf_dp->best_save) {
2733 /* a previous rbuf_dp->seq is in best_str at best_save */
2734 if (rbuf_dp->best_save->seq == rbuf_dp->seq) {
2735 /* the best_save->seq matches the rbuf_dp->seq */
2736 bbp->bbp_link = rbuf_dp->best_save;
2737 /* bbp_link tells where this ->seq can be found */
2738 }
2739 else {
2740 bbp->bbp_link = NULL;
2741 }
2742 }
2743 rbuf_dp->best_save = bbp;
2744 lib_bhead_p->hdr.have_best_save = 1;
2745 bbp->zscore = zscore;
2746 bbp->frame = rbuf_dp->frame;
2747 }
2748 }
2749
2750 /* process results for statistics */
2751 lib_buf2_dp = lib_bhead_p->buf2_data;
2752 lib_buf2_rp = lib_bhead_p->buf2_res;
2753 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2754 while (buf2_cnt--) { /* count down the number of results */
2755 rbuf_dp = lib_buf2_dp++; /* step through the results buffer */
2756 rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
2757
2758 if (!rbuf_rp->is_valid_stat) { continue;}
2759
2760
2761 if (use_shuff) {
2762 i_score = rbuf_rp->r_rst.score[sc_ix];
2763 e_score = rbuf_rp->r_rst.escore;
2764 }
2765 else {
2766 i_score = rbuf_rp->rst.score[sc_ix];
2767 e_score = rbuf_rp->rst.escore;
2768 }
2769
2770 if (rbuf_dp->stats_idx >= MAX_STATS || nstats > MAX_STATS) {
2771 fprintf(stderr, "*** error [%s:%d] - nstats index [%d] out of range [%d,%d]\n",
2772 __FILE__, __LINE__,
2773 rbuf_dp->stats_idx, nstats,MAX_STATS);
2774 continue;
2775 }
2776
2777 if (nstats < MAX_STATS) {
2778 /* this logic allows stats_idx to be over-ruled for searches
2779 where every query does not generate a score */
2780 rbuf_dp->stats_idx = nstats;
2781 nstats++;
2782 }
2783
2784 if (stats_done && histp) {
2785 addhistz(find_z(i_score, e_score, rbuf_dp->seq->n1, (double) rbuf_rp->rst.comp,
2786 *pstat_voidp), histp);
2787 }
2788
2789 if (rbuf_dp->stats_idx < 0) {
2790 continue;
2791 }
2792
2793 sstats++;
2794 stats[rbuf_dp->stats_idx].n1 = rbuf_dp->seq->n1;
2795 stats[rbuf_dp->stats_idx].comp = rbuf_rp->rst.comp;
2796 stats[rbuf_dp->stats_idx].H = rbuf_rp->rst.H;
2797 stats[rbuf_dp->stats_idx].escore = e_score;
2798 stats[rbuf_dp->stats_idx].score = i_score;
2799 }
2800
2801
2802 /* fill the qstats[] array if m_msp->qshuffle */
2803 if (m_msp->qshuffle && nqstats < m_msp->shuff_max) {
2804 lib_buf2_dp = lib_bhead_p->buf2_data;
2805 lib_buf2_rp = lib_bhead_p->buf2_res;
2806 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2807 while (buf2_cnt--) {
2808 rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
2809 rbuf_dp = lib_buf2_dp++;
2810
2811 if (rbuf_rp->is_valid_stat && rbuf_rp->qr_score > 0
2812 && nqstats < m_msp->shuff_max) {
2813 qstats[nqstats].n1 = rbuf_dp->seq->n1; /* save the best score */
2814 qstats[nqstats].comp = rbuf_rp->rst.comp;
2815 qstats[nqstats].H = rbuf_rp->rst.H;
2816 qstats[nqstats].escore = rbuf_rp->qr_escore;
2817 qstats[nqstats++].score = rbuf_rp->qr_score;
2818 }
2819 } /* m_msp->qshuffle */
2820 }
2821
2822 /* check if we have enough data to do stats */
2823 if (!stats_done && nstats >= MAX_STATS) {
2824 zsflag_save = ppst->zsflag;
2825 if (ppst->zsflag > 20) {
2826 ppst->zsflag -= 20;
2827 }
2828 ppst->zsflag_f = process_hist(stats,nstats,m_msp, ppst,
2829 histp, pstat_voidp,s_info, 0);
2830 ppst->zsflag = zsflag_save;
2831 kstats = nstats;
2832 stats_done = 1;
2833 for (i=0; i< nstats; i++) {
2834 bestp_arr[i]->zscore =
2835 find_z(bestp_arr[i]->rst.score[ppst->score_ix],
2836 bestp_arr[i]->rst.escore, bestp_arr[i]->seq->n1,
2837 bestp_arr[i]->rst.comp, *pstat_voidp);
2838 }
2839 }
2840
2841 }
2842
2843 void
save_shuf(struct buf_head * lib_bhead_p,int nitt1,int shuff_max,int sc_ix,struct score_count_s * s_info)2844 save_shuf(struct buf_head *lib_bhead_p, int nitt1, int shuff_max, int sc_ix,
2845 struct score_count_s *s_info)
2846 {
2847 struct buf2_data_s *rbuf_dp, *lib_buf2_dp;
2848 struct buf2_res_s *rbuf_rp, *lib_buf2_rp;
2849 int t_valid_stat;
2850 int t_rbest;
2851 double t_rescore;
2852 int buf2_cnt, jstats;
2853 static int kstats=0;
2854
2855
2856 lib_buf2_dp = lib_bhead_p->buf2_data;
2857 lib_buf2_rp = lib_bhead_p->buf2_res;
2858 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2859
2860 s_info->s_cnt[0] += lib_bhead_p->s_cnt_info.s_cnt[0];
2861 s_info->s_cnt[1] += lib_bhead_p->s_cnt_info.s_cnt[1];
2862 s_info->s_cnt[2] += lib_bhead_p->s_cnt_info.s_cnt[2];
2863
2864 s_info->tot_scores += lib_bhead_p->s_cnt_info.tot_scores;
2865 /* this is done because we are not using r_rst->valid_stat to limit selection of scores */
2866 /* s_info->s_cnt[sc_ix] = s_info->tot_scores; */
2867
2868 t_rbest = -BIGNUM;
2869 t_valid_stat = 0;
2870
2871 while (buf2_cnt--) { /* count down the number of results */
2872 rbuf_dp = lib_buf2_dp++; /* step through the results buffer */
2873 rbuf_rp = lib_buf2_rp++; /* step through the results buffer */
2874
2875 /* perhaps should use explicit flag to indicate no score */
2876 if (rbuf_rp->r_rst.score[0] == -BIGNUM) continue;
2877
2878 if (rbuf_rp->r_rst.score[sc_ix] > t_rbest) {
2879 t_rbest = rbuf_rp->r_rst.score[sc_ix];
2880 t_rescore = rbuf_rp->r_rst.escore;
2881 }
2882
2883 if (rbuf_rp->r_rst.valid_stat > t_valid_stat) {
2884 t_valid_stat = 1;
2885 }
2886
2887 /* statistics done for best score of set */
2888 /* currently no check for rst->valid_stat, which causes
2889 over-estimates of shuffles */
2890
2891 if (rbuf_dp->frame == nitt1) {
2892 if (t_valid_stat) {
2893 if (nrstats < shuff_max ) { kstats = jstats = nrstats++; }
2894 else { /* randomly replace */
2895 jstats = my_nrand(++kstats,rand_state);
2896 if (jstats >= shuff_max) goto done;
2897 }
2898
2899 rstats[jstats].n1 = rbuf_dp->seq->n1;
2900 rstats[jstats].comp = rbuf_rp->r_rst.comp;
2901 rstats[jstats].H = rbuf_rp->r_rst.H;
2902 rstats[jstats].escore = t_rescore;
2903 rstats[jstats].score = t_rbest;
2904 done:
2905 t_rbest = -BIGNUM;
2906 }
2907 }
2908 }
2909 }
2910
2911 int
save_align(struct buf_head * lib_bhead_p,struct beststr ** bestp_arr)2912 save_align(struct buf_head *lib_bhead_p, struct beststr **bestp_arr)
2913 {
2914 struct buf2_ares_s *rbuf_ap, *lib_buf2_ap;
2915 int buf2_cnt;
2916
2917 if (!lib_bhead_p->hdr.have_results || lib_bhead_p->hdr.buf2_cnt <= 0) return 0;
2918
2919 lib_buf2_ap = lib_bhead_p->buf2_ares;
2920 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2921
2922 while (buf2_cnt-- > 0) { /* count down the number of results */
2923 rbuf_ap = lib_buf2_ap++; /* step through the results buffer */
2924 if (bestp_arr[rbuf_ap->best_idx]->a_res == NULL) {
2925 bestp_arr[rbuf_ap->best_idx]->have_ares = rbuf_ap->have_ares;
2926 bestp_arr[rbuf_ap->best_idx]->a_res = rbuf_ap->a_res;
2927 }
2928 #ifdef DEBUG
2929 else {
2930 fprintf(stderr,"*** error [%s:%d] - attempt to re-save a_res for [%d]: %s\n",
2931 __FILE__, __LINE__, rbuf_ap->best_idx, bestp_arr[rbuf_ap->best_idx]->mseq->bline);
2932 }
2933 #endif
2934 }
2935
2936 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2937 lib_bhead_p->hdr.have_results = 0;
2938 lib_bhead_p->hdr.buf2_cnt = 0;
2939 return buf2_cnt;
2940 }
2941
2942 /* buf_do_work fills in the lib_bhead_p->buf2_res[] array with the
2943 do_work() results,
2944
2945 inputs: **aa0, n0 (query)
2946 lib_bhead_p->buf2_data lib_bhead_p->hdr.buf2_cnt library sequences
2947 max_frame (used to set statistics info)
2948 ppst,
2949 void *f_struct prepared by init_work()
2950
2951 results: lib_bhead_p->buf2_res[]
2952
2953 included in buf2_res[] is use_stat, which captures the
2954 logic required to decide whether a value should be saved
2955 in the stats[] buffer. This complexity mostly arises
2956 because there can be more scores than sequences, but there
2957 can only on statistics score per sequence (the best score).
2958 */
2959 void
buf_do_work(unsigned char ** aa0,int n0,struct buf_head * lib_bhead_p,int max_frame,struct pstruct * ppst,void ** f_str)2960 buf_do_work(unsigned char **aa0, int n0,
2961 struct buf_head *lib_bhead_p,
2962 int max_frame,
2963 struct pstruct *ppst, void **f_str) {
2964
2965 int buf2_cnt;
2966 unsigned long atmp;
2967 struct buf2_data_s *lib_buf2_dp;
2968 struct buf2_res_s *lib_buf2_rp, *t_best_rp;
2969 int t_best, sc_ix;
2970 double t_escore;
2971
2972 sc_ix = ppst->score_ix;
2973
2974 lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
2975 lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
2976
2977 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
2978 lib_buf2_dp = lib_bhead_p->buf2_data;
2979 lib_buf2_rp = lib_bhead_p->buf2_res;
2980
2981 t_best_rp = NULL;
2982 t_best = -BIGNUM;
2983 t_escore = 1000.0;
2984
2985 while (buf2_cnt-- > 0) {
2986
2987 lib_buf2_rp->rst.score[0] =
2988 lib_buf2_rp->rst.score[1] =
2989 lib_buf2_rp->rst.score[2] = -BIGNUM;
2990
2991 lib_buf2_rp->is_valid_stat = 0;
2992
2993 if (lib_buf2_dp->seq->n1 < ppst->n1_low ||
2994 lib_buf2_dp->seq->n1 > ppst->n1_high ) {
2995 /* tells save_best() there is no stats score here -- not
2996 necessary as -BIGNUM indicates no score */
2997 lib_buf2_dp->stats_idx = -1;
2998 goto next_seq;
2999 }
3000
3001 #ifdef DEBUG
3002 if (check_seq_range(lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
3003 ppst->nsqx, "buf_do_work()")) {
3004 fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] range error at: %d/%d (n1:%d)\n",
3005 __FILE__, __LINE__,
3006 prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
3007 lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
3008 goto next_seq;
3009 };
3010
3011 /* also check for adler32_crc match */
3012 if (lib_buf2_dp->seq->adler32_crc != (atmp=adler32(1L,lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1))) {
3013 fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_work] CRC error [%lu!=%lu] at: %d/%d (n1:%d/l_offset:%ld)\n",
3014 __FILE__, __LINE__,
3015 prog_func,lib_buf2_dp->seq->adler32_crc, atmp,
3016 lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
3017 lib_bhead_p->hdr.buf2_cnt,lib_buf2_dp->seq->n1,
3018 lib_buf2_dp->seq->l_offset);
3019 goto next_seq;
3020 }
3021 #endif
3022
3023 do_work (aa0[lib_buf2_dp->frame], n0,
3024 lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
3025 lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], 0, 0,
3026 &(lib_buf2_rp->rst), &(lib_bhead_p->s_cnt_info));
3027
3028 if (lib_buf2_rp->rst.valid_stat) {
3029 if (lib_buf2_rp->rst.escore < t_escore) {
3030 t_escore = lib_buf2_rp->rst.escore;
3031 t_best_rp = lib_buf2_rp;
3032 }
3033 if (lib_buf2_rp->rst.score[sc_ix] > t_best) {
3034 t_best = lib_buf2_rp->rst.score[sc_ix];
3035 t_best_rp = lib_buf2_rp;
3036 }
3037 }
3038
3039 if (lib_buf2_dp->frame == max_frame) {
3040 if (t_best_rp!=NULL) {
3041 t_best_rp->is_valid_stat = 1;
3042 t_best_rp = NULL;
3043 }
3044 t_best = -BIGNUM;
3045 t_escore = 1000.0;
3046 }
3047
3048 next_seq:
3049 lib_buf2_dp++;
3050 lib_buf2_rp++;
3051 }
3052
3053 /* place to produce z_scores */
3054 /* place to produce sorted array */
3055
3056 lib_bhead_p->hdr.have_results = 1;
3057 }
3058
3059 void
buf_do_align(unsigned char ** aa0,int n0,struct buf_head * lib_bhead_p,struct pstruct * ppst,const struct mngmsg * m_msp,void ** f_str)3060 buf_do_align(unsigned char **aa0, int n0,
3061 struct buf_head *lib_bhead_p,
3062 struct pstruct *ppst, const struct mngmsg *m_msp,
3063 void **f_str) {
3064
3065 int buf2_cnt, i, nsq;
3066 struct buf2_data_s *lib_buf2_dp;
3067 struct buf2_res_s *lib_buf2_rp;
3068 struct buf2_ares_s *lib_buf2_ap;
3069 struct rstruct rst;
3070
3071 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
3072 lib_buf2_dp = lib_bhead_p->buf2_data;
3073 lib_buf2_rp = lib_bhead_p->buf2_res;
3074 lib_buf2_ap = lib_bhead_p->buf2_ares;
3075
3076 while (buf2_cnt-- > 0) {
3077 if ( m_msp->stages > 1) {
3078 /* this is not typically done unless m_msp->stages > 1 */
3079 do_opt (aa0[lib_buf2_dp->frame], n0, lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
3080 lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], &rst);
3081 lib_buf2_rp->rst.score[2]=rst.score[2];
3082 }
3083
3084 #ifdef DEBUG
3085 if (lib_buf2_dp->seq->aa1b == NULL) {
3086 fprintf(stderr,"*** error [%s:%d] - [buf_do_align] null aa1b\n",__FILE__, __LINE__);
3087 lib_buf2_ap->a_res = NULL;
3088 break;
3089 }
3090 if (check_seq_range(lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
3091 ppst->nsqx, "buf_do_align()")) {
3092 fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] range error at: %d/%d (n1:%d)\n",
3093 __FILE__, __LINE__,
3094 prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
3095 lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
3096 };
3097
3098 /* also check for adler32_crc match */
3099 if (lib_buf2_dp->seq->adler32_crc != adler32(1L,lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1)) {
3100 fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_align] CRC error at: %d/%d (n1:%d)\n",
3101 __FILE__, __LINE__,
3102 prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
3103 lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
3104 }
3105 #endif
3106
3107 lib_buf2_ap->a_res = build_ares_code(aa0[lib_buf2_dp->frame], m_msp->n0,
3108 lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq,
3109 lib_buf2_dp->frame, &lib_buf2_ap->have_ares,
3110 lib_buf2_dp->repeat_thresh, m_msp, ppst, f_str[lib_buf2_dp->frame] );
3111
3112 lib_buf2_dp++;
3113 lib_buf2_ap++;
3114 lib_buf2_rp++;
3115 }
3116 lib_bhead_p->hdr.have_results = 1;
3117 }
3118
3119 void
buf_qshuf_work(unsigned char * aa0s,int n0,struct buf_head * lib_bhead_p,int max_frame,struct pstruct * ppst,void * qf_str,int ix_score)3120 buf_qshuf_work(unsigned char *aa0s, int n0,
3121 struct buf_head *lib_bhead_p,
3122 int max_frame,
3123 struct pstruct *ppst, void *qf_str,
3124 int ix_score)
3125 {
3126 int buf2_cnt;
3127 struct buf2_data_s *lib_buf2_dp;
3128 struct buf2_res_s *lib_buf2_rp, *tq_best_rp;
3129 struct rstruct rrst;
3130 struct score_count_s q_scnt_info;
3131 int tq_best;
3132 double tq_escore;
3133
3134 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
3135 lib_buf2_dp = lib_bhead_p->buf2_data;
3136 lib_buf2_rp = lib_bhead_p->buf2_res;
3137
3138 tq_best_rp = NULL;
3139 tq_best = -BIGNUM;
3140 tq_escore = 1000.0;
3141
3142 while (buf2_cnt-- > 0) {
3143 rrst.score[0] = rrst.score[1] = rrst.score[2] = -BIGNUM;
3144 rrst.valid_stat = 0;
3145
3146 if (lib_buf2_dp->seq->n1 < ppst->n1_low ||
3147 lib_buf2_dp->seq->n1 > ppst->n1_high ) {
3148 lib_buf2_dp++;
3149 lib_buf2_rp++;
3150 tq_best_rp = NULL;
3151 tq_best = -BIGNUM;
3152 tq_escore = 1000.0;
3153 continue;
3154 }
3155
3156 do_work (aa0s, n0,
3157 lib_buf2_dp->seq->aa1b, lib_buf2_dp->seq->n1,
3158 lib_buf2_dp->frame, ppst, qf_str, 1, 0,
3159 &rrst, &q_scnt_info);
3160
3161 /* buf_qshuf_work() is always called after buf_do_work(), which
3162 sets rp->is_valid_stat */
3163 if (lib_buf2_rp->is_valid_stat) {
3164 tq_best_rp = lib_buf2_rp;
3165 }
3166
3167 if (rrst.escore < tq_escore) {
3168 tq_escore = rrst.escore;
3169 }
3170 if (rrst.score[ix_score] > tq_best) {
3171 tq_best = rrst.score[ix_score];
3172 }
3173
3174 if (lib_buf2_dp->frame == max_frame) {
3175 if (tq_best_rp!=NULL) {
3176 tq_best_rp->qr_score = tq_best;
3177 tq_best_rp->qr_escore = tq_escore;
3178 tq_best_rp = NULL;
3179 }
3180 #ifdef DEBUG
3181 else {
3182 fprintf(stderr,"*** error [%s:%d] - tq_best_rp NULL at: %ld\n",
3183 __FILE__, __LINE__, lib_buf2_rp - lib_bhead_p->buf2_res);
3184 }
3185 #endif
3186 tq_best = -BIGNUM;
3187 tq_escore = 1000.0;
3188 }
3189 lib_buf2_dp++;
3190 lib_buf2_rp++;
3191 }
3192 }
3193
3194 void
buf_shuf_work(unsigned char ** aa0,int n0,unsigned char * aa1s,struct buf_head * lib_bhead_p,int max_frame,struct pstruct * ppst,void ** f_str,int ix_score,void * rand_state)3195 buf_shuf_work(unsigned char **aa0, int n0, unsigned char *aa1s, struct buf_head *lib_bhead_p,
3196 int max_frame, struct pstruct *ppst, void **f_str,
3197 int ix_score, void *rand_state)
3198 {
3199 int buf2_cnt;
3200 int shuff_cnt;
3201 struct buf2_data_s *lib_buf2_dp;
3202 struct buf2_res_s *lib_buf2_rp, *tr_best_rp;
3203 int tr_best, sc_ix;
3204 double tr_escore;
3205
3206 sc_ix = ppst->score_ix;
3207
3208 lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
3209 lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
3210
3211 shuff_cnt = 0;
3212 buf2_cnt = lib_bhead_p->hdr.buf2_cnt;
3213 lib_buf2_dp = lib_bhead_p->buf2_data;
3214 lib_buf2_rp = lib_bhead_p->buf2_res;
3215
3216 tr_best_rp = NULL;
3217 tr_best = -BIGNUM;
3218 tr_escore = 1000.0;
3219
3220 while (buf2_cnt-- > 0) {
3221 lib_buf2_rp->r_rst.score[0] = lib_buf2_rp->r_rst.score[1] =
3222 lib_buf2_rp->r_rst.score[2] = -BIGNUM;
3223 lib_buf2_rp->r_rst.valid_stat = lib_buf2_rp->is_valid_stat = 0;
3224
3225 if ((lib_buf2_dp->stats_idx < 0) || lib_buf2_dp->seq->n1 < ppst->n1_low ||
3226 lib_buf2_dp->seq->n1 > ppst->n1_high ) {
3227 lib_buf2_dp++;
3228 lib_buf2_rp++;
3229 tr_best_rp = NULL;
3230 tr_best = -BIGNUM;
3231 tr_escore = 1000.0;
3232 continue;
3233 }
3234
3235 shuff_cnt++;
3236 if (ppst->zs_win > 0) {
3237 wshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1,ppst->zs_win, rand_state);
3238 }
3239 else {
3240 if (ppst->shuffle_dna3) {shuffle3(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
3241 else {shuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1, rand_state);}
3242 }
3243
3244 /* rshuffle(lib_buf2_dp->seq->aa1b,aa1s,lib_buf2_dp->seq->n1); */
3245
3246 #ifdef DEBUG
3247 if (check_seq_range(aa1s, lib_buf2_dp->seq->n1,
3248 ppst->nsqx, "buf_do_align()")) {
3249 fprintf(stderr, "*** error [%s:%d] - [%s/buf_do_shuff] range error at: %d/%d (n1:%d)\n",
3250 __FILE__, __LINE__,
3251 prog_func,lib_bhead_p->hdr.buf2_cnt - (buf2_cnt+1),
3252 lib_bhead_p->hdr.buf2_cnt, lib_buf2_dp->seq->n1);
3253 };
3254 #endif
3255
3256 do_work (aa0[lib_buf2_dp->frame], n0,
3257 aa1s, lib_buf2_dp->seq->n1,
3258 lib_buf2_dp->frame, ppst, f_str[lib_buf2_dp->frame], 0, 1,
3259 &lib_buf2_rp->r_rst, &(lib_bhead_p->s_cnt_info));
3260
3261 if (lib_buf2_rp->r_rst.valid_stat) {
3262 if (lib_buf2_rp->r_rst.escore < tr_escore) {
3263 tr_escore = lib_buf2_rp->r_rst.escore;
3264 tr_best_rp = lib_buf2_rp;
3265 }
3266 if (lib_buf2_rp->r_rst.score[sc_ix] > tr_best) {
3267 tr_best = lib_buf2_rp->r_rst.score[sc_ix];
3268 tr_best_rp = lib_buf2_rp;
3269 }
3270 }
3271
3272 if (lib_buf2_dp->frame == max_frame) {
3273 if (tr_best_rp!=NULL) {
3274 tr_best_rp->is_valid_stat = 1;
3275 tr_best_rp = NULL;
3276 }
3277 tr_best = -BIGNUM;
3278 tr_escore = 1000.0;
3279 }
3280
3281 lib_buf2_dp++;
3282 lib_buf2_rp++;
3283 }
3284 lib_bhead_p->hdr.shuff_cnt = shuff_cnt;
3285 lib_bhead_p->hdr.have_results = 1;
3286 }
3287
3288 /* buf_shuf_seq is designed to:
3289 (1) take a list of sequences (specified by bptr[])
3290 (2) collect them from the database if they are not already available
3291 (3) send them to the threads or shuffle them directly and calculate scores
3292 */
3293
3294 void
buf_shuf_seq(unsigned char ** aa0,int n0,unsigned char ** aa1shuff_b,unsigned char * aa1save,int maxn,struct beststr ** bestp_arr,int nbest,struct pstruct * ppst,struct mngmsg * m_msp,struct mng_thr * m_bufi_p,void ** f_str,struct score_count_s * s_info)3295 buf_shuf_seq(unsigned char **aa0, int n0,
3296 unsigned char **aa1shuff_b, unsigned char *aa1save, int maxn,
3297 struct beststr **bestp_arr, int nbest,
3298 struct pstruct *ppst, struct mngmsg *m_msp,
3299 struct mng_thr *m_bufi_p
3300 #if !defined(COMP_THR) && !defined(PCOMPLIB)
3301 , void **f_str
3302 #endif
3303 , struct score_count_s *s_info)
3304 {
3305 unsigned char *aa1shuff;
3306 struct beststr *bbp, **tmp_bestp;
3307 char l_bline[MAX_SSTR];
3308 int n1lib_req, shuff_mult;
3309 long loffset, l_off;
3310 int n1, itt;
3311 int max_do_cnt, ndiff, prev_index;
3312 int istats;
3313 int i, j;
3314
3315 /* these variables track buffers of library sequences */
3316 int cur_buf_size, max_buf_size;
3317 struct buf_head *lib_bhead_p;
3318 struct buf2_data_s *lib_buf2_dp;
3319 struct buf2_res_s *lib_buf2_rp;
3320
3321 /* (1) get the sequences into a buffer - the sequence information is
3322 currently in the bestp_arr - find out how many we have, and how
3323 many we will need - the number to shuffle */
3324
3325 /* figure out how much space we need, first checking whether we have
3326 dups */
3327 if ((tmp_bestp = (struct beststr **)calloc(nbest, sizeof(struct beststr *)))==NULL) {
3328 fprintf(stderr,"*** error [%s:%d] - %s/buf_shuf_seq() *** cannot allocate tmp_bestp[%d]\n",
3329 __FILE__, __LINE__, prog_name, nbest);
3330 exit(1);
3331 }
3332 for (i = 0; i < nbest; i++) {
3333 tmp_bestp[i] = bestp_arr[i];
3334 }
3335
3336 /* sort tmp_bestp[] by sequence index, so duplicates are adjacent */
3337 sortbesti(tmp_bestp, nbest);
3338
3339 /* count number of different sequence indices, get required space
3340 without dups */
3341 prev_index = -1;
3342 n1lib_req = ndiff = 0;
3343 for (i = 0; i < nbest; i++) {
3344 if (tmp_bestp[i]->seq->index > prev_index) {
3345 prev_index = tmp_bestp[i]->seq->index;
3346 n1lib_req += tmp_bestp[i]->n1+ 2;
3347 ndiff++;
3348 }
3349 }
3350
3351 #if !defined(COMP_THR) && !defined(PCOMPLIB)
3352 if (n1lib_req >= maxn) { /* we need new space, aa1shuff is too small */
3353 if ((*aa1shuff_b = aa1shuff =
3354 (unsigned char *)realloc(*aa1shuff_b, n1lib_req*sizeof(char)))==NULL) {
3355 fprintf(stderr,"*** error [%s:%d] - cannot realloc aa1shuff[%d]\n",
3356 __FILE__, __LINE__, n1lib_req);
3357 exit(1);
3358 }
3359 }
3360 else { aa1shuff = *aa1shuff_b;}
3361 *aa1shuff = '\0';
3362 aa1shuff++;
3363
3364 #else
3365 if (n1lib_req < 2) {
3366 fprintf(stderr,"*** error [%s:%d] - [%s/buf_shuf_seq] no residues to shuffle: %d (%d)\n",
3367 __FILE__, __LINE__,
3368 prog_func,n1lib_req,ndiff);
3369 exit(1);
3370 }
3371
3372 if ((*aa1shuff_b = aa1shuff =
3373 (unsigned char *)calloc(n1lib_req,sizeof(char)))==NULL) {
3374 fprintf(stderr,"*** error [%s:%d] - cannot calloc aa1shuff[%d]\n",
3375 __FILE__, __LINE__, n1lib_req);
3376 exit(1);
3377 }
3378 *aa1shuff = '\0';
3379 aa1shuff++;
3380 #endif
3381
3382 shuff_mult = (m_msp->shuff_max+1)/ndiff;
3383 istats = 0;
3384
3385 /* setup lib_bhead buffers for shuffle comparisons */
3386 #if defined(COMP_THR) || defined(PCOMPLIB) /* threaded/parallel */
3387 /* max_do_cnt can be smaller than max_buf2_cnt, but not larger */
3388 max_do_cnt = min(m_bufi_p->max_buf2_res,
3389 m_msp->shuff_max / (2 * fa_max_workers));
3390 /* we don't have a left over one, so we need one */
3391 get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
3392 #else /* not threaded */
3393 max_do_cnt = m_bufi_p->max_buf2_res;
3394 lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
3395 #endif
3396 max_buf_size = n1lib_req;
3397 cur_buf_size = 0;
3398 lib_bhead_p->hdr.buf2_cnt = 0;
3399 lib_bhead_p->hdr.have_results = 0;
3400 lib_bhead_p->hdr.stop_work = 0;
3401 lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
3402 lib_bhead_p->hdr.seq_record_continuous = 0;
3403 lib_buf2_dp = lib_bhead_p->buf2_data;
3404 lib_buf2_rp = lib_bhead_p->buf2_res;
3405
3406 /* read sequences into shuffle buffer */
3407
3408 for (i = 0; i < ndiff; i++) {
3409 bbp = tmp_bestp[i];
3410 if (bbp->seq->aa1b == NULL) {
3411 /* get the sequence */
3412 (bbp->mseq->m_file_p->ranlib)(l_bline, sizeof(l_bline),
3413 bbp->mseq->lseek,bbp->mseq->libstr,bbp->mseq->m_file_p);
3414 n1 = re_getlib(aa1save,NULL, maxn,m_msp->ldb_info.maxt3,
3415 m_msp->ldb_info.l_overlap,bbp->mseq->cont,m_msp->ldb_info.term_code,
3416 &loffset,&l_off,bbp->mseq->m_file_p);
3417
3418 /* fprintf(stderr, " %d gets %d %d\n",i,tmp_bestp[i]->seq->n1,n1); */
3419
3420 memcpy(aa1shuff, aa1save, n1+1);
3421 bbp->seq->aa1b = aa1shuff;
3422 aa1shuff += n1 + 1;
3423 }
3424
3425 /* lib_buf2_dp is used up by scores, the sequence is not sent multiple times */
3426 cur_buf_size += bbp->seq->n1+1;
3427 for (j = 0; j < shuff_mult; j++ ) {
3428 for (itt = m_msp->revcomp; itt <= m_msp->nitt1; itt++) {
3429 #ifdef PCOMPLIB
3430 lib_buf2_dp->seq_dup = 0; /* mark first ->seq as original, not duplicate */
3431 #endif
3432 lib_buf2_dp->seq = bbp->seq;
3433 /* this invalidates lib_buf2_p->seq */
3434 lib_buf2_dp->stats_idx = istats++;
3435 lib_buf2_dp->frame = itt;
3436 lib_buf2_dp++; /* point to next buf2 */
3437 lib_buf2_rp++; /* point to next buf2 */
3438 lib_bhead_p->hdr.buf2_cnt++;
3439
3440 if (lib_bhead_p->hdr.buf2_cnt >= max_do_cnt ||
3441 cur_buf_size >= max_buf_size) {
3442 /* (2) send sequences for shuffling */
3443 #if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
3444 /* provide empty buffer to workers */
3445 lib_bhead_p->hdr.aa1b_used = cur_buf_size;
3446 lib_bhead_p->hdr.have_data = 1;
3447 lib_bhead_p->hdr.seq_record_continuous = 0;
3448 put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
3449 get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
3450 #else /* non-thread - just do the searches */
3451 if (lib_bhead_p->hdr.buf2_type & BUF2_DOSHUF) {
3452 buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
3453 m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
3454 }
3455 #endif
3456 /* (3) save results in the rstats structure */
3457 if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
3458 save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max,ppst->score_ix,s_info);
3459 }
3460
3461 lib_bhead_p->s_cnt_info.s_cnt[0] = lib_bhead_p->s_cnt_info.s_cnt[1] =
3462 lib_bhead_p->s_cnt_info.s_cnt[2] = lib_bhead_p->s_cnt_info.tot_scores = 0;
3463
3464 lib_bhead_p->hdr.buf2_cnt = 0;
3465 cur_buf_size = 0;
3466 lib_bhead_p->hdr.have_results = 0;
3467 lib_bhead_p->hdr.buf2_type=BUF2_DOSHUF;
3468 lib_bhead_p->hdr.seq_record_continuous = 0; /* seq_records are coming from bestptr in any order */
3469 lib_bhead_p->hdr.stop_work = 0;
3470 lib_buf2_dp = lib_bhead_p->buf2_data;
3471 }
3472 } /* for (itt .. */
3473 }
3474 } /* done with tmp_bestp[] */
3475 free(tmp_bestp);
3476
3477 #if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR/PCOMPLIB - fill and empty buffers */
3478 /* check last buffers for any results */
3479 lib_bhead_p->hdr.seq_record_continuous = 0;
3480 put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
3481
3482 /* wait for the threads to finish */
3483
3484 wait_rbuf(m_bufi_p->max_work_buf);
3485 /*
3486 fprintf(stderr, " num_reader[%d]-empty[%d]: %d\tnrstats: %d\n",
3487 num_reader_bufs,empty_reader_bufs,
3488 num_reader_bufs-empty_reader_bufs, nrstats);
3489 */
3490
3491 for (i=0; i < num_reader_bufs; i++) {
3492 if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
3493 save_shuf(RESULTS_BUF[i],m_msp->nitt1, m_msp->shuff_max, ppst->score_ix, s_info);
3494 RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
3495 }
3496 }
3497 #else /* just do the searches */
3498 /* aa1save is used for shuffles, not aa1shuf, because aa1shuf
3499 has library sequences */
3500 buf_shuf_work(aa0,m_msp->n0, aa1save, lib_bhead_p,
3501 m_msp->nitt1, ppst, f_str, ppst->score_ix, rand_state);
3502
3503 save_shuf(lib_bhead_p,m_msp->nitt1,m_msp->shuff_max, ppst->score_ix, s_info);
3504 lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
3505 #endif
3506 }
3507
3508 /* buf_align_seq is structurally almost identical to buf_shuf_seq,
3509 except that the appropriate sequences are pre-loaded into bbp->seq
3510 (and ->bline), and it gets bbp->a_res, rather than scores */
3511
3512 void
buf_align_seq(unsigned char ** aa0,int n0,struct beststr ** bestp_arr,int nbest,struct pstruct * ppst,struct mngmsg * m_msp,struct mng_thr * m_bufi_p,void ** f_str)3513 buf_align_seq(unsigned char **aa0, int n0,
3514 struct beststr **bestp_arr, int nbest,
3515 struct pstruct *ppst, struct mngmsg *m_msp,
3516 struct mng_thr *m_bufi_p
3517 #if !defined(COMP_THR) && !defined(PCOMPLIB)
3518 , void **f_str
3519 #endif
3520 )
3521 {
3522 struct beststr *bbp;
3523 int max_align_cnt;
3524 int i, n_pre_align;
3525 int cur_buf_size, max_buf_size;
3526 struct buf_head *lib_bhead_p;
3527 struct buf2_data_s *lib_buf2_dp;
3528 struct buf2_ares_s *lib_buf2_ap;
3529
3530 /* setup lib_bhead buffers for alignments */
3531 #if defined(COMP_THR) || defined(PCOMPLIB) /* threaded */
3532 /* max_do_cnt can be smaller than max_buf2_res, but not larger */
3533 #ifdef COMP_THR
3534 max_align_cnt = min(m_bufi_p->max_buf2_res,
3535 nbest / (4 * fa_max_workers));
3536 #else
3537 max_align_cnt = min(m_bufi_p->max_buf2_res, nbest / fa_max_workers);
3538 #endif
3539 if (max_align_cnt < 1) max_align_cnt = 1;
3540
3541 get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
3542 #else /* not threaded */
3543 max_align_cnt = m_bufi_p->max_buf2_res;
3544 lib_bhead_p = lib_buf2_list; /* equivalent to un-threaded get_rbuf() */
3545 #endif
3546
3547 max_buf_size = lib_bhead_p->hdr.aa1b_size;
3548 lib_bhead_p->hdr.buf2_cnt = 0;
3549 lib_bhead_p->hdr.have_results = 0;
3550 lib_bhead_p->hdr.stop_work = 0;
3551 lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
3552 lib_buf2_dp = lib_bhead_p->buf2_data;
3553 lib_buf2_ap = lib_bhead_p->buf2_ares;
3554
3555 /* read sequences into align buffer */
3556
3557 n_pre_align = 0;
3558 cur_buf_size = 0;
3559 for (i = 0; i < nbest; i++) {
3560 bbp = bestp_arr[i];
3561
3562 /* this invalidates lib_buf2_p->seq */
3563 lib_buf2_dp->seq = bbp->seq;
3564 cur_buf_size += bbp->seq->n1+1;
3565 lib_buf2_dp->frame = bbp->frame;
3566 lib_buf2_dp->repeat_thresh = bbp->repeat_thresh;
3567 #ifdef PCOMPLIB
3568 lib_buf2_dp->seq_dup = 0;
3569 #endif
3570 lib_buf2_ap->have_ares = 0;
3571 lib_buf2_ap->a_res = NULL;
3572 lib_buf2_ap->best_idx = i;
3573 lib_buf2_dp++; /* point to next buf2_data */
3574 lib_buf2_ap++; /* point to next buf2_ares */
3575 lib_bhead_p->hdr.buf2_cnt++;
3576
3577 if (lib_bhead_p->hdr.buf2_cnt >= max_align_cnt ||
3578 cur_buf_size >= max_buf_size - m_msp->ldb_info.maxn) {
3579 /* (2) send sequences for alignment */
3580 #if defined(COMP_THR) || defined(PCOMPLIB) /* threaded - fill and empty buffers */
3581 /* provide empty buffer to workers */
3582 lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
3583 lib_bhead_p->hdr.have_data = 1;
3584 lib_bhead_p->hdr.aa1b_used = cur_buf_size;
3585 lib_bhead_p->hdr.seq_record_continuous = 0;
3586 put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
3587 get_rbuf(&lib_bhead_p,m_bufi_p->max_work_buf);
3588 #else /* non-thread - just do the searches */
3589 buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str);
3590 #endif
3591
3592 /* (3) save alignments */
3593 if (lib_bhead_p->hdr.buf2_cnt > 0 && lib_bhead_p->hdr.have_results) {
3594 n_pre_align += save_align(lib_bhead_p,bestp_arr);
3595 }
3596
3597 cur_buf_size = 0;
3598 max_buf_size = lib_bhead_p->hdr.aa1b_size;
3599 lib_bhead_p->hdr.buf2_cnt = 0;
3600 lib_bhead_p->hdr.have_results = 0;
3601 lib_bhead_p->hdr.buf2_type=BUF2_DOALIGN;
3602 lib_bhead_p->hdr.stop_work = 0;
3603 lib_buf2_dp = lib_bhead_p->buf2_data;
3604 lib_buf2_ap = lib_bhead_p->buf2_ares;
3605 }
3606 } /* done with bestp_arr[] */
3607
3608 #if defined(COMP_THR) || defined(PCOMPLIB) /* if COMP_THR - fill and empty buffers */
3609 /* check last buffers for any results */
3610 lib_bhead_p->hdr.seqr_cnt = lib_bhead_p->hdr.buf2_cnt; /* for alignments, they are the same */
3611 lib_bhead_p->hdr.have_data = 1;
3612 lib_bhead_p->hdr.aa1b_used = cur_buf_size;
3613 lib_bhead_p->hdr.seq_record_continuous = 0;
3614 put_rbuf(lib_bhead_p,m_bufi_p->max_work_buf);
3615
3616 /* wait for the threads to finish */
3617
3618 wait_rbuf(m_bufi_p->max_work_buf);
3619
3620 for (i=0; i < num_reader_bufs; i++) {
3621 if (RESULTS_BUF[i]->hdr.buf2_cnt > 0 && RESULTS_BUF[i]->hdr.have_results) {
3622 n_pre_align += save_align(RESULTS_BUF[i],bestp_arr);
3623 RESULTS_BUF[i]->hdr.buf2_cnt = RESULTS_BUF[i]->hdr.have_results = 0;
3624 }
3625 }
3626 #else /* just do the searches */
3627 buf_do_align(aa0, m_msp->n0, lib_bhead_p, ppst, m_msp, f_str);
3628 n_pre_align += save_align(lib_bhead_p,bestp_arr);
3629 lib_bhead_p->hdr.buf2_cnt = lib_bhead_p->hdr.have_results = 0;
3630 #endif
3631
3632 m_msp->align_done = 1;
3633
3634 if (n_pre_align != nbest) {
3635 fprintf(stderr,"*** error [%s:%d] - n_pre_align:%d != nbest: %d\n",
3636 __FILE__, __LINE__, n_pre_align, nbest);
3637 }
3638 for (i=0; i < nbest; i++) {
3639 if (bestp_arr[i]->a_res == NULL) {
3640 fprintf(stderr, "*** error [%s:%d] - have NULL a_res: %d\n",
3641 __FILE__, __LINE__, i);
3642 }
3643 }
3644 }
3645
3646 int
check_seq_range(unsigned char * aa1b,int n1,int nsq,char * str)3647 check_seq_range(unsigned char *aa1b, int n1, int nsq, char *str) {
3648 int i, range_error;
3649 unsigned char *aa1p;
3650
3651 range_error = 0;
3652 for (aa1p = aa1b, i=0; i < n1; i++, aa1p++) {
3653 if (*aa1p > nsq) {
3654 range_error = 1;
3655 /* fprintf(stderr, "%s seq %d (%c) out of range at %d\n",
3656 str, *aa1p, *aa1p,i);
3657 */
3658 }
3659 }
3660 return range_error;
3661 }
3662
3663 struct stack_str {
3664 void **stack;
3665 int size;
3666 int inc;
3667 int top;
3668 };
3669
init_stack(int size,int inc)3670 struct stack_str *init_stack(int size, int inc) {
3671 struct stack_str *stack;
3672
3673 if ((stack=(struct stack_str *)calloc(1,sizeof(struct stack_str)))==NULL) {
3674 fprintf(stderr,"*** error [%s:%d] - cannot allocate stack\n",
3675 __FILE__, __LINE__);
3676 return NULL;
3677 }
3678
3679 if ((stack->stack=(void *)calloc(size,sizeof(void *)))==NULL) {
3680 fprintf(stderr,"*** error [%s:%d] - cannot allocate stack->stack[%d]\n",
3681 __FILE__, __LINE__,size);
3682 free(stack);
3683 return NULL;
3684 }
3685
3686 stack->size = size;
3687 stack->inc = inc;
3688 stack->top = 0;
3689 return stack;
3690 }
3691
push_stack(struct stack_str * stack,void * value)3692 void push_stack(struct stack_str *stack, void *value) {
3693
3694 if (!stack) return;
3695 if (stack->top >= stack->size) {
3696 stack->size += stack->inc;
3697 if ((stack->stack = (void *)realloc(stack->stack, stack->size*sizeof(void *)))==NULL) {
3698 fprintf(stderr,"*** error [%s:%d] - cannot re-allocate stack to [%d]\n",
3699 __FILE__, __LINE__, stack->size);
3700 return;
3701 }
3702 }
3703 stack->stack[stack->top++] = value;
3704 }
3705
pop_stack(struct stack_str * stack)3706 void * pop_stack(struct stack_str *stack) {
3707 if (stack == NULL) {
3708 #ifdef DEBUG
3709 fprintf(stderr," *** error [%s:%d] - pop_stack NULL stack\n",__FILE__, __LINE__);
3710 #endif
3711 return NULL;
3712 }
3713
3714 if (stack->top-- > 0) {
3715 return stack->stack[stack->top];
3716 }
3717 else {
3718 stack->top = 0;
3719 return NULL;
3720 }
3721 }
3722
free_stack(struct stack_str * stack)3723 void * free_stack(struct stack_str *stack) {
3724 if (stack==NULL) return NULL;
3725 if (stack->stack != NULL) free(stack->stack);
3726 free(stack);
3727 return NULL;
3728 }
3729
3730 struct dyn_string_str *
init_dyn_string(int size,int inc)3731 init_dyn_string(int size, int inc) {
3732 struct dyn_string_str *dyn_string;
3733
3734 if ((dyn_string=(struct dyn_string_str *)calloc(1,sizeof(struct dyn_string_str)))==NULL) {
3735 fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string\n",
3736 __FILE__, __LINE__);
3737 return NULL;
3738 }
3739
3740 if ((dyn_string->string=(void *)calloc(size,sizeof(void *)))==NULL) {
3741 fprintf(stderr,"*** error [%s:%d] - cannot allocate dyn_string->string[%d]\n",
3742 __FILE__, __LINE__,size);
3743 free(dyn_string);
3744 return NULL;
3745 }
3746
3747 dyn_string->c_size = 0;
3748 dyn_string->inc = inc;
3749 dyn_string->mx_size = size;
3750 return dyn_string;
3751 }
3752
3753 void
dyn_strcat(struct dyn_string_str * dyn_string,char * value)3754 dyn_strcat(struct dyn_string_str *dyn_string, char *value) {
3755 size_t add_len;
3756
3757 add_len = strlen(value);
3758
3759 if (!dyn_string) return;
3760 if (add_len + dyn_string->c_size + 1 >= dyn_string->mx_size) {
3761 while (dyn_string->inc < add_len) { dyn_string->inc *= 2; }
3762 dyn_string->mx_size += dyn_string->inc;
3763 if ((dyn_string->string = (void *)realloc(dyn_string->string, dyn_string->mx_size))==NULL) {
3764 fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
3765 __FILE__, __LINE__, dyn_string->mx_size);
3766 dyn_string->mx_size = 0;
3767 return;
3768 }
3769 }
3770 SAFE_STRNCAT(dyn_string->string,value,dyn_string->mx_size);
3771 dyn_string->c_size += add_len;
3772 }
3773
dyn_strcpy(struct dyn_string_str * dyn_string,char * value)3774 void dyn_strcpy(struct dyn_string_str *dyn_string, char *value) {
3775 size_t add_len;
3776
3777 add_len = strlen(value);
3778
3779 if (!dyn_string) return;
3780 if (add_len + 1>= dyn_string->mx_size) {
3781 while (dyn_string->inc < add_len) { dyn_string->inc *= 2; }
3782 dyn_string->mx_size += dyn_string->inc;
3783 if ((dyn_string->string = (void *)realloc(dyn_string->string, dyn_string->mx_size))==NULL) {
3784 fprintf(stderr,"*** error [%s:%d] - cannot re-allocate dyn_string to [%d]\n",
3785 __FILE__, __LINE__, dyn_string->mx_size);
3786 dyn_string->mx_size = 0;
3787 return;
3788 }
3789 }
3790 SAFE_STRNCPY(dyn_string->string,value,dyn_string->mx_size);
3791 }
3792
free_dyn_string(struct dyn_string_str * dyn_string)3793 void free_dyn_string(struct dyn_string_str *dyn_string) {
3794 if (dyn_string==NULL) return;
3795 if (dyn_string->string != NULL) free(dyn_string->string);
3796 free(dyn_string);
3797 }
3798
3799 #include "a_mark.h"
3800
3801 /* *itmp has the current alignment score, if *annot_arr[i_annot].label='V',
3802 this can be increased (total increase in *v_delta)
3803 *pam2aa0v[.value] gives possibly better pam score for variant
3804 *ip is position in annotated sequence (&i0 for annot0_p)
3805 *ia is position in aligned sequence (&i1 for annot0_p)
3806 sp1 is the array for the (possibly modified) displayed sequence
3807 sp1a is the array for the associated annotation
3808 sq maps encoded residues to displayed characters
3809 i_annot -- current annotation index in annot0_p->annot_arr_p[i_annot]
3810 annot_arr = annot0/1_p->annot_arr_p
3811 annot_stack = save current annotation
3812 *have_push_features = set for annotations pushed in stack (not 'V')
3813 *v_delta = change in score from variant at this position
3814 **region_p = set for '[' region start
3815 init_score -- used to initialize tmp_region_p->score.
3816
3817 */
3818
3819 int
next_annot_match(int * itmp,int * pam2aa0v,long ip,long ia,char * sp1,char * sp1a,const unsigned char * sq,int i_annot,int n_annot,struct annot_entry ** annot_arr,char ** ann_comment,void * annot_stack,int * have_push_features,int * v_delta,struct annot_entry ** region_p,struct annot_entry * tmp_region_p,int init_score)3820 next_annot_match(int *itmp, int *pam2aa0v, long ip, long ia, char *sp1, char *sp1a, const unsigned char *sq,
3821 int i_annot, int n_annot, struct annot_entry **annot_arr, char **ann_comment,
3822 void *annot_stack, int *have_push_features, int *v_delta,
3823 struct annot_entry **region_p, struct annot_entry *tmp_region_p,
3824 int init_score) {
3825 int v_tmp;
3826
3827 if (ann_comment) *ann_comment = NULL;
3828
3829 /* count through the annotations at this position (long ip) */
3830
3831 while (i_annot < n_annot && ip == annot_arr[i_annot]->pos) {
3832 if (annot_arr[i_annot]->label == 'V') { /* label == 'V' */
3833 v_tmp = pam2aa0v[annot_arr[i_annot]->value];
3834 if (v_tmp > *itmp) {
3835 *v_delta += (v_tmp- *itmp);
3836 *itmp = v_tmp;
3837 *sp1 = sq[annot_arr[i_annot]->value];
3838 if (sp1a) *sp1a = 'V';
3839 if (ann_comment) *ann_comment = annot_arr[i_annot]->comment;
3840 }
3841 }
3842 else if (annot_arr[i_annot]->label == '[') {
3843 /* region_p needs to point to a more sophisticated data
3844 structure that keeps track of all the current regions being
3845 updated
3846
3847 to start, region_p could include a linked list and a pointer to
3848 the current left-most region, which would be used for ']'
3849 detection
3850
3851 for efficiency, update the ->score only when a new
3852 (overlapping) region is started or stopped
3853
3854 same for n_indent, n_aln
3855 */
3856
3857 if (region_p) {
3858 memcpy(tmp_region_p, annot_arr[i_annot],sizeof(struct annot_entry));
3859 tmp_region_p->a_pos = ia;
3860 tmp_region_p->score = init_score;
3861 tmp_region_p->n_ident = tmp_region_p->n_aln = 0;
3862 *region_p = tmp_region_p;
3863 }
3864 }
3865 else if (annot_arr[i_annot]->label == ']') {
3866 if (have_push_features) *have_push_features = 1;
3867 push_stack(annot_stack, annot_arr[i_annot]);
3868 }
3869 else if (annot_stack) {
3870 if (have_push_features) *have_push_features = 1;
3871 push_stack(annot_stack, annot_arr[i_annot]);
3872 }
3873 i_annot++;
3874 } /* everything at this alignment position is checked */
3875 return i_annot;
3876 }
3877
3878 /* returns M_NEG, M_ZERO, M_POS, M_IDENT, M_DEL (a_mark.h)
3879 updates *aln->nsim, npos, nident, nmismatch
3880
3881 */
align_type(int score,char sp0,char sp1,int nt_align,struct a_struct * aln,int pam_x_id_sim)3882 int align_type(int score, char sp0, char sp1, int nt_align, struct a_struct *aln, int pam_x_id_sim) {
3883 int spa_val;
3884
3885 if (score<0) {
3886 spa_val = M_NEG;
3887 }
3888 else if (score == 0) {
3889 spa_val = M_ZERO;
3890 if (aln) aln->nsim++;
3891 }
3892 else {
3893 spa_val = M_POS;
3894 if (aln) {aln->nsim++; aln->npos++;}
3895 }
3896
3897 /* correct for score < 0 with 'N:N'/'X:X' */
3898 if (pam_x_id_sim > 0) { /* > 0 -> identical, similar */
3899 if ((nt_align && toupper(sp0)=='N' && toupper(sp1)=='N') ||
3900 (!nt_align && toupper(sp0)=='X' && toupper(sp1)=='X')) {
3901 spa_val = M_POS;
3902 if (aln) {
3903 aln->nsim++;
3904 }
3905 }
3906 }
3907
3908 if (aln) aln->nmismatch++;
3909 if (toupper(sp0) == toupper(sp1)) {
3910 spa_val = M_IDENT;
3911 if (aln) {
3912 aln->nident++;
3913 aln->nmismatch--;
3914 }
3915 }
3916 else if (nt_align) {
3917 if ((toupper(sp0) == 'T' && toupper(sp1) == 'U') ||
3918 (toupper(sp0)=='U' && toupper(sp1)=='T')) {
3919 spa_val = M_IDENT;
3920 if (aln) {
3921 aln->nident++;
3922 aln->nmismatch--;
3923 }
3924 }
3925 /* add to gap count for 'N' matches ?? */
3926 else if (aln && toupper(sp0) == 'N') aln->ngap_q++;
3927 else if (aln && toupper(sp1) == 'N') aln->ngap_l++;
3928 }
3929
3930 /* correct nident, nmismatch for N:N / X:X */
3931 if (pam_x_id_sim < 0) { /* > 0 -> identical, similar */
3932 if ((nt_align && toupper(sp0)=='N' && toupper(sp1)=='N') ||
3933 (!nt_align && toupper(sp0)=='X' && toupper(sp1)=='X')) {
3934 if (aln) {
3935 aln->nident--;
3936 aln->nmismatch++;
3937 }
3938 }
3939 }
3940
3941 return spa_val;
3942 }
3943
3944 /* seq_pos works with comment_var()/display_push_features()/do_url1() where
3945 i_offset = nn for reversed sequences
3946 off = 0 for 0 based offsets, 1 for 1-based offsets
3947 */
3948 int
seq_pos(int pos,int rev,int off)3949 seq_pos(int pos, int rev, int off) {
3950
3951 if (rev) {
3952 return -pos-1 + off;
3953 }
3954 else {
3955 return pos;
3956 }
3957 }
3958
3959 /* target = 0 (aa0), 1 (aa1)
3960
3961 d_type = display_type (annot_fmt in cal_cons.c):
3962 1 (long text), d1_fmt = " Variant: %d%c%c%d%c : %c%d%c";
3963 2 (-m 9c code) sprintf(tmp_str, "|%c%c:%ld%c%c%ld%c",
3964
3965 i0_pos/i1_pos have already been converted to reverse coordinate if necessary
3966 */
comment_var(long i0_pos,char sp0,long i1_pos,char sp1,char o_sp1,char sim_char,const char * ann_comment,struct dyn_string_str * annot_var_dyn,int target,int d_type)3967 void comment_var (long i0_pos, char sp0, long i1_pos, char sp1, char o_sp1,
3968 char sim_char, const char *ann_comment,
3969 struct dyn_string_str *annot_var_dyn, int target, int d_type)
3970 {
3971 char tmp_str[MAX_LSTR], tc, ann_ch0, ann_ch1;
3972 char *d1_fmt;
3973
3974 if (d_type == 1) {
3975 if (target ==1) {
3976 d1_fmt = " Variant: %d%c%c%d%c : %c%d%c";
3977 sprintf(tmp_str,d1_fmt,
3978 i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,i1_pos+1,sp1);
3979 }
3980 else {
3981 d1_fmt = " qVariant: %d%c%c%d%c : %c%d%c";
3982 sprintf(tmp_str,d1_fmt,
3983 i0_pos+1, sp0, sim_char, i1_pos+1,sp1, o_sp1,i1_pos+1,sp0);
3984 }
3985
3986 /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
3987 dyn_strcat(annot_var_dyn, tmp_str);
3988
3989 if (ann_comment) {
3990 sprintf(tmp_str," : %s",ann_comment);
3991 /* SAFE_STRNCAT(annot_var_s,tmp_str,n_annot_var_s); */
3992 dyn_strcat(annot_var_dyn, tmp_str);
3993 }
3994
3995 /* SAFE_STRNCAT(annot_var_s,"\n",n_annot_var_s); */
3996 dyn_strcat(annot_var_dyn, "\n");
3997 }
3998 else if (d_type == 2) {
3999 if (target == 1) {
4000 ann_ch0 = 'X';
4001 ann_ch1 = 'V';
4002 }
4003 else {
4004 ann_ch0 = 'V';
4005 ann_ch1 = 'X';
4006 }
4007
4008 sprintf(tmp_str, "|%c%c:%ld%c%c%ld%c",
4009 ann_ch0,ann_ch1,
4010 i0_pos+1,sp0, sim_char,i1_pos+1,sp1);
4011 /* SAFE_STRNCAT(annot_var_s, tmp_str, n_annot_var_s); */
4012 dyn_strcat(annot_var_dyn, tmp_str);
4013 }
4014 }
4015
4016 void
display_push_features(void * annot_stack,struct dyn_string_str * annot_var_dyn,long i0_pos,char sp0,long i1_pos,char sp1,char sym,struct annot_entry ** region0_p,struct annot_entry ** region1_p,int tot_score,double comp,int n0,int n1,void * pstat_void,int d_type)4017 display_push_features(void *annot_stack, struct dyn_string_str *annot_var_dyn,
4018 long i0_pos, char sp0, long i1_pos, char sp1, char sym,
4019 struct annot_entry **region0_p,
4020 struct annot_entry **region1_p,
4021 int tot_score, double comp, int n0, int n1,
4022 void *pstat_void, int d_type) {
4023 struct annot_entry *this_annot_p;
4024 double lbits, total_bits, zscore, lprob, lpercid;
4025 char *ann_comment, *bp;
4026 struct annot_entry *region_p;
4027 char tmp_lstr[MAX_LSTR], ctarget, tmp_sstr[MAX_SSTR];
4028 int q_min, q_max, l_min, l_max;
4029 char *dt1_fmt, *dt2_fmt;
4030
4031 zscore = find_z(tot_score, 1.0, n1, comp, pstat_void);
4032 total_bits = zs_to_bit(zscore, n0, n1);
4033
4034 while ((this_annot_p = (struct annot_entry *)pop_stack(annot_stack))!=NULL) {
4035
4036 if (this_annot_p->label == ']') {
4037 if (this_annot_p->target == 1) {
4038 region_p = *region1_p;
4039 if (!region_p) {
4040 fprintf(stderr,"*** error [%s:%d] *** -- target==1 but region1_p is null\n",__FILE__, __LINE__);
4041 #ifdef DEBUG
4042 fprintf(stderr,"*** qtitle: %s\n",ext_qtitle);
4043 #endif
4044 continue;
4045 }
4046 q_min = region_p->a_pos+1;
4047 l_min = region_p->pos+1;
4048 dt2_fmt = "|XR:%d-%d:%d-%d:s=%d;b=%.1f;I=%.3f;Q=%.1f";
4049 }
4050 else {
4051 region_p = *region0_p;
4052 if (!region_p) {
4053 fprintf(stderr,"*** error [%s:%d] *** -- target==0 but region0_p is null\n",__FILE__, __LINE__);
4054 #ifdef DEBUG
4055 fprintf(stderr,"*** qtitle: %s\n",ext_qtitle);
4056 #endif
4057 continue;
4058 }
4059 q_min = region_p->pos+1;
4060 l_min = region_p->a_pos+1;
4061 dt2_fmt = "|RX:%d-%d:%d-%d:s=%d;b=%.1f;I=%.3f;Q=%.1f";
4062 }
4063
4064 if (region_p->score < 0) {
4065 lbits = 0.0;
4066 lprob = 1.0;
4067 }
4068 else {
4069 lbits = total_bits * (double)region_p->score/tot_score;
4070 zscore = find_z(region_p->score, 1.0, n1, comp, pstat_void);
4071 lprob = zs_to_p(zscore);
4072 }
4073
4074 if (lprob > 0.99) lprob = 0.0;
4075 else if (lprob < 1e-300) lprob = 3000.0;
4076 else lprob = -10.0*log(lprob)/log(10.0);
4077
4078 if (region_p->n_aln > 0) {
4079 lpercid = ((double)region_p->n_ident)/(double)region_p->n_aln;
4080 }
4081 else lpercid = -1.0;
4082
4083 if (d_type == 1) {
4084 if (this_annot_p->target == 0) {dt1_fmt = " qRegion: %d-%d:%d-%d : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : %s\n";}
4085 else {dt1_fmt = " Region: %d-%d:%d-%d : score=%d; bits=%.1f; Id=%.3f; Q=%.1f : %s\n";}
4086 sprintf(tmp_lstr, dt1_fmt, q_min, i0_pos+1,
4087 l_min, i1_pos+1, region_p->score, lbits, lpercid, lprob,
4088 (region_p->comment) ? region_p->comment : '\0');
4089
4090 }
4091 else if (d_type == 2) {
4092 sprintf(tmp_lstr,dt2_fmt,
4093 q_min, i0_pos+1,
4094 l_min, i1_pos+1, region_p->score, lbits,lpercid, lprob);
4095
4096 if (region_p->comment) {
4097 SAFE_STRNCPY(tmp_sstr,region_p->comment,sizeof(tmp_sstr));
4098 if ((bp=strchr(tmp_sstr,' '))!=NULL) { *bp = '\0';}
4099 SAFE_STRNCAT(tmp_lstr,";C=",sizeof(tmp_lstr));
4100 SAFE_STRNCAT(tmp_lstr,tmp_sstr,sizeof(tmp_lstr));
4101 }
4102 }
4103 /* SAFE_STRNCAT(annot_var_s,tmp_lstr,n_annot_var_s); */
4104 dyn_strcat(annot_var_dyn, tmp_lstr);
4105 region_p->score = 0;
4106 region_p = NULL;
4107 }
4108 else if ((ann_comment = this_annot_p->comment)) {
4109 if (d_type == 1 ) {
4110 if (this_annot_p->target == 0) {dt1_fmt = " qSite:%c : %d%c%c%d%c : %s\n";}
4111 else {dt1_fmt = " Site:%c : %d%c%c%d%c : %s\n";}
4112 sprintf(tmp_lstr,dt1_fmt, this_annot_p->label,i0_pos+1, sp0,
4113 sym, i1_pos+1, sp1, ann_comment);
4114 /* SAFE_STRNCAT(annot_var_s,tmp_lstr,n_annot_var_s); */
4115 dyn_strcat(annot_var_dyn, tmp_lstr);
4116 }
4117 }
4118 }
4119 }
4120