1 /*	initfa.c	*/
2 /*  $Id: initfa.c 1274 2014-08-07 18:30:56Z wrp $ */
3 
4 /* copyright (c) 1996, 1997, 1998, 2014 by William R. Pearson and The
5    Rector & Visitors of the University of Virginia */
6 
7 /* Licensed under the Apache License, Version 2.0 (the "License");
8    you may not use this file except in compliance with the License.
9    You may obtain a copy of the License at
10 
11    http://www.apache.org/licenses/LICENSE-2.0
12 
13    Unless required by applicable law or agreed to in writing,
14    software distributed under this License is distributed on an "AS
15    IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16    express or implied.  See the License for the specific language
17    governing permissions and limitations under the License.
18 */
19 
20 /* init??.c files provide function specific initializations */
21 
22 /* h_init()	- called from comp_lib.c, comp_thr.c to initialize pstruct ppst
23    		  which includes the alphabet, and pam matrix
24 
25    alloc_pam()	- allocate pam matrix space
26    init_pam2()	- convert from 1D to 2D pam
27 
28    init_pamx()	- extend pam2 for 'X', 'N', or lower case characters
29 
30    f_initenv()	- set up mngmsg and pstruct defaults
31    f_getopt()	- read fasta specific command line options
32    f_getarg()	- read ktup
33 
34    resetp()	- reset the parameters, scoring matrix for DNA-DNA/DNA-prot
35 
36    query_parm()	- ask for ktup
37    last_init()	- some things must be done last
38 
39    f_initpam()	- set some parameters based on the pam matrix
40 */
41 
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <ctype.h>
45 #include <string.h>
46 #include <math.h>
47 
48 #ifdef UNIX
49 #include <sys/types.h>
50 #include <sys/stat.h>
51 #endif
52 
53 #include "defs.h"
54 #include "structs.h"
55 #include "param.h"
56 #include "best_stats.h"
57 
58 #define XTERNAL
59 #include "upam.h"
60 #include "uascii.h"
61 #undef XTERNAL
62 
63 #define MAXWINDOW 32
64 
65 int initpam(char *, struct pstruct *);
66 void init_pam2 (struct pstruct *ppst);
67 void init_altpam(struct pstruct *ppst);
68 void init_pamx (struct pstruct *ppst);
69 void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst);
70 void build_xascii(int *qascii, char *save_str);
71 void add_ascii_ann(int *qascii, unsigned char *ann_arr);
72 void re_ascii(int *qascii, int *pascii, int max_ann_arr);
73 extern int my_nrand(int, void *);
74 
75 /*  at some point, all the defaults should be driven from this table */
76 /*
77 #pgm	q_seq	l_seq	p_seq	matrix	g_open	g_ext	fr_shft	e_cut	ktup  E_band_opt
78 #	-n/-p		-s	-e	-f	-h/-j	-E	argv[3]
79 fasta	prot(0)	prot(0)	prot(0)	bl50	-10	-2	-	10.0	2	0.02
80 fasta	dna(1)	dna(1)	dna(1)	+5/-4	-12	-4	-	2.0	6	0.01
81 ssearch	prot(0)	prot(0)	prot(0)	bl50	-10	-2	-	10.0	-	-
82 ssearch	dna(1)	dna(1)	dna(1)	+5/-4	-16	-4	-	2.0	-	-
83 fastx	dna(1)	prot(0)	prot(0)	BL50	-12	-2	-20	5.0	2	0.02
84 fasty	dna(1)	prot(0)	prot(0)	BL50	-12	-2	-20/-24	5.0	2	0.02
85 tfastx	dna(1)	prot(0)	prot(0)	BL50	-14	-2	-20	5.0	2	0.01
86 tfasty	dna(1)	prot(0)	prot(0)	BL50	-14	-2	-20/-24	5.0	2	0.01
87 fasts	prot(0)	prot(0)	prot(0)	MD20-MS	-	-	-	5.0	-	-
88 fasts	dna(1)	dna(1)	dna(1)	+2/-4	-	-	-	5.0	1	-
89 tfasts	prot(0)	dna(1)	prot(0)	MD10-MS	-	-	-	2.0	1	-
90 fastf	prot(0)	prot(0)	prot(0)	MD20	-	-	-	2.0	1	-
91 tfastf	prot(0)	dna(1)	prot(0)	MD10	-	-	-	1.0	1	-
92 fastm	prot(0)	prot(0)	prot(0)	MD20	-	-	-	5.0	1	-
93 fastm	dna(1)	dna(1)	dna(1)	+2/-4	-	-	-	2.0	1	-
94 tfastm	prot(0)	dna(1)	prot(0)	MD10	-	-	-	2.0	1	-
95 lalign	prot(0) prot(0) prot(0) BL50    -12     -2	-       1.0	-	-
96 lalign	dna(1)  dna(1)  dna(1)  +5/-4   -12     -4	-	0.1	-	-
97 */
98 
99 void show_help(char *, int );
100 
101 char *ref_str_a[]={
102 /* 0 */ "W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448\n",
103 /* 1 */ "T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197; \n W.R. Pearson (1991) Genomics 11:635-650\n",
104 /* 2 */ "Pearson et al, Genomics (1997) 46:24-36\n",
105 /* 3 */ "Mackey et al. Mol. Cell. Proteomics  (2002) 1:139-147\n",
106 /* 4 */ "W.R. Pearson (1996) Meth. Enzymol. 266:227-258\n",
107 /* 5 */ "X. Huang and W. Miller (1991) Adv. Appl. Math. 12:373-381\n",
108   ""
109 };
110 
111 #define FA_PID	1
112 #define SS_PID	2
113 #define FX_PID	3
114 #define FY_PID	4
115 #define FS_PID	5
116 #define FF_PID	6
117 #define FM_PID	7
118 #define RSS_PID	8
119 #define RFX_PID	9
120 #define SSS_PID 10	/* old (slow) non-PG Smith-Waterman */
121 #define TFA_PID	FA_PID+10
122 #define TFX_PID	FX_PID+10
123 #define TFY_PID	FY_PID+10
124 #define TFS_PID	FS_PID+10
125 #define TFF_PID	FF_PID+10
126 #define TFM_PID FM_PID+10
127 #define LAL_PID 18
128 #define LNW_PID 19
129 #define GNW_PID 20
130 
131 struct pgm_def_str {
132   int pgm_id;
133   char *prog_func;
134   char *info_pgm_abbr;
135   char *iprompt0;
136   char *ref_str;
137   int PgmDID;
138   char *smstr;
139   int g_open_mod;
140   int g_ext_mod;
141   int gshift;
142   int hshift;
143   double e_cut;
144   int ktup;
145   double E_band_opt;
146   int can_pre_align;
147 };
148 
149 static struct pgm_def_str
150 pgm_def_arr[21] = {
151   {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0 },  /* 0 */
152   {FA_PID, "FASTA", "fa",
153    "FASTA searches a protein or DNA sequence data bank",
154    NULL, 401, "BL50", 0, 0, 0, 0, 10.0, 2, 0.2, 1}, /* 1 - FASTA */
155   {SS_PID, "SSEARCH","gsw","SSEARCH performs a Smith-Waterman search",
156    NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 2 - SSEARCH */
157   {FX_PID, "FASTX","fx",
158    "FASTX compares a DNA sequence to a protein sequence data bank",
159    NULL, 405, "BL50", -2, 0, -20, 0, 5.0, 2, 0.10, 1}, /* 3 - FASTX */
160   {FY_PID, "FASTY", "fy",
161    "FASTY compares a DNA sequence to a protein sequence data bank",
162    NULL, 405, "BL50", -2, 0, -20, -24, 5.0, 2, 0.10, 1}, /* 4 - FASTY */
163   {FS_PID, "FASTS", "fs",
164    "FASTS compares linked peptides to a protein data bank",
165    NULL, 400, "MD20-MS", 0, 0, 0, 0, 5.0, 1, 0.0, 0}, /* 5 - FASTS */
166   {FF_PID, "FASTF", "ff",
167    "FASTF compares mixed peptides to a protein databank",
168    NULL, 400, "MD20", 0, 0, 0, 0, 2.0, 1, 0.0, 0 }, /* 6 - FASTF */
169   {FM_PID, "FASTM", "fm",
170    "FASTM compares ordered peptides to a protein data bank",
171    NULL, 400, "MD20", 0, 0, 0, 0, 5.0, 1, 0.0, 0 }, /* 7 - FASTM */
172   {RSS_PID, "PRSS", "rss",
173    "PRSS evaluates statistical signficance using Smith-Waterman",
174    NULL, 401, "BL50", 0, 0, 0, 0, 1000.0, 0, 0.0, 1 }, /* 8 - PRSS */
175   {RFX_PID,"PRFX", "rfx",
176    "PRFX evaluates statistical signficance using FASTX",
177    NULL, 401, "BL50", -2, 0, -20, -24, 1000.0, 2, 0.2, 1 }, /* 9 - PRFX */
178   {SSS_PID, "OSEARCH","ssw","OSEARCH searches a sequence data bank",
179    NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 2 - OSEARCH */
180   {TFA_PID, "TFASTA", "tfa",
181    "TFASTA compares a protein  to a translated DNA data bank",
182    NULL, 402, "BL50", -2, 0, 0, 0, 5.0, 2, 0.1, 1},
183   {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0.0 },  /* 0 */
184   {TFX_PID, "TFASTX", "tfx",
185    "TFASTX compares a protein to a translated DNA data bank",
186    NULL, 406, "BL50", -2, 0, -20, 0, 2.0, 2, 0.10, 1},
187   {TFY_PID, "TFASTY", "tfy",
188    "TFASTY compares a protein to a translated DNA data bank",
189    NULL, 406, "BL50", -2, 0, -20, -24, 2.0, 2, 0.10, 1},
190   {TFS_PID, "TFASTS", "tfs",
191    "TFASTS compares linked peptides to a translated DNA data bank",
192    NULL, 400, "MD10-MS", 0, 0, 0, 0, 2.0, 2, 0.0, 0 },
193   {TFF_PID, "TFASTF", "tff",
194    "TFASTF compares mixed peptides to a protein databank",
195    NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 },
196   {TFM_PID, "TFASTM", "tfm",
197    "TFASTM compares ordered peptides to a translated DNA databank",
198    NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 },
199   {LAL_PID, "LALIGN", "lal",
200    "LALIGN finds non-overlapping local alignments",
201    NULL, 404, "BL50", -2, 0, 0, 0, 1.0, 0, 0.0, 1}, 	/* 18 - LALIGN */
202   {LNW_PID, "GLSEARCH", "lnw",
203    "GLSEARCH performs a global-query/local-library search",
204    NULL, 404, "BL50", -2, 0, 0, 0, 10.0, 0, 0.0, 1}, 	/* 19 - GLSEARCH */
205   {GNW_PID, "GGSEARCH", "gnw",
206    "GGSEARCH performs a global/global database searches",
207    NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, 	/* 20 - GGSEARCH */
208 };
209 
210 struct msg_def_str {
211   int pgm_id;
212   int q_seqt;
213   int l_seqt;
214   int p_seqt;
215   int sw_flag;
216   int stages;
217   int qframe;
218   int nframe;
219   int nrelv, srelv, arelv;
220   char *f_id0, *f_id1, *label, *alabel;
221 };
222 
223 /* align_label must be < MAX_SSTR (32) */
224 char *align_label[]={
225   "Smith-Waterman",		/* 0 */
226   "banded Smith-Waterman",	/* 1 */
227   "Waterman-Eggert",		/* 2 */
228   "trans. Smith-Waterman",	/* 3 */
229   "global/local",		/* 4 */
230   "trans. global/local",	/* 5 */
231   "global/global (N-W)"	/* 6 */
232 };
233 
234 /* pgm_id    q_seqt     l_seqt   p_seqt sw_f st qf nf nrv srv arv s_ix */
235 static struct msg_def_str
236 msg_def_arr[21] = {
237   {0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, "", "", ""},	/* ID=0 */
238   {FA_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 1, 3,
239    "fa","sw", "opt"},
240   {SS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
241    "sw","sw", "s-w"},
242   {FX_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
243    "fx","sx", "opt"},
244   {FY_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
245    "fy","sy", "opt"},
246   {FS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
247    "fs","fs", "initn init1"},
248   {FF_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
249    "ff","ff", "initn init1"},
250   {FM_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
251    "fm","fm","initn init1"},
252   {RSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 0, 1, 1, -1, 1, 1, 1,
253    "rss","sw","s-w"},
254   {RFX_PID, SEQT_DNA,SEQT_PROT, SEQT_PROT, 0, 1, 2, -1, 3, 1, 3,
255    "rfx","sx","opt"},
256   {SSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
257    "sw","sw", "s-w"},
258   {TFA_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 0, 1, 1, 6, 3, 1, 3,
259    "tfa","fa","initn init1"},
260   {0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, "", "", ""},	/* ID=12 */
261   {TFX_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
262    "tfx","sx","initn opt"},
263   {TFY_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
264    "tfy","sy","initn opt"},
265   {TFS_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
266    "tfs","fs","initn init1"},
267   {TFF_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
268    "tff","ff","initn init1"},
269   {TFM_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
270    "tfm","fm","initn init1"},
271   {LAL_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
272    "lsw","lsw", "ls-w"},
273   {LNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
274    "gnw","gnw", "n-w"},
275   {GNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
276    "gnw","gnw", "n-w"},
277 };
278 
279 int
get_pgm_id()280 get_pgm_id() {
281 
282   int rval=0;
283 
284 #ifdef FASTA
285 #ifndef TFAST
286   pgm_def_arr[FA_PID].ref_str = ref_str_a[0];
287   msg_def_arr[FA_PID].alabel = align_label[0];
288   rval=FA_PID;
289 #else
290   pgm_def_arr[TFA_PID].ref_str = ref_str_a[0];
291   msg_def_arr[TFA_PID].alabel = align_label[2];
292   rval=TFA_PID;
293 #endif
294 #endif
295 
296 #ifdef FASTX
297 #ifndef TFAST
298 #ifndef PRSS
299   pgm_def_arr[FX_PID].ref_str = ref_str_a[2];
300   msg_def_arr[FX_PID].alabel = align_label[3];
301   rval=FX_PID;
302 #else
303   pgm_def_arr[RFX_PID].ref_str = ref_str_a[2];
304   msg_def_arr[FX_PID].alabel = align_label[3];
305   rval=RFX_PID;
306 #endif
307 #else
308   pgm_def_arr[TFX_PID].ref_str = ref_str_a[2];
309   msg_def_arr[TFX_PID].alabel = align_label[3];
310   rval=TFX_PID;
311 #endif
312 #endif
313 
314 #ifdef FASTY
315 #ifndef TFAST
316   pgm_def_arr[FY_PID].ref_str = ref_str_a[2];
317   msg_def_arr[FY_PID].alabel = align_label[3];
318   rval=FY_PID;
319 #else
320   pgm_def_arr[TFY_PID].ref_str = ref_str_a[2];
321   msg_def_arr[TFY_PID].alabel = align_label[3];
322   rval=TFY_PID;
323 #endif
324 #endif
325 
326 #ifdef FASTS
327 #ifndef TFAST
328   pgm_def_arr[FS_PID].ref_str = ref_str_a[3];
329   msg_def_arr[FS_PID].alabel = align_label[4];
330   rval=FS_PID;
331 #else
332   pgm_def_arr[TFS_PID].ref_str = ref_str_a[3];
333   msg_def_arr[TFS_PID].alabel = align_label[5];
334   rval=TFS_PID;
335 #endif
336 #endif
337 
338 #ifdef FASTF
339 #ifndef TFAST
340   pgm_def_arr[FF_PID].ref_str = ref_str_a[3];
341   msg_def_arr[FF_PID].alabel = align_label[4];
342   rval=FF_PID;
343 #else
344   pgm_def_arr[TFF_PID].ref_str = ref_str_a[3];
345   msg_def_arr[TFF_PID].alabel = align_label[5];
346   rval=TFF_PID;
347 #endif
348 #endif
349 
350 #ifdef FASTM
351 #ifndef TFAST
352   pgm_def_arr[FM_PID].ref_str = ref_str_a[3];
353   msg_def_arr[FM_PID].alabel = align_label[4];
354   rval=FM_PID;
355 #else
356   pgm_def_arr[TFM_PID].ref_str = ref_str_a[3];
357   msg_def_arr[TFM_PID].alabel = align_label[5];
358   rval=TFM_PID;
359 #endif
360 #endif
361 
362 #ifdef SSEARCH
363 #define CAN_PSSM
364   pgm_def_arr[SS_PID].ref_str = ref_str_a[1];
365   msg_def_arr[SS_PID].alabel = align_label[0];
366   rval=SS_PID;
367 #endif
368 
369 #ifdef OSEARCH
370   pgm_def_arr[SSS_PID].ref_str = ref_str_a[1];
371   msg_def_arr[SSS_PID].alabel = align_label[0];
372   rval=SSS_PID;
373 #endif
374 
375 #ifdef LALIGN
376 #define CAN_PSSM
377   pgm_def_arr[LAL_PID].ref_str = ref_str_a[5];
378   msg_def_arr[LAL_PID].alabel = align_label[2];
379   rval=LAL_PID;
380 #endif
381 
382 #ifdef GLSEARCH
383 #define CAN_PSSM
384   pgm_def_arr[LNW_PID].ref_str = ref_str_a[6];
385   msg_def_arr[LNW_PID].alabel = align_label[4];
386   rval=LNW_PID;
387 #endif
388 
389 #ifdef GGSEARCH
390 #define CAN_PSSM
391   pgm_def_arr[GNW_PID].ref_str = ref_str_a[6];
392   msg_def_arr[GNW_PID].alabel = align_label[6];
393   rval=GNW_PID;
394 #endif
395 
396   return rval;
397 }
398 
399 extern struct opt_def_str g_options[];
400 extern void set_opt_disp_defs(char opt_char, struct opt_def_str *options, int type,
401 			      int i_param1, int i_param2,
402 			      double d_param1, double d_param2, char *s_param);
403 
404 static char z_opt_descr[] = "Statistics estimation method:\n      1 - regression; -1 - no stats.; 0 - no scaling; 2 - Maximum Likelihood Est.;\n      3 - Altschul/Gish; 4 - iter. regress.; 5 - regress w/variance;\n      6 - MLE with comp. adj.;\n     11 - 16 - estimates from shuffled library sequences;\n     21 - 26 - E2()-stats from shuffled high-scoring sequences;";
405 
406 static char s_opt_descr[] = "Scoring matrix: (protein)\n      BL50, BP62 (sets -f -11 -g -1); P250, OPT5, VT200,\n      VT160, P120, VT120, BL80, VT80, MD40, VT40, MD20, VT20, MD10, VT10;\n      scoring matrix file name; -s ?BL50 adjusts matrix for short queries;";
407 
408 
409 struct opt_def_str f_options[] = {
410   {'3', 0, "norevcomp", "compare forward strand only", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
411 #if defined(FASTA) || defined(SSEARCH)
412   {'a', 0, "show_all", "show complete Query/Sbjct sequences in alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
413   {'W', 1, "context", "alignment context length (surrounding unaligned sequence)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
414 #endif
415 #if defined(FASTA)
416   {'A', 0, "sw_align", "Smith-Waterman for final DNA alignment, band alignment for protein\n      default is band-alignment for DNA, Smith-Waterman for protein", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
417 #endif
418   {'b', 1, "num_descriptions", "high scores reported (limited by -E by default)",
419    "high scores reported (limited by -E by default);\n      =<int> forces <int> results;", 0, 0, 0, 0, 0.0, 0.0, NULL},
420   {'d', 1, "num_alignments", "number of alignments shown (limited by -E by default)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
421 #if defined(FASTA) || defined(FASTX) || defined(FASTY)
422   {'c', 1, "opt_join", "expected fraction for band-optimization, joining", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
423 #endif
424   {'E', 1, "evalue", "E()-value threshold", "E()-value,E()-repeat threshold", 0, 0, 0, 0, 0.0, 0.0, NULL},
425   {'f', 1, "gapopen", "gap-open penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
426   {'g', 1, "gapext", "gap-extension penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
427 #ifdef SHOW_HELP
428   {'h', 0, "help", "help - show options, arguments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
429 #ifdef FASTY
430   {'j', 1, "frame_subs", "frame-shift, codon substitution penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
431 #else
432 #ifdef FASTX
433   {'j', 1, "frame_shift", "frame-shift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
434 #endif
435 #endif
436 #else
437 #ifndef LALIGN
438   {'h', 1, "frame", "frameshift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
439   {'j', 1, "codon_subs", "codon substitution", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
440 #endif
441 #endif
442 #if defined(LALIGN)
443   {'J', 0, "show_ident", "show identity alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
444   {'K', 1, "max_repeat", "maximum number of non-intersecting alignments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
445 #endif
446   {'k', 1, "nshuffle", "number of shuffles", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
447   {'M', 1, "range", "filter on library sequence length", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
448   {'n', 0, "dna", "DNA/RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
449   {'p', 0, "prot", "protein query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
450 #if defined(CAN_PSSM)
451   {'P', 1, "pssm", "PSSM file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
452 #endif
453   {'r', 1, "dna_ratio", " +match/-mismatch for DNA/RNA", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
454   {'s', 1, "matrix", "scoring matrix", &s_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
455 #if !defined(LALIGN) && !defined(FASTS) && !defined(FASTM)
456   {'S', 0, "seg", "filter lowercase (seg) residues", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
457 #endif
458 #if defined(TFAST) || defined(FASTX) || defined(FASTY)
459   {'t', 1, "gencode", "translation genetic code", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
460 #endif
461   {'U', 0, "rna", "RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
462   {'X', 1, "ext_opts", "Extended options", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
463   {'z', 1, "stats", "Statistics estimation method", &z_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
464   {'\0', 0, "", "", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL}
465 };
466 
f_init_opts(int pgm_id,struct mngmsg * m_msp,struct pstruct * ppst)467 void f_init_opts(int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
468 #if defined(FASTA) || defined(FASTX) || defined(FASTY)
469   set_opt_disp_defs('c', f_options, 4, 0, 0, ppst->param_u.fa.E_band_opt, max(ppst->param_u.fa.E_band_opt*5.0,1.0), NULL);
470 #endif
471 #ifndef LALIGN
472   set_opt_disp_defs('E', f_options, 4, 0, 0, ppst->e_cut, ppst->e_cut_r, NULL);
473 #else
474   set_opt_disp_defs('E', f_options, 3, 0, 0, ppst->e_cut, 0.0, NULL);
475 #endif
476   set_opt_disp_defs('s', f_options, 5, 0, 0, 0.0, 0.0, ppst->pamfile);
477   set_opt_disp_defs('f', f_options, 1, ppst->gdelval, 0, 0.0, 0.0, NULL);
478   set_opt_disp_defs('g', f_options, 1, ppst->ggapval, 0, 0.0, 0.0, NULL);
479 #ifdef FASTY
480   set_opt_disp_defs('j', f_options, 2, ppst->gshift, ppst->gsubs, 0.0, 0.0, NULL);
481 #endif
482 #ifdef FASTX
483   set_opt_disp_defs('j', f_options, 1, ppst->gshift, 0, 0.0, 0.0, NULL);
484 #endif
485 #ifdef LALIGN
486   set_opt_disp_defs('K', f_options, 1, ppst->max_repeat, 0, 0.0, 0.0, NULL);
487 #endif
488   set_opt_disp_defs('k', f_options, 1, m_msp->shuff_max, 0, 0.0, 0.0, NULL);
489   set_opt_disp_defs('r', f_options, 2, ppst->p_d_mat, ppst->p_d_mis, 0.0, 0.0, NULL);
490   set_opt_disp_defs('t', f_options, 1, ppst->tr_type, 0, 0.0, 0.0, NULL);
491   set_opt_disp_defs('z', f_options, 1, ppst->zsflag, 0, 0.0, 0.0, NULL);
492 #if defined(FASTA) || defined(SSEARCH)
493   set_opt_disp_defs('W', f_options, 1, m_msp->aln.llcntx, 0, 0.0, 0.0, NULL);
494 #endif
495 }
496 
497 char *iprompt1=" test sequence file name: ";
498 char *iprompt2=" database file name: ";
499 
500 #ifdef PCOMPLIB
501 char *verstr="36.3.8 Jul, 2015 MPI";
502 #else
503 char *verstr="36.3.8 Jul, 2015";
504 #endif
505 
506 static int mktup=3;
507 static int ktup_set = 0;
508 static int gap_set=0;
509 static int del_set=0;
510 static int mshuff_set = 0;
511 static int prot2dna = 0;
512 void
513 parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst);
514 
515 extern int fa_max_workers;
516 
517 extern void s_abort(char *, char *);
518 extern void init_ascii0(int *sascii, char *sq_map, int sq_map_n, struct pstruct *ppst);
519 extern void init_ascii(int ext_sq, int *sascii, int nsq, int dnaseq);
520 extern void validate_novel_aa(int *sascii, int p_nsq, int dnaseq);
521 extern int standard_pam(char *smstr, struct pstruct *ppst,
522 			int del_set, int gap_set);
523 extern int min_pam_bits(int n0_eff, double bit_thresh, struct pstruct *ppst,
524 			int del_set, int gap_set);
525 extern void mk_n_pam(int *arr,int siz, int mat, int mis);
526 extern int karlin(int , int, double *, double *, double *);
527 extern void init_karlin_a(struct pstruct *, double *, double **);
528 extern int do_karlin_a(int **, const struct pstruct *, double *,
529 		       double *, double *, double *, double *);
530 
531 #if defined(TFAST) || defined(FASTX) || defined(FASTY)
532 extern void aainit(int tr_type, int debug);
533 #endif
534 
535 char *iprompt0, *prog_func, *refstr;
536 
537 /* Sets defaults assuming a protein sequence */
h_init(struct pstruct * ppst,struct mngmsg * m_msp,char * info_pgm_abbr)538 void h_init (struct pstruct *ppst, struct mngmsg *m_msp, char *info_pgm_abbr)
539 {
540   struct pgm_def_str pgm_def;
541   int i, pgm_id;
542 
543   ppst->pgm_id  = pgm_id =  get_pgm_id();
544   pgm_def = pgm_def_arr[pgm_id];
545 
546   /* check that pgm_def_arr[] is valid */
547   if (pgm_def.pgm_id != pgm_id) {
548     fprintf(stderr,
549 	    "**pgm_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
550 	    pgm_def.pgm_id, pgm_id);
551     exit(1);
552   }
553 
554   /* check that msg_def_arr[] is valid */
555   if (msg_def_arr[pgm_id].pgm_id != pgm_id) {
556     fprintf(stderr,
557 	    "**msg_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
558 	    msg_def_arr[pgm_id].pgm_id, pgm_id);
559     exit(1);
560   }
561 
562   SAFE_STRNCPY(info_pgm_abbr,pgm_def.info_pgm_abbr,MAX_SSTR);
563   iprompt0 = pgm_def.iprompt0;
564   refstr = pgm_def.ref_str;
565   prog_func = pgm_def.prog_func;
566 
567   /* used to be MAXTOT = MAXTST+MAXLIB, but now fixed at MAXLIB for
568      pre-loaded libraries */
569   m_msp->max_tot = MAXLIB;
570 
571   init_ascii0(aascii, NCBIstdaa, NCBIstdaa_n, ppst);
572   pascii = aascii;
573 
574   /* set up DNA query sequence if required*/
575   if (msg_def_arr[pgm_id].q_seqt == SEQT_DNA) {
576     memcpy(qascii,nascii,sizeof(qascii));
577     m_msp->qdnaseq = SEQT_DNA;
578   }
579   else { 	/* when SEQT_UNK, start with protein */
580     memcpy(qascii,aascii,sizeof(qascii));
581     m_msp->qdnaseq = msg_def_arr[pgm_id].q_seqt;
582   }
583 
584 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
585   qascii[','] = ESS;
586   /* also initialize aascii, nascii for databases */
587   qascii['*'] = NA;
588   ppst->pam_ms = 1;
589   ppst->do_rep=0;		/* disable multiple alignments */
590   ppst->pseudocts = 200;
591 #else
592   ppst->pam_ms = 0;
593   ppst->do_rep=1;		/* enable multiple alignments */
594 #endif
595 
596   /* initialize a pam matrix */
597   SAFE_STRNCPY(ppst->pamfile,pgm_def.smstr,MAX_FN);
598   standard_pam(ppst->pamfile,ppst,del_set,gap_set);
599   ppst->have_pam2 = 0;
600 
601   /* specify pre-alignment */
602   ppst->can_pre_align = pgm_def.can_pre_align;
603 
604   ppst->p_d_mat = 5;
605   ppst->p_d_mis = -4;
606 
607   /* this is always protein by default */
608   ppst->nsq = NCBIstdaa_n;
609   ppst->nsqx = NCBIstdaa_ext_n;
610   /* we need to populate ppst->sq to nsqx for direct lc mapping */
611   for (i=0; i<ppst->nsqx; i++) {
612     ppst->sq[i] = NCBIstdaa_l[i];
613     ppst->hsq[i] = h_NCBIstdaa[i];
614   }
615   for (i=0; i<ppst->nsqx; i++) {
616     ppst->sqx[i]=NCBIstdaa_ext[i];
617     ppst->hsqx[i]=h_NCBIstdaa_ext[i];
618   }
619   ppst->sq[ppst->nsq] = ppst->sqx[ppst->nsqx] = '\0';
620 
621   /* set up the c_nt[] mapping */
622 
623 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
624   ppst->c_nt[ESS] = ESS;
625 #endif
626   ppst->c_nt[0]=0;
627   for (i=1; i<nnt; i++) {
628     ppst->c_nt[i]=gc_nt[i];
629     ppst->c_nt[i+nnt]=gc_nt[i]+nnt;
630   }
631 
632 #ifdef CAN_PSSM
633   ppst->pam2p[0] = NULL;
634   ppst->pam2p[1] = NULL;
635 #endif
636 }
637 
638 /*
639  * alloc_pam(): allocates memory for the 2D pam matrix as well
640  * as for the integer array used to transmit the pam matrix
641  */
642 void
alloc_pam(int d1,int d2,struct pstruct * ppst)643 alloc_pam (int d1, int d2, struct pstruct *ppst)
644 {
645   int     i, *d2p;
646   char err_str[128];
647 
648   if ((ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
649      sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
650      s_abort (err_str,"");
651   }
652 
653   if ((ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
654      sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
655      s_abort (err_str,"");
656   }
657 
658   if ((d2p = (int *) calloc (d1 * d2, sizeof (int))) == NULL) {
659      sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
660      s_abort (err_str,"");
661    }
662 
663    for (i = 0; i < d1; i++, d2p += d2)
664       ppst->pam2[0][i] = d2p;
665 
666    if ((d2p= (int *) malloc (d1 * d2 * sizeof (int))) == NULL) {
667      sprintf(err_str,"Cannot allocate 2d pam matrix: %d",d2);
668      s_abort (err_str,"");
669    }
670 
671    for (i = 0;  i < d1; i++, d2p += d2)
672       ppst->pam2[1][i] = d2p;
673 
674    ppst->have_pam2 = 1;
675 }
676 
677 /*
678  *  init_pam2(struct pstruct pst): Converts 1-D pam matrix to 2-D
679  *  currently, this function is very protein centric
680  */
681 void
init_pam2(struct pstruct * ppst)682 init_pam2 (struct pstruct *ppst) {
683   int     i, j, k, nsq, sa_t;
684   int ix_j, ix_l, ix_i, p_i, p_j;
685 
686   nsq = ppst->nsq;
687 
688   ppst->pam2[0][0][0] = -BIGNUM;
689   ppst->pam_h = -1; ppst->pam_l = 1;
690 
691   k = 0;
692 
693   if (ppst->dnaseq == 0) { /* not DNA */
694     sa_t = aascii['*'];	/* this is the last character for which pam[] is available */
695     pam_sq = apam_sq;
696     pam_sq_n = apam_sq_n;
697   }
698   else {	/* have DNA, no '*' */
699     sa_t = nascii['X'];
700     pam_sq = npam_sq;
701     pam_sq_n = npam_sq_n;
702   }
703 
704   /* we use sa_t here because that is the last position in the 1-D
705      matrix */
706   for (i = 1; i < sa_t; i++) {
707     p_i = pascii[pam_sq[i]];
708     ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
709     for (j = 1; j <= i; j++) {
710       /* here is where the pam file is actually set */
711       p_j = pascii[pam_sq[j]];
712       ppst->pam2[0][p_j][p_i] = ppst->pam2[0][p_i][p_j] = pam[k++] - ppst->pamoff;
713       if (ppst->pam_l > ppst->pam2[0][p_i][p_j]) ppst->pam_l = ppst->pam2[0][p_i][p_j];
714       if (ppst->pam_h < ppst->pam2[0][p_i][p_j]) ppst->pam_h = ppst->pam2[0][p_i][p_j];
715     }
716   }
717 
718   /* need to do the same thing for characters > sa_t */
719   for (i = sa_t+1; i < pam_sq_n; i++) {
720     p_i = pascii[pam_sq[i]];
721     ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
722   }
723 
724   if (ppst->dnaseq == 0) {
725     init_altpam(ppst);
726   }
727 }
728 
729 void
init_altpam(struct pstruct * ppst)730 init_altpam(struct pstruct *ppst) {
731   int ix_i, ix_l, ix_j, p_i, p_j, i;
732 
733   /* add values for 'J' (I/L) value, which are not present in 1-D matrices */
734     ix_i = pascii['I'];
735     ix_l = pascii['L'];
736     ix_j = pascii['J'];
737     if (strchr(pam_sq,'J')==NULL) {
738       ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
739       /* get the identities */
740       ppst->pam2[0][ix_j][ix_j] =
741 	max(ppst->pam2[0][ix_i][ix_i],ppst->pam2[0][ix_l][ix_l]);
742       for (i=1; i < pam_sq_n; i++) {
743 	p_i = pascii[pam_sq[i]];
744 	/* do not assume symmetric matrices */
745 	ppst->pam2[0][ix_j][p_i] =
746 	  max(ppst->pam2[0][ix_i][p_i],ppst->pam2[0][ix_l][p_i]);
747 	ppst->pam2[0][p_i][ix_j] =
748 	  max(ppst->pam2[0][p_i][ix_i],ppst->pam2[0][p_i][ix_l]);
749       }
750     }
751     /* add values for 'O' (K) value, which are not present in 1-D matrices */
752     ix_i = pascii['K'];
753     ix_j = pascii['O'];
754       if (ix_j < ppst->nsq) {	/* is it in the NCBIstdaa alphabet ? */
755       ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
756       /* get the identity */
757       ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
758       /* do not assume symmetric matrices */
759       for (i=1; i < pam_sq_n; i++) {
760 	p_i = pascii[pam_sq[i]];
761 	ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
762 	ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
763       }
764     }
765     else {
766       pascii['O'] = pascii['K'];
767       pascii['o'] = pascii['k'];
768     }
769 
770     /* add values for 'U' (C) value, which are not present in 1-D matrices */
771     ix_i = pascii['C'];
772     ix_j = pascii['U'];
773       if (ix_j < ppst->nsq) {	/* is it in the NCBIstdaa alphabet */
774       ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
775       /* get the identity */
776       ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
777       /* do not assume symmetric matrices */
778       for (i=1; i < pam_sq_n; i++) {
779 	p_i = pascii[pam_sq[i]];
780 	ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
781 	ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
782       }
783     }
784     else {
785       pascii['U'] = pascii['C'];
786       pascii['u'] = pascii['c'];
787     }
788 }
789 
790 /* extend the standard pam matrix for special residues
791    (a) 'X' for protein and 'N' for DNA, 'G' and 'U' for RNA,
792    (b) lower-case characters for ext_sq_set
793 
794    must be called after init_pam2()
795 */
796 void
init_pamx(struct pstruct * ppst)797 init_pamx (struct pstruct *ppst) {
798   int     i, j, k, nsq;
799   int sa_x, sa_t, tmp;
800 
801   nsq = ppst->nsq;
802 
803   ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
804 
805   /* sa_x is the 'unknown' character, 'X' for proteins, 'N' for DNA/RNA */
806   /* sa_t is the termination character -- not used for DNA */
807 
808   if (ppst->nt_align) {
809     sa_x = pascii['N'];
810     sa_t = sa_x;
811   }
812   else {
813     sa_x = pascii['X'];
814     sa_t = pascii['*'];
815   }
816 
817   /* build an asymmetric matrix for RNA */
818   if (ppst->dnaseq == SEQT_RNA && !ppst->pam_set) {
819     tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 3;
820     ppst->pam2[0][nascii['A']][nascii['G']] =
821       ppst->pam2[0][nascii['C']][nascii['T']] =
822       ppst->pam2[0][nascii['C']][nascii['U']] = tmp;
823   }
824 
825   if (ppst->pam_x_set) {
826     for (i=1; i<nsq; i++) {
827       if (sa_x < nsq) ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm;
828       if (sa_t < nsq) ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm;
829     }
830     if (sa_x < nsq) ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx;
831     if (sa_t < nsq) ppst->pam2[0][sa_t][sa_t]=ppst->pam_xx;
832   }
833   else {
834     ppst->pam_xx = ppst->pam2[0][sa_x][sa_x];
835     ppst->pam_xm = ppst->pam2[0][1][sa_x];
836   }
837 
838     /* fill in pam2[1] matrix */
839     ppst->pam2[1][0][0] = -BIGNUM;
840     /* fill in additional parts of the matrix */
841     for (i = 0; i < nsq; i++) {
842       /* -BIGNUM to all matches vs 0 */
843       ppst->pam2[0][0][i+nsq] = ppst->pam2[0][i+nsq][0] =
844 	ppst->pam2[1][0][i+nsq] = ppst->pam2[1][i+nsq][0] =
845 	ppst->pam2[0][0][i] = ppst->pam2[0][i][0] =
846 	ppst->pam2[1][0][i] = ppst->pam2[1][i][0] = -BIGNUM;
847 
848       for (j = 0; j < nsq; j++) {
849 	/* replicate pam2[0] to i+nsq, j+nsq, also initialize lowest of pam2[1]*/
850 	ppst->pam2[0][i+nsq][j] = ppst->pam2[0][i][j+nsq] =  ppst->pam2[0][i+nsq][j+nsq] =
851 	  ppst->pam2[1][i][j] = ppst->pam2[0][i][j];
852 
853 	/* set the high portion of pam2[1] to the corresponding value
854 	   of pam2[1][sa_x][j] */
855 
856 	ppst->pam2[1][i+nsq][j] = ppst->pam2[1][i][j+nsq]=
857 	  ppst->pam2[1][i+nsq][j+nsq]=ppst->pam2[0][sa_x][j];
858       }
859     }
860 
861     /* set matches to the internal '-' to pam_xx */
862     /* this needs to be adjusted for multiple internal '-----' */
863     for (i=1; i< nsq; i++) {
864       ppst->pam2[0][nsq][i] = ppst->pam2[0][i][nsq] =
865 	ppst->pam2[0][nsq][i+nsq] = ppst->pam2[0][i+nsq][nsq] =
866 	ppst->pam2[1][nsq][i] = ppst->pam2[1][i][nsq] =
867 	ppst->pam2[1][nsq][i+nsq] = ppst->pam2[1][i+nsq][nsq] = ppst->pam_xm;
868     }
869     ppst->pam2[0][nsq][nsq] = ppst->pam2[1][nsq][nsq] = ppst->pam_xm;
870 }
871 
872 /*  function specific initializations */
873 void
f_initenv(struct mngmsg * m_msp,struct pstruct * ppst,unsigned char ** aa0)874 f_initenv (struct mngmsg *m_msp, struct pstruct *ppst, unsigned char **aa0) {
875   struct msg_def_str m_msg_def;
876   int pgm_id;
877 
878   pgm_id = ppst->pgm_id;
879   m_msg_def = msg_def_arr[pgm_id];
880 
881   m_msp->last_calc_flg=0;
882 
883   SAFE_STRNCPY(m_msp->f_id0,m_msg_def.f_id0,sizeof(m_msp->f_id0));
884   SAFE_STRNCPY(m_msp->f_id1,m_msg_def.f_id1,sizeof(m_msp->f_id1));
885   SAFE_STRNCPY (m_msp->label, m_msg_def.label, sizeof(m_msp->label));
886   SAFE_STRNCPY(m_msp->alabel, m_msg_def.alabel, sizeof(m_msp->alabel));
887 
888 #if !defined(SSEARCH) && !defined(GGSEARCH) && !defined(GLSEARCH) && !defined(LALIGN)
889   SAFE_STRNCPY (m_msp->alab[0],"initn",20);
890   SAFE_STRNCPY (m_msp->alab[1],"init1",20);
891   SAFE_STRNCPY (m_msp->alab[2],"opt",20);
892 #else
893 #if defined(SSEARCH) || defined(LALIGN)
894   SAFE_STRNCPY (m_msp->alab[0],"s-w opt",20);
895 #else
896   SAFE_STRNCPY (m_msp->alab[0],"n-w opt",20);
897 #endif
898 #endif
899 
900 #if defined(GGSEARCH) || defined(GLSEARCH)
901   m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 0;
902 #else
903   m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 1;
904   m_msp->zsflag2 = ppst->zsflag2 = 1;
905 #endif
906 
907   ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod;
908   ppst->ggapval += pgm_def_arr[pgm_id].g_ext_mod;
909 #if defined(FASTX) || defined(FASTY)
910   ppst->gshift = pgm_def_arr[pgm_id].gshift;
911   ppst->gsubs = pgm_def_arr[pgm_id].hshift;
912 #endif
913   ppst->sw_flag = m_msg_def.sw_flag;
914   ppst->e_cut = m_msp->e_cut=pgm_def_arr[pgm_id].e_cut;
915 #ifndef LALIGN
916   ppst->e_cut_r = ppst->e_cut/10.0;	/* more significant */
917 #else
918   ppst->e_cut_r = ppst->e_cut;		/* everything if local */
919 #endif
920 
921   ppst->score_ix = 0;
922   ppst->histint = 2;
923   m_msp->qframe = m_msg_def.qframe;
924   m_msp->nframe = m_msg_def.nframe;
925   m_msp->nrelv = m_msg_def.nrelv;
926   m_msp->srelv = m_msg_def.srelv;
927   m_msp->arelv = m_msg_def.arelv;
928   m_msp->stages = m_msg_def.stages;
929   m_msp->shuff_wid = 0;
930   m_msp->shuff_max = MAX_RSTATS;
931   m_msp->shuff_max_save = m_msp->shuff_max;
932 
933   /* see param.h for the definition of all these */
934 
935   m_msp->qshuffle = 0;
936   m_msp->nm0 = 1;
937   m_msp->escore_flg = 0;
938 
939   /* pam information */
940   ppst->pam_pssm = 0;
941 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
942    ppst->pam_xx = ppst->pam_xm = 0;
943 #else
944   ppst->pam_xx = 1;  /* set >0 to use pam['X']['X'] value */
945   ppst->pam_xm = -1;  /* set >0 to use pam['X']['A-Z'] value */
946 #endif
947   ppst->pam_x_set = 0;
948   ppst->pam_x_id_sim = 0;
949   ppst->pam_set = ppst->pam_variable = 0;
950   ppst->pam_pssm = 0;
951   ppst->p_d_set = 0;
952   ppst->pamoff = 0;
953   ppst->ext_sq_set = 0;
954   ppst->nsq_e = ppst->nsq;
955 
956   /* initial settings for protein */
957   if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
958     mktup = 3;
959     ppst->param_u.fa.bestscale = 300;
960     ppst->param_u.fa.bestoff = 36;
961     ppst->param_u.fa.bkfact = 6;
962     ppst->param_u.fa.scfact = 3;
963     ppst->param_u.fa.bktup = mktup;
964     ppst->param_u.fa.ktup = 0;
965     ppst->param_u.fa.bestmax = 50;
966     ppst->param_u.fa.pamfact = 1;
967     ppst->param_u.fa.altflag = 0;
968     ppst->param_u.fa.optflag = 1;
969     ppst->param_u.fa.iniflag = 0;
970     ppst->param_u.fa.optcut = 0;
971     ppst->param_u.fa.optcut_set = 0;
972     ppst->param_u.fa.cgap = 0;
973     ppst->param_u.fa.optwid = 16;
974     ppst->param_u.fa.optwid_set = 0;
975     ppst->param_u.fa.E_band_opt = pgm_def_arr[ppst->pgm_id].E_band_opt;
976     ppst->param_u.fa.use_E_thresholds = 1; /* disable E-thresholds for now */
977   }
978 
979   f_init_opts(pgm_id, m_msp, ppst);
980 }
981 
982 /*  switches for fasta only */
983 
984 static int shift_set=0;
985 static int subs_set=0;
986 static int sw_flag_set=0;
987 static int nframe_set=0;
988 static int E_thresh_set = 0;
989 static int E_cgap_set = 0;
990 
991 void
f_getopt(char copt,char * optarg,struct mngmsg * m_msg,struct pstruct * ppst)992 f_getopt (char copt, char *optarg,
993 	  struct mngmsg *m_msg, struct pstruct *ppst)
994 {
995   int pgm_id;
996   double tmp_f, tmp_f1;
997   double tmp_e_cut, tmp_e_rep;
998   int dnaseq_save;
999   char *bp;
1000 
1001   pgm_id = ppst->pgm_id;
1002 
1003   switch (copt) {
1004   case '3':
1005     nframe_set = 1;
1006     if (pgm_id == TFA_PID) {
1007       m_msg->nframe = 3; break;
1008     }
1009     else {
1010       m_msg->nframe = 1;	/* for TFASTXY */
1011       m_msg->qframe = 1;  /* for FASTA, FASTX */
1012     }
1013     break;
1014   case 'a': m_msg->aln.showall = 1; break;
1015   case 'A':
1016     if (ppst->sw_flag) ppst->sw_flag=0;
1017     else ppst->sw_flag= 1;
1018     sw_flag_set = 1;
1019     break;
1020   case 'b':
1021     if (optarg[0] == '$') {
1022       m_msg->mshow = -1;
1023       m_msg->e_cut = 10000000.0;
1024       break;
1025     }
1026     else if (optarg[0] == '=') {
1027       m_msg->e_cut = 10000000.0;
1028       m_msg->e_cut_set = 1;
1029       m_msg->mshow_min = 1;
1030       sscanf (optarg+1, "%d", &m_msg->mshow);
1031     }
1032     else if (optarg[0] == '>') {
1033       m_msg->mshow_min = 2;
1034       sscanf (optarg+1, "%d", &m_msg->mshow);
1035     }
1036     else {
1037       sscanf (optarg, "%d", &m_msg->mshow);
1038       m_msg->mshow_min = 0;
1039     }
1040     m_msg->mshow_set = 1;
1041     break;
1042   case 'c':
1043     tmp_f = tmp_f1 = 0.0;
1044     if (*optarg == 'O') {
1045       ppst->param_u.fa.use_E_thresholds = 0;
1046       optarg++;
1047     }
1048     if (*optarg != '\0' && pgm_def_arr[pgm_id].ktup > 0) {
1049       sscanf (optarg, "%lf %lf", &tmp_f, &tmp_f1);
1050       if (tmp_f > 1.0) {
1051 	ppst->param_u.fa.optcut = (int)(tmp_f+0.1);
1052 	ppst->param_u.fa.use_E_thresholds = 0;
1053 	ppst->param_u.fa.optcut_set = 1;
1054       }
1055       else if (tmp_f <= 0.0) {
1056 	ppst->param_u.fa.use_E_thresholds = 1;
1057       }
1058       else {	/* 0.0 < tmp_f <= 1.0 */
1059 	ppst->param_u.fa.use_E_thresholds = 1;
1060 	ppst->param_u.fa.E_band_opt = min(tmp_f,1.0);
1061 	E_thresh_set = 1;
1062 	if (tmp_f1 > 0.0) {
1063 	  tmp_f1 = min(tmp_f1,1.0);	/* may want to do max(tmp_f1,tmp_f) */
1064 	  ppst->param_u.fa.E_join = tmp_f1;
1065 	  E_cgap_set=1;
1066 	}
1067       }
1068     }
1069     break;
1070   case 'd': sscanf(optarg,"%d",&m_msg->ashow);
1071     if ((m_msg->mshow > 0) && (m_msg->ashow > m_msg->mshow)) m_msg->mshow=m_msg->ashow;
1072     m_msg->ashow_set = 1;
1073     break;
1074   case 'E':
1075     if (strchr(optarg,' ')) {	/* check for 1 or 2 values */
1076       sscanf(optarg,"%lf %lf",&tmp_e_cut, &tmp_e_rep);
1077       if (tmp_e_rep <= 0.0) {	/* two values, 2nd <= 0.0, no do_rep */
1078 	ppst->do_rep = 0;
1079 	ppst->e_cut_r = 1E-100;
1080 	tmp_e_rep = -2.0;
1081       }
1082       else {ppst->do_rep = 1;}
1083     }
1084     else {	/* one value, do_rep; tmp_e_rep=10.0 */
1085       sscanf(optarg,"%lf",&tmp_e_cut);
1086 #ifndef LALIGN
1087       tmp_e_rep = 10.0;
1088 #else
1089       tmp_e_rep = 1.0;
1090 #endif
1091       ppst->do_rep = 1;
1092     }
1093     if (!m_msg->e_cut_set && tmp_e_cut > 0.0 ) {
1094       ppst->e_cut = m_msg->e_cut = tmp_e_cut;
1095     }
1096     m_msg->e_cut_set = 1;
1097 
1098     if (tmp_e_rep > 0.0) {
1099       if (tmp_e_rep >= 1.0) { ppst->e_cut_r = ppst->e_cut/tmp_e_rep;}
1100       else { ppst->e_cut_r = tmp_e_rep;}
1101     }
1102     break;
1103   case 'f':
1104     sscanf (optarg, "%d", &ppst->gdelval);
1105     if (ppst->gdelval > 0) ppst->gdelval = -ppst->gdelval;
1106     del_set = 1;
1107     break;
1108   case 'g':
1109     sscanf (optarg, "%d", &ppst->ggapval);
1110     if (ppst->ggapval > 0) ppst->ggapval = -ppst->ggapval;
1111     gap_set = 1;
1112     break;
1113 #ifndef SHOW_HELP
1114   case 'h':
1115     sscanf (optarg, "%d", &ppst->gshift);
1116     if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
1117     shift_set = 1;
1118     break;
1119   case 'j':
1120     sscanf (optarg, "%d", &ppst->gsubs);
1121     if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1122     subs_set = 1;
1123     break;
1124 #else
1125   case 'h':
1126     show_help(m_msg->pgm_name, pgm_id);
1127     break;
1128   case 'j':
1129 #ifdef FASTY
1130     if (strchr(optarg,' ')) {
1131       sscanf (optarg, "%d %d", &ppst->gshift, &ppst->gsubs);
1132       subs_set = 1;
1133       if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1134     }
1135     else if (strchr(optarg,',')) {
1136       sscanf (optarg, "%d,%d", &ppst->gshift, &ppst->gsubs);
1137       subs_set = 1;
1138       if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1139     }
1140     else {
1141       sscanf (optarg, "%d", &ppst->gshift);
1142     }
1143 #else
1144 #ifdef FASTX
1145     sscanf (optarg, "%d", &ppst->gshift);
1146 #endif
1147 #endif
1148     if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
1149     shift_set = 1;
1150     break;
1151 #endif
1152   case 'J':
1153 #ifdef LALIGN
1154     ppst->show_ident=1;
1155 #else
1156     ppst->show_ident=0;
1157 #endif
1158     break;
1159 
1160 
1161 #ifdef LALIGN
1162   case 'K':
1163     sscanf(optarg,"%d", &ppst->max_repeat);
1164     break;
1165 #endif
1166   case 'k':
1167     sscanf (optarg, "%d", &m_msg->shuff_max);
1168     m_msg->shuff_max_save = m_msg->shuff_max;
1169     mshuff_set = 1;
1170     break;
1171   case 'M':
1172     sscanf(optarg,"%d-%d",&m_msg->n1_low,&m_msg->n1_high);
1173     if (m_msg->n1_low < 0) {
1174       m_msg->n1_high = -m_msg->n1_low;
1175       m_msg->n1_low = 0;
1176     }
1177     if (m_msg->n1_high == 0) m_msg->n1_high = BIGNUM;
1178     if (m_msg->n1_low > m_msg->n1_high) {
1179       fprintf(stderr," low cutoff %d greater than high %d\n",
1180 	      m_msg->n1_low, m_msg->n1_high);
1181       m_msg->n1_low = 0;
1182       m_msg->n1_high = BIGNUM;
1183     }
1184     ppst->n1_low = m_msg->n1_low;
1185     ppst->n1_high = m_msg->n1_high;
1186     break;
1187   case 'n':
1188     m_msg->qdnaseq = SEQT_DNA;
1189     re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
1190     SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1191     prot2dna = 1;
1192     break;
1193   case 'o':
1194   case 'p':
1195     m_msg->qdnaseq = SEQT_PROT;
1196     ppst->dnaseq = SEQT_PROT;
1197     SAFE_STRNCPY(m_msg->sqnam,"aa",4);
1198     break;
1199   case 'P':
1200     SAFE_STRNCPY(ppst->pgpfile,optarg,MAX_FN);
1201     if ((bp=strchr(ppst->pgpfile,' '))!=NULL) {
1202       *bp='\0';
1203       ppst->pgpfile_type = atoi(bp+1);
1204     }
1205     else ppst->pgpfile_type = 0;
1206     ppst->pam_pssm = 1;
1207     break;
1208   case 'r':
1209     sscanf(optarg,"%d/%d",&ppst->p_d_mat,&ppst->p_d_mis);
1210     ppst->pam_set = 0;
1211     ppst->p_d_set = 1;
1212 
1213     SAFE_STRNCPY(ppst->pam_name, "DNA", 4);
1214     if (ppst->dnaseq != SEQT_RNA) ppst->dnaseq = SEQT_DNA;
1215     if (ppst->p_d_mat > 0 && ppst->p_d_mis < 0) {
1216       ppst->p_d_set = 1;
1217       SAFE_STRNCPY(ppst->pamfile,optarg,40);
1218     }
1219     break;
1220     /* modified Sept, 2011, to recognize that a scoring matrix
1221        specifies a sequence alphabet */
1222   case 's':
1223     if (*optarg == '?') {
1224       ppst->pam_variable = 1;
1225       optarg++;
1226     }
1227     if (*optarg == '\0') break;
1228     SAFE_STRNCPY (ppst->pamfile, optarg, MAX_FN);
1229     dnaseq_save = ppst->dnaseq;
1230     /* check for default abbreviation */
1231     if (!standard_pam(ppst->pamfile,ppst,del_set, gap_set)) {
1232       /* check/load matrix file */
1233       if (!initpam (ppst->pamfile, ppst)) {
1234 	/* matrix file failed, use default matrix */
1235 	SAFE_STRNCPY(ppst->pamfile,pgm_def_arr[pgm_id].smstr,MAX_FN);
1236       }
1237     }
1238     ppst->pam_set=1;
1239     /* check for changing alphabet here */
1240     if (ppst->dnaseq != dnaseq_save && ppst->dnaseq >= SEQT_DNA) {
1241       m_msg->qdnaseq = SEQT_DNA;
1242       re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
1243       SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1244       prot2dna = 1;
1245     }
1246     break;
1247   case 'S':	/* turn on extended alphabet for seg */
1248     ppst->ext_sq_set = 1;
1249     ppst->nsq_e = ppst->nsqx;
1250     break;
1251   case 't':
1252     if (tolower(optarg[0])=='t') {
1253       m_msg->ldb_info.term_code = aascii['*'];
1254       optarg++;
1255     }
1256     if (*optarg) {sscanf (optarg, "%d", &ppst->tr_type);}
1257     break;
1258   case 'U':
1259     m_msg->qdnaseq = SEQT_RNA;
1260     memcpy(qascii,nascii,sizeof(qascii));
1261     SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1262     nt[nascii['T']]='U';
1263     prot2dna=1;
1264     break;
1265   case 'W':
1266     sscanf (optarg,"%d",&m_msg->aln.llcntx);
1267     m_msg->aln.llcntx_set = 1;
1268     break;
1269   case 'X':
1270     parse_ext_opts(optarg, pgm_id, m_msg, ppst);
1271     break;
1272   case 'z':
1273     if (strchr(optarg,' ')!=NULL) {
1274       sscanf(optarg,"%d %d",&ppst->zsflag,&ppst->zsflag2);
1275       if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
1276     }
1277     else if (strchr(optarg,',')!=NULL) {
1278       sscanf(optarg,"%d,%d",&ppst->zsflag,&ppst->zsflag2);
1279       if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
1280     }
1281     else {
1282       sscanf(optarg,"%d",&ppst->zsflag);
1283       ppst->zsflag2 = (ppst->zsflag % 10);
1284     }
1285     break;
1286   }
1287 }
1288 
1289 static char my_opts[] = "1BIM:ox:y:N:";
1290 
1291 void
parse_ext_opts(char * opt_arg,int pgm_id,struct mngmsg * m_msp,struct pstruct * ppst)1292 parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
1293   long l_arg;
1294   char c_arg, c_opt, *the_arg, *bp;
1295 
1296   c_opt = *opt_arg;
1297   if ((bp=strchr(my_opts, c_opt))==NULL) {
1298     return;
1299   }
1300 
1301   if (*(bp+1) == ':') the_arg = opt_arg+1;
1302 
1303   switch (c_opt) {
1304   case '1':
1305     if (pgm_def_arr[pgm_id].ktup > 0) {
1306       ppst->param_u.fa.iniflag=1;
1307     }
1308     break;
1309   case 'B': m_msp->z_bits = 0; break;
1310   case 'I':
1311     m_msp->tot_ident = 1;
1312     /*
1313     l_arg = 0;
1314     sscanf(the_arg,"%ld",&l_arg);
1315     if (l_arg > 0) m_msp->tot_ident = l_arg;
1316     */
1317     break;
1318   case 'M':
1319     c_arg = '\0';
1320     sscanf(the_arg,"%ld%c",&l_arg,&c_arg);
1321     if (l_arg < 0) m_msp->max_memK = BIGNUM;
1322     else {
1323       l_arg *= 1024;
1324       if (c_arg == 'G') l_arg *= 1024;
1325       m_msp->max_memK = l_arg;
1326     }
1327     break;
1328   case 'N':
1329   case 'X':
1330     ppst->pam_x_id_sim = 0;
1331     if (*the_arg == 'S' || *the_arg == '+') {
1332       ppst->pam_x_id_sim = 1;
1333     }
1334     else if (*the_arg == 'D' || *the_arg == '-') {
1335       ppst->pam_x_id_sim = -1;
1336     }
1337     break;
1338   case 'o':
1339     if (pgm_def_arr[pgm_id].ktup > 0) {
1340       ppst->param_u.fa.optflag = 0;
1341       msg_def_arr[pgm_id].nrelv = m_msp->nrelv = 2;
1342     }
1343     break;
1344   case 'x':
1345     if (strchr(the_arg,' ')!=NULL) {
1346       sscanf (the_arg,"%d %d",&ppst->pam_xx, &ppst->pam_xm);
1347     }
1348     else if (strchr(the_arg,',')!=NULL) {
1349       sscanf (the_arg,"%d,%d",&ppst->pam_xx, &ppst->pam_xm);
1350     }
1351     else {
1352       sscanf (the_arg,"%d",&ppst->pam_xx);
1353       ppst->pam_xm = ppst->pam_xx;
1354     }
1355     ppst->pam_x_set=1;
1356     break;
1357   case 'y':
1358     if (pgm_def_arr[pgm_id].ktup > 0) {
1359       sscanf (the_arg, "%d", &ppst->param_u.fa.optwid);
1360       ppst->param_u.fa.optwid_set = 1;
1361     }
1362     break;
1363   }
1364 }
1365 
1366 void
f_lastenv(struct mngmsg * m_msg,struct pstruct * ppst)1367 f_lastenv (struct mngmsg *m_msg, struct pstruct *ppst)
1368 {
1369   char save_str[MAX_SSTR];
1370 
1371 #if !defined(FASTM) && !defined(FASTS) && !defined(FASTF)
1372   SAFE_STRNCPY(save_str,"*",sizeof(save_str));
1373 #else
1374   SAFE_STRNCPY(save_str,",",sizeof(save_str));
1375 #endif
1376 
1377   if (m_msg->qdnaseq == SEQT_UNK) {
1378     build_xascii(qascii,save_str);
1379     if (m_msg->ann_flg) add_ascii_ann(qascii,m_msg->ann_arr);
1380   }
1381 /* this check allows lc DNA sequence queries with FASTX */
1382   else {
1383 #if !defined(FASTS) && !defined(FASTM) && !defined(FASTF) && !defined(FASTX) && !defined(FASTY)
1384     init_ascii(ppst->ext_sq_set,qascii, ppst->nsq, m_msg->qdnaseq);
1385 #endif
1386     validate_novel_aa(qascii, ppst->nsq, m_msg->qdnaseq);
1387   }
1388 }
1389 
1390 void
f_getarg(int argc,char ** argv,int optind,struct mngmsg * m_msg,struct pstruct * ppst)1391 f_getarg (int argc, char **argv, int optind,
1392 	  struct mngmsg *m_msg, struct pstruct *ppst)
1393 {
1394 
1395   if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1396     if (argc - optind >= 4) {
1397       sscanf (argv[optind + 3], "%d", &ppst->param_u.fa.ktup);
1398       ktup_set = 1;
1399     }
1400     else {
1401       ppst->param_u.fa.ktup = -pgm_def_arr[ppst->pgm_id].ktup;
1402     }
1403   }
1404 
1405   if (ppst->pgm_id == RSS_PID && argc - optind > 3) {
1406     sscanf (argv[optind + 3], "%d", &m_msg->shuff_max);
1407   }
1408 
1409   if (ppst->pgm_id == RFX_PID && argc - optind > 4) {
1410     sscanf (argv[optind + 4], "%d", &m_msg->shuff_max);
1411   }
1412   m_msg->shuff_max_save = m_msg->shuff_max;
1413 }
1414 
1415 /* fills in the query ascii mapping from the parameter
1416    ascii mapping.
1417 */
1418 
1419 void
re_ascii(int * qascii,int * pascii,int max_ann_arr)1420 re_ascii(int *qascii, int *pascii, int max_ann_arr) {
1421   int i;
1422 
1423   for (i=0; i < 128; i++) {
1424     if (qascii[i] > NANN+max_ann_arr || qascii[i] < ESS) {
1425       qascii[i] = pascii[i];
1426     }
1427   }
1428 }
1429 
1430 
1431 /* recode has become function specific to accommodate FASTS/M */
1432 /* modified 28-Dec-2004 to ensure that all mapped characters
1433    are valid */
1434 int
recode(unsigned char * seq,int n,int * qascii,int nsqx)1435 recode(unsigned char *seq, int n, int *qascii, int nsqx) {
1436   int i,j;
1437   char save_c;
1438 
1439 #if defined(FASTS) || defined(FASTM)
1440   qascii[',']=ESS;
1441 #endif
1442 
1443   for (i=0; i < n; i++) {
1444     save_c = seq[i];
1445     if (seq[i] > '@' || seq[i]=='*') seq[i] = qascii[seq[i]];
1446     if (seq[i] > nsqx && seq[i]!=ESS) {
1447       fprintf(stderr, "*** Warning - unrecognized residue at %d:%c - %2d\n",
1448 	      i,save_c,save_c);
1449       seq[i] = qascii['X'];
1450     }
1451   }
1452   seq[i]=EOSEQ;
1453   return i;
1454 }
1455 
1456 /* here we have the query sequence, all the command line options,
1457    but we need to set various parameter options based on the type
1458    of the query sequence (m_msg->qdnaseq = 0:protein/1:DNA) and
1459    the function (FASTA/FASTX/TFASTA)
1460 
1461    29-Jun-2008 add code to ensure that weird ('O', 'U') amino-acids
1462    are read properly.
1463 
1464    15-Nov-2010 -- modify scoring matrix for very short query sequences
1465    (e.g. short read metagenomics)
1466 */
1467 
1468 /* this resetp is for conventional a FASTA/TFASTXYZ search */
1469 void
resetp(struct mngmsg * m_msg,struct pstruct * ppst)1470 resetp (struct mngmsg *m_msg, struct pstruct *ppst) {
1471   int i, pgm_id;
1472   int n0_eff;
1473 
1474   pgm_id = ppst->pgm_id;
1475 
1476   /* check for alphabet conflict */
1477 
1478   ppst->shuffle_dna3 = 0;
1479 #if defined(TFAST)
1480   if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
1481     fprintf(stderr," %s compares a protein to a translated\n\
1482 DNA sequence library.  Do not use a DNA query/scoring matrix.\n",prog_func);
1483     exit(1);
1484   }
1485   ppst->shuffle_dna3 = 1;
1486 #else
1487 #if (defined(FASTX) || defined(FASTY))
1488   if (!(m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA)) {
1489     fprintf(stderr," FASTX/Y compares a DNA sequence to a protein database\n");
1490     fprintf(stderr," Use a DNA query\n");
1491     exit(1);
1492   }
1493 #endif
1494 #endif
1495 
1496   /* **************************************************************** */
1497   /* adjust alphabets for prot:prot or DNA:DNA alignments */
1498 
1499   /* this code changes parameters for programs (FA_PID, SS_PID, FS_PID,
1500      RSS_PID) that can examine either protein (initial state) or DNA
1501      Modified May, 2006 to reset e_cut for DNA comparisons.
1502   */
1503   /* **************************************************************** */
1504 
1505   if (msg_def_arr[pgm_id].q_seqt == SEQT_UNK) {
1506     if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
1507       msg_def_arr[pgm_id].q_seqt = m_msg->qdnaseq;
1508       msg_def_arr[pgm_id].p_seqt = SEQT_DNA;
1509       msg_def_arr[pgm_id].l_seqt = SEQT_DNA;
1510       if (m_msg->qdnaseq == SEQT_DNA) msg_def_arr[pgm_id].qframe = 2;
1511       if (!m_msg->e_cut_set) {
1512 	pgm_def_arr[pgm_id].e_cut /= 5.0;
1513 	ppst->e_cut_r = 0.001;
1514       }
1515     }
1516     else {
1517       msg_def_arr[pgm_id].q_seqt = SEQT_PROT;
1518     }
1519   }
1520 
1521   /* set the comparison type (PROT/DNA) in ppst */
1522   ppst->dnaseq = msg_def_arr[pgm_id].p_seqt;
1523 
1524   if (!sw_flag_set) ppst->sw_flag = msg_def_arr[pgm_id].sw_flag;
1525   if (!m_msg->e_cut_set) {
1526     ppst->e_cut = m_msg->e_cut=pgm_def_arr[pgm_id].e_cut;
1527 #ifdef LALIGN
1528     ppst->e_cut_r = ppst->e_cut;
1529 #endif
1530   }
1531 
1532   if (ppst->dnaseq == SEQT_DNA && m_msg->qdnaseq==SEQT_RNA) {
1533     ppst->dnaseq = SEQT_RNA;
1534     ppst->nt_align = 1;
1535   }
1536   if (ppst->dnaseq==SEQT_DNA) pascii = &nascii[0];
1537   else if (ppst->dnaseq==SEQT_RNA) {
1538     pascii = &nascii[0];
1539     ppst->sq[nascii['T']] = 'U';
1540   }
1541   else pascii = &aascii[0];
1542   m_msg->ldb_info.ldnaseq = msg_def_arr[pgm_id].l_seqt;
1543 
1544   if (m_msg->ldb_info.ldnaseq & SEQT_DNA) {
1545     memcpy(lascii,nascii,sizeof(lascii));
1546 #ifndef TFAST
1547 #ifdef DNALIB_LC
1548     init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1549 #endif
1550 #else
1551   /* no init_ascii() because we translate lower case library sequences */
1552 #endif
1553     validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1554   }
1555   else {
1556     memcpy(lascii,aascii,sizeof(lascii));	/* initialize lib mapping */
1557     if (m_msg->ann_flg && strchr((char *)m_msg->ann_arr,'*')) {lascii['*'] = NA;}
1558 
1559 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1560     lascii['*'] = NA;
1561 #endif
1562     init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1563     validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1564   }
1565 
1566   /* have lascii - initialize l_ann_ascii[] if necessary */
1567   if (m_msg->ann_flg) {
1568     memcpy(l_ann_ascii,lascii,sizeof(l_ann_ascii));
1569     /* make certain that '*' is treated correctly */
1570     if (strchr((char *)m_msg->ann_arr,'*')) {l_ann_ascii['*'] = NA;}
1571     add_ascii_ann(l_ann_ascii, m_msg->ann_arr);
1572   }
1573 
1574   /* **************************************************************** */
1575   /* adjust qframe/nframe if DNA/translated DNA search  */
1576   /* **************************************************************** */
1577 
1578   if (!nframe_set) {
1579     m_msg->qframe = msg_def_arr[pgm_id].qframe;
1580     m_msg->nframe = msg_def_arr[pgm_id].nframe;
1581   }
1582 
1583   /* the possibilities:
1584   	     -i  -3	qframe	revcomp
1585    FA_D/FX   -    -        2       0
1586    FA_D/FX   +    -        2       1
1587    FA_D/FX   -    +        1       0
1588    FA_D/FX   +    +        2       1
1589   */
1590 
1591   if (m_msg->qdnaseq == SEQT_DNA) {
1592     m_msg->nframe = 1;
1593     if (m_msg->qframe == 1 && m_msg->revcomp==1) {
1594       m_msg->qframe = m_msg->revcomp+1;
1595     }
1596   }
1597   else if (m_msg->qdnaseq == SEQT_RNA) {
1598     m_msg->qframe = m_msg->revcomp+1;
1599     m_msg->nframe = 1;
1600   }
1601 
1602   /* **************************************************************** */
1603   /* adjust FASTA heuristics for  DNA/translated DNA search  */
1604   /* **************************************************************** */
1605 
1606   if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {
1607     ppst->histint = 4;
1608 
1609     if (!del_set) {
1610 #ifdef OLD_FASTA_GAP
1611       ppst->gdelval = -16;	/* def. del penalty */
1612 #else
1613       ppst->gdelval = -12;	/* def. open penalty */
1614 #endif
1615     }
1616     if (!gap_set) ppst->ggapval = -4;	/* def. gap penalty */
1617 
1618     ppst->nsq = nnt;
1619     ppst->nsqx = nntx;
1620     ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
1621 
1622     /* reset parameters for DNA */
1623     if (pgm_def_arr[pgm_id].ktup > 0) {
1624       /* these parameters are used to scale optcut, and are being replaced
1625 	 by statistically based parameters */
1626       /* largest ktup */
1627       pgm_def_arr[pgm_id].ktup = mktup = 6;
1628       if (!ppst->param_u.fa.optwid_set) ppst->param_u.fa.optwid = 16;
1629       ppst->param_u.fa.bestscale = 80;
1630       ppst->param_u.fa.bkfact = 5;
1631       ppst->param_u.fa.scfact = 1;
1632       ppst->param_u.fa.bktup = mktup;
1633       ppst->param_u.fa.bestmax = 80;
1634       ppst->param_u.fa.bestoff = 45;
1635       if (!E_thresh_set) ppst->param_u.fa.E_band_opt = 0.05;
1636 
1637       if (!sw_flag_set) {
1638 	ppst->sw_flag = 0;
1639 	SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
1640 	SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
1641       }
1642 
1643       /* largest ktup */
1644       mktup = 6;
1645 
1646       if (ppst->param_u.fa.pamfact >= 0) ppst->param_u.fa.pamfact = 0;
1647       if (ppst->param_u.fa.ktup < 0)
1648 	ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup;
1649     }
1650 
1651     for (i=0; i<=ppst->nsqx; i++) {
1652       ppst->hsq[i] = hnt[i];
1653       ppst->sq[i] = nt[i];
1654       ppst->hsqx[i] = hntx[i];
1655       ppst->sqx[i] = ntx[i];
1656     }
1657 
1658     /* **************************************************************** */
1659     /* adjust scoring matrix for DNA:DNA search  */
1660     /* **************************************************************** */
1661 
1662     if (!ppst->pam_set) {
1663       if (ppst->p_d_set)
1664 	mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1665 #if !defined(FASTS) && !defined(FASTM)
1666       else if (ppst->pamfile[0]=='\0' || strncmp(ppst->pamfile,"BL50",4)==0) {
1667 	SAFE_STRNCPY (ppst->pamfile, "+5/-4", sizeof(ppst->pamfile));
1668 	SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
1669 	SAFE_STRNCPY (ppst->pam_name, "+5/-4", sizeof(ppst->pamfile));
1670       }
1671 #else
1672       else if (strncmp(ppst->pamfile,"MD20",4)==0) {
1673 	SAFE_STRNCPY (ppst->pamfile, "+2/-2", sizeof(ppst->pamfile));
1674 	SAFE_STRNCPY (ppst->pam_name, "+2/-2", sizeof(ppst->pam_name));
1675 	SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
1676 	ppst->p_d_mat = +2;
1677 	ppst->p_d_mis = -2;
1678       	mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1679       }
1680 #endif
1681       pam = npam;
1682     }
1683 
1684     SAFE_STRNCPY (m_msg->sqnam, "nt",sizeof(m_msg->sqnam));
1685     SAFE_STRNCPY (m_msg->sqtype, "DNA",sizeof(m_msg->sqtype));
1686   }	/* end DNA reset */
1687 
1688   else {  /* other parameters for protein comparison */
1689     if (pgm_def_arr[pgm_id].ktup > 0) {
1690       if (!ppst->param_u.fa.optwid_set) {
1691 	if (ppst->param_u.fa.ktup==1) ppst->param_u.fa.optwid = 32;
1692 	else ppst->param_u.fa.optwid = 16;
1693       }
1694     }
1695     if (!shift_set) {ppst->gshift = pgm_def_arr[pgm_id].gshift;}
1696     if (!subs_set) {ppst->gsubs = pgm_def_arr[pgm_id].hshift;}
1697   }
1698 
1699   SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, 120);
1700 }
1701 
1702 /* query_parm()	this function asks for any additional parameters
1703 	that have not been provided.  Could be null. */
1704 void
query_parm(struct mngmsg * m_msp,struct pstruct * ppst)1705 query_parm (struct mngmsg *m_msp, struct pstruct *ppst)
1706 {
1707    char    qline[40];
1708 
1709    if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1710      if (ppst->param_u.fa.ktup < 0)
1711        ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1712 
1713      if (ppst->param_u.fa.ktup == 0) {
1714        printf (" ktup? (1 to %d) [%d] ", mktup, pgm_def_arr[ppst->pgm_id].ktup);
1715        if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1716        else sscanf(qline,"%d",&ppst->param_u.fa.ktup);
1717      }
1718      if (ppst->param_u.fa.ktup == 0)
1719        ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
1720      else ktup_set = 1;
1721    }
1722 
1723 #if defined(PRSS)
1724    if (m_msp->shuff_max < 10) m_msp->shuff_max = MAX_RSTATS;
1725 
1726    if (!mshuff_set) {
1727      printf(" number of shuffles [%d]? ",m_msp->shuff_max);
1728      fflush(stdout);
1729      if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1730      else sscanf(qline,"%d",&m_msp->shuff_max);
1731    }
1732 
1733    if (ppst->zs_win == 0) {
1734      printf (" local (window) (w) or uniform (u) shuffle [u]? ");
1735      if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1736      else if (qline[0]=='w' || qline[0]=='W') {
1737        m_msp->shuff_wid = 20;
1738        printf(" local shuffle window size [%d]? ",m_msp->shuff_wid);
1739        if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1740        else sscanf(qline,"%d",&m_msp->shuff_wid);
1741      }
1742    }
1743 #endif
1744 }
1745 
1746 /* last_init() cannot look at aa0, n0, because it is only run once,
1747    it is not run before each new aa0 search */
1748 void
last_init(struct mngmsg * m_msg,struct pstruct * ppst)1749 last_init (struct mngmsg *m_msg, struct pstruct *ppst)
1750 {
1751   int ix_l, ix_i, i, pgm_id;
1752   double *kar_p;
1753   double aa0_f[MAXSQ];
1754 
1755   m_msg->zsflag = ppst->zsflag;
1756   m_msg->zsflag2 = ppst->zsflag2;
1757 
1758   if (ppst->zsflag < 0) {
1759     ppst->do_rep = 0;
1760   }
1761 
1762   pgm_id = ppst->pgm_id;
1763 
1764 #ifdef LALIGN
1765   m_msg->do_showbest = 1;
1766   m_msg->quiet = 1;
1767 #endif
1768 
1769 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1770   m_msg->nohist = 1;
1771   m_msg->shuff_max = 2000;
1772   ppst->shuff_node = m_msg->shuff_max/fa_max_workers;
1773 #else
1774   m_msg->shuff_max = m_msg->shuff_max_save;
1775 #endif
1776 
1777   if (m_msg->aln.llen < 1) {
1778     m_msg->aln.llen = 60;
1779   }
1780 
1781   if (m_msg->ldb_info.ldnaseq== SEQT_PROT) {
1782     m_msg->max_tot = MAXLIB_P;
1783   }
1784 
1785 #if defined(FASTX) || defined(FASTY) || defined(TFAST)
1786   /* set up translation tables: faatran.c */
1787   aainit(ppst->tr_type,ppst->debug_lib);
1788 #endif
1789 
1790 /* a sanity check */
1791 #if !defined(TFAST)
1792    if (m_msg->revcomp && m_msg->qdnaseq!=SEQT_DNA && m_msg->qdnaseq!=SEQT_RNA) {
1793      fprintf(stderr," cannot reverse complement protein\n");
1794      m_msg->revcomp = 0;
1795    }
1796 #endif
1797 
1798    if (pgm_def_arr[pgm_id].ktup > 0) {
1799 
1800      if (ppst->param_u.fa.ktup < 0)
1801        ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1802 
1803      if (ppst->param_u.fa.ktup < 1 || ppst->param_u.fa.ktup > mktup) {
1804        fprintf(stderr," warning ktup = %d out of range [1..%d], reset to %d\n",
1805 	       ppst->param_u.fa.ktup, mktup, ppst->param_u.fa.bktup);
1806        ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
1807      }
1808 
1809      if (ppst->sw_flag) {
1810        SAFE_STRNCPY(m_msg->f_id1,"sw",sizeof(m_msg->f_id1));
1811        SAFE_STRNCPY(m_msg->alabel, align_label[0], sizeof(m_msg->alabel));
1812      }
1813      else {
1814        SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
1815        SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
1816      }
1817    }
1818 
1819    if (pgm_id == TFA_PID) {
1820      m_msg->revcomp *= 3;
1821      if (m_msg->nframe == 3) m_msg->nframe += m_msg->revcomp;
1822    }
1823    else if (pgm_id == TFX_PID || pgm_id == TFY_PID) {
1824      if (m_msg->nframe == 1) m_msg->nframe += m_msg->revcomp;
1825    }
1826 
1827 #if !defined(TFAST)
1828   /* for fasta/fastx searches, itt iterates the the query strand */
1829   m_msg->nitt1 = m_msg->qframe-1;
1830 #else
1831   /* for tfasta/tfastxy searches, itt iterates the library frames */
1832   m_msg->nitt1 = m_msg->nframe-1;
1833 #endif
1834 
1835   if (pgm_def_arr[pgm_id].ktup > 0) {	/* its FASTA, not SSEARCH */
1836     if (ppst->param_u.fa.ktup>=2 && !ppst->param_u.fa.optwid_set) {
1837       ppst->param_u.fa.optwid=16;
1838       switch (pgm_id) {
1839       case FA_PID:
1840       case FX_PID:
1841       case FY_PID:
1842 	m_msg->thr_fact = 8;
1843 	m_msg->thr_fact = 8;
1844 	break;
1845       case TFA_PID:
1846       case TFX_PID:
1847       case TFY_PID:
1848 	m_msg->thr_fact = 4;
1849 	break;
1850       default:
1851 	m_msg->thr_fact = 4;
1852       }
1853     }
1854     else { m_msg->thr_fact = 4;}
1855   }
1856   else {
1857 #if !defined(SW_ALTIVEC) && !defined(SW_SSE2)
1858     m_msg->thr_fact = 1;	/* unvectorized SSEARCH  */
1859 #else
1860     m_msg->thr_fact = 8;	/* vectorized SSEARCH */
1861 #endif
1862   }
1863 
1864 #ifdef PCOMPLIB
1865   m_msg->thr_fact = 1;	/* use much larger buffers */
1866 #endif
1867 
1868 #if defined(PRSS)
1869    if (m_msg->shuff_max < 10) m_msg->shuff_max = MAX_RSTATS;
1870    if (ppst->zsflag < 10) ppst->zsflag += 10;
1871    if (ppst->zs_win > 0) {
1872      m_msg->shuff_wid = ppst->zs_win;
1873    }
1874 #endif
1875 
1876    if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1877      if (ppst->param_u.fa.iniflag) {
1878        ppst->score_ix = 1;
1879        SAFE_STRNCPY (m_msg->label, "initn init1", sizeof(m_msg->label));
1880      }
1881      else if (ppst->param_u.fa.optflag) {
1882        ppst->score_ix = 2;
1883        m_msg->stages = 1;
1884      }
1885    }
1886 
1887    if (!ppst->have_pam2) {
1888      alloc_pam (MAXSQ, MAXSQ, ppst);
1889      init_pam2(ppst);
1890    }
1891    init_pamx(ppst);
1892 
1893    if (ppst->pam_ms) {
1894      if (m_msg->qdnaseq == SEQT_PROT) {
1895        /* code to make 'L'/'I' identical scores */
1896        ix_l = pascii['L'];
1897        ix_i = pascii['I'];
1898        ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1899 	 ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1900 	 max(ppst->pam2[0][ix_l][ix_l],ppst->pam2[0][ix_i][ix_i]);
1901        for (i=1; i<=ppst->nsq; i++) {
1902 	 ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1903 	   max(ppst->pam2[0][i][ix_l],ppst->pam2[0][i][ix_i]);
1904 	 ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1905 	   max(ppst->pam2[0][ix_i][i],ppst->pam2[0][ix_l][i]);
1906        }
1907 
1908        /* code to make 'Q'/'K' identical scores */
1909        if (!shift_set) {
1910 	 ix_l = pascii['Q'];
1911 	 ix_i = pascii['K'];
1912 	 ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1913 	   ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1914 	   (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2;
1915 	 for (i=1; i<=ppst->nsq; i++) {
1916 	   ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1917 	     (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2;
1918 	   ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1919 	     (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2;
1920 	 }
1921        }
1922      }
1923    }
1924 
1925    /*
1926    print_pam(ppst);
1927    */
1928 
1929    /* once we have a complete pam matrix, we can calculate Lambda and K
1930       for "average" sequences */
1931    kar_p = NULL;
1932    init_karlin_a(ppst, aa0_f, &kar_p);
1933    do_karlin_a(ppst->pam2[0], ppst, aa0_f,
1934 	       kar_p, &m_msg->Lambda, &m_msg->K, &m_msg->H);
1935    ppst->pLambda = m_msg->Lambda;
1936    ppst->pK = m_msg->K;
1937    ppst->pH = m_msg->H;
1938    ppst->LK_set = 1;
1939    free(kar_p);
1940 
1941 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1942    if (ppst->ext_sq_set) {
1943      fprintf(stderr," -S not available on [t]fast[fs]\n");
1944      ppst->ext_sq_set = 0;
1945      ppst->nsq_e = ppst->nsq;
1946 
1947      /* reset sascii to ignore -S, map lc */
1948      init_ascii(0,lascii, ppst->nsq, 0);
1949      validate_novel_aa(lascii, ppst->nsq, 0);
1950    }
1951 #endif
1952 }
1953 
1954 /* alloc_pam2p creates a profile structure */
1955 int **
alloc_pam2p(int ** pam2p,int len,int nsq)1956 alloc_pam2p(int **pam2p, int len, int nsq) {
1957   int i, pam2p_len;
1958   int *pam2pp;
1959 
1960   if (pam2p == NULL) {
1961     if ((pam2p = (int **)calloc(len,sizeof(int *)))==NULL) {
1962       fprintf(stderr," Cannot allocate pam2p: %d\n",len);
1963       return NULL;
1964     }
1965 
1966     if((pam2p[0] = (int *)calloc((nsq+1)*len,sizeof(int)))==NULL) {
1967       fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
1968       free(pam2p);
1969       return NULL;
1970     }
1971   }
1972   else {
1973     pam2p_len = (nsq+1)*len*sizeof(int);
1974     pam2pp = pam2p[0];
1975     if ((pam2pp = (int *)realloc(pam2pp,pam2p_len))==NULL) {
1976       fprintf(stderr,
1977 	      "Cannot reallocate pam2p[0]: %ld\n", (nsq+1)*len*sizeof(int));
1978       return NULL;
1979     }
1980     memset(pam2pp,0,pam2p_len);
1981 
1982     if ((pam2p = (int **)realloc(pam2p,len*sizeof(int *)))==NULL) {
1983       fprintf(stderr," Cannot reallocate pam2p: %d\n",len);
1984       return NULL;
1985     }
1986     pam2p[0] = pam2pp;
1987   }
1988 
1989   for (i=1; i<len; i++) {
1990     pam2p[i] = pam2p[0] + (i*(nsq+1));
1991   }
1992 
1993   return pam2p;
1994 }
1995 
free_pam2p(int ** pam2p)1996 void free_pam2p(int **pam2p) {
1997   if (pam2p) {
1998     free(pam2p[0]);
1999     free(pam2p);
2000   }
2001 }
2002 
2003 /* sortbest has now become comparison function specific so that we can use
2004    a different comparison for fasts/f
2005 */
2006 #if !defined(FASTS) && !defined (FASTF) && !defined(FASTM)
2007 void
qshuffle()2008 qshuffle() {}
2009 
2010 #ifndef LALIGN	 /* LALIGN has last_calc() in last_thresh.c */
2011 int
last_calc(unsigned char * aa0,unsigned char * aa1,int maxn,struct beststr ** bestp_arr,int nbest,struct mngmsg m_msg,struct pstruct * ppst,void ** f_str,void * pstat_str)2012 last_calc(
2013 	  unsigned char *aa0, unsigned char *aa1, int maxn,
2014 	  struct beststr **bestp_arr, int nbest,
2015 	  struct mngmsg m_msg, struct pstruct *ppst
2016 	  , void **f_str
2017 	  , void *pstat_str)
2018 {
2019   return nbest;
2020 }
2021 #endif
2022 
2023 /* this function is almost never called, thus a slow shell sort */
sortbest(bptr,nbest,irelv)2024 void sortbest (bptr, nbest, irelv)
2025 struct beststr **bptr;
2026 int nbest, irelv;
2027 {
2028     int gap, i, j;
2029     struct beststr *tmp;
2030 
2031     for (gap = nbest/2; gap > 0; gap /= 2)
2032 	for (i = gap; i < nbest; i++)
2033 	    for (j = i - gap; j >= 0; j-= gap) {
2034 	      if (bptr[j]->rst.score[irelv] >= bptr[j + gap]->rst.score[irelv]) break;
2035 	      tmp = bptr[j];
2036 	      bptr[j] = bptr[j + gap];
2037 	      bptr[j + gap] = tmp;
2038 	    }
2039 }
2040 
show_aux(FILE * fp,struct beststr * bptr)2041 void show_aux(FILE *fp, struct beststr *bptr) {}
header_aux(FILE * fp)2042 void header_aux(FILE *fp) {}
2043 
2044 #else
2045 /* this function is almost never called, thus a slow shell sort */
sortbest(bptr,nbest,irelv)2046 void sortbest (bptr, nbest, irelv)
2047 struct beststr **bptr;
2048 int nbest, irelv;
2049 {
2050     int gap, i, j;
2051     struct beststr *tmp;
2052 
2053     for (gap = nbest/2; gap > 0; gap /= 2)
2054 	for (i = gap; i < nbest; i++)
2055 	    for (j = i - gap; j >= 0; j-= gap) {
2056 	      if (bptr[j]->rst.escore < bptr[j + gap]->rst.escore) break;
2057 	      tmp = bptr[j];
2058 	      bptr[j] = bptr[j + gap];
2059 	      bptr[j + gap] = tmp;
2060 	    }
2061 }
2062 
2063 #if defined(FASTS) || defined(FASTM)
2064 
2065 /* this shuffle is for FASTS  */
2066 /* convert ',' -> '\0', shuffle each of the substrings */
2067 void
qshuffle(unsigned char * aa0,int n0,int nm0,void * rand_state)2068 qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
2069 
2070   unsigned char **aa0start, *aap, tmp;
2071   int i,j,k, ns;
2072 
2073   if ((aa0start=(unsigned char **)calloc(nm0+1,
2074 					 sizeof(unsigned char *)))==NULL) {
2075     fprintf(stderr,"cannot calloc for qshuffle %d\n",nm0);
2076     exit(1);
2077   }
2078 
2079   aa0start[0]=aa0;
2080   for (k=1,i=0; i<n0; i++) {
2081     if (aa0[i]==EOSEQ || aa0[i]==ESS) {
2082       aa0[i]='\0';
2083       aa0start[k++] = &aa0[i+1];
2084     }
2085   }
2086 
2087   /* aa0start has the beginning of each substring */
2088   for (k=0; k<nm0; k++) {
2089     aap=aa0start[k];
2090     ns = strlen((const char *)aap);
2091     for (i=ns; i>1; i--) {
2092       j = my_nrand(i, rand_state);
2093       tmp = aap[j];
2094       aap[j] = aap[i-1];
2095       aap[i-1] = tmp;
2096     }
2097     aap[ns] = 0;
2098   }
2099 
2100   for (k=1; k<nm0; k++) {
2101 /*  aap = aa0start[k];
2102     while (*aap) fputc(pst.sq[*aap++],stderr);
2103     fputc('\n',stderr);
2104 */
2105     aa0start[k][-1]=ESS;
2106   }
2107 
2108   free(aa0start);
2109 }
2110 #endif
2111 
2112 #ifdef FASTF
qshuffle(unsigned char * aa0,int n0,int nm0,void * rand_state)2113 void qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
2114 
2115   int i, j, k, nmpos;
2116   unsigned char tmp;
2117   int nmoff;
2118 
2119   nmoff = (n0 - nm0 - 1)/nm0 + 1;
2120 
2121   for (i = nmoff-1 ; i > 0 ; i--) {
2122 
2123     /* j = nrand(i); if (i == j) continue;*/       /* shuffle columns */
2124     j = (nmoff -1 ) - i;
2125     if (i <= j) break; /* reverse columns */
2126 
2127     /* swap all i'th column residues for all j'th column residues */
2128     for(nmpos = 0, k = 0 ; k < nm0 ; k++, nmpos += nmoff+1 ) {
2129       tmp = aa0[nmpos + i];
2130       aa0[nmpos + i] = aa0[nmpos + j];
2131       aa0[nmpos + j] = tmp;
2132     }
2133   }
2134 }
2135 #endif
2136 
2137 
2138 /* show additional best_str values */
show_aux(FILE * fp,struct beststr * bptr)2139 void show_aux(FILE *fp, struct beststr *bptr) {
2140   fprintf(fp," %2d %3d",bptr->rst.segnum,bptr->rst.seglen);
2141 }
2142 
header_aux(FILE * fp)2143 void header_aux(FILE *fp) {
2144   fprintf(fp, " sn  sl");
2145 }
2146 #endif
2147 
2148 void
fill_pam(int ** pam2p,int n0,int nsq,double ** freq2d,double scale)2149 fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) {
2150   int i, j, new_j;
2151   double freq;
2152 
2153   /* fprintf(stderr, "scale: %g\n", scale); */
2154 
2155   /* now fill in the pam matrix: */
2156   for (j = 1 ; j <=20 ; j++) {
2157     new_j = qascii[pssm_aa[j]];
2158     for (i = 0 ; i < n0 ; i++) {
2159       freq = scale * freq2d[i][j-1];
2160       if ( freq < 0.0) freq -= 0.5;
2161       else freq += 0.5;
2162       pam2p[i][new_j] = (int)(freq);
2163     }
2164   }
2165 }
2166 
2167 double
get_lambda(int ** pam2p,int n0,int nsq,unsigned char * query)2168 get_lambda(int **pam2p, int n0, int nsq, unsigned char *query) {
2169   double lambda, H;
2170   double *pr, tot, sum;
2171   int i, ioff, j, min, max, q_i;
2172 
2173   /* get min and max scores */
2174   min = BIGNUM;
2175   max = -BIGNUM;
2176   if(pam2p[0][1] == -BIGNUM) {
2177     ioff = 1;
2178     n0++;
2179   } else {
2180     ioff = 0;
2181   }
2182 
2183   for (i = ioff ; i < n0 ; i++) {
2184     for (j = 1; j < nsq ; j++) {
2185       if (min > pam2p[i][j])
2186 	min = pam2p[i][j];
2187       if (max < pam2p[i][j])
2188 	max = pam2p[i][j];
2189     }
2190   }
2191 
2192   /*  fprintf(stderr, "min: %d\tmax:%d\n", min, max); */
2193 
2194   if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
2195     fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
2196     exit(1);
2197   }
2198 
2199   tot = (double) rrtotal * (double) rrtotal * (double) n0;
2200   for (i = ioff ; i < n0 ; i++) {
2201 
2202     if (query[i] < 'A') {q_i = query[i];}
2203     else {q_i= aascii[query[i]];}
2204 
2205     for (j = 1; j < nsq ; j++) {
2206       pr[pam2p[i][j] - min] +=
2207 	(double) ((double) rrcounts[q_i] * (double) rrcounts[j]) / tot;
2208     }
2209   }
2210 
2211   sum = 0.0;
2212   for(i = 0 ; i <= max-min ; i++) {
2213     sum += pr[i];
2214     /*     fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
2215   }
2216   /*   fprintf(stderr, "sum: %g\n", sum); */
2217 
2218   for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }
2219 
2220   if (!karlin(min, max, pr, &lambda, &H)) {
2221     fprintf(stderr, "Karlin lambda estimation failed\n");
2222   }
2223 
2224   /*   fprintf(stderr, "lambda: %g\n", lambda); */
2225   free(pr);
2226 
2227   return lambda;
2228 }
2229 
2230 /*
2231    *aa0 - query sequence
2232    n0   - length
2233    pamscale - scaling for pam matrix - provided by apam.c, either
2234               0.346574 = ln(2)/2 (P120, BL62) or
2235 	      0.231049 = ln(2)/3 (P250, BL50)
2236 */
2237 
2238 void
2239 scale_pssm(int **pssm2p, double **freq2d,
2240 	   unsigned char *query, int n0,
2241 	   int **pam2, double pamscale);
2242 
2243 static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";
2244 
2245 void
read_pssm(unsigned char * aa0,int n0,int nsq,double pamscale,FILE * fp,int pgpf_type,struct pstruct * ppst)2246 read_pssm(unsigned char *aa0, int n0, int nsq,
2247 	  double pamscale,
2248 	  FILE *fp, int pgpf_type, struct pstruct *ppst) {
2249   int i, j, len, k;
2250   int qi, rj;	/* qi - index query; rj - index residues (1-20) */
2251   int **pam2p;
2252   int first, too_high;
2253   unsigned char *query, ctmp;
2254   char dline[512];
2255   double freq, **freq2d, lambda, new_lambda;
2256   double scale, scale_high, scale_low;
2257 
2258   pam2p = ppst->pam2p[0];
2259 
2260   if (pgpf_type == 0) {
2261 
2262     if (1 != fread(&len, sizeof(int), 1, fp)) {
2263       fprintf(stderr, "error reading from checkpoint file: %d\n", len);
2264       exit(1);
2265     }
2266 
2267     if (len != n0) {
2268       fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
2269 	      len,n0);
2270       exit(1);
2271     }
2272 
2273     /* read over query sequence stored in BLAST profile */
2274     if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
2275       fprintf(stderr, "Couldn't allocate memory for query!\n");
2276       exit(1);
2277     }
2278 
2279     if (len != fread(query, sizeof(char), len, fp)) {
2280       fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
2281       exit(1);
2282     }
2283   }
2284   else if (pgpf_type == 1) {
2285 
2286     if ((fgets(dline,sizeof(dline),fp) == NULL)  ||
2287 	(1 != sscanf(dline, "%d",&len))) {
2288       fprintf(stderr, "error reading from checkpoint file: %d\n", len);
2289       exit(1);
2290     }
2291 
2292     if(len != n0) {
2293       fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
2294 	      len,n0);
2295       exit(1);
2296     }
2297 
2298     /* read over query sequence stored in BLAST profile */
2299     if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
2300       fprintf(stderr, "Couldn't allocate memory for query!\n");
2301       exit(1);
2302     }
2303 
2304     if (fgets((char *)query,len+2,fp)==NULL) {
2305       fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
2306       exit(1);
2307     }
2308   }
2309   else {
2310     fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type);
2311     exit(1);
2312   }
2313 
2314   /* currently we don't do anything with query; ideally, we should
2315      check to see that it actually matches aa0 ... */
2316 
2317   /* quick 2d array alloc: */
2318   if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
2319     fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
2320     exit(1);
2321   }
2322 
2323   if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) {
2324     fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
2325     exit(1);
2326   }
2327 
2328   /* a little pointer arithmetic to fill out 2d array: */
2329   for (i = 1 ; i < n0 ; i++) {
2330     freq2d[i] = freq2d[i-1] + 20;
2331   }
2332 
2333   if (pgpf_type == 0) {
2334     for (qi = 0 ; qi < n0 ; qi++) {
2335       for (rj = 0 ; rj < 20 ; rj++) {
2336 	if(1 != fread(&freq, sizeof(double), 1, fp)) {
2337 	  fprintf(stderr, "Error while reading frequencies!\n");
2338 	  exit(1);
2339 	}
2340 	freq2d[qi][rj] = freq;
2341       }
2342     }
2343   }
2344   else {
2345     for (qi = 0 ; qi < n0 ; qi++) {
2346       if ((fgets(dline,sizeof(dline),fp) ==NULL) ||
2347       (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n",
2348 		 &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4],
2349 		 &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9],
2350 		 &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14],
2351 		      &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) {
2352 	fprintf(stderr, "Error while reading frequencies: %d read!\n",k);
2353 	exit(1);
2354       }
2355       for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; }	/* reverse scaling */
2356     }
2357   }
2358 
2359   scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
2360 
2361   free(freq2d[0]);
2362   free(freq2d);
2363 
2364   free(query);
2365 }
2366 
2367 /* before fasta-36.3.6 (with reordered amino-acid mapping), scale_pssm()
2368    simply produced a log(q_ij/p_j) and put it into pam2p.
2369 
2370    But pssm's use pssm_aa encoding, while fasta-36.3.6 use NCBIstdaa
2371    encoding, so the pam2p must be re-mapped
2372 */
2373 
2374 void
scale_pssm(int ** pssm2p,double ** freq2d,unsigned char * query,int n0,int ** pam2,double pamscale)2375 scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) {
2376   int i, qi, rj;
2377   double freq, new_lambda, lambda;
2378   int first, too_high;
2379   double scale, scale_high, scale_low;
2380 
2381   for (rj = 0 ; rj < 20 ; rj++) {
2382     for (qi = 0 ; qi < n0 ; qi++) {
2383       if (freq2d[qi][rj] > 1e-20) {
2384 	freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal));
2385 	freq /= pamscale; /* this gets us close to originial pam scores */
2386 	freq2d[qi][rj] = freq;
2387       }
2388       else {
2389 	/* when blastpgp decides to leave something out, it puts 0's in all the frequencies
2390 	   in the binary checkpoint file.  In the ascii version, however, it uses BLOSUM62
2391 	   values.  I will put in scoring matrix values as well */
2392 
2393 	if (query[qi] < 'A') {
2394 	  freq2d[qi][rj] = pam2[query[qi]][rj+1];
2395 	}
2396 	else {
2397 	  freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1];
2398 	}
2399       }
2400     }
2401   }
2402 
2403   /* now figure out the right scale */
2404   scale = 1.0;
2405   lambda = get_lambda(pam2, 20, 20, ustandard_aa);
2406 
2407   /* should be near 1.0 because of our initial scaling by ppst->pamscale */
2408   /* fprintf(stderr, "real_lambda: %g\n", lambda); */
2409 
2410   /* get initial high/low scale values: */
2411   first = 1;
2412   while (1) {
2413     fill_pam(pssm2p, n0, 20, freq2d, scale);
2414     new_lambda = get_lambda(pssm2p, n0, 20, query);
2415 
2416     if (new_lambda > lambda) {
2417       if (first) {
2418 	first = 0;
2419 	scale = scale_high = 1.0 + 0.05;
2420 	scale_low = 1.0;
2421 	too_high = 1;
2422       } else {
2423 	if (!too_high) break;
2424 	scale = (scale_high += scale_high - 1.0);
2425       }
2426     } else if (new_lambda > 0) {
2427       if (first) {
2428 	first = 0;
2429 	scale_high = 1.0;
2430 	scale = scale_low = 1.0 - 0.05;
2431 	too_high = 0;
2432       } else {
2433 	if (too_high) break;
2434 	scale = (scale_low += scale_low - 1.0);
2435       }
2436     } else {
2437       fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
2438       exit(1);
2439     }
2440   }
2441 
2442   /* now do binary search between low and high */
2443   for (i = 0 ; i < 10 ; i++) {
2444     scale = 0.5 * (scale_high + scale_low);
2445     fill_pam(pssm2p, n0, 20, freq2d, scale);
2446     new_lambda = get_lambda(pssm2p, n0, 20, query);
2447 
2448     if (new_lambda > lambda) scale_low = scale;
2449     else scale_high = scale;
2450   }
2451 
2452   scale = 0.5 * (scale_high + scale_low);
2453   fill_pam(pssm2p, n0, 20, freq2d, scale);
2454 
2455   /*
2456   fprintf(stderr, "final scale: %g\n", scale);
2457 
2458   for (qi = 0 ; qi < n0 ; qi++) {
2459     fprintf(stderr, "%4d %c:  ", qi+1, query[qi]);
2460     for (rj = 1 ; rj <= 20 ; rj++) {
2461       fprintf(stderr, "%4d", pssm2p[qi][rj]);
2462     }
2463     fprintf(stderr, "\n");
2464   }
2465   */
2466 }
2467 
2468 #if defined(CAN_PSSM)
2469 int
2470 parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
2471 		  unsigned char **query, double ***freqs,
2472 		  char *matrix, int *gap_open, int *gap_extend,
2473 		  double *lambda);
2474 
2475 /* the ASN.1 pssm includes information about the scoring matrix used
2476    (though not the gap penalty in the current version PSSM:2) The PSSM
2477    scoring matrix and gap penalties should become the default if they
2478    have not been set explicitly.
2479 */
2480 
2481 /* read the PSSM from an open FILE *fp - but nothing has been read
2482    from *fp */
2483 
2484 int
read_asn_pssm(unsigned char * aa0,int n0,int nsq,double pamscale,FILE * fp,struct pstruct * ppst)2485 read_asn_pssm(unsigned char *aa0, int n0, int nsq,
2486 	      double pamscale, FILE *fp, struct pstruct *ppst) {
2487 
2488   int i, j, len, k;
2489   int qi, rj;	/* qi - index query; rj - index residues (1-20) */
2490   int **pam2p;
2491   int first, too_high;
2492   unsigned char *query, ctmp;
2493   char dline[512];
2494   char matrix[MAX_SSTR];
2495   double psi2_lambda;
2496   double freq, **freq2d, lambda, new_lambda;
2497   double scale, scale_high, scale_low;
2498   int gap_open, gap_extend;
2499   int n_rows, n_cols;
2500 
2501   pam2p = ppst->pam2p[0];
2502 
2503   if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d,
2504 			matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {
2505     return -1;
2506   }
2507 
2508   if (query == NULL) { query = aa0;}
2509 
2510   if (!gap_set) {
2511     if (gap_open) {
2512       if (gap_open > 0) {gap_open = -gap_open;}
2513       ppst->gdelval = gap_open;
2514     }
2515     else if (strncmp(matrix,"BLOSUM62",8)==0) {
2516       ppst->gdelval = -11;
2517     }
2518     gap_set = 1;
2519   }
2520   if (!del_set) {
2521     if (gap_extend) {
2522       if (gap_extend > 0) {gap_extend = -gap_extend;}
2523       ppst->ggapval = gap_extend;
2524     }
2525     else if (strncmp(matrix,"BLOSUM62",8)==0) {
2526       ppst->ggapval = -1;
2527     }
2528     del_set = 1;
2529   }
2530 
2531   if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {
2532     SAFE_STRNCPY(ppst->pamfile, "BL62", 120);
2533     SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, 120);
2534     standard_pam(ppst->pamfile,ppst,del_set, gap_set);
2535     if (!ppst->have_pam2) {
2536      alloc_pam (MAXSQ, MAXSQ, ppst);
2537     }
2538     init_pam2(ppst);
2539     ppst->pam_set = 1;
2540   }
2541 
2542   if (n_cols < n0) {
2543     fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);
2544     exit(1);
2545   }
2546 
2547   scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
2548 
2549   free(freq2d[0]);
2550   free(freq2d);
2551 
2552   if (query != aa0) free(query);
2553   return 1;
2554 }
2555 #endif
2556 
2557 /* last_params() sets up values in pstruct *ppst now that all
2558    parameters and data is available.
2559 
2560    It:
2561    (1) moves m_msg->n0 to ppst->n0 for statistics calculations
2562    (2) sets ppst->nsq_e
2563    (3) reads the PSSM file if one is being used
2564    (4) calculates m_msg->nm0 for FASTF/S/M
2565    (5) determines statistical strategy for FASTF/S, sets last_calc_flg
2566        and qshuffle
2567    (6) lowers ktup for short sequences
2568 */
2569 
2570 void
last_params(unsigned char * aa0,int n0,struct mngmsg * m_msp,struct pstruct * ppst)2571 last_params(unsigned char *aa0, int n0,
2572 	    struct mngmsg *m_msp,
2573 	    struct pstruct *ppst
2574 	    ) {
2575   int i, nsq;
2576   FILE *fp;
2577   int is_fastxy=0;
2578   int n0_eff;
2579   /* do_karlin_a() must be re-run everytime the scoring matrix changes */
2580   double *kar_p;
2581   double aa0_f[MAXSQ];
2582 
2583   if (n0 < 0) { return;}
2584 
2585 
2586   n0_eff = m_msp->n0;
2587   ppst->n0 = m_msp->n0;
2588 #if !defined(TFAST) && (defined(FASTX) || defined(FASTY))
2589   n0_eff /= 3;
2590   is_fastxy = 1;
2591 #endif
2592 
2593   /* reset the PAMFILE to the original value */
2594   if (strncmp(ppst->pamfile, ppst->pamfile_save,120)!=0) {
2595     SAFE_STRNCPY(ppst->pamfile, ppst->pamfile_save, 120);
2596     standard_pam(ppst->pamfile,ppst,del_set, gap_set);
2597     init_pam2(ppst);
2598     init_pamx(ppst);
2599   }
2600 
2601   /* **************************************************************** */
2602   /* adjust scoring matrix for short protein/translated protein queries */
2603   /* **************************************************************** */
2604 
2605   if (ppst->pam_variable) {
2606     if (min_pam_bits(n0_eff, DEF_MIN_BITS, ppst, del_set, gap_set)) {
2607       init_pam2(ppst);
2608       init_pamx(ppst);
2609       kar_p = NULL;
2610       init_karlin_a(ppst, aa0_f, &kar_p);
2611       do_karlin_a(ppst->pam2[0], ppst, aa0_f,
2612 		  kar_p, &m_msp->Lambda, &m_msp->K, &m_msp->H);
2613       ppst->pLambda = m_msp->Lambda;
2614       ppst->pK = m_msp->K;
2615       ppst->pH = m_msp->H;
2616       ppst->LK_set = 1;
2617       free(kar_p);
2618     }
2619   }
2620 
2621   if (ppst->ext_sq_set) { ppst->nsq_e = nsq = 2*ppst->nsq; }
2622   else {ppst->nsq_e = nsq = ppst->nsq;}
2623 
2624 #if defined(CAN_PSSM)
2625   ppst->pam2p[0] = alloc_pam2p(ppst->pam2p[0],n0,MAXSQ);
2626   ppst->pam2p[1] = alloc_pam2p(ppst->pam2p[1],n0,MAXSQ);
2627 
2628   if (ppst->pam_pssm) {
2629     if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) {
2630       read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst);
2631       extend_pssm(aa0, n0, ppst);
2632     }
2633     else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) {
2634       read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst);
2635       extend_pssm(aa0, n0, ppst);
2636     }
2637     else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) {
2638       if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) {
2639 	extend_pssm(aa0, n0, ppst);
2640       }
2641       else {
2642 	fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile);
2643 	ppst->pam_pssm = 0;
2644 	return;
2645       }
2646     }
2647     else {
2648       fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile);
2649       ppst->pam_pssm = 0;
2650       return;
2651     }
2652   }
2653 #endif
2654 
2655 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
2656   m_msp->nm0 = 1;
2657   for (i=0; i<n0; i++)
2658     if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msp->nm0++;
2659 
2660 /*
2661   for FASTS, we can do statistics in one of two different ways
2662   if there are <= 10 query fragments, then we calculate probabilistic
2663   scores for every library sequence.  If there are > 10 fragments, this
2664   takes much too long and too much memory, so we use the old fashioned
2665   raw score only z-score normalized method initially, and then calculate
2666   the probabilistic scores for the best hits.  To scale those scores, we
2667   also need a set of random probabilistic scores.  So we do the qshuffle
2668   to get them.
2669 
2670   For FASTF, precalculating probabilities is prohibitively expensive,
2671   so we never do it; FASTF always acts like FASTS with nfrags>10.
2672 
2673 */
2674 
2675 #if defined(FASTS) || defined(FASTM)
2676   if (m_msp->nm0 > 10) m_msp->escore_flg = 0;
2677   else m_msp->escore_flg = 1;
2678 #endif
2679 
2680   if (m_msp->escore_flg && (ppst->zsflag&1)) {
2681     m_msp->last_calc_flg = 0;
2682     m_msp->qshuffle = 0;
2683   }
2684   else {	/* need random query, second set of 2000 scores */
2685     m_msp->last_calc_flg = 1;
2686     m_msp->qshuffle = 1;
2687   }
2688 #else
2689 #ifndef LALIGN
2690   m_msp->last_calc_flg = 0;
2691 #else
2692   m_msp->last_calc_flg = 1;	/* LALIGN needs last_calc for threshold */
2693 #endif
2694   m_msp->qshuffle = 0;
2695   m_msp->escore_flg = 0;
2696   m_msp->nm0 = 1;
2697 #endif
2698 
2699 /* adjust the ktup if appropriate */
2700 
2701   if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
2702     if (!ktup_set ) {
2703       ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
2704       if (m_msp->qdnaseq == SEQT_PROT || is_fastxy) {
2705 #if defined(FASTS) || defined(FASTM)
2706 	if (n0_eff > 100 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
2707 #endif
2708 	if (n0_eff <= 40 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
2709       }
2710       else if (m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA) {
2711 	if (n0_eff <= 20 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
2712 #if defined(FASTS) || defined(FASTM)
2713 	/* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */
2714 	else ppst->param_u.fa.ktup = 2;
2715 #else
2716 	else if (n0 < 50 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
2717 	else if (n0 < 100 && ppst->param_u.fa.ktup > 3)  ppst->param_u.fa.ktup = 3;
2718 #endif
2719       }
2720     }
2721     /* regardless of ktup state */
2722     if (ppst->param_u.fa.use_E_thresholds) {
2723       ppst->param_u.fa.use_E_thresholds = ppst->LK_set;
2724     }
2725     if (!E_cgap_set) {
2726       ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * 5;
2727     }
2728     else {
2729       if (ppst->param_u.fa.E_join > 1.0) {
2730 	ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * ppst->param_u.fa.E_join;
2731       }
2732     }
2733   }
2734 }
2735 
2736 /* validate_params() exists because of bugs that keep appearing
2737 
2738    (1) pam2[0][x][0] or pam2[0][0][x] are not -BIGNUM
2739    (2) sascii[] (or qascii[], lascii[]) have values outside nsq_e.
2740  */
2741 
2742 int
validate_params(const unsigned char * aa0,int n0,const struct mngmsg * m_msg,const struct pstruct * ppst,const int * lascii,const int * pascii)2743 validate_params(const unsigned char *aa0, int n0,
2744 		const struct mngmsg *m_msg,
2745 		const struct pstruct *ppst,
2746 		const int *lascii, const int *pascii) {
2747   int good_params = 1;
2748   int i;
2749 
2750   /* check for -BIGNUM for boundaries of pam2[0][0:x][x:0] */
2751 
2752   for (i=0; i< ppst->nsq; i++) {
2753     if (ppst->pam2[0][0][i] > -1000) {
2754       fprintf(stderr," *** ERROR ***  pam2[0][0][%d/%c] == %d\n",
2755 	      i,NCBIstdaa[i],ppst->pam2[0][0][i]);
2756       good_params = 0;
2757     }
2758     if (ppst->pam2[0][i][0] > -1000) {
2759       fprintf(stderr," *** ERROR ***  pam2[0][%d/%c][0] == %d\n",
2760 	      i,NCBIstdaa[i],ppst->pam2[0][i][0]);
2761       good_params = 0;
2762     }
2763   }
2764 
2765   /* check for -BIGNUM for boundaries of pam2[1][0:x][x:0] */
2766   if (ppst->ext_sq_set) {
2767     for (i=0; i< ppst->nsqx; i++) {
2768       if (ppst->pam2[1][0][i] > -1000) {
2769 	fprintf(stderr," *** ERROR ***  pam2[1][0][%d] == %d\n",
2770 		i,ppst->pam2[1][0][i]);
2771       good_params = 0;
2772       }
2773       if (ppst->pam2[1][i][0] > -1000) {
2774 	fprintf(stderr," *** ERROR ***  pam2[1][%d][0] == %d\n",
2775 		i,ppst->pam2[1][i][0]);
2776       good_params = 0;
2777       }
2778     }
2779   }
2780 
2781   /* check for valid residues in query */
2782   for (i=0; i<n0; i++) {
2783     if (aa0[i] > ppst->nsq_e && aa0[i] != ESS) {
2784       fprintf(stderr," *** ERROR *** aa0[%d] = %c[%d > %d] out of range\n",
2785 	      i, aa0[i], aa0[i], ppst->nsq_e);
2786       good_params = 0;
2787     }
2788   }
2789 
2790   for (i=0; i<128; i++) {
2791     if (lascii[i] < NA && lascii[i] > ppst->nsq_e) {
2792       fprintf(stderr," *** ERROR *** lascii [%c|%d] = %d > %d out of range\n",
2793 	      i, i, lascii[i], ppst->nsq_e);
2794       good_params = 0;
2795     }
2796 
2797     /*  currently, pascii[] is not reset for upper-case only
2798     if (pascii[i] < NA && pascii[i] > ppst->nsq_e) {
2799       fprintf(stderr," *** WARNING *** pascii[%c|%d] = %d > %d out of range\n",
2800 	      i, i, pascii[i], ppst->nsq_e);
2801     }
2802     */
2803 
2804   }
2805 
2806   return good_params;
2807 }
2808 
2809 /* given a good profile in ppst->pam2p[0], make an extended profile
2810    in ppst->pam2p[1]
2811 */
2812 void
extend_pssm(unsigned char * aa0,int n0,struct pstruct * ppst)2813 extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) {
2814 
2815   int i, j, nsq;
2816   int sa_x, sa_t, sa_b, sa_z, sa_j;
2817   int **pam2p0, **pam2p1;
2818 
2819   nsq = ppst->nsq;
2820 
2821   pam2p0 = ppst->pam2p[0];
2822   pam2p1 = ppst->pam2p[1];
2823 
2824   sa_x = pascii['X'];
2825   sa_t = pascii['*'];
2826   if (sa_t >= ppst->nsq) {sa_t = sa_x;}
2827   sa_b = pascii['B'];
2828   sa_z = pascii['Z'];
2829   sa_j = pascii['J'];
2830 
2831   /* fill in boundaries, B, Z, *, X */
2832   for (i=0; i<n0; i++) {
2833     pam2p0[i][0] = -BIGNUM;
2834     pam2p0[i][sa_b] = (int)
2835       (((float)pam2p0[i][pascii['N']]+(float)pam2p0[i][pascii['D']]+0.5)/2.0);
2836     pam2p0[i][sa_z] = (int)
2837       (((float)pam2p0[i][pascii['Q']]+(float)pam2p0[i][pascii['E']]+0.5)/2.0);
2838     pam2p0[i][sa_j] = (int)
2839       (((float)pam2p0[i][pascii['I']]+(float)pam2p0[i][pascii['L']]+0.5)/2.0);
2840     pam2p0[i][sa_x] = ppst->pam_xm;
2841     pam2p0[i][sa_t] = ppst->pam_xm;
2842   }
2843 
2844   /* copy pam2p0 into pam2p1 */
2845   for (i=0; i<n0; i++) {
2846     pam2p1[i][0] = -BIGNUM;
2847     for (j=1; j<=ppst->nsq; j++) {
2848       pam2p1[i][j] = pam2p0[i][j];
2849     }
2850   }
2851 
2852   /* then fill in extended characters, if necessary */
2853   if (ppst->ext_sq_set) {
2854     for (i=0; i<n0; i++) {
2855       for (j=1; j<=ppst->nsq; j++) {
2856 	pam2p0[i][nsq+j] = pam2p0[i][j];
2857 	pam2p1[i][nsq+j] = ppst->pam_xm;
2858       }
2859     }
2860   }
2861 }
2862 
format_params(struct opt_def_str * opt_ptr,char * string)2863 void format_params(struct opt_def_str *opt_ptr, char *string) {
2864 
2865   if (opt_ptr->opt_char == 'r') {
2866     sprintf(string, " [+%d/%d]", opt_ptr->i_param1, opt_ptr->i_param2);
2867     return;
2868   }
2869 
2870   switch (opt_ptr->fmt_type) {
2871 
2872   case 1:
2873     sprintf(string, " [%d]", opt_ptr->i_param1); break;
2874   case 2:
2875     sprintf(string, " [%d,%d]", opt_ptr->i_param1, opt_ptr->i_param2); break;
2876   case 3:
2877     sprintf(string, " [%.4g]", opt_ptr->d_param1); break;
2878   case 4:
2879     sprintf(string, " [%.4g,%.4g]", opt_ptr->d_param1, opt_ptr->d_param2); break;
2880   case 5:
2881     sprintf(string, " [%s]", opt_ptr->s_param); break;
2882   case 0:
2883   default:
2884     string[0] = '\0'; break;
2885   }
2886 }
2887 
2888 
2889 #if defined(FASTX) || defined(FASTY)
2890 static char *common_opts = "sfgjSEbdI";
2891 #else
2892 #if defined(LALIGN)
2893 static char *common_opts = "sfgEZI";
2894 #else
2895 static char *common_opts = "sfgSbdI";
2896 #endif
2897 #endif
2898 
2899 void
show_help(char * pgm_name,int pgm_id)2900 show_help(char *pgm_name, int pgm_id) {
2901   int i, j;
2902   int opt_line_cnt=0;
2903   char tmp_string[MAX_STR];
2904   struct opt_def_str *opt_ptr;
2905 
2906   printf("USAGE\n");
2907 #ifndef LALIGN
2908   printf(" %s [-options] query_file library_file",pgm_name);
2909   if (pgm_def_arr[pgm_id].ktup > 0) {
2910     printf(" [ktup]\n");
2911   }
2912   else {printf("\n");}
2913 #else
2914   printf(" %s [-options] seq_file1 seq_file2\n",pgm_name);
2915 #endif
2916   printf(" %s -help for a complete option list\n",pgm_name);
2917   printf("\nDESCRIPTION\n");
2918   printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
2919   printf("\n");
2920   printf("COMMON OPTIONS (options must preceed query_file library_file)\n");
2921 
2922   for (i=0; i<strlen(common_opts); i++) {
2923     opt_ptr = g_options;
2924     for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
2925       if (common_opts[i]==opt_ptr[j].opt_char) {
2926 	format_params(&opt_ptr[j], tmp_string);
2927 	printf(" -%c%c %s %s;",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),
2928 	       tmp_string, opt_ptr[j].opt_descr_s);
2929 	/* if ((++opt_line_cnt % 2) == 0) printf("\n"); */
2930 	printf("\n");
2931 	goto next_option;
2932       }
2933     }
2934 
2935     opt_ptr = f_options;
2936     for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
2937       if (common_opts[i]==opt_ptr[j].opt_char) {
2938 	format_params(&opt_ptr[j], tmp_string);
2939 	printf(" -%c%c %s %s;",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),
2940 	       tmp_string, opt_ptr[j].opt_descr_s);
2941 	/* if ((++opt_line_cnt % 2)==0) printf("\n"); */
2942 	printf("\n");
2943       }
2944     }
2945   next_option: continue;
2946   }
2947   if ((opt_line_cnt % 2) != 0) printf("\n");
2948   exit(0);
2949 }
2950 
2951 /* sorts a list of options, with upper and lower case characters
2952    sorted together */
sort_opt_list(char * v,int n)2953 void sort_opt_list(char *v, int n) {
2954   int gap, i, j, k;
2955   int incs[7] = { 336, 112, 48, 21, 7, 3, 1 };
2956   char tmp_c, tmp_u;
2957   int v_start;
2958 
2959   /* first shell sort the list using toupper() */
2960   for ( k = 0; k < 7; k++) {
2961     gap = incs[k];
2962     for (i = gap; i < n; i++) {
2963       tmp_c = v[i];
2964       tmp_u = toupper(tmp_c);
2965       j = i;
2966       while (j >= gap && toupper(v[j - gap]) > tmp_u) {
2967 	v[j] = v[j - gap];
2968 	j -= gap;
2969       }
2970       v[j] = tmp_c;
2971     }
2972   }
2973   /* then sort the toupper(==) pairs lower-case, upper-case */
2974 
2975   for (i=1; i<n; i++) {
2976     if (toupper(v[i])==toupper(v[i-1])) {
2977       if (v[i] > v[i-1]) { tmp_c = v[i]; v[i] = v[i-1]; v[i-1]=tmp_c;}
2978     }
2979   }
2980 }
2981 
2982 char *
sort_options(struct opt_def_str * g_options,struct opt_def_str * f_options)2983 sort_options (struct opt_def_str *g_options, struct opt_def_str *f_options) {
2984   struct opt_def_str *this_option;
2985   char *sorted_list, *sort_ptr;
2986   int i, opt_count;
2987 
2988   opt_count=0;
2989   this_option = g_options;
2990   while ((this_option++)->opt_char!='\0') { opt_count++;}
2991   this_option = f_options;
2992   while ((this_option++)->opt_char!='\0') { opt_count++;}
2993 
2994   if ((sorted_list = (char *)calloc(opt_count+1, sizeof(char)))==NULL) {
2995     fprintf(stderr," cannot allocate sorted_list[%d]\n",opt_count+1);
2996     exit(1);
2997   }
2998 
2999   sort_ptr = sorted_list;
3000   this_option = g_options;
3001   while (this_option->opt_char!='\0') {
3002     *sort_ptr++ = this_option->opt_char;
3003     this_option++;
3004   }
3005 
3006   this_option = f_options;
3007   while (this_option->opt_char!='\0') {
3008     *sort_ptr++ = this_option->opt_char;
3009     this_option++;
3010   }
3011 
3012   sort_opt_list(sorted_list, opt_count);
3013 
3014   return sorted_list;
3015 }
3016 
3017 void
show_all_help(char * pgm_name,int pgm_id)3018 show_all_help(char *pgm_name, int pgm_id) {
3019   int i, j;
3020   struct opt_def_str *opt_ptr;
3021   char tmp_string[MAX_STR];
3022   char *descr_ptr;
3023   char *sorted_list;
3024 
3025   sorted_list = sort_options(g_options, f_options);
3026 
3027   printf("USAGE\n");
3028   printf(" %s [-options] query_file library_file",pgm_name);
3029   if (pgm_def_arr[pgm_id].ktup > 0) {
3030     printf(" [ktup]\n");
3031   }
3032   else {printf("\n");}
3033 
3034   printf(" \"@\" query_file uses stdin; query_file:begin-end sets subset range\n");
3035   printf(" library file formats: 0:FASTA; 1:GenBankFF; 3:EMBL_FF; 7:FASTQ; 10:subset; 12:NCBI blastdbcmd;\n");
3036   printf("   alternate library formats: \"library_file 7\" for 7:FASTQ\n");
3037 
3038   printf("\nDESCRIPTION\n");
3039   printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
3040   printf("\n");
3041   printf("OPTIONS (options must preceed query_file library_file)\n");
3042 
3043   for (i=0; i<strlen(sorted_list); i++) {
3044     opt_ptr = g_options;
3045     for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
3046       if (sorted_list[i]==opt_ptr[j].opt_char) {
3047 	descr_ptr = (opt_ptr[j].opt_descr_l) ? opt_ptr[j].opt_descr_l : opt_ptr[j].opt_descr_s;
3048 	format_params(&opt_ptr[j], tmp_string);
3049 	printf(" -%c%c %s %s\n",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),tmp_string, descr_ptr);
3050 	goto next_option;
3051       }
3052     }
3053 
3054     opt_ptr = f_options;
3055     for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
3056       if (sorted_list[i]==opt_ptr[j].opt_char) {
3057 	descr_ptr = (opt_ptr[j].opt_descr_l) ? opt_ptr[j].opt_descr_l : opt_ptr[j].opt_descr_s;
3058 	format_params(&opt_ptr[j], tmp_string);
3059 	printf(" -%c%c %s %s\n",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '), tmp_string, descr_ptr);
3060       }
3061     }
3062   next_option: continue;
3063   }
3064 
3065   free(sorted_list);
3066 
3067   exit(0);
3068 }
3069