1 /* initfa.c */
2 /* $Id: initfa.c 1274 2014-08-07 18:30:56Z wrp $ */
3
4 /* copyright (c) 1996, 1997, 1998, 2014 by William R. Pearson and The
5 Rector & Visitors of the University of Virginia */
6
7 /* Licensed under the Apache License, Version 2.0 (the "License");
8 you may not use this file except in compliance with the License.
9 You may obtain a copy of the License at
10
11 http://www.apache.org/licenses/LICENSE-2.0
12
13 Unless required by applicable law or agreed to in writing,
14 software distributed under this License is distributed on an "AS
15 IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
16 express or implied. See the License for the specific language
17 governing permissions and limitations under the License.
18 */
19
20 /* init??.c files provide function specific initializations */
21
22 /* h_init() - called from comp_lib.c, comp_thr.c to initialize pstruct ppst
23 which includes the alphabet, and pam matrix
24
25 alloc_pam() - allocate pam matrix space
26 init_pam2() - convert from 1D to 2D pam
27
28 init_pamx() - extend pam2 for 'X', 'N', or lower case characters
29
30 f_initenv() - set up mngmsg and pstruct defaults
31 f_getopt() - read fasta specific command line options
32 f_getarg() - read ktup
33
34 resetp() - reset the parameters, scoring matrix for DNA-DNA/DNA-prot
35
36 query_parm() - ask for ktup
37 last_init() - some things must be done last
38
39 f_initpam() - set some parameters based on the pam matrix
40 */
41
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <ctype.h>
45 #include <string.h>
46 #include <math.h>
47
48 #ifdef UNIX
49 #include <sys/types.h>
50 #include <sys/stat.h>
51 #endif
52
53 #include "defs.h"
54 #include "structs.h"
55 #include "param.h"
56 #include "best_stats.h"
57
58 #define XTERNAL
59 #include "upam.h"
60 #include "uascii.h"
61 #undef XTERNAL
62
63 #define MAXWINDOW 32
64
65 int initpam(char *, struct pstruct *);
66 void init_pam2 (struct pstruct *ppst);
67 void init_altpam(struct pstruct *ppst);
68 void init_pamx (struct pstruct *ppst);
69 void extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst);
70 void build_xascii(int *qascii, char *save_str);
71 void add_ascii_ann(int *qascii, unsigned char *ann_arr);
72 void re_ascii(int *qascii, int *pascii, int max_ann_arr);
73 extern int my_nrand(int, void *);
74
75 /* at some point, all the defaults should be driven from this table */
76 /*
77 #pgm q_seq l_seq p_seq matrix g_open g_ext fr_shft e_cut ktup E_band_opt
78 # -n/-p -s -e -f -h/-j -E argv[3]
79 fasta prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 2 0.02
80 fasta dna(1) dna(1) dna(1) +5/-4 -12 -4 - 2.0 6 0.01
81 ssearch prot(0) prot(0) prot(0) bl50 -10 -2 - 10.0 - -
82 ssearch dna(1) dna(1) dna(1) +5/-4 -16 -4 - 2.0 - -
83 fastx dna(1) prot(0) prot(0) BL50 -12 -2 -20 5.0 2 0.02
84 fasty dna(1) prot(0) prot(0) BL50 -12 -2 -20/-24 5.0 2 0.02
85 tfastx dna(1) prot(0) prot(0) BL50 -14 -2 -20 5.0 2 0.01
86 tfasty dna(1) prot(0) prot(0) BL50 -14 -2 -20/-24 5.0 2 0.01
87 fasts prot(0) prot(0) prot(0) MD20-MS - - - 5.0 - -
88 fasts dna(1) dna(1) dna(1) +2/-4 - - - 5.0 1 -
89 tfasts prot(0) dna(1) prot(0) MD10-MS - - - 2.0 1 -
90 fastf prot(0) prot(0) prot(0) MD20 - - - 2.0 1 -
91 tfastf prot(0) dna(1) prot(0) MD10 - - - 1.0 1 -
92 fastm prot(0) prot(0) prot(0) MD20 - - - 5.0 1 -
93 fastm dna(1) dna(1) dna(1) +2/-4 - - - 2.0 1 -
94 tfastm prot(0) dna(1) prot(0) MD10 - - - 2.0 1 -
95 lalign prot(0) prot(0) prot(0) BL50 -12 -2 - 1.0 - -
96 lalign dna(1) dna(1) dna(1) +5/-4 -12 -4 - 0.1 - -
97 */
98
99 void show_help(char *, int );
100
101 char *ref_str_a[]={
102 /* 0 */ "W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448\n",
103 /* 1 */ "T. F. Smith and M. S. Waterman, (1981) J. Mol. Biol. 147:195-197; \n W.R. Pearson (1991) Genomics 11:635-650\n",
104 /* 2 */ "Pearson et al, Genomics (1997) 46:24-36\n",
105 /* 3 */ "Mackey et al. Mol. Cell. Proteomics (2002) 1:139-147\n",
106 /* 4 */ "W.R. Pearson (1996) Meth. Enzymol. 266:227-258\n",
107 /* 5 */ "X. Huang and W. Miller (1991) Adv. Appl. Math. 12:373-381\n",
108 ""
109 };
110
111 #define FA_PID 1
112 #define SS_PID 2
113 #define FX_PID 3
114 #define FY_PID 4
115 #define FS_PID 5
116 #define FF_PID 6
117 #define FM_PID 7
118 #define RSS_PID 8
119 #define RFX_PID 9
120 #define SSS_PID 10 /* old (slow) non-PG Smith-Waterman */
121 #define TFA_PID FA_PID+10
122 #define TFX_PID FX_PID+10
123 #define TFY_PID FY_PID+10
124 #define TFS_PID FS_PID+10
125 #define TFF_PID FF_PID+10
126 #define TFM_PID FM_PID+10
127 #define LAL_PID 18
128 #define LNW_PID 19
129 #define GNW_PID 20
130
131 struct pgm_def_str {
132 int pgm_id;
133 char *prog_func;
134 char *info_pgm_abbr;
135 char *iprompt0;
136 char *ref_str;
137 int PgmDID;
138 char *smstr;
139 int g_open_mod;
140 int g_ext_mod;
141 int gshift;
142 int hshift;
143 double e_cut;
144 int ktup;
145 double E_band_opt;
146 int can_pre_align;
147 };
148
149 static struct pgm_def_str
150 pgm_def_arr[21] = {
151 {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0 }, /* 0 */
152 {FA_PID, "FASTA", "fa",
153 "FASTA searches a protein or DNA sequence data bank",
154 NULL, 401, "BL50", 0, 0, 0, 0, 10.0, 2, 0.2, 1}, /* 1 - FASTA */
155 {SS_PID, "SSEARCH","gsw","SSEARCH performs a Smith-Waterman search",
156 NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 2 - SSEARCH */
157 {FX_PID, "FASTX","fx",
158 "FASTX compares a DNA sequence to a protein sequence data bank",
159 NULL, 405, "BL50", -2, 0, -20, 0, 5.0, 2, 0.10, 1}, /* 3 - FASTX */
160 {FY_PID, "FASTY", "fy",
161 "FASTY compares a DNA sequence to a protein sequence data bank",
162 NULL, 405, "BL50", -2, 0, -20, -24, 5.0, 2, 0.10, 1}, /* 4 - FASTY */
163 {FS_PID, "FASTS", "fs",
164 "FASTS compares linked peptides to a protein data bank",
165 NULL, 400, "MD20-MS", 0, 0, 0, 0, 5.0, 1, 0.0, 0}, /* 5 - FASTS */
166 {FF_PID, "FASTF", "ff",
167 "FASTF compares mixed peptides to a protein databank",
168 NULL, 400, "MD20", 0, 0, 0, 0, 2.0, 1, 0.0, 0 }, /* 6 - FASTF */
169 {FM_PID, "FASTM", "fm",
170 "FASTM compares ordered peptides to a protein data bank",
171 NULL, 400, "MD20", 0, 0, 0, 0, 5.0, 1, 0.0, 0 }, /* 7 - FASTM */
172 {RSS_PID, "PRSS", "rss",
173 "PRSS evaluates statistical signficance using Smith-Waterman",
174 NULL, 401, "BL50", 0, 0, 0, 0, 1000.0, 0, 0.0, 1 }, /* 8 - PRSS */
175 {RFX_PID,"PRFX", "rfx",
176 "PRFX evaluates statistical signficance using FASTX",
177 NULL, 401, "BL50", -2, 0, -20, -24, 1000.0, 2, 0.2, 1 }, /* 9 - PRFX */
178 {SSS_PID, "OSEARCH","ssw","OSEARCH searches a sequence data bank",
179 NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 2 - OSEARCH */
180 {TFA_PID, "TFASTA", "tfa",
181 "TFASTA compares a protein to a translated DNA data bank",
182 NULL, 402, "BL50", -2, 0, 0, 0, 5.0, 2, 0.1, 1},
183 {0, "", "", "", NULL, 400, "", 0, 0, 0, 0, 1.0, 0, 0.0 }, /* 0 */
184 {TFX_PID, "TFASTX", "tfx",
185 "TFASTX compares a protein to a translated DNA data bank",
186 NULL, 406, "BL50", -2, 0, -20, 0, 2.0, 2, 0.10, 1},
187 {TFY_PID, "TFASTY", "tfy",
188 "TFASTY compares a protein to a translated DNA data bank",
189 NULL, 406, "BL50", -2, 0, -20, -24, 2.0, 2, 0.10, 1},
190 {TFS_PID, "TFASTS", "tfs",
191 "TFASTS compares linked peptides to a translated DNA data bank",
192 NULL, 400, "MD10-MS", 0, 0, 0, 0, 2.0, 2, 0.0, 0 },
193 {TFF_PID, "TFASTF", "tff",
194 "TFASTF compares mixed peptides to a protein databank",
195 NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 },
196 {TFM_PID, "TFASTM", "tfm",
197 "TFASTM compares ordered peptides to a translated DNA databank",
198 NULL, 400, "MD10", 0, 0, 0, 0, 1.0, 1, 0.0, 0 },
199 {LAL_PID, "LALIGN", "lal",
200 "LALIGN finds non-overlapping local alignments",
201 NULL, 404, "BL50", -2, 0, 0, 0, 1.0, 0, 0.0, 1}, /* 18 - LALIGN */
202 {LNW_PID, "GLSEARCH", "lnw",
203 "GLSEARCH performs a global-query/local-library search",
204 NULL, 404, "BL50", -2, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 19 - GLSEARCH */
205 {GNW_PID, "GGSEARCH", "gnw",
206 "GGSEARCH performs a global/global database searches",
207 NULL, 404, "BL50", 0, 0, 0, 0, 10.0, 0, 0.0, 1}, /* 20 - GGSEARCH */
208 };
209
210 struct msg_def_str {
211 int pgm_id;
212 int q_seqt;
213 int l_seqt;
214 int p_seqt;
215 int sw_flag;
216 int stages;
217 int qframe;
218 int nframe;
219 int nrelv, srelv, arelv;
220 char *f_id0, *f_id1, *label, *alabel;
221 };
222
223 /* align_label must be < MAX_SSTR (32) */
224 char *align_label[]={
225 "Smith-Waterman", /* 0 */
226 "banded Smith-Waterman", /* 1 */
227 "Waterman-Eggert", /* 2 */
228 "trans. Smith-Waterman", /* 3 */
229 "global/local", /* 4 */
230 "trans. global/local", /* 5 */
231 "global/global (N-W)" /* 6 */
232 };
233
234 /* pgm_id q_seqt l_seqt p_seqt sw_f st qf nf nrv srv arv s_ix */
235 static struct msg_def_str
236 msg_def_arr[21] = {
237 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=0 */
238 {FA_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 1, 3,
239 "fa","sw", "opt"},
240 {SS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
241 "sw","sw", "s-w"},
242 {FX_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
243 "fx","sx", "opt"},
244 {FY_PID, SEQT_DNA, SEQT_PROT, SEQT_PROT, 1, 1, 2, -1, 3, 1, 3,
245 "fy","sy", "opt"},
246 {FS_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
247 "fs","fs", "initn init1"},
248 {FF_PID, SEQT_PROT,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
249 "ff","ff", "initn init1"},
250 {FM_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 3, 2, 3,
251 "fm","fm","initn init1"},
252 {RSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 0, 1, 1, -1, 1, 1, 1,
253 "rss","sw","s-w"},
254 {RFX_PID, SEQT_DNA,SEQT_PROT, SEQT_PROT, 0, 1, 2, -1, 3, 1, 3,
255 "rfx","sx","opt"},
256 {SSS_PID, SEQT_UNK,SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
257 "sw","sw", "s-w"},
258 {TFA_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 0, 1, 1, 6, 3, 1, 3,
259 "tfa","fa","initn init1"},
260 {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", "", ""}, /* ID=12 */
261 {TFX_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
262 "tfx","sx","initn opt"},
263 {TFY_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 2, 3, 2, 3,
264 "tfy","sy","initn opt"},
265 {TFS_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
266 "tfs","fs","initn init1"},
267 {TFF_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
268 "tff","ff","initn init1"},
269 {TFM_PID, SEQT_PROT,SEQT_DNA, SEQT_PROT, 1, 1, 1, 6, 3, 2, 3,
270 "tfm","fm","initn init1"},
271 {LAL_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
272 "lsw","lsw", "ls-w"},
273 {LNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
274 "gnw","gnw", "n-w"},
275 {GNW_PID, SEQT_UNK, SEQT_PROT, SEQT_PROT, 1, 1, 1, -1, 1, 1, 1,
276 "gnw","gnw", "n-w"},
277 };
278
279 int
get_pgm_id()280 get_pgm_id() {
281
282 int rval=0;
283
284 #ifdef FASTA
285 #ifndef TFAST
286 pgm_def_arr[FA_PID].ref_str = ref_str_a[0];
287 msg_def_arr[FA_PID].alabel = align_label[0];
288 rval=FA_PID;
289 #else
290 pgm_def_arr[TFA_PID].ref_str = ref_str_a[0];
291 msg_def_arr[TFA_PID].alabel = align_label[2];
292 rval=TFA_PID;
293 #endif
294 #endif
295
296 #ifdef FASTX
297 #ifndef TFAST
298 #ifndef PRSS
299 pgm_def_arr[FX_PID].ref_str = ref_str_a[2];
300 msg_def_arr[FX_PID].alabel = align_label[3];
301 rval=FX_PID;
302 #else
303 pgm_def_arr[RFX_PID].ref_str = ref_str_a[2];
304 msg_def_arr[FX_PID].alabel = align_label[3];
305 rval=RFX_PID;
306 #endif
307 #else
308 pgm_def_arr[TFX_PID].ref_str = ref_str_a[2];
309 msg_def_arr[TFX_PID].alabel = align_label[3];
310 rval=TFX_PID;
311 #endif
312 #endif
313
314 #ifdef FASTY
315 #ifndef TFAST
316 pgm_def_arr[FY_PID].ref_str = ref_str_a[2];
317 msg_def_arr[FY_PID].alabel = align_label[3];
318 rval=FY_PID;
319 #else
320 pgm_def_arr[TFY_PID].ref_str = ref_str_a[2];
321 msg_def_arr[TFY_PID].alabel = align_label[3];
322 rval=TFY_PID;
323 #endif
324 #endif
325
326 #ifdef FASTS
327 #ifndef TFAST
328 pgm_def_arr[FS_PID].ref_str = ref_str_a[3];
329 msg_def_arr[FS_PID].alabel = align_label[4];
330 rval=FS_PID;
331 #else
332 pgm_def_arr[TFS_PID].ref_str = ref_str_a[3];
333 msg_def_arr[TFS_PID].alabel = align_label[5];
334 rval=TFS_PID;
335 #endif
336 #endif
337
338 #ifdef FASTF
339 #ifndef TFAST
340 pgm_def_arr[FF_PID].ref_str = ref_str_a[3];
341 msg_def_arr[FF_PID].alabel = align_label[4];
342 rval=FF_PID;
343 #else
344 pgm_def_arr[TFF_PID].ref_str = ref_str_a[3];
345 msg_def_arr[TFF_PID].alabel = align_label[5];
346 rval=TFF_PID;
347 #endif
348 #endif
349
350 #ifdef FASTM
351 #ifndef TFAST
352 pgm_def_arr[FM_PID].ref_str = ref_str_a[3];
353 msg_def_arr[FM_PID].alabel = align_label[4];
354 rval=FM_PID;
355 #else
356 pgm_def_arr[TFM_PID].ref_str = ref_str_a[3];
357 msg_def_arr[TFM_PID].alabel = align_label[5];
358 rval=TFM_PID;
359 #endif
360 #endif
361
362 #ifdef SSEARCH
363 #define CAN_PSSM
364 pgm_def_arr[SS_PID].ref_str = ref_str_a[1];
365 msg_def_arr[SS_PID].alabel = align_label[0];
366 rval=SS_PID;
367 #endif
368
369 #ifdef OSEARCH
370 pgm_def_arr[SSS_PID].ref_str = ref_str_a[1];
371 msg_def_arr[SSS_PID].alabel = align_label[0];
372 rval=SSS_PID;
373 #endif
374
375 #ifdef LALIGN
376 #define CAN_PSSM
377 pgm_def_arr[LAL_PID].ref_str = ref_str_a[5];
378 msg_def_arr[LAL_PID].alabel = align_label[2];
379 rval=LAL_PID;
380 #endif
381
382 #ifdef GLSEARCH
383 #define CAN_PSSM
384 pgm_def_arr[LNW_PID].ref_str = ref_str_a[6];
385 msg_def_arr[LNW_PID].alabel = align_label[4];
386 rval=LNW_PID;
387 #endif
388
389 #ifdef GGSEARCH
390 #define CAN_PSSM
391 pgm_def_arr[GNW_PID].ref_str = ref_str_a[6];
392 msg_def_arr[GNW_PID].alabel = align_label[6];
393 rval=GNW_PID;
394 #endif
395
396 return rval;
397 }
398
399 extern struct opt_def_str g_options[];
400 extern void set_opt_disp_defs(char opt_char, struct opt_def_str *options, int type,
401 int i_param1, int i_param2,
402 double d_param1, double d_param2, char *s_param);
403
404 static char z_opt_descr[] = "Statistics estimation method:\n 1 - regression; -1 - no stats.; 0 - no scaling; 2 - Maximum Likelihood Est.;\n 3 - Altschul/Gish; 4 - iter. regress.; 5 - regress w/variance;\n 6 - MLE with comp. adj.;\n 11 - 16 - estimates from shuffled library sequences;\n 21 - 26 - E2()-stats from shuffled high-scoring sequences;";
405
406 static char s_opt_descr[] = "Scoring matrix: (protein)\n BL50, BP62 (sets -f -11 -g -1); P250, OPT5, VT200,\n VT160, P120, VT120, BL80, VT80, MD40, VT40, MD20, VT20, MD10, VT10;\n scoring matrix file name; -s ?BL50 adjusts matrix for short queries;";
407
408
409 struct opt_def_str f_options[] = {
410 {'3', 0, "norevcomp", "compare forward strand only", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
411 #if defined(FASTA) || defined(SSEARCH)
412 {'a', 0, "show_all", "show complete Query/Sbjct sequences in alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
413 {'W', 1, "context", "alignment context length (surrounding unaligned sequence)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
414 #endif
415 #if defined(FASTA)
416 {'A', 0, "sw_align", "Smith-Waterman for final DNA alignment, band alignment for protein\n default is band-alignment for DNA, Smith-Waterman for protein", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
417 #endif
418 {'b', 1, "num_descriptions", "high scores reported (limited by -E by default)",
419 "high scores reported (limited by -E by default);\n =<int> forces <int> results;", 0, 0, 0, 0, 0.0, 0.0, NULL},
420 {'d', 1, "num_alignments", "number of alignments shown (limited by -E by default)", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
421 #if defined(FASTA) || defined(FASTX) || defined(FASTY)
422 {'c', 1, "opt_join", "expected fraction for band-optimization, joining", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
423 #endif
424 {'E', 1, "evalue", "E()-value threshold", "E()-value,E()-repeat threshold", 0, 0, 0, 0, 0.0, 0.0, NULL},
425 {'f', 1, "gapopen", "gap-open penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
426 {'g', 1, "gapext", "gap-extension penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
427 #ifdef SHOW_HELP
428 {'h', 0, "help", "help - show options, arguments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
429 #ifdef FASTY
430 {'j', 1, "frame_subs", "frame-shift, codon substitution penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
431 #else
432 #ifdef FASTX
433 {'j', 1, "frame_shift", "frame-shift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
434 #endif
435 #endif
436 #else
437 #ifndef LALIGN
438 {'h', 1, "frame", "frameshift penalty", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
439 {'j', 1, "codon_subs", "codon substitution", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
440 #endif
441 #endif
442 #if defined(LALIGN)
443 {'J', 0, "show_ident", "show identity alignment", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
444 {'K', 1, "max_repeat", "maximum number of non-intersecting alignments", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
445 #endif
446 {'k', 1, "nshuffle", "number of shuffles", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
447 {'M', 1, "range", "filter on library sequence length", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
448 {'n', 0, "dna", "DNA/RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
449 {'p', 0, "prot", "protein query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
450 #if defined(CAN_PSSM)
451 {'P', 1, "pssm", "PSSM file", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
452 #endif
453 {'r', 1, "dna_ratio", " +match/-mismatch for DNA/RNA", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
454 {'s', 1, "matrix", "scoring matrix", &s_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
455 #if !defined(LALIGN) && !defined(FASTS) && !defined(FASTM)
456 {'S', 0, "seg", "filter lowercase (seg) residues", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
457 #endif
458 #if defined(TFAST) || defined(FASTX) || defined(FASTY)
459 {'t', 1, "gencode", "translation genetic code", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
460 #endif
461 {'U', 0, "rna", "RNA query", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
462 {'X', 1, "ext_opts", "Extended options", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL},
463 {'z', 1, "stats", "Statistics estimation method", &z_opt_descr[0], 0, 0, 0, 0, 0.0, 0.0, NULL},
464 {'\0', 0, "", "", NULL, 0, 0, 0, 0, 0.0, 0.0, NULL}
465 };
466
f_init_opts(int pgm_id,struct mngmsg * m_msp,struct pstruct * ppst)467 void f_init_opts(int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
468 #if defined(FASTA) || defined(FASTX) || defined(FASTY)
469 set_opt_disp_defs('c', f_options, 4, 0, 0, ppst->param_u.fa.E_band_opt, max(ppst->param_u.fa.E_band_opt*5.0,1.0), NULL);
470 #endif
471 #ifndef LALIGN
472 set_opt_disp_defs('E', f_options, 4, 0, 0, ppst->e_cut, ppst->e_cut_r, NULL);
473 #else
474 set_opt_disp_defs('E', f_options, 3, 0, 0, ppst->e_cut, 0.0, NULL);
475 #endif
476 set_opt_disp_defs('s', f_options, 5, 0, 0, 0.0, 0.0, ppst->pamfile);
477 set_opt_disp_defs('f', f_options, 1, ppst->gdelval, 0, 0.0, 0.0, NULL);
478 set_opt_disp_defs('g', f_options, 1, ppst->ggapval, 0, 0.0, 0.0, NULL);
479 #ifdef FASTY
480 set_opt_disp_defs('j', f_options, 2, ppst->gshift, ppst->gsubs, 0.0, 0.0, NULL);
481 #endif
482 #ifdef FASTX
483 set_opt_disp_defs('j', f_options, 1, ppst->gshift, 0, 0.0, 0.0, NULL);
484 #endif
485 #ifdef LALIGN
486 set_opt_disp_defs('K', f_options, 1, ppst->max_repeat, 0, 0.0, 0.0, NULL);
487 #endif
488 set_opt_disp_defs('k', f_options, 1, m_msp->shuff_max, 0, 0.0, 0.0, NULL);
489 set_opt_disp_defs('r', f_options, 2, ppst->p_d_mat, ppst->p_d_mis, 0.0, 0.0, NULL);
490 set_opt_disp_defs('t', f_options, 1, ppst->tr_type, 0, 0.0, 0.0, NULL);
491 set_opt_disp_defs('z', f_options, 1, ppst->zsflag, 0, 0.0, 0.0, NULL);
492 #if defined(FASTA) || defined(SSEARCH)
493 set_opt_disp_defs('W', f_options, 1, m_msp->aln.llcntx, 0, 0.0, 0.0, NULL);
494 #endif
495 }
496
497 char *iprompt1=" test sequence file name: ";
498 char *iprompt2=" database file name: ";
499
500 #ifdef PCOMPLIB
501 char *verstr="36.3.8 Jul, 2015 MPI";
502 #else
503 char *verstr="36.3.8 Jul, 2015";
504 #endif
505
506 static int mktup=3;
507 static int ktup_set = 0;
508 static int gap_set=0;
509 static int del_set=0;
510 static int mshuff_set = 0;
511 static int prot2dna = 0;
512 void
513 parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst);
514
515 extern int fa_max_workers;
516
517 extern void s_abort(char *, char *);
518 extern void init_ascii0(int *sascii, char *sq_map, int sq_map_n, struct pstruct *ppst);
519 extern void init_ascii(int ext_sq, int *sascii, int nsq, int dnaseq);
520 extern void validate_novel_aa(int *sascii, int p_nsq, int dnaseq);
521 extern int standard_pam(char *smstr, struct pstruct *ppst,
522 int del_set, int gap_set);
523 extern int min_pam_bits(int n0_eff, double bit_thresh, struct pstruct *ppst,
524 int del_set, int gap_set);
525 extern void mk_n_pam(int *arr,int siz, int mat, int mis);
526 extern int karlin(int , int, double *, double *, double *);
527 extern void init_karlin_a(struct pstruct *, double *, double **);
528 extern int do_karlin_a(int **, const struct pstruct *, double *,
529 double *, double *, double *, double *);
530
531 #if defined(TFAST) || defined(FASTX) || defined(FASTY)
532 extern void aainit(int tr_type, int debug);
533 #endif
534
535 char *iprompt0, *prog_func, *refstr;
536
537 /* Sets defaults assuming a protein sequence */
h_init(struct pstruct * ppst,struct mngmsg * m_msp,char * info_pgm_abbr)538 void h_init (struct pstruct *ppst, struct mngmsg *m_msp, char *info_pgm_abbr)
539 {
540 struct pgm_def_str pgm_def;
541 int i, pgm_id;
542
543 ppst->pgm_id = pgm_id = get_pgm_id();
544 pgm_def = pgm_def_arr[pgm_id];
545
546 /* check that pgm_def_arr[] is valid */
547 if (pgm_def.pgm_id != pgm_id) {
548 fprintf(stderr,
549 "**pgm_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
550 pgm_def.pgm_id, pgm_id);
551 exit(1);
552 }
553
554 /* check that msg_def_arr[] is valid */
555 if (msg_def_arr[pgm_id].pgm_id != pgm_id) {
556 fprintf(stderr,
557 "**msg_def integrity failure: def.pgm_id %d != pgm_id %d**\n",
558 msg_def_arr[pgm_id].pgm_id, pgm_id);
559 exit(1);
560 }
561
562 SAFE_STRNCPY(info_pgm_abbr,pgm_def.info_pgm_abbr,MAX_SSTR);
563 iprompt0 = pgm_def.iprompt0;
564 refstr = pgm_def.ref_str;
565 prog_func = pgm_def.prog_func;
566
567 /* used to be MAXTOT = MAXTST+MAXLIB, but now fixed at MAXLIB for
568 pre-loaded libraries */
569 m_msp->max_tot = MAXLIB;
570
571 init_ascii0(aascii, NCBIstdaa, NCBIstdaa_n, ppst);
572 pascii = aascii;
573
574 /* set up DNA query sequence if required*/
575 if (msg_def_arr[pgm_id].q_seqt == SEQT_DNA) {
576 memcpy(qascii,nascii,sizeof(qascii));
577 m_msp->qdnaseq = SEQT_DNA;
578 }
579 else { /* when SEQT_UNK, start with protein */
580 memcpy(qascii,aascii,sizeof(qascii));
581 m_msp->qdnaseq = msg_def_arr[pgm_id].q_seqt;
582 }
583
584 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
585 qascii[','] = ESS;
586 /* also initialize aascii, nascii for databases */
587 qascii['*'] = NA;
588 ppst->pam_ms = 1;
589 ppst->do_rep=0; /* disable multiple alignments */
590 ppst->pseudocts = 200;
591 #else
592 ppst->pam_ms = 0;
593 ppst->do_rep=1; /* enable multiple alignments */
594 #endif
595
596 /* initialize a pam matrix */
597 SAFE_STRNCPY(ppst->pamfile,pgm_def.smstr,MAX_FN);
598 standard_pam(ppst->pamfile,ppst,del_set,gap_set);
599 ppst->have_pam2 = 0;
600
601 /* specify pre-alignment */
602 ppst->can_pre_align = pgm_def.can_pre_align;
603
604 ppst->p_d_mat = 5;
605 ppst->p_d_mis = -4;
606
607 /* this is always protein by default */
608 ppst->nsq = NCBIstdaa_n;
609 ppst->nsqx = NCBIstdaa_ext_n;
610 /* we need to populate ppst->sq to nsqx for direct lc mapping */
611 for (i=0; i<ppst->nsqx; i++) {
612 ppst->sq[i] = NCBIstdaa_l[i];
613 ppst->hsq[i] = h_NCBIstdaa[i];
614 }
615 for (i=0; i<ppst->nsqx; i++) {
616 ppst->sqx[i]=NCBIstdaa_ext[i];
617 ppst->hsqx[i]=h_NCBIstdaa_ext[i];
618 }
619 ppst->sq[ppst->nsq] = ppst->sqx[ppst->nsqx] = '\0';
620
621 /* set up the c_nt[] mapping */
622
623 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
624 ppst->c_nt[ESS] = ESS;
625 #endif
626 ppst->c_nt[0]=0;
627 for (i=1; i<nnt; i++) {
628 ppst->c_nt[i]=gc_nt[i];
629 ppst->c_nt[i+nnt]=gc_nt[i]+nnt;
630 }
631
632 #ifdef CAN_PSSM
633 ppst->pam2p[0] = NULL;
634 ppst->pam2p[1] = NULL;
635 #endif
636 }
637
638 /*
639 * alloc_pam(): allocates memory for the 2D pam matrix as well
640 * as for the integer array used to transmit the pam matrix
641 */
642 void
alloc_pam(int d1,int d2,struct pstruct * ppst)643 alloc_pam (int d1, int d2, struct pstruct *ppst)
644 {
645 int i, *d2p;
646 char err_str[128];
647
648 if ((ppst->pam2[0] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
649 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
650 s_abort (err_str,"");
651 }
652
653 if ((ppst->pam2[1] = (int **) malloc (d1 * sizeof (int *))) == NULL) {
654 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
655 s_abort (err_str,"");
656 }
657
658 if ((d2p = (int *) calloc (d1 * d2, sizeof (int))) == NULL) {
659 sprintf(err_str,"Cannot allocate 2D pam matrix: %d",d1);
660 s_abort (err_str,"");
661 }
662
663 for (i = 0; i < d1; i++, d2p += d2)
664 ppst->pam2[0][i] = d2p;
665
666 if ((d2p= (int *) malloc (d1 * d2 * sizeof (int))) == NULL) {
667 sprintf(err_str,"Cannot allocate 2d pam matrix: %d",d2);
668 s_abort (err_str,"");
669 }
670
671 for (i = 0; i < d1; i++, d2p += d2)
672 ppst->pam2[1][i] = d2p;
673
674 ppst->have_pam2 = 1;
675 }
676
677 /*
678 * init_pam2(struct pstruct pst): Converts 1-D pam matrix to 2-D
679 * currently, this function is very protein centric
680 */
681 void
init_pam2(struct pstruct * ppst)682 init_pam2 (struct pstruct *ppst) {
683 int i, j, k, nsq, sa_t;
684 int ix_j, ix_l, ix_i, p_i, p_j;
685
686 nsq = ppst->nsq;
687
688 ppst->pam2[0][0][0] = -BIGNUM;
689 ppst->pam_h = -1; ppst->pam_l = 1;
690
691 k = 0;
692
693 if (ppst->dnaseq == 0) { /* not DNA */
694 sa_t = aascii['*']; /* this is the last character for which pam[] is available */
695 pam_sq = apam_sq;
696 pam_sq_n = apam_sq_n;
697 }
698 else { /* have DNA, no '*' */
699 sa_t = nascii['X'];
700 pam_sq = npam_sq;
701 pam_sq_n = npam_sq_n;
702 }
703
704 /* we use sa_t here because that is the last position in the 1-D
705 matrix */
706 for (i = 1; i < sa_t; i++) {
707 p_i = pascii[pam_sq[i]];
708 ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
709 for (j = 1; j <= i; j++) {
710 /* here is where the pam file is actually set */
711 p_j = pascii[pam_sq[j]];
712 ppst->pam2[0][p_j][p_i] = ppst->pam2[0][p_i][p_j] = pam[k++] - ppst->pamoff;
713 if (ppst->pam_l > ppst->pam2[0][p_i][p_j]) ppst->pam_l = ppst->pam2[0][p_i][p_j];
714 if (ppst->pam_h < ppst->pam2[0][p_i][p_j]) ppst->pam_h = ppst->pam2[0][p_i][p_j];
715 }
716 }
717
718 /* need to do the same thing for characters > sa_t */
719 for (i = sa_t+1; i < pam_sq_n; i++) {
720 p_i = pascii[pam_sq[i]];
721 ppst->pam2[0][0][p_i] = ppst->pam2[0][p_i][0] = -BIGNUM;
722 }
723
724 if (ppst->dnaseq == 0) {
725 init_altpam(ppst);
726 }
727 }
728
729 void
init_altpam(struct pstruct * ppst)730 init_altpam(struct pstruct *ppst) {
731 int ix_i, ix_l, ix_j, p_i, p_j, i;
732
733 /* add values for 'J' (I/L) value, which are not present in 1-D matrices */
734 ix_i = pascii['I'];
735 ix_l = pascii['L'];
736 ix_j = pascii['J'];
737 if (strchr(pam_sq,'J')==NULL) {
738 ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
739 /* get the identities */
740 ppst->pam2[0][ix_j][ix_j] =
741 max(ppst->pam2[0][ix_i][ix_i],ppst->pam2[0][ix_l][ix_l]);
742 for (i=1; i < pam_sq_n; i++) {
743 p_i = pascii[pam_sq[i]];
744 /* do not assume symmetric matrices */
745 ppst->pam2[0][ix_j][p_i] =
746 max(ppst->pam2[0][ix_i][p_i],ppst->pam2[0][ix_l][p_i]);
747 ppst->pam2[0][p_i][ix_j] =
748 max(ppst->pam2[0][p_i][ix_i],ppst->pam2[0][p_i][ix_l]);
749 }
750 }
751 /* add values for 'O' (K) value, which are not present in 1-D matrices */
752 ix_i = pascii['K'];
753 ix_j = pascii['O'];
754 if (ix_j < ppst->nsq) { /* is it in the NCBIstdaa alphabet ? */
755 ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
756 /* get the identity */
757 ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
758 /* do not assume symmetric matrices */
759 for (i=1; i < pam_sq_n; i++) {
760 p_i = pascii[pam_sq[i]];
761 ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
762 ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
763 }
764 }
765 else {
766 pascii['O'] = pascii['K'];
767 pascii['o'] = pascii['k'];
768 }
769
770 /* add values for 'U' (C) value, which are not present in 1-D matrices */
771 ix_i = pascii['C'];
772 ix_j = pascii['U'];
773 if (ix_j < ppst->nsq) { /* is it in the NCBIstdaa alphabet */
774 ppst->pam2[0][ix_j][0] = ppst->pam2[0][0][ix_j] = -BIGNUM;
775 /* get the identity */
776 ppst->pam2[0][ix_j][ix_j] = ppst->pam2[0][ix_i][ix_i];
777 /* do not assume symmetric matrices */
778 for (i=1; i < pam_sq_n; i++) {
779 p_i = pascii[pam_sq[i]];
780 ppst->pam2[0][ix_j][p_i] = ppst->pam2[0][ix_i][p_i];
781 ppst->pam2[0][p_i][ix_j] = ppst->pam2[0][p_i][ix_i];
782 }
783 }
784 else {
785 pascii['U'] = pascii['C'];
786 pascii['u'] = pascii['c'];
787 }
788 }
789
790 /* extend the standard pam matrix for special residues
791 (a) 'X' for protein and 'N' for DNA, 'G' and 'U' for RNA,
792 (b) lower-case characters for ext_sq_set
793
794 must be called after init_pam2()
795 */
796 void
init_pamx(struct pstruct * ppst)797 init_pamx (struct pstruct *ppst) {
798 int i, j, k, nsq;
799 int sa_x, sa_t, tmp;
800
801 nsq = ppst->nsq;
802
803 ppst->nt_align = (ppst->dnaseq== SEQT_DNA || ppst->dnaseq == SEQT_RNA);
804
805 /* sa_x is the 'unknown' character, 'X' for proteins, 'N' for DNA/RNA */
806 /* sa_t is the termination character -- not used for DNA */
807
808 if (ppst->nt_align) {
809 sa_x = pascii['N'];
810 sa_t = sa_x;
811 }
812 else {
813 sa_x = pascii['X'];
814 sa_t = pascii['*'];
815 }
816
817 /* build an asymmetric matrix for RNA */
818 if (ppst->dnaseq == SEQT_RNA && !ppst->pam_set) {
819 tmp = ppst->pam2[0][nascii['G']][nascii['G']] - 3;
820 ppst->pam2[0][nascii['A']][nascii['G']] =
821 ppst->pam2[0][nascii['C']][nascii['T']] =
822 ppst->pam2[0][nascii['C']][nascii['U']] = tmp;
823 }
824
825 if (ppst->pam_x_set) {
826 for (i=1; i<nsq; i++) {
827 if (sa_x < nsq) ppst->pam2[0][sa_x][i] = ppst->pam2[0][i][sa_x]=ppst->pam_xm;
828 if (sa_t < nsq) ppst->pam2[0][sa_t][i] = ppst->pam2[0][i][sa_t]=ppst->pam_xm;
829 }
830 if (sa_x < nsq) ppst->pam2[0][sa_x][sa_x]=ppst->pam_xx;
831 if (sa_t < nsq) ppst->pam2[0][sa_t][sa_t]=ppst->pam_xx;
832 }
833 else {
834 ppst->pam_xx = ppst->pam2[0][sa_x][sa_x];
835 ppst->pam_xm = ppst->pam2[0][1][sa_x];
836 }
837
838 /* fill in pam2[1] matrix */
839 ppst->pam2[1][0][0] = -BIGNUM;
840 /* fill in additional parts of the matrix */
841 for (i = 0; i < nsq; i++) {
842 /* -BIGNUM to all matches vs 0 */
843 ppst->pam2[0][0][i+nsq] = ppst->pam2[0][i+nsq][0] =
844 ppst->pam2[1][0][i+nsq] = ppst->pam2[1][i+nsq][0] =
845 ppst->pam2[0][0][i] = ppst->pam2[0][i][0] =
846 ppst->pam2[1][0][i] = ppst->pam2[1][i][0] = -BIGNUM;
847
848 for (j = 0; j < nsq; j++) {
849 /* replicate pam2[0] to i+nsq, j+nsq, also initialize lowest of pam2[1]*/
850 ppst->pam2[0][i+nsq][j] = ppst->pam2[0][i][j+nsq] = ppst->pam2[0][i+nsq][j+nsq] =
851 ppst->pam2[1][i][j] = ppst->pam2[0][i][j];
852
853 /* set the high portion of pam2[1] to the corresponding value
854 of pam2[1][sa_x][j] */
855
856 ppst->pam2[1][i+nsq][j] = ppst->pam2[1][i][j+nsq]=
857 ppst->pam2[1][i+nsq][j+nsq]=ppst->pam2[0][sa_x][j];
858 }
859 }
860
861 /* set matches to the internal '-' to pam_xx */
862 /* this needs to be adjusted for multiple internal '-----' */
863 for (i=1; i< nsq; i++) {
864 ppst->pam2[0][nsq][i] = ppst->pam2[0][i][nsq] =
865 ppst->pam2[0][nsq][i+nsq] = ppst->pam2[0][i+nsq][nsq] =
866 ppst->pam2[1][nsq][i] = ppst->pam2[1][i][nsq] =
867 ppst->pam2[1][nsq][i+nsq] = ppst->pam2[1][i+nsq][nsq] = ppst->pam_xm;
868 }
869 ppst->pam2[0][nsq][nsq] = ppst->pam2[1][nsq][nsq] = ppst->pam_xm;
870 }
871
872 /* function specific initializations */
873 void
f_initenv(struct mngmsg * m_msp,struct pstruct * ppst,unsigned char ** aa0)874 f_initenv (struct mngmsg *m_msp, struct pstruct *ppst, unsigned char **aa0) {
875 struct msg_def_str m_msg_def;
876 int pgm_id;
877
878 pgm_id = ppst->pgm_id;
879 m_msg_def = msg_def_arr[pgm_id];
880
881 m_msp->last_calc_flg=0;
882
883 SAFE_STRNCPY(m_msp->f_id0,m_msg_def.f_id0,sizeof(m_msp->f_id0));
884 SAFE_STRNCPY(m_msp->f_id1,m_msg_def.f_id1,sizeof(m_msp->f_id1));
885 SAFE_STRNCPY (m_msp->label, m_msg_def.label, sizeof(m_msp->label));
886 SAFE_STRNCPY(m_msp->alabel, m_msg_def.alabel, sizeof(m_msp->alabel));
887
888 #if !defined(SSEARCH) && !defined(GGSEARCH) && !defined(GLSEARCH) && !defined(LALIGN)
889 SAFE_STRNCPY (m_msp->alab[0],"initn",20);
890 SAFE_STRNCPY (m_msp->alab[1],"init1",20);
891 SAFE_STRNCPY (m_msp->alab[2],"opt",20);
892 #else
893 #if defined(SSEARCH) || defined(LALIGN)
894 SAFE_STRNCPY (m_msp->alab[0],"s-w opt",20);
895 #else
896 SAFE_STRNCPY (m_msp->alab[0],"n-w opt",20);
897 #endif
898 #endif
899
900 #if defined(GGSEARCH) || defined(GLSEARCH)
901 m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 0;
902 #else
903 m_msp->zsflag = ppst->zsflag = ppst->zsflag_f = 1;
904 m_msp->zsflag2 = ppst->zsflag2 = 1;
905 #endif
906
907 ppst->gdelval += pgm_def_arr[pgm_id].g_open_mod;
908 ppst->ggapval += pgm_def_arr[pgm_id].g_ext_mod;
909 #if defined(FASTX) || defined(FASTY)
910 ppst->gshift = pgm_def_arr[pgm_id].gshift;
911 ppst->gsubs = pgm_def_arr[pgm_id].hshift;
912 #endif
913 ppst->sw_flag = m_msg_def.sw_flag;
914 ppst->e_cut = m_msp->e_cut=pgm_def_arr[pgm_id].e_cut;
915 #ifndef LALIGN
916 ppst->e_cut_r = ppst->e_cut/10.0; /* more significant */
917 #else
918 ppst->e_cut_r = ppst->e_cut; /* everything if local */
919 #endif
920
921 ppst->score_ix = 0;
922 ppst->histint = 2;
923 m_msp->qframe = m_msg_def.qframe;
924 m_msp->nframe = m_msg_def.nframe;
925 m_msp->nrelv = m_msg_def.nrelv;
926 m_msp->srelv = m_msg_def.srelv;
927 m_msp->arelv = m_msg_def.arelv;
928 m_msp->stages = m_msg_def.stages;
929 m_msp->shuff_wid = 0;
930 m_msp->shuff_max = MAX_RSTATS;
931 m_msp->shuff_max_save = m_msp->shuff_max;
932
933 /* see param.h for the definition of all these */
934
935 m_msp->qshuffle = 0;
936 m_msp->nm0 = 1;
937 m_msp->escore_flg = 0;
938
939 /* pam information */
940 ppst->pam_pssm = 0;
941 #if defined(FASTS) || defined(FASTF) || defined(FASTM)
942 ppst->pam_xx = ppst->pam_xm = 0;
943 #else
944 ppst->pam_xx = 1; /* set >0 to use pam['X']['X'] value */
945 ppst->pam_xm = -1; /* set >0 to use pam['X']['A-Z'] value */
946 #endif
947 ppst->pam_x_set = 0;
948 ppst->pam_x_id_sim = 0;
949 ppst->pam_set = ppst->pam_variable = 0;
950 ppst->pam_pssm = 0;
951 ppst->p_d_set = 0;
952 ppst->pamoff = 0;
953 ppst->ext_sq_set = 0;
954 ppst->nsq_e = ppst->nsq;
955
956 /* initial settings for protein */
957 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
958 mktup = 3;
959 ppst->param_u.fa.bestscale = 300;
960 ppst->param_u.fa.bestoff = 36;
961 ppst->param_u.fa.bkfact = 6;
962 ppst->param_u.fa.scfact = 3;
963 ppst->param_u.fa.bktup = mktup;
964 ppst->param_u.fa.ktup = 0;
965 ppst->param_u.fa.bestmax = 50;
966 ppst->param_u.fa.pamfact = 1;
967 ppst->param_u.fa.altflag = 0;
968 ppst->param_u.fa.optflag = 1;
969 ppst->param_u.fa.iniflag = 0;
970 ppst->param_u.fa.optcut = 0;
971 ppst->param_u.fa.optcut_set = 0;
972 ppst->param_u.fa.cgap = 0;
973 ppst->param_u.fa.optwid = 16;
974 ppst->param_u.fa.optwid_set = 0;
975 ppst->param_u.fa.E_band_opt = pgm_def_arr[ppst->pgm_id].E_band_opt;
976 ppst->param_u.fa.use_E_thresholds = 1; /* disable E-thresholds for now */
977 }
978
979 f_init_opts(pgm_id, m_msp, ppst);
980 }
981
982 /* switches for fasta only */
983
984 static int shift_set=0;
985 static int subs_set=0;
986 static int sw_flag_set=0;
987 static int nframe_set=0;
988 static int E_thresh_set = 0;
989 static int E_cgap_set = 0;
990
991 void
f_getopt(char copt,char * optarg,struct mngmsg * m_msg,struct pstruct * ppst)992 f_getopt (char copt, char *optarg,
993 struct mngmsg *m_msg, struct pstruct *ppst)
994 {
995 int pgm_id;
996 double tmp_f, tmp_f1;
997 double tmp_e_cut, tmp_e_rep;
998 int dnaseq_save;
999 char *bp;
1000
1001 pgm_id = ppst->pgm_id;
1002
1003 switch (copt) {
1004 case '3':
1005 nframe_set = 1;
1006 if (pgm_id == TFA_PID) {
1007 m_msg->nframe = 3; break;
1008 }
1009 else {
1010 m_msg->nframe = 1; /* for TFASTXY */
1011 m_msg->qframe = 1; /* for FASTA, FASTX */
1012 }
1013 break;
1014 case 'a': m_msg->aln.showall = 1; break;
1015 case 'A':
1016 if (ppst->sw_flag) ppst->sw_flag=0;
1017 else ppst->sw_flag= 1;
1018 sw_flag_set = 1;
1019 break;
1020 case 'b':
1021 if (optarg[0] == '$') {
1022 m_msg->mshow = -1;
1023 m_msg->e_cut = 10000000.0;
1024 break;
1025 }
1026 else if (optarg[0] == '=') {
1027 m_msg->e_cut = 10000000.0;
1028 m_msg->e_cut_set = 1;
1029 m_msg->mshow_min = 1;
1030 sscanf (optarg+1, "%d", &m_msg->mshow);
1031 }
1032 else if (optarg[0] == '>') {
1033 m_msg->mshow_min = 2;
1034 sscanf (optarg+1, "%d", &m_msg->mshow);
1035 }
1036 else {
1037 sscanf (optarg, "%d", &m_msg->mshow);
1038 m_msg->mshow_min = 0;
1039 }
1040 m_msg->mshow_set = 1;
1041 break;
1042 case 'c':
1043 tmp_f = tmp_f1 = 0.0;
1044 if (*optarg == 'O') {
1045 ppst->param_u.fa.use_E_thresholds = 0;
1046 optarg++;
1047 }
1048 if (*optarg != '\0' && pgm_def_arr[pgm_id].ktup > 0) {
1049 sscanf (optarg, "%lf %lf", &tmp_f, &tmp_f1);
1050 if (tmp_f > 1.0) {
1051 ppst->param_u.fa.optcut = (int)(tmp_f+0.1);
1052 ppst->param_u.fa.use_E_thresholds = 0;
1053 ppst->param_u.fa.optcut_set = 1;
1054 }
1055 else if (tmp_f <= 0.0) {
1056 ppst->param_u.fa.use_E_thresholds = 1;
1057 }
1058 else { /* 0.0 < tmp_f <= 1.0 */
1059 ppst->param_u.fa.use_E_thresholds = 1;
1060 ppst->param_u.fa.E_band_opt = min(tmp_f,1.0);
1061 E_thresh_set = 1;
1062 if (tmp_f1 > 0.0) {
1063 tmp_f1 = min(tmp_f1,1.0); /* may want to do max(tmp_f1,tmp_f) */
1064 ppst->param_u.fa.E_join = tmp_f1;
1065 E_cgap_set=1;
1066 }
1067 }
1068 }
1069 break;
1070 case 'd': sscanf(optarg,"%d",&m_msg->ashow);
1071 if ((m_msg->mshow > 0) && (m_msg->ashow > m_msg->mshow)) m_msg->mshow=m_msg->ashow;
1072 m_msg->ashow_set = 1;
1073 break;
1074 case 'E':
1075 if (strchr(optarg,' ')) { /* check for 1 or 2 values */
1076 sscanf(optarg,"%lf %lf",&tmp_e_cut, &tmp_e_rep);
1077 if (tmp_e_rep <= 0.0) { /* two values, 2nd <= 0.0, no do_rep */
1078 ppst->do_rep = 0;
1079 ppst->e_cut_r = 1E-100;
1080 tmp_e_rep = -2.0;
1081 }
1082 else {ppst->do_rep = 1;}
1083 }
1084 else { /* one value, do_rep; tmp_e_rep=10.0 */
1085 sscanf(optarg,"%lf",&tmp_e_cut);
1086 #ifndef LALIGN
1087 tmp_e_rep = 10.0;
1088 #else
1089 tmp_e_rep = 1.0;
1090 #endif
1091 ppst->do_rep = 1;
1092 }
1093 if (!m_msg->e_cut_set && tmp_e_cut > 0.0 ) {
1094 ppst->e_cut = m_msg->e_cut = tmp_e_cut;
1095 }
1096 m_msg->e_cut_set = 1;
1097
1098 if (tmp_e_rep > 0.0) {
1099 if (tmp_e_rep >= 1.0) { ppst->e_cut_r = ppst->e_cut/tmp_e_rep;}
1100 else { ppst->e_cut_r = tmp_e_rep;}
1101 }
1102 break;
1103 case 'f':
1104 sscanf (optarg, "%d", &ppst->gdelval);
1105 if (ppst->gdelval > 0) ppst->gdelval = -ppst->gdelval;
1106 del_set = 1;
1107 break;
1108 case 'g':
1109 sscanf (optarg, "%d", &ppst->ggapval);
1110 if (ppst->ggapval > 0) ppst->ggapval = -ppst->ggapval;
1111 gap_set = 1;
1112 break;
1113 #ifndef SHOW_HELP
1114 case 'h':
1115 sscanf (optarg, "%d", &ppst->gshift);
1116 if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
1117 shift_set = 1;
1118 break;
1119 case 'j':
1120 sscanf (optarg, "%d", &ppst->gsubs);
1121 if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1122 subs_set = 1;
1123 break;
1124 #else
1125 case 'h':
1126 show_help(m_msg->pgm_name, pgm_id);
1127 break;
1128 case 'j':
1129 #ifdef FASTY
1130 if (strchr(optarg,' ')) {
1131 sscanf (optarg, "%d %d", &ppst->gshift, &ppst->gsubs);
1132 subs_set = 1;
1133 if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1134 }
1135 else if (strchr(optarg,',')) {
1136 sscanf (optarg, "%d,%d", &ppst->gshift, &ppst->gsubs);
1137 subs_set = 1;
1138 if (ppst->gsubs > 0) ppst->gsubs = -ppst->gsubs;
1139 }
1140 else {
1141 sscanf (optarg, "%d", &ppst->gshift);
1142 }
1143 #else
1144 #ifdef FASTX
1145 sscanf (optarg, "%d", &ppst->gshift);
1146 #endif
1147 #endif
1148 if (ppst->gshift > 0) ppst->gshift = -ppst->gshift;
1149 shift_set = 1;
1150 break;
1151 #endif
1152 case 'J':
1153 #ifdef LALIGN
1154 ppst->show_ident=1;
1155 #else
1156 ppst->show_ident=0;
1157 #endif
1158 break;
1159
1160
1161 #ifdef LALIGN
1162 case 'K':
1163 sscanf(optarg,"%d", &ppst->max_repeat);
1164 break;
1165 #endif
1166 case 'k':
1167 sscanf (optarg, "%d", &m_msg->shuff_max);
1168 m_msg->shuff_max_save = m_msg->shuff_max;
1169 mshuff_set = 1;
1170 break;
1171 case 'M':
1172 sscanf(optarg,"%d-%d",&m_msg->n1_low,&m_msg->n1_high);
1173 if (m_msg->n1_low < 0) {
1174 m_msg->n1_high = -m_msg->n1_low;
1175 m_msg->n1_low = 0;
1176 }
1177 if (m_msg->n1_high == 0) m_msg->n1_high = BIGNUM;
1178 if (m_msg->n1_low > m_msg->n1_high) {
1179 fprintf(stderr," low cutoff %d greater than high %d\n",
1180 m_msg->n1_low, m_msg->n1_high);
1181 m_msg->n1_low = 0;
1182 m_msg->n1_high = BIGNUM;
1183 }
1184 ppst->n1_low = m_msg->n1_low;
1185 ppst->n1_high = m_msg->n1_high;
1186 break;
1187 case 'n':
1188 m_msg->qdnaseq = SEQT_DNA;
1189 re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
1190 SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1191 prot2dna = 1;
1192 break;
1193 case 'o':
1194 case 'p':
1195 m_msg->qdnaseq = SEQT_PROT;
1196 ppst->dnaseq = SEQT_PROT;
1197 SAFE_STRNCPY(m_msg->sqnam,"aa",4);
1198 break;
1199 case 'P':
1200 SAFE_STRNCPY(ppst->pgpfile,optarg,MAX_FN);
1201 if ((bp=strchr(ppst->pgpfile,' '))!=NULL) {
1202 *bp='\0';
1203 ppst->pgpfile_type = atoi(bp+1);
1204 }
1205 else ppst->pgpfile_type = 0;
1206 ppst->pam_pssm = 1;
1207 break;
1208 case 'r':
1209 sscanf(optarg,"%d/%d",&ppst->p_d_mat,&ppst->p_d_mis);
1210 ppst->pam_set = 0;
1211 ppst->p_d_set = 1;
1212
1213 SAFE_STRNCPY(ppst->pam_name, "DNA", 4);
1214 if (ppst->dnaseq != SEQT_RNA) ppst->dnaseq = SEQT_DNA;
1215 if (ppst->p_d_mat > 0 && ppst->p_d_mis < 0) {
1216 ppst->p_d_set = 1;
1217 SAFE_STRNCPY(ppst->pamfile,optarg,40);
1218 }
1219 break;
1220 /* modified Sept, 2011, to recognize that a scoring matrix
1221 specifies a sequence alphabet */
1222 case 's':
1223 if (*optarg == '?') {
1224 ppst->pam_variable = 1;
1225 optarg++;
1226 }
1227 if (*optarg == '\0') break;
1228 SAFE_STRNCPY (ppst->pamfile, optarg, MAX_FN);
1229 dnaseq_save = ppst->dnaseq;
1230 /* check for default abbreviation */
1231 if (!standard_pam(ppst->pamfile,ppst,del_set, gap_set)) {
1232 /* check/load matrix file */
1233 if (!initpam (ppst->pamfile, ppst)) {
1234 /* matrix file failed, use default matrix */
1235 SAFE_STRNCPY(ppst->pamfile,pgm_def_arr[pgm_id].smstr,MAX_FN);
1236 }
1237 }
1238 ppst->pam_set=1;
1239 /* check for changing alphabet here */
1240 if (ppst->dnaseq != dnaseq_save && ppst->dnaseq >= SEQT_DNA) {
1241 m_msg->qdnaseq = SEQT_DNA;
1242 re_ascii(qascii,nascii,strlen((char *)m_msg->ann_arr+1));
1243 SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1244 prot2dna = 1;
1245 }
1246 break;
1247 case 'S': /* turn on extended alphabet for seg */
1248 ppst->ext_sq_set = 1;
1249 ppst->nsq_e = ppst->nsqx;
1250 break;
1251 case 't':
1252 if (tolower(optarg[0])=='t') {
1253 m_msg->ldb_info.term_code = aascii['*'];
1254 optarg++;
1255 }
1256 if (*optarg) {sscanf (optarg, "%d", &ppst->tr_type);}
1257 break;
1258 case 'U':
1259 m_msg->qdnaseq = SEQT_RNA;
1260 memcpy(qascii,nascii,sizeof(qascii));
1261 SAFE_STRNCPY(m_msg->sqnam,"nt",4);
1262 nt[nascii['T']]='U';
1263 prot2dna=1;
1264 break;
1265 case 'W':
1266 sscanf (optarg,"%d",&m_msg->aln.llcntx);
1267 m_msg->aln.llcntx_set = 1;
1268 break;
1269 case 'X':
1270 parse_ext_opts(optarg, pgm_id, m_msg, ppst);
1271 break;
1272 case 'z':
1273 if (strchr(optarg,' ')!=NULL) {
1274 sscanf(optarg,"%d %d",&ppst->zsflag,&ppst->zsflag2);
1275 if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
1276 }
1277 else if (strchr(optarg,',')!=NULL) {
1278 sscanf(optarg,"%d,%d",&ppst->zsflag,&ppst->zsflag2);
1279 if (ppst->zsflag2 < 1 || ppst->zsflag2 > 6) ppst->zsflag2 = 2;
1280 }
1281 else {
1282 sscanf(optarg,"%d",&ppst->zsflag);
1283 ppst->zsflag2 = (ppst->zsflag % 10);
1284 }
1285 break;
1286 }
1287 }
1288
1289 static char my_opts[] = "1BIM:ox:y:N:";
1290
1291 void
parse_ext_opts(char * opt_arg,int pgm_id,struct mngmsg * m_msp,struct pstruct * ppst)1292 parse_ext_opts(char *opt_arg, int pgm_id, struct mngmsg *m_msp, struct pstruct *ppst) {
1293 long l_arg;
1294 char c_arg, c_opt, *the_arg, *bp;
1295
1296 c_opt = *opt_arg;
1297 if ((bp=strchr(my_opts, c_opt))==NULL) {
1298 return;
1299 }
1300
1301 if (*(bp+1) == ':') the_arg = opt_arg+1;
1302
1303 switch (c_opt) {
1304 case '1':
1305 if (pgm_def_arr[pgm_id].ktup > 0) {
1306 ppst->param_u.fa.iniflag=1;
1307 }
1308 break;
1309 case 'B': m_msp->z_bits = 0; break;
1310 case 'I':
1311 m_msp->tot_ident = 1;
1312 /*
1313 l_arg = 0;
1314 sscanf(the_arg,"%ld",&l_arg);
1315 if (l_arg > 0) m_msp->tot_ident = l_arg;
1316 */
1317 break;
1318 case 'M':
1319 c_arg = '\0';
1320 sscanf(the_arg,"%ld%c",&l_arg,&c_arg);
1321 if (l_arg < 0) m_msp->max_memK = BIGNUM;
1322 else {
1323 l_arg *= 1024;
1324 if (c_arg == 'G') l_arg *= 1024;
1325 m_msp->max_memK = l_arg;
1326 }
1327 break;
1328 case 'N':
1329 case 'X':
1330 ppst->pam_x_id_sim = 0;
1331 if (*the_arg == 'S' || *the_arg == '+') {
1332 ppst->pam_x_id_sim = 1;
1333 }
1334 else if (*the_arg == 'D' || *the_arg == '-') {
1335 ppst->pam_x_id_sim = -1;
1336 }
1337 break;
1338 case 'o':
1339 if (pgm_def_arr[pgm_id].ktup > 0) {
1340 ppst->param_u.fa.optflag = 0;
1341 msg_def_arr[pgm_id].nrelv = m_msp->nrelv = 2;
1342 }
1343 break;
1344 case 'x':
1345 if (strchr(the_arg,' ')!=NULL) {
1346 sscanf (the_arg,"%d %d",&ppst->pam_xx, &ppst->pam_xm);
1347 }
1348 else if (strchr(the_arg,',')!=NULL) {
1349 sscanf (the_arg,"%d,%d",&ppst->pam_xx, &ppst->pam_xm);
1350 }
1351 else {
1352 sscanf (the_arg,"%d",&ppst->pam_xx);
1353 ppst->pam_xm = ppst->pam_xx;
1354 }
1355 ppst->pam_x_set=1;
1356 break;
1357 case 'y':
1358 if (pgm_def_arr[pgm_id].ktup > 0) {
1359 sscanf (the_arg, "%d", &ppst->param_u.fa.optwid);
1360 ppst->param_u.fa.optwid_set = 1;
1361 }
1362 break;
1363 }
1364 }
1365
1366 void
f_lastenv(struct mngmsg * m_msg,struct pstruct * ppst)1367 f_lastenv (struct mngmsg *m_msg, struct pstruct *ppst)
1368 {
1369 char save_str[MAX_SSTR];
1370
1371 #if !defined(FASTM) && !defined(FASTS) && !defined(FASTF)
1372 SAFE_STRNCPY(save_str,"*",sizeof(save_str));
1373 #else
1374 SAFE_STRNCPY(save_str,",",sizeof(save_str));
1375 #endif
1376
1377 if (m_msg->qdnaseq == SEQT_UNK) {
1378 build_xascii(qascii,save_str);
1379 if (m_msg->ann_flg) add_ascii_ann(qascii,m_msg->ann_arr);
1380 }
1381 /* this check allows lc DNA sequence queries with FASTX */
1382 else {
1383 #if !defined(FASTS) && !defined(FASTM) && !defined(FASTF) && !defined(FASTX) && !defined(FASTY)
1384 init_ascii(ppst->ext_sq_set,qascii, ppst->nsq, m_msg->qdnaseq);
1385 #endif
1386 validate_novel_aa(qascii, ppst->nsq, m_msg->qdnaseq);
1387 }
1388 }
1389
1390 void
f_getarg(int argc,char ** argv,int optind,struct mngmsg * m_msg,struct pstruct * ppst)1391 f_getarg (int argc, char **argv, int optind,
1392 struct mngmsg *m_msg, struct pstruct *ppst)
1393 {
1394
1395 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1396 if (argc - optind >= 4) {
1397 sscanf (argv[optind + 3], "%d", &ppst->param_u.fa.ktup);
1398 ktup_set = 1;
1399 }
1400 else {
1401 ppst->param_u.fa.ktup = -pgm_def_arr[ppst->pgm_id].ktup;
1402 }
1403 }
1404
1405 if (ppst->pgm_id == RSS_PID && argc - optind > 3) {
1406 sscanf (argv[optind + 3], "%d", &m_msg->shuff_max);
1407 }
1408
1409 if (ppst->pgm_id == RFX_PID && argc - optind > 4) {
1410 sscanf (argv[optind + 4], "%d", &m_msg->shuff_max);
1411 }
1412 m_msg->shuff_max_save = m_msg->shuff_max;
1413 }
1414
1415 /* fills in the query ascii mapping from the parameter
1416 ascii mapping.
1417 */
1418
1419 void
re_ascii(int * qascii,int * pascii,int max_ann_arr)1420 re_ascii(int *qascii, int *pascii, int max_ann_arr) {
1421 int i;
1422
1423 for (i=0; i < 128; i++) {
1424 if (qascii[i] > NANN+max_ann_arr || qascii[i] < ESS) {
1425 qascii[i] = pascii[i];
1426 }
1427 }
1428 }
1429
1430
1431 /* recode has become function specific to accommodate FASTS/M */
1432 /* modified 28-Dec-2004 to ensure that all mapped characters
1433 are valid */
1434 int
recode(unsigned char * seq,int n,int * qascii,int nsqx)1435 recode(unsigned char *seq, int n, int *qascii, int nsqx) {
1436 int i,j;
1437 char save_c;
1438
1439 #if defined(FASTS) || defined(FASTM)
1440 qascii[',']=ESS;
1441 #endif
1442
1443 for (i=0; i < n; i++) {
1444 save_c = seq[i];
1445 if (seq[i] > '@' || seq[i]=='*') seq[i] = qascii[seq[i]];
1446 if (seq[i] > nsqx && seq[i]!=ESS) {
1447 fprintf(stderr, "*** Warning - unrecognized residue at %d:%c - %2d\n",
1448 i,save_c,save_c);
1449 seq[i] = qascii['X'];
1450 }
1451 }
1452 seq[i]=EOSEQ;
1453 return i;
1454 }
1455
1456 /* here we have the query sequence, all the command line options,
1457 but we need to set various parameter options based on the type
1458 of the query sequence (m_msg->qdnaseq = 0:protein/1:DNA) and
1459 the function (FASTA/FASTX/TFASTA)
1460
1461 29-Jun-2008 add code to ensure that weird ('O', 'U') amino-acids
1462 are read properly.
1463
1464 15-Nov-2010 -- modify scoring matrix for very short query sequences
1465 (e.g. short read metagenomics)
1466 */
1467
1468 /* this resetp is for conventional a FASTA/TFASTXYZ search */
1469 void
resetp(struct mngmsg * m_msg,struct pstruct * ppst)1470 resetp (struct mngmsg *m_msg, struct pstruct *ppst) {
1471 int i, pgm_id;
1472 int n0_eff;
1473
1474 pgm_id = ppst->pgm_id;
1475
1476 /* check for alphabet conflict */
1477
1478 ppst->shuffle_dna3 = 0;
1479 #if defined(TFAST)
1480 if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
1481 fprintf(stderr," %s compares a protein to a translated\n\
1482 DNA sequence library. Do not use a DNA query/scoring matrix.\n",prog_func);
1483 exit(1);
1484 }
1485 ppst->shuffle_dna3 = 1;
1486 #else
1487 #if (defined(FASTX) || defined(FASTY))
1488 if (!(m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA)) {
1489 fprintf(stderr," FASTX/Y compares a DNA sequence to a protein database\n");
1490 fprintf(stderr," Use a DNA query\n");
1491 exit(1);
1492 }
1493 #endif
1494 #endif
1495
1496 /* **************************************************************** */
1497 /* adjust alphabets for prot:prot or DNA:DNA alignments */
1498
1499 /* this code changes parameters for programs (FA_PID, SS_PID, FS_PID,
1500 RSS_PID) that can examine either protein (initial state) or DNA
1501 Modified May, 2006 to reset e_cut for DNA comparisons.
1502 */
1503 /* **************************************************************** */
1504
1505 if (msg_def_arr[pgm_id].q_seqt == SEQT_UNK) {
1506 if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {
1507 msg_def_arr[pgm_id].q_seqt = m_msg->qdnaseq;
1508 msg_def_arr[pgm_id].p_seqt = SEQT_DNA;
1509 msg_def_arr[pgm_id].l_seqt = SEQT_DNA;
1510 if (m_msg->qdnaseq == SEQT_DNA) msg_def_arr[pgm_id].qframe = 2;
1511 if (!m_msg->e_cut_set) {
1512 pgm_def_arr[pgm_id].e_cut /= 5.0;
1513 ppst->e_cut_r = 0.001;
1514 }
1515 }
1516 else {
1517 msg_def_arr[pgm_id].q_seqt = SEQT_PROT;
1518 }
1519 }
1520
1521 /* set the comparison type (PROT/DNA) in ppst */
1522 ppst->dnaseq = msg_def_arr[pgm_id].p_seqt;
1523
1524 if (!sw_flag_set) ppst->sw_flag = msg_def_arr[pgm_id].sw_flag;
1525 if (!m_msg->e_cut_set) {
1526 ppst->e_cut = m_msg->e_cut=pgm_def_arr[pgm_id].e_cut;
1527 #ifdef LALIGN
1528 ppst->e_cut_r = ppst->e_cut;
1529 #endif
1530 }
1531
1532 if (ppst->dnaseq == SEQT_DNA && m_msg->qdnaseq==SEQT_RNA) {
1533 ppst->dnaseq = SEQT_RNA;
1534 ppst->nt_align = 1;
1535 }
1536 if (ppst->dnaseq==SEQT_DNA) pascii = &nascii[0];
1537 else if (ppst->dnaseq==SEQT_RNA) {
1538 pascii = &nascii[0];
1539 ppst->sq[nascii['T']] = 'U';
1540 }
1541 else pascii = &aascii[0];
1542 m_msg->ldb_info.ldnaseq = msg_def_arr[pgm_id].l_seqt;
1543
1544 if (m_msg->ldb_info.ldnaseq & SEQT_DNA) {
1545 memcpy(lascii,nascii,sizeof(lascii));
1546 #ifndef TFAST
1547 #ifdef DNALIB_LC
1548 init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1549 #endif
1550 #else
1551 /* no init_ascii() because we translate lower case library sequences */
1552 #endif
1553 validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1554 }
1555 else {
1556 memcpy(lascii,aascii,sizeof(lascii)); /* initialize lib mapping */
1557 if (m_msg->ann_flg && strchr((char *)m_msg->ann_arr,'*')) {lascii['*'] = NA;}
1558
1559 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1560 lascii['*'] = NA;
1561 #endif
1562 init_ascii(ppst->ext_sq_set,lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1563 validate_novel_aa(lascii, ppst->nsq, m_msg->ldb_info.ldnaseq);
1564 }
1565
1566 /* have lascii - initialize l_ann_ascii[] if necessary */
1567 if (m_msg->ann_flg) {
1568 memcpy(l_ann_ascii,lascii,sizeof(l_ann_ascii));
1569 /* make certain that '*' is treated correctly */
1570 if (strchr((char *)m_msg->ann_arr,'*')) {l_ann_ascii['*'] = NA;}
1571 add_ascii_ann(l_ann_ascii, m_msg->ann_arr);
1572 }
1573
1574 /* **************************************************************** */
1575 /* adjust qframe/nframe if DNA/translated DNA search */
1576 /* **************************************************************** */
1577
1578 if (!nframe_set) {
1579 m_msg->qframe = msg_def_arr[pgm_id].qframe;
1580 m_msg->nframe = msg_def_arr[pgm_id].nframe;
1581 }
1582
1583 /* the possibilities:
1584 -i -3 qframe revcomp
1585 FA_D/FX - - 2 0
1586 FA_D/FX + - 2 1
1587 FA_D/FX - + 1 0
1588 FA_D/FX + + 2 1
1589 */
1590
1591 if (m_msg->qdnaseq == SEQT_DNA) {
1592 m_msg->nframe = 1;
1593 if (m_msg->qframe == 1 && m_msg->revcomp==1) {
1594 m_msg->qframe = m_msg->revcomp+1;
1595 }
1596 }
1597 else if (m_msg->qdnaseq == SEQT_RNA) {
1598 m_msg->qframe = m_msg->revcomp+1;
1599 m_msg->nframe = 1;
1600 }
1601
1602 /* **************************************************************** */
1603 /* adjust FASTA heuristics for DNA/translated DNA search */
1604 /* **************************************************************** */
1605
1606 if (ppst->dnaseq == SEQT_DNA || ppst->dnaseq == SEQT_RNA) {
1607 ppst->histint = 4;
1608
1609 if (!del_set) {
1610 #ifdef OLD_FASTA_GAP
1611 ppst->gdelval = -16; /* def. del penalty */
1612 #else
1613 ppst->gdelval = -12; /* def. open penalty */
1614 #endif
1615 }
1616 if (!gap_set) ppst->ggapval = -4; /* def. gap penalty */
1617
1618 ppst->nsq = nnt;
1619 ppst->nsqx = nntx;
1620 ppst->sq[ppst->nsqx+1] = ppst->sqx[ppst->nsqx+1] = '\0';
1621
1622 /* reset parameters for DNA */
1623 if (pgm_def_arr[pgm_id].ktup > 0) {
1624 /* these parameters are used to scale optcut, and are being replaced
1625 by statistically based parameters */
1626 /* largest ktup */
1627 pgm_def_arr[pgm_id].ktup = mktup = 6;
1628 if (!ppst->param_u.fa.optwid_set) ppst->param_u.fa.optwid = 16;
1629 ppst->param_u.fa.bestscale = 80;
1630 ppst->param_u.fa.bkfact = 5;
1631 ppst->param_u.fa.scfact = 1;
1632 ppst->param_u.fa.bktup = mktup;
1633 ppst->param_u.fa.bestmax = 80;
1634 ppst->param_u.fa.bestoff = 45;
1635 if (!E_thresh_set) ppst->param_u.fa.E_band_opt = 0.05;
1636
1637 if (!sw_flag_set) {
1638 ppst->sw_flag = 0;
1639 SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
1640 SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
1641 }
1642
1643 /* largest ktup */
1644 mktup = 6;
1645
1646 if (ppst->param_u.fa.pamfact >= 0) ppst->param_u.fa.pamfact = 0;
1647 if (ppst->param_u.fa.ktup < 0)
1648 ppst->param_u.fa.ktup = -ppst->param_u.fa.bktup;
1649 }
1650
1651 for (i=0; i<=ppst->nsqx; i++) {
1652 ppst->hsq[i] = hnt[i];
1653 ppst->sq[i] = nt[i];
1654 ppst->hsqx[i] = hntx[i];
1655 ppst->sqx[i] = ntx[i];
1656 }
1657
1658 /* **************************************************************** */
1659 /* adjust scoring matrix for DNA:DNA search */
1660 /* **************************************************************** */
1661
1662 if (!ppst->pam_set) {
1663 if (ppst->p_d_set)
1664 mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1665 #if !defined(FASTS) && !defined(FASTM)
1666 else if (ppst->pamfile[0]=='\0' || strncmp(ppst->pamfile,"BL50",4)==0) {
1667 SAFE_STRNCPY (ppst->pamfile, "+5/-4", sizeof(ppst->pamfile));
1668 SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
1669 SAFE_STRNCPY (ppst->pam_name, "+5/-4", sizeof(ppst->pamfile));
1670 }
1671 #else
1672 else if (strncmp(ppst->pamfile,"MD20",4)==0) {
1673 SAFE_STRNCPY (ppst->pamfile, "+2/-2", sizeof(ppst->pamfile));
1674 SAFE_STRNCPY (ppst->pam_name, "+2/-2", sizeof(ppst->pam_name));
1675 SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, sizeof(ppst->pamfile_save));
1676 ppst->p_d_mat = +2;
1677 ppst->p_d_mis = -2;
1678 mk_n_pam(npam,nnt,ppst->p_d_mat,ppst->p_d_mis);
1679 }
1680 #endif
1681 pam = npam;
1682 }
1683
1684 SAFE_STRNCPY (m_msg->sqnam, "nt",sizeof(m_msg->sqnam));
1685 SAFE_STRNCPY (m_msg->sqtype, "DNA",sizeof(m_msg->sqtype));
1686 } /* end DNA reset */
1687
1688 else { /* other parameters for protein comparison */
1689 if (pgm_def_arr[pgm_id].ktup > 0) {
1690 if (!ppst->param_u.fa.optwid_set) {
1691 if (ppst->param_u.fa.ktup==1) ppst->param_u.fa.optwid = 32;
1692 else ppst->param_u.fa.optwid = 16;
1693 }
1694 }
1695 if (!shift_set) {ppst->gshift = pgm_def_arr[pgm_id].gshift;}
1696 if (!subs_set) {ppst->gsubs = pgm_def_arr[pgm_id].hshift;}
1697 }
1698
1699 SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, 120);
1700 }
1701
1702 /* query_parm() this function asks for any additional parameters
1703 that have not been provided. Could be null. */
1704 void
query_parm(struct mngmsg * m_msp,struct pstruct * ppst)1705 query_parm (struct mngmsg *m_msp, struct pstruct *ppst)
1706 {
1707 char qline[40];
1708
1709 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1710 if (ppst->param_u.fa.ktup < 0)
1711 ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1712
1713 if (ppst->param_u.fa.ktup == 0) {
1714 printf (" ktup? (1 to %d) [%d] ", mktup, pgm_def_arr[ppst->pgm_id].ktup);
1715 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1716 else sscanf(qline,"%d",&ppst->param_u.fa.ktup);
1717 }
1718 if (ppst->param_u.fa.ktup == 0)
1719 ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
1720 else ktup_set = 1;
1721 }
1722
1723 #if defined(PRSS)
1724 if (m_msp->shuff_max < 10) m_msp->shuff_max = MAX_RSTATS;
1725
1726 if (!mshuff_set) {
1727 printf(" number of shuffles [%d]? ",m_msp->shuff_max);
1728 fflush(stdout);
1729 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1730 else sscanf(qline,"%d",&m_msp->shuff_max);
1731 }
1732
1733 if (ppst->zs_win == 0) {
1734 printf (" local (window) (w) or uniform (u) shuffle [u]? ");
1735 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1736 else if (qline[0]=='w' || qline[0]=='W') {
1737 m_msp->shuff_wid = 20;
1738 printf(" local shuffle window size [%d]? ",m_msp->shuff_wid);
1739 if (fgets (qline, sizeof(qline), stdin) == NULL) exit (0);
1740 else sscanf(qline,"%d",&m_msp->shuff_wid);
1741 }
1742 }
1743 #endif
1744 }
1745
1746 /* last_init() cannot look at aa0, n0, because it is only run once,
1747 it is not run before each new aa0 search */
1748 void
last_init(struct mngmsg * m_msg,struct pstruct * ppst)1749 last_init (struct mngmsg *m_msg, struct pstruct *ppst)
1750 {
1751 int ix_l, ix_i, i, pgm_id;
1752 double *kar_p;
1753 double aa0_f[MAXSQ];
1754
1755 m_msg->zsflag = ppst->zsflag;
1756 m_msg->zsflag2 = ppst->zsflag2;
1757
1758 if (ppst->zsflag < 0) {
1759 ppst->do_rep = 0;
1760 }
1761
1762 pgm_id = ppst->pgm_id;
1763
1764 #ifdef LALIGN
1765 m_msg->do_showbest = 1;
1766 m_msg->quiet = 1;
1767 #endif
1768
1769 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1770 m_msg->nohist = 1;
1771 m_msg->shuff_max = 2000;
1772 ppst->shuff_node = m_msg->shuff_max/fa_max_workers;
1773 #else
1774 m_msg->shuff_max = m_msg->shuff_max_save;
1775 #endif
1776
1777 if (m_msg->aln.llen < 1) {
1778 m_msg->aln.llen = 60;
1779 }
1780
1781 if (m_msg->ldb_info.ldnaseq== SEQT_PROT) {
1782 m_msg->max_tot = MAXLIB_P;
1783 }
1784
1785 #if defined(FASTX) || defined(FASTY) || defined(TFAST)
1786 /* set up translation tables: faatran.c */
1787 aainit(ppst->tr_type,ppst->debug_lib);
1788 #endif
1789
1790 /* a sanity check */
1791 #if !defined(TFAST)
1792 if (m_msg->revcomp && m_msg->qdnaseq!=SEQT_DNA && m_msg->qdnaseq!=SEQT_RNA) {
1793 fprintf(stderr," cannot reverse complement protein\n");
1794 m_msg->revcomp = 0;
1795 }
1796 #endif
1797
1798 if (pgm_def_arr[pgm_id].ktup > 0) {
1799
1800 if (ppst->param_u.fa.ktup < 0)
1801 ppst->param_u.fa.ktup = -ppst->param_u.fa.ktup;
1802
1803 if (ppst->param_u.fa.ktup < 1 || ppst->param_u.fa.ktup > mktup) {
1804 fprintf(stderr," warning ktup = %d out of range [1..%d], reset to %d\n",
1805 ppst->param_u.fa.ktup, mktup, ppst->param_u.fa.bktup);
1806 ppst->param_u.fa.ktup = ppst->param_u.fa.bktup;
1807 }
1808
1809 if (ppst->sw_flag) {
1810 SAFE_STRNCPY(m_msg->f_id1,"sw",sizeof(m_msg->f_id1));
1811 SAFE_STRNCPY(m_msg->alabel, align_label[0], sizeof(m_msg->alabel));
1812 }
1813 else {
1814 SAFE_STRNCPY(m_msg->f_id1,"bs",sizeof(m_msg->f_id1));
1815 SAFE_STRNCPY(m_msg->alabel, align_label[1], sizeof(m_msg->alabel));
1816 }
1817 }
1818
1819 if (pgm_id == TFA_PID) {
1820 m_msg->revcomp *= 3;
1821 if (m_msg->nframe == 3) m_msg->nframe += m_msg->revcomp;
1822 }
1823 else if (pgm_id == TFX_PID || pgm_id == TFY_PID) {
1824 if (m_msg->nframe == 1) m_msg->nframe += m_msg->revcomp;
1825 }
1826
1827 #if !defined(TFAST)
1828 /* for fasta/fastx searches, itt iterates the the query strand */
1829 m_msg->nitt1 = m_msg->qframe-1;
1830 #else
1831 /* for tfasta/tfastxy searches, itt iterates the library frames */
1832 m_msg->nitt1 = m_msg->nframe-1;
1833 #endif
1834
1835 if (pgm_def_arr[pgm_id].ktup > 0) { /* its FASTA, not SSEARCH */
1836 if (ppst->param_u.fa.ktup>=2 && !ppst->param_u.fa.optwid_set) {
1837 ppst->param_u.fa.optwid=16;
1838 switch (pgm_id) {
1839 case FA_PID:
1840 case FX_PID:
1841 case FY_PID:
1842 m_msg->thr_fact = 8;
1843 m_msg->thr_fact = 8;
1844 break;
1845 case TFA_PID:
1846 case TFX_PID:
1847 case TFY_PID:
1848 m_msg->thr_fact = 4;
1849 break;
1850 default:
1851 m_msg->thr_fact = 4;
1852 }
1853 }
1854 else { m_msg->thr_fact = 4;}
1855 }
1856 else {
1857 #if !defined(SW_ALTIVEC) && !defined(SW_SSE2)
1858 m_msg->thr_fact = 1; /* unvectorized SSEARCH */
1859 #else
1860 m_msg->thr_fact = 8; /* vectorized SSEARCH */
1861 #endif
1862 }
1863
1864 #ifdef PCOMPLIB
1865 m_msg->thr_fact = 1; /* use much larger buffers */
1866 #endif
1867
1868 #if defined(PRSS)
1869 if (m_msg->shuff_max < 10) m_msg->shuff_max = MAX_RSTATS;
1870 if (ppst->zsflag < 10) ppst->zsflag += 10;
1871 if (ppst->zs_win > 0) {
1872 m_msg->shuff_wid = ppst->zs_win;
1873 }
1874 #endif
1875
1876 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
1877 if (ppst->param_u.fa.iniflag) {
1878 ppst->score_ix = 1;
1879 SAFE_STRNCPY (m_msg->label, "initn init1", sizeof(m_msg->label));
1880 }
1881 else if (ppst->param_u.fa.optflag) {
1882 ppst->score_ix = 2;
1883 m_msg->stages = 1;
1884 }
1885 }
1886
1887 if (!ppst->have_pam2) {
1888 alloc_pam (MAXSQ, MAXSQ, ppst);
1889 init_pam2(ppst);
1890 }
1891 init_pamx(ppst);
1892
1893 if (ppst->pam_ms) {
1894 if (m_msg->qdnaseq == SEQT_PROT) {
1895 /* code to make 'L'/'I' identical scores */
1896 ix_l = pascii['L'];
1897 ix_i = pascii['I'];
1898 ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1899 ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1900 max(ppst->pam2[0][ix_l][ix_l],ppst->pam2[0][ix_i][ix_i]);
1901 for (i=1; i<=ppst->nsq; i++) {
1902 ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1903 max(ppst->pam2[0][i][ix_l],ppst->pam2[0][i][ix_i]);
1904 ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1905 max(ppst->pam2[0][ix_i][i],ppst->pam2[0][ix_l][i]);
1906 }
1907
1908 /* code to make 'Q'/'K' identical scores */
1909 if (!shift_set) {
1910 ix_l = pascii['Q'];
1911 ix_i = pascii['K'];
1912 ppst->pam2[0][ix_l][ix_i] = ppst->pam2[0][ix_i][ix_l] =
1913 ppst->pam2[0][ix_l][ix_l] = ppst->pam2[0][ix_i][ix_i] =
1914 (ppst->pam2[0][ix_l][ix_l]+ppst->pam2[0][ix_i][ix_i]+1)/2;
1915 for (i=1; i<=ppst->nsq; i++) {
1916 ppst->pam2[0][i][ix_i] = ppst->pam2[0][i][ix_l] =
1917 (ppst->pam2[0][i][ix_l]+ppst->pam2[0][i][ix_i]+1)/2;
1918 ppst->pam2[0][ix_i][i] = ppst->pam2[0][ix_l][i] =
1919 (ppst->pam2[0][ix_i][i]+ppst->pam2[0][ix_l][i]+1)/2;
1920 }
1921 }
1922 }
1923 }
1924
1925 /*
1926 print_pam(ppst);
1927 */
1928
1929 /* once we have a complete pam matrix, we can calculate Lambda and K
1930 for "average" sequences */
1931 kar_p = NULL;
1932 init_karlin_a(ppst, aa0_f, &kar_p);
1933 do_karlin_a(ppst->pam2[0], ppst, aa0_f,
1934 kar_p, &m_msg->Lambda, &m_msg->K, &m_msg->H);
1935 ppst->pLambda = m_msg->Lambda;
1936 ppst->pK = m_msg->K;
1937 ppst->pH = m_msg->H;
1938 ppst->LK_set = 1;
1939 free(kar_p);
1940
1941 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
1942 if (ppst->ext_sq_set) {
1943 fprintf(stderr," -S not available on [t]fast[fs]\n");
1944 ppst->ext_sq_set = 0;
1945 ppst->nsq_e = ppst->nsq;
1946
1947 /* reset sascii to ignore -S, map lc */
1948 init_ascii(0,lascii, ppst->nsq, 0);
1949 validate_novel_aa(lascii, ppst->nsq, 0);
1950 }
1951 #endif
1952 }
1953
1954 /* alloc_pam2p creates a profile structure */
1955 int **
alloc_pam2p(int ** pam2p,int len,int nsq)1956 alloc_pam2p(int **pam2p, int len, int nsq) {
1957 int i, pam2p_len;
1958 int *pam2pp;
1959
1960 if (pam2p == NULL) {
1961 if ((pam2p = (int **)calloc(len,sizeof(int *)))==NULL) {
1962 fprintf(stderr," Cannot allocate pam2p: %d\n",len);
1963 return NULL;
1964 }
1965
1966 if((pam2p[0] = (int *)calloc((nsq+1)*len,sizeof(int)))==NULL) {
1967 fprintf(stderr, "Cannot allocate pam2p[0]: %d\n", (nsq+1)*len);
1968 free(pam2p);
1969 return NULL;
1970 }
1971 }
1972 else {
1973 pam2p_len = (nsq+1)*len*sizeof(int);
1974 pam2pp = pam2p[0];
1975 if ((pam2pp = (int *)realloc(pam2pp,pam2p_len))==NULL) {
1976 fprintf(stderr,
1977 "Cannot reallocate pam2p[0]: %ld\n", (nsq+1)*len*sizeof(int));
1978 return NULL;
1979 }
1980 memset(pam2pp,0,pam2p_len);
1981
1982 if ((pam2p = (int **)realloc(pam2p,len*sizeof(int *)))==NULL) {
1983 fprintf(stderr," Cannot reallocate pam2p: %d\n",len);
1984 return NULL;
1985 }
1986 pam2p[0] = pam2pp;
1987 }
1988
1989 for (i=1; i<len; i++) {
1990 pam2p[i] = pam2p[0] + (i*(nsq+1));
1991 }
1992
1993 return pam2p;
1994 }
1995
free_pam2p(int ** pam2p)1996 void free_pam2p(int **pam2p) {
1997 if (pam2p) {
1998 free(pam2p[0]);
1999 free(pam2p);
2000 }
2001 }
2002
2003 /* sortbest has now become comparison function specific so that we can use
2004 a different comparison for fasts/f
2005 */
2006 #if !defined(FASTS) && !defined (FASTF) && !defined(FASTM)
2007 void
qshuffle()2008 qshuffle() {}
2009
2010 #ifndef LALIGN /* LALIGN has last_calc() in last_thresh.c */
2011 int
last_calc(unsigned char * aa0,unsigned char * aa1,int maxn,struct beststr ** bestp_arr,int nbest,struct mngmsg m_msg,struct pstruct * ppst,void ** f_str,void * pstat_str)2012 last_calc(
2013 unsigned char *aa0, unsigned char *aa1, int maxn,
2014 struct beststr **bestp_arr, int nbest,
2015 struct mngmsg m_msg, struct pstruct *ppst
2016 , void **f_str
2017 , void *pstat_str)
2018 {
2019 return nbest;
2020 }
2021 #endif
2022
2023 /* this function is almost never called, thus a slow shell sort */
sortbest(bptr,nbest,irelv)2024 void sortbest (bptr, nbest, irelv)
2025 struct beststr **bptr;
2026 int nbest, irelv;
2027 {
2028 int gap, i, j;
2029 struct beststr *tmp;
2030
2031 for (gap = nbest/2; gap > 0; gap /= 2)
2032 for (i = gap; i < nbest; i++)
2033 for (j = i - gap; j >= 0; j-= gap) {
2034 if (bptr[j]->rst.score[irelv] >= bptr[j + gap]->rst.score[irelv]) break;
2035 tmp = bptr[j];
2036 bptr[j] = bptr[j + gap];
2037 bptr[j + gap] = tmp;
2038 }
2039 }
2040
show_aux(FILE * fp,struct beststr * bptr)2041 void show_aux(FILE *fp, struct beststr *bptr) {}
header_aux(FILE * fp)2042 void header_aux(FILE *fp) {}
2043
2044 #else
2045 /* this function is almost never called, thus a slow shell sort */
sortbest(bptr,nbest,irelv)2046 void sortbest (bptr, nbest, irelv)
2047 struct beststr **bptr;
2048 int nbest, irelv;
2049 {
2050 int gap, i, j;
2051 struct beststr *tmp;
2052
2053 for (gap = nbest/2; gap > 0; gap /= 2)
2054 for (i = gap; i < nbest; i++)
2055 for (j = i - gap; j >= 0; j-= gap) {
2056 if (bptr[j]->rst.escore < bptr[j + gap]->rst.escore) break;
2057 tmp = bptr[j];
2058 bptr[j] = bptr[j + gap];
2059 bptr[j + gap] = tmp;
2060 }
2061 }
2062
2063 #if defined(FASTS) || defined(FASTM)
2064
2065 /* this shuffle is for FASTS */
2066 /* convert ',' -> '\0', shuffle each of the substrings */
2067 void
qshuffle(unsigned char * aa0,int n0,int nm0,void * rand_state)2068 qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
2069
2070 unsigned char **aa0start, *aap, tmp;
2071 int i,j,k, ns;
2072
2073 if ((aa0start=(unsigned char **)calloc(nm0+1,
2074 sizeof(unsigned char *)))==NULL) {
2075 fprintf(stderr,"cannot calloc for qshuffle %d\n",nm0);
2076 exit(1);
2077 }
2078
2079 aa0start[0]=aa0;
2080 for (k=1,i=0; i<n0; i++) {
2081 if (aa0[i]==EOSEQ || aa0[i]==ESS) {
2082 aa0[i]='\0';
2083 aa0start[k++] = &aa0[i+1];
2084 }
2085 }
2086
2087 /* aa0start has the beginning of each substring */
2088 for (k=0; k<nm0; k++) {
2089 aap=aa0start[k];
2090 ns = strlen((const char *)aap);
2091 for (i=ns; i>1; i--) {
2092 j = my_nrand(i, rand_state);
2093 tmp = aap[j];
2094 aap[j] = aap[i-1];
2095 aap[i-1] = tmp;
2096 }
2097 aap[ns] = 0;
2098 }
2099
2100 for (k=1; k<nm0; k++) {
2101 /* aap = aa0start[k];
2102 while (*aap) fputc(pst.sq[*aap++],stderr);
2103 fputc('\n',stderr);
2104 */
2105 aa0start[k][-1]=ESS;
2106 }
2107
2108 free(aa0start);
2109 }
2110 #endif
2111
2112 #ifdef FASTF
qshuffle(unsigned char * aa0,int n0,int nm0,void * rand_state)2113 void qshuffle(unsigned char *aa0, int n0, int nm0, void *rand_state) {
2114
2115 int i, j, k, nmpos;
2116 unsigned char tmp;
2117 int nmoff;
2118
2119 nmoff = (n0 - nm0 - 1)/nm0 + 1;
2120
2121 for (i = nmoff-1 ; i > 0 ; i--) {
2122
2123 /* j = nrand(i); if (i == j) continue;*/ /* shuffle columns */
2124 j = (nmoff -1 ) - i;
2125 if (i <= j) break; /* reverse columns */
2126
2127 /* swap all i'th column residues for all j'th column residues */
2128 for(nmpos = 0, k = 0 ; k < nm0 ; k++, nmpos += nmoff+1 ) {
2129 tmp = aa0[nmpos + i];
2130 aa0[nmpos + i] = aa0[nmpos + j];
2131 aa0[nmpos + j] = tmp;
2132 }
2133 }
2134 }
2135 #endif
2136
2137
2138 /* show additional best_str values */
show_aux(FILE * fp,struct beststr * bptr)2139 void show_aux(FILE *fp, struct beststr *bptr) {
2140 fprintf(fp," %2d %3d",bptr->rst.segnum,bptr->rst.seglen);
2141 }
2142
header_aux(FILE * fp)2143 void header_aux(FILE *fp) {
2144 fprintf(fp, " sn sl");
2145 }
2146 #endif
2147
2148 void
fill_pam(int ** pam2p,int n0,int nsq,double ** freq2d,double scale)2149 fill_pam(int **pam2p, int n0, int nsq, double **freq2d, double scale) {
2150 int i, j, new_j;
2151 double freq;
2152
2153 /* fprintf(stderr, "scale: %g\n", scale); */
2154
2155 /* now fill in the pam matrix: */
2156 for (j = 1 ; j <=20 ; j++) {
2157 new_j = qascii[pssm_aa[j]];
2158 for (i = 0 ; i < n0 ; i++) {
2159 freq = scale * freq2d[i][j-1];
2160 if ( freq < 0.0) freq -= 0.5;
2161 else freq += 0.5;
2162 pam2p[i][new_j] = (int)(freq);
2163 }
2164 }
2165 }
2166
2167 double
get_lambda(int ** pam2p,int n0,int nsq,unsigned char * query)2168 get_lambda(int **pam2p, int n0, int nsq, unsigned char *query) {
2169 double lambda, H;
2170 double *pr, tot, sum;
2171 int i, ioff, j, min, max, q_i;
2172
2173 /* get min and max scores */
2174 min = BIGNUM;
2175 max = -BIGNUM;
2176 if(pam2p[0][1] == -BIGNUM) {
2177 ioff = 1;
2178 n0++;
2179 } else {
2180 ioff = 0;
2181 }
2182
2183 for (i = ioff ; i < n0 ; i++) {
2184 for (j = 1; j < nsq ; j++) {
2185 if (min > pam2p[i][j])
2186 min = pam2p[i][j];
2187 if (max < pam2p[i][j])
2188 max = pam2p[i][j];
2189 }
2190 }
2191
2192 /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */
2193
2194 if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {
2195 fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);
2196 exit(1);
2197 }
2198
2199 tot = (double) rrtotal * (double) rrtotal * (double) n0;
2200 for (i = ioff ; i < n0 ; i++) {
2201
2202 if (query[i] < 'A') {q_i = query[i];}
2203 else {q_i= aascii[query[i]];}
2204
2205 for (j = 1; j < nsq ; j++) {
2206 pr[pam2p[i][j] - min] +=
2207 (double) ((double) rrcounts[q_i] * (double) rrcounts[j]) / tot;
2208 }
2209 }
2210
2211 sum = 0.0;
2212 for(i = 0 ; i <= max-min ; i++) {
2213 sum += pr[i];
2214 /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */
2215 }
2216 /* fprintf(stderr, "sum: %g\n", sum); */
2217
2218 for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }
2219
2220 if (!karlin(min, max, pr, &lambda, &H)) {
2221 fprintf(stderr, "Karlin lambda estimation failed\n");
2222 }
2223
2224 /* fprintf(stderr, "lambda: %g\n", lambda); */
2225 free(pr);
2226
2227 return lambda;
2228 }
2229
2230 /*
2231 *aa0 - query sequence
2232 n0 - length
2233 pamscale - scaling for pam matrix - provided by apam.c, either
2234 0.346574 = ln(2)/2 (P120, BL62) or
2235 0.231049 = ln(2)/3 (P250, BL50)
2236 */
2237
2238 void
2239 scale_pssm(int **pssm2p, double **freq2d,
2240 unsigned char *query, int n0,
2241 int **pam2, double pamscale);
2242
2243 static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";
2244
2245 void
read_pssm(unsigned char * aa0,int n0,int nsq,double pamscale,FILE * fp,int pgpf_type,struct pstruct * ppst)2246 read_pssm(unsigned char *aa0, int n0, int nsq,
2247 double pamscale,
2248 FILE *fp, int pgpf_type, struct pstruct *ppst) {
2249 int i, j, len, k;
2250 int qi, rj; /* qi - index query; rj - index residues (1-20) */
2251 int **pam2p;
2252 int first, too_high;
2253 unsigned char *query, ctmp;
2254 char dline[512];
2255 double freq, **freq2d, lambda, new_lambda;
2256 double scale, scale_high, scale_low;
2257
2258 pam2p = ppst->pam2p[0];
2259
2260 if (pgpf_type == 0) {
2261
2262 if (1 != fread(&len, sizeof(int), 1, fp)) {
2263 fprintf(stderr, "error reading from checkpoint file: %d\n", len);
2264 exit(1);
2265 }
2266
2267 if (len != n0) {
2268 fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
2269 len,n0);
2270 exit(1);
2271 }
2272
2273 /* read over query sequence stored in BLAST profile */
2274 if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
2275 fprintf(stderr, "Couldn't allocate memory for query!\n");
2276 exit(1);
2277 }
2278
2279 if (len != fread(query, sizeof(char), len, fp)) {
2280 fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
2281 exit(1);
2282 }
2283 }
2284 else if (pgpf_type == 1) {
2285
2286 if ((fgets(dline,sizeof(dline),fp) == NULL) ||
2287 (1 != sscanf(dline, "%d",&len))) {
2288 fprintf(stderr, "error reading from checkpoint file: %d\n", len);
2289 exit(1);
2290 }
2291
2292 if(len != n0) {
2293 fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",
2294 len,n0);
2295 exit(1);
2296 }
2297
2298 /* read over query sequence stored in BLAST profile */
2299 if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {
2300 fprintf(stderr, "Couldn't allocate memory for query!\n");
2301 exit(1);
2302 }
2303
2304 if (fgets((char *)query,len+2,fp)==NULL) {
2305 fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);
2306 exit(1);
2307 }
2308 }
2309 else {
2310 fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type);
2311 exit(1);
2312 }
2313
2314 /* currently we don't do anything with query; ideally, we should
2315 check to see that it actually matches aa0 ... */
2316
2317 /* quick 2d array alloc: */
2318 if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {
2319 fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
2320 exit(1);
2321 }
2322
2323 if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) {
2324 fprintf(stderr, "Couldn't allocate memory for frequencies!\n");
2325 exit(1);
2326 }
2327
2328 /* a little pointer arithmetic to fill out 2d array: */
2329 for (i = 1 ; i < n0 ; i++) {
2330 freq2d[i] = freq2d[i-1] + 20;
2331 }
2332
2333 if (pgpf_type == 0) {
2334 for (qi = 0 ; qi < n0 ; qi++) {
2335 for (rj = 0 ; rj < 20 ; rj++) {
2336 if(1 != fread(&freq, sizeof(double), 1, fp)) {
2337 fprintf(stderr, "Error while reading frequencies!\n");
2338 exit(1);
2339 }
2340 freq2d[qi][rj] = freq;
2341 }
2342 }
2343 }
2344 else {
2345 for (qi = 0 ; qi < n0 ; qi++) {
2346 if ((fgets(dline,sizeof(dline),fp) ==NULL) ||
2347 (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n",
2348 &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4],
2349 &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9],
2350 &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14],
2351 &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) {
2352 fprintf(stderr, "Error while reading frequencies: %d read!\n",k);
2353 exit(1);
2354 }
2355 for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; } /* reverse scaling */
2356 }
2357 }
2358
2359 scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
2360
2361 free(freq2d[0]);
2362 free(freq2d);
2363
2364 free(query);
2365 }
2366
2367 /* before fasta-36.3.6 (with reordered amino-acid mapping), scale_pssm()
2368 simply produced a log(q_ij/p_j) and put it into pam2p.
2369
2370 But pssm's use pssm_aa encoding, while fasta-36.3.6 use NCBIstdaa
2371 encoding, so the pam2p must be re-mapped
2372 */
2373
2374 void
scale_pssm(int ** pssm2p,double ** freq2d,unsigned char * query,int n0,int ** pam2,double pamscale)2375 scale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) {
2376 int i, qi, rj;
2377 double freq, new_lambda, lambda;
2378 int first, too_high;
2379 double scale, scale_high, scale_low;
2380
2381 for (rj = 0 ; rj < 20 ; rj++) {
2382 for (qi = 0 ; qi < n0 ; qi++) {
2383 if (freq2d[qi][rj] > 1e-20) {
2384 freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal));
2385 freq /= pamscale; /* this gets us close to originial pam scores */
2386 freq2d[qi][rj] = freq;
2387 }
2388 else {
2389 /* when blastpgp decides to leave something out, it puts 0's in all the frequencies
2390 in the binary checkpoint file. In the ascii version, however, it uses BLOSUM62
2391 values. I will put in scoring matrix values as well */
2392
2393 if (query[qi] < 'A') {
2394 freq2d[qi][rj] = pam2[query[qi]][rj+1];
2395 }
2396 else {
2397 freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1];
2398 }
2399 }
2400 }
2401 }
2402
2403 /* now figure out the right scale */
2404 scale = 1.0;
2405 lambda = get_lambda(pam2, 20, 20, ustandard_aa);
2406
2407 /* should be near 1.0 because of our initial scaling by ppst->pamscale */
2408 /* fprintf(stderr, "real_lambda: %g\n", lambda); */
2409
2410 /* get initial high/low scale values: */
2411 first = 1;
2412 while (1) {
2413 fill_pam(pssm2p, n0, 20, freq2d, scale);
2414 new_lambda = get_lambda(pssm2p, n0, 20, query);
2415
2416 if (new_lambda > lambda) {
2417 if (first) {
2418 first = 0;
2419 scale = scale_high = 1.0 + 0.05;
2420 scale_low = 1.0;
2421 too_high = 1;
2422 } else {
2423 if (!too_high) break;
2424 scale = (scale_high += scale_high - 1.0);
2425 }
2426 } else if (new_lambda > 0) {
2427 if (first) {
2428 first = 0;
2429 scale_high = 1.0;
2430 scale = scale_low = 1.0 - 0.05;
2431 too_high = 0;
2432 } else {
2433 if (too_high) break;
2434 scale = (scale_low += scale_low - 1.0);
2435 }
2436 } else {
2437 fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);
2438 exit(1);
2439 }
2440 }
2441
2442 /* now do binary search between low and high */
2443 for (i = 0 ; i < 10 ; i++) {
2444 scale = 0.5 * (scale_high + scale_low);
2445 fill_pam(pssm2p, n0, 20, freq2d, scale);
2446 new_lambda = get_lambda(pssm2p, n0, 20, query);
2447
2448 if (new_lambda > lambda) scale_low = scale;
2449 else scale_high = scale;
2450 }
2451
2452 scale = 0.5 * (scale_high + scale_low);
2453 fill_pam(pssm2p, n0, 20, freq2d, scale);
2454
2455 /*
2456 fprintf(stderr, "final scale: %g\n", scale);
2457
2458 for (qi = 0 ; qi < n0 ; qi++) {
2459 fprintf(stderr, "%4d %c: ", qi+1, query[qi]);
2460 for (rj = 1 ; rj <= 20 ; rj++) {
2461 fprintf(stderr, "%4d", pssm2p[qi][rj]);
2462 }
2463 fprintf(stderr, "\n");
2464 }
2465 */
2466 }
2467
2468 #if defined(CAN_PSSM)
2469 int
2470 parse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,
2471 unsigned char **query, double ***freqs,
2472 char *matrix, int *gap_open, int *gap_extend,
2473 double *lambda);
2474
2475 /* the ASN.1 pssm includes information about the scoring matrix used
2476 (though not the gap penalty in the current version PSSM:2) The PSSM
2477 scoring matrix and gap penalties should become the default if they
2478 have not been set explicitly.
2479 */
2480
2481 /* read the PSSM from an open FILE *fp - but nothing has been read
2482 from *fp */
2483
2484 int
read_asn_pssm(unsigned char * aa0,int n0,int nsq,double pamscale,FILE * fp,struct pstruct * ppst)2485 read_asn_pssm(unsigned char *aa0, int n0, int nsq,
2486 double pamscale, FILE *fp, struct pstruct *ppst) {
2487
2488 int i, j, len, k;
2489 int qi, rj; /* qi - index query; rj - index residues (1-20) */
2490 int **pam2p;
2491 int first, too_high;
2492 unsigned char *query, ctmp;
2493 char dline[512];
2494 char matrix[MAX_SSTR];
2495 double psi2_lambda;
2496 double freq, **freq2d, lambda, new_lambda;
2497 double scale, scale_high, scale_low;
2498 int gap_open, gap_extend;
2499 int n_rows, n_cols;
2500
2501 pam2p = ppst->pam2p[0];
2502
2503 if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d,
2504 matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {
2505 return -1;
2506 }
2507
2508 if (query == NULL) { query = aa0;}
2509
2510 if (!gap_set) {
2511 if (gap_open) {
2512 if (gap_open > 0) {gap_open = -gap_open;}
2513 ppst->gdelval = gap_open;
2514 }
2515 else if (strncmp(matrix,"BLOSUM62",8)==0) {
2516 ppst->gdelval = -11;
2517 }
2518 gap_set = 1;
2519 }
2520 if (!del_set) {
2521 if (gap_extend) {
2522 if (gap_extend > 0) {gap_extend = -gap_extend;}
2523 ppst->ggapval = gap_extend;
2524 }
2525 else if (strncmp(matrix,"BLOSUM62",8)==0) {
2526 ppst->ggapval = -1;
2527 }
2528 del_set = 1;
2529 }
2530
2531 if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {
2532 SAFE_STRNCPY(ppst->pamfile, "BL62", 120);
2533 SAFE_STRNCPY(ppst->pamfile_save, ppst->pamfile, 120);
2534 standard_pam(ppst->pamfile,ppst,del_set, gap_set);
2535 if (!ppst->have_pam2) {
2536 alloc_pam (MAXSQ, MAXSQ, ppst);
2537 }
2538 init_pam2(ppst);
2539 ppst->pam_set = 1;
2540 }
2541
2542 if (n_cols < n0) {
2543 fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);
2544 exit(1);
2545 }
2546
2547 scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);
2548
2549 free(freq2d[0]);
2550 free(freq2d);
2551
2552 if (query != aa0) free(query);
2553 return 1;
2554 }
2555 #endif
2556
2557 /* last_params() sets up values in pstruct *ppst now that all
2558 parameters and data is available.
2559
2560 It:
2561 (1) moves m_msg->n0 to ppst->n0 for statistics calculations
2562 (2) sets ppst->nsq_e
2563 (3) reads the PSSM file if one is being used
2564 (4) calculates m_msg->nm0 for FASTF/S/M
2565 (5) determines statistical strategy for FASTF/S, sets last_calc_flg
2566 and qshuffle
2567 (6) lowers ktup for short sequences
2568 */
2569
2570 void
last_params(unsigned char * aa0,int n0,struct mngmsg * m_msp,struct pstruct * ppst)2571 last_params(unsigned char *aa0, int n0,
2572 struct mngmsg *m_msp,
2573 struct pstruct *ppst
2574 ) {
2575 int i, nsq;
2576 FILE *fp;
2577 int is_fastxy=0;
2578 int n0_eff;
2579 /* do_karlin_a() must be re-run everytime the scoring matrix changes */
2580 double *kar_p;
2581 double aa0_f[MAXSQ];
2582
2583 if (n0 < 0) { return;}
2584
2585
2586 n0_eff = m_msp->n0;
2587 ppst->n0 = m_msp->n0;
2588 #if !defined(TFAST) && (defined(FASTX) || defined(FASTY))
2589 n0_eff /= 3;
2590 is_fastxy = 1;
2591 #endif
2592
2593 /* reset the PAMFILE to the original value */
2594 if (strncmp(ppst->pamfile, ppst->pamfile_save,120)!=0) {
2595 SAFE_STRNCPY(ppst->pamfile, ppst->pamfile_save, 120);
2596 standard_pam(ppst->pamfile,ppst,del_set, gap_set);
2597 init_pam2(ppst);
2598 init_pamx(ppst);
2599 }
2600
2601 /* **************************************************************** */
2602 /* adjust scoring matrix for short protein/translated protein queries */
2603 /* **************************************************************** */
2604
2605 if (ppst->pam_variable) {
2606 if (min_pam_bits(n0_eff, DEF_MIN_BITS, ppst, del_set, gap_set)) {
2607 init_pam2(ppst);
2608 init_pamx(ppst);
2609 kar_p = NULL;
2610 init_karlin_a(ppst, aa0_f, &kar_p);
2611 do_karlin_a(ppst->pam2[0], ppst, aa0_f,
2612 kar_p, &m_msp->Lambda, &m_msp->K, &m_msp->H);
2613 ppst->pLambda = m_msp->Lambda;
2614 ppst->pK = m_msp->K;
2615 ppst->pH = m_msp->H;
2616 ppst->LK_set = 1;
2617 free(kar_p);
2618 }
2619 }
2620
2621 if (ppst->ext_sq_set) { ppst->nsq_e = nsq = 2*ppst->nsq; }
2622 else {ppst->nsq_e = nsq = ppst->nsq;}
2623
2624 #if defined(CAN_PSSM)
2625 ppst->pam2p[0] = alloc_pam2p(ppst->pam2p[0],n0,MAXSQ);
2626 ppst->pam2p[1] = alloc_pam2p(ppst->pam2p[1],n0,MAXSQ);
2627
2628 if (ppst->pam_pssm) {
2629 if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) {
2630 read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst);
2631 extend_pssm(aa0, n0, ppst);
2632 }
2633 else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) {
2634 read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst);
2635 extend_pssm(aa0, n0, ppst);
2636 }
2637 else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) {
2638 if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) {
2639 extend_pssm(aa0, n0, ppst);
2640 }
2641 else {
2642 fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile);
2643 ppst->pam_pssm = 0;
2644 return;
2645 }
2646 }
2647 else {
2648 fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile);
2649 ppst->pam_pssm = 0;
2650 return;
2651 }
2652 }
2653 #endif
2654
2655 #if defined(FASTF) || defined(FASTS) || defined(FASTM)
2656 m_msp->nm0 = 1;
2657 for (i=0; i<n0; i++)
2658 if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msp->nm0++;
2659
2660 /*
2661 for FASTS, we can do statistics in one of two different ways
2662 if there are <= 10 query fragments, then we calculate probabilistic
2663 scores for every library sequence. If there are > 10 fragments, this
2664 takes much too long and too much memory, so we use the old fashioned
2665 raw score only z-score normalized method initially, and then calculate
2666 the probabilistic scores for the best hits. To scale those scores, we
2667 also need a set of random probabilistic scores. So we do the qshuffle
2668 to get them.
2669
2670 For FASTF, precalculating probabilities is prohibitively expensive,
2671 so we never do it; FASTF always acts like FASTS with nfrags>10.
2672
2673 */
2674
2675 #if defined(FASTS) || defined(FASTM)
2676 if (m_msp->nm0 > 10) m_msp->escore_flg = 0;
2677 else m_msp->escore_flg = 1;
2678 #endif
2679
2680 if (m_msp->escore_flg && (ppst->zsflag&1)) {
2681 m_msp->last_calc_flg = 0;
2682 m_msp->qshuffle = 0;
2683 }
2684 else { /* need random query, second set of 2000 scores */
2685 m_msp->last_calc_flg = 1;
2686 m_msp->qshuffle = 1;
2687 }
2688 #else
2689 #ifndef LALIGN
2690 m_msp->last_calc_flg = 0;
2691 #else
2692 m_msp->last_calc_flg = 1; /* LALIGN needs last_calc for threshold */
2693 #endif
2694 m_msp->qshuffle = 0;
2695 m_msp->escore_flg = 0;
2696 m_msp->nm0 = 1;
2697 #endif
2698
2699 /* adjust the ktup if appropriate */
2700
2701 if (pgm_def_arr[ppst->pgm_id].ktup > 0) {
2702 if (!ktup_set ) {
2703 ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;
2704 if (m_msp->qdnaseq == SEQT_PROT || is_fastxy) {
2705 #if defined(FASTS) || defined(FASTM)
2706 if (n0_eff > 100 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
2707 #endif
2708 if (n0_eff <= 40 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
2709 }
2710 else if (m_msp->qdnaseq == SEQT_DNA || m_msp->qdnaseq == SEQT_RNA) {
2711 if (n0_eff <= 20 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;
2712 #if defined(FASTS) || defined(FASTM)
2713 /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */
2714 else ppst->param_u.fa.ktup = 2;
2715 #else
2716 else if (n0 < 50 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;
2717 else if (n0 < 100 && ppst->param_u.fa.ktup > 3) ppst->param_u.fa.ktup = 3;
2718 #endif
2719 }
2720 }
2721 /* regardless of ktup state */
2722 if (ppst->param_u.fa.use_E_thresholds) {
2723 ppst->param_u.fa.use_E_thresholds = ppst->LK_set;
2724 }
2725 if (!E_cgap_set) {
2726 ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * 5;
2727 }
2728 else {
2729 if (ppst->param_u.fa.E_join > 1.0) {
2730 ppst->param_u.fa.E_join = ppst->param_u.fa.E_band_opt * ppst->param_u.fa.E_join;
2731 }
2732 }
2733 }
2734 }
2735
2736 /* validate_params() exists because of bugs that keep appearing
2737
2738 (1) pam2[0][x][0] or pam2[0][0][x] are not -BIGNUM
2739 (2) sascii[] (or qascii[], lascii[]) have values outside nsq_e.
2740 */
2741
2742 int
validate_params(const unsigned char * aa0,int n0,const struct mngmsg * m_msg,const struct pstruct * ppst,const int * lascii,const int * pascii)2743 validate_params(const unsigned char *aa0, int n0,
2744 const struct mngmsg *m_msg,
2745 const struct pstruct *ppst,
2746 const int *lascii, const int *pascii) {
2747 int good_params = 1;
2748 int i;
2749
2750 /* check for -BIGNUM for boundaries of pam2[0][0:x][x:0] */
2751
2752 for (i=0; i< ppst->nsq; i++) {
2753 if (ppst->pam2[0][0][i] > -1000) {
2754 fprintf(stderr," *** ERROR *** pam2[0][0][%d/%c] == %d\n",
2755 i,NCBIstdaa[i],ppst->pam2[0][0][i]);
2756 good_params = 0;
2757 }
2758 if (ppst->pam2[0][i][0] > -1000) {
2759 fprintf(stderr," *** ERROR *** pam2[0][%d/%c][0] == %d\n",
2760 i,NCBIstdaa[i],ppst->pam2[0][i][0]);
2761 good_params = 0;
2762 }
2763 }
2764
2765 /* check for -BIGNUM for boundaries of pam2[1][0:x][x:0] */
2766 if (ppst->ext_sq_set) {
2767 for (i=0; i< ppst->nsqx; i++) {
2768 if (ppst->pam2[1][0][i] > -1000) {
2769 fprintf(stderr," *** ERROR *** pam2[1][0][%d] == %d\n",
2770 i,ppst->pam2[1][0][i]);
2771 good_params = 0;
2772 }
2773 if (ppst->pam2[1][i][0] > -1000) {
2774 fprintf(stderr," *** ERROR *** pam2[1][%d][0] == %d\n",
2775 i,ppst->pam2[1][i][0]);
2776 good_params = 0;
2777 }
2778 }
2779 }
2780
2781 /* check for valid residues in query */
2782 for (i=0; i<n0; i++) {
2783 if (aa0[i] > ppst->nsq_e && aa0[i] != ESS) {
2784 fprintf(stderr," *** ERROR *** aa0[%d] = %c[%d > %d] out of range\n",
2785 i, aa0[i], aa0[i], ppst->nsq_e);
2786 good_params = 0;
2787 }
2788 }
2789
2790 for (i=0; i<128; i++) {
2791 if (lascii[i] < NA && lascii[i] > ppst->nsq_e) {
2792 fprintf(stderr," *** ERROR *** lascii [%c|%d] = %d > %d out of range\n",
2793 i, i, lascii[i], ppst->nsq_e);
2794 good_params = 0;
2795 }
2796
2797 /* currently, pascii[] is not reset for upper-case only
2798 if (pascii[i] < NA && pascii[i] > ppst->nsq_e) {
2799 fprintf(stderr," *** WARNING *** pascii[%c|%d] = %d > %d out of range\n",
2800 i, i, pascii[i], ppst->nsq_e);
2801 }
2802 */
2803
2804 }
2805
2806 return good_params;
2807 }
2808
2809 /* given a good profile in ppst->pam2p[0], make an extended profile
2810 in ppst->pam2p[1]
2811 */
2812 void
extend_pssm(unsigned char * aa0,int n0,struct pstruct * ppst)2813 extend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) {
2814
2815 int i, j, nsq;
2816 int sa_x, sa_t, sa_b, sa_z, sa_j;
2817 int **pam2p0, **pam2p1;
2818
2819 nsq = ppst->nsq;
2820
2821 pam2p0 = ppst->pam2p[0];
2822 pam2p1 = ppst->pam2p[1];
2823
2824 sa_x = pascii['X'];
2825 sa_t = pascii['*'];
2826 if (sa_t >= ppst->nsq) {sa_t = sa_x;}
2827 sa_b = pascii['B'];
2828 sa_z = pascii['Z'];
2829 sa_j = pascii['J'];
2830
2831 /* fill in boundaries, B, Z, *, X */
2832 for (i=0; i<n0; i++) {
2833 pam2p0[i][0] = -BIGNUM;
2834 pam2p0[i][sa_b] = (int)
2835 (((float)pam2p0[i][pascii['N']]+(float)pam2p0[i][pascii['D']]+0.5)/2.0);
2836 pam2p0[i][sa_z] = (int)
2837 (((float)pam2p0[i][pascii['Q']]+(float)pam2p0[i][pascii['E']]+0.5)/2.0);
2838 pam2p0[i][sa_j] = (int)
2839 (((float)pam2p0[i][pascii['I']]+(float)pam2p0[i][pascii['L']]+0.5)/2.0);
2840 pam2p0[i][sa_x] = ppst->pam_xm;
2841 pam2p0[i][sa_t] = ppst->pam_xm;
2842 }
2843
2844 /* copy pam2p0 into pam2p1 */
2845 for (i=0; i<n0; i++) {
2846 pam2p1[i][0] = -BIGNUM;
2847 for (j=1; j<=ppst->nsq; j++) {
2848 pam2p1[i][j] = pam2p0[i][j];
2849 }
2850 }
2851
2852 /* then fill in extended characters, if necessary */
2853 if (ppst->ext_sq_set) {
2854 for (i=0; i<n0; i++) {
2855 for (j=1; j<=ppst->nsq; j++) {
2856 pam2p0[i][nsq+j] = pam2p0[i][j];
2857 pam2p1[i][nsq+j] = ppst->pam_xm;
2858 }
2859 }
2860 }
2861 }
2862
format_params(struct opt_def_str * opt_ptr,char * string)2863 void format_params(struct opt_def_str *opt_ptr, char *string) {
2864
2865 if (opt_ptr->opt_char == 'r') {
2866 sprintf(string, " [+%d/%d]", opt_ptr->i_param1, opt_ptr->i_param2);
2867 return;
2868 }
2869
2870 switch (opt_ptr->fmt_type) {
2871
2872 case 1:
2873 sprintf(string, " [%d]", opt_ptr->i_param1); break;
2874 case 2:
2875 sprintf(string, " [%d,%d]", opt_ptr->i_param1, opt_ptr->i_param2); break;
2876 case 3:
2877 sprintf(string, " [%.4g]", opt_ptr->d_param1); break;
2878 case 4:
2879 sprintf(string, " [%.4g,%.4g]", opt_ptr->d_param1, opt_ptr->d_param2); break;
2880 case 5:
2881 sprintf(string, " [%s]", opt_ptr->s_param); break;
2882 case 0:
2883 default:
2884 string[0] = '\0'; break;
2885 }
2886 }
2887
2888
2889 #if defined(FASTX) || defined(FASTY)
2890 static char *common_opts = "sfgjSEbdI";
2891 #else
2892 #if defined(LALIGN)
2893 static char *common_opts = "sfgEZI";
2894 #else
2895 static char *common_opts = "sfgSbdI";
2896 #endif
2897 #endif
2898
2899 void
show_help(char * pgm_name,int pgm_id)2900 show_help(char *pgm_name, int pgm_id) {
2901 int i, j;
2902 int opt_line_cnt=0;
2903 char tmp_string[MAX_STR];
2904 struct opt_def_str *opt_ptr;
2905
2906 printf("USAGE\n");
2907 #ifndef LALIGN
2908 printf(" %s [-options] query_file library_file",pgm_name);
2909 if (pgm_def_arr[pgm_id].ktup > 0) {
2910 printf(" [ktup]\n");
2911 }
2912 else {printf("\n");}
2913 #else
2914 printf(" %s [-options] seq_file1 seq_file2\n",pgm_name);
2915 #endif
2916 printf(" %s -help for a complete option list\n",pgm_name);
2917 printf("\nDESCRIPTION\n");
2918 printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
2919 printf("\n");
2920 printf("COMMON OPTIONS (options must preceed query_file library_file)\n");
2921
2922 for (i=0; i<strlen(common_opts); i++) {
2923 opt_ptr = g_options;
2924 for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
2925 if (common_opts[i]==opt_ptr[j].opt_char) {
2926 format_params(&opt_ptr[j], tmp_string);
2927 printf(" -%c%c %s %s;",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),
2928 tmp_string, opt_ptr[j].opt_descr_s);
2929 /* if ((++opt_line_cnt % 2) == 0) printf("\n"); */
2930 printf("\n");
2931 goto next_option;
2932 }
2933 }
2934
2935 opt_ptr = f_options;
2936 for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
2937 if (common_opts[i]==opt_ptr[j].opt_char) {
2938 format_params(&opt_ptr[j], tmp_string);
2939 printf(" -%c%c %s %s;",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),
2940 tmp_string, opt_ptr[j].opt_descr_s);
2941 /* if ((++opt_line_cnt % 2)==0) printf("\n"); */
2942 printf("\n");
2943 }
2944 }
2945 next_option: continue;
2946 }
2947 if ((opt_line_cnt % 2) != 0) printf("\n");
2948 exit(0);
2949 }
2950
2951 /* sorts a list of options, with upper and lower case characters
2952 sorted together */
sort_opt_list(char * v,int n)2953 void sort_opt_list(char *v, int n) {
2954 int gap, i, j, k;
2955 int incs[7] = { 336, 112, 48, 21, 7, 3, 1 };
2956 char tmp_c, tmp_u;
2957 int v_start;
2958
2959 /* first shell sort the list using toupper() */
2960 for ( k = 0; k < 7; k++) {
2961 gap = incs[k];
2962 for (i = gap; i < n; i++) {
2963 tmp_c = v[i];
2964 tmp_u = toupper(tmp_c);
2965 j = i;
2966 while (j >= gap && toupper(v[j - gap]) > tmp_u) {
2967 v[j] = v[j - gap];
2968 j -= gap;
2969 }
2970 v[j] = tmp_c;
2971 }
2972 }
2973 /* then sort the toupper(==) pairs lower-case, upper-case */
2974
2975 for (i=1; i<n; i++) {
2976 if (toupper(v[i])==toupper(v[i-1])) {
2977 if (v[i] > v[i-1]) { tmp_c = v[i]; v[i] = v[i-1]; v[i-1]=tmp_c;}
2978 }
2979 }
2980 }
2981
2982 char *
sort_options(struct opt_def_str * g_options,struct opt_def_str * f_options)2983 sort_options (struct opt_def_str *g_options, struct opt_def_str *f_options) {
2984 struct opt_def_str *this_option;
2985 char *sorted_list, *sort_ptr;
2986 int i, opt_count;
2987
2988 opt_count=0;
2989 this_option = g_options;
2990 while ((this_option++)->opt_char!='\0') { opt_count++;}
2991 this_option = f_options;
2992 while ((this_option++)->opt_char!='\0') { opt_count++;}
2993
2994 if ((sorted_list = (char *)calloc(opt_count+1, sizeof(char)))==NULL) {
2995 fprintf(stderr," cannot allocate sorted_list[%d]\n",opt_count+1);
2996 exit(1);
2997 }
2998
2999 sort_ptr = sorted_list;
3000 this_option = g_options;
3001 while (this_option->opt_char!='\0') {
3002 *sort_ptr++ = this_option->opt_char;
3003 this_option++;
3004 }
3005
3006 this_option = f_options;
3007 while (this_option->opt_char!='\0') {
3008 *sort_ptr++ = this_option->opt_char;
3009 this_option++;
3010 }
3011
3012 sort_opt_list(sorted_list, opt_count);
3013
3014 return sorted_list;
3015 }
3016
3017 void
show_all_help(char * pgm_name,int pgm_id)3018 show_all_help(char *pgm_name, int pgm_id) {
3019 int i, j;
3020 struct opt_def_str *opt_ptr;
3021 char tmp_string[MAX_STR];
3022 char *descr_ptr;
3023 char *sorted_list;
3024
3025 sorted_list = sort_options(g_options, f_options);
3026
3027 printf("USAGE\n");
3028 printf(" %s [-options] query_file library_file",pgm_name);
3029 if (pgm_def_arr[pgm_id].ktup > 0) {
3030 printf(" [ktup]\n");
3031 }
3032 else {printf("\n");}
3033
3034 printf(" \"@\" query_file uses stdin; query_file:begin-end sets subset range\n");
3035 printf(" library file formats: 0:FASTA; 1:GenBankFF; 3:EMBL_FF; 7:FASTQ; 10:subset; 12:NCBI blastdbcmd;\n");
3036 printf(" alternate library formats: \"library_file 7\" for 7:FASTQ\n");
3037
3038 printf("\nDESCRIPTION\n");
3039 printf(" %s\n version: %s\n",pgm_def_arr[pgm_id].iprompt0, verstr);
3040 printf("\n");
3041 printf("OPTIONS (options must preceed query_file library_file)\n");
3042
3043 for (i=0; i<strlen(sorted_list); i++) {
3044 opt_ptr = g_options;
3045 for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
3046 if (sorted_list[i]==opt_ptr[j].opt_char) {
3047 descr_ptr = (opt_ptr[j].opt_descr_l) ? opt_ptr[j].opt_descr_l : opt_ptr[j].opt_descr_s;
3048 format_params(&opt_ptr[j], tmp_string);
3049 printf(" -%c%c %s %s\n",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '),tmp_string, descr_ptr);
3050 goto next_option;
3051 }
3052 }
3053
3054 opt_ptr = f_options;
3055 for (j=0; opt_ptr[j].opt_char != '\0'; j++) {
3056 if (sorted_list[i]==opt_ptr[j].opt_char) {
3057 descr_ptr = (opt_ptr[j].opt_descr_l) ? opt_ptr[j].opt_descr_l : opt_ptr[j].opt_descr_s;
3058 format_params(&opt_ptr[j], tmp_string);
3059 printf(" -%c%c %s %s\n",opt_ptr[j].opt_char, (opt_ptr[j].has_arg? ':' : ' '), tmp_string, descr_ptr);
3060 }
3061 }
3062 next_option: continue;
3063 }
3064
3065 free(sorted_list);
3066
3067 exit(0);
3068 }
3069