1 /* $Id: blast_api.c,v 1.59 2016/06/21 13:53:41 madden Exp $
2 ***************************************************************************
3 * *
4 * COPYRIGHT NOTICE *
5 * *
6 * This software/database is categorized as "United States Government *
7 * Work" under the terms of the United States Copyright Act. It was *
8 * produced as part of the author's official duties as a Government *
9 * employee and thus can not be copyrighted. This software/database is *
10 * freely available to the public for use without a copyright notice. *
11 * Restrictions can not be placed on its present or future use. *
12 * *
13 * Although all reasonable efforts have been taken to ensure the accuracy *
14 * and reliability of the software and data, the National Library of *
15 * Medicine (NLM) and the U.S. Government do not and can not warrant the *
16 * performance or results that may be obtained by using this software, *
17 * data, or derivative works thereof. The NLM and the U.S. Government *
18 * disclaim any and all warranties, expressed or implied, as to the *
19 * performance, merchantability or fitness for any particular purpose or *
20 * use. *
21 * *
22 * In any work or product derived from this material, proper attribution *
23 * of the author(s) as the source of the software or data would be *
24 * appreciated. *
25 *
26 * ===========================================================================
27 *
28 * Author: Ilya Dondoshansky
29 *
30 * ===========================================================================
31 */
32
33 /** @file blast_api.c
34 * Functions for C toolkit applications to perform a BLAST search
35 * against a BLAST database, using the rewritten blast engine.
36 */
37
38 #include <algo/blast/api/blast_api.h>
39 #include <algo/blast/core/blast_setup.h>
40 #include <algo/blast/core/blast_filter.h>
41 #include <algo/blast/core/blast_util.h>
42 #include <algo/blast/core/blast_message.h>
43 #include <algo/blast/core/blast_engine.h>
44 #include <algo/blast/core/blast_traceback.h>
45 #include <algo/blast/core/blast_hspstream.h>
46 #include <algo/blast/core/blast_hspfilter.h>
47 #include <algo/blast/core/hspfilter_collector.h>
48 #include <algo/blast/api/hspfilter_queue.h>
49 #include <algo/blast/core/phi_lookup.h>
50 #include <algo/blast/core/blast_psi.h>
51 #include <algo/blast/api/blast_mtlock.h>
52 #include <algo/blast/api/blast_prelim.h>
53 #include <algo/blast/api/blast_seq.h>
54 #include <algo/blast/api/seqsrc_readdb.h>
55 #include <algo/blast/api/seqsrc_multiseq.h>
56 #include <algo/blast/api/blast_seqalign.h>
57 #include <algo/blast/api/dust_filter.h>
58 #include <algo/blast/api/blast_message_api.h>
59 #include <algo/blast/core/gencode_singleton.h>
60
61 /** @addtogroup CToolkitAlgoBlast
62 *
63 * @{
64 */
65
66 /** Initializes the auxiliary structure with RPS BLAST database information.
67 * @param ppinfo Resulting structure. [out]
68 * @param rps_mmap Memory mapped lookup table [out]
69 * @param rps_pssm_mmap Memory mapped PSSM [out]
70 * @param dbname Name of the database [in]
71 */
72 static Int2
s_BlastRPSInfoInit(BlastRPSInfo ** ppinfo,Nlm_MemMap ** rps_mmap,Nlm_MemMap ** rps_pssm_mmap,const char * dbname)73 s_BlastRPSInfoInit(BlastRPSInfo **ppinfo, Nlm_MemMap **rps_mmap,
74 Nlm_MemMap **rps_pssm_mmap, const char* dbname)
75 {
76 char filename[PATH_MAX];
77 char pathname[PATH_MAX];
78 BlastRPSInfo *info;
79 FILE *auxfile;
80 Int4 i;
81 Int4 seq_size;
82 Int4 num_db_seqs;
83 Nlm_MemMapPtr lut_mmap;
84 Nlm_MemMapPtr pssm_mmap;
85 char buffer[PATH_MAX];
86 ReadDBFILEPtr rdfp;
87 char *tmp_dbname;
88 Uint4 version;
89
90 info = (BlastRPSInfo *)malloc(sizeof(BlastRPSInfo));
91 if (info == NULL)
92 ErrPostEx(SEV_FATAL, 1, 0, "Memory allocation failed");
93
94 /* find the path to the RPS database */
95 tmp_dbname = strdup(dbname);
96 rdfp = readdb_new_ex2(tmp_dbname, READDB_DB_IS_PROT,
97 READDB_NEW_DO_REPORT, NULL, NULL);
98 sfree(tmp_dbname);
99 if (rdfp == NULL)
100 ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST database");
101 sprintf(pathname, "%s", rdfp->full_filename);
102 rdfp = readdb_destruct(rdfp);
103
104 sprintf(filename, "%s.loo", (char *)pathname);
105 lut_mmap = Nlm_MemMapInit(filename);
106 if (lut_mmap == NULL)
107 ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST lookup file");
108
109 info->lookup_header = (BlastRPSLookupFileHeader *)lut_mmap->mmp_begin;
110 version = info->lookup_header->magic_number;
111 if (version != RPS_MAGIC_NUM && version != RPS_MAGIC_NUM_28) {
112
113 version = Nlm_SwitchUint4(version);
114 if (version == RPS_MAGIC_NUM || version == RPS_MAGIC_NUM_28) {
115 ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST lookup file was created "
116 "on an incompatible platform");
117 }
118 else {
119 ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST lookup file is corrupt");
120 }
121 }
122
123 sprintf(filename, "%s.rps", (char *)pathname);
124 pssm_mmap = Nlm_MemMapInit(filename);
125 if (pssm_mmap == NULL)
126 ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST profile file");
127
128 info->profile_header = (BlastRPSProfileHeader *)pssm_mmap->mmp_begin;
129 version = info->profile_header->magic_number;
130 if (version != RPS_MAGIC_NUM && version != RPS_MAGIC_NUM_28) {
131
132 version = Nlm_SwitchUint4(version);
133 if (version == RPS_MAGIC_NUM || version == RPS_MAGIC_NUM_28) {
134 ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST profile file was created "
135 "on an incompatible platform");
136 }
137 else {
138 ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST profile file is corrupt");
139 }
140 }
141
142 num_db_seqs = info->profile_header->num_profiles;
143
144 sprintf(filename, "%s.aux", (char *)pathname);
145 auxfile = FileOpen(filename, "r");
146 if (auxfile == NULL)
147 ErrPostEx(SEV_FATAL, 1, 0,"Cannot open RPS BLAST parameters file");
148
149 fscanf(auxfile, "%s", buffer);
150 info->aux_info.orig_score_matrix = strdup(buffer);
151 fscanf(auxfile, "%d", &info->aux_info.gap_open_penalty);
152 fscanf(auxfile, "%d", &info->aux_info.gap_extend_penalty);
153 fscanf(auxfile, "%le", &info->aux_info.ungapped_k);
154 fscanf(auxfile, "%le", &info->aux_info.ungapped_h);
155 fscanf(auxfile, "%d", &info->aux_info.max_db_seq_length);
156 fscanf(auxfile, "%d", &info->aux_info.db_length);
157 fscanf(auxfile, "%lf", &info->aux_info.scale_factor);
158
159 info->aux_info.karlin_k = (double *)malloc(num_db_seqs * sizeof(double));
160 for (i = 0; i < num_db_seqs && !feof(auxfile); i++) {
161 fscanf(auxfile, "%d", &seq_size); /* not used */
162 fscanf(auxfile, "%le", &info->aux_info.karlin_k[i]);
163 }
164
165 if (i < num_db_seqs)
166 ErrPostEx(SEV_FATAL, 1, 0, "Missing Karlin parameters");
167
168 FileClose(auxfile);
169 *ppinfo = info;
170 *rps_mmap = lut_mmap;
171 *rps_pssm_mmap = pssm_mmap;
172 return 0;
173 }
174
175 /** Initializes and populates the RPS BLAST specific structures.
176 * @param seq_src Database sequences source [in]
177 * @param options All search options [in]
178 * @param rps_options Copy of options, with RPS-specific modifications for
179 * scoring and hit saving options. All other options pointers
180 * are left the same as in input "options". [out]
181 * @param rps_info_out Auxiliary structure with RPS-specific information [out]
182 * @param rps_mmap Memory mapped lookup table [out]
183 * @param rps_pssm_mmap Memory mapped PSSM [out]
184 * @param scale_factor Scaling factor for RPS matrix. [out]
185 * @param extra_returns Structure containing error information [in] [out]
186 * @return Status.
187 */
188 static Int2
s_RPSExtraStructsSetUp(const BlastSeqSrc * seq_src,const SBlastOptions * options,SBlastOptions ** rps_options,BlastRPSInfo ** rps_info_out,Nlm_MemMapPtr * rps_mmap,Nlm_MemMapPtr * rps_pssm_mmap,double * scale_factor,Blast_SummaryReturn * extra_returns)189 s_RPSExtraStructsSetUp(const BlastSeqSrc* seq_src, const SBlastOptions* options,
190 SBlastOptions* *rps_options, BlastRPSInfo* *rps_info_out,
191 Nlm_MemMapPtr *rps_mmap, Nlm_MemMapPtr *rps_pssm_mmap,
192 double *scale_factor, Blast_SummaryReturn* extra_returns)
193 {
194 const char* kDbName;
195 BlastRPSInfo* rps_info = NULL;
196 BlastScoringOptions* rps_score_options;
197 BlastHitSavingOptions* rps_hit_options;
198 Int2 status = 0;
199
200 /* The caller has already checked these, so we are just asserting it here. */
201 ASSERT(seq_src && options && extra_returns);
202
203 kDbName = BlastSeqSrcGetName(seq_src);
204
205 if (kDbName == NULL ||
206 (status = s_BlastRPSInfoInit(&rps_info, rps_mmap,
207 rps_pssm_mmap, kDbName)) != 0) {
208 SBlastMessageWrite(&extra_returns->error, SEV_WARNING, "RPS BLAST database setup failed", NULL, FALSE);
209 return status;
210 }
211 *rps_info_out = rps_info;
212 *scale_factor = rps_info->aux_info.scale_factor;
213 rps_score_options = (BlastScoringOptions*)
214 BlastMemDup(options->score_options, sizeof(BlastScoringOptions));
215 rps_hit_options = (BlastHitSavingOptions*)
216 BlastMemDup(options->hit_options, sizeof(BlastHitSavingOptions));
217 rps_score_options->gap_open =
218 rps_info->aux_info.gap_open_penalty;
219 rps_score_options->gap_extend =
220 rps_info->aux_info.gap_extend_penalty;
221 rps_score_options->matrix =
222 strdup(rps_info->aux_info.orig_score_matrix);
223
224 *rps_options = (SBlastOptions*) BlastMemDup(options, sizeof(SBlastOptions));
225 (*rps_options)->score_options = rps_score_options;
226 (*rps_options)->hit_options = rps_hit_options;
227
228 return 0;
229 }
230
231 /** Frees the RPS BLAST specific extra structures.
232 * @param rps_info Auxiliary structure with RPS-specific information [in]
233 * @param rps_mmap Memory mapped lookup table [in]
234 * @param rps_pssm_mmap Memory mapped PSSM [in]
235 * @param options Copy of the options wrapper structure, containing scoring
236 * and hit saving options, specially modified for RPS search.
237 * All other options are the same as in the original structure,
238 * so they should not be freed here. [in]
239 */
240 static void
s_RPSExtraStructsFree(BlastRPSInfo * rps_info,Nlm_MemMapPtr rps_mmap,Nlm_MemMapPtr rps_pssm_mmap,SBlastOptions * options)241 s_RPSExtraStructsFree(BlastRPSInfo* rps_info, Nlm_MemMapPtr rps_mmap,
242 Nlm_MemMapPtr rps_pssm_mmap, SBlastOptions* options)
243 {
244 Nlm_MemMapFini(rps_mmap);
245 Nlm_MemMapFini(rps_pssm_mmap);
246
247 if (rps_info) {
248 sfree(rps_info->aux_info.karlin_k);
249 sfree(rps_info->aux_info.orig_score_matrix);
250 sfree(rps_info);
251 }
252 if (options) {
253 if (options->score_options) {
254 sfree(options->score_options->matrix);
255 sfree(options->score_options);
256 }
257 sfree(options->hit_options);
258 sfree(options);
259 }
260 }
261
262 /** Sets up the HSP stream, depending on whether the search is single or
263 * multithreaded, and whether on-the-fly tabular output option is set.
264 */
265 static Int2
s_BlastHSPStreamSetUp(BLAST_SequenceBlk * query,BlastQueryInfo * query_info,const BlastSeqSrc * seq_src,const SBlastOptions * options,BlastScoreBlk * sbp,BlastTabularFormatData * tf_data,BlastHSPStream ** hsp_stream,Blast_SummaryReturn * extra_returns)266 s_BlastHSPStreamSetUp(BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
267 const BlastSeqSrc* seq_src, const SBlastOptions* options,
268 BlastScoreBlk* sbp, BlastTabularFormatData* tf_data,
269 BlastHSPStream* *hsp_stream,
270 Blast_SummaryReturn* extra_returns)
271 {
272 Int2 status = 0;
273 const Int4 kNumResults = query_info->num_queries;
274 BlastHSPWriterInfo* writer_info = NULL;
275
276 /* If any of the required inputs were NULL, the caller would have exited
277 before getting to this point. ASSERT this here. */
278 ASSERT(query && query_info && seq_src && options && sbp);
279
280 if (!tf_data) {
281 writer_info = BlastHSPCollectorInfoNew(
282 BlastHSPCollectorParamsNew(
283 options->hit_options,
284 options->ext_options->compositionBasedStats,
285 options->score_options->gapped_calculation));
286
287 *hsp_stream = BlastHSPStreamNew(
288 options->program,
289 options->ext_options,
290 FALSE,
291 kNumResults,
292 BlastHSPWriterNew(&writer_info, NULL, query));
293
294 if (options->num_cpus > 1) {
295 MT_LOCK lock = Blast_MT_LOCKInit();
296 BlastHSPStreamRegisterMTLock(*hsp_stream, lock);
297 }
298 } else {
299 writer_info = BlastHSPQueueInfoNew(BlastHSPQueueParamsNew());
300 *hsp_stream = BlastHSPStreamNew(
301 options->program,
302 options->ext_options,
303 FALSE,
304 kNumResults,
305 BlastHSPWriterNew(&writer_info, NULL, query));
306 Blast_TabularFormatDataSetUp(tf_data, options->program,
307 *hsp_stream, seq_src, query, query_info,
308 options->score_options, sbp, options->eff_len_options,
309 options->ext_options, options->hit_options,
310 options->db_options);
311 }
312 return status;
313 }
314
315 /** Starts and joins all threads performing a multi-threaded search, with or
316 * without on-the-fly output, or performs a single-threaded search.
317 */
318 static Int2
s_BlastThreadManager(BLAST_SequenceBlk * query,BlastQueryInfo * query_info,const BlastSeqSrc * seq_src,const SBlastOptions * options,LookupTableWrap * lookup_wrap,BlastScoreBlk * sbp,BlastHSPStream * hsp_stream,BlastRPSInfo * rps_info,BlastTabularFormatData * tf_data,BlastHSPResults ** results,Blast_SummaryReturn * extra_returns)319 s_BlastThreadManager(BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
320 const BlastSeqSrc* seq_src, const SBlastOptions* options,
321 LookupTableWrap* lookup_wrap, BlastScoreBlk* sbp,
322 BlastHSPStream* hsp_stream, BlastRPSInfo* rps_info,
323 BlastTabularFormatData* tf_data, BlastHSPResults **results,
324 Blast_SummaryReturn* extra_returns)
325 {
326 Int2 status = 0;
327 /* The options input cannot be NULL here. The program would have exited
328 before entering this function if it was. */
329 const BlastInitialWordOptions* word_options = options->word_options;
330 const BlastScoringOptions* score_options = options->score_options;
331 const BlastExtensionOptions* ext_options = options->ext_options;
332 const BlastHitSavingOptions* hit_options = options->hit_options;
333 const BlastEffectiveLengthsOptions* eff_len_options =
334 options->eff_len_options;
335 const PSIBlastOptions* psi_options = options->psi_options;
336 const BlastDatabaseOptions* db_options = options->db_options;
337 TNlmThread format_thread = NULL;
338 BlastDiagnostics* diagnostics = NULL;
339 const EBlastProgramType kProgram = options->program;
340 const int kNumCpus = options->num_cpus;
341
342 /* Assert that all required inputs are not NULL. They must be - otherwise
343 the program should have exited before entering this function. */
344 ASSERT(query && query_info && seq_src && lookup_wrap && sbp &&
345 hsp_stream && extra_returns);
346
347 BlastSeqSrcResetChunkIterator((BlastSeqSrc*) seq_src);
348
349 /* Start the formatting thread */
350 if(tf_data && NlmThreadsAvailable() &&
351 (format_thread =
352 NlmThreadCreate(Blast_TabularFormatThread, (void*) tf_data))
353 == NULL_thread) {
354 SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
355 "Cannot create thread for formatting tabular output\n", NULL, options->believe_query);
356 return -1;
357 }
358
359 if (NlmThreadsAvailable() && kNumCpus > 1) {
360 TNlmThread* thread_array =
361 (TNlmThread*) calloc(kNumCpus, sizeof(TNlmThread));
362 BlastPrelimSearchThreadData* search_data = NULL;
363 void* join_status = NULL;
364 int index;
365
366 diagnostics = Blast_DiagnosticsInitMT(Blast_MT_LOCKInit());
367
368 for (index = 0; index < kNumCpus; index++) {
369 search_data =
370 BlastPrelimSearchThreadDataInit(kProgram, query,
371 query_info, seq_src, lookup_wrap, score_options,
372 word_options, ext_options, hit_options, eff_len_options,
373 psi_options, db_options, sbp, diagnostics, hsp_stream);
374
375 thread_array[index] =
376 NlmThreadCreate(Blast_PrelimSearchThreadRun,
377 (void*) search_data);
378 }
379 for (index = 0; index < kNumCpus; index++)
380 NlmThreadJoin(thread_array[index], &join_status);
381
382 MemFree(thread_array);
383
384 if (!tf_data) {
385 SPHIPatternSearchBlk* pattern_blk = NULL;
386 if (Blast_ProgramIsPhiBlast(kProgram)) {
387 pattern_blk = (SPHIPatternSearchBlk*) lookup_wrap->lut;
388 pattern_blk->num_patterns_db =
389 (Int4)diagnostics->ungapped_stat->lookup_hits;
390 }
391
392 if ((status = Blast_RunTracebackSearch(kProgram, query,
393 query_info, seq_src, score_options,
394 ext_options, hit_options, eff_len_options,
395 db_options, psi_options, sbp, hsp_stream,
396 rps_info, pattern_blk, results, kNumCpus)) != 0) {
397 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
398 "Traceback engine failed\n", NULL, options->believe_query);
399 }
400 }
401 } else {
402 diagnostics = Blast_DiagnosticsInit();
403
404 if (tf_data) { /* Single thread, tabular */
405 if ((status =
406 Blast_RunPreliminarySearch(kProgram, query, query_info,
407 seq_src, score_options, sbp, lookup_wrap, word_options,
408 ext_options, hit_options, eff_len_options, psi_options,
409 db_options, hsp_stream, diagnostics)) != 0) {
410 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
411 "Preliminary search engine failed\n", NULL, options->believe_query);
412 }
413 } else { /* Single thread, non-tabular */
414 if ((status=Blast_RunFullSearch(kProgram, query, query_info,
415 seq_src, sbp, score_options, lookup_wrap,
416 word_options, ext_options, hit_options,
417 eff_len_options, psi_options, db_options, hsp_stream,
418 rps_info, diagnostics, results, 0, 0)) != 0) {
419 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
420 "Blast_RunFullSearch failed\n", NULL, options->believe_query);
421 }
422 }
423 }
424
425 if (tf_data) {
426 void* join_status = NULL;
427 BlastHSPStreamClose(hsp_stream);
428 NlmThreadJoin(format_thread, &join_status);
429 /* Free the internally allocated structures used for tabular
430 formatting. */
431 BlastTabularFormatDataClean(tf_data);
432 }
433
434 hsp_stream = BlastHSPStreamFree(hsp_stream);
435 Blast_SummaryReturnFill(kProgram, score_options, sbp, options->lookup_options,
436 word_options, ext_options, hit_options,
437 eff_len_options, options->query_options, query_info,
438 seq_src, &diagnostics, extra_returns);
439
440 return status;
441 }
442
443 /** GET_MATRIX_PATH callback to find the path to a specified matrix.
444 * Looks first in current directory, then one specified by
445 * .ncbirc, then in local data directory, then env
446 * variables.
447 * @param matrix_name name of the matrix (e.g., BLOSUM50) [in]
448 * @param is_prot protein matrix if TRUE [in]
449 * @return path to matrix if found, or NULL.
450 */
451 static char*
s_BlastFindMatrixPath(const char * matrix_name,Boolean is_prot)452 s_BlastFindMatrixPath(const char* matrix_name, Boolean is_prot)
453 {
454 char* matrix_path = NULL; /* return value. */
455 char buf_path[PATH_MAX]; /* Used for path without matrix filename. */
456 char buf_full[PATH_MAX]; /* used for full path with filename. */
457 char* ptr = NULL;
458
459 if (matrix_name == NULL)
460 return NULL;
461
462 /* current directory */
463 if (Nlm_FileLength((char*) matrix_name) > 0)
464 {
465 char buf_path_2[PATH_MAX];
466 Nlm_ProgramPath(buf_path, PATH_MAX);
467 ptr = StringRChr (buf_path, DIRDELIMCHR);
468 if (ptr != NULL)
469 *ptr = '\0';
470 sprintf(buf_path_2, "%s%s", buf_path, DIRDELIMSTR);
471 matrix_path = StringSave(buf_path_2);
472 return matrix_path;
473 }
474
475 /* local data directory. */
476 sprintf(buf_full, "data%s%s", DIRDELIMSTR, matrix_name);
477 if (Nlm_FileLength(buf_full) > 0)
478 {
479 char buf_path_2[PATH_MAX];
480 Nlm_ProgramPath(buf_path, PATH_MAX);
481 ptr = StringRChr (buf_path, DIRDELIMCHR);
482 if (ptr != NULL)
483 *ptr = '\0';
484 sprintf(buf_path_2, "%s%sdata%s", buf_path, DIRDELIMSTR, DIRDELIMSTR);
485 matrix_path = StringSave(buf_path_2);
486 return matrix_path;
487 }
488
489 if(FindPath("ncbi", "ncbi", "data", buf_path, PATH_MAX)) {
490 sprintf(buf_full, "%s%s", buf_path, matrix_name);
491 if(FileLength(buf_full) > 0) {
492 matrix_path = StringSave(buf_path);
493 return matrix_path;
494 } else {
495 char alphabet_type[3]; /* aa or nt */
496 if (is_prot)
497 Nlm_StringNCpy(alphabet_type, "aa", 2);
498 else
499 Nlm_StringNCpy(alphabet_type, "nt", 2);
500 alphabet_type[2] = NULLB;
501
502 sprintf(buf_full, "%s%s%s%s", buf_path,
503 alphabet_type, DIRDELIMSTR, matrix_name);
504 if(FileLength(buf_full) > 0)
505 {
506 matrix_path = StringSave(buf_path);
507 return matrix_path;
508 }
509 }
510 }
511
512 return NULL;
513 }
514
515
516 /**
517 * Read a checkpoint file and set the necessary structures in a
518 * BlastScoreBlk: the psi_matrix, kbp_psi[0], and kbp_gap_psi[0].
519 *
520 * @param sbp a BlastScoreBlk to receive a PSSM [in/out]
521 * @param query query sequence data
522 * @param psi_matrix_file checkpoint file to read
523 * @pcore_msg a pointer to receive error and warning messages
524 */
525 static int
s_SetupScoreBlkPssmFromChkpt(BlastScoreBlk * sbp,BLAST_SequenceBlk * query,Blast_PsiCheckpointLoc * psi_checkpoint,Blast_Message ** pcore_msg)526 s_SetupScoreBlkPssmFromChkpt(BlastScoreBlk * sbp,
527 BLAST_SequenceBlk * query,
528 Blast_PsiCheckpointLoc * psi_checkpoint,
529 Blast_Message* *pcore_msg)
530 {
531 int status = 0;
532 /* An intermediate representation of the PSSM data that is used
533 in PSIBlast routines */
534 PSIMatrix * pssm = NULL;
535 /* The actual PSSM that is saved in the BlastScoreBlk */
536 SPsiBlastScoreMatrix * psi_matrix = NULL;
537 size_t i, j;
538
539 psi_matrix = SPsiBlastScoreMatrixNew(query->length);
540 if (!psi_matrix) {
541 ErrPostEx(SEV_FATAL, 1, 0,
542 "Out-of-memory: cannot allocate a PSSM of length %d.\n",
543 query->length);
544 status = -1;
545 goto error_return;
546 }
547 status = Blast_PosReadCheckpoint(psi_matrix->freq_ratios,
548 query->length, query->sequence,
549 psi_checkpoint,
550 pcore_msg);
551 if (status != 0) {
552 goto error_return;
553 }
554 Blast_KarlinBlkCopy(psi_matrix->kbp, sbp->kbp_gap_std[0]);
555 status = PSICreatePssmFromFrequencyRatios(query->sequence,
556 query->length, sbp,
557 psi_matrix->freq_ratios,
558 kPSSM_NoImpalaScaling,
559 &pssm);
560 if (0 != status) {
561 goto error_return;
562 }
563 for (i = 0; i < psi_matrix->pssm->ncols; i++) {
564 for (j = 0; j < psi_matrix->pssm->nrows; j++) {
565 psi_matrix->pssm->data[i][j] = pssm->pssm[i][j];
566 }
567 }
568 PSIMatrixFree(pssm);
569 sbp->psi_matrix = psi_matrix;
570 return 0;
571 error_return:
572 if (psi_matrix)
573 SPsiBlastScoreMatrixFree(psi_matrix);
574 return status;
575 }
576
577
578 Int2
Blast_RunSearch(SeqLoc * query_seqloc,Blast_PsiCheckpointLoc * psi_checkpoint,BlastSeqSrc * seq_src,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,BlastHSPResults ** results,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)579 Blast_RunSearch(SeqLoc* query_seqloc,
580 Blast_PsiCheckpointLoc * psi_checkpoint,
581 BlastSeqSrc* seq_src,
582 SeqLoc* masking_locs,
583 const SBlastOptions* options,
584 BlastTabularFormatData* tf_data,
585 BlastHSPResults **results,
586 SeqLoc** filter_out,
587 Blast_SummaryReturn* extra_returns)
588 {
589 Int2 status = 0;
590 BLAST_SequenceBlk *query = NULL;
591 BlastQueryInfo* query_info = NULL;
592 double scale_factor = 1.0;
593 BlastSeqLoc* lookup_segments = NULL;
594 BlastScoreBlk* sbp = NULL;
595 LookupTableWrap* lookup_wrap = NULL;
596 BlastMaskLoc* mask_loc = NULL;
597 BlastHSPStream* hsp_stream = NULL;
598 const EBlastProgramType kProgram = options->program;
599 const Boolean kRpsBlast =
600 (kProgram == eBlastTypeRpsBlast ||
601 kProgram == eBlastTypeRpsTblastn);
602 BlastRPSInfo* rps_info = NULL;
603 Nlm_MemMapPtr rps_mmap = NULL;
604 Nlm_MemMapPtr rps_pssm_mmap = NULL;
605 const QuerySetUpOptions* query_options = options->query_options;
606 const LookupTableOptions* lookup_options = options->lookup_options;
607 const BlastScoringOptions* score_options = options->score_options;
608 const BlastHitSavingOptions* hit_options = options->hit_options;
609 SBlastOptions* rps_options = NULL;
610 const Boolean kPhiBlast = Blast_ProgramIsPhiBlast(kProgram);
611 const Uint1 kDeallocateMe = 253;
612 Blast_Message *core_msg = NULL;
613
614 if (!query_seqloc || !seq_src || !options || !extra_returns)
615 return -1;
616
617 if ((status =
618 BLAST_ValidateOptions(kProgram, options->ext_options, score_options,
619 lookup_options, options->word_options, hit_options,
620 &core_msg)) != 0) {
621 extra_returns->error = Blast_MessageToSBlastMessage(core_msg, NULL, NULL, options->believe_query);
622 core_msg = Blast_MessageFree(core_msg);
623
624 return status;
625 }
626
627 if (options->program == eBlastTypeBlastn)
628 {
629 SeqLoc* dust_mask = NULL; /* Dust mask locations */
630 Blast_FindDustSeqLoc(query_seqloc, options, &dust_mask);
631 /* Combine dust mask with lower case mask
632 The dust mask will be deallocated by the end of this function
633 though as it's copied in BLAST_MainSetUp
634 Not deallocating it will result in a memory leak if masking_locs
635 was NULL at the start of this function */
636 if (dust_mask)
637 {
638 SeqLoc* dust_mask_var = dust_mask;
639 while (dust_mask_var)
640 {
641 dust_mask_var->choice = kDeallocateMe;
642 dust_mask_var = dust_mask_var->next;
643 }
644 ValNodeLink(&masking_locs, dust_mask);
645 }
646 }
647
648 if (kRpsBlast) {
649 if ((status =
650 s_RPSExtraStructsSetUp(seq_src, options, &rps_options, &rps_info,
651 &rps_mmap, &rps_pssm_mmap, &scale_factor,
652 extra_returns)))
653 return status;
654 score_options = rps_options->score_options;
655 hit_options = rps_options->hit_options;
656 options = rps_options; /* This will not change the caller's pointer. */
657 }
658
659 if ((status = BLAST_SetUpQuery(kProgram, query_seqloc, query_options,
660 masking_locs, &query_info, &query))) {
661 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
662 "BLAST_SetUpQuery returned non-zero status\n", NULL, FALSE);
663 return status;
664 }
665
666 status =
667 BLAST_MainSetUp(kProgram, query_options, score_options, query,
668 query_info, scale_factor, &lookup_segments, &mask_loc,
669 &sbp, &core_msg, s_BlastFindMatrixPath);
670 if (core_msg)
671 {
672 extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
673 core_msg = Blast_MessageFree(core_msg);
674 }
675
676 if (status)
677 return status;
678
679 if (psi_checkpoint) {
680 core_msg = NULL;
681 status = s_SetupScoreBlkPssmFromChkpt(sbp, query, psi_checkpoint,
682 &core_msg);
683 if (core_msg) {
684 extra_returns->error =
685 Blast_MessageToSBlastMessage(core_msg, query_seqloc,
686 query_info,
687 options->believe_query);
688 core_msg = Blast_MessageFree(core_msg);
689 }
690 if (status)
691 return status;
692 }
693 if (filter_out) {
694 *filter_out =
695 BlastMaskLocToSeqLoc(kProgram, mask_loc, query_seqloc);
696 }
697
698 /* Mask locations in BlastMaskLoc form are no longer needed. */
699 BlastMaskLocFree(mask_loc);
700
701 if (masking_locs)
702 {
703 SeqLocPtr slp_var = masking_locs;
704 SeqLocPtr last = NULL;
705 while (slp_var)
706 {
707 if (slp_var->choice == kDeallocateMe)
708 {
709 if (last == NULL)
710 {
711 masking_locs = slp_var->next;
712 slp_var->next = NULL;
713 Blast_ValNodeMaskListFree(slp_var);
714 slp_var = masking_locs;
715 }
716 else
717 {
718 last->next = slp_var->next;
719 slp_var->next = NULL;
720 Blast_ValNodeMaskListFree(slp_var);
721 slp_var = last->next;
722 }
723 }
724 else
725 {
726 last = slp_var;
727 slp_var = slp_var->next;
728 }
729 }
730 }
731
732 status = LookupTableWrapInit(query, lookup_options, query_options,
733 lookup_segments, sbp, &lookup_wrap, rps_info, &core_msg, seq_src);
734 if (core_msg)
735 {
736 extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
737 core_msg = Blast_MessageFree(core_msg);
738 }
739 if (status)
740 return status;
741
742 /* For PHI BLAST, save information about pattern occurrences in
743 query in the BlastQueryInfo structure. */
744 if (kPhiBlast) {
745 SPHIPatternSearchBlk* pattern_blk =
746 (SPHIPatternSearchBlk*) lookup_wrap->lut;
747 Blast_SetPHIPatternInfo(kProgram, pattern_blk, query, lookup_segments,
748 query_info, &core_msg);
749 if (core_msg)
750 {
751 extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
752 core_msg = Blast_MessageFree(core_msg);
753 }
754
755 }
756 /* Only need for the setup of lookup table. */
757 lookup_segments = BlastSeqLocFree(lookup_segments);
758
759 if ((status = s_BlastHSPStreamSetUp(query, query_info, seq_src, options, sbp,
760 tf_data, &hsp_stream, extra_returns)))
761 return status;
762
763 if ((status = s_BlastThreadManager(query, query_info, seq_src, options,
764 lookup_wrap, sbp, hsp_stream, rps_info,
765 tf_data, results, extra_returns)))
766 return status;
767
768 lookup_wrap = LookupTableWrapFree(lookup_wrap);
769
770 query = BlastSequenceBlkFree(query);
771 query_info = BlastQueryInfoFree(query_info);
772 BlastScoreBlkFree(sbp);
773
774 if (kRpsBlast)
775 s_RPSExtraStructsFree(rps_info, rps_mmap, rps_pssm_mmap, rps_options);
776
777 return status;
778 }
779
780 Int2
Blast_DatabaseSearch(SeqLoc * query_seqloc,Blast_PsiCheckpointLoc * psi_checkpoint,char * db_name,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,SBlastSeqalignArray ** seqalign_arr,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)781 Blast_DatabaseSearch(SeqLoc* query_seqloc,
782 Blast_PsiCheckpointLoc * psi_checkpoint,
783 char* db_name,
784 SeqLoc* masking_locs,
785 const SBlastOptions* options,
786 BlastTabularFormatData* tf_data,
787 SBlastSeqalignArray* *seqalign_arr,
788 SeqLoc** filter_out,
789 Blast_SummaryReturn* extra_returns)
790 {
791 BlastSeqSrc *seq_src = NULL;
792 Boolean db_is_prot;
793 Int2 status = 0;
794 BlastHSPResults* results = NULL;
795 ReadDBFILE* rdfp = NULL;
796
797 if (!options || !query_seqloc || !db_name || !extra_returns)
798 return -1;
799
800 db_is_prot =
801 (options->program == eBlastTypeBlastp ||
802 options->program == eBlastTypeBlastx ||
803 options->program == eBlastTypeRpsBlast ||
804 options->program == eBlastTypeRpsTblastn);
805
806 rdfp = readdb_new(db_name, db_is_prot);
807
808 seq_src = ReaddbBlastSeqSrcAttach(rdfp);
809
810 if (seq_src == NULL) {
811 SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
812 "Initialization of subject sequences source failed",
813 NULL, options->believe_query);
814 } else if (BlastSeqSrcGetNumSeqs(seq_src) == 0) {
815 SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
816 "Database is empty", NULL, options->believe_query);
817 } else {
818 char* error_str = BlastSeqSrcGetInitError(seq_src);
819 if (error_str)
820 SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
821 }
822
823 /* If there was an error initializing the sequence source, return without
824 doing the search. */
825 if (extra_returns->error)
826 return -1;
827
828 status =
829 Blast_RunSearch(query_seqloc, psi_checkpoint, seq_src,
830 masking_locs, options, tf_data, &results,
831 filter_out, extra_returns);
832
833 /* The ReadDBFILE structure will not be destroyed here, because the
834 initialising function used readdb_attach */
835 BlastSeqSrcFree(seq_src);
836
837 if (!status && !tf_data) {
838 status =
839 BLAST_ResultsToSeqAlign(options->program, &results,
840 query_seqloc, rdfp, NULL,
841 options->score_options->gapped_calculation,
842 options->score_options->is_ooframe,
843 seqalign_arr);
844 }
845
846 readdb_destruct(rdfp);
847
848 if (status)
849 return status;
850
851 return status;
852 }
853
854 /** Splits the PHI BLAST results corresponding to different pattern occurrences
855 * in query, converts them to Seq-aligns and puts in a list of ValNodes.
856 * @param results All results from different pattern occurrences
857 * mixed together. On return points to NULL. [in]
858 * @param pattern_info Query pattern occurrences information [in]
859 * @param program Program type (phiblastp or phiblastn) [in]
860 * @param query_seqloc List of query locations [in]
861 * @param rdfp blast db object [in]
862 * @param phivnps List of ValNodes containing Seq-aligns. [out]
863 * @return Status, 0 on success, -1 on failure.
864 */
865 static Int2
s_PHIResultsToSeqAlign(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info,EBlastProgramType program,SeqLoc * query_seqloc,ReadDBFILE * rdfp,ValNode ** phivnps)866 s_PHIResultsToSeqAlign(const BlastHSPResults* results,
867 const SPHIQueryInfo* pattern_info,
868 EBlastProgramType program, SeqLoc* query_seqloc,
869 ReadDBFILE* rdfp, ValNode* *phivnps)
870 {
871 Int2 status = 0;
872 /* Split results into an array of BlastHSPResults structures corresponding
873 to different pattern occurrences. */
874 BlastHSPResults* *phi_results =
875 PHIBlast_HSPResultsSplit(results, pattern_info);
876
877 if (phi_results) {
878 int pattern_index; /* Index over pattern occurrences. */
879
880 for (pattern_index = 0; pattern_index < pattern_info->num_patterns;
881 ++pattern_index) {
882 SBlastSeqalignArray* seqalign_arr = NULL;
883 SeqAlign* seqalign = NULL;
884 BlastHSPResults* one_phi_results = phi_results[pattern_index];
885
886 if (one_phi_results) {
887 /* PHI BLAST is always gapped, and never out-of-frame, hence
888 * TRUE and FALSE values for the respective booleans in the next
889 * call.
890 */
891 status =
892 BLAST_ResultsToSeqAlign(program, &one_phi_results,
893 query_seqloc, rdfp, NULL, TRUE,
894 FALSE, &seqalign_arr);
895 if (seqalign_arr)
896 {
897 seqalign = seqalign_arr->array[0];
898 seqalign_arr->array[0] = NULL;
899 SBlastSeqalignArrayFree(seqalign_arr);
900 }
901 ValNodeAddPointer(phivnps, pattern_index, seqalign);
902 }
903 }
904 sfree(phi_results);
905 }
906 return status;
907 }
908
909 Int2
PHIBlastRunSearch(SeqLoc * query_seqloc,char * db_name,SeqLoc * masking_locs,const SBlastOptions * options,ValNode ** phivnps,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)910 PHIBlastRunSearch(SeqLoc* query_seqloc, char* db_name, SeqLoc* masking_locs,
911 const SBlastOptions* options, ValNode* *phivnps,
912 SeqLoc** filter_out, Blast_SummaryReturn* extra_returns)
913 {
914 BlastSeqSrc *seq_src = NULL;
915 Boolean is_prot;
916 Int2 status = 0;
917 BlastHSPResults* results = NULL;
918 ReadDBFILE* rdfp = NULL;
919
920 if (!options || !query_seqloc || !db_name || !extra_returns || !phivnps)
921 return -1;
922
923 ASSERT(Blast_ProgramIsPhiBlast(options->program));
924
925 is_prot = (options->program == eBlastTypePhiBlastp);
926
927 rdfp = readdb_new(db_name, is_prot);
928
929 seq_src = ReaddbBlastSeqSrcAttach(rdfp);
930
931 if (seq_src == NULL) {
932 SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
933 "Initialization of subject sequences source failed", NULL, options->believe_query);
934 } else {
935 char* error_str = BlastSeqSrcGetInitError(seq_src);
936 if (error_str)
937 SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
938 }
939
940 /* If there was an error initializing the sequence source, return without
941 doing the search. */
942 if (extra_returns->error)
943 return -1;
944
945 /* Masking at hash and on-the-fly tabular output are not applicable for
946 PHI BLAST, so pass NULL in corresponding arguments. */
947 status =
948 Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL,
949 seq_src, masking_locs, options,
950 (BlastTabularFormatData*) NULL, &results,
951 filter_out, extra_returns);
952
953 /* The ReadDBFILE structure will not be destroyed here, because the
954 initialising function used readdb_attach */
955 BlastSeqSrcFree(seq_src);
956
957 *phivnps = NULL;
958
959 if (!status) {
960 status =
961 s_PHIResultsToSeqAlign(results, extra_returns->pattern_info,
962 options->program, query_seqloc, rdfp,
963 phivnps);
964 }
965
966 results = Blast_HSPResultsFree(results);
967
968 readdb_destruct(rdfp);
969 return status;
970 }
971
972 Int2
Blast_TwoSeqLocSetsAdvanced(SeqLoc * query_seqloc,SeqLoc * subject_seqloc,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,SBlastSeqalignArray ** seqalign_arr,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)973 Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
974 SeqLoc* subject_seqloc,
975 SeqLoc* masking_locs,
976 const SBlastOptions* options,
977 BlastTabularFormatData* tf_data,
978 SBlastSeqalignArray* *seqalign_arr,
979 SeqLoc** filter_out,
980 Blast_SummaryReturn* extra_returns)
981 {
982 BlastSeqSrc *seq_src = NULL;
983 Int2 status = 0;
984 BlastHSPResults* results = NULL;
985
986
987 if (!options || !query_seqloc || !subject_seqloc || !extra_returns)
988 return -1;
989
990
991 seq_src = MultiSeqBlastSeqSrcInit(subject_seqloc, options->program);
992
993 if (seq_src == NULL) {
994 SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
995 "Initialization of subject sequences source failed", NULL, options->believe_query);
996 } else {
997 char* error_str = BlastSeqSrcGetInitError(seq_src);
998 if (error_str)
999 SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
1000 }
1001
1002 /* If there was an error initializing the sequence source, return without
1003 doing the search. */
1004 if (extra_returns->error)
1005 return -1;
1006
1007 status =
1008 Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL,
1009 seq_src, masking_locs, options, tf_data,
1010 &results, filter_out, extra_returns);
1011
1012 /* The ReadDBFILE structure will not be destroyed here, because the
1013 initialising function used readdb_attach */
1014 BlastSeqSrcFree(seq_src);
1015
1016 if (!status) {
1017 status =
1018 BLAST_ResultsToSeqAlign(options->program, &results, query_seqloc,
1019 NULL, subject_seqloc,
1020 options->score_options->gapped_calculation,
1021 options->score_options->is_ooframe,
1022 seqalign_arr);
1023 }
1024
1025 if (status)
1026 return status;
1027
1028 return status;
1029 }
1030
GeneticCodeSingletonInit()1031 void GeneticCodeSingletonInit()
1032 {
1033 Uint1* gc = NULL;
1034 GenCodeSingletonInit();
1035 BLAST_GeneticCodeFind(BLAST_GENETIC_CODE, &gc);
1036 GenCodeSingletonAdd(BLAST_GENETIC_CODE, gc);
1037 free(gc);
1038 }
1039
GeneticCodeSingletonFini()1040 void GeneticCodeSingletonFini()
1041 {
1042 GenCodeSingletonFini();
1043 }
1044
1045 /* @} */
1046
1047
1048
1049