1 /* $Id: blast_api.c,v 1.59 2016/06/21 13:53:41 madden Exp $
2 ***************************************************************************
3 *                                                                         *
4 *                             COPYRIGHT NOTICE                            *
5 *                                                                         *
6 * This software/database is categorized as "United States Government      *
7 * Work" under the terms of the United States Copyright Act.  It was       *
8 * produced as part of the author's official duties as a Government        *
9 * employee and thus can not be copyrighted.  This software/database is    *
10 * freely available to the public for use without a copyright notice.      *
11 * Restrictions can not be placed on its present or future use.            *
12 *                                                                         *
13 * Although all reasonable efforts have been taken to ensure the accuracy  *
14 * and reliability of the software and data, the National Library of       *
15 * Medicine (NLM) and the U.S. Government do not and can not warrant the   *
16 * performance or results that may be obtained by using this software,     *
17 * data, or derivative works thereof.  The NLM and the U.S. Government     *
18 * disclaim any and all warranties, expressed or implied, as to the        *
19 * performance, merchantability or fitness for any particular purpose or   *
20 * use.                                                                    *
21 *                                                                         *
22 * In any work or product derived from this material, proper attribution   *
23 * of the author(s) as the source of the software or data would be         *
24 * appreciated.                                                            *
25 *
26 * ===========================================================================
27 *
28 * Author:  Ilya Dondoshansky
29 *
30 * ===========================================================================
31 */
32 
33 /** @file blast_api.c
34  * Functions for C toolkit applications to perform a BLAST search
35  * against a BLAST database, using the rewritten blast engine.
36  */
37 
38 #include <algo/blast/api/blast_api.h>
39 #include <algo/blast/core/blast_setup.h>
40 #include <algo/blast/core/blast_filter.h>
41 #include <algo/blast/core/blast_util.h>
42 #include <algo/blast/core/blast_message.h>
43 #include <algo/blast/core/blast_engine.h>
44 #include <algo/blast/core/blast_traceback.h>
45 #include <algo/blast/core/blast_hspstream.h>
46 #include <algo/blast/core/blast_hspfilter.h>
47 #include <algo/blast/core/hspfilter_collector.h>
48 #include <algo/blast/api/hspfilter_queue.h>
49 #include <algo/blast/core/phi_lookup.h>
50 #include <algo/blast/core/blast_psi.h>
51 #include <algo/blast/api/blast_mtlock.h>
52 #include <algo/blast/api/blast_prelim.h>
53 #include <algo/blast/api/blast_seq.h>
54 #include <algo/blast/api/seqsrc_readdb.h>
55 #include <algo/blast/api/seqsrc_multiseq.h>
56 #include <algo/blast/api/blast_seqalign.h>
57 #include <algo/blast/api/dust_filter.h>
58 #include <algo/blast/api/blast_message_api.h>
59 #include <algo/blast/core/gencode_singleton.h>
60 
61 /** @addtogroup CToolkitAlgoBlast
62  *
63  * @{
64  */
65 
66 /** Initializes the auxiliary structure with RPS BLAST database information.
67  * @param ppinfo Resulting structure. [out]
68  * @param rps_mmap Memory mapped lookup table [out]
69  * @param rps_pssm_mmap Memory mapped PSSM [out]
70  * @param dbname Name of the database [in]
71  */
72 static Int2
s_BlastRPSInfoInit(BlastRPSInfo ** ppinfo,Nlm_MemMap ** rps_mmap,Nlm_MemMap ** rps_pssm_mmap,const char * dbname)73 s_BlastRPSInfoInit(BlastRPSInfo **ppinfo, Nlm_MemMap **rps_mmap,
74                    Nlm_MemMap **rps_pssm_mmap, const char* dbname)
75 {
76    char filename[PATH_MAX];
77    char pathname[PATH_MAX];
78    BlastRPSInfo *info;
79    FILE *auxfile;
80    Int4 i;
81    Int4 seq_size;
82    Int4 num_db_seqs;
83    Nlm_MemMapPtr lut_mmap;
84    Nlm_MemMapPtr pssm_mmap;
85    char buffer[PATH_MAX];
86    ReadDBFILEPtr rdfp;
87    char *tmp_dbname;
88    Uint4 version;
89 
90    info = (BlastRPSInfo *)malloc(sizeof(BlastRPSInfo));
91    if (info == NULL)
92       ErrPostEx(SEV_FATAL, 1, 0, "Memory allocation failed");
93 
94    /* find the path to the RPS database */
95    tmp_dbname = strdup(dbname);
96    rdfp = readdb_new_ex2(tmp_dbname, READDB_DB_IS_PROT,
97                          READDB_NEW_DO_REPORT, NULL, NULL);
98    sfree(tmp_dbname);
99    if (rdfp == NULL)
100       ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST database");
101    sprintf(pathname, "%s", rdfp->full_filename);
102    rdfp = readdb_destruct(rdfp);
103 
104    sprintf(filename, "%s.loo", (char *)pathname);
105    lut_mmap = Nlm_MemMapInit(filename);
106    if (lut_mmap == NULL)
107       ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST lookup file");
108 
109    info->lookup_header = (BlastRPSLookupFileHeader *)lut_mmap->mmp_begin;
110    version = info->lookup_header->magic_number;
111    if (version != RPS_MAGIC_NUM && version != RPS_MAGIC_NUM_28) {
112 
113       version = Nlm_SwitchUint4(version);
114       if (version == RPS_MAGIC_NUM || version == RPS_MAGIC_NUM_28) {
115          ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST lookup file was created "
116                            "on an incompatible platform");
117       }
118       else {
119          ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST lookup file is corrupt");
120       }
121    }
122 
123    sprintf(filename, "%s.rps", (char *)pathname);
124    pssm_mmap = Nlm_MemMapInit(filename);
125    if (pssm_mmap == NULL)
126       ErrPostEx(SEV_FATAL, 1, 0, "Cannot map RPS BLAST profile file");
127 
128    info->profile_header = (BlastRPSProfileHeader *)pssm_mmap->mmp_begin;
129    version = info->profile_header->magic_number;
130    if (version != RPS_MAGIC_NUM && version != RPS_MAGIC_NUM_28) {
131 
132       version = Nlm_SwitchUint4(version);
133       if (version == RPS_MAGIC_NUM || version == RPS_MAGIC_NUM_28) {
134          ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST profile file was created "
135                            "on an incompatible platform");
136       }
137       else {
138          ErrPostEx(SEV_FATAL, 1, 0, "RPS BLAST profile file is corrupt");
139       }
140    }
141 
142    num_db_seqs = info->profile_header->num_profiles;
143 
144    sprintf(filename, "%s.aux", (char *)pathname);
145    auxfile = FileOpen(filename, "r");
146    if (auxfile == NULL)
147       ErrPostEx(SEV_FATAL, 1, 0,"Cannot open RPS BLAST parameters file");
148 
149    fscanf(auxfile, "%s", buffer);
150    info->aux_info.orig_score_matrix = strdup(buffer);
151    fscanf(auxfile, "%d", &info->aux_info.gap_open_penalty);
152    fscanf(auxfile, "%d", &info->aux_info.gap_extend_penalty);
153    fscanf(auxfile, "%le", &info->aux_info.ungapped_k);
154    fscanf(auxfile, "%le", &info->aux_info.ungapped_h);
155    fscanf(auxfile, "%d", &info->aux_info.max_db_seq_length);
156    fscanf(auxfile, "%d", &info->aux_info.db_length);
157    fscanf(auxfile, "%lf", &info->aux_info.scale_factor);
158 
159    info->aux_info.karlin_k = (double *)malloc(num_db_seqs * sizeof(double));
160    for (i = 0; i < num_db_seqs && !feof(auxfile); i++) {
161       fscanf(auxfile, "%d", &seq_size); /* not used */
162       fscanf(auxfile, "%le", &info->aux_info.karlin_k[i]);
163    }
164 
165    if (i < num_db_seqs)
166       ErrPostEx(SEV_FATAL, 1, 0, "Missing Karlin parameters");
167 
168    FileClose(auxfile);
169    *ppinfo = info;
170    *rps_mmap = lut_mmap;
171    *rps_pssm_mmap = pssm_mmap;
172    return 0;
173 }
174 
175 /** Initializes and populates the RPS BLAST specific structures.
176  * @param seq_src Database sequences source [in]
177  * @param options All search options [in]
178  * @param rps_options Copy of options, with RPS-specific modifications for
179  *                    scoring and hit saving options. All other options pointers
180  *                    are left the same as in input "options". [out]
181  * @param rps_info_out Auxiliary structure with RPS-specific information [out]
182  * @param rps_mmap Memory mapped lookup table [out]
183  * @param rps_pssm_mmap Memory mapped PSSM [out]
184  * @param scale_factor Scaling factor for RPS matrix. [out]
185  * @param extra_returns Structure containing error information [in] [out]
186  * @return Status.
187  */
188 static Int2
s_RPSExtraStructsSetUp(const BlastSeqSrc * seq_src,const SBlastOptions * options,SBlastOptions ** rps_options,BlastRPSInfo ** rps_info_out,Nlm_MemMapPtr * rps_mmap,Nlm_MemMapPtr * rps_pssm_mmap,double * scale_factor,Blast_SummaryReturn * extra_returns)189 s_RPSExtraStructsSetUp(const BlastSeqSrc* seq_src, const SBlastOptions* options,
190                        SBlastOptions* *rps_options, BlastRPSInfo* *rps_info_out,
191                        Nlm_MemMapPtr *rps_mmap, Nlm_MemMapPtr *rps_pssm_mmap,
192                        double *scale_factor, Blast_SummaryReturn* extra_returns)
193 {
194     const char* kDbName;
195     BlastRPSInfo* rps_info = NULL;
196     BlastScoringOptions* rps_score_options;
197     BlastHitSavingOptions* rps_hit_options;
198     Int2 status = 0;
199 
200     /* The caller has already checked these, so we are just asserting it here. */
201     ASSERT(seq_src && options && extra_returns);
202 
203     kDbName = BlastSeqSrcGetName(seq_src);
204 
205     if (kDbName == NULL ||
206         (status = s_BlastRPSInfoInit(&rps_info, rps_mmap,
207                                      rps_pssm_mmap, kDbName)) != 0) {
208         SBlastMessageWrite(&extra_returns->error, SEV_WARNING, "RPS BLAST database setup failed", NULL, FALSE);
209         return status;
210     }
211     *rps_info_out = rps_info;
212     *scale_factor = rps_info->aux_info.scale_factor;
213     rps_score_options = (BlastScoringOptions*)
214         BlastMemDup(options->score_options, sizeof(BlastScoringOptions));
215     rps_hit_options = (BlastHitSavingOptions*)
216         BlastMemDup(options->hit_options, sizeof(BlastHitSavingOptions));
217     rps_score_options->gap_open =
218         rps_info->aux_info.gap_open_penalty;
219     rps_score_options->gap_extend =
220         rps_info->aux_info.gap_extend_penalty;
221     rps_score_options->matrix =
222         strdup(rps_info->aux_info.orig_score_matrix);
223 
224     *rps_options = (SBlastOptions*) BlastMemDup(options, sizeof(SBlastOptions));
225     (*rps_options)->score_options = rps_score_options;
226     (*rps_options)->hit_options = rps_hit_options;
227 
228     return 0;
229 }
230 
231 /** Frees the RPS BLAST specific extra structures.
232  * @param rps_info Auxiliary structure with RPS-specific information [in]
233  * @param rps_mmap Memory mapped lookup table [in]
234  * @param rps_pssm_mmap Memory mapped PSSM [in]
235  * @param options Copy of the options wrapper structure, containing scoring
236  *                and hit saving options, specially modified for RPS search.
237  *                All other options are the same as in the original structure,
238  *                so they should not be freed here. [in]
239  */
240 static void
s_RPSExtraStructsFree(BlastRPSInfo * rps_info,Nlm_MemMapPtr rps_mmap,Nlm_MemMapPtr rps_pssm_mmap,SBlastOptions * options)241 s_RPSExtraStructsFree(BlastRPSInfo* rps_info, Nlm_MemMapPtr rps_mmap,
242                       Nlm_MemMapPtr rps_pssm_mmap, SBlastOptions* options)
243 {
244     Nlm_MemMapFini(rps_mmap);
245     Nlm_MemMapFini(rps_pssm_mmap);
246 
247     if (rps_info) {
248         sfree(rps_info->aux_info.karlin_k);
249         sfree(rps_info->aux_info.orig_score_matrix);
250         sfree(rps_info);
251     }
252     if (options) {
253         if (options->score_options) {
254             sfree(options->score_options->matrix);
255             sfree(options->score_options);
256         }
257         sfree(options->hit_options);
258         sfree(options);
259     }
260 }
261 
262 /** Sets up the HSP stream, depending on whether the search is single or
263  * multithreaded, and whether on-the-fly tabular output option is set.
264  */
265 static Int2
s_BlastHSPStreamSetUp(BLAST_SequenceBlk * query,BlastQueryInfo * query_info,const BlastSeqSrc * seq_src,const SBlastOptions * options,BlastScoreBlk * sbp,BlastTabularFormatData * tf_data,BlastHSPStream ** hsp_stream,Blast_SummaryReturn * extra_returns)266 s_BlastHSPStreamSetUp(BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
267                       const BlastSeqSrc* seq_src, const SBlastOptions* options,
268                       BlastScoreBlk* sbp, BlastTabularFormatData* tf_data,
269                       BlastHSPStream* *hsp_stream,
270                       Blast_SummaryReturn* extra_returns)
271 {
272     Int2 status = 0;
273     const Int4 kNumResults = query_info->num_queries;
274     BlastHSPWriterInfo* writer_info = NULL;
275 
276     /* If any of the required inputs were NULL, the caller would have exited
277        before getting to this point. ASSERT this here. */
278     ASSERT(query && query_info && seq_src && options && sbp);
279 
280     if (!tf_data) {
281         writer_info = BlastHSPCollectorInfoNew(
282                           BlastHSPCollectorParamsNew(
283                                        options->hit_options,
284                                        options->ext_options->compositionBasedStats,
285                                        options->score_options->gapped_calculation));
286 
287         *hsp_stream = BlastHSPStreamNew(
288                                        options->program,
289                                        options->ext_options,
290                                        FALSE,
291                                        kNumResults,
292                                        BlastHSPWriterNew(&writer_info, NULL, query));
293 
294         if (options->num_cpus > 1) {
295             MT_LOCK lock = Blast_MT_LOCKInit();
296             BlastHSPStreamRegisterMTLock(*hsp_stream, lock);
297         }
298     } else {
299         writer_info = BlastHSPQueueInfoNew(BlastHSPQueueParamsNew());
300         *hsp_stream = BlastHSPStreamNew(
301                                        options->program,
302                                        options->ext_options,
303                                        FALSE,
304                                        kNumResults,
305                                        BlastHSPWriterNew(&writer_info, NULL, query));
306         Blast_TabularFormatDataSetUp(tf_data, options->program,
307                            *hsp_stream, seq_src, query, query_info,
308                            options->score_options, sbp, options->eff_len_options,
309                            options->ext_options, options->hit_options,
310                            options->db_options);
311     }
312     return status;
313 }
314 
315 /** Starts and joins all threads performing a multi-threaded search, with or
316  * without on-the-fly output, or performs a single-threaded search.
317  */
318 static Int2
s_BlastThreadManager(BLAST_SequenceBlk * query,BlastQueryInfo * query_info,const BlastSeqSrc * seq_src,const SBlastOptions * options,LookupTableWrap * lookup_wrap,BlastScoreBlk * sbp,BlastHSPStream * hsp_stream,BlastRPSInfo * rps_info,BlastTabularFormatData * tf_data,BlastHSPResults ** results,Blast_SummaryReturn * extra_returns)319 s_BlastThreadManager(BLAST_SequenceBlk* query, BlastQueryInfo* query_info,
320                      const BlastSeqSrc* seq_src, const SBlastOptions* options,
321                      LookupTableWrap* lookup_wrap, BlastScoreBlk* sbp,
322                      BlastHSPStream* hsp_stream, BlastRPSInfo* rps_info,
323                      BlastTabularFormatData* tf_data, BlastHSPResults **results,
324                      Blast_SummaryReturn* extra_returns)
325 {
326     Int2 status = 0;
327     /* The options input cannot be NULL here. The program would have exited
328        before entering this function if it was. */
329     const BlastInitialWordOptions* word_options = options->word_options;
330     const BlastScoringOptions* score_options = options->score_options;
331     const BlastExtensionOptions* ext_options = options->ext_options;
332     const BlastHitSavingOptions* hit_options = options->hit_options;
333     const BlastEffectiveLengthsOptions* eff_len_options =
334         options->eff_len_options;
335     const PSIBlastOptions* psi_options = options->psi_options;
336     const BlastDatabaseOptions* db_options = options->db_options;
337     TNlmThread format_thread = NULL;
338     BlastDiagnostics* diagnostics = NULL;
339     const EBlastProgramType kProgram = options->program;
340     const int kNumCpus = options->num_cpus;
341 
342     /* Assert that all required inputs are not NULL. They must be - otherwise
343        the program should have exited before entering this function. */
344     ASSERT(query && query_info && seq_src && lookup_wrap && sbp &&
345            hsp_stream && extra_returns);
346 
347     BlastSeqSrcResetChunkIterator((BlastSeqSrc*) seq_src);
348 
349     /* Start the formatting thread */
350     if(tf_data && NlmThreadsAvailable() &&
351        (format_thread =
352         NlmThreadCreate(Blast_TabularFormatThread, (void*) tf_data))
353        == NULL_thread) {
354         SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
355                            "Cannot create thread for formatting tabular output\n", NULL, options->believe_query);
356         return -1;
357     }
358 
359     if (NlmThreadsAvailable() && kNumCpus > 1) {
360         TNlmThread* thread_array =
361             (TNlmThread*) calloc(kNumCpus, sizeof(TNlmThread));
362         BlastPrelimSearchThreadData* search_data = NULL;
363         void* join_status = NULL;
364         int index;
365 
366         diagnostics = Blast_DiagnosticsInitMT(Blast_MT_LOCKInit());
367 
368         for (index = 0; index < kNumCpus; index++) {
369             search_data =
370                 BlastPrelimSearchThreadDataInit(kProgram, query,
371                     query_info, seq_src, lookup_wrap, score_options,
372                     word_options, ext_options, hit_options, eff_len_options,
373                     psi_options, db_options, sbp, diagnostics, hsp_stream);
374 
375             thread_array[index] =
376                NlmThreadCreate(Blast_PrelimSearchThreadRun,
377                                (void*) search_data);
378         }
379         for (index = 0; index < kNumCpus; index++)
380             NlmThreadJoin(thread_array[index], &join_status);
381 
382         MemFree(thread_array);
383 
384         if (!tf_data) {
385             SPHIPatternSearchBlk* pattern_blk = NULL;
386             if (Blast_ProgramIsPhiBlast(kProgram)) {
387                 pattern_blk = (SPHIPatternSearchBlk*) lookup_wrap->lut;
388                 pattern_blk->num_patterns_db =
389                                 (Int4)diagnostics->ungapped_stat->lookup_hits;
390             }
391 
392             if ((status = Blast_RunTracebackSearch(kProgram, query,
393                              query_info, seq_src, score_options,
394                              ext_options, hit_options, eff_len_options,
395                              db_options, psi_options, sbp, hsp_stream,
396                              rps_info, pattern_blk, results, kNumCpus)) != 0) {
397                 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
398                                    "Traceback engine failed\n", NULL, options->believe_query);
399             }
400         }
401     } else {
402         diagnostics = Blast_DiagnosticsInit();
403 
404         if (tf_data) { /* Single thread, tabular */
405             if ((status =
406                  Blast_RunPreliminarySearch(kProgram, query, query_info,
407                      seq_src, score_options, sbp, lookup_wrap, word_options,
408                      ext_options, hit_options, eff_len_options, psi_options,
409                      db_options, hsp_stream, diagnostics)) != 0) {
410                 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
411                                    "Preliminary search engine failed\n", NULL, options->believe_query);
412             }
413         } else { /* Single thread, non-tabular */
414             if ((status=Blast_RunFullSearch(kProgram, query, query_info,
415                             seq_src, sbp, score_options, lookup_wrap,
416                             word_options, ext_options, hit_options,
417                             eff_len_options, psi_options, db_options, hsp_stream,
418                             rps_info, diagnostics, results, 0, 0)) != 0) {
419                 SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
420                                     "Blast_RunFullSearch failed\n", NULL, options->believe_query);
421             }
422         }
423     }
424 
425     if (tf_data) {
426         void* join_status = NULL;
427         BlastHSPStreamClose(hsp_stream);
428         NlmThreadJoin(format_thread, &join_status);
429         /* Free the internally allocated structures used for tabular
430            formatting. */
431         BlastTabularFormatDataClean(tf_data);
432     }
433 
434     hsp_stream = BlastHSPStreamFree(hsp_stream);
435     Blast_SummaryReturnFill(kProgram, score_options, sbp, options->lookup_options,
436                             word_options, ext_options, hit_options,
437                             eff_len_options, options->query_options, query_info,
438                             seq_src, &diagnostics, extra_returns);
439 
440     return status;
441 }
442 
443 /** GET_MATRIX_PATH callback to find the path to a specified matrix.
444  * Looks first in current directory, then one specified by
445  * .ncbirc, then in local data directory, then env
446  * variables.
447  * @param matrix_name name of the matrix (e.g., BLOSUM50) [in]
448  * @param is_prot protein matrix if TRUE [in]
449  * @return path to matrix if found, or NULL.
450  */
451 static char*
s_BlastFindMatrixPath(const char * matrix_name,Boolean is_prot)452 s_BlastFindMatrixPath(const char* matrix_name, Boolean is_prot)
453 {
454      char* matrix_path = NULL;  /* return value. */
455      char buf_path[PATH_MAX];  /* Used for path without matrix filename. */
456      char buf_full[PATH_MAX];  /* used for full path with filename. */
457      char* ptr = NULL;
458 
459      if (matrix_name == NULL)
460        return NULL;
461 
462      /* current directory */
463      if (Nlm_FileLength((char*) matrix_name) > 0)
464      {
465          char buf_path_2[PATH_MAX];
466          Nlm_ProgramPath(buf_path, PATH_MAX);
467          ptr = StringRChr (buf_path, DIRDELIMCHR);
468          if (ptr != NULL)
469              *ptr = '\0';
470          sprintf(buf_path_2, "%s%s", buf_path, DIRDELIMSTR);
471          matrix_path = StringSave(buf_path_2);
472          return matrix_path;
473      }
474 
475      /* local data directory. */
476      sprintf(buf_full, "data%s%s", DIRDELIMSTR, matrix_name);
477      if (Nlm_FileLength(buf_full) > 0)
478      {
479          char buf_path_2[PATH_MAX];
480          Nlm_ProgramPath(buf_path, PATH_MAX);
481          ptr = StringRChr (buf_path, DIRDELIMCHR);
482          if (ptr != NULL)
483              *ptr = '\0';
484          sprintf(buf_path_2, "%s%sdata%s", buf_path, DIRDELIMSTR, DIRDELIMSTR);
485          matrix_path = StringSave(buf_path_2);
486          return matrix_path;
487      }
488 
489      if(FindPath("ncbi", "ncbi", "data", buf_path, PATH_MAX)) {
490             sprintf(buf_full, "%s%s", buf_path, matrix_name);
491             if(FileLength(buf_full) > 0) {
492                 matrix_path = StringSave(buf_path);
493                 return matrix_path;
494             } else {
495                  char alphabet_type[3];     /* aa or nt */
496                  if (is_prot)
497                       Nlm_StringNCpy(alphabet_type, "aa", 2);
498                  else
499                       Nlm_StringNCpy(alphabet_type, "nt", 2);
500                  alphabet_type[2] = NULLB;
501 
502                  sprintf(buf_full, "%s%s%s%s", buf_path,
503                           alphabet_type, DIRDELIMSTR, matrix_name);
504                  if(FileLength(buf_full) > 0)
505                  {
506                     matrix_path = StringSave(buf_path);
507                     return matrix_path;
508                  }
509             }
510      }
511 
512      return NULL;
513 }
514 
515 
516 /**
517  * Read a checkpoint file and set the necessary structures in a
518  * BlastScoreBlk: the psi_matrix, kbp_psi[0], and kbp_gap_psi[0].
519  *
520  * @param sbp              a BlastScoreBlk to receive a PSSM [in/out]
521  * @param query            query sequence data
522  * @param psi_matrix_file  checkpoint file to read
523  * @pcore_msg              a pointer to receive error and warning messages
524  */
525 static int
s_SetupScoreBlkPssmFromChkpt(BlastScoreBlk * sbp,BLAST_SequenceBlk * query,Blast_PsiCheckpointLoc * psi_checkpoint,Blast_Message ** pcore_msg)526 s_SetupScoreBlkPssmFromChkpt(BlastScoreBlk * sbp,
527                              BLAST_SequenceBlk * query,
528                              Blast_PsiCheckpointLoc * psi_checkpoint,
529                              Blast_Message* *pcore_msg)
530 {
531     int status = 0;
532     /* An intermediate representation of the PSSM data that is used
533        in PSIBlast routines */
534     PSIMatrix * pssm = NULL;
535     /* The actual PSSM that is saved in the BlastScoreBlk */
536     SPsiBlastScoreMatrix * psi_matrix = NULL;
537     size_t i, j;
538 
539     psi_matrix = SPsiBlastScoreMatrixNew(query->length);
540     if (!psi_matrix) {
541         ErrPostEx(SEV_FATAL, 1, 0,
542             "Out-of-memory: cannot allocate a PSSM of length %d.\n",
543             query->length);
544         status = -1;
545         goto error_return;
546     }
547     status = Blast_PosReadCheckpoint(psi_matrix->freq_ratios,
548                                      query->length, query->sequence,
549                                      psi_checkpoint,
550                                      pcore_msg);
551     if (status != 0) {
552         goto error_return;
553     }
554     Blast_KarlinBlkCopy(psi_matrix->kbp, sbp->kbp_gap_std[0]);
555     status = PSICreatePssmFromFrequencyRatios(query->sequence,
556                                               query->length, sbp,
557                                               psi_matrix->freq_ratios,
558                                               kPSSM_NoImpalaScaling,
559                                               &pssm);
560     if (0 != status) {
561         goto error_return;
562     }
563     for (i = 0;  i < psi_matrix->pssm->ncols;  i++) {
564         for (j = 0;  j < psi_matrix->pssm->nrows;  j++) {
565             psi_matrix->pssm->data[i][j] = pssm->pssm[i][j];
566         }
567     }
568     PSIMatrixFree(pssm);
569     sbp->psi_matrix = psi_matrix;
570     return 0;
571 error_return:
572     if (psi_matrix)
573         SPsiBlastScoreMatrixFree(psi_matrix);
574     return status;
575 }
576 
577 
578 Int2
Blast_RunSearch(SeqLoc * query_seqloc,Blast_PsiCheckpointLoc * psi_checkpoint,BlastSeqSrc * seq_src,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,BlastHSPResults ** results,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)579 Blast_RunSearch(SeqLoc* query_seqloc,
580                 Blast_PsiCheckpointLoc * psi_checkpoint,
581                 BlastSeqSrc* seq_src,
582                 SeqLoc* masking_locs,
583                 const SBlastOptions* options,
584                 BlastTabularFormatData* tf_data,
585                 BlastHSPResults **results,
586                 SeqLoc** filter_out,
587                 Blast_SummaryReturn* extra_returns)
588 {
589     Int2 status = 0;
590     BLAST_SequenceBlk *query = NULL;
591     BlastQueryInfo* query_info = NULL;
592     double scale_factor = 1.0;
593     BlastSeqLoc* lookup_segments = NULL;
594     BlastScoreBlk* sbp = NULL;
595     LookupTableWrap* lookup_wrap = NULL;
596     BlastMaskLoc* mask_loc = NULL;
597     BlastHSPStream* hsp_stream = NULL;
598     const EBlastProgramType kProgram = options->program;
599     const Boolean kRpsBlast =
600         (kProgram == eBlastTypeRpsBlast ||
601          kProgram == eBlastTypeRpsTblastn);
602     BlastRPSInfo* rps_info = NULL;
603     Nlm_MemMapPtr rps_mmap = NULL;
604     Nlm_MemMapPtr rps_pssm_mmap = NULL;
605     const QuerySetUpOptions* query_options = options->query_options;
606     const LookupTableOptions* lookup_options = options->lookup_options;
607     const BlastScoringOptions* score_options = options->score_options;
608     const BlastHitSavingOptions* hit_options = options->hit_options;
609     SBlastOptions* rps_options = NULL;
610     const Boolean kPhiBlast = Blast_ProgramIsPhiBlast(kProgram);
611     const Uint1 kDeallocateMe = 253;
612     Blast_Message *core_msg = NULL;
613 
614     if (!query_seqloc || !seq_src || !options || !extra_returns)
615         return -1;
616 
617     if ((status =
618          BLAST_ValidateOptions(kProgram, options->ext_options, score_options,
619                                lookup_options, options->word_options, hit_options,
620                                &core_msg)) != 0) {
621          extra_returns->error = Blast_MessageToSBlastMessage(core_msg, NULL, NULL, options->believe_query);
622          core_msg = Blast_MessageFree(core_msg);
623 
624         return status;
625     }
626 
627     if (options->program == eBlastTypeBlastn)
628     {
629          SeqLoc* dust_mask = NULL; /* Dust mask locations */
630          Blast_FindDustSeqLoc(query_seqloc, options, &dust_mask);
631          /* Combine dust mask with lower case mask
632             The dust mask will be deallocated by the end of this function
633             though as it's copied in BLAST_MainSetUp
634             Not deallocating it will result in a memory leak if masking_locs
635             was NULL at the start of this function */
636          if (dust_mask)
637          {
638             SeqLoc* dust_mask_var = dust_mask;
639             while (dust_mask_var)
640             {
641                dust_mask_var->choice = kDeallocateMe;
642                dust_mask_var = dust_mask_var->next;
643             }
644             ValNodeLink(&masking_locs, dust_mask);
645          }
646     }
647 
648     if (kRpsBlast) {
649         if ((status =
650              s_RPSExtraStructsSetUp(seq_src, options, &rps_options, &rps_info,
651                                     &rps_mmap, &rps_pssm_mmap, &scale_factor,
652                                     extra_returns)))
653             return status;
654         score_options = rps_options->score_options;
655         hit_options = rps_options->hit_options;
656         options = rps_options; /* This will not change the caller's pointer. */
657     }
658 
659     if ((status = BLAST_SetUpQuery(kProgram, query_seqloc, query_options,
660                                    masking_locs, &query_info, &query))) {
661         SBlastMessageWrite(&extra_returns->error, SEV_ERROR,
662                 "BLAST_SetUpQuery returned non-zero status\n", NULL, FALSE);
663         return status;
664     }
665 
666     status =
667         BLAST_MainSetUp(kProgram, query_options, score_options, query,
668                         query_info, scale_factor, &lookup_segments, &mask_loc,
669                         &sbp, &core_msg, s_BlastFindMatrixPath);
670     if (core_msg)
671     {
672        extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
673        core_msg = Blast_MessageFree(core_msg);
674     }
675 
676     if (status)
677         return status;
678 
679     if (psi_checkpoint) {
680         core_msg = NULL;
681         status = s_SetupScoreBlkPssmFromChkpt(sbp, query, psi_checkpoint,
682                                               &core_msg);
683         if (core_msg) {
684             extra_returns->error =
685                 Blast_MessageToSBlastMessage(core_msg, query_seqloc,
686                                              query_info,
687                                              options->believe_query);
688             core_msg = Blast_MessageFree(core_msg);
689         }
690         if (status)
691             return status;
692     }
693     if (filter_out) {
694         *filter_out =
695             BlastMaskLocToSeqLoc(kProgram, mask_loc, query_seqloc);
696     }
697 
698     /* Mask locations in BlastMaskLoc form are no longer needed. */
699     BlastMaskLocFree(mask_loc);
700 
701     if (masking_locs)
702     {
703           SeqLocPtr slp_var = masking_locs;
704           SeqLocPtr last = NULL;
705           while (slp_var)
706           {
707               if (slp_var->choice == kDeallocateMe)
708               {
709                   if (last == NULL)
710                   {
711                      masking_locs = slp_var->next;
712                      slp_var->next = NULL;
713                      Blast_ValNodeMaskListFree(slp_var);
714                      slp_var = masking_locs;
715                   }
716                   else
717                   {
718                      last->next = slp_var->next;
719                      slp_var->next = NULL;
720                      Blast_ValNodeMaskListFree(slp_var);
721                      slp_var = last->next;
722                   }
723               }
724               else
725               {
726                   last = slp_var;
727                   slp_var = slp_var->next;
728               }
729           }
730     }
731 
732     status = LookupTableWrapInit(query, lookup_options, query_options,
733                         lookup_segments, sbp, &lookup_wrap, rps_info, &core_msg, seq_src);
734     if (core_msg)
735     {
736           extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
737           core_msg = Blast_MessageFree(core_msg);
738     }
739     if (status)
740         return status;
741 
742     /* For PHI BLAST, save information about pattern occurrences in
743        query in the BlastQueryInfo structure. */
744     if (kPhiBlast) {
745         SPHIPatternSearchBlk* pattern_blk =
746             (SPHIPatternSearchBlk*) lookup_wrap->lut;
747         Blast_SetPHIPatternInfo(kProgram, pattern_blk, query, lookup_segments,
748                                 query_info, &core_msg);
749         if (core_msg)
750         {
751              extra_returns->error = Blast_MessageToSBlastMessage(core_msg, query_seqloc, query_info, options->believe_query);
752              core_msg = Blast_MessageFree(core_msg);
753         }
754 
755     }
756     /* Only need for the setup of lookup table. */
757     lookup_segments = BlastSeqLocFree(lookup_segments);
758 
759     if ((status = s_BlastHSPStreamSetUp(query, query_info, seq_src, options, sbp,
760                                         tf_data, &hsp_stream, extra_returns)))
761         return status;
762 
763     if ((status = s_BlastThreadManager(query, query_info, seq_src, options,
764                                        lookup_wrap, sbp, hsp_stream, rps_info,
765                                        tf_data, results, extra_returns)))
766         return status;
767 
768     lookup_wrap = LookupTableWrapFree(lookup_wrap);
769 
770     query = BlastSequenceBlkFree(query);
771     query_info = BlastQueryInfoFree(query_info);
772     BlastScoreBlkFree(sbp);
773 
774     if (kRpsBlast)
775         s_RPSExtraStructsFree(rps_info, rps_mmap, rps_pssm_mmap, rps_options);
776 
777     return status;
778 }
779 
780 Int2
Blast_DatabaseSearch(SeqLoc * query_seqloc,Blast_PsiCheckpointLoc * psi_checkpoint,char * db_name,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,SBlastSeqalignArray ** seqalign_arr,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)781 Blast_DatabaseSearch(SeqLoc* query_seqloc,
782                      Blast_PsiCheckpointLoc * psi_checkpoint,
783                      char* db_name,
784                      SeqLoc* masking_locs,
785                      const SBlastOptions* options,
786                      BlastTabularFormatData* tf_data,
787                      SBlastSeqalignArray* *seqalign_arr,
788                      SeqLoc** filter_out,
789                      Blast_SummaryReturn* extra_returns)
790 {
791     BlastSeqSrc *seq_src = NULL;
792     Boolean db_is_prot;
793     Int2 status = 0;
794     BlastHSPResults* results = NULL;
795     ReadDBFILE* rdfp = NULL;
796 
797     if (!options || !query_seqloc || !db_name || !extra_returns)
798         return -1;
799 
800     db_is_prot =
801         (options->program == eBlastTypeBlastp   ||
802          options->program == eBlastTypeBlastx   ||
803          options->program == eBlastTypeRpsBlast ||
804          options->program == eBlastTypeRpsTblastn);
805 
806     rdfp = readdb_new(db_name, db_is_prot);
807 
808     seq_src = ReaddbBlastSeqSrcAttach(rdfp);
809 
810     if (seq_src == NULL) {
811         SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
812                            "Initialization of subject sequences source failed",
813                            NULL, options->believe_query);
814     } else if (BlastSeqSrcGetNumSeqs(seq_src) == 0) {
815         SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
816                            "Database is empty", NULL, options->believe_query);
817     } else {
818         char* error_str = BlastSeqSrcGetInitError(seq_src);
819         if (error_str)
820             SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
821     }
822 
823     /* If there was an error initializing the sequence source, return without
824        doing the search. */
825     if (extra_returns->error)
826         return -1;
827 
828     status =
829         Blast_RunSearch(query_seqloc, psi_checkpoint, seq_src,
830                         masking_locs, options, tf_data, &results,
831                         filter_out, extra_returns);
832 
833     /* The ReadDBFILE structure will not be destroyed here, because the
834        initialising function used readdb_attach */
835     BlastSeqSrcFree(seq_src);
836 
837     if (!status && !tf_data) {
838         status =
839             BLAST_ResultsToSeqAlign(options->program, &results,
840                                     query_seqloc, rdfp, NULL,
841                                     options->score_options->gapped_calculation,
842                                     options->score_options->is_ooframe,
843                                     seqalign_arr);
844     }
845 
846     readdb_destruct(rdfp);
847 
848     if (status)
849         return status;
850 
851     return status;
852 }
853 
854 /** Splits the PHI BLAST results corresponding to different pattern occurrences
855  * in query, converts them to Seq-aligns and puts in a list of ValNodes.
856  * @param results All results from different pattern occurrences
857  *                mixed together. On return points to NULL. [in]
858  * @param pattern_info Query pattern occurrences information [in]
859  * @param program Program type (phiblastp or phiblastn) [in]
860  * @param query_seqloc List of query locations [in]
861  * @param rdfp blast db object [in]
862  * @param phivnps List of ValNodes containing Seq-aligns. [out]
863  * @return Status, 0 on success, -1 on failure.
864  */
865 static Int2
s_PHIResultsToSeqAlign(const BlastHSPResults * results,const SPHIQueryInfo * pattern_info,EBlastProgramType program,SeqLoc * query_seqloc,ReadDBFILE * rdfp,ValNode ** phivnps)866 s_PHIResultsToSeqAlign(const BlastHSPResults* results,
867                        const SPHIQueryInfo* pattern_info,
868                        EBlastProgramType program, SeqLoc* query_seqloc,
869                        ReadDBFILE* rdfp, ValNode* *phivnps)
870 {
871     Int2 status = 0;
872     /* Split results into an array of BlastHSPResults structures corresponding
873        to different pattern occurrences. */
874     BlastHSPResults* *phi_results =
875         PHIBlast_HSPResultsSplit(results, pattern_info);
876 
877     if (phi_results) {
878         int pattern_index; /* Index over pattern occurrences. */
879 
880         for (pattern_index = 0; pattern_index < pattern_info->num_patterns;
881              ++pattern_index) {
882             SBlastSeqalignArray* seqalign_arr = NULL;
883             SeqAlign* seqalign = NULL;
884             BlastHSPResults* one_phi_results = phi_results[pattern_index];
885 
886             if (one_phi_results) {
887                 /* PHI BLAST is always gapped, and never out-of-frame, hence
888                  * TRUE and FALSE values for the respective booleans in the next
889                  * call.
890                  */
891                 status =
892                     BLAST_ResultsToSeqAlign(program, &one_phi_results,
893                                             query_seqloc, rdfp, NULL, TRUE,
894                                             FALSE, &seqalign_arr);
895                 if (seqalign_arr)
896                 {
897                     seqalign = seqalign_arr->array[0];
898                     seqalign_arr->array[0] = NULL;
899                     SBlastSeqalignArrayFree(seqalign_arr);
900                 }
901                 ValNodeAddPointer(phivnps, pattern_index, seqalign);
902             }
903         }
904         sfree(phi_results);
905     }
906     return status;
907 }
908 
909 Int2
PHIBlastRunSearch(SeqLoc * query_seqloc,char * db_name,SeqLoc * masking_locs,const SBlastOptions * options,ValNode ** phivnps,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)910 PHIBlastRunSearch(SeqLoc* query_seqloc, char* db_name, SeqLoc* masking_locs,
911                   const SBlastOptions* options, ValNode* *phivnps,
912                   SeqLoc** filter_out, Blast_SummaryReturn* extra_returns)
913 {
914     BlastSeqSrc *seq_src = NULL;
915     Boolean is_prot;
916     Int2 status = 0;
917     BlastHSPResults* results = NULL;
918     ReadDBFILE* rdfp = NULL;
919 
920     if (!options || !query_seqloc || !db_name || !extra_returns || !phivnps)
921         return -1;
922 
923     ASSERT(Blast_ProgramIsPhiBlast(options->program));
924 
925     is_prot = (options->program == eBlastTypePhiBlastp);
926 
927     rdfp = readdb_new(db_name, is_prot);
928 
929     seq_src = ReaddbBlastSeqSrcAttach(rdfp);
930 
931     if (seq_src == NULL) {
932         SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
933                            "Initialization of subject sequences source failed", NULL, options->believe_query);
934     } else {
935         char* error_str = BlastSeqSrcGetInitError(seq_src);
936         if (error_str)
937             SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
938     }
939 
940     /* If there was an error initializing the sequence source, return without
941        doing the search. */
942     if (extra_returns->error)
943         return -1;
944 
945     /* Masking at hash and on-the-fly tabular output are not applicable for
946        PHI BLAST, so pass NULL in corresponding arguments. */
947     status =
948         Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL,
949                         seq_src, masking_locs, options,
950                         (BlastTabularFormatData*) NULL, &results,
951                         filter_out, extra_returns);
952 
953     /* The ReadDBFILE structure will not be destroyed here, because the
954        initialising function used readdb_attach */
955     BlastSeqSrcFree(seq_src);
956 
957     *phivnps = NULL;
958 
959     if (!status) {
960         status =
961             s_PHIResultsToSeqAlign(results, extra_returns->pattern_info,
962                                    options->program, query_seqloc, rdfp,
963                                    phivnps);
964     }
965 
966     results = Blast_HSPResultsFree(results);
967 
968     readdb_destruct(rdfp);
969     return status;
970 }
971 
972 Int2
Blast_TwoSeqLocSetsAdvanced(SeqLoc * query_seqloc,SeqLoc * subject_seqloc,SeqLoc * masking_locs,const SBlastOptions * options,BlastTabularFormatData * tf_data,SBlastSeqalignArray ** seqalign_arr,SeqLoc ** filter_out,Blast_SummaryReturn * extra_returns)973 Blast_TwoSeqLocSetsAdvanced(SeqLoc* query_seqloc,
974                             SeqLoc* subject_seqloc,
975                             SeqLoc* masking_locs,
976                             const SBlastOptions* options,
977                             BlastTabularFormatData* tf_data,
978                             SBlastSeqalignArray* *seqalign_arr,
979                             SeqLoc** filter_out,
980                             Blast_SummaryReturn* extra_returns)
981 {
982     BlastSeqSrc *seq_src = NULL;
983     Int2 status = 0;
984     BlastHSPResults* results = NULL;
985 
986 
987     if (!options || !query_seqloc || !subject_seqloc || !extra_returns)
988         return -1;
989 
990 
991     seq_src = MultiSeqBlastSeqSrcInit(subject_seqloc, options->program);
992 
993     if (seq_src == NULL) {
994         SBlastMessageWrite(&extra_returns->error, SEV_WARNING,
995                            "Initialization of subject sequences source failed", NULL, options->believe_query);
996     } else {
997         char* error_str = BlastSeqSrcGetInitError(seq_src);
998         if (error_str)
999             SBlastMessageWrite(&extra_returns->error, SEV_WARNING, error_str, NULL, options->believe_query);
1000     }
1001 
1002     /* If there was an error initializing the sequence source, return without
1003        doing the search. */
1004     if (extra_returns->error)
1005         return -1;
1006 
1007     status =
1008         Blast_RunSearch(query_seqloc, (Blast_PsiCheckpointLoc *) NULL,
1009                         seq_src, masking_locs, options, tf_data,
1010                         &results, filter_out, extra_returns);
1011 
1012     /* The ReadDBFILE structure will not be destroyed here, because the
1013        initialising function used readdb_attach */
1014     BlastSeqSrcFree(seq_src);
1015 
1016     if (!status) {
1017         status =
1018             BLAST_ResultsToSeqAlign(options->program, &results, query_seqloc,
1019                                     NULL, subject_seqloc,
1020                                     options->score_options->gapped_calculation,
1021                                     options->score_options->is_ooframe,
1022                                     seqalign_arr);
1023     }
1024 
1025     if (status)
1026         return status;
1027 
1028     return status;
1029 }
1030 
GeneticCodeSingletonInit()1031 void GeneticCodeSingletonInit()
1032 {
1033     Uint1* gc = NULL;
1034     GenCodeSingletonInit();
1035     BLAST_GeneticCodeFind(BLAST_GENETIC_CODE, &gc);
1036     GenCodeSingletonAdd(BLAST_GENETIC_CODE, gc);
1037     free(gc);
1038 }
1039 
GeneticCodeSingletonFini()1040 void GeneticCodeSingletonFini()
1041 {
1042     GenCodeSingletonFini();
1043 }
1044 
1045 /* @} */
1046 
1047 
1048 
1049