1 static char const rcsid[] = "$Id: readdb.c,v 6.549 2016/09/02 15:04:59 ucko Exp $"; /* RCS Id keyword string for readdb.c (revision 6.549); file-scope and not referenced in the visible part of this file -- presumably kept so the revision can be identified in built objects (TODO confirm) */
2 
3 /* $Id: readdb.c,v 6.549 2016/09/02 15:04:59 ucko Exp $ */
4 /*
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 */
29 /*****************************************************************************
30 
31 File name: readdb.c
32 
33 Author: Tom Madden
34 
35 Contents: Reads Databases formatted by formatdb.
36 
37 Detailed Contents:
38 
39         - memory maps files.
40 
41     - database sequences are identified (by these routines) by their
42     order in the files.  this is based on a zero-offset.
43 
44 
45 ******************************************************************************/
46 
47 /* File Name: readdb.c
48 *
49 * Author: Tom Madden
50 *
51 * Version Creation Date:   3/22/95
52 *
53 * $Revision: 6.549 $
54 *
55 * File Description:
56 *       Functions to rapidly read databases from files produced by formatdb.
57 *
58 * Modifications:
59 * --------------------------------------------------------------------------
60 * Date     Name        Description of modification
61 * -------  ----------  -----------------------------------------------------
62 *
63 * ==========================================================================
64 *
65 *
66 * RCS Modification History:
67 * $Log: readdb.c,v $
68 * Revision 6.549  2016/09/02 15:04:59  ucko
69 * readdb.c: accommodate systems that use GNU libc with non-Linux kernels
70 * (such as the Hurd or the FreeBSD kernel), as already done in
71 * Debian/Ubuntu packages.
72 *
73 * Revision 6.548  2012/03/14 20:09:35  camacho
74 * Fix buffer overrun JIRA BD-348
75 *
76 * Revision 6.547  2012/03/14 20:02:52  camacho
77 * Fix buffer overrun JIRA BD-348
78 *
79 * Revision 6.546  2011/12/19 18:37:35  gouriano
80 * Corrected printf formatting. NOJIRA
81 *
82 * Revision 6.545  2011/11/28 20:24:51  camacho
83 * Add support for setting membership bits in the refseq_chromosome subset JIRA BD-333, BD-308
84 *
85 * Revision 6.544  2009/12/22 12:54:28  madden
86 * Try stripped off path for microbial db, JIRA WB-313
87 *
88 * Revision 6.543  2009/09/17 15:41:43  madden
89 * Consolidate checking of paths, JIRA SB-368
90 *
91 * Revision 6.542  2009/08/25 13:45:23  madden
92 * Work with database with no GI ISAM, JIRA SB-367
93 *
94 * Revision 6.541  2009/02/24 18:19:36  coulouri
95 * correct Nlm_StringTokMT invocation; fixes JIRA SB-181
96 *
97 * Revision 6.540  2009/01/14 21:50:47  madden
98 * Enable REDUCED_E2INDEX_SET
99 *
100 * Revision 6.539  2008/12/24 13:58:10  maning
101 * Quote path containing spaces.  Attempt to fix JIRA SB136.
102 *
103 * Revision 6.538  2008/10/31 18:50:52  madden
104 * Add readdb_check_oid to be used with pseed (SB-109)
105 *
106 * Revision 6.537  2008/06/25 18:32:37  merezhuk
107 * use portable Nlm_StringTokMT.
108 *
109 * Revision 6.536  2008/06/25 14:28:55  merezhuk
110 * support for multiple BLAST DB locations as in CSeqDB
111 *
112 * Revision 6.535  2008/02/26 18:34:20  kans
113 * use SeqDescrAddPointer instead of ValNodeAddPointer/Str
114 *
115 * Revision 6.534  2007/12/04 19:43:50  madden
116 * Index accession.version for swissprot
117 *
118 * Revision 6.533  2007/11/27 18:51:54  madden
119 * More efficient retrievals on string isam indices
120 *
121 * Revision 6.532  2007/11/15 21:10:49  madden
122 * New version of SeqIdE2Index ifdef by REDUCED_E2INDEX_SET
123 *
124 * Revision 6.531  2007/11/06 20:09:39  coulouri
125 * when printing taxonomy info, skip irrelevant sequences if a gi target was specified; fixes blast-rt#15347680
126 *
127 * Revision 6.530  2007/09/27 17:20:54  madden
128 * Add readdb_get_full_filename
129 *
130 * Revision 6.529  2007/08/17 15:56:10  papadopo
131 * 1. Make increment of reference count in readdb_attach atomic
132 * 2. Never initialize the reference count to a fixed value when
133 *    memory-mapping database files, only increment it (fixes RT 15280141)
134 *
135 * Revision 6.528  2007/07/12 20:44:07  papadopo
136 * open .nsd as a binary file for writing
137 *
138 * Revision 6.527  2007/05/16 18:47:56  camacho
139 * Fix RT#15284902
140 *
141 * Revision 6.526  2007/05/08 13:09:39  madden
142 * Add ability to read STATS_NSEQ and STATS_TOTLEN from alias file with function readdb_get_stats_numbers
143 *
144 * Revision 6.525  2007/05/07 13:30:54  kans
145 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
146 *
147 * Revision 6.524  2007/05/03 15:51:53  madden
148 * Do not require title to use alias file
149 *
150 * Revision 6.523  2007/04/12 20:19:06  camacho
151 * Remove informational messages about membership/links bits
152 *
153 * Revision 6.522  2007/02/27 15:16:24  camacho
154 * DBLIST field is mandatory
155 *
156 * Revision 6.521  2007/01/05 16:01:22  camacho
157 * Force munmap of string ISAM files after exceeding kSISAM_MaxNumVolumes volumes
158 * to avoid running out of memory on accession lookups. Fixes rt #15235977.
159 *
160 * Revision 6.520  2006/10/17 15:24:31  camacho
161 * Fix memory leak when printing accession list in fastacmd
162 *
163 * Revision 6.519  2006/09/27 18:51:01  camacho
164 * Bug fix in readdb_read_alias_file
165 *
166 * Revision 6.518  2006/09/27 14:19:12  camacho
167 * Bug fix in OID_GI_BelongsToMaskDB
168 *
169 * Revision 6.517  2006/09/19 19:37:28  kans
170 * readdb_parse_db_names has quote_mode (TM) to allow spaces in paths if bounded by double-quote marks
171 *
172 * Revision 6.516  2006/08/10 17:47:27  camacho
173 * Bug fix in readdb_acc2fasta: added quick check in oidlist in OID_GI_BelongsToMaskDB
174 *
175 * Revision 6.515  2006/08/07 15:19:12  camacho
176 * + is_REFSEQ_GENOMIC to FDBLoadMembershipsTable
177 *
178 * Revision 6.514  2006/08/07 15:03:57  camacho
179 * +is_REFSEQ_GENOMIC
180 *
181 * Revision 6.513  2006/07/13 20:10:06  camacho
182 * Bug fix in ScanDIFile
183 *
184 * Revision 6.512  2006/07/06 19:48:19  camacho
185 * Fix to previous commit
186 *
187 * Revision 6.511  2006/07/06 19:37:30  camacho
188 * Add extra sanity checks in ScanDIFile
189 *
190 * Revision 6.510  2006/07/05 18:24:08  camacho
191 * Fixes to ScanDIFile to read molecule type
192 *
193 * Revision 6.509  2006/07/05 16:07:48  camacho
194 * Minor changes in FDBLoadLinksTable
195 *
196 * Revision 6.508  2006/07/03 18:27:22  coulouri
197 * correct volume size defaults for protein databases
198 *
199 * Revision 6.507  2006/06/19 18:37:08  coulouri
200 * improve default handling for non-formatdb clients
201 *
202 * Revision 6.506  2006/06/19 17:20:14  coulouri
203 * Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398
204 *
205 * Revision 6.505  2006/06/05 19:59:53  camacho
206 * Changes to ScanDIFile, is_REFSEQ_RNA, and FDBAddSequence2 to handle the new
207 * DI_Record::mol field.
208 *
209 * Revision 6.504  2006/05/30 20:27:51  jianye
210 * fixing memory leak in FDBuildOldStyleDefline
211 *
212 * Revision 6.503  2006/05/11 13:51:45  kans
213 * made is_REFSEQ_RNA compatible with C compiler conventions
214 *
215 * Revision 6.502  2006/05/10 20:47:16  camacho
216 * From Ilya Dondoshansky: 1. Several FDB functions made public - needed for incremental dump efficiency; 2. Added mol field to SI_Record and DI_Record, check for mol = rna in is_REFSEQ_RNA; 3. Avoid redundant sorting of ISAM files; 4. In readdb_get_pig: look for PIG in all deflines in a set, until found.
217 *
218 * Revision 6.501  2006/05/04 20:07:27  camacho
219 * Report fatal error in case of failure to add sequence to BLAST database because
220 * of zero-length sequence and clean up the database that was being created.
221 *
222 * Revision 6.500  2006/04/24 15:50:19  camacho
223 * + is_REFSEQ_RNA
224 *
225 * Revision 6.499  2006/03/16 14:14:23  camacho
226 * Fix parsing of locations for fastacmd command line argument (rt # 15151399)
227 *
228 * Revision 6.498  2006/03/09 21:56:02  camacho
229 * Refactored sequence hash function
230 *
231 * Revision 6.497  2006/03/08 19:06:15  camacho
232 * Added definition for maximum number of volumes and FDBCleanUpInProgress, fixes rt ticket 15147600
233 *
234 * Revision 6.496  2006/02/15 21:07:28  camacho
235 * Add validation to fastacmd to reject mixed protein/nucleotide databases
236 *
237 * Revision 6.495  2006/01/11 16:24:45  camacho
238 * Fix bug in Fastacmd_PrintTaxonomyInfo
239 *
240 * Revision 6.494  2005/12/23 16:30:57  camacho
241 * Remove assertion no longer needed
242 *
243 * Revision 6.493  2005/12/02 14:04:07  camacho
244 * Minor fix in ScanDIFile
245 *
246 * Revision 6.492  2005/11/22 21:23:05  madden
247 * Fix in FDLCreateAsnDF for multiple volumes if input FASTA not parsed
248 *
249 * Revision 6.491  2005/10/04 20:40:42  madden
250 * Make PrintDbInformationBasicEx public, minor optimization to PrintDbInfoWithRID
251 *
252 * Revision 6.490  2005/10/04 16:40:25  madden
253 * Fix nit found by C++ compiler
254 *
255 * Revision 6.489  2005/10/04 15:44:54  madden
256 * Workaround to time-out problem of PrintDbInformationWithRID
257 *
258 * Revision 6.488  2005/09/30 14:54:32  camacho
259 * Enable recognition of the formatdb configuration file to allow users to set the
260 * membership and link bits in the ASN.1 deflines.
261 *
262 * Revision 6.487  2005/09/20 14:08:29  camacho
263 * Add error message when trying to dump subset database with gi file
264 *
265 * Revision 6.486  2005/09/08 13:19:11  camacho
266 * Remove unneeded assertion
267 *
268 * Revision 6.485  2005/09/02 21:52:13  camacho
269 * Correct buffer overflow on sparc
270 *
271 * Revision 6.484  2005/08/16 17:51:14  dondosha
272 * Decrement thread count in shared_info only if sequence/header files are open for this instance of readdb
273 *
274 * Revision 6.483  2005/08/07 01:55:32  camacho
275 * Bug fix to FDBAddSequence2
276 *
277 * Revision 6.482  2005/08/04 16:08:29  coulouri
278 * correct buffer overflow on sparc
279 *
280 * Revision 6.481  2005/08/04 15:29:47  camacho
281 * Fix to SI_RecordAddFormatdb_ver
282 *
283 * Revision 6.480  2005/07/28 14:57:10  coulouri
284 * remove dead code
285 *
286 * Revision 6.479  2005/07/27 21:30:02  camacho
287 * 1) Replaces is_REFSEQ_* functions by a single function (is_REFSEQ), to be
288 * used by genmask and ID1 group's BLAST database dumper.
289 * 2) Removed out-of-date is_WGS* functions.
290 *
291 * Revision 6.478  2005/07/27 17:48:57  coulouri
292 * remove hardcoded paths
293 *
294 * Revision 6.477  2005/06/22 13:55:22  coulouri
295 * add support for dumping accessions
296 *
297 * Revision 6.476  2005/06/21 19:15:50  dondosha
298 * In FD_CreateAliasFileEx, if there is a gi list, always add NSEQ and LENGTH lines, even with 0 values
299 *
300 * Revision 6.475  2005/06/08 19:25:36  camacho
301 * New feature to allow formatdb to add taxonomy ids to BLAST databases
302 * generated from FASTA input
303 * BugzID: 6
304 *
305 * Revision 6.474  2005/05/16 16:12:45  camacho
306 * Added auxiliary function for the SI_Record structure to fix a bug in
307 * FDBAddSequence, which caused all but the first BlastDefLine structure in
308 * a linked list to be ignored.
309 *
310 * Revision 6.473  2005/04/26 21:34:39  kans
311 * added SEQID_GPIPE
312 *
313 * Revision 6.472  2005/04/20 19:02:15  lavr
314 * +<assert.h>
315 *
316 * Revision 6.471  2005/04/11 18:55:16  coulouri
317 * Make BLASTDB environment variable usage consistent across platforms
318 *
319 * Revision 6.470  2005/04/11 18:04:56  madden
320 * Fix for alignment issue in readdb_get_sequence_ex
321 *
322 * Revision 6.469  2005/04/07 12:19:35  madden
323 * Refactor readdb_get_sequence_ex to eliminate unnecessary allocations
324 *
325 * Revision 6.468  2005/04/06 16:01:25  camacho
326 * Return -1 in case of memory allocation failures in readdb_get_sequence_ex
327 *
328 * Revision 6.467  2005/02/24 14:34:05  camacho
329 * Fix invocation of FDBAddSequence
330 *
331 * Revision 6.466  2005/02/22 14:15:48  camacho
332 * Pass bioseq data type by reference to FDBAddBioseq
333 *
334 * Revision 6.465  2004/12/07 15:14:14  kans
335 * third parameter to readdb_get_header_ex needs to be pointer to Uint4, not Int4 - CodeWarrior error
336 *
337 * Revision 6.464  2004/12/04 03:41:09  camacho
338 * Add extra enum for fastacmd -D option for error checking
339 *
340 * Revision 6.463  2004/12/03 04:57:57  camacho
341 * Fix name conflict in enumeration for fastacmd dump types
342 *
343 * Revision 6.462  2004/12/02 20:37:31  camacho
344 * + fastacmd feature to dump list of gis
345 *
346 * Revision 6.461  2004/11/22 20:54:58  coulouri
347 * optimization for subset database searches restricted by gi list
348 *
349 * Revision 6.460  2004/10/28 15:39:37  camacho
350 * Fixes to previous commit
351 *
352 * Revision 6.459  2004/10/04 18:00:00  madden
353 * Further fixes for SI_Record.title
354 *
355 * Revision 6.458  2004/09/27 16:29:34  madden
356 * Make title on SI_Record dynamically allocated
357 *
358 * Revision 6.457  2004/09/21 21:42:57  dondosha
359 * Initialize BlastDefLine before call to FDBAddBioseq in FastaToBlastDB
360 *
361 * Revision 6.456  2004/09/09 20:58:26  camacho
362 * Add sanity checks in readdb_read_alias_file
363 *
364 * Revision 6.455  2004/08/25 14:45:23  camacho
365 * Refactorings to allow formatdb process multiple deflines
366 *
367 * Revision 6.454  2004/08/06 17:55:35  madden
368 * Add new owners to is_REFSEQ_RNA
369 *
370 * Revision 6.453  2004/08/06 13:56:11  madden
371 * Add owner 45 to is_REFSEQ_PROTEIN
372 *
373 * Revision 6.452  2004/08/05 19:33:32  madden
374 * Add ownership 38 and 52 to Refseq for proteins
375 *
376 * Revision 6.451  2004/07/26 20:51:38  camacho
377 * Fix mismatched data type
378 *
379 * Revision 6.450  2004/07/22 16:16:41  camacho
380 * Guard against arguments longer than PATH_MAX to FindBlastDBFile
381 *
382 * Revision 6.449  2004/07/19 22:37:46  dondosha
383 * Added mutex lock/unlock around shared info manipulation in readdb_destruct_element
384 *
385 * Revision 6.448  2004/07/14 18:35:33  camacho
386 * Remove unneeded error message in readdb_get_header_ex
387 *
388 * Revision 6.447  2004/07/13 19:57:00  dondosha
389 * Tiny memory leak fix
390 *
391 * Revision 6.446  2004/07/13 17:31:33  camacho
392 * Fix for genmask to count only non-redundant sequences added to the masked
393 * databases instead of all sequences.
394 *
395 * Revision 6.445  2004/07/09 15:40:22  dondosha
396 * Fix in ReadDBOpenMHdrAndSeqFiles: increment nthreads if at least one of header or sequence files is already mapped
397 *
398 * Revision 6.444  2004/07/08 21:25:48  kans
399 * fixed Mac compiler error in FDBExtend4Sequence
400 *
401 * Revision 6.443  2004/07/08 19:49:02  camacho
402 * Contributions from ID1 Group:
403 * 1) SI_Record structure.
404 * 2) Refactoring of FDBAddSequence2 to allow addition of non-redundant sequences
405 * when creating BLAST databases.
406 *
407 * Revision 6.442  2004/06/30 13:42:27  kans
408 * include <blfmtutl.h> to clear up Mac compiler missing prototype errors
409 *
410 * Revision 6.441  2004/05/04 17:07:20  kans
411 * ReadDBBioseqFetchFunc checks result of ReadDBFindFetchStruct call for NULL before attempting to dereference - picked up by trying to use multiple threads
412 *
413 * Revision 6.440  2004/04/21 16:54:52  camacho
414 * Added removal for PIG files
415 *
416 * Revision 6.439  2004/04/13 17:22:46  camacho
417 * Optimization to Int4ListReadFromFile
418 *
419 * Revision 6.438  2004/04/01 13:43:08  lavr
420 * Spell "occurred", "occurrence", and "occurring"
421 *
422 * Revision 6.437  2004/03/29 05:17:55  camacho
423 * Fix to Int4ListConcat
424 *
425 * Revision 6.436  2004/03/15 18:45:05  coulouri
426 * Throw fatal error if BSRebuildDNA_4na() fails
427 *
428 * Revision 6.435  2004/02/24 16:32:52  camacho
429 * Use correct calling convention for win32
430 *
431 * Revision 6.434  2004/02/24 14:06:00  camacho
432 * Added support for approximate sequence length calculation for nucleotide
433 * sequences.
434 *
435 * Revision 6.433  2004/02/09 20:53:20  camacho
436 * Add FDBAddPig call from FDBAddSequence2
437 *
438 * Revision 6.432  2004/02/04 15:35:04  camacho
439 * Rollback to fix problems in release 2.2.7
440 *
441 * Revision 6.429  2004/01/29 20:48:07  coulouri
442 * Only limit volume sizes on 32-bit platforms
443 *
444 * Revision 6.428  2004/01/28 19:34:51  camacho
445 * Added sanity check for alias files
446 *
447 * Revision 6.427  2004/01/26 13:52:31  camacho
448 * Do not use snprintf
449 *
450 * Revision 6.426  2004/01/23 21:13:54  camacho
451 * 1. Refactored code to create multiple volumes.
452 * 2. Set the maximum sequence file size to 1GB.
453 *
454 * Revision 6.425  2004/01/12 23:06:36  camacho
455 * Sort link bit gi lists
456 *
457 * Revision 6.424  2003/10/01 19:03:50  camacho
458 * Fix in readdb_get_totals_ex2 to use the alias file length/number of entries when
459 * gilist is populated.
460 *
461 * Revision 6.423  2003/09/02 18:32:10  dondosha
462 * Changed http link for completed microbial genomes at genomes group request
463 *
464 * Revision 6.422  2003/08/08 19:31:37  camacho
465 * Minor fix for formatdb
466 *
467 * Revision 6.421  2003/07/28 13:59:17  camacho
468 * Bug fix
469 *
470 * Revision 6.420  2003/07/15 16:49:32  camacho
471 * Skip whitespace in alias files
472 *
473 * Revision 6.419  2003/07/10 14:00:59  camacho
474 * Fixed some memory leaks
475 *
476 * Revision 6.418  2003/07/08 18:42:39  camacho
477 * Elaborated fastacmd return values
478 *
479 * Revision 6.417  2003/07/02 19:22:10  camacho
480 * formatdb fix to remove stdin from database title
481 *
482 * Revision 6.416  2003/06/13 19:56:26  dondosha
483 * Removed call to SeqEntrySetScope in FastaToBlastDB that caused purify errors
484 *
485 * Revision 6.415  2003/05/30 17:25:37  coulouri
486 * add rcsid
487 *
488 * Revision 6.414  2003/05/21 21:33:36  camacho
489 * Deprecated isCommonIndex global
490 *
491 * Revision 6.413  2003/05/15 14:45:48  dondosha
492 * readdb_get_sequence_number returns -1 if rdfp is NULL
493 *
494 * Revision 6.412  2003/05/15 14:14:18  dondosha
495 * Check if offset is larger than total length of all rdfps in readdb_get_sequence_number
496 *
497 * Revision 6.411  2003/05/13 16:02:53  coulouri
498 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
499 *
500 * Revision 6.410  2003/05/12 12:24:02  camacho
501 * Fixed readdb_get_totals_ex2
502 *
503 * Revision 6.409  2003/05/01 14:10:03  camacho
504 * 1. Fixed readdb_get_totals_ex2 to use alias length and number of sequences
505 *    without an oidlist
506 * 2. Fixed readdb_merge_gifiles to properly sort the rdfp linked list (rdfp_chain)
507 * 3. Fixed readdb_gi2seq to look into subsequent rdfps if no isam indices are
508 *    found in the first rdfp
509 *
510 * Revision 6.408  2003/04/28 19:50:10  camacho
511 * Fixes to readdb_merge_gifiles
512 *
513 * Revision 6.407  2003/04/27 02:43:25  vakatov
514 * Added missing LIBCALL -- for MS-Win compilation
515 *
516 * Revision 6.406  2003/04/25 18:55:27  camacho
517 * 1. Added readdb_merge_gifiles to deal with Microbial blast database issues.
518 * 2. Minor fixes to Int4List functions.
519 *
520 * Revision 6.405  2003/04/24 15:44:42  camacho
521 * Fixes for windows build
522 *
523 * Revision 6.404  2003/04/24 13:16:25  camacho
524 * Minor fix
525 *
526 * Revision 6.403  2003/04/23 15:15:36  camacho
527 * Moved reading of gi list to readdb
528 *
529 * Revision 6.402  2003/04/22 21:30:13  camacho
530 * Added Int4 list utilities
531 *
532 * Revision 6.401  2003/04/22 19:04:57  camacho
533 * Moved GiList structure to generic list of 4-byte integers
534 *
535 * Revision 6.400  2003/04/17 21:10:54  camacho
536 * Add PIGs only when removing redundancy
537 *
538 * Revision 6.399  2003/04/15 19:09:13  camacho
539 * Completed implementation of PIG interface
540 *
541 * Revision 6.398  2003/04/14 19:53:30  camacho
542 * Fixed memory leak
543 *
544 * Revision 6.397  2003/04/09 21:46:00  camacho
545 * Added basic PIG interface
546 *
547 * Revision 6.396  2003/04/09 20:16:17  camacho
548 * Use #defined value for location of taxdb.tar.gz
549 *
550 * Revision 6.395  2003/04/08 15:45:02  camacho
551 * Minor fix to previous commit
552 *
553 * Revision 6.394  2003/04/08 15:37:14  camacho
554 * Extended FDBAddSequence2 to take pig
555 *
556 * Revision 6.393  2003/04/04 17:56:33  camacho
557 * fastacmd fix when retrieving repeated identifiers(-a)
558 *
559 * Revision 6.392  2003/04/03 17:57:00  vakatov
560 * Added missing LIBCALL
561 *
562 * Revision 6.391  2003/04/01 21:51:36  camacho
563 * Made fastacmd functions & structure non-static
564 *
565 * Revision 6.390  2003/03/27 22:51:16  camacho
566 * Minor change to previous commit
567 *
568 * Revision 6.389  2003/03/27 22:26:04  camacho
569 * Add error messages and non-zero return value on error for fastacmd
570 *
571 * Revision 6.388  2003/03/26 18:50:07  camacho
572 * Added eFDBCleanOpt to formatdb API
573 *
574 * Revision 6.387  2003/03/21 22:14:32  camacho
575 * Allow C ObjMgr & application to load taxonomy dbs
576 *
577 * Revision 6.386  2003/03/20 14:03:21  camacho
578 * Allow users to set the membership and link bits
579 *
580 * Revision 6.385  2003/03/14 21:39:08  camacho
581 * Fix bug in readdb_get_totals_ex2
582 *
583 * Revision 6.384  2003/03/08 23:02:35  camacho
584 * Bug fix in FDBFinish
585 *
586 * Revision 6.383  2003/03/07 13:16:42  madden
587 * Check for NULL rdfp before dereferencing
588 *
589 * Revision 6.382  2003/02/26 17:47:31  kimelman
590 * bugfix: duplicate close of Files and AsnIo
591 *
592 * Revision 6.381  2003/02/20 17:29:31  camacho
593 * Added support for the creation of empty databases
594 *
595 * Revision 6.380  2003/02/11 17:46:25  camacho
596 * Fix to FDBAddSequence2
597 *
598 * Revision 6.379  2003/01/31 17:58:29  camacho
599 * Eliminate unnecessary checks for redundant databases
600 *
601 * Revision 6.378  2003/01/31 14:39:21  camacho
602 * Use init_state argument to readdb_new_ex2
603 *
604 * Revision 6.377  2003/01/22 20:21:01  bealer
605 * - Handle error case better.
606 *
607 * Revision 6.376  2003/01/22 19:41:20  camacho
608 * Added function to build multi-volume db list for creating alias files
609 *
610 * Revision 6.375  2003/01/07 17:18:15  camacho
611 * Remove warning message when file is not found by FindBlastDBFile
612 *
613 * Revision 6.374  2003/01/02 22:17:54  madden
614 * Print name of database when wrong version used
615 *
616 * Revision 6.373  2002/12/19 14:25:07  camacho
617 * Minor change
618 *
619 * Revision 6.372  2002/12/17 20:33:25  camacho
620 * Removed unnecessary function attribute
621 *
622 * Revision 6.371  2002/12/17 20:01:18  madden
623 * Fix for oidlist when no memory-mapping available
624 *
625 * Revision 6.370  2002/12/17 17:46:01  madden
626 * readdb_get_sequence_number does not check for oidlist
627 *
628 * Revision 6.369  2002/12/16 20:22:48  camacho
629 * Removed unused options in formatdb options structure
630 *
631 * Revision 6.368  2002/12/16 05:01:54  camacho
632 * Fixes to previous commit
633 *
634 * Revision 6.367  2002/12/13 16:01:25  kans
635 * fixed mac compiler complaints
636 *
637 * Revision 6.366  2002/12/13 13:43:22  camacho
638 * Changes to set links and membership bits in formatdb API
639 *
640 * Revision 6.365  2002/12/11 17:05:53  camacho
641 * Added code to handle mmap failures in MT mode
642 *
643 * Revision 6.364  2002/12/10 18:31:44  camacho
644 * Added taxonomy database loading when using ReadDBBioseqFetchEnable
645 *
646 * Revision 6.363  2002/11/27 20:06:01  camacho
647 * Fix to deal with non-parseable seqids in new database format
648 *
649 * Revision 6.362  2002/11/25 17:23:28  camacho
650 * 1) Changed file access to blast taxonomy databases: only 2 files are loaded
651 *    for an entire chain of rdfp's.
652 * 2) Fixed memory leak in FindBlastDBFile.
653 * 3) Protect NlmOpenMFILE against NULL argument.
654 *
655 * Revision 6.361  2002/11/12 20:42:02  camacho
656 * Fixed problem with long deflines in FDLCreateAsnDF
657 *
658 * Revision 6.360  2002/11/06 21:31:08  ucko
659 * Make sure MADV_NORMAL is actually defined before trying to use madvise.
660 *
661 * Revision 6.359  2002/11/04 16:44:08  camacho
662 * Prevent MT problems by loading BlastDefLine ASN module in readdb_new_internal
663 *
664 * Revision 6.358  2002/10/25 16:49:45  camacho
665 * Added Michael Kimelman's FDBAddSequence2
666 *
667 * Revision 6.357  2002/10/17 17:47:46  camacho
668 * Added longest sequence length to fastacmd -I option
669 *
670 * Revision 6.356  2002/10/03 14:13:43  camacho
671 * Added support for gilist field in alias file in multivolume databases
672 *
673 * Revision 6.355  2002/09/30 15:05:07  camacho
674 * Added check for zero-length sequences in FDBAddSequence
675 *
676 * Revision 6.354  2002/09/26 17:54:56  camacho
677 * Fix for using -t option with multiple databases
678 *
679 * Revision 6.353  2002/09/26 02:14:42  camacho
680 * Allow limiting the number of sequences per volume
681 *
682 * Revision 6.352  2002/09/25 20:14:20  camacho
683 * Fix for multivolume databases with non-parseable seqids
684 *
685 * Revision 6.351  2002/09/24 19:08:31  camacho
686 * Removed unnecessary loop around SeqId2OrdinalId in ReadDBBioseqFetchFunc
687 *
688 * Revision 6.350  2002/09/20 14:42:12  camacho
689 * Changed order of precedence when looking for index/alias files
690 *
691 * Revision 6.349  2002/08/21 17:51:47  camacho
692 * Added taxonomy id to Fastacmd_PrintTaxonomyInfo
693 *
694 * Revision 6.348  2002/07/30 15:28:49  camacho
695 * Added fastacmd function to parse SeqLocs
696 *
697 * Revision 6.347  2002/07/29 15:45:18  camacho
698 * Made readdb_get_taxnames a LIBCALL function
699 *
700 * Revision 6.346  2002/07/26 15:33:42  raytseli
701 * for async searches use the highest priority for all databases other than "est", use default priority for "est".
702 *
703 * Revision 6.345  2002/07/25 13:45:07  raytseli
704 * added a couple sanity checks.
705 * .
706 *
707 * Revision 6.344  2002/07/24 21:11:39  kans
708 * reverted ncbi URL
709 *
710 * Revision 6.343  2002/07/24 19:57:46  raytseli
711 * removed special provisions for preloading the est database, since all preloads are access-driven only.
712 * .
713 *
714 * Revision 6.342  2002/07/24 19:31:47  raytseli
715 * much simpler and more efficient approach to using madvise()
716 * .
717 *
718 * Revision 6.341  2002/07/23 16:50:04  kans
719 * changed www.ncbi.nlm.nih.gov to www.ncbi.nih.gov
720 *
721 * Revision 6.340  2002/07/22 18:34:34  raytseli
722 * run madvise() thread at the highest priority.
723 * .
724 *
725 * Revision 6.339  2002/07/22 13:06:42  raytseli
726 * explicitly allow setting of the advice type for madvise()
727 * .
728 *
729 * Revision 6.338  2002/07/19 19:59:48  raytseli
730 * madvise()-related refinements.
731 *
732 * Revision 6.337  2002/07/19 17:15:59  madden
733 * MemSet for MyFsa, use BioseqRawToFastaExtraEx again
734 *
735 * Revision 6.336  2002/07/19 13:35:58  raytseli
736 * decided that preloading index is not advantageous in some cases, -- removed for now.
737 *
738 * Revision 6.335  2002/07/18 18:49:14  madden
739 * Use BioseqRawToFastaExtra as BioseqRawToFastaExtraEx still has problems
740 *
741 * Revision 6.334  2002/07/18 15:54:26  raytseli
742 * added function to explicitly set madvise() block size, and madvise() sync mode.
743 *
744 * Revision 6.333  2002/07/18 15:01:52  raytseli
745 * correct problem with pointer format "%p" ErrPostEx() handling on linux.
746 * Add extern func to allow explicit madvise() functionality activation.
747 *
748 * Revision 6.332  2002/07/17 19:41:29  raytseli
749 * solaris exotics and other refinements.
750 * .
751 *
752 * Revision 6.331  2002/07/17 17:52:40  raytseli
753 * dealt with linux idiosyncrasies
754 * .
755 *
756 * Revision 6.328  2002/07/17 16:46:27  raytseli
757 * Exclude Windows from madvise()-related stuff, -- Provisional version
758 * to allow Win build.
759 *
760 * Revision 6.327  2002/07/17 15:46:03  raytseli
761 * temporarily disable madvise on linux
762 * .
763 *
764 * Revision 6.326  2002/07/17 15:20:50  raytseli
765 * use async madvise on linux, sync on solaris; other minor changes.
766 * .
767 *
768 * Revision 6.325  2002/07/17 14:36:54  raytseli
769 * incorporated madvise into readdb
770 * .
771 *
772 * Revision 6.324  2002/07/15 17:01:33  camacho
773 * Replaced call to snprintf with StringNCpy
774 *
775 * Revision 6.323  2002/07/14 21:02:08  camacho
776 * Added extra features to fastacmd
777 *
778 * Revision 6.322  2002/07/12 15:19:18  camacho
779 * Updated comment explaining order to search blast databases
780 *
781 * Revision 6.321  2002/07/11 18:37:40  camacho
782 * BLASTDB env. variable has higher precedence over .ncbirc file config value
783 *
784 * Revision 6.320  2002/07/09 16:41:52  camacho
785 * Made taxonomy databases multi-thread safe
786 *
787 * Revision 6.319  2002/07/07 20:43:45  camacho
788 * Pointer initialization in RDBGetTaxNames
789 *
790 * Revision 6.318  2002/06/26 00:45:37  camacho
791 *
792 * Added readdb_get_totals_ex2 to allow recalculation of database length as
793 * well as total number of sequences after the virtual oidlist has been
794 * created.
795 *
796 * Revision 6.317  2002/06/21 21:39:56  camacho
797 * Eliminated check for obsolete flag
798 *
799 * Revision 6.316  2002/06/18 18:06:27  dondosha
800 * Added comment to the readdb_get_sequence_number function
801 *
802 * Revision 6.315  2002/06/04 21:45:39  dondosha
803 * Corrected the readdb_get_sequence_number function in case of multiple-volume databases
804 *
805 * Revision 6.314  2002/06/04 20:22:56  camacho
806 * Fixed taxonomy databases to work w/o mmap
807 *
808 * Revision 6.313  2002/05/29 22:52:31  dondosha
809 * Removed debug printouts accidentally added in last change
810 *
811 * Revision 6.312  2002/05/29 22:50:58  dondosha
812 * Correction in readdb_get_sequence_number
813 *
814 * Revision 6.311  2002/05/15 20:23:46  camacho
815 * Added wgs_{mouse,anthrax} criteria functions
816 *
817 * Revision 6.310  2002/05/07 18:10:42  camacho
818 * Fixed memory leak in FDBAddSequence
819 *
820 * Revision 6.309  2002/05/02 21:58:42  camacho
821 * Removed fastacmd dependency on the common index
822 *
823 * Revision 6.308  2002/05/02 21:52:06  camacho
824 * Support for genmask's new month/subset mask combinations
825 *
826 * Revision 6.307  2002/04/26 16:31:36  camacho
827 * Byte order fix to BlastDBToFasta
828 *
829 * Revision 6.306  2002/04/24 22:26:42  dondosha
830 * First and last oid in alias files are one-offset
831 *
832 * Revision 6.305  2002/04/18 19:35:05  camacho
833 * 1. Added fdfilter/genmask callbacks for wgs subsets
834 * 2. Modified fdfilter/genmask refseq_protein callback function
835 * 3. Fixed problem in readdb_read_alias_file to read multiple oidlists
836 *
837 * Revision 6.304  2002/04/09 20:15:15  camacho
838 * Fixed FDBAddSequence to correctly handle the dump_info files when using volumes
839 *
840 * Revision 6.303  2002/03/26 15:32:50  camacho
841 * Allow space delimited GIs/accessions in fastacmd
842 *
843 * Revision 6.302  2002/03/18 17:58:17  camacho
844 * Added detailed error messages
845 *
846 * Revision 6.301  2002/03/08 16:58:50  camacho
847 * Added accessions to dump info files *.[pn]di
848 *
849 * Revision 6.300  2002/02/15 20:50:24  beloslyu
850 * fix from HP
851 *
852 * Revision 6.299  2002/01/31 21:29:50  camacho
853 * Fixed bug in readdb_get_asn1_defline
854 *
855 * Revision 6.298  2002/01/25 17:06:57  camacho
856 * Added new criteria to create new refseq databases
857 *
858 * Revision 6.297  2002/01/24 18:47:48  camacho
859 * Moved RDBTaxNamesFree from readdb.[ch] to txalign.[ch]
860 *
861 * Revision 6.296  2002/01/11 19:22:26  camacho
862 * 1. Added preferred_gi field to ReadDBFILE structure.
863 * 2. Modified FDReadDeflineAsn to return the preferred gi as the
864 *    first element of the list of BlastDefLine structures (if set).
865 *
866 * Revision 6.295  2002/01/10 20:55:37  camacho
867 * Modified OIDBelongsToMaskDB to accept a gi as a parameter
868 *
869 * Revision 6.294  2002/01/09 20:19:14  camacho
870 * Fix to previous commit
871 *
872 * Revision 6.293  2002/01/09 19:47:49  camacho
873 * Added call to SeqEntryLoad in readdb_new_internal
874 *
875 * Revision 6.292  2002/01/09 14:45:30  camacho
876 * Fixed some memory leaks, fix to BlastDBToFasta
877 *
878 * Revision 6.291  2001/12/19 21:14:24  camacho
879 * Guard against a bad pointer in readdb_get_taxonomy_names
880 *
881 * Revision 6.290  2001/12/18 13:01:51  camacho
882 * Added new flag -D to dump blast database in FASTA format
883 *
884 * Revision 6.289  2001/12/13 21:50:25  camacho
885 * Fixed little endian/big endian issue in RDBGetTaxNames
886 *
887 * Revision 6.288  2001/12/10 19:17:13  camacho
888 * Added option to allow fastacmd to use Ctrl-As as defline separators.
889 *
890 * Revision 6.287  2001/12/06 21:20:33  camacho
891 * 1. Enabled fastacmd to dump multiple mask databases.
892 * 2. Made genmask show progress if SHOW_PROGRESS is defined.
893 *
894 * Revision 6.286  2001/12/04 21:21:19  camacho
895 * Eliminated unnecessary condition in readdb_gi2seq
896 *
897 * Revision 6.285  2001/11/28 20:17:33  camacho
898 * Fixed trailing semicolon problem in PrintDbInformationWithRID
899 *
900 * Revision 6.284  2001/11/27 18:08:09  camacho
901 * 1. Corrected readdb_gi2seq to retrieve the correct oid's in the new
902 *    database format (FORMATDB_VER) even if the CommonIndex is present.
903 * 2. Updated a few conditionals.
904 *
905 * Revision 6.283  2001/11/19 22:18:03  camacho
906 * Fixed invocation to OIDBelongsToMaskDB to ensure that the right offset
907 * in the database is retrieved.
908 *
909 * Revision 6.282  2001/11/16 17:15:26  madden
910 * Fix for multi-volume searches
911 *
912 * Revision 6.281  2001/11/15 16:11:29  dondosha
913 * Changed genome view link from neptune to public page
914 *
915 * Revision 6.280  2001/11/14 17:29:07  camacho
916 * Fixed PrintDbInformationWithRID to print semicolons only
917 * when searching multiple databases.
918 *
919 * Revision 6.279  2001/11/13 20:32:56  dondosha
920 * Removed a tiny bit of garbage code
921 *
922 * Revision 6.278  2001/11/13 17:01:43  dondosha
923 * Correction of previous change
924 *
925 * Revision 6.277  2001/11/09 23:11:45  dondosha
926 * Correction for links from completed genomes databases to genome view
927 *
928 * Revision 6.276  2001/11/09 19:05:35  dondosha
929 * ReadDBFreeSharedInfo and ReadDBOpenMHdrAndSeqFiles made static in readdb.c
930 *
931 * Revision 6.275  2001/11/09 19:04:21  dondosha
932 * Check shared_info->nthreads for 0 outside of mutex to avoid huge number of mutex locks; check again once inside mutex
933 *
934 * Revision 6.274  2001/11/05 23:00:40  dondosha
935 * Put back changes from revision 6.270 that were accidentally removed
936 *
937 * Revision 6.273  2001/11/02 20:18:09  camacho
938 * Fixed a small memory leak in OIDListFree
939 *
940 * Revision 6.272  2001/11/02 19:56:56  camacho
941 * Corrected a source for memory leaks in OIDBelongsToMaskDB
942 *
943 * Revision 6.271  2001/11/02 19:45:16  camacho
944 * 1. Modified FDReadDeflineAsn to return the correct
945 * BlastDefLine structure when dealing with subset
946 * (mask) databases.
947 * 2. Added readdb_encode_subset_asn1_defline to
948 * add the BlastDefLine structure to the Bioseq
949 * as a UserObject (when dealing with subset db's).
950 * 3. Updated readdb_get_defline_ex, readdb_get_descriptor,
951 * and OIDBelongsToMaskDB to use the changes introduced
952 * above.
953 *
954 * Revision 6.270  2001/11/02 18:33:00  dondosha
955 * 1. Added function readdb_get_sequence_number (by position in database)
956 * 2. Added function PrintDbInformationWithRID for Microbial genomes page
957 *
958 * Revision 6.269  2001/10/19 13:46:50  camacho
959 * Added membership_bit field to ReadDBFILE structure for FORMATDB_VER subset
960 * databases.
961 * Added OIDBelongsToMaskDB and modified readdb_gi2seq and readdb_acc2fasta
962 * to return the proper sequence when dealing with a subset database in the FORMATDB_VER format.
963 * Updated readdb_read_alias_file to read the new MEMB_BIT field.
964 * Updated readdb_get_defline_ex and FDBuildOldStyleDefline to build the proper defline when dealing with a subset database in the FORMATDB_VER format.
965 *
966 * Revision 6.268  2001/10/01 18:44:22  camacho
967 * Added BlastDBToFasta function
968 * Added readdb_get_header_ex function
969 *
970 * Revision 6.267  2001/10/01 18:37:31  camacho
971 * readdb.h
972 *
973 * Revision 6.266  2001/09/28 14:28:37  madden
974 * Fixes for ambiguity problem for sequences longer than 16 million bps.
975 *
976 * Revision 6.265  2001/09/26 16:36:52  dondosha
977 * Previous fix still wrong - corrected
978 *
979 * Revision 6.264  2001/09/20 18:30:04  dondosha
980 * Correction to change in revision 6.262
981 *
982 * Revision 6.263  2001/08/29 21:12:59  dondosha
983 * Do not check ISAM indices for non-gi seqid if gifile is provided in rdfp
984 *
985 * Revision 6.262  2001/08/24 22:30:32  dondosha
986 * Correction for alias databases with dblists containing databases with and without OID lists
987 *
988 * Revision 6.261  2001/08/16 13:52:28  madden
989 * Reinit gi to zero for every try
990 *
991 * Revision 6.260  2001/08/08 13:13:57  madden
992 * Add third-party annotation IDs
993 *
994 * Revision 6.259  2001/08/02 20:13:28  madden
995 * Close sequence and header files for all non-used rdfps
996 *
997 * Revision 6.258  2001/08/02 17:55:00  madden
998 * Fix for length and non-mmapped file
999 *
1000 * Revision 6.257  2001/07/26 12:53:12  madden
1001 * Fix for non memory-mapped mode
1002 *
1003 * Revision 6.256  2001/07/16 20:25:07  madden
1004 * Do not init ISAM string indices until needed
1005 *
1006 * Revision 6.255  2001/07/12 19:27:30  madden
1007 * Increase volume by one in call to FD_CreateAliasFileEx
1008 *
1009 * Revision 6.254  2001/07/09 14:17:24  madden
1010 * Fix PC-lint complaints from R. Williams
1011 *
1012 * Revision 6.253  2001/07/06 13:59:02  madden
1013 * Fixed compiler and lint warnings
1014 *
1015 * Revision 6.252  2001/06/25 18:30:24  madden
1016 * Add define for NLM_GENERATED_CODE_PROTO to get prototypes in fdlobj.h
1017 *
1018 * Revision 6.251  2001/06/22 19:13:59  dondosha
1019 * Fixed a thread race condition
1020 *
1021 * Revision 6.250  2001/06/21 19:43:12  shavirin
1022 * Moved definitions related to Taxonomy names to txalign.h.
1023 *
1024 * Revision 6.249  2001/06/21 18:27:27  shavirin
1025 * Moved into files txalign.[c,h] functions returning taxonomy names
1026 * from Bioseq created from Blast database.
1027 *
1028 * Revision 6.248  2001/06/20 19:46:04  madden
1029 * Replace Int2 by Int4 for readdb_get_bioseq_ex
1030 *
1031 * Revision 6.247  2001/06/15 20:57:06  shavirin
1032 * Fixed problem when bsp->descr == NULL in the function readdb_get_bioseq_ex().
1033 *
1034 * Revision 6.246  2001/06/14 14:17:52  madden
1035 * Add FD_MakeAliasFile
1036 *
1037 * Revision 6.245  2001/06/12 18:50:56  shavirin
1038 * Fixed function FDReadDeflineAsn to get correct rdfp structure.
1039 *
1040 * Revision 6.244  2001/06/12 17:33:26  egorov
1041 * Print an error message if DI file could not be found
1042 *
1043 * Revision 6.243  2001/06/08 20:30:24  madden
1044 * Fix problem with not searching all databases in a list for identifier lookups
1045 *
1046 * Revision 6.242  2001/06/08 12:49:31  madden
1047 * Use gi if possible in readdb_seqid2fasta, make readdb_find_best_id static
1048 *
1049 * Revision 6.241  2001/06/04 16:20:20  shavirin
1050 * Fixed problem with retrieve of PDB accessions using fastacmd program.
1051 *
1052 * Revision 6.240  2001/05/29 16:03:41  shavirin
1053 * Adjusted return codes of the function FDBAddSequence().
1054 *
1055 * Revision 6.239  2001/05/21 15:27:18  dondosha
1056 * Change stat call to FileLength
1057 *
1058 * Revision 6.238  2001/05/17 20:21:46  dondosha
1059 * Do not add .00 extension when only one volume created
1060 *
1061 * Revision 6.237  2001/05/14 17:39:07  shavirin
1062 * Changes related to possibility to manipulate with BLAST databases with
1063 * ASN.1 structured deflines.
1064 *
1065 * Revision 6.236  2001/05/11 19:59:40  madden
1066 * Add gi_file_bin to FDOptions, oidlist and gifile to FD_CreateAliasFileEx
1067 *
1068 * Revision 6.235  2001/05/11 18:18:12  madden
1069 * Add error message if db_file is NULL
1070 *
1071 * Revision 6.234  2001/05/10 17:19:53  madden
1072 * Add number_seqs arg to FD_CreateAliasFileEx
1073 *
1074 * Revision 6.233  2001/05/08 21:58:27  shavirin
1075 * Added possibility to generate tax_id for every definition in Blast FASTA
1076 * definition set in ASN.1 structured definition lines.
1077 *
1078 * Revision 6.232  2001/05/02 16:22:05  dondosha
1079 * Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb
1080 *
1081 * Revision 6.231  2001/04/30 19:29:47  madden
1082 * Remove intermediate buffer in readdb_get_bioseq_ex
1083 *
1084 * Revision 6.230  2001/04/27 15:26:37  madden
1085 * Use RebuildDNA_4na rather than BSRebuildDNA_4na_core
1086 *
1087 * Revision 6.229  2001/04/27 15:18:29  madden
1088 * Use BSRebuildDNA_4na_core, remove unnecessary memset
1089 *
1090 * Revision 6.228  2001/04/23 17:08:52  madden
1091 * Do not delete gifile memory if readdb is only attached
1092 *
1093 * Revision 6.227  2001/04/19 14:41:08  madden
1094 * Fix for subset database deflines
1095 *
1096 * Revision 6.226  2001/04/16 20:42:59  madden
1097 * Fix readdb_adjust_local_id to only work on BL_ORD_ID
1098 *
1099 * Revision 6.225  2001/04/13 22:17:06  dondosha
1100 * Fixed formatdb bug if one of multiple FASTA file inputs is empty
1101 *
1102 * Revision 6.224  2001/04/11 21:00:52  dondosha
1103 * Made functions FD_CreateAliasFile(Ex) public
1104 *
1105 * Revision 6.223  2001/04/11 20:45:35  dondosha
1106 * Moved appending of .00 for the first volume to FormatDBInit function
1107 *
1108 * Revision 6.222  2001/04/11 20:14:40  dondosha
1109 * Processing of volumes moved to lower level
1110 *
1111 * Revision 6.221  2001/03/29 20:15:40  madden
1112 * Int4 to Uint4 where needed
1113 *
1114 * Revision 6.220  2001/03/27 21:16:02  dondosha
1115 * Allow FIRST_OID and LAST_OID parameters in alias database file
1116 *
1117 * Revision 6.219  2001/03/26 14:42:01  madden
1118 * Fix number warnings and two bugs found by PC compiler
1119 *
1120 * Revision 6.218  2001/03/23 17:23:54  madden
1121 * Move FDGetDeflineAsnFromBioseq to txalign.[ch]
1122 *
1123 * Revision 6.217  2001/03/21 22:14:21  shavirin
1124 * Fixed problem with using ASN.1 structured deflines in non-parse seq-id
1125 * database.
1126 *
1127 * Revision 6.216  2001/03/13 21:49:11  madden
1128 * Remove extra &
1129 *
1130 * Revision 6.215  2001/03/08 14:08:06  madden
1131 * Use ByteStorePtr PNTR rather than ByteStorePtr for User-field
1132 *
1133 * Revision 6.214  2001/02/21 14:53:40  madden
1134 * Protection against -1 gi
1135 *
1136 * Revision 6.213  2001/02/12 17:42:50  madden
1137 * Replace another OLD_INT4_DB_SIZE_TO_BE_REMOVED with check for FORMATDB_VER_TEXT
1138 *
1139 * Revision 6.212  2001/02/06 18:47:48  madden
1140 * replace OLD_UIN4_DB_LEN_TO_BE_REMOVED with version check
1141 *
1142 * Revision 6.211  2001/02/05 18:52:00  shavirin
1143 * Blast database size was changed from Uint4 to Uint8 - this corrected
1144 * invalidly printed database size for large databases.
1145 *
1146 * Revision 6.210  2001/01/06 21:21:27  kans
1147 * Mac compiler complained about return NULL for Int2 return value
1148 *
1149 * Revision 6.209  2001/01/05 16:37:53  egorov
1150 * 1. Initialize OffsetAllocated=1024
1151 * 2. Add more diagnostic messages
1152 *
1153 * Revision 6.208  2001/01/02 22:28:14  dondosha
1154 * Check for partial duplication of databases when a whole database and its part with oidlist are provided for search
1155 *
1156 * Revision 6.207  2000/12/15 21:47:35  shavirin
1157 * Added set of functions to encode taxonomy names information into
1158 * Bioseq and retrieval of specific information from it.
1159 *
1160 * Revision 6.206  2000/12/12 23:14:41  shavirin
1161 * Added functions to initialize taxonomy names database and search functions
1162 * to get all taxonomy names given tax_id using this database.
1163 *
1164 * Revision 6.205  2000/12/08 22:25:00  shavirin
1165 * Added code for creation Taxonomy lookup database using formatdb API.
1166 *
1167 * Revision 6.204  2000/11/22 20:51:12  shavirin
1168 * Added new parameter tax_id into function FDBAddBioseq() for creation
1169 * ASN.1 structured deflines in BLAST databases.
1170 *
1171 * Revision 6.203  2000/11/22 19:54:48  shavirin
1172 * Added creation of the special user object with ASN.1 structured deflines
1173 * in the function readdb_get_bioseq()
1174 *
1175 * Revision 6.202  2000/11/13 21:33:59  madden
1176 * Add warning for zero-length sequence
1177 *
1178 * Revision 6.201  2000/11/07 20:56:14  egorov
1179 * Few improvements  by Michael Kimelman
1180 *
1181 * Revision 6.200  2000/11/03 19:49:47  madden
1182 * Add final return value to FastaToBlastDb to silence compiler
1183 *
1184 * Revision 6.199  2000/11/03 15:46:04  madden
1185 * Save gifile from alias file for nucleotides
1186 *
1187 * Revision 6.198  2000/10/30 21:02:07  madden
1188 * Fix memory leak and FUM for formatdb
1189 *
1190 * Revision 6.197  2000/10/26 18:32:55  dondosha
1191 * Fill the gifile string from alias structure when creating ReadDBFILE
1192 *
1193 * Revision 6.196  2000/10/24 19:11:45  madden
1194 * Add function CheckForRecursion that checks all dbs in string, issues warning if recursion found
1195 *
1196 * Revision 6.195  2000/10/20 19:27:09  madden
1197 * Fix UMR (bdfp_head) in readdb_get_descriptor
1198 *
1199 * Revision 6.194  2000/10/13 17:31:51  shavirin
1200 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
1201 *
1202 * Revision 6.193  2000/10/13 16:05:43  shavirin
1203 * Fixed minor bug with reporting database name.
1204 *
1205 * Revision 6.192  2000/10/03 16:12:37  madden
1206 * Replace atol with sscanf for large numbers
1207 *
1208 * Revision 6.191  2000/09/29 16:38:28  shavirin
1209 * Added new function FDB_FreeCLOptions(FDB_optionsPtr options).
1210 *
1211 * Revision 6.190  2000/09/27 14:06:51  shavirin
1212 * Fixed minor bug in FormatDBInit() function.
1213 *
1214 * Revision 6.189  2000/09/25 20:39:32  dondosha
1215 * Call ReadDBCloseMHdrAndSeqFiles from readdb_destruct only when contents allocated
1216 *
1217 * Revision 6.188  2000/09/19 20:12:59  shavirin
1218 * Empty log message
1219 *
1220 * Revision 6.187  2000/09/19 20:10:27  shavirin
1221 * Attempt to fix NT bug related to improper defines generated by asntool.
1222 *
1223 * Revision 6.186  2000/09/18 01:15:50  shavirin
1224 * Changed definition BlastDefline -> BlastDefLine do not conflict with
1225 * Blast network definitions.
1226 *
1227 * Revision 6.185  2000/09/15 20:43:22  shavirin
1228 * Empty log message.
1229 *
1230 * Revision 6.184  2000/09/15 20:40:03  shavirin
1231 * Many changes to allow dump and retrieval of ASN.1 structured deflines.
1232 *
1233 * Revision 6.183  2000/09/07 20:49:57  shavirin
1234 * Added parameters to support ASN.1 defline dump for blast db. FORMATDB_VER 3->4
1235 * Added parameter FORMATDB_VER_TEXT for backward compatibility.
1236 *
1237 * Revision 6.182  2000/09/05 17:24:59  shavirin
1238 * Fixed problem with initialization of sparse_idx information.
1239 *
1240 * Revision 6.181  2000/09/01 18:28:12  dondosha
1241 * Call ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles from readdb_destruct
1242 *
1243 * Revision 6.180  2000/08/31 15:56:38  dondosha
1244 * Change allowing to pass rdfp from higher level to search
1245 *
1246 * Revision 6.179  2000/08/30 20:29:00  shavirin
1247 * Fixed GCC compiler warnings.
1248 *
1249 * Revision 6.178  2000/08/07 20:43:04  madden
1250 * Proper casting of int to long for printf
1251 *
1252 * Revision 6.177  2000/07/19 14:01:47  madden
1253 * Call CommonIndexDestruct if opening of CommonIndex does not succeed
1254 *
1255 * Revision 6.176  2000/07/18 19:29:28  shavirin
1256 * Added new parameter test_non_unique to suppress check for non-unique
1257 * strings ids in the database - default - TRUE.
1258 *
1259 * Revision 6.175  2000/06/30 18:20:30  madden
1260 * Elaborate on SORTFiles error message
1261 *
1262 * Revision 6.174  2000/06/30 16:40:11  madden
1263 * Changed error message if unable to initialize readdb
1264 *
1265 * Revision 6.173  2000/06/28 16:55:49  madden
1266 * Add function Fastacmd_Search_ex, gi_target to ReadDBFILEPtr
1267 *
1268 * Revision 6.172  2000/06/22 18:59:33  egorov
1269 * Allow absolute paths to databases in alias files.
1270 * The change is provided by Maxim Shemanarev (Informax Inc).
1271 *
1272 * Revision 6.171  2000/06/19 20:06:42  madden
1273 * Add ready Boolean to readdb_get_sequence_ex, for nucl. sequence the data is then in blastna format with sentinel bytes
1274 *
1275 * Revision 6.170  2000/06/19 16:53:21  madden
1276 * Remove unneeded memcpy
1277 *
1278 * Revision 6.169  2000/06/16 16:43:33  madden
1279 * Replace MemNew with Nlm_Malloc
1280 *
1281 * Revision 6.168  2000/06/08 19:02:26  madden
1282 * Return file-name if no title found
1283 *
1284 * Revision 6.167  2000/05/25 20:31:24  madden
1285 * Do not change aliasfilebit unless it is zero
1286 *
1287 * Revision 6.166  2000/05/23 21:22:37  dondosha
1288 * Do not open sequence files in shared_info when flag is set to not do it - correction to previous change
1289 *
1290 * Revision 6.165  2000/05/22 18:46:43  dondosha
1291 * Merged all Boolean members in ReadDBFILE structure into a single Int4
1292 *
1293 * Revision 6.164  2000/05/09 15:54:19  shavirin
1294 * Added function ReadDBBioseqSetDbGeneticCode().
1295 *
1296 * Revision 6.163  2000/05/03 17:41:21  madden
1297 * Fix for readdb_get_descriptor problem when searching subset database
1298 *
1299 * Revision 6.162  2000/05/03 16:19:01  dondosha
1300 * Added function FastaToBlastDB
1301 *
1302 * Revision 6.161  2000/05/03 12:49:45  madden
1303 * Do not add > if not first definition
1304 *
1305 * Revision 6.160  2000/05/01 20:01:11  madden
1306 * Protection against too large gis
1307 *
1308 * Revision 6.159  2000/04/19 17:59:23  madden
1309 * Move setting of start and stop, adjust of indices to end, do every time in case of recursive calls or multiple databases
1310 *
1311 * Revision 6.158  2000/04/14 21:16:36  madden
1312 * Fix for non-NULL aliasfilename
1313 *
1314 * Revision 6.157  2000/04/11 19:56:48  madden
1315 * Set aliasfilename even if oidlist does not exist
1316 *
1317 * Revision 6.156  2000/04/10 18:01:46  dondosha
1318 * Fixed FindBlastDBFile when file exists in current directory
1319 *
1320 * Revision 6.155  2000/04/05 19:25:09  madden
1321 * Check for NULL searchstr in Fastacmd_Search, allow line break to be a valid delimiter for a file that is read in
1322 *
1323 * Revision 6.154  2000/04/03 21:17:57  dondosha
1324 * readdb_MakeGiFileBinary will sort gis in increasing order
1325 *
1326 * Revision 6.153  2000/04/03 17:34:27  shavirin
1327 * Fixed case when indexed and regular databases are mixed in multiple
1328 * database set.
1329 *
1330 * Revision 6.152  2000/03/28 04:38:20  egorov
1331 * Bug seen on Malaria page fixed
1332 *
1333 * Revision 6.151  2000/03/24 14:36:43  egorov
1334 * Allow NULL alias_dbid on input of GI2OID
1335 *
1336 * Revision 6.150  2000/03/24 14:34:33  egorov
1337 * Add support for month.sts, month.pataa, month.patnt month subsets
1338 *
1339 * Revision 6.149  2000/03/20 22:03:34  egorov
1340 * bug with multiple alias databases mask is fixed
1341 *
1342 * Revision 6.148  2000/03/20 17:03:19  dondosha
1343 * Return NULL from readdb_get_link and readdb_get_bioseq if cannot mem-map files
1344 *
1345 * Revision 6.147  2000/03/20 14:36:54  egorov
1346 * Add protection from the reading out of the ISAM index file boundary when update CommonIndex.
1347 *
1348 * Revision 6.146  2000/03/16 19:47:18  egorov
1349 * Db mask should be Uint4, not Int2.  Also previous change about FreeOIDList is rolled back.
1350 *
1351 * Revision 6.145  2000/03/16 18:09:50  dondosha
1352 * Fixes memory leak in OIDListFree; corrects ReadDBCloseMHdrAndSeqFiles
1353 *
1354 * Revision 6.144  2000/03/15 21:34:30  egorov
1355 * 1. Fix bug with using alias databases.
1356 * 2. Initialize new_defline variable.
1357 *
1358 * Revision 6.143  2000/03/13 18:36:37  madden
1359 * Added insert_ctrlA Boolean to readdb_get_bioseq_ex
1360 *
1361 * Revision 6.142  2000/03/13 13:53:50  madden
1362 * Check for non-NULL rdfp before dereference
1363 *
1364 * Revision 6.141  2000/03/10 19:16:30  shavirin
1365 * Added multi-thread support for the function ReadDBBioseqFetchEnable().
1366 *
1367 * Revision 6.140  2000/03/10 18:51:33  madden
1368 * Add prototype for readdb_get_filebits
1369 *
1370 * Revision 6.139  2000/03/08 22:03:32  madden
1371 * added readdb_get_filebits
1372 *
1373 * Revision 6.138  2000/03/08 20:52:37  madden
1374 * readdb_get_bioseq_ex only returns gis for subset database
1375 *
1376 * Revision 6.137  2000/02/28 21:50:13  egorov
1377 * All month subsets use same criteria.
1378 *
1379 * Revision 6.136  2000/02/24 19:02:37  egorov
1380 * Add support for PDB subset of nr
1381 *
1382 * Revision 6.135  2000/02/16 18:39:59  madden
1383 * Fix check for nucl. alias file
1384 *
1385 * Revision 6.134  2000/02/11 19:59:29  shavirin
1386 * Increased nthreads when attaching to the rdfp structure.
1387 *
1388 * Revision 6.133  2000/02/09 19:35:51  madden
1389 * Added readdb_MakeGiFileBinary
1390 *
1391 * Revision 6.132  2000/02/07 21:15:15  madden
1392 * Issue warning before stripping zero gi
1393 *
1394 * Revision 6.131  2000/02/07 20:56:08  madden
1395 * Strip off gi|0 identifiers for formatdb
1396 *
1397 * Revision 6.130  2000/01/26 15:38:34  madden
1398 * Fix for fastacmd and alias files
1399 *
1400 * Revision 6.129  2000/01/26 15:19:59  madden
1401 * Return aliasfilename if present in readdb_get_filename
1402 *
1403 * Revision 6.128  2000/01/20 20:26:05  egorov
1404 * Use "est_" prefix for subsets 'human', 'mouse', and 'others'
1405 *
1406 * Revision 6.127  2000/01/20 18:57:24  madden
1407 * Check whether rdfp is NULL before dereference
1408 *
1409 * Revision 6.126  2000/01/12 21:51:50  madden
1410 * Check for oidlist before setting aliasfilename
1411 *
1412 * Revision 6.125  2000/01/12 21:46:35  dondosha
1413 * Fixed memory leak (rdfp->aliasfilename)
1414 *
1415 * Revision 6.124  2000/01/12 21:03:52  egorov
1416 * 1. Introduce Fastacmd API function - Fastacmd_Search
1417 * 2. Rearrange order of functions to have Fastacmd, ID1, and CommonIndex stuff separate.
1418 *
1419 * Revision 6.123  2000/01/12 20:28:31  dondosha
1420 * Fixed readdb_new_ex2 behavior with multiple volume database
1421 *
1422 * Revision 6.122  2000/01/12 18:06:03  egorov
1423 * Fix memory leak.  Remove debug stuff.
1424 *
1425 * Revision 6.121  2000/01/12 17:39:31  madden
1426 * Fix readdb_parse_db_names so done is TRUE on last db
1427 *
1428 * Revision 6.120  2000/01/11 15:32:46  dondosha
1429 * Fixed memory leaks in opening shared header and sequence file memory maps
1430 *
1431 * Revision 6.119  2000/01/07 16:00:25  madden
1432 * Alias db length is Int8 instead of Uint4
1433 *
1434 * Revision 6.118  1999/12/31 14:23:20  egorov
1435 * Add support for using mixture of real and mask databases with gi-list files:
1436 * 1. Change logic of creating rdfp list.
1437 * 2. BlastGetDbChunk gets real databases first, then masks.
1438 * 3. Proper calculation of database sizes using alias files.
1439 * 4. Change to CommonIndex to support using of mask databases.
1440 * 5. Use correct gis in formatted output (BlastGetAllowedGis()).
1441 * 6. Other small changes
1442 *
1443 * Revision 6.117  1999/12/29 13:46:42  madden
1444 * Fix for moving virtual rdfp to end, remove bad fix for infinite recursion
1445 *
1446 * Revision 6.116  1999/12/23 18:15:37  madden
1447 * Move mask databases to end of all databases
1448 *
1449 * Revision 6.115  1999/12/22 21:54:41  dondosha
1450 * Open header and sequence files consecutively as needed, close them when all threads have finished working with the database
1451 *
1452 * Revision 6.114  1999/12/21 20:02:16  egorov
1453 * Set proper 'start' and 'stop' values for mask's rdfp.
1454 * Add 'start' parameter into readdb_gi2seq.  This is return
1455 * value which is set to rdfp->start where given gi was found.
1456 *
1457 * Revision 6.113  1999/12/17 21:33:01  egorov
1458 * Add support for the 'month' subset.
1459 *
1460 * Revision 6.112  1999/12/17 20:47:05  egorov
1461 * Fix 'gcc -Wall' warnings
1462 *
1463 * Revision 6.111  1999/12/15 21:57:58  egorov
1464 * Initialize extra_bytes variable
1465 *
1466 * Revision 6.110  1999/12/15 17:40:07  egorov
1467 * 1. Fix bug with path to CommonIndexFile.
1468 * 2. Add ScanDIFile() function for scanning DI index file and perform
1469 *    callback-specified action for each record which meets database
1470 *    subset criteria.
1471 * 3. Change UpdateCommonIndexFile() function to use ScanDIFile.
1472 * 4. Criteria for est_others, est_human, est_mouse, swissprot added.
1473 *
1474 * Revision 6.109  1999/12/14 19:27:09  dondosha
1475 * Test against infinite recursion in IndexFileExists
1476 *
1477 * Revision 6.108  1999/11/30 17:07:15  egorov
1478 * Fix problem with parsing database names when file_path is not NULL.
1479 * Add prefix path prefix to OIDLIST and GILIST values, if any.
1480 *
1481 * Revision 6.107  1999/11/29 14:45:47  egorov
1482 * Bug fixed.
1483 *
1484 * Revision 6.106  1999/11/24 21:43:34  madden
1485 * Added Nlm_SwapUint4 call to make database masks work with both big and small endian systems
1486 *
1487 * Revision 6.105  1999/11/24 18:42:25  egorov
1488 * It was reported by Andrei Shkeda and observed by us in neighboring software
1489 * that using ReadDbFile structure was not MT-safe, and, as a result,
1490 * it was impossible to format seqalign from different threads.
1491 * Now it is fixed.  Mutexes shared same ISAM structures, so I had to put additional mutex.
1492 *
1493 * Revision 6.104  1999/11/24 18:01:38  egorov
1494 * Bug fixed:  it truncated full database name to just file name if BLASTDB was specified.
1495 * So it was impossible to have BLASTDB=/blast/db/blast and filename = "subdir/database".
1496 * Now it works and makes it possible to use subdirectories for organism-specific databases.
1497 *
1498 * Revision 6.103  1999/11/23 22:02:26  madden
1499 * Added readdb_get_totals_ex that may use alias file values
1500 *
1501 * Revision 6.102  1999/11/23 21:30:10  madden
1502 * Deallocate OID list
1503 *
1504 * Revision 6.101  1999/11/22 16:15:36  egorov
1505 * Remove correct return code in readdb_get_header function
1506 *
1507 * Revision 6.100  1999/11/15 17:42:48  egorov
1508 * Fix bug when CommonIndex finds wrong Gi if database is not the first
1509 * or the second in the CommonIndex list
1510 *
1511 * Revision 6.99  1999/11/12 14:15:54  madden
1512 * Allow NlmOpenMFILE to simply open a file if it cannot be memory-mapped, allow other initialization states in readdb_new_ex2
1513 *
1514 * Revision 6.98  1999/10/07 20:40:48  madden
1515 * Remove calls and function readdb_get_index
1516 *
1517 * Revision 6.97  1999/10/07 13:40:37  madden
1518 * remove extra call to Nlm_SwapUint4
1519 *
1520 * Revision 6.96  1999/10/06 21:08:36  shavirin
1521 * Cleared last bits in last byte written in function FDBAddSequence()
1522 * These bits may be dirty in case of ASN.1 coming directly from ID.
1523 *
1524 * Revision 6.95  1999/10/01 18:25:07  shavirin
1525 * Fixed bug in the function FDBAddSequence
1526 *
1527 * Revision 6.94  1999/09/30 20:48:24  madden
1528 * Change static buffer to dynamically allocated
1529 *
1530 * Revision 6.93  1999/09/29 17:20:34  shavirin
1531 * Fixed minor memory leak.
1532 *
1533 * Revision 6.92  1999/09/29 13:30:51  shavirin
1534 * Changed sequence of allocating/deleting of oidlist structure.
1535 *
1536 * Revision 6.91  1999/09/28 20:45:07  shavirin
1537 * Passed oidlist info when cloning rdfp in readdb_attach() function.
1538 *
1539 * Revision 6.90  1999/09/28 13:41:57  shavirin
1540 * Freed memory of OID list in readdb_destruct().
1541 *
1542 * Revision 6.89  1999/09/24 16:30:25  egorov
1543 * Remove Mac incompatible stuff.  Add two more functions for CommonIndex API.
1544 *
1545 * Revision 6.88  1999/09/23 18:22:30  egorov
1546 * Do not keep private copy of index arrays (sequence_index, header_index, ambchar_index),
1547 * but just use as it is in memory mapped file.  Big and small endian stuff is not forgotten.
1548 *
1549 * Revision 6.87  1999/09/23 15:17:24  egorov
1550 * Add CommonIndex API function - UpdateCommonIndexFile
1551 *
1552 * Revision 6.86  1999/09/23 15:10:52  egorov
1553 * Add new fields into OIDList structure.
1554 * Add new keywords into alias file: NSEQ and LENGTH.
1555 * Use Nlm_Malloc instead of MemNew where MemSet is not needed.
1556 * Create ReadOIDList function.
1557 *
1558 * Revision 6.85  1999/09/23 15:03:43  egorov
1559 * Close alias file;  change name of index file;  add comments
1560 *
1561 * Revision 6.84  1999/09/22 21:58:07  egorov
1562 * fix compilation bug
1563 *
1564 * Revision 6.83  1999/09/13 16:18:37  shavirin
1565 * Added function readdb_get_bioseq_ex, which has possibility
1566 * to bypass ObjMgr registration.
1567 *
1568 * Revision 6.82  1999/09/10 16:30:17  shavirin
1569 * Fixed problems with formatting proteins by formatdb
1570 *
1571 * Revision 6.81  1999/09/09 18:25:04  shavirin
1572 * Added functions to parse ASN.1 with formatdb
1573 *
1574 * Revision 6.80  1999/09/02 18:02:33  madden
1575 * No spaces after date
1576 *
1577 * Revision 6.79  1999/09/02 12:56:52  egorov
1578 * Change format of the BLAST index file to set proper alignment
1579 * for memory map.
1580 *
1581 * Revision 6.78  1999/08/30 18:21:29  shavirin
1582 * Temporary return of full dumping set in SeqidE2Index() function.
1583 *
1584 * Revision 6.77  1999/08/26 20:55:55  shavirin
1585 * Changed way to look for seqids.
1586 *
1587 * Revision 6.76  1999/08/26 14:12:50  shavirin
1588 * Reduced amount of information dumped for string indexes in regular case.
1589 *
1590 * Revision 6.75  1999/08/25 20:17:38  shavirin
1591 * Added option to create and retrieve from sparse indexes.
1592 *
1593 * Revision 6.74  1999/08/04 18:26:41  madden
1594 * Change databases in alias file for file path
1595 *
1596 * Revision 6.72  1999/08/03 19:21:44  shavirin
1597 * Changed to dynamically allocated memory in function readdb_read_alias_file()
1598 *
1599 * Revision 6.71  1999/08/02 13:36:01  shavirin
1600 * Rolled back last changes.
1601 *
1602 * Revision 6.69  1999/06/29 19:26:59  madden
1603 * Took SeqIdWrite out of loop for efficiency
1604 *
1605 * Revision 6.68  1999/06/10 20:53:22  egorov
1606 * Few changes to make it possible to perform multiple searches against different db's.
1607 *
1608 * Revision 6.67  1999/05/28 14:30:38  yaschenk
1609 * rolling back fixes of 6.63 by shavirin, since they lead to coredump
1610 *
1611 * Revision 6.66  1999/05/27 21:47:12  yaschenk
1612 * fix to the previous change
1613 *
1614 * Revision 6.65  1999/05/27 21:41:44  yaschenk
* dump_info file should be created for nucleotides
1616 *
1617 * Revision 6.64  1999/05/27 15:51:29  shavirin
1618 * Added function readdb_get_defline ()
1619 *
1620 * Revision 6.63  1999/05/27 14:40:17  shavirin
1621 * Fixed some memory leaks.
1622 *
1623 * Revision 6.62  1999/05/21 17:36:52  madden
1624 * Minor efficiencies
1625 *
1626 * Revision 6.61  1999/05/18 20:35:30  madden
1627 * Changes to read an alias file for multiple db searches and ordinal ID lists
1628 *
1629 * Revision 6.60  1999/05/17 15:28:30  egorov
1630 * First check that gi belongs to correct database and only then do all CommonIndex stuff
1631 *
1632 * Revision 6.59  1999/05/13 19:31:13  shavirin
1633 * More changes toward dump from ID.
1634 *
1635 * Revision 6.58  1999/05/12 15:48:33  shavirin
1636 * Many changes to fit new dump from ID.
1637 *
1638 * Revision 6.57  1999/05/10 13:47:44  madden
1639 * NULL database not a fatal error
1640 *
1641 * Revision 6.56  1999/05/04 13:12:19  egorov
1642 * Declare parse* functions as static and remove unused argument
1643 *
1644 * Revision 6.55  1999/05/03 21:44:33  chappey
1645 * getline is now static function
1646 *
1647 * Revision 6.54  1999/04/27 17:28:17  shavirin
1648 * Fixed few problems in the function FDBAddSequence().
1649 *
1650 * Revision 6.53  1999/04/26 14:55:23  shavirin
1651 * Checked variable for not NULL.
1652 *
1653 * Revision 6.52  1999/04/26 14:36:04  shavirin
1654 * Added ability to dump statistics.
1655 *
1656 * Revision 6.50  1999/04/21 22:59:41  kans
1657 * added includes
1658 *
1659 * Revision 6.49  1999/04/21 21:43:28  shavirin
1660 * Added set of functions, which used in "formatdb".
1661 *
1662 * Revision 6.48  1999/04/14 14:53:49  madden
1663 * Correction for databases over 2 Gig
1664 *
1665 * Revision 6.47  1999/03/23 14:38:28  egorov
1666 * Destruct CommonIndex structures only by thread it belongs to.
1667 *
1668 * Revision 6.46  1999/03/19 19:29:47  egorov
1669 * Bug fixed.  Initialize cih.
1670 *
1671 * Revision 6.45  1999/03/18 16:55:22  egorov
* Previous fix was incomplete.
1673 *
1674 * Revision 6.44  1999/03/18 16:36:16  egorov
1675 * Check if rdfp is not NULL before dereferencing it.
1676 *
1677 * Revision 6.43  1999/03/17 16:57:21  egorov
* Previously each element in rdfp list had its own CommonIndexHeadPtr
* initialized with MemMap.  But when we search against many databases,
* as in the case of unfinished genomes, we hit the limit for doing MemMap
* on SGI machines.  So now we initialize 'rdfp->cih' only for the first
* element in the list and reuse it for the others.
* The change also includes proper freeing of memory after the above change.
1684 *
1685 * Revision 6.42  1999/03/12 23:02:49  madden
1686 * initialize memory in buffer_2na first
1687 *
1688 * Revision 6.41  1999/03/12 18:36:16  madden
1689 * formatting fix
1690 *
1691 * Revision 6.40  1999/02/22 21:49:08  egorov
* Optimize GIs2OIDs using already initialized ISAM indices from rdfp.  Use SwapUint4 function to use common index file
* on Solaris/Intel machines
1694 *
1695 * Revision 6.39  1999/02/18 21:19:12  madden
1696 * ignore GIs not in common index
1697 *
1698 * Revision 6.38  1999/02/17 13:23:40  madden
1699 * use MapNa2ByteToNa4String
1700 *
1701 * Revision 6.37  1999/01/07 14:35:01  madden
1702 * Fix for readdb_acc2fasta for multiple databases
1703 *
1704 * Revision 6.36  1998/12/14 21:50:15  egorov
* new max gi number member in CommonIndexHead structure and therefore no need for COMMON_INDEX_TABLE_SIZE
1706 *
1707 * Revision 6.35  1998/09/24 15:26:41  egorov
1708 * Fix lint complaints
1709 *
1710 * Revision 6.34  1998/09/14 15:11:20  egorov
1711 * Add support for Int8 length databases; remove unused variables
1712 *
1713 * Revision 6.33  1998/09/03 18:43:09  egorov
1714 * Close db config file
1715 *
1716 * Revision 6.32  1998/08/29 20:05:47  madden
1717 * Fixed MemCpy length problem
1718 *
1719 * Revision 6.31  1998/08/24 14:59:56  madden
1720 * readdb_get_sequence_ex function
1721 *
1722 * Revision 6.30  1998/07/31 19:30:11  egorov
1723 * Fix bug when OID=0 treated as bad in common index
1724 *
1725 * Revision 6.29  1998/07/09 13:35:16  egorov
1726 * remove platform dependent statement
1727 *
1728 * Revision 6.28  1998/07/08 14:10:53  madden
1729 * Fix for multiple db search, use of more efficient readdb_new_ex
1730 *
1731 * Revision 6.27  1998/07/01 16:45:25  egorov
* Remove debug messages
1733 *
1734 * Revision 6.26  1998/07/01 14:14:49  egorov
1735 * Move FilePathFind function into ncbitoolkit remove its definition here
1736 *
1737 * Revision 6.25  1998/07/01 14:03:04  egorov
1738 * Fix bug with a thread freeing CommonIndex: add new flag to rdfp
1739 *
1740 * Revision 6.24  1998/06/26 16:51:13  egorov
1741 * Fix CommonIndex bugs
1742 *
1743 * Revision 6.23  1998/06/24 21:03:35  egorov
1744 * Remove memory leaks
1745 *
1746 * Revision 6.20  1998/05/22 20:19:53  madden
1747 * Changes to fix multi-db search bug
1748 *
1749 * Revision 6.19  1998/02/26 22:49:23  kans
1750 * needed to include ffprint.h
1751 *
1752 * Revision 6.18  1998/02/26 22:34:21  madden
1753 * Changes for 16 bit windows
1754 *
1755 * Revision 6.17  1998/01/16 22:02:03  madden
1756 * Added readdb_new_ex with init_indices Boolean to allow faster retrieval of one sequence
1757 *
1758 * Revision 6.16  1997/12/12 20:39:25  madden
1759 * Added parens for if
1760 *
1761 * Revision 6.15  1997/12/11 22:21:05  madden
1762 * Removed unused variables
1763 *
1764 * Revision 6.14  1997/12/03 21:48:01  madden
1765 * Check for duplicate database names
1766 *
1767 * Revision 6.13  1997/12/02 22:18:09  madden
1768 * Fixed UMR
1769 *
1770 * Revision 6.12  1997/11/26 22:48:35  madden
1771 * Added readdb_parse_db_names for multiple db searches
1772 *
1773 * Revision 6.11  1997/11/07 16:16:14  shavirin
1774 * Added new function readdb_acc2fastaEx(), that retrieve array of hits
1775 *
1776 * Revision 6.10  1997/11/07 14:44:53  madden
1777 * Sped up start up
1778 *
1779 * Revision 6.9  1997/11/06 21:27:19  madden
1780 * Speeded up initialization
1781 *
1782 * Revision 6.8  1997/10/30 18:16:12  madden
1783 * Change to readdb_acc2fasta to allow lookups by accession strings
1784 *
1785 * Revision 6.7  1997/10/24 19:08:13  madden
1786 * Added ReadDBGetDb and ReadDBGetDbId
1787 *
1788 * Revision 6.6  1997/10/24 14:10:30  madden
1789 * Changed Fetch function to speed up retrieval of cached sequences
1790 *
1791 * Revision 6.5  1997/09/24 22:37:03  madden
1792 * Added readdb_destruct_element
1793 *
1794 * Revision 6.4  1997/09/16 16:31:36  madden
1795 * More changes for multiple db runs
1796 *
1797 * Revision 6.3  1997/09/12 19:55:35  madden
1798 * Added readdb_compare
1799 *
1800 * Revision 6.2  1997/09/11 18:49:37  madden
1801 * Changes to enable searches against multiple databases.
1802 *
1803 * Revision 6.1  1997/08/27 14:46:56  madden
1804 * Changes to enable multiple DB searches
1805 *
1806 * Revision 6.0  1997/08/25 18:53:55  madden
1807 * Revision changed to 6.0
1808 *
1809 * Revision 1.52  1997/07/14 20:11:21  madden
1810 * Removed unused variables
1811 *
1812 * Revision 1.51  1997/06/26 20:32:55  madden
1813 * Only convert sequence if ambig. chars
1814 *
1815 * Revision 1.50  1997/05/20 14:33:32  shavirin
* Fixed retrieval by LOCUS in function readdb_acc2fasta()
1817 *
1818 * Revision 1.49  1997/05/19 21:14:56  shavirin
1819 * Changed function readdb_acc2fasta() as required by E2Index() functions
1820 * family
1821 *
1822 * Revision 1.48  1997/05/16 13:50:42  madden
1823 * Fixed bug, wrong type of database opened
1824 *
1825 * Revision 1.47  1997/05/12 21:33:57  madden
1826 * readdb_new allows indeterminate database type
1827 *
1828 * Revision 1.46  1997/05/12 21:10:31  shavirin
1829 * Added new function readdb_acc2fasta()
1830 *
1831 * Revision 1.44  1997/05/07 21:03:11  madden
1832 * Added function SeqId2OrdinalId
1833 *
1834 * Revision 1.43  1997/05/01 17:27:31  shavirin
1835 * Added new function readdb_seqid2fasta()
1836 *
1837  * Revision 1.42  1997/03/31  17:06:40  shavirin
1838  * Changed function readdb_get_bioseq to use BSRebuildDNA_4na()
1839  * function.
1840  *
1841  * Revision 1.41  1997/03/26  14:01:34  madden
1842  * Changes to Fetch function to allow cached-out structures to be read back in.
1843  *
1844  * Revision 1.40  1997/03/05  18:24:17  madden
1845  * Fixed MT problem introduced with use of ISAM code.
1846  *
1847  * Revision 1.39  1997/02/26  23:39:54  madden
1848  * Removed unused variables.
1849  *
1850  * Revision 1.38  1997/02/26  20:37:31  madden
1851  * Added protection against MT use to fetch function.
1852  *
1853  * Revision 1.37  1997/02/25  23:52:05  madden
1854  * Added readdb_gi2seq call to ReadDBBioseqFetchFunc.
1855  *
1856  * Revision 1.36  1997/02/25  22:15:33  shavirin
1857  * Changes in accordance to ISAM API changes
1858  *
1859  * Revision 1.35  1997/02/25  16:28:05  shavirin
 * Added function readdb_gi2seq() - returns sequence number from gi
1861  *
1862  * Revision 1.34  1997/02/14  17:17:59  madden
1863  * Checked for NULL return from MemNew.
1864  *
1865  * Revision 1.33  1997/02/07  22:32:40  madden
1866  * Fixed bug.
1867  *
1868  * Revision 1.32  1997/01/14  23:11:27  madden
1869  * Cleaned ctrl-A's out of defline in readdb_get_bioseq.
1870  *
1871  * Revision 1.31  1996/12/20  00:30:20  madden
1872  * Protected ambiguity data against big/little endian changes.
1873  *
1874  * Revision 1.30  1996/12/19  16:29:56  madden
1875  * Changes to eliminate ".nac" file for nucl.
1876  *
1877  * Revision 1.29  1996/12/17  21:34:46  madden
 * Changes to allow deflines for individual entries to be retrieved.
1879  *
1880  * Revision 1.28  1996/12/11  18:42:36  madden
1881  * Added BioseqFetch functions.
1882  *
1883  * Revision 1.27  1996/12/11  17:59:42  madden
1884  * Fixed purify leaks.
1885  *
1886  * Revision 1.26  1996/12/08  15:19:59  madden
1887  * Checked for NULL pointer.
1888  *
1889  * Revision 1.25  1996/11/27  16:39:11  madden
1890  * Added functions to return filename and date.
1891  *
1892  * Revision 1.24  1996/11/26  19:54:27  madden
1893  * Added check for database in standard places.
1894  *
1895  * Revision 1.23  1996/11/22  19:05:48  madden
1896  * removed ifdef for OLD_BIT_ORDER.
1897  *
1898  * Revision 1.22  1996/11/18  17:28:13  madden
1899  * properly set contents_allocated flag for ambig. char. in readdb_attach.
1900  *
1901  * Revision 1.21  1996/11/08  21:45:03  madden
1902  * Removed function readdb_get_partial_unpacked_sequence.
1903  *
1904  * Revision 1.20  1996/11/07  22:31:15  madden
1905  * Added function readdb_ambchar_present to check for the presence
1906  * of ambig. characters in a db sequence.
1907  *
1908  * Revision 1.19  1996/11/04  18:48:53  shavirin
1909  * Added possibility to reconstruct Nucleotide sequence using function
1910  * readdb_get_bioseq. Added new function readdb_get_ambchar() to retrieve
1911  * ambiguity information.
1912  *
1913  * Revision 1.18  1996/10/31  16:29:18  shavirin
 * Multiple changes due to reversal of residue order in BLAST database
 * for nucleotide sequences from (4321) to (1234)
 * New dumper now required to create BLAST databases.
1917  *
1918  * Revision 1.17  1996/09/27  19:12:17  madden
1919  * Added function readdb_get_bioseq to obtain a BioseqPtr from the BLAST databases.
1920  *
1921  * Revision 1.16  1996/09/26  20:18:43  madden
1922  * Saved filename.
1923  *
1924  * Revision 1.15  1996/09/23  17:36:20  madden
1925  * Removed unused variable.
1926  *
1927  * Revision 1.14  1996/09/23  14:37:35  madden
1928  * Replaced CharPtr (for sequence) with Uint1Ptr.
1929  *
1930  * Revision 1.13  1996/09/20  21:58:14  madden
1931  * Changed CharPtr's to Uint1Ptr, got remainder length out of top order bits.
1932  *
1933  * Revision 1.12  1996/09/16  13:48:51  madden
1934  * Removed extra increment of counter in readdb_get_partial_unpacked_sequence.
1935  *
1936  * Revision 1.11  1996/09/15  17:35:48  madden
1937  * readdb_get_partial_unpacked_sequence now packages ncbi4na properly.
1938  *
1939  * Revision 1.10  1996/09/13  18:55:04  madden
1940  * Added function readdb_get_partial_unpacked_sequence.
1941  *
1942  * Revision 1.9  1996/09/11  21:31:11  shavirin
1943  * Added check for NULL from function Nlm_MemMapInit(name)
1944  *
1945  * Revision 1.8  1996/08/29  20:42:01  madden
1946  * memory mapping moved to the corelib (in ncbimem.[ch]).
1947  *
1948  * Revision 1.7  1996/08/23  15:32:02  shavirin
1949  * Fixed a lot of NT compiler warnings about type mismatch
1950  *
1951  * Revision 1.6  1996/08/21  21:25:25  madden
1952  * Changes for reading nt. db's.
1953  *
1954  * Revision 1.5  1996/08/14  14:31:28  madden
1955  * Added efficiencies in readdb_get_sequence_length.
1956  *
1957  * Revision 1.4  1996/08/13  22:04:36  madden
1958  * Changed readdb_get_sequence to report the uncompressed length of
1959  * a nucl. sequence.
1960  *
1961  * Revision 1.3  1996/08/08  21:39:48  madden
1962  * Added code to read in nucleotide databases.
1963  *
1964  * Revision 1.2  1996/08/07  18:32:05  madden
1965  * Moved define of MMAP_AVAIL from readdb.h to readdb.c
1966  *
1967  * Revision 1.1  1996/08/05  19:48:21  madden
1968  * Initial revision
1969  *
1970  * Revision 1.14  1996/08/02  14:20:06  madden
1971  * Added readdb_attach function.
1972  *
1973  * Revision 1.13  1996/07/31  13:09:17  madden
1974  * Changes for partial copy of ReadDB structure.
1975  *
1976  * Revision 1.12  1996/07/29  19:43:35  madden
1977  * Changes to make BLAST big/little endian independent.
1978  *
1979  * Revision 1.11  1996/07/25  20:45:20  madden
1980  * Change to arguments of readdb_get_sequence.
1981  *
1982  * Revision 1.10  1996/07/25  12:56:15  madden
1983  * readdb_get_sequence changed to allow for systems w/o mmap.
1984  *
1985  * Revision 1.9  1996/06/20  16:16:36  madden
1986  * Replaced int's with Int4's.
1987  *
1988  * Revision 1.8  1996/06/07  15:05:21  madden
1989  * MemCpy used instead of a while loop.
1990  *
1991  * Revision 1.7  1996/05/16  21:07:33  madden
1992  * Added protections against missing input files.
1993  *
1994  * Revision 1.6  1996/05/16  19:50:15  madden
1995  * Added documentation block.
1996  *
1997  * Revision 1.5  1996/04/22  21:41:13  madden
1998  * memory mapping added.
1999  *
2000  * Revision 1.4  1996/04/11  14:30:06  madden
2001  * Memory-mapping added.
2002  *
2003  * Revision 1.3  1996/03/29  21:28:30  madden
2004  * Added function readdb_get_sequence_length.
2005  *
2006  * Revision 1.2  1996/03/28  20:42:36  madden
2007  * Added functions readdb_get_title, readdb_is_prot and
2008  * readdb_get_formatdb_version.
2009  *
2010  * Revision 1.1  1996/03/26  19:38:08  madden
2011  * Initial revision
2012  *
2013  *
2014 */
2015 
2016 
2017 /* Description of conception:
2018 
2019  * BLAST uses the concept of a virtual database, meaning that
2020  * we may have a few databases searched together as one.  For
2021  * a virtual database BLAST numbers the sequence from zero to
2022  * the total number in all databases minus one (the numbers
2023  * are called ordinal ID's or OID's):
2024  *
2025  * 0 <= OID < total in all db's
2026  *
2027  * Readdb is aware of these virtual OID's and handles them properly.
2028  * The situation has grown rather confused as we also allow
2029  * the specification of a gilist (that determines a subset of the
2030  * sequences in the database to be searched) as well as the recent
2031  * addition of the 'mask' database (which specifies that a subset of
2032  * a real database is to be searched, e.g., est_human is a mask and est is
2033  * the real database).  To clarify the situation Alexey and I have written
2034  * down some rules that describe how virtual databases should be used.
2035  *
2036  * 1.) Ordinal ID (OID) numbering is from zero to total-1, where total
2037  * is the total number of sequences in all databases in the virtual
2038  * database.
2039  *
2040  * 2.) OID's of 'mask' databases refer to the real (i.e., underlying)
2041  * database - not the mask.
2042  *
2043  * 3.) If a gilist is used, then one virtual mask is used for all
2044  * databases - regardless of whether any database being searched is real.
2045  *
2046  * 4.) If there is a mixture of real and mask databases and no gilist
2047  * is being used, then the mask databases should go to the end of the
2048  * virtual database and one virtual mask will be created for this subsection
2049  * of the virtual database.  (readdb_new_ex2 will be changed to move
2050  * the 'mask' databases to the end).
2051 */
2052 
/* If x is non-NULL, pass it to FileClose() and then reset x to NULL.
   x is evaluated more than once and must be an assignable expression.
   The macro expands to a bare `if` statement, so do not use it as the
   unbraced body of an if/else. */
#define FILECLOSE(x) \
    if (x) { FileClose(x); x = NULL; }
/* Same pattern as FILECLOSE, but the handle is released with AsnIoClose(). */
#define ASNIOCLOSE(x) \
    if (x) { AsnIoClose(x); x = NULL; }
2055 
2056 #define NLM_GENERATED_CODE_PROTO
2057 #include <readdb.h>
2058 #include <ncbithr.h>
2059 #include <ffprint.h>
2060 #include <ncbisami.h>
2061 #include <blast.h>
2062 #include <ncbisort.h>
2063 #include <tofasta.h>
2064 #include <assert.h>
2065 #include <errno.h>
2066 #include <txalign.h>
2067 #include <sqnutils.h>
2068 #include <blfmtutl.h>
2069 #ifdef FDB_TAXONOMYDB
2070 #include <taxblast.h>
2071 #endif
2072 
2073 #ifdef __GLIBC__ /* not just __linux */
2074 #ifndef __USE_BSD
2075 #define __USE_BSD
2076 #endif
2077 #include <fcntl.h>
2078 #include <sys/types.h>
2079 #include <sys/mman.h>
2080 #endif
2081 
/* Constants used by the fetch functions. */
#define READDB_BUF_SIZE   255

/* presumably the values stored in ReadDBFetchStruct.ReadDBFetchState --
   TODO confirm against the fetch functions */
#define READDBBF_INIT     0
#define READDBBF_READY    1
#define READDBBF_DISABLE  2
2087 
#if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)

/* If MADV_NORMAL is not defined, drop HAVE_MADVISE so that none of the
   madvise-related code guarded by it is compiled. */
#ifndef MADV_NORMAL
#undef HAVE_MADVISE
#endif

#ifdef  HAVE_MADVISE

/* Default size of preload (in sequences) in a single madvise operation. */
#define MADVISE_SEQ_PRELOAD 1024

/* By default use async madvise for Linux, sync for Solaris, etc. */
#ifdef OS_UNIX_LINUX
#define MADVISE_SYNC_MODE FALSE
#else
#define MADVISE_SYNC_MODE TRUE
#endif

/* Flag enabling madvise functionality (off by default). */
static Boolean useMadvise = FALSE;

/* Advice to use. */
static EMemMapAdvise mmapAdvice = eMMA_Normal;

/* Flag that determines whether madvise is sync or async. */
static Boolean madviseSyncMode = MADVISE_SYNC_MODE;

/* Size of the block preloaded (in sequences) in a single madvise operation. */
static Int4 madvisePreloadBlock = MADVISE_SEQ_PRELOAD;

#endif /* HAVE_MADVISE */
#endif /* OS_UNIX_SOL || OS_UNIX_LINUX || __GLIBC__ */
2120 
2121 #if defined(OS_UNIX_SOL)
2122 #include <sys/int_types.h>
2123 #elif defined(OS_UNIX_LINUX) || defined(__GLIBC__)
2124 #include <stdint.h>
2125 #endif
2126 typedef struct readdbbioseqfetch {
2127     struct readdbbioseqfetch PNTR next;
2128     Uint1 ReadDBFetchState;
2129     CharPtr dbname;    /* Name of the database. */
2130     Uint2 ctr;
2131     Boolean is_prot; /* Is it a protein or not. */
2132     ReadDBFILEPtr rdfp;
2133     Int4 db_genetic_code;
2134     TNlmThread    thread_id;
2135 } ReadDBFetchStruct, PNTR ReadDBFetchStructPtr;
2136 
2137 typedef struct readdbfetchuserdata {
2138     Int4 ordinal_number;    /* ordinal number of db sequence. */
2139     Int2 db_id;        /* database ID, for multiple databases. */
2140 } ReadDBFetchUserData, PNTR ReadDBFetchUserDataPtr;
2141 
2142 static Int2 LIBCALLBACK ReadDBBioseqFetchFunc PROTO((Pointer data));
2143 static ReadDBFILEPtr ReadDBFILENew(void);
2144 static Boolean FormatDbUint8Write(Uint8 value, FILE *fp);
2145 static Int8 FormatDbUint8Read(NlmMFILEPtr mfp);
2146 static ValNodePtr readdb_encode_subset_asn1_defline(ReadDBFILEPtr, Int4);
2147 static ValNodePtr IntValNodeCopy(ValNodePtr vnp);
2148 static int LIBCALLBACK ID_Compare(VoidPtr i, VoidPtr j);
2149 static ReadDBFILEPtr readdb_merge_gifiles (ReadDBFILEPtr rdfp_chain);
2150 static Boolean s_IsTextFile(const char* filename);
2151 
2152 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
2153 #ifdef  HAVE_MADVISE
2154 static void readdb_preload_index (ReadDBFILEPtr rdfp, Int4 first_db_seq,
2155 				Int4 final_db_seq, EMemMapAdvise advice, Boolean sync);
2156 static void readdb_preload_data (ReadDBFILEPtr rdfp, Int4 first_db_seq,
2157 				Int4 final_db_seq, EMemMapAdvise advice, Boolean sync);
2158 static void readdb_preload_file ( NlmMFILEPtr mFilePtr, Int4 nPages,
2159 				EMemMapAdvise advice, Boolean sync, EThreadPriority pri);
2160 static void readdb_madvise (void * mp, size_t len,
2161                 EMemMapAdvise advice, Boolean sync, EThreadPriority pri);
2162 #endif /* HAVE_MADVISE */
2163 #endif /* SOL || LINUX */
2164 
2165 static TNlmMutex isamsearch_mutex;    /* Mutex to regulate using ISAM;
2166                      rdfp->isam is common for all threads */
2167 static TNlmMutex hdrseq_mutex;
2168 
2169 /* Common index global variables */
2170 Boolean    isCommonIndex = FALSE;   /* deprecated 05/21/2003 */
2171 
2172 /* Global to load the taxonomy databases only once per readdb_new invocation */
2173 static Boolean taxonomyDbLoaded = FALSE;
2174 
2175 const Uint4 kFDBMaxNumVolumes = 100;
2176 
2177 /**************************************************************************
2178 *
2179 *    Functions to perform memory mapping.
2180 *
2181 *    If memory mapping is not available, then these functions should
2182 *    default to normal FILE pointers.
2183 *
*    This is allowed with "read-only" files right now.
2185 *
2186 **************************************************************************/
2187 
2188 /*
2189     Initialize the memory-mapping.
2190 */
2191 NlmMFILEPtr LIBCALL
NlmOpenMFILE(CharPtr name)2192 NlmOpenMFILE (CharPtr name)
2193 
2194 {
2195     NlmMFILEPtr mfp;
2196 
2197     if (!name || name[0] == '\0')
2198         return NULL;
2199 
2200     if ((mfp=(NlmMFILEPtr) MemNew(sizeof(NlmMFILE))) == NULL)
2201         return NULL;
2202 
2203     /* Default is FALSE. */
2204     mfp->mfile_true = FALSE;
2205 
2206     mfp->mmp_begin = NULL;
2207 
2208     if (Nlm_MemMapAvailable() == TRUE)
2209     {     /* IF mem-map fails, open as a regular file. */
2210                 if((mfp->mem_mapp = Nlm_MemMapInit(name)) != NULL)
2211         { /* copy this pointer to where it's convenient. */
2212             mfp->mmp_madvise_end = mfp->mmp_begin = mfp->mmp = (Uint1Ptr) mfp->mem_mapp->mmp_begin;
2213             if (mfp->mmp_begin != NULL)
2214             {
2215                 mfp->mfile_true = TRUE;
2216                 mfp->mmp_end = mfp->mmp_begin + mfp->mem_mapp->file_size;
2217             }
2218         }
2219     }
2220 
2221     if (mfp->mmp_begin == NULL)
2222     {
2223         mfp->fp = FileOpen(name, "rb");
2224         if (mfp->fp == NULL)
2225         {
2226             mfp = (NlmMFILEPtr) MemFree(mfp);
2227             return NULL;
2228         }
2229     }
2230 
2231     /* contents have been allocated. */
2232     mfp->contents_allocated = TRUE;
2233 
2234     return mfp;
2235 
2236 }    /* NlmOpenMFILE */
2237 
2238 /*
2239   Open the shared sequence and header files for memory mapping, if this hasn't
2240   already been done; duplicate this in headerfp and sequencefp
2241 */
ReadDBOpenMHdrAndSeqFiles(ReadDBFILEPtr rdfp)2242 static Boolean ReadDBOpenMHdrAndSeqFiles(ReadDBFILEPtr rdfp)
2243 {
2244    Char buffer[PATH_MAX];
2245    Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
2246 
2247    /* The check for nthreads == 0 was done outside of a mutex in
2248       readdb_get_link, hence repeat it here */
2249 
2250    if (rdfp->shared_info == NULL) {
2251       if(!((Boolean)(rdfp->parameters & READDB_NO_SEQ_FILE))) {
2252      sprintf(buffer, "%s.%csq", rdfp->full_filename, is_prot? 'p':'n');
2253      if ((rdfp->sequencefp = NlmOpenMFILE(buffer)) == NULL) {
2254         ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2255         rdfp = readdb_destruct(rdfp);
2256         return FALSE;
2257      }
2258       }
2259       sprintf(buffer, "%s.%chr", rdfp->full_filename, is_prot? 'p':'n');
2260       if((rdfp->headerfp = NlmOpenMFILE(buffer)) == NULL) {
2261      ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2262      rdfp = readdb_destruct(rdfp);
2263      return FALSE;
2264       }
2265       return TRUE;
2266    }
2267 
2268    /* the reference count can be incremented either here or
2269       in readdb_attach, and may be nonzero even if the database
2270       has not been memory-mapped. Hence, never initialize the
2271       reference count, only increment it */
2272 
2273    rdfp->shared_info->nthreads++;
2274 
2275    if (!((Boolean)(rdfp->parameters & READDB_NO_SEQ_FILE)) &&
2276        rdfp->shared_info->sequencefp == NULL) {
2277       sprintf(buffer, "%s.%csq", rdfp->full_filename, is_prot? 'p':'n');
2278       if((rdfp->shared_info->sequencefp = NlmOpenMFILE(buffer)) == NULL) {
2279      ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2280      rdfp = readdb_destruct(rdfp);
2281      return FALSE;
2282       }
2283    }
2284    rdfp->sequencefp = NlmCloseMFILE(rdfp->sequencefp);
2285 
2286    rdfp->sequencefp =
2287       (NlmMFILEPtr) MemDup(rdfp->shared_info->sequencefp, sizeof(NlmMFILE));
2288    if (!rdfp->shared_info->sequencefp->mfile_true) {
2289       rdfp->shared_info->sequencefp = MemFree(rdfp->shared_info->sequencefp);
2290       rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
2291    } else {
2292       rdfp->sequencefp->contents_allocated = FALSE;
2293    }
2294 
2295    if (rdfp->shared_info->headerfp == NULL) {
2296       sprintf(buffer, "%s.%chr", rdfp->full_filename, is_prot? 'p':'n');
2297       if((rdfp->shared_info->headerfp = NlmOpenMFILE(buffer)) == NULL) {
2298      ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2299      rdfp = readdb_destruct(rdfp);
2300      return FALSE;
2301       }
2302    }
2303    rdfp->headerfp = NlmCloseMFILE(rdfp->headerfp);
2304    rdfp->headerfp = (NlmMFILEPtr) MemDup(rdfp->shared_info->headerfp,
2305                      sizeof(NlmMFILE));
2306    if (rdfp->shared_info->headerfp->mfile_true == FALSE) {
2307        rdfp->shared_info->headerfp = MemFree(rdfp->shared_info->headerfp);
2308        rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
2309    } else {
2310        rdfp->headerfp->contents_allocated = FALSE;
2311    }
2312 
2313    return TRUE;
2314 }
2315 
/*
  Release the header and sequence file handles of every element in the
  chain that starts at 'rdfp'.

  For each element: the handles in shared_info (when present) are closed
  with NlmCloseMFILE and its nthreads counter is reset to 0; the element's
  own handles are freed with MemFree when they are flagged as memory mapped
  (mfile_true), and closed with NlmCloseMFILE otherwise.

  Returns the head of the chain, i.e. the pointer that was passed in.
*/
ReadDBFILEPtr ReadDBCloseMHdrAndSeqFiles(ReadDBFILEPtr rdfp)
{
   ReadDBFILEPtr node;

   for (node = rdfp; node != NULL; node = node->next) {
      ReadDBSharedInfoPtr shared = node->shared_info;

      if (shared != NULL) {
         shared->sequencefp = NlmCloseMFILE(shared->sequencefp);
         shared->headerfp = NlmCloseMFILE(shared->headerfp);
         shared->nthreads = 0;
      }

      /* Element's own sequence handle. */
      if (node->sequencefp != NULL && node->sequencefp->mfile_true)
         node->sequencefp = MemFree(node->sequencefp);
      else
         node->sequencefp = NlmCloseMFILE(node->sequencefp);

      /* Element's own header handle. */
      if (node->headerfp != NULL && node->headerfp->mfile_true)
         node->headerfp = MemFree(node->headerfp);
      else
         node->headerfp = NlmCloseMFILE(node->headerfp);
   }

   return rdfp;
}
2342 
/*
  Walk the chain starting at 'rdfp' and free the shared_info structure of
  every element that has READDB_CONTENTS_ALLOCATED set in its parameters;
  the freed pointer is replaced by the value MemFree returns.  Elements
  without that flag keep their shared_info pointer untouched.

  Returns the head of the chain, i.e. the pointer that was passed in.
*/
static ReadDBFILEPtr ReadDBFreeSharedInfo(ReadDBFILEPtr rdfp)
{
   ReadDBFILEPtr node;

   for (node = rdfp; node != NULL; node = node->next) {
      if (node->shared_info == NULL)
         continue;
      if (node->parameters & READDB_CONTENTS_ALLOCATED)
         node->shared_info =
            (ReadDBSharedInfoPtr) MemFree(node->shared_info);
   }

   return rdfp;
}
2355 
2356 /****************************************************************************
2357 *
2358 *    Undo the memory-mapping.
2359 *
2360 *****************************************************************************/
2361 NlmMFILEPtr LIBCALL
NlmCloseMFILE(NlmMFILEPtr mfp)2362 NlmCloseMFILE (NlmMFILEPtr mfp)
2363 
2364 {
2365     if (mfp == NULL)
2366         return NULL;
2367 
2368     /* Have the contents been allocated, or is this just an attachemnt? */
2369     if (mfp->contents_allocated)
2370     {
2371 
2372         if (mfp->mfile_true == TRUE)
2373             {
2374             Nlm_MemMapFini(mfp->mem_mapp);
2375         }
2376 
2377         FILECLOSE(mfp->fp);
2378     }
2379 
2380     mfp = (NlmMFILEPtr) MemFree(mfp);
2381     return mfp;
2382 
2383 }    /* NlmCloseMFILE */
2384 
2385 /***********************************************************************
2386 *
2387 *    Analogous to ANSI-C fread.
2388 *
2389 ************************************************************************/
2390 Int4 LIBCALL
NlmReadMFILE(Uint1Ptr buffer,size_t size,Int4 nitems,NlmMFILEPtr mfp)2391 NlmReadMFILE (Uint1Ptr buffer, size_t size, Int4 nitems, NlmMFILEPtr mfp)
2392 
2393 {
2394     register size_t    diff, len;
2395 
2396     if (mfp == NULL)
2397         return 0;
2398 
2399     if (mfp->mfile_true == TRUE)
2400     {
2401         len = size * nitems;
2402         diff = mfp->mmp_end - mfp->mmp;
2403         if (len > diff)
2404         {
2405             nitems = diff / size;
2406             len = nitems * size;
2407         }
2408         MemCpy((VoidPtr) buffer, (VoidPtr) mfp->mmp, len);
2409         mfp->mmp += len;
2410         return nitems;
2411     }
2412 
2413     return FileRead(buffer, size, nitems, mfp->fp);
2414 
2415 }    /* NlmReadMFILE */
2416 
2417 /*
2418     Seeks to a point in the file, analogous to fseek.
2419 */
2420 Int4 LIBCALL
NlmSeekInMFILE(NlmMFILEPtr mfp,long offset,Int4 ptrname)2421 NlmSeekInMFILE (NlmMFILEPtr mfp, long offset, Int4 ptrname)
2422 
2423 {
2424     Uint1Ptr cp;
2425 
2426     if (mfp->mfile_true == TRUE)
2427     {
2428         switch (ptrname) {
2429             case SEEK_SET: /* relative to beginning */
2430                 cp = mfp->mmp_begin + offset;
2431                 if (offset < 0 || cp >= mfp->mmp_end)
2432                     return -1;
2433                 mfp->mmp = cp;
2434                 break;
2435             case SEEK_CUR: /* relative to current position */
2436                 cp = mfp->mmp + offset;
2437                 if (cp >= mfp->mmp_end || cp < mfp->mmp_begin)
2438                     return -1;
2439                 mfp->mmp = cp;
2440                 break;
2441             case SEEK_END: /* relative to end of file */
2442                 if (offset > 0 || mfp->mem_mapp->file_size < -offset)
2443                     return -1;
2444                 mfp->mmp = mfp->mmp_begin + (mfp->mem_mapp->file_size + offset);
2445                 break;
2446             default:
2447                 return -1;
2448         }
2449         return 0;
2450     }
2451 
2452     return (Int4) fseek(mfp->fp, offset, ptrname);
2453 
2454 }    /* NlmSeekInMFILE */
2455 
2456 /*
2457     What is the offset (in bytes) to the beginning of the file.
2458     Analog to ftell.
2459 */
2460 Int4 LIBCALL
NlmTellMFILE(NlmMFILEPtr mfp)2461 NlmTellMFILE (NlmMFILEPtr mfp)
2462 
2463 {
2464     if (mfp->mfile_true == TRUE)
2465     {
2466         return (mfp->mmp - mfp->mmp_begin);
2467     }
2468     else
2469     {
2470         return (Int4) ftell(mfp->fp);
2471     }
2472 
2473 }    /* NlmTellMFILE */
2474 
ReadDBFILENew(void)2475 static ReadDBFILEPtr ReadDBFILENew(void)
2476 {
2477   ReadDBFILEPtr new_t;
2478 
2479   new_t = (ReadDBFILEPtr) MemNew(sizeof(ReadDBFILE));
2480   return new_t;
2481 }
2482 
2483 /*
2484     Parses the databases names (if more than one) from
2485     'filenames' into buffer.  buffer should already be
2486     long enough and allocated.  The funciton should be
2487     repeatedly called until TRUE is returned.
2488 */
2489 Boolean LIBCALL
readdb_parse_db_names(CharPtr PNTR filenames,CharPtr buffer)2490 readdb_parse_db_names (CharPtr PNTR filenames, CharPtr buffer)
2491 
2492 {
2493     Boolean done = FALSE;
2494     Boolean quote_mode = FALSE;
2495 
2496     while (**filenames == ' ')
2497     {
2498         (*filenames)++;
2499     }
2500 
2501     while (**filenames != NULLB)
2502     {
2503         if (**filenames == '"')
2504             if (quote_mode == FALSE)
2505                 quote_mode = TRUE;
2506             else
2507                 quote_mode = FALSE;
2508 
2509         if (!quote_mode && **filenames == ' ')
2510         {
2511             *buffer = NULLB;
2512             break;
2513         }
2514 
2515         if (**filenames != '"')
2516         {
2517             *buffer = **filenames;
2518             buffer++;
2519         }
2520         (*filenames)++;
2521     }
2522 
2523     while (**filenames == ' ')
2524     {
2525         (*filenames)++;
2526     }
2527 
2528     if (**filenames == NULLB)
2529     {
2530         *buffer = NULLB;
2531         done = TRUE;
2532     }
2533 
2534     return done;
2535 }
2536 
2537 /********** Auxiliary gi list structure *************/
2538 #define GI_ALLOC_CHUNK 4096
2539 
2540 Int4ListPtr LIBCALL
Int4ListNew(void)2541 Int4ListNew PROTO((void))
2542 {
2543     return Int4ListNewEx(GI_ALLOC_CHUNK);
2544 }
2545 
2546 Int4ListPtr LIBCALL
Int4ListNewEx(Int4 init_size)2547 Int4ListNewEx PROTO((Int4 init_size))
2548 {
2549     Int4ListPtr lp = NULL;
2550 
2551     if ((lp = MemNew(sizeof(Int4List))) == NULL)
2552         return NULL;
2553     lp->allocated = init_size;
2554     if ((lp->i = MemNew(sizeof(Int4) * lp->allocated)) == NULL) {
2555         return MemFree(lp);
2556     }
2557     lp->count = 0;
2558 
2559     return lp;
2560 }
2561 
2562 Int4ListPtr LIBCALL
Int4ListFree(Int4ListPtr lp)2563 Int4ListFree PROTO((Int4ListPtr lp))
2564 {
2565     if(lp == NULL)
2566         return NULL;
2567 
2568     MemFree(lp->i);
2569     return MemFree(lp);
2570 }
2571 
2572 Boolean LIBCALL
Int4ListAdd(Int4ListPtr lp,Int4 i)2573 Int4ListAdd PROTO((Int4ListPtr lp, Int4 i))
2574 {
2575     if (!lp)
2576         return FALSE;
2577 
2578     if (lp->count >= lp->allocated) {
2579         lp->allocated *= 2;
2580         if ( !(lp->i = Realloc(lp->i, sizeof(Int4) * lp->allocated)))
2581             return FALSE;
2582     }
2583 
2584     lp->i[lp->count++] = i;
2585 
2586     return TRUE;
2587 }
2588 
2589 Int4ListPtr LIBCALL
Int4ListResize(Int4ListPtr listp,Int4 new_size)2590 Int4ListResize(Int4ListPtr listp, Int4 new_size)
2591 {
2592     if (!listp || new_size < 0)
2593         return NULL;
2594 
2595     if (new_size == listp->allocated)
2596         return listp;
2597 
2598     if ( !(listp->i = (Int4Ptr) Realloc(listp->i, sizeof(Int4)*new_size)))
2599         return NULL;
2600     listp->allocated = new_size;
2601 
2602     return listp;
2603 }
2604 
2605 /*
2606 	This function reads in a list of gi's from a file.
2607 The file may be either in binary or text format.
2608 
2609 The binary gilist format has the following construction:
2610 
2611 1.) 1st 4 bytes: a 'magic' number: UINT4_MAX
2612 2.) 2nd 4 bytes: total number of gi's in the file (call this value 'number').
2613 3.) 'number' set of 4 bytes, allowing 4 bytes for each gi.
2614 
2615 The function GetGisFromFile first checks what the first 4 bytes
2616 of a file are, if they are the 'magic' number, then it proceeds
2617 to read values assuming a binary format.  If they are not the
2618 'magic' number, then a text format is assumed.
2619 
2620 The binary gilist can be produced from a text gilist using the
2621 function readdb_MakeGiFileBinary.
2622 
2623 */
2624 
2625 #define	LINE_LEN	1024
2626 
2627 static Int4ListPtr
Int4ListReadFromFileEx(CharPtr lookup_dir,CharPtr fname)2628 Int4ListReadFromFileEx (CharPtr lookup_dir,CharPtr fname)
2629 {
2630     Int4ListPtr listp = NULL;
2631     FILE		*fp = NULL;
2632     Int4		index = 0, value, number;
2633     Int2		status;
2634     Char		line[LINE_LEN];
2635     long		tmplong;
2636     NlmMFILEPtr mfp;
2637     Uint4		tmp_value;
2638     Char	    file_name[PATH_MAX], blast_dir[PATH_MAX];
2639 
2640     /**
2641      * first looking in current directory, then checking .ncbirc,
2642      * then $BLASTDB
2643      */
2644     if (FileLength(fname) > 0) {
2645        char *path = Nlm_FilePathFind(fname);
2646        if (StringLen(path) > 0) {
2647           StringCpy(blast_dir, path);
2648        } else {
2649           StringCpy(blast_dir, ".");
2650        }
2651        MemFree(path);
2652     } else {
2653 	if( !lookup_dir) return NULL;
2654 	StringCpy(blast_dir,lookup_dir);
2655     }
2656     sprintf(file_name, "%s%s%s", blast_dir, DIRDELIMSTR, FileNameFind(fname));
2657 
2658     mfp = NlmOpenMFILE(file_name);
2659     if (mfp == NULL) {
2660         return NULL;
2661     }
2662 
2663     NlmReadMFILE((Uint1Ptr)&tmp_value, sizeof(Uint4), 1, mfp);
2664     if (SwapUint4(tmp_value) == READDB_MAGIC_NUMBER) {
2665 
2666         /*** Binary gi list ***/
2667 
2668         /* Use a 32 kb buffer to read the file */
2669         const Int4 BUFFER_SIZE = (0x1<<15);
2670 
2671         /* Number of gis per BUFFER_SIZE byte chunk */
2672         const Int4 NGIS = BUFFER_SIZE/sizeof(Int4);
2673 
2674         /* Buffer to read the gi list in BUFFER_SIZE byte chunks */
2675         Int4Ptr buffer = (Int4Ptr) Malloc(BUFFER_SIZE);
2676 
2677         if ( !buffer ) {
2678             ErrPostEx(SEV_ERROR, 0, 0, "Not enough memory to read %s\n",
2679                       file_name);
2680             return NULL;
2681         }
2682 
2683         /* Read the number of gis in this file */
2684         NlmReadMFILE((Uint1Ptr)&tmp_value, sizeof(Uint4), 1, mfp);
2685         number = SwapUint4(tmp_value);
2686         listp = Int4ListNewEx(number);
2687 
2688         for (index = 0; index < number; ) {
2689             Int4 bytes_read = NlmReadMFILE((Uint1Ptr)buffer, sizeof(Uint1),
2690                                            BUFFER_SIZE, mfp);
2691             Uint4 idx = 0;
2692 
2693             for (idx = 0; idx < bytes_read/sizeof(Int4) && idx < NGIS; idx++) {
2694                 Int4ListAdd(listp, SwapUint4(buffer[idx]));
2695                 index++;
2696             }
2697         }
2698 
2699         buffer = MemFree(buffer);
2700         mfp = NlmCloseMFILE(mfp);
2701 
2702     } else {
2703 
2704         /*** Text gi list ***/
2705         mfp = NlmCloseMFILE(mfp);
2706         if (!(fp = FileOpen(file_name, "r"))) {
2707             return NULL;
2708         }
2709 
2710         listp = Int4ListNew();
2711 
2712         while (FileGets(line, LINE_LEN, fp)) {
2713 
2714             /* do correct casting */
2715             status = sscanf(line, "%ld", &tmplong);
2716             value = tmplong;
2717 
2718             /* skip non-valid lines */
2719             if (status > 0)
2720                 Int4ListAdd(listp, value);
2721         }
2722 
2723         FileClose(fp);
2724     }
2725 
2726     return listp;
2727 }
2728 
2729 /* read configuration and check file in all places */
s_GetBlastDirInfo(char * blast_dir,char * path_delim)2730 static void s_GetBlastDirInfo(char *blast_dir, char *path_delim)
2731 {
2732     CharPtr envp = NULL;
2733     /* read configuration... */
2734     memset(blast_dir, 0, sizeof(blast_dir));
2735     if ((envp = getenv("BLASTDB")) == NULL) {
2736       /* This checks current directory, user home directory, then path pointed to by $NCBI. */
2737       Nlm_GetAppParam ("NCBI", "BLAST", "BLASTDB", NULL, blast_dir, PATH_MAX);
2738     }
2739     else {
2740       StringCpy(blast_dir, envp);
2741     }
2742 
2743     StringCpy(path_delim,":");
2744 
2745     return;
2746 }
2747 
2748 Int4ListPtr LIBCALL
Int4ListReadFromFile(CharPtr fname)2749 Int4ListReadFromFile PROTO((CharPtr fname))
2750 {
2751     Int4ListPtr listp = NULL;
2752     Char *wrk_buf;
2753     char blast_dir[PATH_MAX];
2754     Char *one_blast_dir ;
2755     char path_delim[2];
2756 
2757     s_GetBlastDirInfo(blast_dir, path_delim);
2758 
2759     /* parse paths and check files*/
2760     for( one_blast_dir = Nlm_StringTokMT(blast_dir,(char*)path_delim, (char **)&wrk_buf);
2761 	 one_blast_dir != NULL;
2762 	 one_blast_dir = Nlm_StringTokMT (NULL,(char*)path_delim, (char **)&wrk_buf) )
2763     {
2764 	listp = Int4ListReadFromFileEx(one_blast_dir,fname);
2765 	if( listp ) return listp;
2766     }
2767     ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", fname);
2768     return NULL;
2769 }
2770 Int4ListPtr LIBCALL
Int4ListMakeUnique(Int4ListPtr list)2771 Int4ListMakeUnique PROTO((Int4ListPtr list))
2772 {
2773     Int4 idx, i;
2774 
2775     if (!list || list->count <= 0)
2776         return list;
2777 
2778     HeapSort(list->i, list->count, sizeof(Int4), ID_Compare);
2779 
2780     for (i = 0, idx = 0; i < list->count - 1; i++) {
2781         if (list->i[i] == list->i[i+1])
2782             continue;
2783         list->i[idx++] = list->i[i];
2784     }
2785     /* check the last element */
2786     if (list->i[i] != list->i[idx-1])
2787         list->i[idx++] = list->i[i];
2788 
2789     list->count = idx;
2790 
2791     return list;
2792 }
2793 
2794 Int4ListPtr LIBCALL
Int4ListConcat(Int4ListPtr * list1,Int4ListPtr * list2)2795 Int4ListConcat PROTO((Int4ListPtr *list1, Int4ListPtr *list2))
2796 {
2797     Int4ListPtr retval = NULL;
2798     Int4 size;
2799 
2800     if ((*list1) && !(*list2)) {
2801         retval = (*list1);
2802         (*list1) = NULL;
2803         return retval;
2804     }
2805 
2806     if ((*list2) && !(*list1)) {
2807         retval = (*list2);
2808         (*list2) = NULL;
2809         return retval;
2810     }
2811 
2812     if ( (size = (*list1)->count + (*list2)->count) <= 0) {
2813         (*list1) = Int4ListFree((*list1));
2814         (*list2) = Int4ListFree((*list2));
2815         return NULL;
2816     }
2817 
2818     if ( !(retval = Int4ListNewEx(size)))
2819         return NULL;
2820 
2821     MemCpy(retval->i, (*list1)->i, sizeof(Int4)* (*list1)->count);
2822     retval->count = (*list1)->count;
2823     MemCpy(retval->i+retval->count, (*list2)->i, sizeof(Int4)* (*list2)->count);
2824     retval->count += (*list2)->count;
2825 
2826     (*list1) = Int4ListFree((*list1));
2827     (*list2) = Int4ListFree((*list2));
2828 
2829     return retval;
2830 }
2831 
2832 Int4ListPtr LIBCALL
Int4ListIntersect(Int4ListPtr * list1,Int4ListPtr * list2)2833 Int4ListIntersect PROTO((Int4ListPtr *list1, Int4ListPtr *list2))
2834 {
2835     Int4 i, j, size, value;
2836     Int4ListPtr retval = NULL;
2837 
2838     if ((*list1) && !(*list2))
2839         return (*list1);
2840 
2841     if ((*list2) && !(*list1))
2842         return (*list2);
2843 
2844     (*list1) = Int4ListMakeUnique((*list1));
2845     (*list2) = Int4ListMakeUnique((*list2));
2846     size = MIN(((*list1))->count, ((*list2))->count);
2847 
2848     if (size == 0) {
2849         (*list1) = Int4ListFree((*list1));
2850         (*list2) = Int4ListFree((*list2));
2851         return NULL;
2852     }
2853 
2854     if ( !(retval = Int4ListNewEx(size)))
2855         return NULL;
2856 
2857     for (i = 0, j = 0; i < (*list1)->count; i++) {
2858         value = (*list1)->i[i];
2859 
2860         for (; j < (*list2)->count && (*list2)->i[j] < value; j++);
2861 
2862         if (j < (*list2)->count && (*list2)->i[j] == value)
2863             retval->i[retval->count++] = (*list1)->i[i];
2864     }
2865 
2866     if (retval->count == 0)
2867         retval = Int4ListFree(retval);
2868 
2869     (*list1) = Int4ListFree((*list1));
2870     (*list2) = Int4ListFree((*list2));
2871 
2872     return retval;
2873 }
2874 
2875 typedef struct _readdb_alias_file {
2876     CharPtr title,     /* title of the database. */
2877         dblist,        /* list of databases. */
2878         gilist,        /* a gilist to be used with the database. */
2879         oidlist;       /* an ordinal id list to be used with this database. */
2880     Int8    len;       /* length of the database */
2881     Uint4   nseq;      /* number of seqs of the database */
2882     Int8    len_stats;   /* length of the database for statistical purposes */
2883     Uint4   nseq_stats;  /* number of seqs of the database for statistical purposes */
2884     Int4    first_oid; /* first ordinal id in a range */
2885     Int4    last_oid;  /* last ordinal id in a range */
2886     Int4    membership;/* membership bit */
2887     Int4    maxlen;    /* maximal length of seqs in the database */
2888 } ReadDBAlias, PNTR ReadDBAliasPtr;
2889 /*
2890     This function frees the 'alias' file for the BLAST databases.
2891 */
2892 
ReadDBAliasFree(ReadDBAliasPtr rdbap)2893 static ReadDBAliasPtr ReadDBAliasFree(ReadDBAliasPtr rdbap)
2894 {
2895 
2896     if (rdbap == NULL)
2897     return NULL;
2898 
2899     MemFree(rdbap->title);
2900     MemFree(rdbap->dblist);
2901     MemFree(rdbap->gilist);
2902     MemFree(rdbap->oidlist);
2903     MemFree(rdbap);
2904     return NULL;
2905 }
2906 
2907 /*
2908     This function reads the 'alias' file for the BLAST databases.
2909 */
2910 static ReadDBAliasPtr
readdb_read_alias_file(CharPtr filename)2911 readdb_read_alias_file(CharPtr filename)
2912 
2913 {
2914     CharPtr buffer;
2915     CharPtr file_path, ptr;
2916     Char file_buffer[PATH_MAX], full_buffer[PATH_MAX];
2917     ReadDBAliasPtr rdbap;
2918     FILE *fp;
2919     Int4 buflen, buffer_length, total_length=PATH_MAX, length;
2920     long tmplong;
2921 
2922     if (filename == NULL || (buflen = FileLength(filename)) <= 0)
2923         return NULL;
2924 
2925     fp = FileOpen(filename, "r");
2926     if (fp == NULL)
2927         return NULL;
2928 
2929     if (!s_IsTextFile(filename)) {
2930         ErrPostEx(SEV_ERROR, 0, 1, "%s is not a valid alias file\n", filename);
2931         return NULL;
2932     }
2933 
2934     file_path = Nlm_FilePathFind(filename);
2935 
2936     buffer = MemNew(buflen + 1);
2937 
2938     rdbap = (ReadDBAliasPtr) MemNew(sizeof(ReadDBAlias));
2939 
2940     while (Nlm_FileGets(buffer, buflen + 1, fp) != NULL) {
2941 
2942         Char* newline_ptr = NULL;   /* pointer to newline character */
2943 
2944         if (buffer[0] == '#')  /* ignore comments. */
2945             continue;
2946 
2947         if (StringNCmp(buffer, "TITLE", 5) == 0) {
2948             ptr = buffer;
2949             ptr += 5;
2950             while (isspace((int)*ptr)) /* skip whitespace */
2951                 ptr++;
2952 
2953             newline_ptr = Nlm_StrChr(ptr, '\n');
2954             if (newline_ptr != NULL) {
2955                 *newline_ptr = NULLB;
2956             } else {
2957                 *ptr = NULLB;
2958             }
2959 
2960             if (*ptr != NULLB)
2961                 rdbap->title = StringSave(ptr);
2962             /* empty title is okay? */
2963 
2964             continue;
2965         }
2966 
2967         if (StringNCmp(buffer, "DBLIST", 6) == 0) {
2968             ptr = buffer;
2969             ptr += 6;
2970             while (isspace((int)*ptr)) /* skip whitespace */
2971                 ptr++;
2972 
2973             newline_ptr = Nlm_StrChr(ptr, '\n');
2974             if (newline_ptr != NULL) {
2975                 *newline_ptr = NULLB;
2976             } else {
2977                 *ptr = NULLB;
2978             }
2979 
2980             if (*ptr != NULLB)
2981             {
2982                 Boolean done = FALSE, first = TRUE;
2983                 if (file_path && *file_path != NULLB)
2984                 { /* Prepend file_path if it exists. */
2985                     rdbap->dblist = MemNew(total_length*sizeof(Char));
2986                     length=0;
2987                     while (!done)
2988                     {
2989                         done = readdb_parse_db_names(&ptr, file_buffer);
2990 
2991                         if(*file_buffer == DIRDELIMCHR)
2992                             StringCpy(full_buffer, file_buffer);
2993                         else
2994                             sprintf(full_buffer, "%s%c%s", file_path,
2995                                     DIRDELIMCHR, file_buffer);
2996 
2997                         buffer_length = StringLen(full_buffer);
2998                         /* + 1 for the extra space in between multiple paths */
2999                         if (buffer_length+length+3 >= total_length)
3000                         {
3001                             rdbap->dblist = Realloc(rdbap->dblist,
3002                                             2*total_length);
3003                             total_length *= 2;
3004                         }
3005                         if (!first)
3006                         {
3007                             StringCpy(rdbap->dblist+length, " ");
3008                             length++;
3009                         }
3010                         else
3011                             first = FALSE;
3012                         StringCpy(rdbap->dblist+length, "\"");
3013                         length++;
3014                         StringCpy(rdbap->dblist+length, full_buffer);
3015                         length += buffer_length;
3016                         StringCpy(rdbap->dblist+length, "\"");
3017                         length++;
3018                     }
3019 
3020                 }
3021                 else
3022                     rdbap->dblist = StringSave(ptr);
3023             }
3024             if (rdbap->dblist == NULL) {
3025                 ErrPostEx(SEV_ERROR, 0, 0, "DBLIST field in %s is empty",
3026                           filename);
3027                 return NULL;
3028             }
3029 
3030             continue;
3031         }
3032 
3033         if (StringNCmp(buffer, "GILIST", 6) == 0) {
3034             ptr = buffer;
3035             ptr += 6;
3036             while (isspace((int)*ptr)) /* skip whitespace */
3037                 ptr++;
3038 
3039             newline_ptr = Nlm_StrChr(ptr, '\n');
3040             if (newline_ptr != NULL) {
3041                 *newline_ptr = NULLB;
3042             } else {
3043                 *ptr = NULLB;
3044             }
3045 
3046             if (*ptr != NULLB) {
3047                 if (file_path && StrCmp(file_path,"")) {
3048                     /* add directory prefix, if any */
3049                     sprintf(full_buffer, "%s%c%s", file_path, DIRDELIMCHR, ptr);
3050                     rdbap->gilist = StringSave(full_buffer);
3051                 } else {
3052                     rdbap->gilist = StringSave(ptr);
3053                 }
3054             }
3055             if (rdbap->gilist == NULL) {
3056                 ErrPostEx(SEV_WARNING, 0, 0, "GILIST field in %s is empty",
3057                           filename);
3058             }
3059 
3060             continue;
3061         }
3062 
3063         if (StringNCmp(buffer, "OIDLIST", 7) == 0) {
3064             ptr = buffer;
3065             ptr += 7;
3066             while (isspace((int)*ptr)) /* skip whitespace */
3067                 ptr++;
3068 
3069             newline_ptr = Nlm_StrChr(ptr, '\n');
3070             if (newline_ptr != NULL) {
3071                 *newline_ptr = NULLB;
3072             } else {
3073                 *ptr = NULLB;
3074             }
3075 
3076             if (*ptr != NULLB) {
3077                 Boolean done=FALSE, first=TRUE;
3078                 if (file_path && StrCmp(file_path, "")) {
3079                     /* add directory prefix, if any */
3080                     rdbap->oidlist = MemNew(total_length*sizeof(Char));
3081                     length = 0;
3082                     while (!done) {
3083                         done = readdb_parse_db_names(&ptr, file_buffer);
3084                         sprintf(full_buffer, "%s%c%s", file_path,
3085                                 DIRDELIMCHR, file_buffer);
3086 
3087                         if (*file_buffer == DIRDELIMCHR)
3088                             StringCpy(full_buffer, file_buffer);
3089                         else
3090                             sprintf(full_buffer, "%s%c%s", file_path,
3091                                     DIRDELIMCHR, file_buffer);
3092 
3093                         buffer_length = StringLen(full_buffer);
3094                         if (buffer_length+length > total_length) {
3095                             rdbap->oidlist = Realloc(rdbap->oidlist,
3096                                     2*total_length);
3097                             total_length *= 2;
3098                         }
3099                         if (!first) {
3100                             StringCpy(rdbap->oidlist+length, " ");
3101                             length++;
3102                         } else {
3103                             first = FALSE;
3104                         }
3105                         StringCpy(rdbap->oidlist+length, full_buffer);
3106                         length += buffer_length;
3107                     }
3108 
3109                 } else {
3110                     rdbap->oidlist = StringSave(ptr);
3111                 }
3112             }
3113             if (rdbap->oidlist == NULL) {
3114                 ErrPostEx(SEV_WARNING, 0, 0, "OIDLIST field in %s is empty",
3115                           filename);
3116             }
3117 
3118             continue;
3119         }
3120 
3121         if (StringNCmp(buffer, "FIRST_OID", 9) == 0) {
3122            ptr = buffer + 9;
3123            while (isspace((int)*ptr)) /* skip whitespace */
3124               ptr++;
3125            newline_ptr = Nlm_StrChr(ptr, '\n');
3126            if (newline_ptr != NULL) {
3127                *newline_ptr = NULLB;
3128            } else {
3129                *ptr = NULLB;
3130            }
3131            if (*ptr != NULLB) {
3132                sscanf(ptr, "%ld", &tmplong);
3133                rdbap->first_oid = tmplong;
3134            }
3135            continue;
3136         }
3137         if (StringNCmp(buffer, "LAST_OID", 8) == 0) {
3138            ptr = buffer + 8;
3139            while (isspace((int)*ptr)) /* skip whitespace */
3140               ptr++;
3141            newline_ptr = Nlm_StrChr(ptr, '\n');
3142            if (newline_ptr != NULL) {
3143                *newline_ptr = NULLB;
3144            } else {
3145                *ptr = NULLB;
3146            }
3147            if (*ptr != NULLB) {
3148                sscanf(ptr, "%ld", &tmplong);
3149                rdbap->last_oid = tmplong;
3150            }
3151            continue;
3152         }
3153 
3154         if (StringNCmp(buffer, "LENGTH", 6) == 0) {
3155             ptr = buffer;
3156             ptr += 6;
3157             while (isspace((int)*ptr)) /* skip whitespace */
3158                 ptr++;
3159 
3160             newline_ptr = Nlm_StrChr(ptr, '\n');
3161             if (newline_ptr != NULL) {
3162                 *newline_ptr = NULLB;
3163             } else {
3164                 *ptr = NULLB;
3165             }
3166             if (*ptr != NULLB)
3167                 sscanf(ptr, "%lld", &(rdbap->len));
3168 
3169             continue;
3170         }
3171         if (StringNCmp(buffer, "MEMB_BIT", 8) == 0) {
3172             ptr = buffer;
3173             ptr += 8;
3174             while (isspace((int)*ptr)) /* skip whitespace */
3175                 ptr++;
3176             newline_ptr = Nlm_StrChr(ptr, '\n');
3177             if (newline_ptr != NULL) {
3178                 *newline_ptr = NULLB;
3179             } else {
3180                 *ptr = NULLB;
3181             }
3182             if (*ptr != NULLB) {
3183                 tmplong = 0;
3184                 sscanf(ptr, "%ld", &tmplong);
3185                 rdbap->membership = tmplong;
3186             }
3187             continue;
3188         }
3189         if (StringNCmp(buffer, "NSEQ", 4) == 0) {
3190             ptr = buffer;
3191             ptr += 4;
3192             while (isspace((int)*ptr)) /* skip whitespace */
3193                 ptr++;
3194 
3195             newline_ptr = Nlm_StrChr(ptr, '\n');
3196             if (newline_ptr != NULL) {
3197                 *newline_ptr = NULLB;
3198             } else {
3199                 *ptr = NULLB;
3200             }
3201             if (*ptr != NULLB)
3202                 rdbap->nseq = atol(ptr);
3203 
3204             continue;
3205         }
3206         if (StringNCmp(buffer, "STATS_NSEQ", 10) == 0) {
3207             ptr = buffer;
3208             ptr += 10;
3209             while (isspace((int)*ptr)) /* skip whitespace */
3210                 ptr++;
3211 
3212             newline_ptr = Nlm_StrChr(ptr, '\n');
3213             if (newline_ptr != NULL) {
3214                 *newline_ptr = NULLB;
3215             } else {
3216                 *ptr = NULLB;
3217             }
3218             if (*ptr != NULLB)
3219                 rdbap->nseq_stats = atol(ptr);
3220 
3221             continue;
3222         }
3223         if (StringNCmp(buffer, "STATS_TOTLEN", 12) == 0) {
3224             ptr = buffer;
3225             ptr += 12;
3226             while (isspace((int)*ptr)) /* skip whitespace */
3227                 ptr++;
3228 
3229             newline_ptr = Nlm_StrChr(ptr, '\n');
3230             if (newline_ptr != NULL) {
3231                 *newline_ptr = NULLB;
3232             } else {
3233                 *ptr = NULLB;
3234             }
3235             if (*ptr != NULLB)
3236                 rdbap->len_stats = atol(ptr);
3237 
3238             continue;
3239         }
3240         if (StringNCmp(buffer, "MAXLEN", 6) == 0) {
3241            ptr = buffer;
3242            ptr += 6;
3243            while (isspace((int)*ptr)) /* skip whitespace */
3244                 ptr++;
3245 
3246            newline_ptr = Nlm_StrChr(ptr, '\n');
3247            if (newline_ptr != NULL) {
3248                *newline_ptr = NULLB;
3249            } else {
3250                *ptr = NULLB;
3251            }
3252            if (*ptr != NULLB)
3253                rdbap->maxlen = atol(ptr);
3254 
3255            continue;
3256         }
3257     }
3258 
3259     MemFree(file_path);
3260     MemFree(buffer);
3261     FILECLOSE(fp);
3262 
3263     if (rdbap->dblist == NULL) {
3264         ErrPostEx(SEV_ERROR, 0, 0, "Alias file (%s) is missing DBLIST field\n",
3265                   filename);
3266         return ReadDBAliasFree(rdbap);
3267     }
3268 
3269     return rdbap;
3270 }
3271 
3272 /*
3273     Check if an alias file contains a database by the same name.  This
3274     situation will lead to an infinite recursion and we do not allow it.
3275     TRUE is returned if recursive situation found, otherwise FALSE.
3276 */
3277 
CheckForRecursion(CharPtr alias_filename,CharPtr db_list)3278 static Boolean CheckForRecursion(CharPtr alias_filename, CharPtr db_list)
3279 
3280 {
3281     Boolean done=FALSE;
3282     Char buffer[PATH_MAX];
3283 
3284         while (!done) {
3285             done = readdb_parse_db_names(&db_list, buffer);
3286             if (*buffer == NULLB)
3287             break;
3288 
3289         if (StringCmp(buffer, alias_filename) == 0)
3290         {
3291             ErrPostEx(SEV_WARNING, 0, 0,
3292                                         "Recursive situation detected with %s, ignoring alias file", buffer);
3293             return TRUE;
3294         }
3295     }
3296 
3297     return FALSE;
3298 
3299 }
3300 /* Check if .?in file exists for specified database
3301    and assign proper is_prot to rdfp->is_prot */
3302 
IndexFileExists(CharPtr full_filename,ReadDBFILEPtr PNTR rdfpp,Boolean PNTR is_prot,Uint1 init_state)3303 static    Int2    IndexFileExists(CharPtr full_filename, ReadDBFILEPtr PNTR rdfpp, Boolean PNTR is_prot, Uint1 init_state)
3304 {
3305     Char    buffer[PATH_MAX];
3306     Int4    length = 0, i;
3307     ReadDBAliasPtr rdbap;
3308     ReadDBFILEPtr rdfp=NULL;
3309 
3310     /* Check for protein and nucl. alias files first. */
3311     if (*is_prot == READDB_DB_UNKNOWN || *is_prot == READDB_DB_IS_PROT) {
3312 
3313         sprintf(buffer, "%s.pal", full_filename);
3314     rdbap = readdb_read_alias_file(buffer);
3315     if (rdbap && CheckForRecursion(full_filename, rdbap->dblist) == FALSE &&
3316         (rdfp=readdb_new_ex2(rdbap->dblist, TRUE, init_state, rdbap->oidlist, rdbap->gilist)))
3317     {
3318        MemFree(rdfp->aliasfilename);
3319        rdfp->aliasfilename = StringSave(Nlm_FileNameFind(full_filename));
3320         if (rdfp->cih)
3321             rdfp->aliasfilebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids, rdfp->aliasfilename, TRUE);
3322 
3323             /* In case first_oid and last_oid are given in the alias file, we
3324              * create a mask in the following lines that only selects those
3325              * ordinal id's that are in the range of first_oid and last_oid */
3326             if (rdbap->first_oid > 0) {
3327                 OIDListPtr oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
3328                 Int4 total, mask_index, oid, oid_bit;
3329                 oidlist->total = rdbap->last_oid + 1;
3330                 total = rdbap->last_oid/MASK_WORD_SIZE + 2;
3331                 oidlist->list = (Uint4Ptr) MemNew (total*sizeof(Int4));
3332                 oidlist->memory = oidlist->list;
3333                 for (oid=rdbap->first_oid-1; oid<rdbap->last_oid; oid++) {
3334                     mask_index = oid / MASK_WORD_SIZE;
3335                     oid_bit =
3336                         0x1 << (MASK_WORD_SIZE - 1 - oid % MASK_WORD_SIZE);
3337                     oidlist->list[mask_index] |= oid_bit;
3338                 }
3339                 for (i=0; i<total; i++) {
3340                     oidlist->list[i] = Nlm_SwapUint4(oidlist->list[i]);
3341                 }
3342                 oidlist->filename = StringSave(rdfp->aliasfilename);
3343                 rdfp->oidlist = oidlist;
3344             }
3345 
3346             *rdfpp = rdfp;
3347             /* replace standard title with new one. */
3348             if (rdbap->title) {
3349                 ReadDBFILEPtr rdfp_var = NULL;
3350                 if (rdfp->title) {
3351                     MemFree(rdfp->title);
3352                 }
3353                 rdfp->title = rdbap->title;
3354                 rdbap->title = NULL;
3355                 /* Free all other titles since we use one from alias file. */
3356                 rdfp_var = rdfp->next;
3357                 while (rdfp_var) {
3358                     rdfp_var->title = MemFree(rdfp_var->title);
3359                     rdfp_var = rdfp_var->next;
3360                 }
3361             }
3362         /* Length of the database is already calculated in alias file */
3363         if (rdbap->len) {
3364             rdfp->aliaslen = rdbap->len;
3365         }
3366         if (rdbap->nseq) {
3367             rdfp->aliasnseq = rdbap->nseq;
3368         }
3369         if (rdbap->maxlen) {
3370            rdfp->maxlen = rdbap->maxlen;
3371         }
3372         if (rdbap->nseq_stats) {
3373             rdfp->nseq_stats = rdbap->nseq_stats;
3374         }
3375         if (rdbap->len_stats) {
3376            rdfp->totlen_stats = rdbap->len_stats;
3377         }
3378 
3379         rdfp->membership_bit = rdbap->membership;
3380 
3381         rdbap = ReadDBAliasFree(rdbap);
3382         return 1;
3383     }
3384         rdbap = ReadDBAliasFree(rdbap);
3385         /* Try finding an index file */
3386         sprintf(buffer, "%s.pin", full_filename);
3387         length = FileLength(buffer);
3388         if (length > 0) {
3389             *is_prot = READDB_DB_IS_PROT;
3390         }
3391     }
3392     if ((*is_prot == READDB_DB_IS_NUC) || (rdfp == NULL && *is_prot == READDB_DB_UNKNOWN)) {
3393         sprintf(buffer, "%s.nal", full_filename);
3394         rdbap = readdb_read_alias_file(buffer);
3395     if (rdbap && CheckForRecursion(full_filename, rdbap->dblist) == FALSE &&
3396         (rdfp=readdb_new_ex2(rdbap->dblist, FALSE, init_state, rdbap->oidlist, rdbap->gilist)))
3397     {
3398         MemFree(rdfp->aliasfilename);
3399         rdfp->aliasfilename = StringSave(Nlm_FileNameFind(full_filename));
3400         if (rdfp->cih && rdfp->aliasfilebit == 0)
3401             rdfp->aliasfilebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids, rdfp->aliasfilename, FALSE);
3402 
3403             /* In case first_oid and last_oid are given in the alias file, we
3404              * create a mask in the following lines that only selects those
3405              * ordinal id's that are in the range of first_oid and last_oid */
3406             if (rdbap->first_oid > 0) {
3407                 OIDListPtr oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
3408                 Int4 total, mask_index, oid, oid_bit;
3409                 oidlist->total = rdbap->last_oid + 1;
3410                 total = rdbap->last_oid/MASK_WORD_SIZE + 2;
3411                 oidlist->list = (Uint4Ptr) MemNew (total*sizeof(Int4));
3412                 oidlist->memory = oidlist->list;
3413                 for (oid=rdbap->first_oid-1; oid<rdbap->last_oid; oid++) {
3414                     mask_index = oid / MASK_WORD_SIZE;
3415                     oid_bit =
3416                         0x1 << (MASK_WORD_SIZE - 1 - oid % MASK_WORD_SIZE);
3417                     oidlist->list[mask_index] |= oid_bit;
3418                 }
3419                 for (i=0; i<total; i++) {
3420                     oidlist->list[i] = Nlm_SwapUint4(oidlist->list[i]);
3421                 }
3422                 oidlist->filename = StringSave(rdfp->aliasfilename);
3423                 rdfp->oidlist = oidlist;
3424             }
3425 
3426             *rdfpp = rdfp;
3427             /* replace standard title with new one. */
3428             if (rdbap->title) {
3429                 if (rdfp->title) {
3430                     MemFree(rdfp->title);
3431                 }
3432                 rdfp->title = rdbap->title;
3433                 rdbap->title = NULL;
3434         /* Length of the database is already calculated in alias file */
3435         if (rdbap->len) {
3436             rdfp->aliaslen = rdbap->len;
3437         }
3438         if (rdbap->nseq) {
3439             rdfp->aliasnseq = rdbap->nseq;
3440         }
3441         if (rdbap->maxlen) {
3442            rdfp->maxlen = rdbap->maxlen;
3443         }
3444         if (rdbap->nseq_stats) {
3445             rdfp->nseq_stats = rdbap->nseq_stats;
3446         }
3447         if (rdbap->len_stats) {
3448            rdfp->totlen_stats = rdbap->len_stats;
3449         }
3450         rdfp->membership_bit = rdbap->membership;
3451 
3452                 rdfp = rdfp->next;
3453                 while (rdfp) {
3454                     rdfp->title = MemFree(rdfp->title);
3455                     rdfp = rdfp->next;
3456                 }
3457             }
3458 
3459             rdbap = ReadDBAliasFree(rdbap);
3460             return 1;
3461     }
3462         rdbap = ReadDBAliasFree(rdbap);
3463         /* Try finding an index file */
3464         sprintf(buffer, "%s.nin", full_filename);
3465         length = FileLength(buffer);
3466         if (length > 0) {
3467             *is_prot = READDB_DB_IS_NUC;
3468         }
3469     }
3470 
3471     if (length > 0)
3472         return 0;
3473     else
3474         return -1;
3475 }
3476 
/*
    Looks for a non-empty file called 'filename', first exactly as given
    (i.e. relative to the current directory) and then inside 'lookup_dir'.

    lookup_dir: directory to try when the name as given is not found; may
        be NULL, in which case only the name as given is tried.
    filename: name of the file to look for.

    Returns a newly allocated string with the path under which the file was
    found (the caller frees it with MemFree), or NULL if the file was not
    found, the resulting path would not fit in PATH_MAX, or memory could
    not be allocated.
*/
static CharPtr    FindBlastDBFileEx (CharPtr lookup_dir, CharPtr filename)
{
    CharPtr    path;
    Int4    len;

    /* We cannot deal with strings larger than PATH_MAX in a platform
     * independent manner */
    if (StringLen(filename) > PATH_MAX-1) {
        ErrPostEx(SEV_WARNING, 0, 0, "Argument to FindBlastDBFile is "
                  "longer than PATH_MAX");
        return NULL;
    }

    /* check current directory */
    len = FileLength(filename);
    if (len)
        return StringSave(filename);

    /* Nothing else to try.  Test this before allocating so that nothing
       is leaked on this path. */
    if (!lookup_dir)
        return NULL;

    /* The joined path, terminator included, has to fit in the buffer. */
    if (StringLen(lookup_dir) + StringLen(DIRDELIMSTR) + StringLen(filename)
        > PATH_MAX-1) {
        ErrPostEx(SEV_WARNING, 0, 0, "Path built by FindBlastDBFile is "
                  "longer than PATH_MAX");
        return NULL;
    }

    path = MemNew(PATH_MAX);
    if (path == NULL)
        return NULL;

    sprintf(path, "%s%s%s", lookup_dir, DIRDELIMSTR, filename);

    /* see if the file is not empty */
    len = FileLength(path);
    if (len)
        return path;

    /* give up */
    MemFree(path);
    return NULL;
}
3518 
/*
    Tries FindBlastDBFileEx on 'filename' once for every directory in the
    list obtained from s_GetBlastDirInfo, in order.

    Returns whatever FindBlastDBFileEx returned for the first directory
    that produced a non-NULL result, or NULL if none did.
*/
CharPtr    FindBlastDBFile (CharPtr filename)
{
    char dir_list[PATH_MAX];
    char delim[2];
    Char *tok_state;
    Char *dir;
    CharPtr hit;

    /* Handed both buffers; presumably fills in the directory list and the
       delimiter that separates its entries (helper defined elsewhere). */
    s_GetBlastDirInfo(dir_list, delim);

    /* parse paths and lookup filename */
    dir = Nlm_StringTokMT (dir_list, (char*)delim, (char **)&tok_state);
    while (dir != NULL) {
        hit = FindBlastDBFileEx(dir, filename);
        if (hit != NULL)
            return hit;
        dir = Nlm_StringTokMT (NULL, (char*)delim, (char **)&tok_state);
    }

    return NULL;
}
3539 /*
3540     filename: name of the file to be openend.
3541     is_prot: three choices: protein, nucleotide, or either one.
3542     init_state: how much should be initialized.
3543         READDB_NEW_DO_ALL : initialize everything possible
3544         READDB_NEW_DO_REPORT : init enough for a report on db size etc.
3545     cih: common index
3546 */
3547 
3548 static ReadDBFILEPtr
readdb_new_internalEx(CharPtr lookup_dir,CharPtr filename,Uint1 is_prot,Uint1 init_state,CommonIndexHeadPtr cih)3549 readdb_new_internalEx(CharPtr lookup_dir, CharPtr filename, Uint1 is_prot, Uint1 init_state, CommonIndexHeadPtr cih)
3550 {
3551     ReadDBFILEPtr rdfp=NULL;
3552     Char buffer[PATH_MAX], buffer1[PATH_MAX];
3553     Char commonindex_full_filename[PATH_MAX];
3554     Char    database_dir[PATH_MAX] = "";
3555     Uint4 seq_type, formatdb_ver, date_length, title_length, value;
3556     Int2 status;
3557     Int4 length, num_seqs;
3558     CharPtr    charptr, envp = NULL;
3559     Boolean    localdb = FALSE;
3560 
3561     if (filename == NULL)
3562         return NULL;
3563 
3564 
3565     /* We need to find out what directory to use and which index system will
3566        be used for searching OID by give GI.  The algorithm is:
3567        Define blast database directory by present database searching
3568        in the following order  (and stopping when it is found):
3569        1) If absolute path was given, only this file is attempted.
3570        2) Current working directory
3571        3) getenv("BLASTDB")
3572        4) .ncbirc file: a) current working directory, b) home directory, c)
3573            NCBI directory (obtained from the environment)
3574        Then defind which index system to use.  If "CommonIndex" then
3575        we need all CommonIndex and ISAM files to be present in database directory,
3576        if ISAM, them the only ISAM index files should be present in the
3577        current directory
3578     */
3579 
3580 
3581       /* first see in the current directory */
3582 
3583     if ((status=IndexFileExists(filename, &rdfp, &is_prot,
3584                                 init_state)) >= 0) {
3585 
3586     if (status > 0)
3587             return rdfp;
3588 
3589         /* use current directory */
3590         charptr = Nlm_FilePathFind(filename);
3591         StringCpy(database_dir, charptr);
3592         MemFree(charptr);
3593         localdb = TRUE;
3594         rdfp = readdb_destruct(rdfp);
3595     } else  {
3596 	/* set passed directory location */
3597 	if( !lookup_dir ) return NULL;
3598         StringCpy(buffer, lookup_dir);
3599 
3600         sprintf(buffer1, "%s%s%s", buffer, DIRDELIMSTR, filename);
3601         if ((status=IndexFileExists(buffer1, &rdfp, &is_prot,
3602                                     init_state)) >= 0) {
3603             if (status > 0){
3604                 return rdfp;
3605 	    }
3606             /* database file is in directory 'buffer' */
3607             StringCpy(database_dir, buffer);
3608             rdfp = readdb_destruct(rdfp);
3609         }
3610     }
3611 
3612     /* ATTENTION: at this point database_dir contains file directory name */
3613 
3614     rdfp = readdb_destruct(rdfp);
3615     rdfp = ReadDBFILENew();
3616 
3617     if (rdfp == NULL)
3618     return NULL;
3619 
3620     rdfp->filename = StringSave(filename);
3621 
3622     /*rdfp->is_prot = is_prot;*/
3623     if (is_prot)
3624         rdfp->parameters |= READDB_IS_PROT;
3625 
3626 
3627     /* Here we know that database is in database_dir directory */
3628 
3629     /* constract full file name */
3630     if (!StringCmp(database_dir, "")) {
3631         sprintf(rdfp->full_filename, "%s", Nlm_FileNameFind(filename));
3632     } else if (!localdb) {
3633         sprintf(rdfp->full_filename, "%s%s%s", database_dir, DIRDELIMSTR, filename);
3634     } else  {
3635         sprintf(rdfp->full_filename, "%s%s%s", database_dir, DIRDELIMSTR, Nlm_FileNameFind(filename));
3636     }
3637 
3638     /* Now let's find out which index system to use */
3639 
3640     /* First see if user has preferences */
3641     StringCpy(buffer1, "CommonIndex");
3642 
3643     if (getenv("INDEX_SYSTEM") &&
3644         StringCmp(getenv("INDEX_SYSTEM"), "CommonIndex"))
3645         StringCpy(buffer1, "ISAM");
3646 
3647     Nlm_GetAppParam ("NCBI", "BLAST", "INDEX_SYSTEM", buffer1,
3648                      buffer, PATH_MAX);
3649 
3650     isCommonIndex = !StrCmp("CommonIndex", buffer);
3651 
3652     /* now we know that if isCommonIndex == TRUE, than it is
3653        prefered to use CommonIndex */
3654 
3655     /* test if there exist common index file */
3656     if (isCommonIndex) {
3657         if (!StringCmp(database_dir, "")) {
3658             sprintf(commonindex_full_filename, "%s", COMMONINDEX_FN);
3659         } else {
3660             sprintf(commonindex_full_filename, "%s%s%s", database_dir, DIRDELIMSTR, COMMONINDEX_FN);
3661         }
3662 
3663         if (!(length = FileLength(commonindex_full_filename))) {
3664             /* no CommonIndex files in this directory, try to use ISAM only */
3665             isCommonIndex = FALSE;
3666         }
3667     }
3668 
3669     /* check if present main three files: index, sequences, headers */
3670 
3671     sprintf(buffer, "%s.%cin", rdfp->full_filename, is_prot? 'p':'n');
3672     if((rdfp->indexfp = NlmOpenMFILE(buffer)) == NULL) {
3673         ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
3674         rdfp = readdb_destruct(rdfp);
3675         return rdfp;
3676     }
3677 
3678     if (init_state & READDB_NEW_DO_ALL)
3679         if (ReadDBOpenMHdrAndSeqFiles(rdfp) == FALSE)
3680             ErrPostEx(SEV_ERROR, 0, 0,
3681                       "ReadDBOpenMHdrAndSeqFiles: failed to map files\n");
3682 
3683     /* fill in other fields of rdfp-> */
3684     NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3685     formatdb_ver = Nlm_SwapUint4(value);
3686 
3687     /* Here we will handle version of formatdb program */
3688 
3689     if (formatdb_ver != FORMATDB_VER && formatdb_ver != FORMATDB_VER_TEXT) {
3690         ErrPostEx(SEV_WARNING, 0, 0, "readdb: wrong version of formatdb "
3691                   "was used to make database %s.", filename);
3692         rdfp = readdb_destruct(rdfp);
3693         return NULL;
3694     }
3695     rdfp->formatdb_ver = formatdb_ver;
3696 
3697     NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3698     seq_type = Nlm_SwapUint4(value);
3699     if ((is_prot && seq_type == 0) || (!is_prot && seq_type == 1)) {
3700         rdfp = readdb_destruct(rdfp);
3701         return rdfp;
3702     }
3703     NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3704     title_length = Nlm_SwapUint4(value);
3705 
3706     if (title_length) {
3707         rdfp->title = (CharPtr)Nlm_Malloc((title_length+1)*sizeof(Char));
3708         NlmReadMFILE((Uint1Ptr) rdfp->title, title_length, 1, rdfp->indexfp);
3709         rdfp->title[title_length] = NULLB;
3710     } else {    /* Use the filename, if there is no title. */
3711         rdfp->title = StringSave(rdfp->filename);;
3712     }
3713 
3714     NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3715     date_length = Nlm_SwapUint4(value);
3716 
3717     rdfp->date = (CharPtr)Nlm_Malloc((date_length+1)*sizeof(Char));
3718     NlmReadMFILE((Uint1Ptr) rdfp->date, date_length, 1, rdfp->indexfp);
3719     rdfp->date[date_length] = NULLB;
3720 
3721     NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3722     num_seqs = rdfp->num_seqs = Nlm_SwapUint4(value);
3723 
3724     if (formatdb_ver == FORMATDB_VER_TEXT)
3725     {
3726         NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3727         rdfp->totlen = Nlm_SwapUint4(value);
3728     }
3729     else
3730     {
3731         rdfp->totlen = FormatDbUint8Read(rdfp->indexfp);
3732     }
3733 
3734     NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3735     rdfp->maxlen = Nlm_SwapUint4(value);
3736 
3737     /* Initializing taxonomy names database if it exists (only once!) */
3738     if (rdfp->formatdb_ver > FORMATDB_VER_TEXT &&
3739         init_state & READDB_NEW_DO_TAXDB && taxonomyDbLoaded == FALSE) {
3740         rdfp->taxinfo = RDBTaxInfoInit();
3741         taxonomyDbLoaded = TRUE;
3742     }
3743 
3744     if (init_state & READDB_NEW_DO_REPORT) {
3745         rdfp->parameters |= READDB_CONTENTS_ALLOCATED;
3746         /*rdfp->contents_allocated = TRUE; */
3747         /* Some was allocated, but index pointers are NULLs - that's OK */
3748         return rdfp;
3749     }
3750 
3751     if (!((title_length + date_length)%4) && rdfp->indexfp->mfile_true) {
3752         rdfp->header_index = (Uint4Ptr) rdfp->indexfp->mmp;
3753         rdfp->indexfp->mmp += 4 * (num_seqs+1);
3754 
3755         rdfp->sequence_index = (Uint4Ptr) rdfp->indexfp->mmp;
3756         rdfp->indexfp->mmp += 4 * (num_seqs+1);
3757 
3758         rdfp->ambchar_index = (Uint4Ptr) rdfp->indexfp->mmp;
3759         rdfp->indexfp->mmp += 4 * (num_seqs+1);
3760     } else {
3761         /* Use old stuff */
3762 
3763         if((rdfp->header_index =
3764             (Uint4Ptr) Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3765             rdfp = readdb_destruct(rdfp);
3766             return rdfp;
3767         }
3768 
3769         rdfp->header_index_start = rdfp->header_index;
3770         rdfp->header_index_offset = NlmTellMFILE(rdfp->indexfp);
3771         NlmReadMFILE((Uint1Ptr) rdfp->header_index, 4, num_seqs+1,
3772                      rdfp->indexfp);
3773 
3774         if((rdfp->sequence_index =
3775             (Uint4Ptr)Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3776             rdfp = readdb_destruct(rdfp);
3777             return rdfp;
3778         }
3779         rdfp->sequence_index_start = rdfp->sequence_index;
3780         NlmReadMFILE((Uint1Ptr) rdfp->sequence_index, 4, num_seqs+1,
3781                      rdfp->indexfp);
3782 
3783         /* For nucleotide sequence we will process ambiguity file */
3784         if(!is_prot) {
3785             if((rdfp->ambchar_index = (Uint4Ptr)Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3786                 rdfp = readdb_destruct(rdfp);
3787                 return rdfp;
3788             }
3789             rdfp->ambchar_index_start = rdfp->ambchar_index;
3790             NlmReadMFILE((Uint1Ptr) rdfp->ambchar_index, 4, num_seqs+1, rdfp->indexfp);
3791         }
3792     }
3793 
3794 
3795     /* Contents were allocated above. */
3796     /*rdfp->contents_allocated = TRUE;*/
3797     rdfp->parameters |= READDB_CONTENTS_ALLOCATED;
3798 
3799     /* mmap is not being used, allocate a buffer 2 longer (for sentinel bytes)
3800        than the longest subject length. */
3801     if (rdfp->sequencefp && rdfp->sequencefp->mfile_true == FALSE) {
3802         rdfp->buffer = (UcharPtr)Nlm_Malloc((2+rdfp->maxlen)*sizeof(Uint1));
3803         if (rdfp->buffer == NULL) {
3804             rdfp = readdb_destruct(rdfp);
3805             return rdfp;
3806         }
3807         rdfp->allocated_length = 2 + rdfp->maxlen;
3808     }
3809 
3810     /* Now initializing Numeric ISAM indexes */
3811     sprintf(buffer,  "%s.%cnd", rdfp->full_filename, is_prot? 'p':'n');
3812     sprintf(buffer1, "%s.%cni", rdfp->full_filename, is_prot? 'p':'n');
3813 
3814     if(FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3815         if((rdfp->nisam_opt = ISAMObjectNew(ISAMNumeric,
3816                                             buffer, buffer1)) == NULL) {
3817             ErrPostEx(SEV_WARNING, 0, 0, "Failed to create NISAM object");
3818             rdfp = readdb_destruct(rdfp);
3819             return rdfp;
3820         }
3821     }
3822 
3823     /* Now initializing string ISAM indexes */
3824 
3825     sprintf(buffer,  "%s.%csd", rdfp->full_filename, is_prot? 'p':'n');
3826     sprintf(buffer1, "%s.%csi", rdfp->full_filename, is_prot? 'p':'n');
3827 
3828     if(FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3829 
3830         if((rdfp->sisam_opt = ISAMObjectNew(ISAMString,
3831                                             buffer, buffer1)) == NULL) {
3832             ErrPostEx(SEV_WARNING, 0, 0, "Failed to create SISAM object");
3833             rdfp = readdb_destruct(rdfp);
3834             return rdfp;
3835         }
3836 
3837         /* This line may be given only for information - how to access
3838            this parameter. We need to intialize ISAM database before
3839            this parameter is available using function above */
3840         rdfp->sparse_idx = ((ISAMDataPtr) rdfp->sisam_opt)->idx_option;
3841     }
3842 
3843     /* Now initializing PIG ISAM indexes */
3844     if (is_prot) {
3845         sprintf(buffer,  "%s.ppd", rdfp->full_filename);
3846         sprintf(buffer1, "%s.ppi", rdfp->full_filename);
3847 
3848         if (FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3849             if ( !(rdfp->isam_pig = ISAMObjectNew(ISAMNumeric,
3850                                                   buffer, buffer1))) {
3851                 ErrPostEx(SEV_WARNING, 0, 0, "Failed to read PIG ISAM object");
3852                 rdfp = readdb_destruct(rdfp);
3853                 return rdfp;
3854             }
3855         }
3856     }
3857 
3858 
3859     /* Now initializing Common index files */
3860     if (isCommonIndex) {
3861         if (cih) {
3862             rdfp->cih = cih;
3863             /*rdfp->handle_common_index = FALSE;*/
3864             rdfp->parameters &= ~READDB_HANDLE_COMMON_INDEX;
3865         } else {
3866             rdfp->cih = CommonIndexInit(commonindex_full_filename);
3867             /*rdfp->handle_common_index = TRUE;*/
3868             rdfp->parameters |= READDB_HANDLE_COMMON_INDEX;
3869         }
3870         if (!(rdfp->cih)) {
3871             isCommonIndex = FALSE;
3872             /*rdfp->handle_common_index = FALSE;*/
3873             rdfp->parameters &= ~READDB_HANDLE_COMMON_INDEX;
3874         } else {
3875             rdfp->filebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids,
3876                                     Nlm_FileNameFind(filename), is_prot);
3877         }
3878     }
3879 
3880     /* Initialize shared information structure */
3881     rdfp->shared_info = (ReadDBSharedInfoPtr) MemNew(sizeof(ReadDBSharedInfo));
3882 
3883     /* Without this, FDReadDeflineAsn will fail in multi-threaded mode! */
3884     if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
3885         SeqEntryLoad();
3886         fdlobjAsnLoad();
3887     }
3888 
3889     return rdfp;
3890 }
3891 
3892 /*
3893     filename: name of the file to be openend.
3894     is_prot: three choices: protein, nucleotide, or either one.
3895     init_state: how much should be initialized.
3896         READDB_NEW_DO_ALL : initialize everything possible
3897         READDB_NEW_DO_REPORT : init enough for a report on db size etc.
3898     cih: common index
3899 */
3900 static ReadDBFILEPtr
readdb_new_internal(CharPtr filename,Uint1 is_prot,Uint1 init_state,CommonIndexHeadPtr cih)3901 readdb_new_internal(CharPtr filename, Uint1 is_prot, Uint1 init_state, CommonIndexHeadPtr cih)
3902 {
3903     ReadDBFILEPtr ret_rdfp = NULL;
3904     Char *wrk_buf;
3905     char blast_dir[PATH_MAX];
3906     Char *one_blast_dir;
3907     char path_delim[2];
3908     if( (ret_rdfp = readdb_new_internalEx(NULL,filename,is_prot,init_state,cih)) ){
3909 	return ret_rdfp;
3910     }
3911 
3912     s_GetBlastDirInfo(blast_dir, path_delim);
3913 
3914     for( one_blast_dir = Nlm_StringTokMT (blast_dir,(char*)path_delim, (char **)&wrk_buf);
3915 	 one_blast_dir != NULL;
3916 	 one_blast_dir = Nlm_StringTokMT (NULL,(char*)path_delim, (char **)&wrk_buf) )
3917     {
3918 	ret_rdfp = readdb_new_internalEx( one_blast_dir,filename,is_prot,init_state,cih);
3919 	if( ret_rdfp )  return ret_rdfp;
3920 
3921         /* Try it again, stripping any path off the filename. */
3922         ret_rdfp = readdb_new_internalEx( one_blast_dir, Nlm_FileNameFind(filename),is_prot,init_state,cih);
3923 	if( ret_rdfp )  return ret_rdfp;
3924     }
3925     return NULL;
3926 }
3927 
/*
    Releases an OIDList together with what it refers to: the 'memory'
    buffer if one is set, otherwise the memory-mapped file handle 'mfp';
    then the file name and the structure itself.

    Always returns NULL, so the result can be assigned back to the pointer
    that was freed.  A NULL argument is accepted.
*/
OIDListPtr OIDListFree (OIDListPtr oidlist)

{
    if (oidlist != NULL) {
        /* Only one of the two backing stores is released. */
        if (oidlist->memory) {
            MemFree(oidlist->memory);
        } else {
            NlmCloseMFILE(oidlist->mfp);
        }

        if (oidlist->filename) {
            MemFree(oidlist->filename);
        }

        MemFree(oidlist);
    }

    return NULL;
}
3946 
3947 
ReadOIDList(OIDListPtr oidlist)3948 Boolean    ReadOIDList (OIDListPtr oidlist)
3949 {
3950     NlmMFILEPtr       mmfile;
3951     Int4 length;
3952 
3953 
3954 
3955     /* format of the file:
3956        <number of OID's - N>
3957        <oid1>
3958        <oid2>
3959        ...
3960        <oidN>
3961      */
3962 
3963     /* use memmap */
3964     mmfile = NlmOpenMFILE(oidlist->filename);
3965     if (!mmfile) {
3966         ErrPostEx(SEV_ERROR, 0, 0, "Could not open OID mask %s\n",
3967                   oidlist->filename);
3968         return FALSE;
3969     }
3970 
3971     if (mmfile->mfile_true == FALSE)
3972     {
3973     	length = FileLength(oidlist->filename);
3974    	oidlist->memory = MemNew(length);
3975     	if (oidlist->memory == NULL)
3976         	return FALSE;
3977     	FileRead(oidlist->memory, length, 1, mmfile->fp);
3978     	oidlist->list = oidlist->memory + 1;
3979     	oidlist->total = Nlm_SwapUint4(*((Int4Ptr) oidlist->memory));
3980     	NlmCloseMFILE(mmfile);
3981     }
3982     else
3983     {
3984         oidlist->list = (Uint4Ptr) mmfile->mmp_begin + 1;
3985     	oidlist->mfp = mmfile;
3986     	oidlist->total = Nlm_SwapUint4(*((Int4Ptr) mmfile->mmp_begin));
3987     }
3988 
3989     return TRUE;
3990 }
3991 
3992 Int4 LIBCALL
readdb_validate(ReadDBFILEPtr rdfp)3993 readdb_validate (ReadDBFILEPtr rdfp)
3994 {
3995     Int4 retval = READDB_VALID;
3996 
3997     if ( !rdfp ) {
3998         return READDB_INVALID_NULL_ARG;
3999     }
4000 
4001     /* Verify that all elements of the rdfp linked list are either protein or
4002      * nucleotide */
4003     {
4004         Boolean is_prot = (rdfp->parameters & READDB_IS_PROT) ? TRUE : FALSE;
4005         for (; rdfp; rdfp = rdfp->next) {
4006             if ((rdfp->parameters & READDB_IS_PROT) && !is_prot) {
4007                 retval = READDB_INVALID_MIXED_DBS;
4008                 break;
4009             }
4010         }
4011     }
4012 
4013     return retval;
4014 }
4015 
4016 ReadDBFILEPtr LIBCALL
readdb_new_ex(CharPtr filename,Uint1 is_prot,Boolean init_indices)4017 readdb_new_ex (CharPtr filename, Uint1 is_prot, Boolean init_indices)
4018 
4019 {
4020     return readdb_new_ex2(filename, is_prot, READDB_NEW_INDEX, NULL, NULL);
4021 }
4022 
4023 /* Maximum number of rdfp structures during calls to readdb_new_ex2 before we
4024  * call readdb_merge_gifiles */
4025 #define RDFP_THRESHOLD 10
4026 
4027 ReadDBFILEPtr LIBCALL
readdb_new_ex2(CharPtr filename,Uint1 is_prot,Uint1 init_state,CharPtr oidlist,CharPtr gilist)4028 readdb_new_ex2 (CharPtr filename, Uint1 is_prot, Uint1 init_state, CharPtr oidlist, CharPtr gilist)
4029 
4030 {
4031     Boolean done = FALSE, duplicate_db;
4032     Char buffer[PATH_MAX], buffer_oidlist[PATH_MAX];
4033     Int4 start=0, old_start = 0;
4034     ReadDBFILEPtr new, tmp, var, var1, rdfp_w_oidlist, var2;
4035     CommonIndexHeadPtr    cih = NULL;
4036     Int4 num_whole_db = 0, i, rdfp_ctr = 0;
4037 
4038     new = NULL;
4039     rdfp_w_oidlist = NULL;
4040     buffer_oidlist[0] = NULLB;
4041 
4042     while (!done) {
4043         done = readdb_parse_db_names(&filename, buffer);
4044         if (*buffer == NULLB)
4045             break;
4046         if (oidlist) { /* NOTE: no account taken of duplicate databases?? */
4047             readdb_parse_db_names(&oidlist, buffer_oidlist);
4048             if (*buffer_oidlist == NULLB)
4049                 break;
4050         }
4051         /* Look for duplicates of the database names. */
4052         duplicate_db = FALSE;
4053         var1 = new;
4054         while (var1) {
4055             if (StringCmp(readdb_get_filename(var1), buffer) == 0) {
4056                 duplicate_db = TRUE;
4057                 break;
4058             }
4059             var1 = var1->next;
4060         }
4061         if (duplicate_db)
4062             continue;
4063 
4064         /* 'continue' if return is NULL in case only one of many databases can't
4065            be found.  Warning issued by readdb_new_internal. */
4066         if(!(tmp = readdb_new_internal(buffer, is_prot, init_state, cih)))
4067 
4068             continue;
4069 
4070         if (tmp->cih) {
4071             cih = tmp->cih;
4072         }
4073 
4074         while (tmp) {
4075            if (tmp->oidlist) {
4076               /* Save these separately. */
4077               if (rdfp_w_oidlist == NULL) {
4078                  rdfp_w_oidlist = tmp;
4079               } else {
4080                  var = rdfp_w_oidlist;
4081                  while (var->next)
4082                     var = var->next;
4083                  var->next = tmp;
4084               }
4085            } else {
4086               if (!(tmp->parameters & READDB_NOT_FIRST_TIME)) {
4087                  tmp->parameters |= READDB_NOT_FIRST_TIME;
4088                  if (buffer_oidlist[0] != NULLB) {
4089 
4090                 /* read this OID list */
4091                     tmp->oidlist = (OIDListPtr) MemNew (sizeof(OIDList));
4092                     tmp->oidlist->filename = StringSave(buffer_oidlist);
4093                     if (!ReadOIDList(tmp->oidlist)) {
4094                        return NULL;
4095                     }
4096                  }
4097               }
4098 
4099               if (new == NULL) {
4100                     new = tmp;
4101               } else {
4102                     var = new;
4103                     while(var->next)
4104                        var = var->next;
4105                     var->next = tmp;
4106               }
4107            }
4108            if (gilist) {
4109                /*tmp->gifile = StringSave(gilist); CC: No need for this */
4110                tmp->gilist = Int4ListReadFromFile(gilist);
4111            }
4112            var = tmp->next;
4113            tmp->next = NULL;
4114            tmp = var;
4115         }
4116 
4117         /* If we have more than RDFP_THRESHOLD elements in new, try to
4118          * compress merge rdfp's that have the same underlying blast database
4119          * so that we don't mmap too many index files. This is not an issue if
4120          * init_state is READDB_NEW_DO_REPORT. */
4121         for (var2 = new, rdfp_ctr = 0; var2; var2 = var2->next, rdfp_ctr++) ;
4122         if (rdfp_ctr > RDFP_THRESHOLD && !(init_state & READDB_NEW_DO_REPORT))
4123             new = readdb_merge_gifiles(new);
4124     }
4125 
4126     /* Attach the RDFP's with an OID.
4127        Check if any of them are already present as complete databases */
4128     {{
4129     if (rdfp_w_oidlist) {
4130         if (new == NULL) {
4131             num_whole_db = 0;
4132             new = rdfp_w_oidlist;
4133             var = NULL;
4134         } else {
4135             num_whole_db = 1;
4136             var = new;
4137             while(var->next) {
4138                 num_whole_db++;
4139                 var = var->next;
4140             }
4141             var->next = rdfp_w_oidlist;
4142         }
4143     }
4144 
4145     if (num_whole_db > 0) {
4146         var1 = var;
4147         while (rdfp_w_oidlist) {
4148             for (i=0, var = new; i<num_whole_db; i++, var = var->next) {
4149                 if (StringCmp(var->full_filename, rdfp_w_oidlist->full_filename)
4150                     == 0) {
4151                     var1->next = rdfp_w_oidlist->next;
4152                     rdfp_w_oidlist->next = NULL;
4153                     readdb_destruct(rdfp_w_oidlist);
4154                     rdfp_w_oidlist = var1->next;
4155                     break;
4156                 }
4157             }
4158             if (i==num_whole_db) {
4159                 var1 = rdfp_w_oidlist;
4160                 rdfp_w_oidlist = rdfp_w_oidlist->next;
4161             }
4162         }
4163     }
4164     }}
4165 
4166     /* For databases such as the Microbial blast databases, where the list of
4167      * databases includes many of the same underlying database and different
4168      * gi lists to specify a subset of the database, concatenate the gi lists
4169      * and keep only one copy of the underlying rdfp structure (avoid mmap'ing
4170      * the same index file multiple times). This is not an issue if init_state
4171      * is READDB_NEW_DO_REPORT.  */
4172     if (!(init_state & READDB_NEW_DO_REPORT))
4173         new = readdb_merge_gifiles(new);
4174 
4175     /* adjust all the RDFP's. */
4176     tmp = new;
4177     start = 0;
4178     while (tmp) {
4179     /* this may have been adjusted on a previous call to this function,
4180            readjust for indices. */
4181         old_start = tmp->start;
4182         tmp->start = start;
4183         tmp->stop = tmp->num_seqs-1+start;
4184         tmp->ambchar_index -= (start-old_start);
4185         tmp->header_index -= (start-old_start);
4186         tmp->sequence_index -= (start-old_start);
4187 
4188         start = tmp->stop+1;
4189         tmp = tmp->next;
4190     }
4191 
4192     if (new)
4193        /*new->not_first_time = FALSE;*/
4194        new->parameters &= ~READDB_NOT_FIRST_TIME;
4195     return new;
4196 }
4197 
4198 ReadDBFILEPtr LIBCALL
readdb_new(CharPtr filename,Uint1 is_prot)4199 readdb_new (CharPtr filename, Uint1 is_prot)
4200 
4201 {
4202 
4203     return readdb_new_ex(filename, is_prot, TRUE);
4204 }
4205 
4206 /*
4207     Get total length and number of sequences in multiple databases.
4208 */
4209 
4210 Boolean LIBCALL
readdb_get_totals(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num)4211 readdb_get_totals(ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num)
4212 
4213 {
4214     return readdb_get_totals_ex(rdfp_list, total_len, total_num, FALSE);
4215 }
4216 
4217 
4218 
4219 /*
4220     Get total length and number of sequences in multiple databases.
4221     if 'use_alias' is TRUE, values from the alias file will be used
4222     if non-zero.
4223 */
4224 
4225 Boolean LIBCALL
readdb_get_totals_ex(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias)4226 readdb_get_totals_ex(ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num, Boolean use_alias)
4227 
4228 {
4229     return readdb_get_totals_ex2(rdfp_list, total_len, total_num, use_alias,
4230             FALSE);
4231 }
4232 
4233 /* retrieves the total number of sequences and database length in the
4234  * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
4235  * options: use_virtual_oidlist assumes this rdfp_list has been processed by
4236  * BlastProcessGiLists. */
4237 Boolean LIBCALL
readdb_get_totals_ex2(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias,Boolean use_virtual_oidlist)4238 readdb_get_totals_ex2 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr total_len,
4239         Int4Ptr total_num, Boolean use_alias, Boolean use_virtual_oidlist))
4240 {
4241     return readdb_get_totals_ex3(rdfp_list, total_len, total_num, use_alias,
4242                                  use_virtual_oidlist, eExact);
4243 }
4244 
4245 /* retrieves the total number of sequences and database length in the
4246  * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
4247  * options: use_virtual_oidlist assumes this rdfp_list has been processed by
4248  * BlastProcessGiLists. */
4249 Boolean LIBCALL
readdb_get_totals_ex3(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias,Boolean use_virtual_oidlist,EAccountingMode acc_mode)4250 readdb_get_totals_ex3 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr total_len,
4251         Int4Ptr total_num, Boolean use_alias, Boolean use_virtual_oidlist,
4252         EAccountingMode acc_mode))
4253 {
4254     ReadDBFILEPtr rdfp;
4255     OIDListPtr virtual_oidlist = NULL;
4256     Uint4 maskindex, i, base = 0, total_mask;
4257     Uint4 mask;
4258     typedef Int4 (LIBCALL *fun_ptr) (ReadDBFILEPtr, Int4);
4259 
4260     fun_ptr get_sequence_length = (acc_mode == eExact ?
4261                                    &readdb_get_sequence_length :
4262                                    &readdb_get_sequence_length_approx);
4263     *total_len = 0;
4264     *total_num = 0;
4265 
4266     if (rdfp_list == NULL || total_len == NULL || total_num == NULL)
4267         return FALSE;
4268 
4269     if (use_alias && use_virtual_oidlist)
4270         return FALSE;
4271 
4272     if (use_virtual_oidlist) {
4273 
4274         for (rdfp = rdfp_list; rdfp; rdfp = rdfp->next) {
4275 
4276             if ((virtual_oidlist = rdfp->oidlist)) {
4277                 total_mask = virtual_oidlist->total/MASK_WORD_SIZE + 1;
4278                 maskindex = 0;
4279 
4280                 while (maskindex < total_mask){
4281                     mask = SwapUint4(virtual_oidlist->list[maskindex]);
4282                     i = 0;
4283                     while (mask) {
4284                         if ((mask & (((Uint4)0x1) << (MASK_WORD_SIZE-1)))) {
4285                             (*total_num)++;
4286                             *total_len += (*get_sequence_length)(rdfp_list,
4287                                     base+i);
4288                         }
4289                         mask <<= 1;
4290                         i++;
4291                     }
4292                     maskindex++;
4293                     base += MASK_WORD_SIZE;
4294                 }
4295                 break; /* virtual oidlist is always the last oidlist */
4296             }
4297             *total_len += readdb_get_dblen(rdfp);
4298             *total_num += readdb_get_num_entries(rdfp);
4299         }
4300     } else {
4301 
4302         while (rdfp_list) {
4303 
4304             /* Note well: This assumes that the information in the alias file is
4305              * accurate, if the aliaslen and aliasnseq fields are inaccurate, so
4306              * will be the total number of sequences and database length
4307              * returned */
4308             if (use_alias && rdfp_list->aliasfilename) {
4309                 if (rdfp_list->aliaslen >= 0
4310                         && (rdfp_list->oidlist || rdfp_list->gifile ||
4311                             rdfp_list->gilist))
4312                     *total_len += rdfp_list->aliaslen;
4313                 else
4314                     *total_len += readdb_get_dblen(rdfp_list);
4315 
4316                 if (rdfp_list->aliasnseq >= 0
4317                         && (rdfp_list->oidlist || rdfp_list->gifile ||
4318                             rdfp_list->gilist))
4319                     *total_num += rdfp_list->aliasnseq;
4320                 else
4321                     *total_num += readdb_get_num_entries(rdfp_list);
4322             } else {
4323                 *total_len += readdb_get_dblen(rdfp_list);
4324                 *total_num += readdb_get_num_entries(rdfp_list);
4325             }
4326             rdfp_list = rdfp_list->next;
4327         }
4328     }
4329 
4330     return TRUE;
4331 
4332 }
4333 
4334 
4335 /*
4336 	Gets the number to be used for statistical purposes.  Should be set in
4337         alias file as STATS_NSEQ and STATS_TOTLEN.
4338 */
4339 Boolean LIBCALL
readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list,Int4 * num_seq_stats,Int8 * tot_len_stats)4340 readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list, Int4* num_seq_stats, Int8* tot_len_stats)
4341 {
4342    Int4 num_seqs=0;
4343    Int8 tot_len=0;
4344 
4345    if (rdfp_list == NULL)
4346      return FALSE;
4347 
4348    while (rdfp_list)
4349    {
4350        num_seqs += rdfp_list->nseq_stats;
4351        tot_len += rdfp_list->totlen_stats;
4352        rdfp_list = rdfp_list->next;
4353    }
4354    *num_seq_stats = num_seqs;
4355    *tot_len_stats = tot_len;
4356    return TRUE;
4357 }
4358 
4359 
4360 
4361 /*
4362     Checks whether a ReadDBFILEPtr is the original, or just attaced.
4363     It does this by checking the rdfp->contents_allocated flag.
4364 */
4365 Boolean LIBCALL
readdb_copy(ReadDBFILEPtr rdfp)4366 readdb_copy (ReadDBFILEPtr rdfp)
4367 
4368 {
4369     if (rdfp == NULL)
4370         return FALSE;
4371 
4372     /* if allocated, this is not a copy. */
4373     /*if (rdfp->contents_allocated)*/
4374     if (rdfp->parameters & READDB_CONTENTS_ALLOCATED)
4375         return FALSE;
4376 
4377     return TRUE;
4378 }
4379 
4380 /* Compare rdfp1 with rdfp2 for identical:
4381    molecule type (prot/nucl)
4382    total number of bases/residues
4383    maximum sequence length
4384    file name
4385    date of creation
4386    membership_bit
4387    oidlist
4388 */
4389 Boolean
readdb_compare_basic(ReadDBFILEPtr rdfp1,ReadDBFILEPtr rdfp2)4390 readdb_compare_basic(ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2)
4391 {
4392     if (rdfp1 == NULL || rdfp2 == NULL)
4393         return FALSE;
4394 
4395     if (rdfp1 == rdfp2)
4396         return TRUE;
4397 
4398     /*if (rdfp1->is_prot != rdfp2->is_prot)*/
4399     if ((rdfp1->parameters & READDB_IS_PROT) !=
4400         (rdfp2->parameters & READDB_IS_PROT))
4401         return FALSE;
4402 
4403     if (rdfp1->totlen != rdfp2->totlen)
4404         return FALSE;
4405 
4406     if (rdfp1->maxlen != rdfp2->maxlen)
4407         return FALSE;
4408 
4409     if (StringCmp(rdfp1->filename, rdfp2->filename) != 0)
4410         return FALSE;
4411 
4412     if (StringCmp(rdfp1->date, rdfp2->date) != 0)
4413         return FALSE;
4414 
4415     if (rdfp1->membership_bit != rdfp2->membership_bit)
4416         return FALSE;
4417 
4418     if ((rdfp1->oidlist!=NULL && rdfp2->oidlist==NULL) ||
4419         (rdfp1->oidlist==NULL && rdfp2->oidlist!=NULL))
4420         return FALSE;
4421 
4422         /* If both have a valid oidlist ... */
4423     if ((rdfp1->oidlist && rdfp2->oidlist) &&
4424         (rdfp1->oidlist->filename && rdfp2->oidlist->filename) &&
4425         /* but different filenames, then they must have different oidlists */
4426         (StringCmp(rdfp1->oidlist->filename, rdfp2->oidlist->filename) != 0))
4427             return FALSE;
4428 
4429     return TRUE;
4430 }
4431 
4432 /*
4433     Check whether two different ReadDBFILEPtr refer to the
4434     same database.
4435 
4436     If they are, then TRUE is returned.
4437 */
4438 Boolean LIBCALL
readdb_compare(ReadDBFILEPtr rdfp1,ReadDBFILEPtr rdfp2)4439 readdb_compare(ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2)
4440 {
4441     Boolean same_title = (StringCmp(rdfp1->title, rdfp2->title) == 0);
4442 
4443     return (same_title && readdb_compare_basic(rdfp1, rdfp2));
4444 }
4445 
4446 
4447 /* This function attempts to merge the contents of rdfp->gilist(s) of those
4448  * rdfp's in rdfp_chain that have the same underlying blast database. This is
4449  * done so that we don't mmap the same index files multiple times. */
readdb_merge_gifiles(ReadDBFILEPtr rdfp_chain)4450 static ReadDBFILEPtr readdb_merge_gifiles (ReadDBFILEPtr rdfp_chain)
4451 {
4452     register ReadDBFILEPtr rdfp = NULL, temp = NULL, prev = NULL;
4453     CharPtr title = NULL;
4454     Int4 title_len = 0;
4455 
4456     for (rdfp = prev = rdfp_chain; rdfp; rdfp = rdfp->next, prev = rdfp) {
4457 
4458         for (temp = rdfp->next; temp; prev = temp, temp = temp->next) {
4459 
4460             if (!readdb_compare_basic(rdfp, temp))
4461                 continue;
4462             /* rdfp and temp have the same underlying database, so we combine
4463                them */
4464             prev->next = temp->next;
4465             temp->next = NULL;
4466 
4467             /*** Merge the gilists, if any ***/
4468             if (temp->gilist) {
4469                 rdfp->gilist = Int4ListConcat(&rdfp->gilist, &temp->gilist);
4470                 ASSERT(rdfp->gifile == NULL && temp->gifile == NULL);
4471             }
4472 
4473             /*** Keep track of the length and number of sequences according to
4474              * the gi lists ***/
4475             rdfp->aliaslen += temp->aliaslen;
4476             rdfp->aliasnseq += temp->aliasnseq;
4477 
4478             /*** Concatenate the titles ***/
4479             if (temp->title) {
4480                 title_len = StringLen(rdfp->title) + StringLen(temp->title) + 3;
4481                 title = (CharPtr) MemNew(sizeof(Char)*title_len);
4482                 if (rdfp->title) {
4483                     title = StringCat(title, rdfp->title);
4484                     title = StringCat(title, "; ");
4485                 }
4486                 title = StringCat(title, temp->title);
4487                 rdfp->title = MemFree(rdfp->title);
4488                 rdfp->title = title;
4489             }
4490 
4491             /*** Free temp ***/
4492             temp = readdb_destruct(temp);
4493             temp = prev;
4494         }
4495 
4496     }
4497 
4498     /* In case new real databases have been found (i.e.: alias file referring to
4499      * another alias file(s) along with real database(s)), arrange them so that
4500      * the real databases are at the front of the rdfp_chain */
4501     {
4502         ReadDBFILEPtr rdfp_w_gilist = NULL;
4503 
4504         rdfp = rdfp_chain;
4505         rdfp_chain = NULL;
4506 
4507         /* separate rdfp's w/ gilists and real databases */
4508         while (rdfp) {
4509 
4510             if (rdfp->gilist) {
4511                 if (rdfp_w_gilist == NULL) {
4512                     rdfp_w_gilist = rdfp;
4513                 } else {
4514                     temp = rdfp_w_gilist;
4515                     while (temp->next)
4516                         temp = temp->next;
4517                     temp->next = rdfp;
4518                 }
4519             } else {
4520                 if (rdfp_chain == NULL) {
4521                     rdfp_chain = rdfp;
4522                 } else {
4523                     temp = rdfp_chain;
4524                     while (temp->next)
4525                         temp = temp->next;
4526                     temp->next = rdfp;
4527                 }
4528             }
4529             temp = rdfp->next;
4530             rdfp->next = NULL;
4531             rdfp = temp;
4532         }
4533 
4534         /* append the rdfp_w_gilist to the rdfp_chain */
4535         if ( (temp = rdfp_chain)) {
4536             while (temp->next)
4537                 temp = temp->next;
4538             temp->next = rdfp_w_gilist;
4539         } else
4540             rdfp_chain = rdfp_w_gilist;
4541     }
4542 
4543     return rdfp_chain;
4544 }
4545 
4546 /*
4547     Attach to an already open ReadDBFILEPtr.  Duplicate the
4548     indexfp, sequencefp, and headerfp structures as the pointers
4549     there (i.e., mmp) will need to be manipulated.  Do not
4550     change the FILE PNTR fp.
4551 */
4552 
4553 ReadDBFILEPtr LIBCALL
readdb_attach(ReadDBFILEPtr rdfp)4554 readdb_attach (ReadDBFILEPtr rdfp)
4555 
4556 {
4557     ReadDBFILEPtr head, last, new_t;
4558 
4559     if (rdfp == NULL)
4560         return NULL;
4561 
4562     head = NULL;
4563     last = NULL;
4564     while (rdfp)
4565     {
4566         new_t = (ReadDBFILEPtr) MemDup(rdfp, sizeof(ReadDBFILE));
4567 
4568         /*
4569         The contents_allocated flag DOES NOT apply to the actual
4570         structures indexfp, headerfp, or sequencefp.  These must always
4571         be duplicated, as their pointers need to be independently
4572         manipulated by threads.  They have their own allocation flags.
4573         */
4574                /*new_t->contents_allocated = FALSE;*/
4575         new_t->parameters &= ~READDB_CONTENTS_ALLOCATED;
4576            new_t->indexfp = (NlmMFILEPtr) MemDup(rdfp->indexfp,
4577                                               sizeof(NlmMFILE));
4578         new_t->indexfp->contents_allocated = FALSE;
4579         if (rdfp->headerfp != NULL) {
4580            new_t->headerfp = (NlmMFILEPtr) MemDup(rdfp->headerfp,
4581                               sizeof(NlmMFILE));
4582            new_t->headerfp->contents_allocated = FALSE;
4583         }
4584         if (rdfp->sequencefp != NULL) {
4585            new_t->sequencefp = (NlmMFILEPtr) MemDup(rdfp->sequencefp,
4586                                 sizeof(NlmMFILE));
4587            new_t->sequencefp->contents_allocated = FALSE;
4588         }
4589 
4590         if (new_t->taxinfo != NULL) {
4591             new_t->taxinfo = (RDBTaxInfoPtr)
4592                 MemDup(rdfp->taxinfo, sizeof(RDBTaxInfo));
4593 
4594             if (new_t->taxinfo->taxfp != NULL) {
4595                 new_t->taxinfo->taxfp = (NlmMFILEPtr)
4596                     MemDup(rdfp->taxinfo->taxfp, sizeof(NlmMFILE));
4597                 new_t->taxinfo->taxfp->contents_allocated = FALSE;
4598             }
4599 
4600             new_t->taxinfo->name_fd = (NlmMFILEPtr)
4601                 MemDup(rdfp->taxinfo->name_fd, sizeof(NlmMFILE));
4602             new_t->taxinfo->name_fd->contents_allocated = FALSE;
4603 
4604             new_t->taxinfo->taxinfo_alloc = FALSE;
4605             new_t->taxinfo->taxdata_alloc = FALSE;
4606         }
4607 
4608                 /*new_t->handle_common_index = FALSE;*/
4609         new_t->parameters &= ~READDB_HANDLE_COMMON_INDEX;
4610 
4611                 new_t->oidlist = rdfp->oidlist;
4612 
4613         /* Copy address of shared information */
4614         new_t->shared_info = rdfp->shared_info;
4615 
4616         /* increment the reference count atomically */
4617 
4618         if(new_t->shared_info != NULL) {
4619              NlmMutexLockEx(&hdrseq_mutex);
4620              rdfp->shared_info->nthreads++;
4621              NlmMutexUnlock(hdrseq_mutex);
4622         }
4623 
4624         /* Contents_allocated also does not apply to buffer, this is
4625         determined by allocated_length. */
4626         if (new_t->allocated_length > 0)
4627             {
4628                         new_t->buffer = (UcharPtr) MemNew((new_t->allocated_length)*sizeof(Uint1));
4629             }
4630 
4631         if (head == NULL)
4632         {
4633             head = new_t;
4634         }
4635         else
4636         {
4637             last->next = new_t;
4638         }
4639 
4640         last = new_t;
4641         rdfp = rdfp->next;
4642     }
4643 
4644     return head;
4645 }
4646 
4647 ReadDBFILEPtr LIBCALL
readdb_destruct(ReadDBFILEPtr rdfp)4648 readdb_destruct (ReadDBFILEPtr rdfp)
4649 
4650 {
4651     ReadDBFILEPtr next;
4652 
4653     if (!rdfp)
4654         return NULL;
4655 
4656     if (rdfp->parameters & READDB_CONTENTS_ALLOCATED) {
4657         rdfp = ReadDBCloseMHdrAndSeqFiles(rdfp);
4658         taxonomyDbLoaded = FALSE;
4659     }
4660     rdfp = ReadDBFreeSharedInfo(rdfp);
4661     while (rdfp) {
4662         next = rdfp->next;
4663         rdfp = readdb_destruct_element(rdfp);
4664         rdfp = next;
4665     }
4666 
4667     return NULL;
4668 }
4669 
4670 /*
4671     Destroys a single element.
4672 */
4673 ReadDBFILEPtr LIBCALL
readdb_destruct_element(ReadDBFILEPtr rdfp)4674 readdb_destruct_element (ReadDBFILEPtr rdfp)
4675 
4676 {
4677 
4678     if (rdfp == NULL)
4679         return NULL;
4680 
4681     /* Deallocate if contents were allocated. */
4682     /*if (rdfp->contents_allocated) {*/
4683     if (rdfp->parameters & READDB_CONTENTS_ALLOCATED) {
4684         rdfp->filename = (CharPtr)MemFree(rdfp->filename);
4685         rdfp->aliasfilename = (CharPtr)MemFree(rdfp->aliasfilename);
4686         rdfp->title = (CharPtr)MemFree(rdfp->title);
4687         rdfp->date = (CharPtr)MemFree(rdfp->date);
4688         /* free array if they were allocated, ie no memmap */
4689         if (rdfp->header_index_start)
4690             rdfp->header_index_start = (Uint4Ptr)MemFree(rdfp->header_index_start);
4691         if (rdfp->sequence_index_start)
4692             rdfp->sequence_index_start = (Uint4Ptr)MemFree(rdfp->sequence_index_start);
4693         if (rdfp->ambchar_index_start)
4694             rdfp->ambchar_index_start  =(Uint4Ptr) MemFree(rdfp->ambchar_index_start);
4695         /* is it completely safe to have one rdfp->nisam_opt for all threads. */
4696         ISAMObjectFree(rdfp->nisam_opt); /* Terminating NISAM */
4697         ISAMObjectFree(rdfp->sisam_opt); /* Terminating NISAM */
4698         ISAMObjectFree(rdfp->isam_pig);  /* Terminating PIG ISAM */
4699         OIDListFree(rdfp->oidlist);
4700         rdfp->gifile = MemFree(rdfp->gifile);
4701         rdfp->gilist = Int4ListFree(rdfp->gilist);
4702 
4703     }
4704     rdfp->indexfp = NlmCloseMFILE(rdfp->indexfp);
4705     NlmMutexLockEx(&hdrseq_mutex);
4706     if (rdfp->shared_info && (rdfp->sequencefp || rdfp->headerfp)) {
4707        if (--(rdfp->shared_info->nthreads) == 0) {
4708           rdfp->shared_info->sequencefp =
4709              NlmCloseMFILE(rdfp->shared_info->sequencefp);
4710           rdfp->shared_info->headerfp =
4711              NlmCloseMFILE(rdfp->shared_info->headerfp);
4712        } else if (rdfp->shared_info->nthreads == -1) {
4713           rdfp->shared_info->nthreads = 0;
4714           rdfp->shared_info = NULL;
4715        }
4716     }
4717     NlmMutexUnlock(hdrseq_mutex);
4718     rdfp->shared_info = NULL;
4719     rdfp->sequencefp = NlmCloseMFILE(rdfp->sequencefp);
4720     rdfp->headerfp = NlmCloseMFILE(rdfp->headerfp);
4721 
4722     RDBTaxInfoClose(rdfp->taxinfo);  /* Closing taxonomy names database */
4723 
4724     if (rdfp->allocated_length > 0) {
4725         rdfp->buffer = (UcharPtr)MemFree(rdfp->buffer);
4726     }
4727 
4728     if (rdfp->blast_deflinep != NULL)
4729         rdfp->blast_deflinep = BlastDefLineSetFree(rdfp->blast_deflinep);
4730 
4731     /* destruct common index only if it is permited to do it for this thread */
4732 
4733     if (rdfp->cih && /*rdfp->handle_common_index*/
4734         (rdfp->parameters & READDB_HANDLE_COMMON_INDEX))
4735        CommonIndexDestruct(rdfp->cih);
4736 
4737     rdfp = (ReadDBFILEPtr) MemFree(rdfp);
4738 
4739     return NULL;
4740 }
4741 
4742 /*
4743     Goes through a chain of ReadDBfILEPtr's, looking for the one
4744     that contains the specified ordinal ID.
4745 */
4746 
4747 static ReadDBFILEPtr
readdb_get_link(ReadDBFILEPtr rdfp,Int4 ordinal_id)4748 readdb_get_link(ReadDBFILEPtr rdfp, Int4 ordinal_id)
4749 
4750 {
4751    ReadDBFILEPtr last, last_used, rdfp_var;
4752    Boolean loaded_new = FALSE;
4753 
4754    last_used = last = rdfp;
4755 
4756    while (rdfp) {
4757       if (rdfp->start <=ordinal_id && rdfp->stop >= ordinal_id)
4758      break;
4759       rdfp = rdfp->next;
4760    }
4761    if (! rdfp)
4762 	return 0;
4763    if (!(last->parameters & READDB_KEEP_HDR_AND_SEQ)) {
4764       while (rdfp != last) {
4765      if (last->sequencefp != NULL || last->headerfp != NULL) {
4766         if (last->shared_info) {
4767            NlmMutexLockEx(&hdrseq_mutex);
4768            if (--(last->shared_info->nthreads) == 0) {
4769           last->shared_info->sequencefp =
4770              NlmCloseMFILE(last->shared_info->sequencefp);
4771           last->shared_info->headerfp =
4772              NlmCloseMFILE(last->shared_info->headerfp);
4773            } else if (last->shared_info->nthreads < 0) {
4774                   last->sequencefp = NULL;
4775                   last->headerfp = NULL;
4776           last->shared_info->nthreads = 0;
4777                }
4778            NlmMutexUnlock(hdrseq_mutex);
4779         }
4780         last->sequencefp = NlmCloseMFILE(last->sequencefp);
4781         last->headerfp = NlmCloseMFILE(last->headerfp);
4782      }
4783      last = last->next;
4784       }
4785 
4786       rdfp_var = rdfp->next;
4787       while (rdfp_var != NULL) {
4788          if (rdfp_var->sequencefp != NULL || rdfp_var->headerfp != NULL) {
4789             if (rdfp_var->shared_info) {
4790                NlmMutexLockEx(&hdrseq_mutex);
4791                if (--(rdfp_var->shared_info->nthreads) == 0) {
4792                   rdfp_var->shared_info->sequencefp =
4793                      NlmCloseMFILE(rdfp_var->shared_info->sequencefp);
4794                   rdfp_var->shared_info->headerfp =
4795                      NlmCloseMFILE(rdfp_var->shared_info->headerfp);
4796                } else if (rdfp_var->shared_info->nthreads < 0) {
4797                   rdfp_var->sequencefp = NULL;
4798                   rdfp_var->headerfp = NULL;
4799                   rdfp_var->shared_info->nthreads = 0;
4800                }
4801                NlmMutexUnlock(hdrseq_mutex);
4802             }
4803             rdfp_var->sequencefp = NlmCloseMFILE(rdfp_var->sequencefp);
4804             rdfp_var->headerfp = NlmCloseMFILE(rdfp_var->headerfp);
4805          }
4806          rdfp_var = rdfp_var->next;
4807       }
4808    }
4809 
4810    /* Check for nthreads == 0 is needed because rdfp->sequencefp and
4811       rdfp->headerfp might be already freed by another thread, but still
4812       not NULL here. The check is done outside the mutex to avoid a huge number
4813       of mutex locks. It will be repeated once in the mutex */
4814    if ((rdfp->sequencefp==NULL && rdfp->headerfp==NULL) ||
4815        (rdfp->shared_info && rdfp->shared_info->nthreads==0)) {
4816       NlmMutexLockEx(&hdrseq_mutex);
4817       if ((rdfp->sequencefp==NULL && rdfp->headerfp==NULL) ||
4818           (rdfp->shared_info && rdfp->shared_info->nthreads==0)) {
4819 
4820          if (ReadDBOpenMHdrAndSeqFiles(rdfp) == FALSE) {
4821             ErrPostEx(SEV_ERROR, 0, 0,
4822                       "ReadDBOpenMHdrAndSeqFiles: failed to map files\n");
4823             rdfp = NULL;
4824          }
4825 		 else {
4826 			loaded_new = TRUE;
4827 		 }
4828       }
4829       NlmMutexUnlock(hdrseq_mutex);
4830    }
4831 
4832 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
4833 #ifdef  HAVE_MADVISE
4834 	if( useMadvise && rdfp != NULL ) {
4835 		EThreadPriority pri = eTP_Highest;
4836 
4837 		/* est database requires special treatment */
4838 		if( rdfp->filename && !strncmp(rdfp->filename, "est", 3) ) {
4839 			pri = eTP_Default;
4840 		}
4841 
4842 		readdb_preload_file(rdfp->indexfp, madvisePreloadBlock,
4843 								mmapAdvice, madviseSyncMode, pri);
4844 
4845 		readdb_preload_file(rdfp->sequencefp, madvisePreloadBlock,
4846 								mmapAdvice, madviseSyncMode, pri);
4847 
4848 		readdb_preload_file(rdfp->headerfp, madvisePreloadBlock,
4849 								mmapAdvice, madviseSyncMode, pri);
4850 	}
4851 #endif /* HAVE_MADVISE */
4852 #endif /* SOL || LINUX */
4853 
4854    return rdfp;
4855 }
4856 
4857 /*** This function checks whether the oid passed as 2nd argument to this
4858  * function is part of the ordinal id list, if it is, an extra check should be
4859  * done by loading the ASN.1 defline, but if this check fails, there's no need
4860  * to load the defline, as we know it is not part of this subset database.
4861  */
4862 static Int4
s_SearchOidInLocalOidList(const OIDListPtr oidlist,Uint4 oid)4863 s_SearchOidInLocalOidList(const OIDListPtr oidlist, Uint4 oid)
4864 {
4865     /* which word in the array? */
4866     Uint4 oidmask_index = oid / MASK_WORD_SIZE;
4867     /* which bit in the word? */
4868     Uint4 oidmask_bit   = 0x1 << ( (MASK_WORD_SIZE-1) - (oid % MASK_WORD_SIZE));
4869 
4870     /* No oid list? Then we need to load the defline... */
4871     if ( !oidlist ) {
4872         return 0;
4873     }
4874 
4875     /* If the OID is past the end of the mask, bail out. */
4876     if (oid > oidlist->total) return -1;
4877 
4878     /* If the bit isn't set, bail out early. */
4879     if (!(SwapUint4(oidlist->list[oidmask_index]) & oidmask_bit))
4880         return -1;
4881 
4882     return 0;
4883 }
4884 
4885 Boolean
readdb_check_oid(ReadDBFILEPtr rdfp_head,Int4 oid)4886 readdb_check_oid(ReadDBFILEPtr rdfp_head, Int4 oid)
4887 {
4888 	ReadDBFILEPtr rdfp_var = rdfp_head;
4889 
4890 	while (rdfp_var && rdfp_var->start < oid)
4891         {
4892 		if (rdfp_var->oidlist) {
4893 			if (s_SearchOidInLocalOidList(rdfp_var->oidlist, oid-rdfp_var->start) == 0)
4894                   		return TRUE;
4895                 } else {
4896                 	if (rdfp_var->start <= oid <= rdfp_var->stop)
4897                                 return TRUE;
4898                 }
4899                 rdfp_var = rdfp_var->next;
4900         }
4901         return FALSE;
4902 }
4903 
4904 /* This function verifies if a given ordinal id (or gi) belongs
4905    to a mask database, based on the membership bit stored in the
4906    BlastDefLine structure of the new ASN.1 deflines.
4907    @param oidlist OID list where the current oid is presumed to be found [in]
4908    @param oid OID as returned by the ISAM functions, i.e.: relative to a single
4909    rdfp element in the linked list [in]
4910    @param rdfp_head Head of the linked list of ReadDBFILE structures [in]
4911    @param oid_offset offset to be added to the oid so that it can be searched
4912    from rdfp_head [in]
4913    @param gi gi to be found [in]
4914    Note: If gi is -1, then only the oid will be verified to belong
4915    to the mask database. This will matter only on non-redundant
4916    databases, where there can be many gi's associated with the same
4917    oid */
OID_GI_BelongsToMaskDB(OIDListPtr oidlist,Int4 oid,ReadDBFILEPtr rdfp_head,Int4 oid_offset,Int4 gi)4918 static Boolean OID_GI_BelongsToMaskDB(OIDListPtr oidlist,
4919                                       Int4 oid,
4920                                       ReadDBFILEPtr rdfp_head,
4921                                       Int4 oid_offset,
4922                                       Int4 gi)
4923 {
4924     BlastDefLinePtr bdp = NULL, bdp_tmp = NULL;
4925     SeqIdPtr seqid_gi = NULL;
4926     Boolean retval = FALSE;
4927 
4928     /*
4929      * For performance reasons, check to see if the OID corresponding
4930      * to the GI in the GI list exists in the oid mask.
4931      */
4932     if (s_SearchOidInLocalOidList(oidlist, oid) != 0) {
4933         return FALSE;
4934     }
4935 
4936     /*
4937      * Otherwise, load the GI's defline to verify it belongs to
4938      * the subset database, since multiple GIs may resolve
4939      * to a single OID.
4940      */
4941 
4942     if ((bdp = FDReadDeflineAsn(rdfp_head, oid+oid_offset)) != NULL &&
4943         gi != -1) {
4944 
4945         ValNodeAddInt(&seqid_gi, SEQID_GI, gi);
4946 
4947         for (bdp_tmp = bdp; bdp_tmp; bdp_tmp = bdp_tmp->next) {
4948             /* FIXME: should do Seq-id comparison to avoid missing accessions
4949              * and not depend on gi values */
4950             if (SeqIdIn(bdp_tmp->seqid, seqid_gi)) {
4951                 retval = TRUE;
4952                 break;
4953             }
4954         }
4955         bdp = BlastDefLineSetFree(bdp);
4956         seqid_gi = SeqIdFree(seqid_gi);
4957     }
4958 
4959     return retval;
4960 }
4961 
4962 
4963 /*
4964   Returnes Int4 sequence_number by gi using NISAM indexes:
4965 
4966   ReadDBFILEPtr rdfp: the main ReadDB reference,
4967   Int4 gi - input gi number to find
4968   Int4 sequence_number: which number is this sequence,
4969   Returned 0 indicates, that gi was found
4970   Returned -1 indicates, that gi was not found
4971   Returned negative value mean fault of NISAM library
4972 */
4973 
4974 Int4 LIBCALL
readdb_gi2seq(ReadDBFILEPtr rdfp,Int4 gi,Int4Ptr start)4975 readdb_gi2seq(ReadDBFILEPtr rdfp, Int4 gi, Int4Ptr start)
4976 {
4977 
4978     Boolean    thereis_unknown_database = FALSE;
4979     ReadDBFILEPtr    rdfp_head = rdfp;
4980 
4981     if (start)
4982     *start = 0;
4983 
4984     while(rdfp) {
4985     if (!rdfp->filebit) {
4986         thereis_unknown_database = TRUE;
4987         break;
4988     }
4989     rdfp = rdfp->next;
4990     }
4991 
4992     rdfp = rdfp_head;
4993 
4994     if (thereis_unknown_database || (!isCommonIndex)) {
4995     ISAMErrorCode error;
4996     Uint4 Value;
4997 
4998     while (rdfp)
4999     {
5000         if(rdfp->nisam_opt == NULL) {
5001             rdfp = rdfp->next;
5002             continue;
5003         }
5004 
5005         /* Resolve GI to OID. */
5006         if((error = NISAMSearch(rdfp->nisam_opt, gi,
5007             &Value, NULL)) < 0) {
5008         ErrPostEx(SEV_WARNING, 0, 0, "Failed to initialize search. "
5009             "ISAM Error code is %d\n", error);
5010         return error;
5011         }
5012 
5013         if(error != ISAMNotFound) {
5014         if (start)
5015             *start = rdfp->start;
5016 
5017         /* Before returning, make sure that this gi belongs to
5018          * the subset (mask) database, if we are dealing with one */
5019         if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5020 
5021             if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value, rdfp_head,
5022                                         rdfp->start, gi))
5023                 return -1;
5024         }
5025 
5026 
5027         return (Int4) (Value+rdfp->start);
5028         }
5029 
5030         rdfp = rdfp->next;
5031     }
5032     return -1;
5033     } else {
5034     Int4        retval = 0;
5035     Int4        mask = 0, alias_mask = 0;
5036     CommonIndexHeadPtr    cih = rdfp->cih;
5037     Int2        dbid=0, alias_dbid=0;
5038 
5039     /* create common mask for all databases */
5040     while (rdfp) {
5041         if (rdfp->aliasfilebit) {
5042         alias_mask |= (0x1 << rdfp->aliasfilebit);
5043         };
5044         mask |= (0x1 << rdfp->filebit);
5045         rdfp = rdfp->next;
5046     }
5047 
5048     /* get OID and database id (dbid) of this OID */
5049     if (cih)
5050         retval = GI2OID(cih, gi, mask, alias_mask, &dbid, &alias_dbid, rdfp_head);
5051 
5052     if (retval >= 0) {
5053         /* find correct rdfp in the list */
5054         rdfp = rdfp_head;
5055         while (rdfp) {
5056         /* if the oid found in mask database */
5057         if (alias_mask && rdfp->aliasfilebit == alias_dbid)
5058             break;
5059         /* if the oid found in real database */
5060         if (!alias_mask && (rdfp->filebit == dbid))
5061             break;
5062                 /* if version is greater than FORMATDB_VER and we have a
5063                  * CommonIndex, rely on the membership information on
5064                  * the BlastDefLine structure */
5065                 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5066                     if (OID_GI_BelongsToMaskDB(rdfp->oidlist, retval,
5067                                                rdfp_head, rdfp->start, gi))
5068                         break;
5069                 }
5070 
5071         rdfp = rdfp->next;
5072         }
5073 
5074         if (!rdfp) {
5075         /* we did not find the gi where we were trying */
5076         return -1;
5077         }
5078 
5079         if (start)
5080         *start = rdfp->start;
5081 
5082         return retval+rdfp->start;
5083     }
5084     else
5085         return -1;
5086     }
5087 }
5088 
5089 /*
5090 Used for sparse indices.
5091 
5092 objective to get not SeqId, but Int4 gi number or CharPtr printed Seqid
5093 ISAM indexes may use or numeric search or string search.
5094 
5095 */
5096 
readdb_find_best_id(SeqIdPtr sip,Int4Ptr gi,CharPtr tmpbuf)5097 static Boolean readdb_find_best_id(SeqIdPtr sip, Int4Ptr gi, CharPtr tmpbuf)
5098 {
5099     TextSeqIdPtr tsip = NULL;
5100     ObjectIdPtr oid;
5101     PDBSeqIdPtr psip;
5102     DbtagPtr dbt;
5103     SeqIdPtr sip_tmp;
5104 
5105     if (sip == NULL)
5106         return FALSE;
5107 
5108     for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
5109         if(sip_tmp->choice == SEQID_GI) {
5110             *gi = sip_tmp->data.intvalue;
5111             break;
5112         }
5113     }
5114 
5115     if(*gi != 0) return TRUE;
5116 
5117     for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
5118 
5119         switch (sip_tmp->choice) {
5120         case SEQID_LOCAL:     /* local */
5121             oid = (ObjectIdPtr)(sip_tmp->data.ptrvalue);
5122             StringCpy(tmpbuf, oid->str);
5123             break;
5124         case SEQID_GIBBSQ:    /* gibbseq */
5125             sprintf(tmpbuf, "%ld", (long)(sip_tmp->data.intvalue));
5126             break;
5127         case SEQID_EMBL:      /* embl */
5128         case SEQID_DDBJ:      /* ddbj */
5129         case SEQID_GENBANK:   /* genbank */
5130         case SEQID_TPG:       /* Third Party Annot/Seq Genbank */
5131         case SEQID_TPE:       /* Third Party Annot/Seq EMBL */
5132         case SEQID_TPD:       /* Third Party Annot/Seq DDBJ */
5133         case SEQID_OTHER:     /* other */
5134         case SEQID_PIR:       /* pir   */
5135         case SEQID_SWISSPROT: /* swissprot */
5136         case SEQID_PRF:       /* prf   */
5137         case SEQID_GPIPE:     /* genome pipeline */
5138             tsip = (TextSeqIdPtr)(sip_tmp->data.ptrvalue);
5139             break;
5140         case SEQID_GENERAL:   /* general */
5141             dbt = (DbtagPtr)(sip_tmp->data.ptrvalue);
5142             StringCpy(tmpbuf, dbt->tag->str);
5143             break;
5144         case SEQID_PDB:       /* pdb   */
5145             psip = (PDBSeqIdPtr)(sip_tmp->data.ptrvalue);
5146             StringCpy(tmpbuf, psip->mol);
5147             break;
5148         }
5149     }
5150 
5151     if(tsip != NULL) {
5152         if(tsip->accession != NULL)
5153             StringCpy(tmpbuf, tsip->accession);
5154         else
5155             StringCpy(tmpbuf, tsip->name);
5156     }
5157 
5158     return TRUE;
5159 }
5160 
5161 #define READDB_TMPBUFF_SIZE 81
5162 /*
5163   Returnes Int4 sequence_number by SeqIdPtr using SISAM indexes:
5164 
5165   ReadDBFILEPtr rdfp: the main ReadDB reference,
5166   SeqIdPtr sip - input SeqIdPtr to find
5167   Int4 sequence_number: which number is this sequence,
5168   Returned 0 indicates, that gi was found
5169   Returned -1 indicates, that gi was not found
5170   Returned negative value mean fault of NISAM library
5171 */
5172 Int4 LIBCALL
readdb_seqid2fasta(ReadDBFILEPtr rdfp,SeqIdPtr sip)5173 readdb_seqid2fasta(ReadDBFILEPtr rdfp, SeqIdPtr sip)
5174 {
5175     ISAMErrorCode error;
5176     Int4 Value;
5177     CharPtr key_out = NULL, data = NULL;
5178     Uint4 index;
5179     Int4 gi = 0;
5180     CharPtr chptr = NULL;
5181     SeqIdPtr bestid;
5182     TextSeqIdPtr tsip = NULL;
5183 
5184     Char tmpbuff[READDB_TMPBUFF_SIZE];
5185     CharPtr seqid_buff_ptr = tmpbuff;
5186 
5187     if(rdfp->sisam_opt == NULL || sip == NULL)
5188         return -1;
5189 
5190     /* Use a gi if present to do a numerical lokup. */
5191     bestid = SeqIdFindBest(sip, SEQID_GI);
5192     if (bestid && bestid->choice == SEQID_GI)
5193     {
5194         return readdb_gi2seq(rdfp, bestid->data.intvalue, NULL);
5195     }
5196 
5197     while (rdfp)
5198     {
5199         if (rdfp->gifile) {
5200            rdfp = rdfp->next;
5201            continue;
5202         }
5203         if((error = ISAMGetIdxOption(rdfp->sisam_opt, &rdfp->sparse_idx)) < 0) {
5204             ErrPostEx(SEV_WARNING, 0, 0, "Failed to access string index "
5205                       "ISAM Error code is %d\n", error);
5206                 return -1;
5207         }
5208 
5209         if(rdfp->sparse_idx) {
5210             readdb_find_best_id(sip, &gi, seqid_buff_ptr);
5211             if(gi != 0) {
5212                     return readdb_gi2seq(rdfp, gi, NULL);
5213             }
5214         } else {
5215             Int4 i;
5216 
5217             switch (sip->choice) {
5218             case SEQID_EMBL:      /* embl */
5219             case SEQID_DDBJ:      /* ddbj */
5220             case SEQID_GENBANK:   /* genbank */
5221             case SEQID_TPG:       /* Third Party Annot/Seq Genbank */
5222             case SEQID_TPE:       /* Third Party Annot/Seq EMBL */
5223             case SEQID_TPD:       /* Third Party Annot/Seq DDBJ */
5224             case SEQID_OTHER:     /* other */
5225             case SEQID_PIR:       /* pir   */
5226             case SEQID_SWISSPROT: /* swissprot */
5227             case SEQID_PRF:       /* prf   */
5228             case SEQID_GPIPE:     /* genome pipeline */
5229                 tsip = (TextSeqIdPtr)(sip->data.ptrvalue);
5230                 break;
5231             default:
5232                 break;
5233             }
5234 
5235             if(tsip != NULL) {
5236                 Int4 dummy_gi = 0; /* Not used, should have been handled above. */
5237                 GetAccessionVersionFromSeqId(sip, &gi, &seqid_buff_ptr, TRUE);
5238             } else {
5239                 if((SeqIdWrite(sip, seqid_buff_ptr,
5240                        PRINTID_FASTA_SHORT, READDB_TMPBUFF_SIZE-1)) == NULL)
5241                 return -1;
5242             }
5243 
5244             for(i = 0; seqid_buff_ptr[i] != '\0'; i++)
5245                 seqid_buff_ptr[i] = TO_LOWER(seqid_buff_ptr[i]);
5246         }
5247 
5248         NlmMutexLockEx(&isamsearch_mutex);
5249         if((error = SISAMSearch(rdfp->sisam_opt, seqid_buff_ptr, 0, &key_out,
5250                                 &data, &index)) < 0) {
5251             ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5252                       "ISAM Error code is %d\n", error);
5253             return error;
5254         }
5255         NlmMutexUnlock(isamsearch_mutex);
5256 
5257         if (tmpbuff != seqid_buff_ptr)
5258           MemFree(seqid_buff_ptr); /* seqid_buff_ptr allocated in GetAccessionVersionFromSeqId. */
5259 
5260         MemFree(key_out); /* We need no this for now */
5261 
5262         if(data && error != ISAMNotFound) {
5263             Value = atol(data);
5264             MemFree(data);
5265             return Value + rdfp->start;
5266         }
5267         rdfp = rdfp->next;
5268     }
5269     return -1;
5270 }
5271 
5272 /** Maximum number of volumes in a ReadDBFILEPtr linked list after which we
5273  * start munmap'ing the ISAM files to avoid running out of memory */
5274 static const size_t kSISAM_MaxNumVolumes = 10;
5275 
5276 /*
5277   Returns array of sequence numbers by accession using SISAM indexes:
5278 
5279   ReadDBFILEPtr rdfp: the main ReadDB reference,
5280   CharPtr string - input accession to find
5281   Int4Ptr PNTR ids - array of sequence numbers
5282   Int4Ptr count - number of hits
5283   Returned  non-negative value indicates, that hits were found
5284   Returned -1 indicates, that hit(s) were not found
5285   Returned negative value mean fault of ISAM library
5286 */
5287 
5288 Int4 LIBCALL
readdb_acc2fastaEx(ReadDBFILEPtr rdfp,CharPtr string,Int4Ptr PNTR ids,Int4Ptr count)5289 readdb_acc2fastaEx(ReadDBFILEPtr rdfp, CharPtr string, Int4Ptr PNTR ids,
5290                    Int4Ptr count)
5291 {
5292     ISAMErrorCode error;
5293     size_t vol_counter = 0;
5294     SeqIdPtr sip;
5295 
5296     if(rdfp->sisam_opt == NULL || string == NULL)
5297         return -1;
5298 
5299     if (StringChr(string, '|') != NULL) {
5300 
5301         if((sip = SeqIdParse(string)) != NULL) {
5302             *ids = MemNew(sizeof(Int4));
5303             **ids = readdb_seqid2fasta(rdfp, sip);
5304             SeqIdFree(sip);
5305 
5306             if(**ids >= 0) {
5307                 *count = 1;
5308                 return 1;
5309             } else {
5310                 return -1;
5311             }
5312         }
5313     }
5314 
5315     for (vol_counter = 0; rdfp; rdfp = rdfp->next, vol_counter++) {
5316         error =  SISAMFindAllData(rdfp->sisam_opt, string, ids, count);
5317 
5318         if(error != ISAMNotFound) {
5319             Int4 index=0;
5320             while (index < *count)
5321             {
5322                 (*ids)[index] += rdfp->start;
5323                 index++;
5324             }
5325             return 1;
5326         }
5327         if (vol_counter >= kSISAM_MaxNumVolumes) {
5328             ISAMUninitSearch(rdfp->sisam_opt);
5329         }
5330     }
5331     return -1;
5332 }
5333 /*
5334   Returns the first (*) Int4 sequence_number found by accession/locus using
5335   SISAM indexes:
5336 
5337   ReadDBFILEPtr rdfp: the main ReadDB reference,
5338   CharPtr string - input accession to find
5339   Int4 sequence_number: which number is this sequence,
5340   Returned 0 indicates, that gi was found
5341   Returned -1 indicates, that gi was not found
5342   Returned negative value mean fault of ISAM library
5343 
5344   (*): This means that in multi-volume databases (which potentially join
5345   databases which might contain the same sequence), only the first match will be
5346   returned.
5347 */
5348 
5349 Int4 LIBCALL
readdb_acc2fasta(ReadDBFILEPtr rdfp,CharPtr string)5350 readdb_acc2fasta(ReadDBFILEPtr rdfp, CharPtr string)
5351 {
5352     ISAMErrorCode error;
5353     ReadDBFILEPtr rdfp_head = rdfp;
5354     Int4 Value;
5355     CharPtr key_out = NULL, data = NULL;
5356     Uint4 index;
5357     Char tmp_str[64];
5358     size_t vol_counter = 0;
5359     SeqIdPtr sip;
5360 
5361     if(rdfp->sisam_opt == NULL || string == NULL)
5362         return -1;
5363 
5364     if (StringChr(string, '|') != NULL)
5365     {
5366         sip = SeqIdParse(string);
5367         Value = readdb_seqid2fasta(rdfp, sip);
5368         SeqIdFree(sip);
5369         return Value;
5370     }
5371 
5372     for (vol_counter = 0; rdfp; rdfp = rdfp->next, vol_counter++)
5373     {
5374         if((error = ISAMGetIdxOption(rdfp->sisam_opt, &rdfp->sparse_idx)) < 0) {
5375             ErrPostEx(SEV_WARNING, 0, 0, "Failed to access string index "
5376                       "ISAM Error code is %d\n", error);
5377             return -1;
5378         }
5379 
5380         if(rdfp->sparse_idx) {
5381 
5382             Int4 seq_num, count;
5383             Int4Ptr ids;
5384 
5385             readdb_acc2fastaEx(rdfp, string, &ids, &count);
5386             if(count > 0) {
5387                 seq_num = *ids;
5388                 MemFree(ids);
5389                 return seq_num;
5390             }
5391         }
5392         else
5393         {
5394             /* Trying accession first */
5395 
5396             sprintf(tmp_str, "gb|%s|", string);
5397 
5398             if((error = SISAMSearch(rdfp->sisam_opt, tmp_str, 0, &key_out, &data, &index)) < 0) {
5399                     ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index " "ISAM Error code is %d\n", error);
5400                     return error;
5401             }
5402 
5403             MemFree(key_out); /* We need no this for now */
5404 
5405             if(error != ISAMNotFound) {
5406                 Value = atol(data);
5407                 MemFree(data);
5408                 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5409                     if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5410                                                 rdfp_head, rdfp->start, -1))
5411                         return -1;
5412                 }
5413 
5414                 return Value + rdfp->start;
5415             }
5416 
5417             /* Now trying LOCUS */
5418 
5419             sprintf(tmp_str, "gb||%s", string);
5420 
5421             if((error = SISAMSearch(rdfp->sisam_opt, tmp_str, 0, &key_out,
5422                                 &data, &index)) < 0) {
5423                 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5424                       "ISAM Error code is %d\n", error);
5425                 return error;
5426             }
5427 
5428             MemFree(key_out); /* We need no this for now */
5429 
5430             if(error != ISAMNotFound) {
5431                 Value = atol(data);
5432                 MemFree(data);
5433                 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5434                     if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5435                                                 rdfp_head, rdfp->start, -1))
5436                         return -1;
5437                 }
5438 
5439                 return Value + rdfp->start;
5440             }
5441 
5442             /* Now trying string */
5443 
5444 
5445             if((error = SISAMSearch(rdfp->sisam_opt, string, 0, &key_out,
5446                                 &data, &index)) < 0) {
5447                 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5448                       "ISAM Error code is %d\n", error);
5449                 return error;
5450             }
5451 
5452             MemFree(key_out); /* We need no this for now */
5453 
5454             if(error != ISAMNotFound) {
5455                 Value = atol(data);
5456                 MemFree(data);
5457                 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5458                     if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5459                                                 rdfp_head, rdfp->start, -1))
5460                         return -1;
5461                 }
5462 
5463                 return Value + rdfp->start;
5464             } else {
5465                     MemFree(data);
5466             }
5467         }
5468         if (vol_counter >= kSISAM_MaxNumVolumes) {
5469             ISAMUninitSearch(rdfp->sisam_opt);
5470         }
5471     }
5472 
5473     return -1;
5474 }
5475 
5476 /*
5477    This function returnes "Seq-descr" as ValNode. This valnode then may be
5478    simply linked to set of descriptors in Bioseq: bsp->descr
5479 */
readdb_get_asn1_defline(ReadDBFILEPtr rdfp,Int4 sequence_number)5480 ValNodePtr readdb_get_asn1_defline(ReadDBFILEPtr rdfp, Int4 sequence_number)
5481 {
5482     ValNodePtr vnp = NULL;
5483     Int4 size;
5484     ByteStorePtr bsp;
5485     ByteStorePtr PNTR bspp;
5486     CharPtr buffer;
5487     UserFieldPtr ufp;
5488     UserObjectPtr uop;
5489     ObjectIdPtr oidp;
5490 
5491     /* If we're dealing with a subset (mask) database, encode the
5492      * proper defline, which is dictated by looking at the
5493      * membership bits of the BlastDefLinePtr
5494      * If readdb_encode_subset_asn1_defline fails, encode the
5495      * BlastDefLinePtr as found in the blast database [pn]hr files */
5496     if (rdfp->oidlist && rdfp->membership_bit != 0) {
5497         vnp = readdb_encode_subset_asn1_defline(rdfp, sequence_number);
5498         if (vnp != NULL)
5499             return vnp;
5500     }
5501 
5502     size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
5503         Nlm_SwapUint4(rdfp->header_index[sequence_number]);
5504 
5505     bsp = BSNew(size+1);
5506 
5507     if (rdfp->headerfp->mfile_true == TRUE) {
5508         NlmSeekInMFILE(rdfp->headerfp,
5509                        Nlm_SwapUint4(rdfp->header_index[sequence_number]),
5510                        SEEK_SET);
5511 
5512         BSWrite(bsp, rdfp->headerfp->mmp, size);
5513         BSSeek(bsp, 0, SEEK_SET);
5514     } else {
5515         NlmSeekInMFILE(rdfp->headerfp,
5516                        Nlm_SwapUint4(rdfp->header_index[sequence_number]),
5517                        SEEK_SET);
5518 
5519         buffer = MemNew(size+1);
5520         FileRead(buffer, size, 1, rdfp->headerfp->fp);
5521         BSWrite(bsp, buffer, size);
5522         MemFree(buffer);
5523     }
5524 
5525     /* Creating user field */
5526     ufp = UserFieldNew();
5527     ufp->num = 1;
5528     bspp = (ByteStorePtr PNTR) MemNew((ufp->num)*sizeof(ByteStorePtr));
5529     bspp[0] = bsp;
5530     ufp->data.ptrvalue = bspp;
5531 
5532     /* And object Id type for this object */
5533     oidp = ObjectIdNew();
5534     oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5535     ufp->label = oidp;
5536 
5537     /* SEQUENCE OF OCTET STRING ,   ptrvalue = ByteStorePtr PNTR */
5538     ufp->choice = 10;
5539 
5540     /* Creating user object */
5541     uop = UserObjectNew();
5542     uop->data = ufp;
5543 
5544     /* Create a new ObjectId for the UserObject */
5545     oidp = ObjectIdNew();
5546     oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5547     uop->type = oidp;
5548 
5549     /* Finaly descriptor is created as ... */
5550     vnp = NULL;
5551     vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5552 
5553     return vnp;
5554 }
5555 
readdb_encode_subset_asn1_defline(ReadDBFILEPtr rdfp,Int4 sequence_number)5556 ValNodePtr readdb_encode_subset_asn1_defline(ReadDBFILEPtr rdfp,
5557         Int4 sequence_number)
5558 {
5559     ValNodePtr vnp;
5560     Int4 size = 0;
5561     ByteStorePtr bsp;
5562     ByteStorePtr PNTR bspp;
5563     BytePtr buffer;
5564     UserFieldPtr ufp;
5565     UserObjectPtr uop;
5566     ObjectIdPtr oidp;
5567     BlastDefLinePtr bdsp = NULL;
5568     AsnIoMemPtr aimp;
5569 
5570     if ((bdsp = FDReadDeflineAsn(rdfp, sequence_number)) == NULL)
5571         return NULL;
5572 
5573     size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
5574            Nlm_SwapUint4(rdfp->header_index[sequence_number]);
5575     bsp = BSNew(size+1);
5576     buffer = MemNew(size+1);
5577 
5578     aimp = AsnIoMemOpen("wb",buffer,size+1);
5579     BlastDefLineSetAsnWrite(bdsp,aimp->aip, NULL);
5580     AsnIoFlush(aimp->aip);
5581     BSWrite(bsp,buffer,size+1);
5582 
5583     bdsp = BlastDefLineSetFree(bdsp);
5584     buffer = MemFree(buffer);
5585     aimp = AsnIoMemClose(aimp);
5586 
5587     /* Creating user field */
5588     ufp = UserFieldNew();
5589     ufp->num = 1;
5590     bspp = (ByteStorePtr PNTR) MemNew((ufp->num)*sizeof(ByteStorePtr));
5591     bspp[0] = bsp;
5592     ufp->data.ptrvalue = bspp;
5593 
5594     /* And object Id type for this object */
5595     oidp = ObjectIdNew();
5596     oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5597     ufp->label = oidp;
5598 
5599     /* SEQUENCE OF OCTET STRING ,   ptrvalue = ByteStorePtr PNTR */
5600     ufp->choice = 10;
5601 
5602     /* Creating user object */
5603     uop = UserObjectNew();
5604     uop->data = ufp;
5605 
5606     /* Create a new ObjectId for this UserObject */
5607     oidp = ObjectIdNew();
5608     oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5609     uop->type = oidp;
5610 
5611     /* Finaly descriptor is created as ... */
5612     vnp = NULL;
5613     vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5614 
5615     return vnp;
5616 }
5617 
5618 /*
5619    This function returnes "Seq-descr" as ValNode. This valnode then may be
5620    simply linked to set of descriptors in Bioseq: bsp->descr
5621 */
5622 
readdb_get_taxonomy_names(ReadDBFILEPtr rdfp,Int4 sequence_number)5623 ValNodePtr readdb_get_taxonomy_names(ReadDBFILEPtr rdfp, Int4 sequence_number)
5624 {
5625     BlastDefLinePtr bdp, tbdp;
5626     RDBTaxNamesPtr  tnames;
5627     UserFieldPtr ufp, ufp_last;
5628     UserObjectPtr uop;
5629     ObjectIdPtr oidp;
5630     CharPtr PNTR cpp;
5631     ValNodePtr vnp;
5632 
5633     if(rdfp == NULL || rdfp->taxinfo == NULL)
5634         return NULL;
5635 
5636     if((bdp =  FDReadDeflineAsn(rdfp, sequence_number)) == NULL)
5637         return NULL;
5638 
5639     /* Creating user object */
5640     uop = UserObjectNew();
5641 
5642     /* And object Id type for this object */
5643     oidp = ObjectIdNew();
5644     oidp->str = StringSave(TAX_DATA_OBJ_LABEL);
5645     uop->type = oidp;
5646 
5647     for(tbdp = bdp; tbdp != NULL; tbdp = tbdp->next) {
5648 
5649         /* Make sure we have the taxonomy information for this
5650          * tbdp->taxid */
5651         if ((tnames = RDBGetTaxNames(rdfp->taxinfo, tbdp->taxid)) == NULL )
5652             continue;
5653 
5654         /* Creating user field */
5655         ufp = UserFieldNew();
5656         ufp->choice = 7; /* strs */
5657 
5658         /* Label of every User-field will contain taxonomy Id and
5659            taxonomy names will be located in Visible Strings in
5660            pre-defined sequence */
5661 
5662         oidp = ObjectIdNew();
5663         oidp->id = tbdp->taxid;
5664         ufp->label = oidp;
5665 
5666         ufp->num = NUM_TAX_NAMES;
5667         cpp = MemNew(sizeof(CharPtr)*NUM_TAX_NAMES);
5668 
5669         cpp[SCI_NAME_POS] = StringSave(tnames->sci_name);
5670         cpp[COMMON_NAME_POS] = StringSave(tnames->common_name);
5671         cpp[BLAST_NAME_POS] = StringSave(tnames->blast_name);
5672         cpp[S_KING_POS] = StringSave(tnames->s_king);
5673 
5674         ufp->data.ptrvalue = cpp;
5675 
5676         if(uop->data == NULL)
5677             uop->data = ufp;
5678         else
5679             ufp_last->next = ufp;
5680 
5681         ufp_last = ufp;
5682         RDBTaxNamesFree(tnames);
5683     }
5684 
5685     /* Finaly descriptor is created as ... */
5686     vnp = NULL;
5687     if (uop->data != NULL)
5688         vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5689     else {
5690         UserObjectFree(uop);
5691     }
5692     BlastDefLineSetFree(bdp);
5693 
5694     return vnp;
5695 }
5696 
5697 /*
5698     Obtains a BioseqPtr from readdb:
5699 
5700     ReadDBFILEPtr rdfp: the main ReadDB reference,
5701     Int4 sequence_number: which number is this sequence,
5702 */
5703 BioseqPtr LIBCALL
readdb_get_bioseq(ReadDBFILEPtr rdfp,Int4 sequence_number)5704 readdb_get_bioseq(ReadDBFILEPtr rdfp, Int4 sequence_number)
5705 {
5706     return readdb_get_bioseq_ex(rdfp, sequence_number, TRUE, FALSE);
5707 }
5708 
5709 BioseqPtr LIBCALL
readdb_get_bioseq_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Boolean use_objmgr,Boolean insert_ctrlA)5710 readdb_get_bioseq_ex(ReadDBFILEPtr rdfp, Int4 sequence_number,
5711                      Boolean use_objmgr, Boolean insert_ctrlA)
5712 
5713 {
5714     BioseqPtr bsp;
5715     ByteStorePtr byte_store;
5716     CharPtr defline, new_defline = NULL, defline_ptr, new_defline_ptr;
5717     Int2 byte_value;
5718     Int4 length, compressed_length, count;
5719     SeqIdPtr sip;
5720     Uint1Ptr buffer, buffer_4na;
5721     Uint4Ptr ambchar = NULL;
5722     Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
5723 
5724     if ((rdfp = readdb_get_link(rdfp, sequence_number)) == NULL)
5725         return NULL;
5726 
5727     defline = NULL;
5728 
5729     readdb_get_descriptor(rdfp, sequence_number, &sip, &defline);
5730 
5731     if (insert_ctrlA == FALSE)
5732     {
5733             count = 0;
5734             new_defline = NULL;
5735             if (defline != NULL) {
5736                 defline_ptr = defline;
5737 
5738                 while (*defline_ptr != NULLB) {
5739                     count++;
5740                     if (*defline_ptr == READDB_DEF_SEPARATOR) {
5741                     /* Two spaces for every ctrl-A as it will be replaced by 2. */
5742                         count++;
5743                     }
5744                     defline_ptr++;
5745                 }
5746 
5747                    if (count != 0) {
5748                         new_defline = (CharPtr)Nlm_Malloc((count+1)*sizeof(Char));
5749                         new_defline_ptr = new_defline;
5750                         defline_ptr = defline;
5751                         while (*defline_ptr != NULLB) {
5752                         if (*defline_ptr == READDB_DEF_SEPARATOR) {
5753                                 *new_defline_ptr = ' ';
5754                                 new_defline_ptr++;
5755                                 *new_defline_ptr = '>';
5756                                 new_defline_ptr++;
5757                            } else {
5758                                 *new_defline_ptr = *defline_ptr;
5759                                 new_defline_ptr++;
5760                         }
5761                     defline_ptr++;
5762                         }
5763                         *new_defline_ptr = NULLB;
5764                         defline = (CharPtr)MemFree(defline);
5765                     }
5766             }
5767         }
5768         else
5769         new_defline = defline;
5770 
5771     if((length = readdb_get_sequence(rdfp, sequence_number, &buffer)) < 1)
5772         return NULL;
5773 
5774     if(use_objmgr) {
5775         if((bsp = BioseqNew()) == NULL)
5776             return NULL;
5777     } else {
5778         bsp = (BioseqPtr)MemNew(sizeof(Bioseq));
5779         if (bsp == NULL) return bsp;
5780         bsp->length = -1;    /* not set */
5781         bsp->topology = 1;   /* DEFAULT = linear */
5782     }
5783 
5784     byte_store = BSNew(0);
5785     if (is_prot) {
5786         bsp->mol = Seq_mol_aa;
5787         bsp->seq_data_type = Seq_code_ncbistdaa;
5788         BSWrite(byte_store, (VoidPtr) buffer, length);
5789     } else {
5790         /* Nucleotide sequence require more attention */
5791         if(!readdb_get_ambchar(rdfp, sequence_number, &ambchar)) {
5792             ErrPostEx(SEV_WARNING, 0, 0,
5793                       "Failure to read ambiguity information");
5794             return NULL;
5795         }
5796     /* Convert sequence if ambiguities. */
5797         if(ambchar != NULL) {/* are there any ambiguity ? */
5798         compressed_length = (length+3)/4; /* enough bytes for all bases. */
5799             buffer_4na = Nlm_Malloc((2*compressed_length)*sizeof(Uint1));
5800             MapNa2ByteToNa4String(buffer, (Uint2Ptr) buffer_4na, length/4);
5801         if (length%4 != 0)
5802         {
5803             Uint1 bytes[2];
5804                     bytes[0] = *(buffer+length/4);
5805                     bytes[0] &= 252;
5806                 MapNa2ByteToNa4String(bytes, (Uint2Ptr) (buffer_4na+2*(compressed_length-1)), 1);
5807         }
5808         RebuildDNA_4na(buffer_4na, compressed_length*2, ambchar);
5809                 BSWrite(byte_store, (VoidPtr) buffer_4na, compressed_length*2);
5810         MemFree(buffer_4na);
5811                 MemFree(ambchar);
5812                 bsp->seq_data_type = Seq_code_ncbi4na;
5813         }
5814     else
5815     {
5816             BSWrite(byte_store, (VoidPtr) buffer, length/4);
5817             if (length%4 != 0) {
5818                     byte_value = *(buffer+length/4);
5819                     byte_value &= 252;
5820                     BSPutByte(byte_store, byte_value);
5821             }
5822                 bsp->seq_data_type = Seq_code_ncbi2na;
5823     }
5824 
5825         bsp->mol = Seq_mol_na;
5826     }
5827 
5828     bsp->seq_data = (SeqDataPtr) byte_store;
5829 
5830     bsp->length = length;
5831     bsp->id = sip;
5832     bsp->repr = Seq_repr_raw;
5833 
5834     if (new_defline != NULL)  {
5835         bsp->descr = SeqDescrAddPointer(NULL, Seq_descr_title, new_defline);
5836     }
5837 
5838     if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5839         ValNodePtr vnp, vnp_tmp;
5840 
5841         /* First we encode complete ASN.1 definition line */
5842 
5843         vnp = readdb_get_asn1_defline(rdfp, sequence_number);
5844 
5845         if(bsp->descr != NULL) {
5846             for (vnp_tmp = bsp->descr; vnp_tmp->next != NULL;
5847                  vnp_tmp = vnp_tmp->next)
5848                 continue;
5849             vnp_tmp->next = vnp;
5850             vnp_tmp = vnp;
5851         } else {
5852             bsp->descr = vnp;
5853             vnp_tmp = bsp->descr;
5854         }
5855 
5856         /* Then encoding taxonomy names information from the
5857            taxonomy names database */
5858 
5859         vnp = readdb_get_taxonomy_names(rdfp, sequence_number);
5860         vnp_tmp->next = vnp;
5861     }
5862 
5863     return bsp;
5864 }
5865 
5866 /*
5867     returns the 'filebits' associated with a certain ordinal number.
5868     This is done by going to the rdfp for that ordinal id and
5869     gathering the filebits.
5870 */
5871 Boolean LIBCALL
readdb_get_filebits(ReadDBFILEPtr rdfp,Int4 ordinal_id,Uint2Ptr filebit,Uint2Ptr aliasfilebit)5872 readdb_get_filebits (ReadDBFILEPtr rdfp, Int4 ordinal_id, Uint2Ptr filebit, Uint2Ptr aliasfilebit)
5873 
5874 {
5875         rdfp = readdb_get_link(rdfp, ordinal_id);
5876 
5877     if (rdfp == NULL)
5878         return FALSE;
5879 
5880     if (filebit)
5881         *filebit = rdfp->filebit;
5882 
5883     if (aliasfilebit)
5884         *aliasfilebit = rdfp->aliasfilebit;
5885 
5886     return TRUE;
5887 }
5888 
5889 /* The following function performs a binary search to return an ordinal id of
5890    the last sequence in the BLAST database, whose offset in the sequence index
5891    is less than the offset argument. The offset is in bases, nucleotide or
5892    protein. For the former, the actual offset in the sequence index is
5893    computed inside the function (i.e. the offset argument is divided by 4).
5894    The first_seq argument tells the function not to look at ordinal ids smaller
5895    than the argument value.
5896 */
5897 Int4 LIBCALL
readdb_get_sequence_number(ReadDBFILEPtr rdfp,Int4 first_seq,Int8 offset)5898 readdb_get_sequence_number(ReadDBFILEPtr rdfp, Int4 first_seq, Int8 offset)
5899 {
5900    Int4 m, b, e, val;
5901    Int2 compression_ratio;
5902 
5903    if (!rdfp)
5904       return -1;
5905 
5906    if (rdfp->parameters & READDB_IS_PROT)
5907       compression_ratio = 1;
5908    else
5909       compression_ratio = READDB_COMPRESSION_RATIO;
5910 
5911    while (rdfp && rdfp->totlen <= offset) {
5912       offset -= rdfp->totlen;
5913       rdfp = rdfp->next;
5914    }
5915 
5916    if (!rdfp)
5917       return -1;
5918 
5919    e = rdfp->stop;
5920    b = MAX(first_seq, rdfp->start);
5921    offset /= compression_ratio;
5922 
5923    while (b < e - 1) {
5924       m = (b + e) / 2;
5925       if ((val = Nlm_SwapUint4(rdfp->sequence_index[m])) > offset)
5926          e = m;
5927       else if (val == offset)
5928          return m;
5929       else
5930          b = m;
5931    }
5932 
5933    return b;
5934 }
5935 
5936 /*
5937     Gets the sequence number "sequence_number".  If memory-mapped
5938     files are enabled, then *buffer points to the appropriate place
5939     in the memory-mapped file.  If memory-mapped files are not enabled,
5940     then sufficient space in *buffer is allocated (if this is not already
5941     the case) and this length is stored in *buffer_length.
5942 
5943     The length of the sequence requested is the return value; for memory-
5944     mapped files this is different than *buffer_length, which is always
5945     zero.
5946 */
5947 
5948 Int4 LIBCALL
readdb_get_sequence(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint1Ptr PNTR buffer)5949 readdb_get_sequence (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer)
5950 
5951 {
5952     Uint4 diff, length, nitems=0;
5953     Uint1 remainder;
5954     Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
5955 
5956     rdfp = readdb_get_link(rdfp, sequence_number);
5957 
5958     if (rdfp == NULL || rdfp->sequencefp == NULL)
5959         return 0;
5960 
5961     if (is_prot == FALSE)
5962     {
5963         nitems = Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]) -
5964             Nlm_SwapUint4(rdfp->sequence_index[sequence_number]);
5965     }
5966     else
5967     {
5968         nitems = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
5969             Nlm_SwapUint4(rdfp->sequence_index[sequence_number]) - 1;
5970     }
5971 
5972     NlmSeekInMFILE(rdfp->sequencefp,
5973         Nlm_SwapUint4(rdfp->sequence_index[sequence_number]),
5974         SEEK_SET);
5975 
5976         length = sizeof(Uint1) * nitems;
5977     /* Use memory-mapped file, don't allocate buffer. */
5978     if (rdfp->sequencefp->mfile_true == TRUE)
5979     {
5980             diff = rdfp->sequencefp->mmp_end - rdfp->sequencefp->mmp;
5981 
5982             if (length > diff)
5983             {
5984                         nitems = diff / sizeof(Uint1);
5985                         length = nitems * sizeof(Uint1);
5986             }
5987             *buffer = rdfp->sequencefp->mmp;
5988     }
5989     else
5990     {
5991     /* No mem-mapping, allocate a buffer for the subject sequence. */
5992         if (length+2 > rdfp->allocated_length)
5993         {
5994             if (rdfp->buffer != NULL)
5995                 rdfp->buffer = (UcharPtr)MemFree(rdfp->buffer);
5996             rdfp->allocated_length = rdfp->maxlen+2;
5997             rdfp->buffer = (UcharPtr)MemNew((rdfp->allocated_length)*sizeof(Uint1));
5998         }
5999 /* For protein db's the first and last byte is the NULLB, which is a sentinel byte
6000 used by the extension functions. For nucl. db's there are no sentinel bytes. */
6001         if (is_prot)
6002         {
6003             rdfp->buffer[0] = NULLB;
6004             *buffer = rdfp->buffer+1;
6005             FileRead(*buffer, sizeof(Uint1), nitems+1, rdfp->sequencefp->fp);
6006         }
6007         else
6008         {
6009             *buffer = rdfp->buffer;
6010             FileRead(*buffer, sizeof(Uint1), nitems, rdfp->sequencefp->fp);
6011         }
6012     }
6013 
6014     /* For nucl. return "unpacked" length and get the remainder out
6015     of the last byte. */
6016     if (is_prot == FALSE)
6017     {
6018 /* The first six bits in the byte holds the "remainder" (not a multiple of 4)
6019 and the last two bits of the byte holds the size of the remainder (0-3). */
6020         remainder = *(*buffer+length-1);
6021         remainder &= 0x3;
6022         length--;
6023 /* 4 bases per byte. */
6024         length *= 4;
6025         length += remainder;
6026     }
6027 
6028     return length;
6029 }
6030 
6031 /*
6032     Gets the sequence number "sequence_number".  The sequence returned includes
6033     all ambiguity information.  THis funciton should only be used for nucleic
6034     acid sequences, for proteins use readdb_get_sequence.
6035 
6036     buffer contains the sequence and is reallocated if *buffer_length is not long enough.
6037 
6038     The length of the sequence requested is the return value.
6039     protein sequences are always returned as Seq_code_ncbistdaa,
6040     nucleotide sequences as Seq_code_ncbi4na.
6041 
6042     In case of memory allocation failure, buffer is free'd and points to NULL,
6043     buffer_length is set to 0 and -1 is returned
6044 */
6045 
6046 Int4 LIBCALL
readdb_get_sequence_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint1Ptr PNTR buffer,Int4 * buffer_length,Boolean ready)6047 readdb_get_sequence_ex (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer, Int4 *buffer_length, Boolean ready)
6048 
6049 {
6050     Int4 length; /* Uncompressed length of sequence to be fetched */
6051     Uint1Ptr readdb_buffer; /* Pointer to (read-only) data returned by readdb. */
6052 
6053     length = readdb_get_sequence(rdfp, sequence_number, &readdb_buffer);
6054 
6055     /* Check the length, make it one longer for ALIGN. */
6056     if ((length+2) > *buffer_length || *buffer == NULL)
6057     {
6058         if (*buffer)
6059             MemFree(*buffer);
6060 
6061         *buffer = Nlm_Malloc((length+2)*sizeof(Uint1));
6062         if (*buffer == NULL) {
6063             *buffer_length = 0;
6064             return -1;
6065         }
6066         *buffer_length = length+2;
6067     }
6068 
6069     /* Copy sequence into allocated buffer. */
6070     if (rdfp->parameters & READDB_IS_PROT)   /* Protein */
6071     {
6072         MemCpy((VoidPtr) *buffer, readdb_buffer, length);
6073     }
6074     else   /* Nucleotide. */
6075     {
6076         Int4 copy_length;  /*  compressed (4-to-1) length of sequence being fetched. */
6077         Uint4Ptr ambchar = NULL;  /* Used below for fetching ambiguity information. */
6078         Uint1* buffer_ptr = (*buffer);   /* Used for calls to MapNa2ByteToNa4String and  RebuildDNA_4na */
6079 
6080         copy_length = length/4;
6081         MapNa2ByteToNa4String(readdb_buffer, (Uint2*) buffer_ptr, copy_length);
6082 
6083         if (length%4 != 0)
6084         {   /* Sets letters in last (incomplete) byte. */
6085                 Uint1 byte_value = *(readdb_buffer+length/4);
6086                 byte_value &= 252;
6087                 MapNa2ByteToNa4String(&byte_value, (Uint2*) (buffer_ptr+(2*copy_length)), 1);
6088                 copy_length++;
6089         }
6090 
6091         if(!readdb_get_ambchar(rdfp, sequence_number, &ambchar)) {
6092                 ErrPostEx(SEV_WARNING, 0, 0,
6093                           "Failure to read ambiguity information");
6094                 return -1;
6095         }
6096         /* Convert sequence if ambiguities. */
6097         if(ambchar != NULL) /* are there any ambiguity ? */
6098         {
6099                 Boolean status = RebuildDNA_4na(buffer_ptr, copy_length*2, ambchar);
6100                 ambchar = MemFree(ambchar);
6101                 if (status == FALSE)
6102                 {
6103                    ErrPostEx(SEV_WARNING, 0, 0,
6104                           "Failure to rebuild DNA in readdb_get_seqeuence_ex");
6105                    return -1;
6106                 }
6107 
6108         }
6109 
6110         if (ready)
6111         {
6112             Int4 index, index2;   /* Loop indices. */
6113             Uint1* private_buffer = (*buffer) + 1;
6114             index = length/2 - 2;
6115             index2 = length-1;
6116             if (length%2 != 0)
6117             {
6118                 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index+1] >> 4)];
6119                 index2--;
6120             }
6121             while (index2 > 0)
6122             {
6123                 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index] & 15)];
6124                 index2--;
6125                 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index] >> 4)];
6126                 index2--; index--;
6127             }
6128             private_buffer[length] = ncbi4na_to_blastna[0];
6129             (*buffer)[0] = ncbi4na_to_blastna[0];
6130         }
6131         else
6132         {
6133             Int4 index, index2;   /* Loop indices. */
6134             Uint1* private_buffer = (*buffer);
6135             index = length/2 - 1;
6136             index2 = length-1;
6137             if (length%2 != 0)
6138             {
6139                 private_buffer[index2] = (private_buffer[index+1] >> 4);
6140                 index2--;
6141             }
6142             while (index2 > 0)
6143             {
6144                 private_buffer[index2] = (private_buffer[index] & 15);
6145                 index2--;
6146                 private_buffer[index2] = (private_buffer[index] >> 4);
6147                 index2--; index--;
6148             }
6149         }
6150     }
6151 
6152     return length;
6153 }
6154 
6155 Int4 LIBCALL
readdb_get_sequence_length_approx(ReadDBFILEPtr rdfp,Int4 sequence_number)6156 readdb_get_sequence_length_approx(ReadDBFILEPtr rdfp, Int4 sequence_number)
6157 {
6158     Uint4 length = 0;
6159 
6160     rdfp = readdb_get_link(rdfp, sequence_number);
6161 
6162     if (rdfp == NULL)
6163         return 0;
6164 
6165     if (readdb_is_prot(rdfp) == FALSE)
6166     {
6167         length = Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]) -
6168                  Nlm_SwapUint4(rdfp->sequence_index[sequence_number]);
6169         length *= READDB_COMPRESSION_RATIO;
6170     }
6171     else
6172     {
6173         length = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6174                  Nlm_SwapUint4(rdfp->sequence_index[sequence_number]) - 1;
6175     }
6176     return (Int4)length;
6177 }
6178 /*
6179     Gets the length of sequence number "sequence_number".
6180 */
6181 
6182 Int4 LIBCALL
readdb_get_sequence_length(ReadDBFILEPtr rdfp,Int4 sequence_number)6183 readdb_get_sequence_length (ReadDBFILEPtr rdfp, Int4 sequence_number)
6184 
6185 {
6186     Int4 length = readdb_get_sequence_length_approx(rdfp, sequence_number);
6187 
6188     /* For nucl. return "unpacked" length and get the remainder out
6189        of the last byte. */
6190     if (readdb_is_prot(rdfp) == FALSE)
6191     {
6192         Uint1 remainder = 0;
6193         rdfp = readdb_get_link(rdfp, sequence_number);
6194         if (rdfp->sequencefp->mfile_true == TRUE)
6195         {
6196             NlmSeekInMFILE(rdfp->sequencefp,
6197                 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])-1, SEEK_SET);
6198             remainder = *(rdfp->sequencefp->mmp);
6199         }
6200         else
6201         {
6202             NlmSeekInMFILE(rdfp->sequencefp,
6203                 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])-1, SEEK_SET);
6204             NlmReadMFILE((Uint1Ptr) &remainder, 1, 1, rdfp->sequencefp);
6205         }
6206         /* The first six bits in the byte holds the "remainder" (not a
6207            multiple of 4) and the last two bits of the byte holds the size of
6208            the remainder (0-3). Note that length (as returned from
6209            readdb_get_sequence_length_approx) is the "unpacked" approximate
6210            length, that is, it assumes the last byte has 4 bases in it.
6211            Therefore, the next 3 lines correct that calculation with the exact
6212            sequence length.
6213         */
6214         remainder &= 3;  /* number of bases stored in the last byte */
6215         length -= READDB_COMPRESSION_RATIO; /* subtract the last byte */
6216         length += remainder; /* this is the exact "unpacked" sequence length */
6217     }
6218 
6219     return length;
6220 }
6221 #ifdef FASTA_ASN
6222 /*
6223 Get the FasfaPtr (ASN.1) for the sequence with sequence_number.
6224 It is the caller's RESPONSIBILITY to DEALLOCATE Fasta ASN.1".
6225 */
readdb_get_fastaid(ReadDBFILEPtr rdfp,Int4 sequence_number)6226 FdbFastaPtr LIBCALL readdb_get_fastaid PROTO((ReadDBFILEPtr rdfp,
6227                                            Int4 sequence_number))
6228 {
6229   FdbFastaPtr fasta;
6230   AsnIoPtr aip;
6231   AsnIoMemPtr aimp;
6232   Int4 size;
6233 
6234   rdfp = readdb_get_link(rdfp, sequence_number);
6235 
6236   if (rdfp == NULL)
6237     return FALSE;
6238 
6239   size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
6240       Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6241 
6242   if (rdfp->headerfp->mfile_true == TRUE) {
6243     NlmSeekInMFILE(rdfp->headerfp,
6244                    Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6245            SEEK_SET);
6246     aimp = AsnIoMemOpen("rb", rdfp->headerfp->mmp, size);
6247     fasta = FdbFastaAsnRead(aimp->aip, NULL);
6248     AsnIoMemClose(aimp);
6249   } else {
6250     aip = AsnIoNew(ASNIO_BIN_IN, rdfp->headerfp->fp, NULL, NULL, NULL);
6251     NlmSeekInMFILE(rdfp->headerfp,
6252                    Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6253            SEEK_SET);
6254     fasta = FdbFastaAsnRead(aip, NULL);
6255     AsnIoFree(aip, FALSE);
6256   }
6257   return fasta;
6258 }
6259 #endif
6260 Boolean  LIBCALL
readdb_get_ambchar(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr PNTR ambchar_return)6261 readdb_get_ambchar (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr PNTR ambchar_return)
6262 {
6263   Uint4Ptr ambchar;
6264   Int4 length, index;
6265   Uint4 total;
6266 
6267   rdfp = readdb_get_link(rdfp, sequence_number);
6268 
6269   if((length = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6270           Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])) == 0) {
6271     *ambchar_return = NULL;
6272     return TRUE;    /* no ambiguous characters available */
6273   }
6274 
6275     /* Each ambig. residue is represented by a Uint4,
6276        but length is in bytes. */
6277 
6278     total = length/4;
6279     if((ambchar = (Uint4Ptr)MemNew(total*sizeof(Uint4))) == NULL)
6280       return FALSE;
6281 
6282     NlmSeekInMFILE(rdfp->sequencefp,
6283                    Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]), SEEK_SET);
6284 
6285     NlmReadMFILE((Uint1Ptr) ambchar, 4, total, rdfp->sequencefp);
6286     total &= 0x7FFFFFFF; /* mask off everything but the highest order bit. */
6287     for (index=0; index<total; index++) {
6288       ambchar[index] = Nlm_SwapUint4(ambchar[index]);
6289     }
6290 
6291   *ambchar_return = ambchar;
6292   return TRUE;
6293 }
6294 
6295 /*
6296     Check if ambiguity characters are present in the sequence.
6297 */
6298 
6299 Boolean LIBCALL
readdb_ambchar_present(ReadDBFILEPtr rdfp,Int4 sequence_number)6300 readdb_ambchar_present (ReadDBFILEPtr rdfp, Int4 sequence_number)
6301 
6302 {
6303       rdfp = readdb_get_link(rdfp, sequence_number);
6304     if (rdfp == NULL)
6305         return FALSE;
6306 
6307     if (rdfp->ambchar_index == NULL)
6308         return FALSE;
6309 
6310     if((Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6311             Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])) == 0)
6312     {
6313         return FALSE;
6314     }
6315 
6316     return TRUE;
6317 }
6318 
6319 static Boolean
readdb_adjust_local_id(ReadDBFILEPtr rdfp,SeqIdPtr sip)6320 readdb_adjust_local_id(ReadDBFILEPtr rdfp, SeqIdPtr sip)
6321 
6322 {
6323     DbtagPtr dbtag;
6324     ObjectIdPtr oid;
6325 
6326     if (sip == NULL || sip->choice != SEQID_GENERAL)
6327         return FALSE;
6328 
6329     if (rdfp->start == 0)
6330         return TRUE;
6331 
6332     dbtag = sip->data.ptrvalue;
6333     if (dbtag && StringCmp(dbtag->db, "BL_ORD_ID") == 0)
6334     {
6335         oid = dbtag->tag;
6336         oid->id += rdfp->start;
6337     }
6338 
6339     return TRUE;
6340 
6341 
6342 
6343 }
6344 
FDBuildOldStyleDefline(ReadDBFILEPtr rdfp,BlastDefLinePtr bdsp)6345 static CharPtr FDBuildOldStyleDefline(ReadDBFILEPtr rdfp, BlastDefLinePtr bdsp)
6346 {
6347     CharPtr defline;
6348     Char id_buffer[128];
6349     Int4 length, count;
6350     BlastDefLinePtr bdsp_tmp;
6351     Boolean first;
6352     ValNodePtr memb = NULL;
6353     Uint4 membership_mask = 0;
6354 
6355     count = 0;
6356     length = 0;
6357     membership_mask = (0x1 << (rdfp->membership_bit-1));
6358 
6359     /* First calculating - how much memory do we need ? */
6360     for(bdsp_tmp = bdsp; bdsp_tmp != NULL; bdsp_tmp = bdsp_tmp->next) {
6361         length += StringLen(bdsp_tmp->title);
6362         count++;
6363     }
6364 
6365     defline = MemNew(count*128 + length);
6366     MemSet(defline, '\0', sizeof(defline));
6367     first = TRUE;
6368     for(bdsp_tmp = bdsp; bdsp_tmp != NULL; bdsp_tmp = bdsp_tmp->next) {
6369 
6370         if (rdfp->membership_bit == 0) {  /* real database */
6371             if(!first) {
6372                 StringCat(defline, "\1");
6373                 SeqIdWrite(bdsp_tmp->seqid, id_buffer,
6374                            PRINTID_FASTA_LONG, sizeof(id_buffer));
6375                 StringCat(defline, id_buffer);
6376                 StringCat(defline, " ");
6377             } else {
6378                 first = FALSE;
6379             }
6380 
6381             StringCat(defline, bdsp_tmp->title);
6382 
6383         } else { /* subset database, verify the membership bit */
6384 
6385             memb = bdsp_tmp->memberships;
6386             if (memb && (membership_mask & memb->data.intvalue)) {
6387                 if (!first) {
6388                     StringCat(defline, "\1");
6389                     SeqIdWrite(bdsp_tmp->seqid, id_buffer,
6390                             PRINTID_FASTA_LONG, sizeof(id_buffer));
6391                     StringCat(defline, id_buffer);
6392                     StringCat(defline, " ");
6393                 } else {
6394                     first = FALSE;
6395                 }
6396                 StringCat(defline, bdsp_tmp->title);
6397             }
6398             memb = NULL;
6399         }
6400     }
6401     if(*defline == '\0'){
6402         MemFree(defline);
6403         defline = NULL;
6404     }
6405     return defline;
6406 }
6407 
FDReadDeflineAsn(ReadDBFILEPtr rdfp,Int4 sequence_number)6408 BlastDefLinePtr FDReadDeflineAsn(ReadDBFILEPtr rdfp, Int4 sequence_number)
6409 {
6410     BlastDefLinePtr bdsp, bdsp_tmp, bdsp_prev;
6411     AsnIoPtr aip;
6412     AsnIoMemPtr aimp;
6413     Int4 size;
6414     SeqIdPtr seqid = NULL;
6415 
6416     if ((rdfp = readdb_get_link(rdfp, sequence_number)) == NULL)
6417         return NULL;
6418 
6419     size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
6420         Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6421 
6422     if (rdfp->headerfp->mfile_true == TRUE) {
6423         NlmSeekInMFILE(rdfp->headerfp,
6424                        Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6425                        SEEK_SET);
6426         aimp = AsnIoMemOpen("rb", rdfp->headerfp->mmp, size);
6427         bdsp = (BlastDefLinePtr) BlastDefLineSetAsnRead(aimp->aip, NULL);
6428         AsnIoMemClose(aimp);
6429     } else {
6430         aip = AsnIoNew(ASNIO_BIN_IN, rdfp->headerfp->fp, NULL, NULL, NULL);
6431         NlmSeekInMFILE(rdfp->headerfp,
6432                        Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6433                        SEEK_SET);
6434         bdsp =  (BlastDefLinePtr) BlastDefLineSetAsnRead(aip, NULL);
6435         AsnIoFree(aip, FALSE);
6436     }
6437 
6438     /* If dealing with a subset (mask) database, filter the
6439      * BlastDefLinePtr from entries that are not relevant
6440      * (this applies only to non-redundant databases) */
6441     if (rdfp->oidlist && rdfp->membership_bit != 0) {
6442         ValNodePtr memb = NULL;
6443         Uint4 memb_mask = 0;
6444         BlastDefLinePtr bdsp_last, bdsp_rv_tmp = NULL, bdsp_head = NULL;
6445         Boolean first = TRUE;
6446 
6447         /* create the memberships mask (this should be fixed to allow membership
6448          * bits greater than 32) */
6449         memb_mask = 0x1 << (rdfp->membership_bit-1);
6450 
6451         /* build the new adjusted BlastDefLine structure */
6452         for (bdsp_tmp = bdsp; bdsp_tmp; bdsp_tmp = bdsp_tmp->next) {
6453             memb = bdsp_tmp->memberships;
6454             if (memb && (memb_mask & memb->data.intvalue)) {
6455 
6456                 if (first) {
6457                     bdsp_rv_tmp = BlastDefLineNew();
6458                     bdsp_head = bdsp_last = bdsp_rv_tmp;
6459                     if (!bdsp_rv_tmp) {
6460                         ErrPostEx(SEV_ERROR,0,0,
6461                                 "Not enough memory in FDReadDeflineAsn");
6462                         return bdsp;
6463                     }
6464                     first = FALSE;
6465                 } else {
6466                     bdsp_rv_tmp = BlastDefLineNew();
6467                     if (!bdsp_rv_tmp) {
6468                         ErrPostEx(SEV_ERROR,0,0,
6469                                 "Not enough memory in FDReadDeflineAsn");
6470                         bdsp_head = BlastDefLineSetFree(bdsp_head);
6471                         return bdsp;
6472                     }
6473                 }
6474 
6475                 bdsp_rv_tmp->seqid = SeqIdSetDup(bdsp_tmp->seqid);
6476                 bdsp_rv_tmp->title = StringSave(bdsp_tmp->title);
6477                 bdsp_rv_tmp->taxid = bdsp_tmp->taxid;
6478                 bdsp_rv_tmp->memberships=IntValNodeCopy(bdsp_tmp->memberships);
6479                 bdsp_rv_tmp->links = IntValNodeCopy(bdsp_tmp->links);
6480                 bdsp_rv_tmp->other_info = IntValNodeCopy(bdsp_tmp->other_info);
6481                 bdsp_last->next = bdsp_rv_tmp;
6482                 bdsp_rv_tmp->next = NULL;
6483                 bdsp_last = bdsp_rv_tmp;
6484             }
6485         }
6486         bdsp = BlastDefLineSetFree(bdsp);
6487         bdsp = bdsp_head;
6488     }
6489 
6490     /* If the preferred gi is set, then put the BlastDefLine structure that
6491      * contains it first in the chain of BlastDefLinePtr's */
6492     if (rdfp->preferred_gi != 0) {
6493 
6494         ValNodeAddInt(&seqid, SEQID_GI, rdfp->preferred_gi);
6495         bdsp_prev = NULL;
6496 
6497         for (bdsp_tmp = bdsp; bdsp_tmp; bdsp_tmp = bdsp_tmp->next) {
6498 
6499             if (SeqIdIn(bdsp_tmp->seqid, seqid)) {
6500                 if (bdsp_prev != NULL)
6501                     bdsp_prev->next = bdsp_tmp->next;
6502                 if (bdsp_tmp != bdsp) {
6503                     bdsp_tmp->next = bdsp;
6504                     bdsp = bdsp_tmp;
6505                 }
6506                 break;
6507             }
6508             bdsp_prev = bdsp_tmp;
6509         }
6510         SeqIdFree(seqid);
6511     }
6512 
6513     return bdsp;
6514 }
6515 
6516 /* This function suppose, that gived rdfp is correct - it contains given
6517    sequence number */
6518 static Boolean
readdb_get_defline_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,CharPtr PNTR description,SeqIdPtr PNTR seqidp)6519 readdb_get_defline_ex (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description, SeqIdPtr PNTR seqidp)
6520 
6521 {
6522     Char buffer[READDB_BUF_SIZE], id_buf[READDB_BUF_SIZE];
6523     CharPtr buf_ptr;
6524     Int4 new_size, index;
6525     BlastDefLinePtr bdsp;
6526 
6527     if(rdfp == NULL)
6528         return FALSE;
6529 
6530     SeqLocAsnLoad();
6531 
6532     new_size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
6533     Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6534 
6535     if (new_size > READDB_BUF_SIZE){
6536         buf_ptr = (CharPtr)Nlm_Malloc(new_size*sizeof(Char) + 1);
6537     } else {
6538         buf_ptr = &buffer[0];
6539     }
6540 
6541     NlmSeekInMFILE(rdfp->headerfp, Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6542                    SEEK_SET);
6543     if (NlmReadMFILE((Uint1Ptr) buf_ptr, sizeof(Char), new_size,
6544                      rdfp->headerfp) != new_size)
6545     {
6546         if (buf_ptr != &buffer[0])
6547               buf_ptr = (CharPtr)MemFree(buf_ptr);
6548         return FALSE;
6549     }
6550 
6551     if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6552 
6553         bdsp = FDReadDeflineAsn(rdfp, sequence_number);
6554 
6555         if(bdsp == NULL) {
6556             ErrPostEx(SEV_ERROR, 0, 0, "readdb_get_defline_ex: "
6557                     "Failure to read defline ASN for %d", sequence_number);
6558             if (seqidp)        *seqidp = NULL;
6559             if (description)   *description = NULL;
6560             if (buf_ptr != &buffer[0])
6561                 buf_ptr = (CharPtr)MemFree(buf_ptr);
6562             return FALSE;
6563         }
6564 
6565         if(seqidp != NULL) {
6566             *seqidp = SeqIdSetDup(bdsp->seqid);
6567             readdb_adjust_local_id(rdfp, *seqidp);
6568         }
6569 
6570         if(description != NULL)
6571             *description = FDBuildOldStyleDefline(rdfp, bdsp);
6572 
6573         BlastDefLineSetFree(bdsp);
6574 
6575         if (buf_ptr != &buffer[0])
6576             buf_ptr = (CharPtr)MemFree(buf_ptr);
6577 
6578         return TRUE;
6579     }
6580 
6581 
6582     buf_ptr[new_size] = NULLB;    /* defline saved w/o NULLB. */
6583 
6584     if(seqidp != NULL) {        /* SeqId requested separate from descriptor */
6585 
6586         for (index=0; index<READDB_BUF_SIZE; index++) {
6587             if (buf_ptr[index] == ' ' || buf_ptr[index] == NULLB) {
6588                 id_buf[index] = NULLB;
6589                 index++;
6590                 break;
6591             }
6592             id_buf[index] = buf_ptr[index];
6593         }
6594 
6595         *seqidp = SeqIdParse(id_buf);
6596         readdb_adjust_local_id(rdfp, *seqidp);
6597 
6598         if (description != NULL)
6599             *description = StringSave(&buf_ptr[index]);
6600     } else {
6601         if (description != NULL)
6602             *description = StringSave(buf_ptr);
6603     }
6604 
6605     if (buf_ptr != &buffer[0])
6606         buf_ptr = (CharPtr)MemFree(buf_ptr);
6607 
6608     return TRUE;
6609 }
6610 
readdb_get_descriptor(ReadDBFILEPtr rdfp,Int4 sequence_number,SeqIdPtr PNTR id,CharPtr PNTR description)6611 Boolean LIBCALL readdb_get_descriptor (ReadDBFILEPtr rdfp,
6612                                        Int4 sequence_number,
6613                                        SeqIdPtr PNTR id,
6614                                        CharPtr PNTR description)
6615 
6616 {
6617     Boolean not_done;
6618     Char id_buf[READDB_BUF_SIZE];
6619     CharPtr defline, new_defline=NULL, tmp_defline;
6620     CommonIndexPtr      cigi;
6621     Int4 alias_mask=0, gi;
6622     Int4 defline_length, new_defline_length;
6623     SeqIdPtr bestid, seqid;
6624     Uint2 aliasfilebit=0;
6625     Uint4 header_index;
6626     BlastDefLinePtr bdfp=NULL, bdfp_head=NULL;
6627 
6628     rdfp = readdb_get_link(rdfp, sequence_number);
6629     if (rdfp == NULL)
6630         return FALSE;
6631 
6632     if (rdfp->oidlist) {
6633     readdb_get_filebits(rdfp, sequence_number, NULL, &aliasfilebit);
6634     }
6635 
6636     if (aliasfilebit != 0) {
6637     alias_mask |= (0x1 << aliasfilebit);
6638 
6639     *id = NULL;
6640     not_done = TRUE;
6641     header_index = 0;
6642 
6643         bdfp = NULL;
6644         if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6645             return readdb_get_defline_ex(rdfp, sequence_number, description, id);
6646         } else {
6647 
6648         while (not_done) {
6649 
6650                 not_done = readdb_get_header(rdfp, sequence_number, &header_index, &seqid, &defline);
6651                 if (not_done == FALSE)
6652                     break;
6653 
6654                 bestid = SeqIdFindBest(seqid, SEQID_GI);
6655                 gi = bestid->data.intvalue;
6656                 cigi = rdfp->cih->ci + gi;
6657                 if (alias_mask & SwapUint4(cigi->dbmask)) {
6658                     if (*id == NULL) {
6659                         *id = seqid;
6660                         seqid = NULL;
6661                         new_defline = defline;
6662                         new_defline_length = StringLen(new_defline);
6663                         defline = NULL;
6664                     } else {
6665                         SeqIdWrite(seqid, id_buf, PRINTID_FASTA_LONG, READDB_BUF_SIZE);
6666                         seqid = SeqIdSetFree(seqid);
6667                         defline_length = new_defline_length;
6668                         new_defline_length += StringLen(defline) + StringLen(id_buf);
6669                         new_defline_length += 2;
6670                         tmp_defline = MemNew(new_defline_length+1);
6671                         MemCpy(tmp_defline, new_defline, defline_length);
6672                         sprintf(tmp_defline+defline_length, "%c%s %s", READDB_DEF_SEPARATOR, id_buf, defline);
6673                         defline = MemFree(defline);
6674                         new_defline = MemFree(new_defline);
6675                         new_defline = tmp_defline;
6676                     }
6677                 } else {
6678                     seqid = SeqIdSetFree(seqid);
6679                     defline = MemFree(defline);
6680                 }
6681         }
6682 
6683         if (seqid != NULL)
6684             seqid = SeqIdSetFree(seqid);
6685         if (defline != NULL)
6686             defline = MemFree(defline);
6687 
6688         if (description != NULL)
6689                 *description = new_defline;
6690         else
6691                 new_defline = MemFree(new_defline);
6692         }
6693     } else if (rdfp->gi_target != 0) {
6694     *id = NULL;
6695     not_done = TRUE;
6696     header_index = 0;
6697     new_defline = NULL;
6698 
6699         bdfp = NULL;
6700         if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6701             bdfp = FDReadDeflineAsn(rdfp, sequence_number);
6702             if(bdfp == NULL) {
6703                 ErrPostEx(SEV_ERROR, 0, 0, "readdb_get_descriptor: "
6704                         "Failure to read defline ASN for %d", sequence_number);
6705                 *id = NULL;
6706                 *description = NULL;
6707                 return FALSE;
6708             }
6709             bdfp_head = bdfp;
6710         }
6711 
6712     while (not_done) {
6713 
6714             if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6715                 seqid = SeqIdSetDup(bdfp->seqid);
6716                 defline = StringSave(bdfp->title);
6717                 if((bdfp = bdfp->next) == NULL)
6718                     not_done = FALSE;
6719             } else {
6720                 not_done = readdb_get_header(rdfp, sequence_number, &header_index, &seqid, &defline);
6721                 if (not_done == FALSE)
6722                     break;
6723             }
6724 
6725             bestid = SeqIdFindBest(seqid, SEQID_GI);
6726             gi = bestid->data.intvalue;
6727             if (gi == rdfp->gi_target) {
6728                 *id = seqid;
6729                 seqid = NULL;
6730                 new_defline = defline;
6731                 defline = NULL;
6732             } else {
6733                 seqid = SeqIdSetFree(seqid);
6734                 defline = MemFree(defline);
6735             }
6736     }
6737 
6738     if (seqid != NULL)
6739         seqid = SeqIdSetFree(seqid);
6740     if (defline != NULL)
6741         defline = MemFree(defline);
6742 
6743     BlastDefLineSetFree(bdfp_head);
6744 
6745     if (description != NULL)
6746             *description = new_defline;
6747     else
6748             new_defline = MemFree(new_defline);
6749     } else {
6750         return readdb_get_defline_ex(rdfp, sequence_number, description, id);
6751     }
6752 
6753     return TRUE;
6754 }
6755 
6756 Boolean
readdb_get_defline(ReadDBFILEPtr rdfp,Int4 sequence_number,CharPtr PNTR description)6757 readdb_get_defline (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description)
6758 {
6759     rdfp = readdb_get_link(rdfp, sequence_number);
6760 
6761     if (rdfp == NULL)
6762         return FALSE;
6763 
6764     return readdb_get_defline_ex(rdfp, sequence_number, description, NULL);
6765 }
6766 
6767 
6768 
6769 /*
6770     A single sequence may be attched to several entries (as they all
6771     have the same sequence).  This function gets the ID and deflines for
6772     each entry attched to one sequence.  On the first call the Uint4
6773     (*header_index) should be zero; it will be filled in by readdb_get_header.
6774     Subsequent calls will use this information to know which ID and
6775     defline to retrieve next.  When all are retrieved, FALSE will be returned.
6776     Caller is responsible for deallocating the out-parameters.
6777 */
6778 Boolean LIBCALL
readdb_get_header(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr header_index,SeqIdPtr PNTR id,CharPtr PNTR description)6779 readdb_get_header (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr header_index,
6780                    SeqIdPtr PNTR id, CharPtr PNTR description)
6781 {
6782     return readdb_get_header_ex (rdfp, sequence_number, header_index, id, description,
6783                                NULL, NULL, NULL);
6784 }
6785 
6786 /*
6787  * Simple function to copy a linked list of ints
6788  * (shouldn't this go somewhere else?)
6789  *
6790  */
IntValNodeCopy(ValNodePtr src)6791 static ValNodePtr IntValNodeCopy(ValNodePtr src)
6792 {
6793     ValNodePtr retval = NULL;
6794 
6795     if (!src)
6796         return NULL;
6797 
6798     if ((retval = ValNodeAddInt(NULL,0,src->data.intvalue)) == NULL)
6799         return NULL;
6800 
6801     for (src = src->next ; src; src = src->next) {
6802         ValNodeAddInt(&retval,0,src->data.intvalue);
6803     }
6804 
6805     return retval;
6806 }
6807 
6808 Boolean LIBCALL
readdb_get_header_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr header_index,SeqIdPtr PNTR id,CharPtr PNTR description,Int4 PNTR taxid,ValNodePtr PNTR memberships,ValNodePtr PNTR links)6809 readdb_get_header_ex (ReadDBFILEPtr rdfp, Int4 sequence_number,
6810                       Uint4Ptr header_index, SeqIdPtr PNTR id,
6811                       CharPtr PNTR description, Int4 PNTR taxid,
6812                       ValNodePtr PNTR memberships, ValNodePtr PNTR links)
6813 
6814 {
6815     Boolean retval = FALSE;
6816     Char id_buf[READDB_BUF_SIZE];
6817     CharPtr buf_ptr, buf_defline_start;
6818     Int4 index, size, i;
6819     Uint4 header_index_end;
6820     BlastDefLinePtr bdlp = NULL;
6821 
6822     rdfp = readdb_get_link(rdfp, sequence_number);
6823 
6824     if (!rdfp)
6825         return retval;
6826 
6827     if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6828 
6829         if (*header_index == 0) {
6830             bdlp = FDReadDeflineAsn(rdfp, sequence_number);
6831             if (bdlp == NULL) {
6832                 if (id != NULL)          *id = NULL;
6833                 if (description != NULL) *description = NULL;
6834                 if (memberships != NULL) *memberships = NULL;
6835                 if (links != NULL)       *links = NULL;
6836                 return retval;
6837             }
6838             if (rdfp->blast_deflinep)
6839                 BlastDefLineSetFree(rdfp->blast_deflinep);
6840             rdfp->blast_deflinep = bdlp; /* cache the BlastDefLinePtr */
6841 
6842         } else if (*header_index == UINT4_MAX) {
6843             if (id != NULL)          *id = NULL;
6844             if (description != NULL) *description = NULL;
6845             if (memberships != NULL) *memberships = NULL;
6846             if (links != NULL)       *links       = NULL;
6847             rdfp->blast_deflinep = BlastDefLineSetFree(rdfp->blast_deflinep);
6848             return retval;
6849 
6850         } else {
6851             bdlp = rdfp->blast_deflinep;
6852             for (i = 0; i < *header_index; i++) {
6853                 if (bdlp == NULL) { /* sanity check */
6854                     ErrPostEx(SEV_ERROR,0,0,"There is no BlastDefLinePtr in rdfp!");
6855                     return retval;
6856                 }
6857                 bdlp = bdlp->next;
6858             }
6859         }
6860 
6861         /* Assign the values */
6862         if (id != NULL)          *id = SeqIdSetDup(bdlp->seqid);
6863         if (description != NULL) *description = StringSave(bdlp->title);
6864         if (taxid != NULL)       *taxid       = bdlp->taxid;
6865         if (memberships != NULL) *memberships = IntValNodeCopy(bdlp->memberships);
6866         if (links != NULL)       *links       = IntValNodeCopy(bdlp->links);
6867 
6868         /* At the end of the deflines, set *header_index to a sentinel value */
6869         if (bdlp->next == NULL)
6870             *header_index = UINT4_MAX;
6871         else
6872             (*header_index)++;
6873 
6874         retval = TRUE;
6875 
6876     } else {     /* Provide old version for backwards compatibility */
6877 
6878         rdfp = readdb_get_link(rdfp, sequence_number);
6879         if (rdfp == NULL)
6880             return FALSE;
6881 
6882         if (*header_index == 0)
6883             *header_index = Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6884 
6885         header_index_end = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]);
6886 
6887         if (*header_index >= header_index_end) {
6888            *header_index = 0;
6889             return FALSE;
6890         }
6891 
6892         size = header_index_end-(*header_index);
6893         buf_ptr = MemNew((size+1)*sizeof(Char));
6894 
6895         NlmSeekInMFILE(rdfp->headerfp, (long) *header_index, SEEK_SET);
6896         if (NlmReadMFILE((Uint1Ptr) buf_ptr, sizeof(Char), size, rdfp->headerfp) != size)
6897            return FALSE;
6898 
6899         for (index=0; index<size; index++) {
6900             if (buf_ptr[index] == ' ') {
6901                 id_buf[index] = NULLB;
6902                 index++;
6903                 break;
6904             }
6905             id_buf[index] = buf_ptr[index];
6906         }
6907         if (id) *id = SeqIdParse(id_buf);
6908 
6909         buf_defline_start = &buf_ptr[index];
6910         while (index < size) {
6911             if (buf_ptr[index] == READDB_DEF_SEPARATOR) {
6912                 break;
6913             }
6914             index++;
6915         }
6916         buf_ptr[index] = NULLB;
6917         index++;
6918         if (description != NULL) {
6919             *description = StringSave(buf_defline_start);
6920         }
6921         buf_ptr = MemFree(buf_ptr);
6922         *header_index += index;
6923 
6924         retval = TRUE;
6925     }
6926 
6927     return retval;
6928 }
6929 
6930 /*
6931     Obtains the total database length from the ReadDBFILE structure.
6932 */
6933 Int8 LIBCALL
readdb_get_dblen(ReadDBFILEPtr rdfp)6934 readdb_get_dblen (ReadDBFILEPtr rdfp)
6935 
6936 {
6937     if (rdfp == NULL)
6938         return 0;
6939 
6940     return rdfp->totlen;
6941 }
6942 
6943 /*
6944 Obtains the total number of database sequences from all the ReadDBFILE structures.
6945 */
6946 Int4 LIBCALL
readdb_get_num_entries_total(ReadDBFILEPtr rdfp)6947 readdb_get_num_entries_total (ReadDBFILEPtr rdfp)
6948 
6949 {
6950     Int4 total=0;
6951     if (rdfp == NULL)
6952         return 0;
6953 
6954     while (rdfp) {
6955         total += rdfp->num_seqs;
6956         rdfp = rdfp->next;
6957     }
6958     return total;
6959 }
6960 
6961 /*
6962 Obtains the total number of real database sequences from all the ReadDBFILE structures.
6963 */
6964 Int4 LIBCALL
readdb_get_num_entries_total_real(ReadDBFILEPtr rdfp)6965 readdb_get_num_entries_total_real (ReadDBFILEPtr rdfp)
6966 
6967 {
6968     Int4 total=0;
6969     if (rdfp == NULL)
6970         return 0;
6971 
6972     while (rdfp && !rdfp->oidlist)
6973     {
6974         total += rdfp->num_seqs;
6975         rdfp = rdfp->next;
6976     }
6977     return total;
6978 }
6979 
6980 /*
6981 Obtains the number of database sequences from the ReadDBFILE structure.
6982 */
6983 Int4 LIBCALL
readdb_get_num_entries(ReadDBFILEPtr rdfp)6984 readdb_get_num_entries (ReadDBFILEPtr rdfp)
6985 
6986 {
6987     if (rdfp == NULL)
6988         return 0;
6989 
6990     return rdfp->num_seqs;
6991 }
6992 
6993 /*
6994 Obtains the length of the longest database seq from the ReadDBFILE structure.
6995 */
6996 Int4 LIBCALL
readdb_get_maxlen(ReadDBFILEPtr rdfp)6997 readdb_get_maxlen (ReadDBFILEPtr rdfp)
6998 
6999 {
7000     if (rdfp == NULL)
7001         return 0;
7002 
7003     return rdfp->maxlen;
7004 }
7005 
7006 /*
7007 Obtains the title of the database.  Note that the return CharPtr is not
7008 owned by the caller.  It should be copied if the user wishes to modify it.
7009 */
7010 CharPtr LIBCALL
readdb_get_filename(ReadDBFILEPtr rdfp)7011 readdb_get_filename (ReadDBFILEPtr rdfp)
7012 
7013 {
7014     if (rdfp == NULL)
7015         return NULL;
7016 
7017     if (rdfp->aliasfilename)
7018         return rdfp->aliasfilename;
7019 
7020     return rdfp->filename;
7021 }
7022 
7023 /*
7024 Obtains the title of the database.  Note that the return CharPtr is not
7025 owned by the caller.  It should be copied if the user wishes to modify it.
7026 */
7027 CharPtr LIBCALL
readdb_get_full_filename(ReadDBFILEPtr rdfp)7028 readdb_get_full_filename (ReadDBFILEPtr rdfp)
7029 
7030 {
7031     char* retval = NULL;
7032 
7033     if (rdfp == NULL)
7034         return NULL;
7035 
7036     if (!rdfp->aliasfilename)
7037         retval = StringSave(rdfp->filename);
7038     else {
7039        char* path = Nlm_FilePathFind(rdfp->filename);
7040        char buffer[PATH_MAX];
7041        sprintf(buffer, "%s/%s", path, rdfp->aliasfilename);
7042        retval = StringSave(buffer);
7043     }
7044     return retval;
7045 }
7046 
7047 /*
7048   Obtains the title of the database.  Note that the return CharPtr is not
7049   owned by the caller.  It should be copied if the user wishes to modify it.
7050 */
7051 CharPtr LIBCALL
readdb_get_title(ReadDBFILEPtr rdfp)7052 readdb_get_title (ReadDBFILEPtr rdfp)
7053 
7054 {
7055     if (rdfp == NULL)
7056         return NULL;
7057 
7058     if (rdfp->title)
7059         return rdfp->title;
7060 
7061     /* return the file-name if no title found. */
7062 
7063     return NULL;
7064     /* return readdb_get_filename(rdfp); */
7065 }
7066 
7067 /*
7068 Obtains the date and time the database was formatted with formatdb.
7069 Note that the return CharPtr is not owned by the caller.  It should
7070 be copied if the user wishes to modify it.
7071 */
7072 CharPtr LIBCALL
readdb_get_date(ReadDBFILEPtr rdfp)7073 readdb_get_date (ReadDBFILEPtr rdfp)
7074 
7075 {
7076     if (rdfp == NULL)
7077         return NULL;
7078 
7079     return rdfp->date;
7080 }
7081 
7082 /*
7083 Queries readdb whether the sequence is protein.
7084 */
7085 Boolean LIBCALL
readdb_is_prot(ReadDBFILEPtr rdfp)7086 readdb_is_prot (ReadDBFILEPtr rdfp)
7087 
7088 {
7089     if (rdfp == NULL)
7090         return FALSE;
7091 
7092     return /*rdfp->is_prot*/(Boolean) (rdfp->parameters & READDB_IS_PROT);
7093 }
7094 
7095 /*
7096 Obtains the formatdb version used to format the database.
7097 */
7098 Int4 LIBCALL
readdb_get_formatdb_version(ReadDBFILEPtr rdfp)7099 readdb_get_formatdb_version (ReadDBFILEPtr rdfp)
7100 
7101 {
7102     if (rdfp == NULL)
7103         return 0;
7104 
7105     return rdfp->formatdb_ver;
7106 }
7107 
7108 /*
7109     Translates a SeqIdPtr to an ordinal ID, used by the BLAST database.
7110     If the SeqIdPtr cannot be translated, a negative number is returned.
7111     All valid ordinal numbers are >= 0.
7112 */
7113 
SeqId2OrdinalId(ReadDBFILEPtr rdfp,SeqIdPtr sip)7114 Int4 SeqId2OrdinalId(ReadDBFILEPtr rdfp, SeqIdPtr sip)
7115 
7116 {
7117     DbtagPtr    dbtagptr;
7118     Int4 ordinal_id;
7119 
7120     if (rdfp == NULL || sip == NULL)
7121         return -2;
7122 
7123     switch (sip->choice)
7124     {
7125         case SEQID_GI:
7126             ordinal_id = readdb_gi2seq(rdfp, sip->data.intvalue, NULL);
7127             break;
7128 
7129         case SEQID_GENERAL:
7130             dbtagptr = (DbtagPtr) sip->data.ptrvalue;
7131             if (dbtagptr == NULL)
7132                 return OM_MSG_RET_OK;
7133             if (StringCmp(dbtagptr->db, "BL_ORD_ID") == 0)
7134             {
7135                 ordinal_id = dbtagptr->tag->id;
7136                 break;
7137             }
7138             /* Fall through to default if not "BL_ORD_ID" */
7139         default:
7140             ordinal_id = readdb_seqid2fasta(rdfp, sip);
7141             break;
7142     }
7143 
7144     return ordinal_id;
7145 }
7146 /*************************************************************************
7147 
7148     Inits the ReadDBFILEPtr for the BioseqFetch functions.
7149 
7150 **************************************************************************/
7151 
7152 static Boolean
ReadDBInit(ReadDBFetchStructPtr rdfsp)7153 ReadDBInit(ReadDBFetchStructPtr rdfsp)
7154 {
7155 
7156     rdfsp->rdfp = readdb_new_ex2(rdfsp->dbname, rdfsp->is_prot,
7157             READDB_NEW_INDEX | READDB_NEW_DO_TAXDB, NULL, NULL);
7158     taxonomyDbLoaded = FALSE; /* If object manager loads tax dbs, don't block
7159                                  application from loading it again */
7160 
7161     if (rdfsp->rdfp != NULL)
7162         return TRUE;
7163     else
7164         return FALSE;
7165 }
7166 
7167 /*
7168     Checks the chain of ReadDBFetchStructPtr's for one
7169     which belongs to the calling thread. If none is found,
7170     NULL isreturned; otherwise the ReadDBFetchStructPtr is
7171     returned.
7172 */
7173 static ReadDBFetchStructPtr
ReadDBFindFetchStruct(ReadDBFetchStructPtr rdfp)7174 ReadDBFindFetchStruct(ReadDBFetchStructPtr rdfp)
7175 
7176 {
7177 
7178     if (rdfp == NULL)
7179         return NULL;
7180 
7181     while (rdfp)
7182     {
7183         if (NlmThreadCompare(rdfp->thread_id, NlmThreadSelf()) == TRUE)
7184             break;
7185         rdfp = rdfp->next;
7186     }
7187     return rdfp;
7188 }
7189 
7190 /*
7191     Initializes the ReadDBFetchStructPtr and adds onto end of
7192     chain of ReadDBFetchStructPtr (head).  The new ReadDBFetchStructPtr
7193     is returned.
7194 */
7195 static ReadDBFetchStructPtr
ReadDBFetchStructNew(ReadDBFetchStructPtr head,CharPtr dbname,Boolean is_na)7196 ReadDBFetchStructNew(ReadDBFetchStructPtr head, CharPtr dbname, Boolean is_na)
7197 
7198 {
7199     ReadDBFetchStructPtr rdfsp, rdfsp_var;
7200 
7201 
7202     rdfsp = (ReadDBFetchStructPtr) MemNew(sizeof(ReadDBFetchStruct));
7203     rdfsp->dbname = StringSave(dbname);
7204     rdfsp->is_prot = (is_na == TRUE) ? FALSE : TRUE;
7205     rdfsp->thread_id = NlmThreadSelf();
7206 
7207     if (head != NULL)
7208     {
7209         rdfsp_var = head;
7210         while (rdfsp_var->next)
7211             rdfsp_var = rdfsp_var->next;
7212         rdfsp_var->next = rdfsp;
7213     }
7214 
7215     return rdfsp;
7216 }
7217 
7218 /****************************************************************
7219 *
7220 *    ReadDBFetchFreeFunc
7221 *    Frees ReadDBFetchUserData.
7222 *
7223 ****************************************************************/
7224 
ReadDBFetchFreeFunc(Pointer ptr)7225 static Pointer LIBCALLBACK ReadDBFetchFreeFunc (Pointer ptr)
7226 {
7227     ReadDBFetchUserDataPtr userdata;
7228 
7229     userdata = (ReadDBFetchUserDataPtr) ptr;
7230     return MemFree(userdata);
7231 }
7232 
7233 
7234 
7235 /**********************************************************************
7236 
7237     Fetches the Bioseq, based on the ordinal number of the
7238     sequence in the database.
7239 
7240 ************************************************************************/
7241 
ReadDBBioseqFetchFunc(Pointer data)7242 static Int2 LIBCALLBACK ReadDBBioseqFetchFunc(Pointer data)
7243 {
7244     BioseqPtr bsp, core_bsp;
7245     Boolean status;
7246     Int4 ordinal_id;
7247     OMProcControlPtr ompcp;
7248         ObjMgrProcPtr ompp;
7249     OMUserDataPtr omdp;
7250     ReadDBFetchStructPtr rdfsp;
7251     ReadDBFILEPtr rdfp=NULL;
7252     ReadDBFetchUserDataPtr userdata;
7253     SeqIdPtr sip, best_id;
7254     SeqEntryPtr sep;
7255 
7256     ordinal_id = -1;
7257 
7258     ompcp = (OMProcControlPtr)data;
7259         ompp = ompcp->proc;
7260 
7261     rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7262 
7263     if (rdfsp == NULL)
7264     {
7265         return OM_MSG_RET_OK;
7266     }
7267 
7268     if (rdfsp->ReadDBFetchState == READDBBF_DISABLE)
7269     {
7270         return OM_MSG_RET_OK;
7271     }
7272 
7273     if (rdfsp->ReadDBFetchState == READDBBF_INIT)
7274     {
7275         status = ReadDBInit(rdfsp);
7276         if (status == FALSE)
7277             return OM_MSG_RET_OK;
7278         rdfsp->ReadDBFetchState = READDBBF_READY;
7279     }
7280 
7281     if (ordinal_id < 0 || rdfp == NULL)
7282     {
7283         sip = (SeqIdPtr) (ompcp->input_data);
7284 
7285         best_id = SeqIdFindBest(sip, SEQID_GI);
7286 
7287         if (best_id == NULL)
7288         {
7289             core_bsp = BioseqFindCore(sip);
7290             if (core_bsp)
7291                 best_id = SeqIdFindBest(core_bsp->id, SEQID_GI);
7292         }
7293 
7294         if (best_id == NULL)
7295             return OM_MSG_RET_OK;
7296 
7297         rdfp = rdfsp->rdfp;
7298         ordinal_id = SeqId2OrdinalId(rdfp, best_id);
7299         if (ordinal_id >= 0) {
7300             rdfp->preferred_gi = best_id->data.intvalue;
7301         }
7302     }
7303 
7304     /* ordinal_id's start at zero. */
7305     if (ordinal_id < 0)
7306         return OM_MSG_RET_OK;
7307 
7308     /* A BioseqPtr is returned by this function. */
7309     bsp = readdb_get_bioseq(rdfp, ordinal_id);
7310 
7311         /* Reset the preferred_gi */
7312         rdfp->preferred_gi = 0;
7313 
7314         /* We have to add information about genetic code to
7315            the Bioseq */
7316 
7317         if(rdfsp->db_genetic_code > 1) {
7318             BioSourcePtr source;
7319             source = BioSourceNew();
7320             source->org = OrgRefNew();
7321             source->org->orgname = OrgNameNew();
7322             source->org->orgname->gcode = rdfsp->db_genetic_code;
7323             SeqDescrAddPointer(&(bsp->descr), Seq_descr_source, source);
7324         }
7325 
7326     sep = SeqEntryNew();
7327     sep->choice = 1;
7328     sep->data.ptrvalue = bsp;
7329     SeqMgrSeqEntry(SM_BIOSEQ, (Pointer)bsp, sep);
7330     ompcp->output_data = (Pointer)bsp;
7331     ompcp->output_entityID = ObjMgrGetEntityIDForChoice(sep);
7332     omdp = ObjMgrAddUserData(ompcp->output_entityID, ompp->procid, OMPROC_FETCH, 0);
7333     userdata = (ReadDBFetchUserDataPtr) MemNew(sizeof(ReadDBFetchUserData));
7334     omdp->userdata.ptrvalue = userdata;
7335     userdata->ordinal_number = ordinal_id;
7336     userdata->db_id = ReadDBGetDbId(rdfsp->rdfp, rdfp);
7337     omdp->freefunc = ReadDBFetchFreeFunc;
7338 
7339     return OM_MSG_RET_DONE;
7340 }
7341 
ReadDBBioseqSetDbGeneticCode(Int4 db_genetic_code)7342 Boolean LIBCALL ReadDBBioseqSetDbGeneticCode(Int4 db_genetic_code)
7343 {
7344     ReadDBFetchStructPtr rdfsp;
7345     ObjMgrPtr omp;
7346     ObjMgrProcPtr ompp;
7347 
7348     omp = ObjMgrGet();
7349     ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7350     if (ompp != NULL) {   /* already initialized */
7351         rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7352         rdfsp->db_genetic_code = db_genetic_code;
7353         return FALSE;
7354     }
7355     return TRUE;
7356 }
7357 
7358 /*********************************************************************
7359 
7360     Enables the fetching.  Initializes needed structures and calls
7361     ReadDBInit.
7362 
7363 **********************************************************************/
7364 Boolean LIBCALL
ReadDBBioseqFetchEnable(CharPtr program,CharPtr dbname,Boolean is_na,Boolean now)7365 ReadDBBioseqFetchEnable(CharPtr program, CharPtr dbname, Boolean is_na, Boolean now)
7366 
7367 {
7368     Boolean result;
7369     ReadDBFetchStructPtr rdfsp;
7370     ObjMgrPtr omp;
7371     ObjMgrProcPtr ompp;
7372     static TNlmMutex enable_lock = NULL;
7373     /* check if already enabled ***/
7374 
7375     NlmMutexInit(&enable_lock);
7376     NlmMutexLock(enable_lock);
7377 
7378     omp = ObjMgrGet();
7379     ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7380     if (ompp != NULL) {   /* already initialized */
7381         rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7382 
7383         if(rdfsp == NULL) { /* Another thread */
7384             rdfsp = ReadDBFetchStructNew((ReadDBFetchStructPtr)(ompp->procdata), dbname, is_na);
7385         } else {
7386             if (rdfsp->is_prot == is_na || StringCmp(rdfsp->dbname, dbname)) {
7387                 rdfsp->is_prot = (is_na == TRUE) ? FALSE : TRUE;
7388                 rdfsp->dbname = MemFree(rdfsp->dbname);
7389                 rdfsp->dbname = StringSave(dbname);
7390             }
7391         }
7392     } else { /* New element is not registered with ObjMgr */
7393         rdfsp = ReadDBFetchStructNew(NULL, dbname, is_na);
7394         ObjMgrProcLoad(OMPROC_FETCH, "ReadDBBioseqFetch", "ReadDBBioseqFetch", OBJ_SEQID, 0,OBJ_BIOSEQ,0,
7395                        (Pointer)rdfsp, ReadDBBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
7396         rdfsp->ReadDBFetchState = READDBBF_INIT;
7397     }
7398 
7399     rdfsp->ctr++;    /* count number of enables */
7400 
7401     NlmMutexUnlock(enable_lock);
7402 
7403     if (rdfsp->ReadDBFetchState == READDBBF_READY) {
7404         return TRUE;
7405     }
7406 
7407     if (now) {
7408         result = ReadDBInit(rdfsp);
7409         if (! result) {
7410             return result;
7411         }
7412         rdfsp->ReadDBFetchState = READDBBF_READY;
7413     } else {
7414         rdfsp->ReadDBFetchState = READDBBF_INIT;
7415     }
7416 
7417     return TRUE;
7418 }
7419 
7420 /*****************************************************************************
7421 *
7422 *        ReadDBBioseqFetchDisable()
7423 *
7424 *    Calls readdb_destruct if necessary to deallocate resources.
7425 *
7426 *****************************************************************************/
ReadDBBioseqFetchDisable(void)7427 void LIBCALL ReadDBBioseqFetchDisable(void)
7428 {
7429         ObjMgrPtr omp;
7430         ObjMgrProcPtr ompp;
7431         ReadDBFetchStructPtr rdfsp;
7432 
7433         omp = ObjMgrGet();
7434         ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7435         if (ompp == NULL)   /* not initialized */
7436                 return;
7437 
7438     rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7439     if (! rdfsp->ctr)   /* no enables active */
7440         return;
7441 
7442     rdfsp->ctr--;
7443     if (rdfsp->ctr)   /* connection still pending */
7444         return;
7445 
7446     if (rdfsp->ReadDBFetchState == READDBBF_READY)
7447     {
7448         rdfsp->ReadDBFetchState = READDBBF_DISABLE;  /* not active */
7449         rdfsp->rdfp = readdb_destruct(rdfsp->rdfp);
7450     }
7451 
7452     return;
7453 }
7454 
7455 /*
7456     Returns the ReadDBFILEPtr by the database ID.
7457     NULL is returned on error.
7458 */
7459 
7460 ReadDBFILEPtr
ReadDBGetDb(ReadDBFILEPtr rdfp_list,Int2 db_id)7461 ReadDBGetDb (ReadDBFILEPtr rdfp_list, Int2 db_id)
7462 
7463 {
7464     Int2 index=0;
7465 
7466     while (rdfp_list)
7467     {
7468         if (index == db_id)
7469         {
7470             return rdfp_list;
7471         }
7472         rdfp_list = rdfp_list->next;
7473         index++;
7474     }
7475     return NULL;
7476 }
7477 
7478 /*
7479     Returns the Database ID.
7480     -1 is returned on error.
7481 */
7482 
7483 Int2
ReadDBGetDbId(ReadDBFILEPtr list,ReadDBFILEPtr target)7484 ReadDBGetDbId (ReadDBFILEPtr list, ReadDBFILEPtr target)
7485 
7486 {
7487     Int2 index=0;
7488 
7489     while (list)
7490     {
7491         if (readdb_compare(list, target) == TRUE)
7492             return index;
7493         list = list->next;
7494         index++;
7495     }
7496     return -1;
7497 }
7498 
7499 /*
7500     Formatting functions for databases formatted by formatdb.
7501 */
7502 Boolean LIBCALL
PrintDbInformationBasicEx(Boolean is_aa,Int4 line_length,CharPtr definition,Int4 number_seqs,Int8 total_length,FILE * outfp,Boolean html,Boolean with_links)7503 PrintDbInformationBasicEx (Boolean is_aa, Int4 line_length,
7504                            CharPtr definition, Int4 number_seqs,
7505                            Int8 total_length, FILE *outfp, Boolean html,
7506                            Boolean with_links)
7507 {
7508     if (html && with_links) {
7509            fprintf(outfp, "<b>Database:</b> %s", definition);
7510            asn2ff_set_output(outfp, NULL);
7511            ff_StartPrint(0, 0, line_length, NULL);
7512     } else {
7513            asn2ff_set_output(outfp, NULL);
7514 
7515            ff_StartPrint(0, 0, line_length, NULL);
7516            if (html)
7517               ff_AddString("<b>Database:</b> ");
7518            else
7519               ff_AddString("Database: ");
7520            ff_AddString(definition);
7521         }
7522     NewContLine();
7523     TabToColumn(12);
7524     ff_AddString(Ltostr((long) number_seqs, 1));
7525     ff_AddString(" sequences; ");
7526     ff_AddString(Nlm_Int8tostr(total_length, 1));
7527     ff_AddString(" total letters");
7528     NewContLine();
7529     ff_EndPrint();
7530 
7531     return TRUE;
7532 }
7533 
7534 Boolean LIBCALL
PrintDbInformationBasic(CharPtr database,Boolean is_aa,Int4 line_length,CharPtr definition,Int4 number_seqs,Int8 total_length,FILE * outfp,Boolean html)7535 PrintDbInformationBasic (CharPtr database, Boolean is_aa, Int4 line_length,
7536                          CharPtr definition, Int4 number_seqs, Int8
7537                          total_length, FILE *outfp, Boolean html)
7538 {
7539    return PrintDbInformationBasicEx(is_aa, line_length, definition,
7540                                     number_seqs, total_length, outfp, html,
7541                                     FALSE);
7542 }
7543 
7544 /*
7545     Print a summary of the database(s) used.
7546 */
7547 
7548 Boolean LIBCALL
PrintDbInformationWithRID(CharPtr database,Boolean is_aa,Int4 line_length,FILE * outfp,Boolean html,CharPtr rid,Boolean query_is_aa)7549 PrintDbInformationWithRID(CharPtr database, Boolean is_aa, Int4 line_length,
7550                           FILE *outfp, Boolean html, CharPtr rid, Boolean query_is_aa)
7551 {
7552     CharPtr        definition, ptr, chptr;
7553     Int8        total_length;
7554     Int4        number_seqs, length, real_length, avail_length, shift;
7555     ReadDBFILEPtr    rdfp, rdfp_var, rdfp_tmp;
7556     Boolean         first_title;
7557     Char            next_title[1024];
7558     Boolean         with_links = FALSE;
7559     Int2 tmp_len;
7560 
7561     if (database == NULL || outfp == NULL)
7562         return FALSE;
7563 
7564     if (is_aa == TRUE)
7565         rdfp = readdb_new_ex2(database, READDB_DB_IS_PROT,
7566                 READDB_NEW_DO_REPORT, NULL, NULL);
7567     else
7568         rdfp = readdb_new_ex2(database, READDB_DB_IS_NUC,
7569                 READDB_NEW_DO_REPORT, NULL, NULL);
7570 
7571     if (rdfp == FALSE)
7572         return FALSE;
7573 
7574     length = 4096;  /* Initial length, may be increased. */
7575     definition = MemNew(length*sizeof(Char));
7576     ptr = definition;
7577     rdfp_var = rdfp;
7578 
7579     real_length = 0;
7580     avail_length = length;
7581     first_title = TRUE;
7582     while (rdfp_var) {
7583         chptr = readdb_get_title(rdfp_var);
7584 
7585         if(chptr == NULL) {
7586             rdfp_var = rdfp_var->next;
7587             continue;
7588         }
7589 
7590         if (rid && html && rdfp_var->aliasfilename && atoi(rdfp_var->aliasfilename) != 0) {
7591            if (query_is_aa && !StrNCmp(chptr, "Completed", 9)) {
7592               sprintf(next_title,
7593                    "<a href=http://www.ncbi.nlm.nih.gov/sutils/genomeRID.cgi?"
7594                    "taxid=%s&RID=%s>%s</a>; \n",
7595                    rdfp_var->aliasfilename, rid, chptr);
7596               with_links = TRUE;
7597            } else {
7598               sprintf(next_title, "%s", chptr);
7599 
7600               tmp_len = StrLen(next_title);
7601               /* are there more titles to concatenate? */
7602               for (rdfp_tmp = rdfp_var->next; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
7603                   if (rdfp_tmp->title != NULL) {
7604                        next_title[tmp_len++] = ';';
7605                        break;
7606                    }
7607               }
7608 
7609               if (!first_title && rdfp_var->next != NULL) {
7610                   /*next_title[tmp_len++] = ';';*/
7611                   next_title[tmp_len++] = ' ';
7612                   next_title[tmp_len++] = '\n';
7613                   next_title[tmp_len++] = NULLB;
7614               } else {
7615                   /*if (rdfp_var->next != NULL)
7616                       next_title[tmp_len++] = ';';*/
7617                   next_title[tmp_len++] = ' ';
7618                   next_title[tmp_len++] = NULLB;
7619                   first_title = FALSE;
7620               }
7621            }
7622            real_length += StrLen(next_title) + 4;
7623            /* We print these as keep-alive messages for this specific use. */
7624            fprintf(outfp, "%s", " ");
7625            fflush(outfp);
7626         } else {
7627            real_length += StrLen(chptr) + 3;
7628            sprintf(next_title, "%s", chptr);
7629 
7630            tmp_len = StrLen(next_title);
7631 
7632            /* are there more titles to concatenate? */
7633            for (rdfp_tmp = rdfp_var->next; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
7634                if (rdfp_tmp->title != NULL) {
7635                     next_title[tmp_len++] = ';';
7636                     break;
7637                 }
7638            }
7639 
7640            if (!first_title) {
7641               next_title[tmp_len++] = ' ';
7642               next_title[tmp_len++] = NULLB;
7643            } else {
7644               next_title[tmp_len++] = ' ';
7645               next_title[tmp_len++] = NULLB;
7646               first_title = FALSE;
7647            }
7648 
7649         }
7650 
7651         if (real_length > avail_length) {
7652            shift = ptr - definition;
7653            definition = Realloc(definition, 2*real_length);
7654            avail_length = 2*real_length;
7655            ptr = definition + shift;
7656         }
7657         StringCpy(ptr, next_title);
7658 
7659         length = StringLen(ptr);
7660         ptr += length;
7661 
7662         rdfp_var = rdfp_var->next;
7663     }
7664 
7665     *ptr = NULLB;
7666     readdb_get_totals_ex(rdfp, &(total_length), &(number_seqs), TRUE);
7667 
7668     rdfp = readdb_destruct(rdfp);
7669     if (rid && html)
7670     {
7671          fprintf(outfp, "%s", "\n");
7672          fflush(outfp);
7673     }
7674 
7675     PrintDbInformationBasicEx (is_aa, line_length, definition,
7676                                  number_seqs, total_length, outfp,
7677                                  html, with_links);
7678 
7679     definition = MemFree(definition);
7680 
7681     return TRUE;
7682 }
7683 
7684 Boolean LIBCALL
PrintDbInformation(CharPtr database,Boolean is_aa,Int4 line_length,FILE * outfp,Boolean html)7685 PrintDbInformation(CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html)
7686 {
7687    return PrintDbInformationWithRID(database, is_aa, line_length,
7688                                     outfp, html, NULL, FALSE);
7689 }
7690 
7691 /** Common Index Stuff **/
7692 
7693 /* Parse DB configuration file */
7694 
/* Capacity, in characters, of the line and token buffers used by the
   configuration-file helpers that follow (getLine, parseString,
   ParseDBConfigFile). */
#define    MAX_LINE_LENGTH    1024

/* Token kinds for the configuration-file parser.
   NOTE(review): none of the parsing helpers defined right after this enum
   reference these values -- confirm whether the type is still used. */
typedef enum {
    lexIGNORE,
    lexINT,
    lexSTRING,
    lexBOOL,
    lexEOF
} LexTokens;
7704 
getLine(FILE * fp,CharPtr buf)7705 static CharPtr    getLine (FILE *fp, CharPtr buf)
7706 {
7707     buf[0] = '\0';
7708     while (!buf || (buf[0] == '#') || (buf[0] == '\0') || (buf[0] == '\n')) {
7709     FileGets(buf, MAX_LINE_LENGTH, fp);
7710     }
7711     return buf;
7712 }
7713 
parseInt(CharPtr buf)7714 static Int4    parseInt(CharPtr buf)
7715 {
7716     Int4    retval;
7717     long    my_long;
7718 
7719     sscanf(buf, "%ld", &my_long);
7720     retval = my_long;
7721 
7722     return retval;
7723 }
7724 
parseString(CharPtr buf)7725 static CharPtr    parseString(CharPtr buf)
7726 {
7727     CharPtr    retval = MemNew(sizeof(Char) * MAX_LINE_LENGTH);
7728 
7729     sscanf(buf, "%s", retval);
7730     return retval;
7731 }
7732 
parseBool(CharPtr buf)7733 static Boolean    parseBool(CharPtr buf)
7734 {
7735     Boolean    retval;
7736     CharPtr    str = parseString(buf);
7737 
7738     if ((!StrCmp(str, "true")) || (!StrCmp(str, "True")) || (!StrCmp(str, "TRUE")) ||
7739         (!StrCmp(str, "t")) || (!StrCmp(str, "T")) ||
7740         (!StrCmp(str, "1")) ||
7741         (!StrCmp(str, "y")) || (!StrCmp(str, "Y")))
7742     retval = TRUE;
7743     else
7744     retval = FALSE;
7745 
7746     str = MemFree(str);
7747     return retval;
7748 }
7749 
ParseDBConfigFile(DataBaseIDPtr * dbidsp,CharPtr path)7750 Int2    ParseDBConfigFile(DataBaseIDPtr *dbidsp, CharPtr path)
7751 {
7752     Int2        number_of_DBs = 0, i;
7753     FILE        *fp;
7754     DataBaseIDPtr    retval;
7755     Char        buf[MAX_LINE_LENGTH], name[MAX_LINE_LENGTH];
7756     Char        dbid[MAX_LINE_LENGTH], isprot[MAX_LINE_LENGTH];
7757     Char        full_filename[PATH_MAX];
7758 
7759     /* open config file */
7760     if (path && StrCmp(path, "")) {
7761     sprintf(full_filename, "%s%s%s", path, DIRDELIMSTR, DB_CONFIG_FN);
7762     } else {
7763     sprintf(full_filename, "%s", DB_CONFIG_FN);
7764     }
7765 
7766     if (!(fp = FileOpen(full_filename, "r")))
7767     return 0;
7768 
7769     getLine(fp, buf);
7770 
7771     /* first line is number of databases */
7772     number_of_DBs = parseInt(buf);
7773 
7774     /* allocate that much memory */
7775     retval = (DataBaseIDPtr) MemNew(sizeof(DataBaseID) * number_of_DBs);
7776 
7777     /* each next line is contains name, id and type of a DB */
7778     for (i=0; i < number_of_DBs; i++) {
7779     getLine(fp, buf);
7780     sscanf(buf, "%s%s%s", name, dbid, isprot);
7781     (retval+i)->name   = parseString(name);
7782     (retval+i)->id     = parseInt(dbid);
7783     (retval+i)->isprot = parseBool(isprot);
7784     }
7785 
7786     FILECLOSE(fp);
7787     *dbidsp = retval;
7788     return number_of_DBs;
7789 }
7790 
7791 /* ---------------------------------------------------------------------*/
/* ------- Here is the set of functions that are used in formatdb ------ */
7793 /* ---------------------------------------------------------------------*/
7794 
7795 #define STRLENGTH     4096
7796 #define INDEX_INIT_SIZE 1024
7797 #define INDEX_ARRAY_CHUNKS 100000
7798 
7799 #define LOOKUP_CHUNK   5
7800 #define LOOKUP_SIZE    12
7801 #define LOOKUP_ID_SIZE 8
7802 
7803 #define FORMATDB_SIZE 4
7804 #define ID_MAX_SIZE   64
7805 
7806 #define LOOKUP_NO_ERROR  0
7807 #define ERR_GI_FAILED    1
7808 #define ERR_SEQID_FAILED 2
7809 
7810 #define NON_SEQID_PREFIX "gnl|BL_ORD_ID|"
7811 #define CREATE_DEFLINE_INDEX 1
7812 
7813 #define SEQID_FIELD   1
7814 #define ACCN_FIELD    2
7815 #define DEFLINE_FIELD 4
7816 /* Size of variable that is manipulated, and swapped
7817    for big/little endian stuff. */
7818 
7819 static Boolean
FormatDbUint4Write(Uint4 number,FILE * fp)7820 FormatDbUint4Write(Uint4 number, FILE *fp)
7821 
7822 {
7823   Uint4 value;
7824 
7825   /* If FORMATDB_SIZE changes, this must be changed. */
7826   value = Nlm_SwapUint4(number);
7827   if (FileWrite(&(value), FORMATDB_SIZE, 1, fp) != (Uint4) 1)
7828     return FALSE;
7829 
7830   return TRUE;
7831 }
7832 
7833 
7834 static Boolean
FormatDbUint8Write(Uint8 value,FILE * fp)7835 FormatDbUint8Write(Uint8 value, FILE *fp)
7836 {
7837     Uint1Ptr bytes;
7838 
7839     if((bytes =  Uint8ToBytes(value)) == NULL)
7840         return FALSE;
7841 
7842     if(FileWrite(bytes, 8, 1, fp) != (Uint4) 1) {
7843         MemFree(bytes);
7844         return FALSE;
7845     }
7846 
7847     MemFree(bytes);
7848     return TRUE;
7849 }
7850 
7851 static Int8
FormatDbUint8Read(NlmMFILEPtr mfp)7852 FormatDbUint8Read(NlmMFILEPtr mfp)
7853 {
7854     Int8 value;
7855     Uint1 bytes[8];
7856 
7857     NlmReadMFILE((Uint1Ptr) bytes, 8, 1, mfp);
7858 
7859     value = (Int8) BytesToUint8(bytes);
7860 
7861     return value;
7862 }
7863 
FASTALookupNew(void)7864 static FASTALookupPtr FASTALookupNew(void) {
7865   FASTALookupPtr lookup;
7866 
7867   if((lookup = (FASTALookupPtr)MemNew(sizeof(FASTALookup))) == NULL)
7868     return NULL;
7869   if((lookup->table = (Int4Ptr)MemNew(LOOKUP_CHUNK*4)) == NULL)
7870     return NULL;
7871 
7872   lookup->allocated = LOOKUP_CHUNK;
7873   lookup->used = 0;
7874   return lookup;
7875 }
/* Release a FASTALookup and its table.  A NULL argument is ignored. */
static void FASTALookupFree(FASTALookupPtr lookup)
{
  if (lookup == NULL)
    return;

  MemFree(lookup->table);
  MemFree(lookup);
}
7881 
7882 /* ---------------------------------------------------------------------*/
/* Here is the set of functions for creating the taxonomy info database */
7884 /* ---------------------------------------------------------------------*/
7885 
7886 
7887 
7888 /*******************************************************************************
7889  * Initializing FormatDB structure (see formatdb.h),
7890  *******************************************************************************
7891  * Parameters:
7892  *    dbname        - name of the input file
7893  *    isProtein    - true, if file with protein seqs
7894  *
7895  * Returns pointer to allocated FormatDB structure (FormatDBPtr)
7896  *
7897  ******************************************************************************/
7898 
FDBOptionsNew(CharPtr input,Boolean is_prot,CharPtr title,Boolean is_asn,Boolean is_asn_bin,Boolean is_seqentry,Boolean sparse_idx,Boolean test_non_unique,Boolean parse_deflines,CharPtr basename,CharPtr alias_file_name,Int8 bases_per_volume,Int4 seqs_per_volume,Int4 version,Boolean dump_info,EFDBCleanOpt clean_opt)7899 FDB_optionsPtr FDBOptionsNew(CharPtr input, Boolean is_prot, CharPtr title,
7900         Boolean is_asn, Boolean is_asn_bin, Boolean is_seqentry, Boolean
7901         sparse_idx, Boolean test_non_unique, Boolean parse_deflines,
7902         CharPtr basename, CharPtr alias_file_name, Int8 bases_per_volume,
7903         Int4 seqs_per_volume, Int4 version, Boolean dump_info, EFDBCleanOpt
7904         clean_opt)
7905 {
7906     FDB_optionsPtr options = NULL;
7907 
7908     if ((input == NULL || input[0] == '\0') && alias_file_name != NULL) {
7909         ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: input file needed");
7910         return NULL;
7911     }
7912 
7913     if (!SeqEntryLoad()) {
7914         ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: SeqEntryLoad failed");
7915         return NULL;
7916     }
7917     if (!fdlobjAsnLoad()) {
7918         ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: fdlobjAsnLoad failed");
7919         return NULL;
7920     }
7921     UseLocalAsnloadDataAndErrMsg();
7922 
7923     if ((options = (FDB_optionsPtr)MemNew(sizeof(FDB_options))) == NULL) {
7924         ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: Out of memory");
7925         return NULL;
7926     }
7927 
7928     options->db_file = StringSave(input);
7929     if (!title)
7930         options->db_title = basename ?
7931             StringSave(basename) : StringSave(options->db_file);
7932     else
7933         options->db_title = StringSave(title);
7934     options->is_protein = is_prot;
7935     options->parse_mode = parse_deflines;
7936     if (!basename)
7937         options->base_name = StringSave(options->db_file);
7938     else
7939         options->base_name = StringSave(basename);
7940 
7941     if (!alias_file_name)
7942         options->alias_file_name = StringSave(options->base_name);
7943     else
7944         options->alias_file_name = StringSave(alias_file_name);
7945 
7946     /*
7947      * If specified, set bases_per_volume. If not, set it to
7948      * SEQFILE_SIZE_DFL if nucleotide, or SEQFILE_SIZE_DFL/4 if protein.
7949      */
7950 
7951     options->bases_in_volume = (bases_per_volume <= 0) ? SEQFILE_SIZE_DFL : bases_per_volume;
7952 
7953     if (is_prot)
7954         options->bases_in_volume /= 4;
7955 
7956     options->sequences_in_volume = (seqs_per_volume < 0) ? 0 : seqs_per_volume;
7957 
7958     if ((options->version = version) == 0)
7959         options->version = FORMATDB_VER; /* default version */
7960 
7961     options->isASN = is_asn;
7962     options->asnbin = is_asn_bin;
7963     options->is_seqentry = is_seqentry;
7964     options->sparse_idx = sparse_idx;
7965     options->test_non_unique = test_non_unique;
7966     options->total_num_of_seqs = 0;
7967     if (clean_opt >= 0 && clean_opt < eCleanOptMax)
7968         options->clean_opt = clean_opt;
7969     else
7970         options->clean_opt = (EFDBCleanOpt) 0;
7971 
7972     /* The following options are for NCBI use only */
7973     options->dump_info = dump_info;
7974     options->linkbit_listp = NULL;
7975     options->memb_tblp = NULL;
7976     options->memb_argp = NULL;
7977     options->tax_lookup = NULL;
7978 
7979     return options;
7980 }
7981 
7982 /* Recursively remove a blast database specified by base_name. dbtype must be
7983  * either 'p' or 'n' (lowercase) to denote a protein or nucleotide database
7984  * respectively. */
FDBCleanUpRecursively(CharPtr base_name,Char dbtype)7985 Boolean FDBCleanUpRecursively(CharPtr base_name, Char dbtype)
7986 {
7987     Char filenamebuf[FILENAME_MAX];
7988     ReadDBAliasPtr rdbap = NULL;
7989     Boolean done = FALSE;
7990     CharPtr p = NULL;
7991 
7992     /* Handle alias files */
7993     sprintf(filenamebuf, "%s.%cal", base_name, dbtype);
7994     rdbap = readdb_read_alias_file(filenamebuf);
7995 
7996     if (rdbap && (CheckForRecursion(filenamebuf, rdbap->dblist) == FALSE)) {
7997 
7998         p = rdbap->dblist;
7999         while (!done) {
8000             done = readdb_parse_db_names(&p, filenamebuf);
8001             if (*filenamebuf == NULLB)
8002                 break;
8003             FDBCleanUpRecursively(filenamebuf, dbtype);
8004         }
8005         sprintf(filenamebuf, "%s.%cal", base_name, dbtype);
8006         FileRemove(filenamebuf); /* alias file */
8007         rdbap = ReadDBAliasFree(rdbap);
8008         ErrLogPrintf("Removed %s\n",filenamebuf);
8009 
8010     } else { /* Single-volume blast database */
8011 
8012         sprintf(filenamebuf, "%s.%cin", base_name, dbtype);
8013         FileRemove(filenamebuf); /* index file */
8014         sprintf(filenamebuf, "%s.%chr", base_name, dbtype);
8015         FileRemove(filenamebuf); /* header file */
8016         sprintf(filenamebuf, "%s.%csq", base_name, dbtype);
8017         FileRemove(filenamebuf); /* sequence file */
8018         sprintf(filenamebuf, "%s.%csi", base_name, dbtype);
8019         FileRemove(filenamebuf); /* string isam index file */
8020         sprintf(filenamebuf, "%s.%csd", base_name, dbtype);
8021         FileRemove(filenamebuf); /* string isam data file */
8022         sprintf(filenamebuf, "%s.%cni", base_name, dbtype);
8023         FileRemove(filenamebuf); /* numeric isam index file */
8024         sprintf(filenamebuf, "%s.%cnd", base_name, dbtype);
8025         FileRemove(filenamebuf); /* numeric isam data file */
8026         if (dbtype == 'p') {
8027             sprintf(filenamebuf, "%s.ppi", base_name);
8028             FileRemove(filenamebuf); /* PIG isam index file */
8029             sprintf(filenamebuf, "%s.ppd", base_name);
8030             FileRemove(filenamebuf); /* PIG isam data file */
8031         }
8032         sprintf(filenamebuf, "%s.%cti", base_name, dbtype);
8033         FileRemove(filenamebuf); /* deprecated taxonomy index file */
8034         sprintf(filenamebuf, "%s.%ctd", base_name, dbtype);
8035         FileRemove(filenamebuf); /* deprecated taxonomy data file */
8036         sprintf(filenamebuf, "%s.%ctm", base_name, dbtype);
8037         FileRemove(filenamebuf); /* formatdb temporary file */
8038         sprintf(filenamebuf, "%s.%cdi", base_name, dbtype);
8039         FileRemove(filenamebuf); /* formatdb dump info file (NCBI only) */
8040         ErrLogPrintf("Removed single-volume database %s\n",base_name);
8041 
8042     }
8043     return TRUE;
8044 }
8045 
8046 /* Before creating any files, check if there are any blast database files
8047  * that might collide with the one about to be created.
8048  * Returns FALSE only if user does not want to proceed. */
FDBCleanUp(FDB_optionsPtr options)8049 Boolean FDBCleanUp(FDB_optionsPtr options)
8050 {
8051     Boolean alias_file_exists = FALSE, index_file_exists = FALSE;
8052     Char filenamebuf[FILENAME_MAX] = { NULLB };
8053     MsgAnswer ans;
8054     Char dbtype;
8055 
8056     if (!options || options->clean_opt == eCleanNever)
8057         return TRUE;
8058 
8059     dbtype = options->is_protein ? 'p' : 'n';
8060 
8061     /* First look for an alias file */
8062     sprintf(filenamebuf, "%s.%cal", options->base_name, dbtype);
8063     if (FileLengthEx(filenamebuf) != -1)
8064         alias_file_exists = TRUE;
8065 
8066     /* Now try an index file */
8067     memset((void*) &filenamebuf, 0, sizeof(filenamebuf));
8068     sprintf(filenamebuf, "%s.%cin", options->base_name, dbtype);
8069     if (FileLength(filenamebuf) != -1)
8070         index_file_exists = TRUE;
8071 
8072     /* nothing to remove ? */
8073     if (!alias_file_exists && !index_file_exists)
8074         return TRUE;
8075 
8076     switch (options->clean_opt) {
8077     case eCleanPrompt:
8078 #ifdef OS_UNIX
8079         if (!StringCmp(options->db_file, "stdin")) {
8080             ErrPostEx(SEV_ERROR, 0, 0, "Cannot prompt for answer if "
8081                     "input to format is stdin");
8082             return FALSE;
8083         }
8084 #endif
8085         ans = Message(KEY_YNC, "Would you like to clean up %s.* files?",
8086             options->base_name);
8087         if (ans == ANS_NO || ans == ANS_CANCEL) {
8088             ErrLogPrintf("User cancelled formatting.\n");
8089             return FALSE;
8090         } /* else fall through and clean up ! */
8091     case eCleanAlways:
8092     default:
8093         FDBCleanUpRecursively(options->base_name, dbtype);
8094         break;
8095     }
8096 
8097     return TRUE;
8098 }
8099 
8100 /* Deletes all volumes of a BLAST databases which is "in progress" (i.e.: being
8101  * built). This is necessary for proper clean up in case of errors, specially
8102  * if the maximum number of volumes is reached. */
FDBCleanUpInProgress(const FDB_options * options)8103 void FDBCleanUpInProgress(const FDB_options* options)
8104 {
8105     int volume = 0;
8106     char base_name[FILENAME_MAX] = { '\0' };
8107 
8108     ASSERT(options);
8109     ASSERT(options->volume > 1);
8110 
8111     StringNCpy(base_name, options->base_name, StrLen(options->base_name) - 3);
8112 
8113     for (volume = 0; volume < options->volume; volume++) {
8114         FDB_options opts_tmp;
8115         memcpy((void*)&opts_tmp, (void*)options, sizeof(*options));
8116         opts_tmp.base_name = (char*)MemNew(FILENAME_MAX);
8117         sprintf(opts_tmp.base_name, "%s.%02d", base_name, volume);
8118         opts_tmp.clean_opt = eCleanAlways;
8119         FDBCleanUp(&opts_tmp);
8120         free(opts_tmp.base_name);
8121     }
8122 }
8123 
8124 /* Initialize the formatdb structure.
8125  * Taxonomy databases, link and membership tables should be initialized in the
8126  * options structure, by separate functions */
FormatDBInit(FDB_optionsPtr options)8127 FormatDBPtr FormatDBInit(FDB_optionsPtr options)
8128 {
8129 
8130     FormatDBPtr        fdbp;
8131     Char        filenamebuf[FILENAME_MAX];
8132     Uint4        i = 0;
8133 
8134     if(options == NULL)
8135         return NULL;
8136 
8137     if(options->db_file == NULL)
8138     {
8139         ErrPostEx(SEV_ERROR, 0, 0, "No database name was specified");
8140         return NULL;
8141     }
8142 
8143     fdbp = (FormatDBPtr) MemNew (sizeof(*fdbp));
8144 
8145     fdbp->num_of_seqs = 0;
8146     fdbp->TotalLen=0, fdbp->MaxSeqLen=0;
8147 
8148     fdbp->options = options;
8149 
8150     /* The next 2 fields are set in FDBOptionsNew, but kept for older apps
8151      * that don't use that function */
8152     if (options->version == 0)
8153         fdbp->options->version = FORMATDB_VER;
8154 
8155     /* If basename is NULL, use dbname. */
8156     if (options->base_name == NULL)
8157         options->base_name = StringSave(options->db_file);
8158 
8159     fdbp->fd = NULL;
8160     fdbp->aip = NULL;
8161 
8162     /* Clean up if necessary */
8163     if (!FDBCleanUp(options))
8164         return NULL;
8165 
8166     /* open output BLAST files */
8167 
8168     /* Defline file */
8169 
8170     sprintf(filenamebuf, "%s.%chr",
8171             options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8172 
8173     if (options->version > FORMATDB_VER_TEXT) {
8174         fdbp->aip_def = AsnIoOpen(filenamebuf, "wb");
8175     } else {
8176         fdbp->fd_def = FileOpen(filenamebuf, "wb");
8177     }
8178 
8179     /* Sequence file */
8180 
8181     sprintf(filenamebuf, "%s.%csq",
8182             options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8183     fdbp->fd_seq = FileOpen(filenamebuf, "wb");
8184 
8185     if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1) /* Sequence file started from NULLB */
8186     return NULL;
8187 
8188     /* Index file */
8189 
8190     sprintf(filenamebuf, "%s.%cin",
8191             options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8192     fdbp->fd_ind = FileOpen(filenamebuf, "wb");
8193 
8194     /* Misc. info dump file */
8195 
8196     if(options->dump_info) {
8197         sprintf(filenamebuf, "%s.%cdi",
8198                 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8199         fdbp->fd_sdi = FileOpen(filenamebuf, "wb");
8200     }
8201 
8202     /* String (accession) index temporary file */
8203 
8204     fdbp->fd_stmp = NULL;
8205 
8206     if(options->parse_mode) {
8207         sprintf(filenamebuf, "%s.%ctm",
8208                 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8209         fdbp->fd_stmp = FileOpen(filenamebuf, "wb");
8210     }
8211     ErrLogPrintf("Version %s [%s]\n", BlastGetVersionNumber(), BlastGetReleaseDate());
8212     ErrLogPrintf("Started database file \"%s\"\n", options->db_file);
8213     /* Allocating space for offset tables */
8214     fdbp->OffsetAllocated = INDEX_INIT_SIZE; /* initial value */
8215     fdbp->DefOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8216     fdbp->SeqOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8217 
8218     if (!fdbp->DefOffsetTable || !fdbp->SeqOffsetTable) {
8219         ErrLogPrintf("Not enough memory to initialize main formatdb structure. Formatting failed.\n");
8220         return NULL;
8221     }
8222 
8223     if(!options->is_protein) {
8224         fdbp->AmbOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8225     if (!fdbp->AmbOffsetTable) {
8226         ErrLogPrintf("Not enough memory to initialize main formatdb structure. Formatting failed.\n");
8227         return NULL;
8228     }
8229     } else {
8230         fdbp->AmbOffsetTable = NULL;
8231     }
8232 
8233 
8234     /* Allocating space for lookup table */
8235 
8236     if((fdbp->lookup = FASTALookupNew()) == NULL) {
8237         ErrLogPrintf("Error initializing Lookup structure. Formatting failed.\n");
8238         return NULL;
8239     }
8240 
8241     /* Allocate the PIG table structure */
8242     if ( !(fdbp->ptable = FDBPigTableNew())) {
8243         ErrLogPrintf("Not enough memory to allocate PIG table structure.\n");
8244         return NULL;
8245     }
8246 
8247     return fdbp;
8248 }
8249 
FDBLoadMembershipsTable(void)8250 ValNodePtr FDBLoadMembershipsTable(void)
8251 {
8252     ValNodePtr retval = NULL;
8253     MembInfoPtr mip = NULL;
8254     Int2 nbits, bit;
8255     Char buffer[256], numstr[256];
8256 
8257     /* Get the number of bits used according to the config file */
8258     nbits = GetAppParamInt2("formatdb","MembershipBitNumbers","TotalNum",0);
8259     if (nbits <= 0) {
8260         return NULL;
8261     }
8262 
8263     /* For each bit, load the appropriate criteria function */
8264     for (bit = 1; bit <= nbits; bit++) {
8265         const char* fn_name = NULL;
8266 
8267         memset((void*) &buffer, 0, sizeof(buffer));
8268         memset((void*) &numstr, 0, sizeof(numstr));
8269         Int8ToString((Int8)bit,numstr,sizeof(numstr));
8270         GetAppParam("formatdb","MembershipBitNumbers",numstr,"",buffer,
8271                 sizeof(buffer)-1);
8272         if (!mip) {
8273             mip = (MembInfoPtr) MemNew(sizeof(MembInfo));
8274             mip->criteria = NULL;
8275         }
8276         mip->bit_number = bit;
8277 
8278         if (!StringICmp("swissprot",buffer)) {
8279             mip->criteria = is_SWISSPROT;
8280             fn_name = "is_SWISSPROT";
8281         } else if (!StringICmp("pdb",buffer)) {
8282             mip->criteria = is_PDB;
8283             fn_name = "is_PDB";
8284         } else if (!StringICmp("refseq_genomic",buffer)) {
8285             mip->criteria = is_REFSEQ_GENOMIC;
8286             fn_name = "is_REFSEQ_GENOMIC";
8287         } else if (!StringICmp("refseq_rna",buffer)) {
8288             mip->criteria = is_REFSEQ_RNA;
8289             fn_name = "is_REFSEQ_RNA";
8290         } else if (!StringICmp("refseq_protein",buffer) ||
8291                    !StringICmp("refseq_chromosome", buffer)) {
8292             /* refseq_chromosome added per BD-308 */
8293             mip->criteria = is_REFSEQ;
8294             fn_name = "is_REFSEQ";
8295         }
8296 
8297         /* Add to the return value only if the criteria is set */
8298         if (mip->criteria != NULL) {
8299             ValNodeAddPointer(&retval,0,mip);
8300             mip = NULL;
8301             /*
8302             ErrLogPrintf("Membership bit %d: criteria for '%s' determined "
8303                          "by function '%s'\n", bit, buffer, fn_name);
8304                          */
8305         }
8306     }
8307     if (mip && mip->criteria == NULL)
8308         MemFree(mip);
8309 
8310     return retval;
8311 }
8312 
FDBLoadLinksTable(void)8313 ValNodePtr FDBLoadLinksTable(void)
8314 {
8315     ValNodePtr retval = NULL;
8316     Int4ListPtr gis = NULL;
8317     LinkInfoPtr lk_info = NULL;
8318     Int2 nbits, bit, nlists = 0;
8319     Char buffer[256], numstr[256], filename[FILENAME_MAX];
8320 
8321     /* Get the number of bits used according to the config file */
8322     nbits = GetAppParamInt2("formatdb","LinkBitNumbers","TotalNum",0);
8323     if (nbits <= 0) {
8324         return NULL;
8325     }
8326 
8327     /* For each bit and database, open the appropriate files and create the
8328      * gi lists */
8329     for (bit = 1; bit <= nbits; bit++) {
8330         memset((void*) numstr, 0, sizeof(numstr));
8331         memset((void*) buffer, 0, sizeof(buffer));
8332         memset((void*) filename, 0, sizeof(filename));
8333 
8334         Int8ToString((Int8)bit,numstr,sizeof(numstr));
8335         GetAppParam("formatdb", "LinkBitNumbers", numstr, "", buffer,
8336                 sizeof(buffer)-1);
8337         GetAppParam("formatdb", "LinkFiles", buffer, "", filename,
8338                 sizeof(filename)-1);
8339         if (StrLen(filename) == 0 || FileLength(filename) == 0) {
8340             ErrPostEx(SEV_WARNING,0,0,"Ignoring '%s' listing because it is "
8341                       "empty", buffer);
8342             continue;
8343         }
8344         if ((gis = Int4ListReadFromFile(filename)) == NULL) {
8345             ErrPostEx(SEV_ERROR,0,0,"Could not read %s", filename);
8346             continue;
8347         }
8348         HeapSort(gis->i, gis->count, sizeof(Int4), ID_Compare);
8349         lk_info = (LinkInfoPtr) MemNew(sizeof(LinkInfo));
8350         lk_info->bit_number = bit;
8351         lk_info->gi_list = gis;
8352         ValNodeAddPointer(&retval,0,lk_info);
8353         nlists++;
8354         ErrLogPrintf("Link bit %d: %ld gis from %s\n", bit, gis->count,
8355                 filename);
8356     }
8357 
8358     return retval;
8359 }
8360 
8361 /* This function will build (or create if needed) a chain of ValNode's
8362  * containing integers, which act as a large bit array, and set the
8363  * indicated bit. */
FDBBlastDefLineSetBit(Int2 bit_no,ValNodePtr PNTR retval)8364 void FDBBlastDefLineSetBit(Int2 bit_no, ValNodePtr PNTR retval)
8365 {
8366     Int4 bit_offset = 0, bit_mask = 0, i;
8367     Int4 currValNode = 0;
8368     ValNodePtr tmp = NULL;
8369 
8370     if (bit_no <= 0 || retval == NULL)
8371         return;
8372 
8373     bit_offset = (bit_no-1) % MASK_WORD_SIZE;
8374     currValNode = (Int4) ((bit_no-1)/MASK_WORD_SIZE);
8375 
8376     /* Allocate nodes if necessary */
8377     while (ValNodeLen(*retval) <= currValNode) {
8378         if (*retval == NULL) {
8379             (*retval) = ValNodeAddInt(NULL,0,0);
8380         } else {
8381             ValNodeAddInt(retval,0,0);
8382         }
8383     }
8384 
8385     /* Traverse the linked list of ValNodePtrs and use the bit_mask
8386      * in the appropriate node */
8387     bit_mask = 0x1 << bit_offset;
8388 
8389     tmp = *retval;
8390     for (i = 0; i < currValNode; i++)
8391         tmp = tmp->next;
8392 
8393     tmp->data.intvalue |= bit_mask;
8394 }
8395 
8396 static void
8397 s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,
8398                           const FDBTaxidDeflineTablePtr taxid_tbl);
8399 
FDBGetDefAsnFromBioseq(BioseqPtr bsp,const FDBTaxidDeflineTablePtr taxid_tbl)8400 BlastDefLinePtr FDBGetDefAsnFromBioseq(BioseqPtr bsp,
8401                                        const FDBTaxidDeflineTablePtr taxid_tbl)
8402 {
8403     BlastDefLinePtr bdp = NULL, bdp_last, bdp_head;
8404     CharPtr title, chptr, orig_title;
8405 
8406     if(bsp == NULL)
8407         return NULL;
8408 
8409     bdp = BlastDefLineNew();
8410     bdp_head = bdp;
8411 
8412     bdp->seqid = SeqIdSetDup(bsp->id);
8413     title = BioseqGetTitle(bsp);
8414 
8415     orig_title = title = StringSave(title);
8416 
8417     chptr = NULL;
8418     if((chptr = StringChr(title, '\1')) != NULL) {
8419         *chptr = NULLB;
8420         chptr++;
8421     }
8422     bdp->title = StringSave(title);
8423     bdp_last = bdp;
8424 
8425 
8426     while(chptr != NULL) {
8427 
8428         bdp = BlastDefLineNew();
8429 
8430         title = chptr;
8431 
8432         if((chptr = StringChr(title, ' ')) != NULL) {
8433             *chptr = NULLB;
8434             chptr++;
8435         }
8436         bdp->seqid = SeqIdParse(title);
8437         title = chptr;
8438 
8439         if((chptr = StringChr(title, '\1')) != NULL) {
8440             *chptr = NULLB;
8441             chptr++;
8442         }
8443         if(title != NULL)
8444             bdp->title = StringSave(title);
8445         else
8446             bdp->title = StringSave("No definition found");
8447 
8448         bdp_last->next = bdp;
8449         bdp_last = bdp;
8450     }
8451 
8452     MemFree(orig_title);
8453     s_FDBUpdateTaxIdInBdpList(bdp_head, taxid_tbl);
8454     return bdp_head;
8455 }
8456 
8457 Int4 LIBCALL
Int4ListBSearch(Int4ListPtr lp,Int4 key)8458 Int4ListBSearch PROTO((Int4ListPtr lp, Int4 key))
8459 {
8460     Int4 m, b, e;
8461 
8462     if (!lp)
8463         return -1;
8464 
8465     b = 0;
8466     e = lp->count-1;
8467 
8468     while (b <= e) {
8469         m = (b + e) / 2;
8470         if (lp->i[m] == key)
8471             return m;
8472         else if (lp->i[m] < key)
8473             b = m + 1;
8474         else
8475             e = m - 1;
8476     }
8477     return -1;
8478 }
8479 
FDBAddLinksInformation(BlastDefLinePtr bdp,ValNodePtr links_tblp)8480 Boolean FDBAddLinksInformation(BlastDefLinePtr bdp, ValNodePtr links_tblp)
8481 {
8482     ValNodePtr link_vnp = NULL, vnp_list = NULL;
8483     SeqIdPtr sip = NULL;
8484     Int4 gi = 0;
8485 
8486     if (bdp == NULL || links_tblp == NULL)
8487         return FALSE;
8488 
8489     /* Extract the gi from the bdp */
8490     if ((sip = SeqIdFindBest(bdp->seqid, SEQID_GI)) == NULL)
8491         return FALSE;
8492     gi = sip->data.intvalue;
8493 
8494     for (vnp_list = links_tblp; vnp_list; vnp_list = vnp_list->next) {
8495 
8496         LinkInfoPtr lk_info = (LinkInfoPtr)vnp_list->data.ptrvalue;
8497         if (Int4ListBSearch(lk_info->gi_list, gi) != -1)
8498             FDBBlastDefLineSetBit(lk_info->bit_number, &link_vnp);
8499     }
8500 
8501     if (link_vnp)
8502         bdp->links = link_vnp;
8503 
8504     return TRUE;
8505 }
8506 
FDBAddMembershipInformation(BlastDefLinePtr bdp,ValNodePtr memb_tblp,VoidPtr criteria_arg)8507 Boolean FDBAddMembershipInformation(BlastDefLinePtr bdp, ValNodePtr memb_tblp,
8508                                     VoidPtr criteria_arg)
8509 {
8510     ValNodePtr memb_vnp = NULL;
8511     MembInfoPtr mip = NULL;
8512 
8513     if (bdp == NULL || memb_tblp == NULL)
8514         return FALSE;
8515 
8516     /* Set the appropriate bit if this sequence satisfies the criteria */
8517     while (memb_tblp) {
8518         mip = (MembInfoPtr) memb_tblp->data.ptrvalue;
8519         if (mip->criteria(criteria_arg))
8520             FDBBlastDefLineSetBit(mip->bit_number, &memb_vnp);
8521         memb_tblp = memb_tblp->next;
8522     }
8523 
8524     if (memb_vnp)
8525         bdp->memberships = memb_vnp;
8526 
8527     return TRUE;
8528 }
8529 
/* Dispose of a links table: the gi list of every LinkInfo is freed with
 * Int4ListFree first, then the whole list is handed to ValNodeFreeData.
 * Always returns NULL. */
ValNodePtr FDBDestroyLinksTable(ValNodePtr list)
{
    ValNodePtr node = list;

    while (node != NULL) {
        LinkInfoPtr info = (LinkInfoPtr) node->data.ptrvalue;

        info->gi_list = Int4ListFree(info->gi_list);
        node = node->next;
    }

    if (list != NULL)
        ValNodeFreeData(list);

    return NULL;
}
8546 
/* Dispose of a memberships table.
 *
 * The list is handed to ValNodeFreeData, the same call
 * FDBDestroyLinksTable uses to dispose of its list, so that the list
 * nodes are released along with the MembInfo records they point to
 * (previously only the records were freed and the nodes were leaked).
 * Always returns NULL. */
ValNodePtr FDBDestroyMembershipsTable(ValNodePtr tbl)
{
    if (tbl == NULL)
        return NULL;

    ValNodeFreeData(tbl);
    return NULL;
}
8558 
8559 
8560 #define REDUCED_E2INDEX_SET 1
8561 #ifdef REDUCED_E2INDEX_SET
8562 /*****************************************************************************
8563 *
8564 *   SeqIdE2Index(anp)
8565 *       atp is the current type (if identifier of a parent struct)
8566 *       if atp == NULL, then assumes it stands alone (SeqId ::=)
8567 *
8568 *****************************************************************************/
SeqIdE2Index(SeqIdPtr anp,FILE * fd,Int4 seq_num,Boolean sparse)8569 static Boolean SeqIdE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
8570                              Boolean sparse)
8571 {
8572     Boolean retval = FALSE;
8573     TextSeqIdPtr tsip = NULL;
8574     ObjectIdPtr oid;
8575     PDBSeqIdPtr psip;
8576     Uint1 tmptype;
8577     CharPtr tmp, ptr=NULL;
8578     Char buf[81];
8579     Int4 length, i;
8580     DbtagPtr dbt;
8581     Uint1 chain = 0;
8582     Int2 version = 0;
8583 
8584     if (anp == NULL)
8585         return FALSE;
8586 
8587     if (anp->choice == SEQID_GI)
8588         return TRUE; /* Do not index GI as string. */
8589 
8590     switch (anp->choice) {
8591 
8592     case SEQID_LOCAL:     /* local */
8593     oid = (ObjectIdPtr)(anp->data.ptrvalue);
8594     ptr = oid->str;
8595         break;
8596     case SEQID_GIBBSQ:    /* gibbseq */
8597         sprintf(buf, "%ld", (long)(anp->data.intvalue));
8598         ptr = buf;
8599         break;
8600     case SEQID_GIBBMT:    /* gibbmt */
8601         break;
8602     case SEQID_GIIM:      /* giimid */
8603         return TRUE;      /* not indexed */
8604     case SEQID_EMBL:      /* embl */
8605     case SEQID_DDBJ:      /* ddbj */
8606     case SEQID_GENBANK:   /* genbank */
8607     case SEQID_TPG:       /* Third Party Annot/Seq Genbank */
8608     case SEQID_TPE:       /* Third Party Annot/Seq EMBL */
8609     case SEQID_TPD:       /* Third Party Annot/Seq DDBJ */
8610     case SEQID_OTHER:     /* other */
8611     case SEQID_GPIPE:     /* genome pipeline */
8612         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8613     if ((tsip->version > 0) && (tsip->release == NULL))
8614         version = tsip->version;
8615         break;
8616     case SEQID_SWISSPROT: /* swissprot */
8617         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8618     if (tsip->version > 0)
8619         version = tsip->version;
8620         break;
8621     case SEQID_PIR:       /* pir   */
8622     case SEQID_PRF:       /* prf   */
8623         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8624         break;
8625     case SEQID_PATENT:    /* patent seq id */
8626         break;
8627     case SEQID_GENERAL:   /* general */
8628         dbt = (DbtagPtr)(anp->data.ptrvalue);
8629         ptr = dbt->tag->str;
8630         break;
8631     case SEQID_GI:        /* gi */
8632         break;
8633     case SEQID_PDB:       /* pdb   */
8634     psip = (PDBSeqIdPtr)(anp->data.ptrvalue);
8635     ptr = psip->mol;
8636         chain = psip->chain;
8637         break;
8638     }
8639 
8640     if(tsip == NULL) {
8641         SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8642 
8643         length = StringLen(buf);
8644         for(i = 0; i < length; i++)
8645             buf[i] = TO_LOWER(buf[i]);
8646 
8647         fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8648 
8649     }
8650 
8651     if (tsip == NULL && ptr != NULL) {   /* write a single string for non TextSeqIDPtr cases. */
8652     StringMove(buf, ptr);
8653         length = StringLen(buf);
8654         for(i = 0; i < length; i++)
8655             buf[i] = TO_LOWER(buf[i]);
8656         fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8657 
8658         chain = TO_LOWER(chain);
8659 
8660         if (chain != 0) { /* PDB only. */
8661             fprintf(fd, "%s|%c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8662                     (long) seq_num);
8663             fprintf(fd, "%s %c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8664                     (long) seq_num);
8665         }
8666     }
8667 
8668     if (tsip != NULL) {   /* separately index accession and locus */
8669                /* now index as separate strings */
8670         if (tsip->name != NULL) {
8671             StringMove(buf, tsip->name);
8672             length = StringLen(buf);
8673             for(i = 0; i < length; i++)
8674                 buf[i] = TO_LOWER(buf[i]);
8675             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8676         }
8677         if (tsip->accession != NULL) {
8678             StringMove(buf, tsip->accession);
8679             length = StringLen(buf);
8680             for(i = 0; i < length; i++)
8681                 buf[i] = TO_LOWER(buf[i]);
8682             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8683             if (version)
8684                 fprintf(fd, "%s.%d%c%ld\n", buf, version, ISAM_DATA_CHAR, (long) seq_num);
8685         }
8686     }
8687 
8688     retval = TRUE;
8689     return retval;
8690 }
8691 #else
8692 /*****************************************************************************
8693 *
8694 *   SeqIdE2Index(anp)
8695 *       atp is the current type (if identifier of a parent struct)
8696 *       if atp == NULL, then assumes it stands alone (SeqId ::=)
8697 *
8698 *****************************************************************************/
SeqIdE2Index(SeqIdPtr anp,FILE * fd,Int4 seq_num,Boolean sparse)8699 static Boolean SeqIdE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
8700                              Boolean sparse)
8701 {
8702     Boolean retval = FALSE;
8703     TextSeqIdPtr tsip = NULL;
8704     ObjectIdPtr oid;
8705     PDBSeqIdPtr psip;
8706     Boolean do_gb = FALSE;
8707     Uint1 tmptype;
8708     CharPtr tmp, ptr=NULL;
8709     Char buf[81];
8710     Int4 length, i;
8711     DbtagPtr dbt;
8712     Uint1 chain = 0;
8713     Int2 version = 0;
8714 
8715     if (anp == NULL)
8716         return FALSE;
8717 
8718     switch (anp->choice) {
8719 
8720     case SEQID_LOCAL:     /* local */
8721     oid = (ObjectIdPtr)(anp->data.ptrvalue);
8722     ptr = oid->str;
8723         break;
8724     case SEQID_GIBBSQ:    /* gibbseq */
8725         sprintf(buf, "%ld", (long)(anp->data.intvalue));
8726         ptr = buf;
8727         break;
8728     case SEQID_GIBBMT:    /* gibbmt */
8729         break;
8730     case SEQID_GIIM:      /* giimid */
8731         return TRUE;      /* not indexed */
8732     case SEQID_EMBL:      /* embl */
8733     case SEQID_DDBJ:      /* ddbj */
8734         do_gb = TRUE;     /* also index embl, ddbj as genbank */
8735     case SEQID_GENBANK:   /* genbank */
8736     case SEQID_TPG:       /* Third Party Annot/Seq Genbank */
8737     case SEQID_TPE:       /* Third Party Annot/Seq EMBL */
8738     case SEQID_TPD:       /* Third Party Annot/Seq DDBJ */
8739     case SEQID_OTHER:     /* other */
8740     case SEQID_GPIPE:     /* genome pipeline */
8741         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8742     if ((tsip->version > 0) && (tsip->release == NULL))
8743         version = tsip->version;
8744         break;
8745     case SEQID_SWISSPROT: /* swissprot */
8746         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8747     if (tsip->version > 0)
8748         version = tsip->version;
8749         break;
8750     case SEQID_PIR:       /* pir   */
8751     case SEQID_PRF:       /* prf   */
8752         tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8753         break;
8754     case SEQID_PATENT:    /* patent seq id */
8755         break;
8756     case SEQID_GENERAL:   /* general */
8757         dbt = (DbtagPtr)(anp->data.ptrvalue);
8758         ptr = dbt->tag->str;
8759         break;
8760     case SEQID_GI:        /* gi */
8761         break;
8762     case SEQID_PDB:       /* pdb   */
8763     psip = (PDBSeqIdPtr)(anp->data.ptrvalue);
8764     ptr = psip->mol;
8765         chain = psip->chain;
8766         break;
8767     }
8768 
8769     if(!sparse) {
8770         SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8771 
8772         length = StringLen(buf);
8773         for(i = 0; i < length; i++)
8774             buf[i] = TO_LOWER(buf[i]);
8775 
8776         fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8777 
8778         /* Index without version. */
8779         if (version) {
8780             tsip->version = 0;
8781             SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8782 
8783             length = StringLen(buf);
8784             for(i = 0; i < length; i++)
8785                buf[i] = TO_LOWER(buf[i]);
8786             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8787             tsip->version = version;
8788         }
8789     } /* if(!sparse) */
8790 
8791     if (ptr != NULL) {   /* write a single string */
8792     StringMove(buf, ptr);
8793         length = StringLen(buf);
8794         for(i = 0; i < length; i++)
8795             buf[i] = TO_LOWER(buf[i]);
8796         fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8797 
8798         chain = TO_LOWER(chain);
8799 
8800         if (chain != 0) { /* PDB only. */
8801             fprintf(fd, "%s|%c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8802                     (long) seq_num);
8803             fprintf(fd, "%s %c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8804                     (long) seq_num);
8805         }
8806     }
8807 
8808     if (tsip != NULL) {   /* separately index accession and locus */
8809         if ((tsip->accession != NULL) && (tsip->name != NULL) && !sparse) {
8810             tmp = tsip->accession;
8811             tsip->accession = NULL;
8812             SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8813             length = StringLen(buf);
8814             for(i = 0; i < length; i++)
8815                 buf[i] = TO_LOWER(buf[i]);
8816             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8817             tsip->accession = tmp;
8818             tmp = tsip->name;
8819             tsip->name = NULL;
8820             SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8821             length = StringLen(buf);
8822             for(i = 0; i < length; i++)
8823                 buf[i] = TO_LOWER(buf[i]);
8824             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8825             tsip->name = tmp;
8826         if (version)
8827         { /* Index accession without verison. */
8828         tsip->version = 0;
8829                 tmp = tsip->name;
8830                 tsip->name = NULL;
8831                 SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8832                 length = StringLen(buf);
8833                 for(i = 0; i < length; i++)
8834                         buf[i] = TO_LOWER(buf[i]);
8835                 fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8836                 tsip->name = tmp;
8837         tsip->version = version;
8838         }
8839         }
8840 
8841                /* now index as separate strings */
8842     if (tsip->name != NULL && !sparse) {
8843             StringMove(buf, tsip->name);
8844             length = StringLen(buf);
8845             for(i = 0; i < length; i++)
8846                 buf[i] = TO_LOWER(buf[i]);
8847             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8848     }
8849         if (tsip->accession != NULL) {
8850             StringMove(buf, tsip->accession);
8851             length = StringLen(buf);
8852             for(i = 0; i < length; i++)
8853                 buf[i] = TO_LOWER(buf[i]);
8854             fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8855         if (version && !sparse)
8856                 fprintf(fd, "%s.%d%c%ld\n", buf, version, ISAM_DATA_CHAR, (long) seq_num);
8857     }
8858 
8859     }
8860 
8861     if (do_gb && !sparse) {   /* index embl and ddbj as genbank */
8862         tmptype = anp->choice;
8863         anp->choice = SEQID_GENBANK;
8864         SeqIdE2Index(anp, fd, seq_num, sparse);
8865         anp->choice = tmptype;
8866     }
8867 
8868     retval = TRUE;
8869     return retval;
8870 }
8871 #endif
8872 
8873 /*****************************************************************************
8874  *
8875  *   SeqIdSetE2Index(anp, e2p, settype, elementtype)
8876  *
8877  *****************************************************************************/
SeqIdSetE2Index(SeqIdPtr anp,FILE * fd,Int4 seq_num,Boolean sparse)8878 static Boolean SeqIdSetE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
8879                                 Boolean sparse)
8880 {
8881     SeqIdPtr oldanp;
8882     Boolean retval = FALSE;
8883 
8884     if (anp == NULL)
8885         return FALSE;
8886 
8887     oldanp = anp;
8888 
8889     while (anp != NULL) {
8890         if (!SeqIdE2Index(anp, fd, seq_num, sparse))
8891             goto erret;
8892         anp = anp->next;
8893     }
8894 
8895     retval = TRUE;
8896 erret:
8897     return retval;
8898 }
/* Releases a whole chain of Seq-id nodes: for each node the payload is
 * freed with the type-specific Free routine selected by node->choice
 * (integer-valued choices carry no payload), then the node itself is
 * released with MemFree().  Always returns NULL so the caller can write
 * "sip = SeqIdSetFree_NO_OBJ_MGR(sip);". */
static SeqIdPtr SeqIdSetFree_NO_OBJ_MGR(SeqIdPtr sip)
{
    SeqIdPtr node, following;

    for (node = sip; node != NULL; node = following) {
        following = node->next;   /* saved before the node is released */

        switch (node->choice) {
        /* choices with nothing to free */
        case SEQID_GIBBSQ:
        case SEQID_GIBBMT:
        case SEQID_GI:
            break;

        case SEQID_LOCAL:
            ObjectIdFree(node->data.ptrvalue);
            break;

        case SEQID_GIIM:
            GiimFree(node->data.ptrvalue);
            break;

        /* every choice whose payload is a TextSeqId */
        case SEQID_GENBANK:
        case SEQID_EMBL:
        case SEQID_DDBJ:
        case SEQID_PIR:
        case SEQID_PRF:
        case SEQID_SWISSPROT:
        case SEQID_OTHER:
        case SEQID_TPG:
        case SEQID_TPE:
        case SEQID_TPD:
        case SEQID_GPIPE:
            TextSeqIdFree(node->data.ptrvalue);
            break;

        case SEQID_PATENT:
            PatentSeqIdFree(node->data.ptrvalue);
            break;

        case SEQID_GENERAL:
            DbtagFree(node->data.ptrvalue);
            break;

        case SEQID_PDB:
            PDBSeqIdFree(node->data.ptrvalue);
            break;
        }

        MemFree(node);
    }

    return NULL;
}
8945 
/* Parses the Seq-id at the start of every READDB_DEF_SEPARATOR-delimited
 * chunk of defline, records (gi, num_of_seqs) pairs in lookup->table for
 * chunks that carry a GI, and writes the string-index lines for all ids of
 * the chunk to fd_stmp via SeqIdSetE2Index().
 *
 *   defline     - one or more deflines joined by READDB_DEF_SEPARATOR;
 *                 NULL is a no-op.
 *   lookup      - GI table, grown in steps of LOOKUP_CHUNK entries.
 *   num_of_seqs - ordinal stored next to each GI / index key.
 *   fd_stmp     - stream receiving the string-index lines.
 *   ParseSeqid  - when FALSE nothing is done.
 *   sparse      - forwarded to SeqIdSetE2Index().
 *
 * Returns LOOKUP_NO_ERROR on success and ERR_SEQID_FAILED when an id
 * cannot be parsed, indexed, or stored.  Callers in this file test the
 * result with "!= LOOKUP_NO_ERROR".
 */
static Int4 UpdateLookupInfo(CharPtr defline,
                             FASTALookupPtr lookup,
                             Int4 num_of_seqs,
                             FILE *fd_stmp,
                             Boolean ParseSeqid,
                             Boolean sparse
                             )
{
    CharPtr p, d;
    Int4 i, gi;
    Char TextId[ID_MAX_SIZE+1];
    SeqIdPtr sip, sip_tmp;
    Int4Ptr grown;

    if(defline == NULL)
        return LOOKUP_NO_ERROR;

    if(!ParseSeqid)
        return LOOKUP_NO_ERROR;

    /* d always marks the start of the chunk being processed, so the
       separator search below never starts outside the string. */
    for(p = d = defline; ; d = p) {

        /* Copy the id: everything up to the first white space. */
        for(i = 0;
            *p != NULLB && !isspace((unsigned char)*p) && i < ID_MAX_SIZE;
            p++, i++)
            TextId[i] = *p;

        TextId[i] = NULLB;

        if((sip = SeqIdParse(TextId)) == NULL) {/* Bad SeqId string */
            ErrLogPrintf("Sequence id \"%s\" is not parseable. "
                         "Formating failed at %s\n", TextId, defline);
            return ERR_SEQID_FAILED;
        }

        /* Reset per chunk so that a chunk without a GI does not re-record
           the GI of an earlier chunk. */
        gi = 0;
        for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
            if(sip_tmp->choice == SEQID_GI) {
                gi = sip_tmp->data.intvalue;
                break;
            }
        }

        if(gi != 0) { /* GI found */

            if((lookup->used + 2) >= lookup->allocated) {
                /* Grow through a temporary so the existing table stays
                   reachable if the allocation fails. */
                grown = (Int4Ptr)Realloc(lookup->table,
                                         (lookup->allocated + LOOKUP_CHUNK)
                                         * (sizeof(Int4)));
                if(grown == NULL) {
                    ErrLogPrintf("Not enough memory to extend the GI "
                                 "lookup table. Formating failed at %s\n",
                                 defline);
                    SeqIdSetFree_NO_OBJ_MGR(sip);
                    return ERR_SEQID_FAILED;
                }
                lookup->table = grown;
                lookup->allocated += LOOKUP_CHUNK;
            }

            lookup->table[lookup->used] = gi;
            lookup->table[lookup->used+1] = num_of_seqs;
            lookup->used += 2;
        }

        if(!SeqIdSetE2Index (sip, fd_stmp, num_of_seqs, sparse)) {
            ErrLogPrintf("SeIdSetE2Index failed. Exiting..\n");
            /* Report a lookup status code (FALSE is not one of them) and
               release the parsed ids. */
            SeqIdSetFree_NO_OBJ_MGR(sip);
            return ERR_SEQID_FAILED;
        }

        sip = SeqIdSetFree_NO_OBJ_MGR(sip);

        /* Move on to the chunk after the next separator, if any. */
        if((p = StringChr(d, READDB_DEF_SEPARATOR)) == NULL)
            break;
        p++;
    }
    return LOOKUP_NO_ERROR;
}
/* Builds the string ISAM index of a database volume.
 *
 * The temporary key file "<FileName>.<t>tm" is sorted into the data file
 * "<FileName>.<t>sd" and then removed, and the index "<FileName>.<t>si" is
 * generated from the sorted data (<t> is 'p' when ProteinType is TRUE,
 * 'n' otherwise).
 *
 *   FileName        - base name of the volume.
 *   ProteinType     - selects the 'p' / 'n' extension prefix.
 *   sparse_idx      - forwarded to ISAMMakeIndex().
 *   test_non_unique - forwarded to ISAMSetCheckForNonUnique().
 *
 * Returns TRUE on success, FALSE on any failure.  The sort object and the
 * output stream are released on every path.
 */
static Boolean FormatdbCreateStringIndex(const CharPtr FileName,
                                         Boolean ProteinType,
                                         Int4 sparse_idx,
                                         Boolean test_non_unique)
{
    SORTObjectPtr sop;
    Char filenamebuf[FILENAME_MAX], DBName[FILENAME_MAX];
    FILE *fd_out;
    CharPtr files;
    ISAMErrorCode error;
    ISAMObjectPtr isamp;
    Int4 line_count = 0;
    Char type_char = ProteinType ? 'p' : 'n';

    /* Each name is FileName plus a 4-character extension and the NUL. */
    if (FileName == NULL ||
        StringLen(FileName) + 5 > sizeof(filenamebuf)) {
        ErrPostEx(SEV_ERROR, 0, 0,
                  "Database file name is missing or too long");
        return FALSE;
    }

    /*  object for unique sorting */

    if((sop = SORTObjectNew(NULL, '\0', 0,
                            FALSE, TRUE)) == NULL) {
        ErrPostEx(SEV_ERROR, 0, 0, "Failed to create SORT Object");
        return FALSE;
    }

    sprintf(filenamebuf, "%s.%ctm", FileName, type_char);
    sprintf(DBName, "%s.%csd", FileName, type_char);

    if((fd_out = FileOpen(DBName, "wb")) == NULL) {
        SORTObjectFree(sop);
        return FALSE;
    }
    files = filenamebuf;

    if (SORTFiles(&files, 1, fd_out, sop, &line_count) != SORTNoError) {
        ErrPostEx(SEV_ERROR, 0, 0, "SORTFiles failed, change TMPDIR to a partition with more free space or use -s option");
        SORTObjectFree(sop);
        FILECLOSE(fd_out);
        return FALSE;
    }
    SORTObjectFree(sop);

    FILECLOSE(fd_out);

    /* The unsorted temporary file is no longer needed. */
    FileRemove(filenamebuf);
    sprintf(filenamebuf, "%s.%csi", FileName, type_char);

    if((isamp = ISAMObjectNew(ISAMString, DBName, filenamebuf)) == NULL) {
        ErrPostEx(SEV_ERROR, 0, 0, "Creating of ISAM object failed");
        return FALSE;
    }

    ISAMSetDataSorted(isamp, line_count);

    ISAMSetCheckForNonUnique(isamp, test_non_unique);

    if((error = ISAMMakeIndex(isamp, 0, sparse_idx)) != ISAMNoError) {
        ErrPostEx(SEV_ERROR, 0, 0, "Creating of index failed with error code %ld\n", (long) error);
        ISAMObjectFree(isamp);
        return FALSE;
    }

    ISAMObjectFree(isamp);
    return TRUE;
}
9078 
9079 /* This function should expect only single defline - multiple deflines
9080    usually have multiple tax_ids etc. If this is not TRUE writting
9081    ASN.1 with '\1' will result in failure. Code below should be removed
9082    when processing of multiple deflines is done */
FDLCreateAsnDF(FormatDBPtr fdbp,CharPtr seq_id,CharPtr title,Int4 taxid)9083 BlastDefLinePtr FDLCreateAsnDF(FormatDBPtr fdbp, CharPtr seq_id,
9084                                CharPtr title, Int4 taxid)
9085 {
9086     CharPtr p, d = title, chptr;
9087     Int4 i;
9088     Char TextId[ID_MAX_SIZE+1];
9089     SeqIdPtr sip;
9090     BlastDefLinePtr bdp, bdp_head = NULL, bdp_last;
9091 
9092     if(title == NULL && seq_id == NULL) {
9093         ErrPostEx(SEV_ERROR,0,0,"Cannot create a BlastDefLine",
9094                 " structure without a seq_id and a title");
9095         return NULL;
9096     }
9097 
9098     for(p = d = title; ;d = p) {
9099 
9100         MemSet(TextId, 0, sizeof(TextId));
9101         chptr = NULL;
9102 
9103         if(fdbp->options->parse_mode == TRUE) {
9104 
9105             if(seq_id == NULL) {
9106                 for(i=0; !isspace((int)*p) && i < ID_MAX_SIZE; p++,i++)
9107                     TextId[i]=*p;
9108 
9109                 p++;  /* Next character after space */
9110 
9111                 if((sip = SeqIdParse(TextId)) == NULL) {/* Bad SeqId string */
9112                     ErrLogPrintf("Sequence id \"%s\" is not parseable. "
9113                                  "Formating failed at %s\n", TextId, title);
9114                     return NULL;
9115                 }
9116             } else {
9117                 sip = SeqIdParse(seq_id);
9118                 seq_id = NULL;
9119             }
9120         } else {
9121 
9122             DbtagPtr dbtagptr;
9123 
9124             sip = ValNodeNew(NULL);
9125             dbtagptr = DbtagNew();
9126             dbtagptr->tag = ObjectIdNew();
9127 
9128             sip->choice = SEQID_GENERAL;
9129             sip->data.ptrvalue = dbtagptr;
9130             dbtagptr->tag->id = fdbp->num_of_seqs;
9131             dbtagptr->db = StringSave("BL_ORD_ID");
9132         }
9133 
9134         if((chptr = StringChr(d, READDB_DEF_SEPARATOR)) != NULL)
9135             *chptr = NULLB;
9136 
9137         bdp = BlastDefLineNew();
9138         bdp->seqid = SeqIdSetDup(sip);
9139         bdp->title = StringSave(p); /* Remaining line chunk */
9140         bdp->taxid = taxid;
9141 
9142         if(bdp_head == NULL) {
9143             bdp_head = bdp;
9144             bdp_last = bdp;
9145         } else {
9146             bdp_last->next = bdp;
9147             bdp_last = bdp;
9148         }
9149 
9150         sip = SeqIdSetFree_NO_OBJ_MGR(sip);
9151 
9152         /* Looking for the next defline in the set */
9153 
9154         if(chptr != NULL) {
9155             *chptr = READDB_DEF_SEPARATOR;
9156             p = chptr+1; /* Next after '\1' */
9157         } else {
9158             break;
9159         }
9160     }
9161 
9162     return bdp_head;
9163 }
9164 
FDBDumpDeflineAsn(FormatDBPtr fdbp,BlastDefLinePtr bdp_in)9165 Boolean FDBDumpDeflineAsn(FormatDBPtr fdbp, BlastDefLinePtr bdp_in)
9166 {
9167     Char    buffer[128];
9168     BlastDefLinePtr bdp;
9169 #ifdef FDB_TAXONOMYDB
9170     SeqIdPtr sip;
9171 #endif
9172 
9173     BlastDefLineSetAsnWrite(bdp_in, fdbp->aip_def, NULL);
9174     AsnIoFlush(fdbp->aip_def);
9175 
9176     MemSet(buffer, NULLB, sizeof(buffer));
9177     for(bdp = bdp_in; bdp != NULL; bdp = bdp->next) {
9178 
9179         /* ------------ Updating taxonomy information -------------- */
9180 
9181         if(fdbp->options->tax_callback != NULL) {
9182 
9183 #ifdef FDB_TAXONOMYDB
9184             if (bdp->taxid == 0) {
9185                 if ((sip = SeqIdFindBest(bdp->seqid, SEQID_GI)))
9186                     bdp->taxid = tax1_getTaxId4GI(sip->data.intvalue);
9187             }
9188 #endif
9189 
9190             if(!fdbp->options->tax_callback(fdbp->options->tax_lookup,
9191                                             bdp->taxid)) {
9192                 ErrPostEx(SEV_ERROR, 0,0,
9193                           "tax_callback() failed for taxid %ld. "
9194                           "Formating terminated abnormaly", bdp->taxid);
9195                 return 1;
9196             }
9197         }
9198 
9199         /* ------ Now adding new entried into lookup hash table ----- */
9200 
9201         if(fdbp->options->parse_mode == TRUE)  {
9202 
9203             SeqIdWrite(bdp->seqid, buffer, PRINTID_FASTA_LONG, 128);
9204 
9205             if((UpdateLookupInfo(buffer, fdbp->lookup, fdbp->num_of_seqs, fdbp->fd_stmp, fdbp->options->parse_mode, fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
9206                 return FALSE;
9207             }
9208         }
9209     }
9210 
9211     return TRUE;
9212 }
9213 
FDBDumpDefline(FormatDBPtr fdbp,CharPtr title,CharPtr seq_id)9214 static Boolean FDBDumpDefline(FormatDBPtr fdbp, CharPtr title, CharPtr seq_id)
9215 {
9216   Char    tmpbuff[1024];
9217   CharPtr defline;
9218   Int4    defline_len, id_length;
9219 
9220   if(fdbp->options->parse_mode == FALSE)  {
9221     sprintf(tmpbuff, "%s%ld ", NON_SEQID_PREFIX, (long) fdbp->num_of_seqs);
9222 
9223     if (FileWrite(tmpbuff, StringLen(tmpbuff), 1, fdbp->fd_def) != (Uint4) 1)
9224       return 1;
9225     defline = title;
9226   } else {
9227     if (title != NULL)
9228       defline_len = StringLen(title);
9229     else
9230       defline_len = 0;
9231 
9232     defline_len += 255;    /* Sufficient for an ID. */
9233 
9234     if ( sizeof(tmpbuff) > defline_len)
9235       defline = tmpbuff;
9236     else
9237       defline = MemNew((defline_len+1)*sizeof(Char));
9238 
9239     /* IF the gi is zero and there is another ID, then do not print it. */
9240     if (StringNCmp(seq_id, "gi|0|", 5) == 0) {
9241       StringCpy(defline, seq_id+5);
9242       ErrPostEx(SEV_WARNING, 0, 0, "%s: zero gi stripped", seq_id);
9243     } else {
9244       StringCpy(defline, seq_id);
9245     }
9246 
9247     id_length = StringLen(defline);
9248     StrCat(defline+id_length++," ");
9249     if(title) StringCat(defline+id_length, title);
9250   }
9251   ASSERT(StringLen(defline) < 0x7fffffffUL - 2000000000UL -20 /* for lcl|dddd...  */ );
9252 
9253   if (FileWrite(defline, StringLen(defline), 1, fdbp->fd_def) != (Uint4) 1) {
9254 
9255     if (defline != title && defline != tmpbuff)
9256       MemFree(defline);
9257 
9258     return 1;
9259   }
9260 
9261   /* -------- Now adding new entried into lookup hash table */
9262 
9263   if((UpdateLookupInfo(defline, fdbp->lookup, fdbp->num_of_seqs,
9264                        fdbp->fd_stmp, fdbp->options->parse_mode,
9265                        fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
9266 
9267     if ( defline != title && defline != tmpbuff)
9268       MemFree(defline);
9269 
9270     return FALSE;
9271   }
9272 
9273   if (defline != title && defline != tmpbuff)
9274     MemFree(defline);
9275 
9276   return TRUE;
9277 }
9278 
9279 /* Creates a new volume of the blast database being created if the sequence
9280  * being added causes it to exceed the volume limitations (number of
9281  * letters/sequences) */
FDBCreateNewVolume(FormatDBPtr fdbp,const ByteStorePtr seq,Int4 seq_length,const Uint4Ptr ambiguities)9282 static Int4 FDBCreateNewVolume(FormatDBPtr fdbp,
9283                                const ByteStorePtr seq,
9284                                Int4 seq_length,
9285                                const Uint4Ptr ambiguities)
9286 {
9287   FDB_optionsPtr options = fdbp->options;
9288   Int4 amb_size = 0; /* size of ambiguities for this sequence */
9289   Int8 seq_size = 0; /* length of sequence file with new sequence being added */
9290   Int4 hdr_size = 0; /* size of the header file without new sequence */
9291   Char extension_prefix = options->is_protein ? 'p' : 'n';
9292 
9293   if (ambiguities) {
9294       amb_size = sizeof(*ambiguities) * ((*ambiguities)&0x7fffffffUL);
9295   }
9296   seq_size = (ftell(fdbp->fd_seq) + BSLen(seq) + 1 + amb_size);
9297   hdr_size = ftell(fdbp->aip_def ? fdbp->aip_def->fp : fdbp->fd_def);
9298 
9299   if ( /* if bases_in_volume was specified, don't exceed that */
9300        (options->bases_in_volume &&
9301         (fdbp->TotalLen + seq_length > options->bases_in_volume)) ||
9302        /* if sequences_in_volume was specified, don't exceed that (will be
9303         * deprecated) */
9304        (options->sequences_in_volume &&
9305         (fdbp->num_of_seqs+1) > options->sequences_in_volume)  ||
9306        /* if sequence file is about to grow larger than SEQFILE_SIZE_MAX */
9307        ( seq_size > SEQFILE_SIZE_MAX) ||
9308        /* if header file is about to grow too large (assuming header can not
9309         * exceed 2G - 2000000000b) */
9310        ( hdr_size > 2000000000UL)
9311       )
9312     {
9313       Char dbnamebuf[PATH_MAX];
9314       FormatDBPtr tmp_fdbp = NULL;
9315 
9316       if (options->volume == 1) {
9317           sprintf(dbnamebuf, "%s.00", options->base_name);
9318       } else {
9319           sprintf(dbnamebuf, "%s", options->base_name);
9320       }
9321       ErrLogPrintf("Closing volume %s with %ld sequences, %s letters"
9322                    "(.%csq file = %ld bytes; .%chr file = %ld bytes)\n",
9323                    options->base_name, fdbp->num_of_seqs,
9324                    Nlm_Int8tostr(fdbp->TotalLen, 1),
9325                    extension_prefix, (long)seq_size,
9326                    extension_prefix, (long)hdr_size);
9327       tmp_fdbp = (FormatDBPtr) MemNew(sizeof(FormatDB));
9328       MemCpy(tmp_fdbp, fdbp, sizeof(FormatDB));
9329 
9330       if(FormatDBClose(tmp_fdbp))
9331          return 9;
9332       if (++options->volume >= kFDBMaxNumVolumes) {
9333           FDBCleanUpInProgress(options);
9334           ErrPostEx(SEV_FATAL, 1, 0,
9335                     "BLAST database exceeded %d volumes, please adjust the -v "
9336                     "option to formatdb (number of bases per volume)",
9337                     kFDBMaxNumVolumes);
9338           return -1;
9339       }
9340 
9341       /* When second volume is created, add suffix .00 to all
9342          first volume files */
9343       if (options->volume == 1)
9344         {
9345           Char  oldnamebuf[FILENAME_MAX], newnamebuf[FILENAME_MAX];
9346           int len = StringLen(options->base_name) + 2;
9347           sprintf(oldnamebuf, "%s.%cin", options->base_name, extension_prefix);
9348           sprintf(newnamebuf, "%s.00.%cin", options->base_name,
9349                   extension_prefix);
9350           if (FileLength(oldnamebuf) > 0)
9351             FileRename(oldnamebuf, newnamebuf);
9352           StringCpy(oldnamebuf + len, "hr");
9353           StringCpy(newnamebuf + len + 3, "hr");
9354           if (FileLength(oldnamebuf) > 0)
9355             FileRename(oldnamebuf, newnamebuf);
9356           StringCpy(oldnamebuf + len, "sq");
9357           StringCpy(newnamebuf + len + 3, "sq");
9358           if (FileLength(oldnamebuf) > 0)
9359             FileRename(oldnamebuf, newnamebuf);
9360           StringCpy(oldnamebuf + len, "nd");
9361           StringCpy(newnamebuf + len + 3, "nd");
9362           if (FileLength(oldnamebuf) > 0)
9363             FileRename(oldnamebuf, newnamebuf);
9364           StringCpy(oldnamebuf + len, "ni");
9365           StringCpy(newnamebuf + len + 3, "ni");
9366           if (FileLength(oldnamebuf) > 0)
9367             FileRename(oldnamebuf, newnamebuf);
9368           StringCpy(oldnamebuf + len, "sd");
9369           StringCpy(newnamebuf + len + 3, "sd");
9370           if (FileLength(oldnamebuf) > 0)
9371             FileRename(oldnamebuf, newnamebuf);
9372           StringCpy(oldnamebuf + len, "si");
9373           StringCpy(newnamebuf + len + 3, "si");
9374           if (FileLength(oldnamebuf) > 0)
9375             FileRename(oldnamebuf, newnamebuf);
9376          if (options->dump_info) {
9377              StringCpy(oldnamebuf + len, "di");
9378              StringCpy(newnamebuf + len + 3, "di");
9379              if (FileLength(oldnamebuf) > 0)
9380                  FileRename(oldnamebuf, newnamebuf);
9381          }
9382          if (options->is_protein) {
9383              /* PIG ISAM files */
9384              StringCpy(oldnamebuf + len, "pd");
9385              StringCpy(newnamebuf + len + 3, "pd");
9386              if (FileLength(oldnamebuf) > 0)
9387                  FileRename(oldnamebuf, newnamebuf);
9388              StringCpy(oldnamebuf + len, "pi");
9389              StringCpy(newnamebuf + len + 3, "pi");
9390              if (FileLength(oldnamebuf) > 0)
9391                  FileRename(oldnamebuf, newnamebuf);
9392          }
9393 
9394           MemFree(options->base_name);
9395           newnamebuf[len+1] = NULLB;
9396           options->base_name = StringSave(newnamebuf);
9397         }
9398 
9399       {
9400         CharPtr ptr;
9401       ptr = options->base_name + StringLen(options->base_name) - 2;
9402       sprintf(ptr, "%02ld", (long) options->volume);
9403       }
9404 
9405       if ((tmp_fdbp = FormatDBInit(options)) == NULL)
9406         return 2;
9407 
9408       MemCpy(fdbp, tmp_fdbp, sizeof(FormatDB));
9409       MemFree(tmp_fdbp);
9410     }
9411 
9412   return 0;
9413 }
9414 
/* Sanity-checks (in assert-enabled builds only) that the current position
 * of the sequence file plus the length of seq, one extra byte and the
 * ambiguity count stays below 0x7fffffff, then records the new sequence
 * in the index tables through FDBFillIndexTables() and returns its result.
 *
 * NOTE(review): the ambiguity term here is the masked first word of
 * ambiguities, not that count multiplied by the entry size - confirm this
 * is the intended bound.
 */
static Int4 FDBExtend4Sequence(FormatDBPtr fdbp,
                               const ByteStorePtr seq,
                               Int4 seq_length, const Uint4Ptr ambiguities)
{
    assert(0x7fffffffUL >
           ftell(fdbp->fd_seq) + BSLen(seq) + 1 +
           (ambiguities != NULL ? ((*ambiguities) & 0x7fffffffUL) : 0));

    return FDBFillIndexTables(fdbp, seq_length);
}
9426 
/* Accounts for a sequence of seq_length letters that is about to be
 * written: updates TotalLen and MaxSeqLen, makes sure the offset tables
 * have room for entry num_of_seqs (growing them by INDEX_ARRAY_CHUNKS
 * entries when needed; the ambiguity table only for nucleotide
 * databases), and stores the current positions of the defline and
 * sequence streams in that entry.
 *
 * Returns 0 on success, 1 if a table could not be grown.  On failure the
 * tables that were already valid remain valid and OffsetAllocated is left
 * unchanged.
 */
Int4 FDBFillIndexTables(FormatDBPtr fdbp, Int4 seq_length)
{
    fdbp->TotalLen += seq_length;

    if (fdbp->MaxSeqLen < seq_length)
        fdbp->MaxSeqLen = seq_length;

    if (fdbp->OffsetAllocated <= (fdbp->num_of_seqs + 1)) {
        size_t new_bytes =
            (size_t) (fdbp->OffsetAllocated + INDEX_ARRAY_CHUNKS) *
            sizeof(Uint4);
        Int4Ptr grown;

        /* Each table is grown through a temporary so the old block is
           not lost when an allocation fails. */
        grown = (Int4Ptr) Realloc(fdbp->DefOffsetTable, new_bytes);
        if (grown == NULL) {
            ErrLogPrintf
                ("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
            return 1;
        }
        fdbp->DefOffsetTable = grown;

        grown = (Int4Ptr) Realloc(fdbp->SeqOffsetTable, new_bytes);
        if (grown == NULL) {
            ErrLogPrintf
                ("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
            return 1;
        }
        fdbp->SeqOffsetTable = grown;

        if (!fdbp->options->is_protein) {
            grown = (Int4Ptr) Realloc(fdbp->AmbOffsetTable, new_bytes);
            if (grown == NULL) {
                ErrLogPrintf
                    ("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
                return 1;
            }
            fdbp->AmbOffsetTable = grown;
        }

        /* Commit the new capacity only after every table has grown. */
        fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;
    }

    if (fdbp->aip_def != NULL)  /* Structured deflines */
        fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
    else if (fdbp->fd_def)
        fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);

    if (fdbp->fd_seq)
        fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);

    return 0;
}
9471 
9472 /********* BEGIN:  Auxiliary functions to the SI_Record structure ************/
9473 
9474 /** Allocates a single node in the SI_Record linked list structure */
SI_RecordNew(void)9475 SI_Record* SI_RecordNew(void)
9476 {
9477     return (SI_Record*) calloc(1, sizeof(SI_Record));
9478 }
9479 
9480 /** Deallocates the linked list of SI_Record structures in srp
9481  * @return NULL
9482  */
SI_RecordFree(SI_Record * srp)9483 SI_Record* SI_RecordFree(SI_Record* srp)
9484 {
9485     if ( !srp ) {
9486         return NULL;
9487     }
9488 
9489     while (srp) {
9490         SI_Record* tmp = srp->next;
9491         if (srp->title) {
9492             srp->title = MemFree(srp->title);
9493         }
9494         MemFree(srp);
9495         srp = tmp;
9496     }
9497     return NULL;
9498 }
9499 
9500 /** Appends a new node to the srp linked list.
9501  * @return the newly allocated node
9502  */
SI_RecordAddNode(SI_Record * srp)9503 static SI_Record* SI_RecordAddNode(SI_Record* srp)
9504 {
9505     if ( !srp ) {
9506         return SI_RecordNew();
9507     } else {
9508         for (; srp->next; srp = srp->next) ;
9509         srp->next = SI_RecordNew();
9510         return srp->next;
9511     }
9512 }
9513 
9514 /** Appends a new node to the srp linked list from data used in the
9515  * FORMATDB_VER format of the BLAST databases
9516  * @return pointer to the newly added node
9517  */
9518 static SI_Record*
SI_RecordAddFormatdb_ver(SI_Record * srp,int gi,int owner,const char * div,int date,Uint1 mol,const BlastDefLinePtr bdp)9519 SI_RecordAddFormatdb_ver(SI_Record* srp, int gi, int owner, const char* div,
9520                          int date, Uint1 mol, const BlastDefLinePtr bdp)
9521 {
9522     ASSERT(bdp);
9523 
9524     srp = SI_RecordAddNode(srp);
9525 
9526     srp->gi = gi;
9527     srp->owner = owner;
9528     srp->ent = date;
9529     srp->taxid = bdp->taxid;
9530     srp->mol = mol;
9531 
9532     if (div) {
9533         StringNCpy_0(srp->div, div, sizeof(srp->div));
9534     }
9535     if (bdp->seqid) {
9536         SeqIdWrite(bdp->seqid, srp->seqid, PRINTID_FASTA_LONG,
9537                    sizeof(srp->seqid));
9538     }
9539     if (bdp->title) {
9540         srp->title = StringSave(bdp->title);
9541     }
9542     return srp;
9543 }
9544 
9545 /** Appends a new node to the srp linked list from data used in the
9546  * FORMATDB_VER_TEXT format of the BLAST databases
9547  * @return pointer to the newly added node
9548  */
9549 static SI_Record*
SI_RecordAddFormatdb_ver_text(SI_Record * srp,Int4 gi,Int4 owner,Int4 taxid,char * div,Int4 date,Uint1 mol,char * seq_id,char * title)9550 SI_RecordAddFormatdb_ver_text(SI_Record* srp, Int4 gi, Int4 owner, Int4 taxid,
9551                               char* div, Int4 date, Uint1 mol, char* seq_id,
9552                               char* title)
9553 {
9554     srp = SI_RecordAddNode(srp);
9555 
9556     srp->gi = gi;
9557     srp->owner = owner;
9558     srp->ent = date;
9559     srp->taxid = taxid;
9560     srp->mol = mol;
9561 
9562     if (div)
9563         StringNCpy_0(srp->div, div, sizeof(srp->div));
9564     if (seq_id)
9565         StringNCpy_0(srp->seqid, seq_id, sizeof(srp->seqid));
9566     if (title)
9567         srp->title = StringSave(title);
9568     return srp;
9569 }
9570 
9571 /********* END:    Auxiliary functions to the SI_Record structure ************/
9572 
s_GetPrintableSequenceId(const SeqIdPtr seqid,char * seqid_string,char buffer[],size_t buffer_sz)9573 static void s_GetPrintableSequenceId(const SeqIdPtr seqid,
9574                                      char* seqid_string,
9575                                      char buffer[],
9576                                      size_t buffer_sz)
9577 {
9578     if (seqid_string) {
9579         StringNCpy_0(buffer, seqid_string, buffer_sz-1);
9580     } else {
9581         SeqIdWrite(seqid, buffer, PRINTID_FASTA_LONG, buffer_sz-1);
9582     }
9583 }
9584 
9585 /* If the bdp parameter is given, the defline, Seq-id, and taxonomy
9586  * information, is obtained from this parameter and thus the remainder
9587  * parameters are ignored. */
FDBAddSequence(FormatDBPtr fdbp,BlastDefLinePtr bdp,Uint1 * seq_data_type,ByteStorePtr * seq_data,Int4 SequenceLen,CharPtr seq_id,CharPtr title,Int4 gi,Int4 tax_id,CharPtr div,Int4 owner,Int4 date)9588 Int2 FDBAddSequence(FormatDBPtr fdbp, BlastDefLinePtr bdp,
9589                     Uint1* seq_data_type, ByteStorePtr * seq_data,
9590                     Int4 SequenceLen,
9591 
9592                     /* These 2 parameters are left for the backward
9593                        compatibility. They are not used for ASN.1 structues
9594                        deflines dump */
9595                     CharPtr seq_id, CharPtr title,
9596                     /* These parameters suppose, that this function adds
9597                        sequence to the Blast database with single definition
9598                        line. Generally speaking, this is not the common case
9599                        and if this function is used to add sequence item with
9600                        many definition lines these parameters must not be used
9601                        at all. */
9602                     Int4 gi, Int4 tax_id, CharPtr div, Int4 owner, Int4 date)
9603 {
9604     Uint4Ptr AmbCharPtr = NULL;
9605     ByteStorePtr new_data;
9606     Int2 status = 0;
9607 
9608     ASSERT(seq_data);
9609     ASSERT(seq_data_type);
9610 
9611     if (SequenceLen <= 0) {
9612         char tmpbuf[128] = { NULLB };
9613         s_GetPrintableSequenceId(bdp->seqid, seq_id, tmpbuf, sizeof(tmpbuf));
9614         ErrPostEx(SEV_WARNING, 0, 0,
9615           "Cannot add sequence number %ld (%s) because it has zero-length.\n",
9616                   (fdbp->options->total_num_of_seqs + 1), tmpbuf);
9617         return 1;
9618     }
9619     if (fdbp->options->is_protein) {
9620         if (*seq_data_type != Seq_code_ncbistdaa) {
9621             new_data = BSConvertSeq(*seq_data, Seq_code_ncbistdaa,
9622                                     *seq_data_type, SequenceLen);
9623             *seq_data = new_data;
9624             *seq_data_type = Seq_code_ncbistdaa;
9625         }
9626     } else {                    /* if(!fdbp->options->is_protein) */
9627 
9628         AmbCharPtr = NULL;
9629         if (*seq_data_type != Seq_code_ncbi2na
9630             && *seq_data_type != Seq_code_ncbi4na) {
9631             Uint1 new_code;
9632             new_data =
9633                 BSPack(*seq_data, *seq_data_type, SequenceLen, &new_code);
9634             if (new_data != NULL) {
9635                 *seq_data = new_data;
9636                 *seq_data_type = new_code;
9637             }
9638         }
9639 
9640         if (*seq_data_type == Seq_code_ncbi4na && seq_data != NULL) {
9641             /* ncbi4na require compression into ncbi2na */
9642 
9643             if (fdbp->options->version > FORMATDB_VER_TEXT) {
9644                 if ((new_data = BSCompressDNANew(*seq_data, SequenceLen,
9645                                                  &AmbCharPtr)) == NULL) {
9646                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
9647                                  "Formating failed.\n");
9648                     return 3;
9649                 }
9650             } else {
9651                 if ((new_data = BSCompressDNA(*seq_data, SequenceLen,
9652                                               &AmbCharPtr)) == NULL) {
9653                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
9654                                  "Formating failed.\n");
9655                     return 3;
9656                 }
9657             }
9658             *seq_data = new_data;
9659 
9660             *seq_data_type = Seq_code_ncbi2na;   /* just for information */
9661 
9662         } else {
9663             Uint1 remainder;
9664             /* if sequence already in ncbi2na format we have to update last
9665                byte */
9666             BSSeek(*seq_data, SequenceLen / 4, SEEK_SET);
9667 
9668             if ((remainder = (SequenceLen % 4)) == 0) {
9669                 BSPutByte(*seq_data, NULLB);
9670             } else {
9671                 Uint1 ch = remainder + (BSGetByte(*seq_data) & 0xfc);
9672                 BSSeek(*seq_data, SequenceLen / 4, SEEK_SET);
9673                 BSPutByte(*seq_data, ch);
9674             }
9675         }
9676     }                           /* if(!fdbp->options->is_protein) */
9677 
9678     /* Prepare SI_Record structure for calling FDBAddSequence2 */
9679     {
9680         SI_Record* si = NULL;
9681         /* There is no information available here to distinguish DNA from RNA
9682            etc., so assign only AA or DNA molecule type. */
9683         Uint1 mol = (fdbp->options->is_protein ? Seq_mol_aa : Seq_mol_dna);
9684 
9685         if (bdp != NULL) {
9686             Boolean first_iteration = TRUE;
9687             for (; bdp; bdp = bdp->next) {
9688                 if (first_iteration) {
9689                     si = SI_RecordAddFormatdb_ver(si, gi, owner, div, date, mol,
9690                                                   bdp);
9691                     first_iteration = FALSE;
9692                 } else {
9693                     SI_RecordAddFormatdb_ver(si, gi, owner, div, date, mol, bdp);
9694                 }
9695             }
9696         } else {
9697             si = SI_RecordAddFormatdb_ver_text(si, gi, owner, tax_id, div,
9698                                                date, mol, seq_id, title);
9699         }
9700 
9701         status = FDBAddSequence2(fdbp, si, *seq_data_type, seq_data,
9702                                  SequenceLen, AmbCharPtr, PIG_NONE, 0);
9703 
9704         si = SI_RecordFree(si);
9705     }
9706 
9707     return status;
9708 }
9709 
/** Folds the first sequence_length bytes of sequence into a Uint4 using the
 * recurrence h = h * 1103515245 + byte + 12345, starting from 0.
 * @return the resulting hash (0 when sequence_length is not positive)
 */
Uint4 readdb_sequence_hash(const char* sequence, int sequence_length)
{
    Uint4 acc = 0;
    int pos = 0;

    while (pos < sequence_length) {
        acc = acc * 1103515245 + ((unsigned long) (sequence[pos]) + 12345);
        ++pos;
    }
    return acc;
}
9720 
9721 /* See comment in readdb.h */
FDBAddSequence2(FormatDBPtr fdbp,SI_RecordPtr srp,Uint1 seq_data_type,const ByteStorePtr * seq_data,Int4 SequenceLen,Uint4Ptr AmbCharPtr,Int4 pig_id,Uint4 hash)9722 Int2 FDBAddSequence2(FormatDBPtr fdbp,  /* target blast db */
9723                      SI_RecordPtr srp,  /* linked list of sequence
9724                                            information for each gi */
9725                      /* sequence data itself */
9726                      Uint1 seq_data_type,
9727                      const ByteStorePtr * seq_data,
9728                      Int4 SequenceLen,
9729                      Uint4Ptr AmbCharPtr,
9730 
9731                      Int4 pig_id,  /* stable protein group identifier */
9732                      Uint4 hash /* sequence hash - to allow reuse of hash
9733                                    calculated in ID */
9734     )
9735 {
9736     BlastDefLinePtr bdp_first = NULL;
9737     BlastDefLinePtr bdp_cur = NULL;
9738     SI_RecordPtr pc = NULL;
9739 
9740     if (SequenceLen <= 0) {
9741         ErrLogPrintf("Sequence number %ld has zero-length!\n",
9742                      (fdbp->options->total_num_of_seqs + 1));
9743         return 1;
9744     }
9745 
9746     /* If too many bases in thise file, start a new volume */
9747     if (FDBCreateNewVolume(fdbp, *seq_data, SequenceLen, AmbCharPtr))
9748         return 1;
9749 
9750     if (FDBExtend4Sequence(fdbp, *seq_data, SequenceLen, AmbCharPtr))
9751         return 1;
9752 
9753     /* ---------- Dumping sequence data ---------- */
9754 
9755     BSSeek(*seq_data, 0, SEEK_SET);
9756     for (;;) {
9757         Char tmpbuff[1025];
9758         int len = BSRead(*seq_data, tmpbuff, sizeof(tmpbuff) - 1);
9759         if (len <= 0)
9760             break;
9761         if (FileWrite(tmpbuff, len, 1, fdbp->fd_seq) != (Uint4) 1)
9762             return 1;
9763         if (hash == 0 && fdbp->options->dump_info) {
9764             hash = readdb_sequence_hash(tmpbuff, len);
9765         }
9766     }
9767 
9768     if (fdbp->options->is_protein) {
9769         int i = 0;
9770         ASSERT(seq_data_type == Seq_code_ncbistdaa);
9771         if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
9772             return 1;
9773     } else {
9774         ASSERT(seq_data_type == Seq_code_ncbi2na);
9775         /* dump ambiguity characters. */
9776         fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);  /* Anyway...  */
9777 
9778         /* if AmbCharPtr is not NULL, then there was ambiguity. */
9779         if (AmbCharPtr != NULL) {
9780             Uint4 total, index;
9781 
9782             /* The first Uint4 holds the total number of ambig. bp. */
9783             total = (*AmbCharPtr) + 1;
9784             total &= 0x7FFFFFFF;
9785             for (index = 0; index < total; index++) {
9786                 if (!FormatDbUint4Write(AmbCharPtr[index], fdbp->fd_seq))
9787                     return 1;
9788             }
9789             MemFree(AmbCharPtr);
9790             AmbCharPtr = NULL;
9791         }
9792     }
9793 
9794     /* This information is written to the *.[pn]di file, and it is also
9795        needed to set the membership bits in the FORMATDB_VER version of the
9796        blast databases. */
9797 
9798     if (fdbp->options->version == FORMATDB_VER_TEXT) {
9799         if (fdbp->options->dump_info) {
9800             /* ------- Dumping misc info file ----------- */
9801             fprintf(fdbp->fd_sdi, "%ld %ld %ld %ld %s %ld %ld %ld\n",
9802                     (long) fdbp->num_of_seqs, (long) srp->gi,
9803                     (long) srp->taxid, (long) srp->owner,
9804                     srp->div ? srp->div : "N/A", (long) SequenceLen,
9805                     (long) hash, (long) srp->ent);
9806         }
9807 
9808         /* ------- Dumping definition line ---------- */
9809         if (!FDBDumpDefline(fdbp, srp->title, srp->seqid)) {
9810             ErrPostEx(SEV_ERROR, 0, 0,
9811                       "FDBDumpDefline() failed. Formating terminated abnormaly");
9812             return 1;
9813         }
9814         return 1;
9815     }
9816 
9817     assert(fdbp->options->version >= FORMATDB_VER);
9818 
9819     for (pc = srp; pc; pc = pc->next) {
9820         DI_Record direc;
9821         MemSet((VoidPtr) & direc, 0, sizeof(direc));
9822         direc.oid = fdbp->num_of_seqs;
9823         direc.gi = pc->gi;
9824         direc.taxid = pc->taxid;
9825         direc.owner = pc->owner;
9826         direc.date = pc->ent;
9827         direc.len = SequenceLen;
9828         direc.hash = hash;
9829         direc.mol = pc->mol;
9830 
9831         if (bdp_cur == NULL) {
9832             bdp_first = bdp_cur = FDLCreateAsnDF(fdbp, pc->seqid, pc->title,
9833                                                  pc->taxid);
9834         } else {
9835             bdp_cur = bdp_cur->next =
9836                 FDLCreateAsnDF(fdbp, pc->seqid, pc->title, pc->taxid);
9837         }
9838 
9839         /* Add the PIG information */
9840         if (fdbp->options->is_protein && pig_id != PIG_NONE) {
9841             if (!bdp_cur->other_info) {
9842                 ValNodeAddInt(&bdp_cur->other_info, 0, pig_id);
9843             }
9844         }
9845 
9846         if (fdbp->options->dump_info) {
9847             /* ------ Dumping misc info file ----------- */
9848             CharPtr acc =
9849                 FDFGetAccessionFromSeqIdChain((SeqIdPtr) bdp_cur->seqid);
9850             direc.acc = acc;
9851             fprintf(fdbp->fd_sdi, "%ld %ld %ld %ld %s %ld %ld %ld %s %u\n",
9852                     (long) direc.oid, (long) direc.gi, (long) direc.taxid,
9853                     (long) direc.owner, pc->div ? pc->div : "N/A",
9854                     (long) direc.len, (long) direc.hash, (long) direc.date,
9855                     (char *) (acc ? acc : "unknown"), (unsigned int) direc.mol);
9856         }
9857 
9858         /* ------- Add the links and membership information -- */
9859         FDBAddLinksInformation(bdp_cur, fdbp->options->linkbit_listp);
9860         FDBAddMembershipInformation(bdp_cur, fdbp->options->memb_tblp,
9861                                     (VoidPtr) & direc);
9862         if (direc.acc)
9863             MemFree(direc.acc);
9864 
9865     }  /* end of SI record loop */
9866 
9867     if (fdbp->options->is_protein)
9868         FDBAddPig(fdbp->ptable, pig_id, fdbp->num_of_seqs);
9869 
9870 
9871     /* ------- Dumping definition line ---------- */
9872     if (!FDBDumpDeflineAsn(fdbp, bdp_first)) {
9873         ErrPostEx(SEV_ERROR, 0, 0,
9874                   "FDBDumpDeflineAsn() failed. Formating terminated abnormaly");
9875         return 1;
9876     }
9877 
9878     BlastDefLineSetFree(bdp_first);
9879 
9880     fdbp->num_of_seqs++;        /* Finshed ... */
9881     fdbp->options->total_num_of_seqs++;
9882     /* ---------------------------------------------- */
9883 
9884     return 0;
9885 }
9886 
FDBAddBioseq(FormatDBPtr fdbp,BioseqPtr bsp,BlastDefLinePtr bdp)9887 Int2 FDBAddBioseq(FormatDBPtr fdbp, BioseqPtr bsp, BlastDefLinePtr bdp)
9888 {
9889     if (bsp == NULL || bsp->seq_data_type == Seq_code_gap) return 0;
9890 
9891     if ( !bdp ) {
9892         ASSERT(fdbp->options->version == FORMATDB_VER_TEXT);
9893         return FDBAddSequence (fdbp, NULL, &bsp->seq_data_type,
9894                                (ByteStorePtr PNTR) &bsp->seq_data,
9895                                bsp->length, 0, BioseqGetTitle(bsp),
9896                                0, 0, 0, 0, 0);
9897     } else {
9898         ASSERT(fdbp->options->version >= FORMATDB_VER);
9899         return FDBAddSequence (fdbp, bdp, &bsp->seq_data_type,
9900                                (ByteStorePtr PNTR) &bsp->seq_data,
9901                                bsp->length, NULL, NULL,
9902                                0, 0, 0, 0, 0);
9903     }
9904 
9905 }
9906 
9907 /*******************************************************************************
9908  * Pass thru each bioseq into given SeqEntry and write corresponding information
9909  * into "def", "index", ...., files
9910  *******************************************************************************
9911  * Parameters:
9912  *    fdbp    - pointer to memory to be freed
9913  *
9914  * Returns NULL
9915  ******************************************************************************/
process_sep(SeqEntryPtr sep,FormatDBPtr fdbp)9916 Int2 process_sep (SeqEntryPtr sep, FormatDBPtr fdbp)
9917 {
9918 
9919     Int4        SequenceLen;
9920     BioseqPtr        bsp = NULL;
9921     CharPtr        defline;
9922     Char        tmpbuff[1024];
9923     Int4        buffer_size=0, defline_len=0;
9924     CharPtr        buffer=NULL;
9925     Int4        len, id_length;
9926     Uint4Ptr        AmbCharPtr = NULL;
9927     Uint1        ch, remainder;
9928     Uint4        i, total, index;
9929 
9930     if (IS_Bioseq(sep))
9931         bsp = (BioseqPtr) sep->data.ptrvalue;
9932     else
9933         /* This is Bioseq-set.  Exit */
9934         return 0;
9935 
9936     if (bsp == NULL || bsp->seq_data_type == Seq_code_gap) return 0;
9937 
9938     /* Make a convertion to stadard form */
9939 
9940     if (fdbp->options->is_protein)
9941         BioseqRawConvert(bsp, Seq_code_ncbistdaa);
9942 
9943     SequenceLen = bsp->length;
9944     fdbp->TotalLen += SequenceLen;
9945 
9946     if (fdbp->MaxSeqLen < SequenceLen)
9947         fdbp->MaxSeqLen = SequenceLen;
9948 
9949     if(fdbp->OffsetAllocated <= (fdbp->num_of_seqs+1)) {
9950         fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;
9951 
9952         fdbp->DefOffsetTable = (Int4Ptr)Realloc(fdbp->DefOffsetTable,
9953                                                 fdbp->OffsetAllocated*sizeof(Uint4));
9954         fdbp->SeqOffsetTable = (Int4Ptr)Realloc(fdbp->SeqOffsetTable,
9955                                                 fdbp->OffsetAllocated*sizeof(Uint4));
9956 
9957     if (!fdbp->DefOffsetTable || !fdbp->SeqOffsetTable) {
9958         ErrLogPrintf("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
9959         return 0;
9960     }
9961 
9962         if(!fdbp->options->is_protein) {
9963             fdbp->AmbOffsetTable = (Int4Ptr)Realloc(fdbp->AmbOffsetTable,
9964                                                     fdbp->OffsetAllocated*sizeof(Uint4));
9965         if (!fdbp->AmbOffsetTable) {
9966         ErrLogPrintf("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
9967         return 0;
9968         }
9969         }
9970     }
9971 
9972     if(fdbp->aip_def != NULL)   /* Structured deflines */
9973         fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
9974     else
9975         fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
9976 
9977     fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
9978 
9979     /* ---------------------- */
9980 
9981     if(fdbp->options->parse_mode == FALSE)  {
9982         sprintf(tmpbuff, "%s%ld ", NON_SEQID_PREFIX, (long) fdbp->num_of_seqs);
9983         if (FileWrite(tmpbuff, StringLen(tmpbuff), 1, fdbp->fd_def) != (Uint4) 1)
9984             return 1;
9985         defline = (CharPtr)bsp->descr->data.ptrvalue;
9986     } else {
9987         if (bsp->descr)
9988             defline_len = StringLen(BioseqGetTitle(bsp));
9989         else
9990             defline_len = 0;
9991         defline_len += 255;    /* Sufficient for an ID. */
9992         if (buffer_size < defline_len) {
9993             if (buffer)
9994                 buffer = MemFree(buffer);
9995             buffer = MemNew((defline_len+1)*sizeof(Char));
9996             buffer_size = defline_len;
9997         }
9998         SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, STRLENGTH);
9999         id_length = StringLen(buffer);
10000         buffer[id_length] = ' ';
10001         id_length++;
10002         StringCpy(&buffer[id_length], BioseqGetTitle(bsp));
10003         defline = buffer;
10004     }
10005     if (FileWrite(defline, StringLen(defline), 1, fdbp->fd_def) != (Uint4) 1)
10006     return 1;
10007 
10008         /* -------- Now adding new entried into lookup hash table */
10009 
10010     if((UpdateLookupInfo(defline, fdbp->lookup, fdbp->num_of_seqs,
10011                          fdbp->fd_stmp, fdbp->options->parse_mode,
10012                          fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
10013         return -1;
10014     }
10015 
10016     defline = NULL;
10017     if (buffer)
10018     MemFree(buffer);
10019 
10020     if(!fdbp->options->is_protein)  {
10021         AmbCharPtr = NULL;
10022         if (bsp->seq_data_type == Seq_code_ncbi4na && bsp->seq_data != NULL){
10023 
10024             /* ncbi4na require compression into ncbi2na */
10025 
10026         if (fdbp->options->version > FORMATDB_VER_TEXT)
10027         {
10028                 if((bsp->seq_data = (SeqDataPtr) BSCompressDNANew((ByteStorePtr) bsp->seq_data, bsp->length,
10029                                               &(AmbCharPtr))) == NULL) {
10030                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10031                              "Formating failed.\n");
10032                     return -1;
10033                 }
10034          }
10035          else
10036          {
10037                 if((bsp->seq_data = (SeqDataPtr) BSCompressDNA((ByteStorePtr) bsp->seq_data, bsp->length,
10038                                               &(AmbCharPtr))) == NULL) {
10039                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10040                              "Formating failed.\n");
10041                     return -1;
10042                 }
10043              }
10044 
10045             bsp->seq_data_type = Seq_code_ncbi2na; /* just for information */
10046         } else {
10047             /* if sequence already in ncbi2na format we have to update last byte */
10048 
10049             if((remainder = (bsp->length%4)) == 0) {
10050                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4+1, SEEK_SET);
10051                 BSPutByte((ByteStorePtr) bsp->seq_data, NULLB);
10052             } else {
10053                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10054                 ch = remainder + BSGetByte((ByteStorePtr) bsp->seq_data);
10055                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10056                 BSPutByte((ByteStorePtr) bsp->seq_data, ch);
10057             }
10058         }
10059     }
10060     /* Now dumping sequence */
10061 
10062     BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
10063 
10064     while((len = BSRead((ByteStorePtr) bsp->seq_data, tmpbuff, sizeof(tmpbuff))) != 0) {
10065         if (FileWrite(tmpbuff, len, 1, fdbp->fd_seq) != (Uint4) 1)
10066             return 1;
10067     }
10068 
10069 
10070     if(fdbp->options->is_protein) {
10071         i=0;
10072         if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
10073             return 1;
10074     } else {
10075         /* dump ambiguity characters. */
10076         fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq); /* Anyway... */
10077 
10078         /* if AmbCharPtr is not NULL, then there was ambiguity. */
10079         if(AmbCharPtr != NULL) { /* The first Uint4 holds the total number of ambig. bp. */
10080             total = (*AmbCharPtr)+1;
10081             for (index=0; index<total; index++) {
10082                 if (!FormatDbUint4Write(AmbCharPtr[index], fdbp->fd_seq))
10083                     return 1;
10084             }
10085             MemFree(AmbCharPtr);
10086             AmbCharPtr = NULL;
10087         }
10088     }
10089 
10090     fdbp->num_of_seqs++;  /* Finshed ... */
10091 
10092     return 0;
10093 }
10094 
/* ------------------------------------------------------------------
   Comparison handler for the HeapSort function: orders two elements
   by the Int4 value found at the start of each one.
   Returns 1, -1 or 0 for greater, smaller or equal.
   ------------------------------------------------------------------*/
static int LIBCALLBACK ID_Compare(VoidPtr i, VoidPtr j)
{
    const Int4 lhs = *(Int4Ptr)i;
    const Int4 rhs = *(Int4Ptr)j;

    if (lhs > rhs)
        return 1;

    return (lhs < rhs) ? -1 : 0;
}
10106 
/*******************************************************************************
 * Finish stage - writes the offset tables, etc., into files.  Must be called
 * before FormatDBClose()
 *******************************************************************************
 * Parameters:
 *    fdbp    - formatdb structure whose index information is written out
 *
 * Returns  an Int2 status code; 1 if writing to an output file fails
 ******************************************************************************/
10116 #define DATETIME_LENGTH 64
10117 
FDBFinish(FormatDBPtr fdbp)10118 static    Int2    FDBFinish (FormatDBPtr fdbp)
10119 {
10120     Char    DBName[FILENAME_MAX];
10121     Int4    title_len;
10122     Char    dateTime[DATETIME_LENGTH];
10123     ISAMObjectPtr object;
10124     ISAMErrorCode error;
10125     Uint4    i;
10126     Char    filenamebuf[FILENAME_MAX];
10127     Int2    tmp, extra_bytes = 0;
10128 
10129     if(fdbp->aip_def != NULL)   /* Structured deflines */
10130         fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
10131     else
10132         fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
10133 
10134     if(!fdbp->options->is_protein) {
10135         fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10136         fdbp->SeqOffsetTable[fdbp->num_of_seqs] =
10137             fdbp->AmbOffsetTable[fdbp->num_of_seqs];
10138     } else {
10139         fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10140     }
10141 
10142         /* Parsing finished - now dumping index file */
10143 
10144     if(fdbp->options->parse_mode)
10145         FILECLOSE(fdbp->fd_stmp);
10146 
10147         /* Information */
10148 
10149     if(fdbp->options->version == 0) /* Not Set */
10150         fdbp->options->version = FORMATDB_VER;
10151 
10152     if (!FormatDbUint4Write(fdbp->options->version, fdbp->fd_ind))
10153         return 1;
10154     if (!FormatDbUint4Write(fdbp->options->is_protein, fdbp->fd_ind))
10155         return 1;
10156 
10157     if(fdbp->options->db_title != NULL)
10158         title_len = StringLen(fdbp->options->db_title);
10159     else
10160         title_len = 0;
10161 
10162     if (!FormatDbUint4Write(title_len, fdbp->fd_ind))
10163         return 1;
10164 
10165     if (title_len != 0)
10166         if (FileWrite(fdbp->options->db_title, title_len, 1, fdbp->fd_ind) != (Uint4) 1)
10167             return 1;
10168 
10169     MemSet(dateTime, 0, DATETIME_LENGTH);
10170     Nlm_DayTimeStr(dateTime, TRUE, TRUE);
10171 
10172     /* write db_title and date-time stamp eigth bytes aligned */
10173     tmp = title_len + StringLen(dateTime);
10174     if (tmp%8) {
10175         extra_bytes = 8 - tmp%8;
10176     }
10177     if (!FormatDbUint4Write(StringLen(dateTime) + extra_bytes, fdbp->fd_ind))
10178         return 1;
10179     if (FileWrite(dateTime, StringLen(dateTime) + extra_bytes, 1, fdbp->fd_ind) != 1)
10180         return 1;
10181 
10182     if (!FormatDbUint4Write(fdbp->num_of_seqs, fdbp->fd_ind))
10183         return 1;
10184 
10185     if (fdbp->options->version == FORMATDB_VER_TEXT) {
10186         if (!FormatDbUint4Write(fdbp->TotalLen, fdbp->fd_ind))
10187             return 1;
10188     } else {
10189         if (!FormatDbUint8Write(fdbp->TotalLen, fdbp->fd_ind))
10190             return 1;
10191     }
10192 
10193     if (!FormatDbUint4Write(fdbp->MaxSeqLen, fdbp->fd_ind))
10194         return 1;
10195 
10196         /* Offset tables */
10197 
10198     for(i=0; i <= fdbp->num_of_seqs; i++) {
10199         if (!FormatDbUint4Write(fdbp->DefOffsetTable[i], fdbp->fd_ind))
10200         return 1;
10201     }
10202 
10203     for(i=0; i <= fdbp->num_of_seqs; i++) {
10204         if (!FormatDbUint4Write(fdbp->SeqOffsetTable[i], fdbp->fd_ind))
10205             return 1;
10206     }
10207     if(!fdbp->options->is_protein) {
10208         for(i=0; i <= fdbp->num_of_seqs; i++) {
10209             if (!FormatDbUint4Write(fdbp->AmbOffsetTable[i], fdbp->fd_ind))
10210             return 1;
10211         }
10212     }
10213 
10214     if(fdbp->num_of_seqs==0){
10215 
10216         Char db_type = fdbp->options->is_protein ? 'p' : 'n';
10217         ErrLogPrintf("FDBFinish: Empty %s database...\n",
10218                          fdbp->options->is_protein?"protein":"nucleotide");
10219 
10220         /* Close open files and remove them */
10221         FILECLOSE(fdbp->fd_seq);
10222         FILECLOSE(fdbp->fd_def);
10223         ASNIOCLOSE(fdbp->aip_def);
10224         FILECLOSE(fdbp->fd_stmp);
10225         FILECLOSE(fdbp->fd_sdi);
10226         sprintf(filenamebuf, "%s.%chr", fdbp->options->base_name, db_type);
10227         FileRemove(filenamebuf);
10228         sprintf(filenamebuf, "%s.%ctm", fdbp->options->base_name, db_type);
10229         FileRemove(filenamebuf);
10230         sprintf(filenamebuf, "%s.%csq", fdbp->options->base_name, db_type);
10231         FileRemove(filenamebuf);
10232         sprintf(filenamebuf, "%s.%cdi", fdbp->options->base_name, db_type);
10233         FileRemove(filenamebuf);
10234         FILECLOSE(fdbp->fd_ind); /* the only file standing */
10235         return 0;
10236     }
10237 
10238     /* Numeric lookup table sort & dump */
10239 
10240     if(fdbp->options->parse_mode && fdbp->lookup->used > 0) {
10241 
10242         FILE    *fd_lookup;
10243         sprintf(DBName, "%s.%cnd", fdbp->options->base_name,
10244                 fdbp->options->is_protein ? 'p' : 'n');
10245 
10246         fd_lookup = FileOpen(DBName, "wb");
10247 
10248         HeapSort(fdbp->lookup->table, fdbp->lookup->used/2,
10249                  sizeof(Uint4)*2, ID_Compare);
10250 
10251         for(i=0; i < fdbp->lookup->used; i++) {
10252             if (!FormatDbUint4Write(fdbp->lookup->table[i], fd_lookup))
10253             return 1;
10254         }
10255 
10256         FILECLOSE(fd_lookup);
10257 
10258         /* Now creating numeric ISAM index */
10259 
10260         sprintf(filenamebuf, "%s.%cni",
10261                 fdbp->options->base_name, fdbp->options->is_protein ? 'p' : 'n');
10262 
10263         if((object = ISAMObjectNew(ISAMNumeric,
10264                                    DBName, filenamebuf)) == NULL) {
10265             ErrPostEx(SEV_ERROR, 0, 0, "Failed to create ISAM object.\n");
10266             return 1;
10267         }
10268 
10269         if((error = ISAMMakeIndex(object, 0, 0)) != ISAMNoError) {
10270             if (error == ISAMNoOrder) {
10271                 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create index."
10272                           "  Possibly a gi included more than once in the database.\n", (long) error);
10273             } else {
10274                 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create index: ISAMErrorCode %ld.\n", (long) error);
10275             }
10276             return 1;
10277         }
10278         ISAMObjectFree(object);
10279     }
10280 
10281     /* String file sorting */
10282 
10283     if(fdbp->options->parse_mode) {
10284         if (!FormatdbCreateStringIndex(fdbp->options->base_name,
10285                                        fdbp->options->is_protein,
10286                                        fdbp->options->sparse_idx,
10287                                        fdbp->options->test_non_unique))
10288             return 1;
10289     }
10290 #ifdef FDB_TAXONOMYDB
10291     /* Creating taxonomy names lookup database */
10292     if(fdbp->options->tax_lookup != NULL) {
10293         FILE *tifp, *tdfp;
10294         RDBTaxLookupPtr tax_lookup;
10295         Int4 fd_position;
10296 
10297         if (fdbp->options->tax_lookup->taxids_in_db != 0) {
10298 
10299             tax_lookup = fdbp->options->tax_lookup;
10300 
10301             sprintf(filenamebuf, "%s.%cti", fdbp->options->base_name,
10302                     fdbp->options->is_protein ? 'p' : 'n');
10303             tifp = FileOpen(filenamebuf, "wb");
10304 
10305             sprintf(filenamebuf, "%s.%ctd", fdbp->options->base_name,
10306                     fdbp->options->is_protein ? 'p' : 'n');
10307             tdfp = FileOpen(filenamebuf, "wb");
10308 
10309             FormatDbUint4Write(TAX_DB_MAGIC_NUMBER, tifp);
10310             FormatDbUint4Write(tax_lookup->taxids_in_db, tifp);
10311 
10312             for(i = 0; i < 4; i++) { /* Here are 4 reserved numbers */
10313                 FormatDbUint4Write(0, tifp);
10314             }
10315 
10316             for(i = 0; i < tax_lookup->all_taxid_count; i++) {
10317                 if(tax_lookup->tax_array[i] != NULL) {
10318                     FormatDbUint4Write(tax_lookup->tax_array[i]->tax_id, tifp);
10319                     fd_position = ftell(tdfp);
10320                     FormatDbUint4Write(fd_position, tifp);
10321                     fprintf(tdfp,"%s\t%s\t%s\t%s",
10322                             tax_lookup->tax_array[i]->sci_name,
10323                             tax_lookup->tax_array[i]->common_name,
10324                             tax_lookup->tax_array[i]->blast_name,
10325                             tax_lookup->tax_array[i]->s_king);
10326                 }
10327             }
10328 
10329             /* We need to write one more element to have offset of the last
10330                taxonomy id entry */
10331 
10332             FormatDbUint4Write(0, tifp);
10333             fd_position = ftell(tdfp);
10334             FormatDbUint4Write(fd_position, tifp);
10335 
10336             FILECLOSE(tifp);
10337             FILECLOSE(tdfp);
10338         } else {
10339             ErrLogPrintf("No taxonomy entries found, no taxonomy database "
10340                     "will be created\n");
10341         }
10342         /* Free the taxonomy database built so far, but don't close the
10343          * connection to the taxonomy server, that should be done by the
10344          * client application by calling RDTaxLookupClose() */
10345         fdbp->options->tax_lookup = RDTaxLookupReset(fdbp->options->tax_lookup);
10346     } /* if(tax_lookup != NULL) */
10347 #endif
10348 
10349 
10350     /* PIG table sort and dump */
10351     if (fdbp->options->is_protein && fdbp->ptable && fdbp->ptable->count > 0) {
10352         FILE *fp;
10353 
10354         sprintf(DBName, "%s.ppd", fdbp->options->base_name);
10355 
10356         if ( !(fp = FileOpen(DBName, "wb")))
10357             return 1;
10358 
10359         HeapSort(fdbp->ptable->pop, fdbp->ptable->count/2,
10360                  sizeof(Uint4)*2, ID_Compare);
10361 
10362         for (i = 0; i < fdbp->ptable->count; i++) {
10363             if (!FormatDbUint4Write(fdbp->ptable->pop[i], fp))
10364                 return 1;
10365         }
10366 
10367         FILECLOSE(fp);
10368 
10369         /* Create ISAM index for PIG/ordinal id mapping */
10370         sprintf(filenamebuf, "%s.ppi", fdbp->options->base_name);
10371 
10372         if ( !(object = ISAMObjectNew(ISAMNumeric, DBName, filenamebuf))) {
10373             ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM object.\n");
10374             return 1;
10375         }
10376 
10377         if ( (error = ISAMMakeIndex(object, 0, 0)) != ISAMNoError) {
10378             if (error == ISAMNoOrder) {
10379                 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM index."
10380                           "  Possibly a PIG included more than once in the "
10381                           "database.\n", (long) error);
10382             } else {
10383                 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM index: "
10384                         "ISAMErrorCode %ld.\n", (long) error);
10385             }
10386             return 1;
10387         }
10388         ISAMObjectFree(object);
10389     }
10390 
10391     ErrLogPrintf("Formatted %ld sequences in volume %ld\n", fdbp->num_of_seqs,
10392             fdbp->options->volume);
10393 
10394     return 0;
10395 } /* end FDBFinish() */
10396 
10397 
/*
 * Release an FDB_options structure together with the strings it owns
 * (db_title, db_file, base_name, alias_file_name, gi_file, gi_file_bin).
 *
 * Parameters:
 *    options - structure to free; NULL is accepted and ignored
 *
 * Always returns NULL, so a caller can clear its own pointer with
 *    options = FDBOptionsFree(options);
 */
FDB_optionsPtr FDBOptionsFree(FDB_optionsPtr options)
{
    if (!options)
        return NULL;

    MemFree(options->db_title);
    MemFree(options->db_file);
    MemFree(options->base_name);
    MemFree(options->alias_file_name);
    MemFree(options->gi_file);
    MemFree(options->gi_file_bin);
    MemFree(options);

    /* The structure no longer exists: never hand the stale address back. */
    return NULL;
}
10413 /*******************************************************************************
10414  * Free memory allocated for given variable of FormatDB
10415  *******************************************************************************
10416  * Parameters:
10417  *    fdbp    - pointer to memory to be freed
10418  *
10419  * Returns NULL
10420  ******************************************************************************/
10421 
FormatDBClose(FormatDBPtr fdbp)10422 Int2 FormatDBClose(FormatDBPtr fdbp)
10423 {
10424 
10425     /* Now dumping all data to disk */
10426 
10427     if(FDBFinish (fdbp))
10428         return 1;
10429 
10430     /* ... and MemFree all stuff */
10431 
10432     MemFree(fdbp->DefOffsetTable);
10433     MemFree(fdbp->SeqOffsetTable);
10434 
10435     if(!fdbp->options->is_protein) {
10436         MemFree(fdbp->AmbOffsetTable);
10437     }
10438 
10439     FASTALookupFree(fdbp->lookup);
10440     FDBPigTableFree(fdbp->ptable);
10441 
10442     FILECLOSE(fdbp->fd);
10443 
10444     ASNIOCLOSE(fdbp->aip_def);
10445     FILECLOSE(fdbp->fd_def);
10446     FILECLOSE(fdbp->fd_ind);
10447     FILECLOSE(fdbp->fd_seq);
10448     FILECLOSE(fdbp->fd_sdi);
10449 
10450     ASNIOCLOSE(fdbp->aip);
10451 
10452     /* Do not Clear options structure */
10453 
10454     MemFree (fdbp);
10455 
10456     return 0;
10457 }
SeqEntrysToBLAST(SeqEntryPtr sep,FormatDBPtr fdbp,Boolean is_na,Uint1 group_segs)10458 NLM_EXTERN Boolean SeqEntrysToBLAST (SeqEntryPtr sep, FormatDBPtr fdbp,
10459                                      Boolean is_na, Uint1 group_segs)
10460 {
10461     FastaDat tfa;
10462     MyFsa mfa;
10463     Char buf[255];
10464 
10465     if ((sep == NULL) || (fdbp == NULL))
10466         return FALSE;
10467 
10468     MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
10469     mfa.buf    = buf;
10470     mfa.buflen    = 254;
10471     mfa.seqlen    = 70;
10472     mfa.mydata    = (Pointer)fdbp;
10473     mfa.myfunc    = BLASTFileFunc;
10474     mfa.bad_asn1    = FALSE;
10475     mfa.order        = 0;
10476     mfa.accession    = NULL;
10477     mfa.organism    = NULL;
10478     mfa.do_virtual    = FALSE;
10479     mfa.tech        = 0;
10480     mfa.no_sequence    = FALSE;
10481     mfa.formatdb    = TRUE;
10482 
10483     if (is_na)
10484             /* in case of "formatdb" we wont use this parameter */
10485         mfa.code = Seq_code_ncbi2na;
10486     else
10487         mfa.code = Seq_code_ncbistdaa;
10488 
10489     tfa.mfp = &mfa;
10490     tfa.is_na = is_na;
10491     if (group_segs == 3) { /* do 2 things */
10492         mfa.do_virtual = TRUE;
10493         group_segs = 1;
10494     }
10495 
10496     tfa.group_segs = group_segs;
10497     tfa.last_indent = -1;
10498     tfa.parts = -1;
10499     tfa.seg = -1;
10500     tfa.got_one = FALSE;
10501     SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
10502 
10503     return tfa.got_one;
10504 }
10505 
10506 /*****************************************************************************
10507  *
10508  *   FastaFileFunc(key, buf, data)
10509  *       standard "write to file" callback
10510  *
10511  *****************************************************************************/
BLASTFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)10512 Boolean BLASTFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf, Uint4 buflen,
10513                        Pointer data)
10514 {
10515     FormatDBPtr    fdbp = (FormatDBPtr) data;
10516     Int4        SequenceLen;
10517     Uint4        i, total, index;
10518 
10519     switch (key) {
10520     case FASTA_ID:
10521 
10522         SequenceLen = bsp->length;
10523         fdbp->TotalLen += SequenceLen;
10524 
10525         if (fdbp->MaxSeqLen < SequenceLen)
10526             fdbp->MaxSeqLen = SequenceLen;
10527 
10528         if(fdbp->OffsetAllocated <= fdbp->num_of_seqs) {
10529             fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;
10530 
10531             fdbp->DefOffsetTable = (Int4Ptr)Realloc(fdbp->DefOffsetTable,
10532                                                     fdbp->OffsetAllocated*sizeof(Uint4));
10533             fdbp->SeqOffsetTable = (Int4Ptr)Realloc(fdbp->SeqOffsetTable,
10534                                                     fdbp->OffsetAllocated*sizeof(Uint4));
10535             if(!fdbp->options->is_protein) {
10536                 fdbp->AmbOffsetTable = (Int4Ptr)Realloc(fdbp->AmbOffsetTable,
10537                                                         fdbp->OffsetAllocated*sizeof(Uint4));
10538             }
10539         }
10540 
10541         if(fdbp->aip_def != NULL)   /* Structured deflines */
10542             fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
10543         else
10544             fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
10545 
10546         fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10547 
10548         if (FileWrite(buf, buflen, 1, fdbp->fd_def) != (Uint4) 1)
10549             return FALSE;
10550         if (FileWrite(" ", 1, 1, fdbp->fd_def) != (Uint4) 1)
10551             return FALSE;
10552 
10553         /* Now adding new entried into lookup hash table */
10554 
10555         if((UpdateLookupInfo(buf, fdbp->lookup, fdbp->num_of_seqs,
10556                              fdbp->fd_stmp, fdbp->options->parse_mode,
10557                              fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
10558             return FALSE;
10559         }
10560 
10561         break;
10562     case FASTA_DEFLINE:
10563         if (FileWrite(buf, buflen, 1, fdbp->fd_def) != (Uint4) 1)
10564             return FALSE;
10565         break;
10566     case FASTA_SEQLINE:
10567         if (FileWrite(buf, buflen, 1, fdbp->fd_seq) != (Uint4) 1)
10568             return FALSE;
10569         break;
10570     case FASTA_EOS:   /* end of sequence */
10571         if(fdbp->options->is_protein) {
10572             i=0;
10573             if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
10574                 return FALSE;
10575         } else {
10576             /* dump ambiguity characters. */
10577             fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq); /* Anyway... */
10578 
10579             /* if AmbCharPtr is not NULL, then there was ambiguity. */
10580             if(fdbp->AmbCharPtr != NULL) {
10581                         /* The first Uint4 holds the total number of ambig. bp. */
10582                 total = (*(fdbp->AmbCharPtr))+1;
10583                 for (index=0; index<total; index++) {
10584                     if (!FormatDbUint4Write(fdbp->AmbCharPtr[index], fdbp->fd_seq))
10585                         return FALSE;
10586                 }
10587                 MemFree(fdbp->AmbCharPtr);
10588                 fdbp->AmbCharPtr = NULL;
10589             }
10590         }
10591         fdbp->num_of_seqs++;
10592         break;
10593     case FASTA_FORMATDB_AMB: {
10594         Int4 len;
10595         Char tmpbuff[1024];
10596         /* In case of "formatdb" nucleotides have to be compressed */
10597 
10598         fdbp->AmbCharPtr = NULL;
10599 
10600         if (bsp->seq_data_type == Seq_code_ncbi4na && bsp->seq_data != NULL){
10601 
10602             /* ncbi4na require compression into ncbi2na */
10603 
10604         if (fdbp->options->version > FORMATDB_VER_TEXT)
10605         {
10606                 if((bsp->seq_data = (SeqDataPtr) BSCompressDNANew((ByteStorePtr) bsp->seq_data, bsp->length,
10607                                               &(fdbp->AmbCharPtr))) == NULL) {
10608                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10609                              "Formating failed.\n");
10610                     return FALSE;
10611                 }
10612          }
10613          else
10614          {
10615                 if((bsp->seq_data = (SeqDataPtr) BSCompressDNA((ByteStorePtr) bsp->seq_data, bsp->length,
10616                                               &(fdbp->AmbCharPtr))) == NULL) {
10617                     ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10618                              "Formating failed.\n");
10619                     return FALSE;
10620                 }
10621          }
10622             bsp->seq_data_type = Seq_code_ncbi2na; /* just for information */
10623         } else {
10624             /* if sequence already in ncbi2na format we have to update last byte */
10625             Uint1 ch, remainder;
10626 
10627             if((remainder = (bsp->length%4)) == 0) {
10628                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4+1, SEEK_SET);
10629                 BSPutByte((ByteStorePtr) bsp->seq_data, NULLB);
10630             } else {
10631                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10632                 ch = remainder + BSGetByte((ByteStorePtr) bsp->seq_data);
10633                 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10634                 BSPutByte((ByteStorePtr) bsp->seq_data, ch);
10635             }
10636         }
10637         /* Now dumping sequence */
10638 
10639         BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
10640         while((len = BSRead((ByteStorePtr) bsp->seq_data, tmpbuff, sizeof(tmpbuff))) != 0) {
10641             BLASTFileFunc(bsp, FASTA_SEQLINE, tmpbuff, len, data);
10642         }
10643 
10644         BLASTFileFunc(bsp, FASTA_EOS, NULL, 0, data);
10645     }
10646 
10647     break;
10648     default:
10649         break;
10650     }
10651     return TRUE;
10652 }
10653 
/* ----------------- Processing ASN.1 with formatdb ----------------- */
10655 
/* Working state used by FDB_FastaFileFunc() and FDBSeqEntry_callback()
   while the sequences of one SeqEntry are added to a formatdb database.
   Created by FDB_SEDataInfoNew(), released by FDB_SEDataInfoFree(). */
typedef struct _FDB_SEDataInfo {
    CharPtr         seqid;          /* buffer that receives the FASTA_ID text */
    ByteStorePtr    bsp;            /* sequence bytes collected from FASTA_SEQLINE */
    Int4            length;         /* number of bytes written to bsp so far */
    Uint1           seq_data_type;  /* data type passed to FDBAddSequence() with bsp */
    CharPtr         defline;        /* buffer that receives the FASTA_DEFLINE text */
    FastaDat PNTR   tfp;            /* traversal state handed to SeqEntryFasta() */
    FormatDBPtr     fdbp;           /* database the sequences are added to */
} FDB_SEDataInfo, PNTR FDB_SEDataInfoPtr;
10665 
FDB_FastaFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)10666 static Boolean FDB_FastaFileFunc(BioseqPtr bsp, Int2 key, CharPtr buf,
10667                                  Uint4 buflen, Pointer data)
10668 {
10669     FDB_SEDataInfoPtr fsedip;
10670 
10671     if((fsedip = data) == NULL)
10672         return TRUE;
10673 
10674     switch (key) {
10675     case FASTA_DEFLINE:
10676         MemCpy(fsedip->defline, buf, buflen);
10677         fsedip->defline[buflen] = NULLB;
10678         break;
10679     case FASTA_SEQLINE:
10680         BSWrite(fsedip->bsp, buf, buflen);
10681         fsedip->length += buflen;
10682         break;
10683     case FASTA_ID:
10684         MemCpy(fsedip->seqid, buf, buflen);
10685         fsedip->seqid[buflen] = NULLB;
10686         break;
10687     case FASTA_EOS:   /* end of sequence */
10688         /* Here we should add new entry to FD database and reset
10689            all spaces */
10690 
10691         FDBAddSequence(fsedip->fdbp, NULL, &fsedip->seq_data_type,
10692                        &fsedip->bsp, fsedip->length,
10693                        fsedip->seqid, fsedip->defline,
10694                        0, 0, 0, 0, 0);
10695 
10696         BSSeek(fsedip->bsp, 0, SEEK_SET);
10697         BSDelete(fsedip->bsp, BSLen(fsedip->bsp));
10698         fsedip->length = 0;
10699 
10700         break;
10701     }
10702 
10703     return TRUE;
10704 }
10705 
FDB_SEDataInfoNew(void)10706 FDB_SEDataInfoPtr FDB_SEDataInfoNew(void)
10707 {
10708     FDB_SEDataInfoPtr fsedip;
10709     MyFsa PNTR mfp;
10710 
10711     fsedip = MemNew(sizeof(FDB_SEDataInfo));
10712 
10713     fsedip->tfp  = MemNew(sizeof (FastaDat));
10714     mfp  = MemNew(sizeof (MyFsa));
10715     fsedip->tfp->mfp = mfp;
10716 
10717     mfp->buf     = MemNew(255);
10718     mfp->buflen  = 254;
10719     mfp->seqlen  = 254;
10720     mfp->myfunc  = FDB_FastaFileFunc;
10721     mfp->bad_asn1        = FALSE;
10722     mfp->order           = 0;
10723     mfp->accession       = NULL;
10724     mfp->organism        = NULL;
10725     mfp->do_virtual      = TRUE;
10726     mfp->tech            = 0;
10727     mfp->no_sequence     = FALSE;
10728     mfp->formatdb        = FALSE;
10729     mfp->mydata          = fsedip; /* ... */
10730 
10731     fsedip->tfp->group_segs = 1; /*** to trigger delta's and maps ***/
10732     fsedip->tfp->last_indent = -1;
10733     fsedip->tfp->parts = -1;
10734     fsedip->tfp->seg = -1;
10735     fsedip->tfp->got_one = FALSE;
10736 
10737 
10738     if(fsedip->seqid == NULL)
10739         fsedip->seqid = MemNew(fsedip->tfp->mfp->buflen+1);
10740 
10741     fsedip->bsp = BSNew(2048);
10742 
10743     if(fsedip->defline == NULL){
10744         fsedip->defline = MemNew(fsedip->tfp->mfp->buflen+1);
10745     }
10746     return fsedip;
10747 }
10748 
/*
 * Release an FDB_SEDataInfo and everything FDB_SEDataInfoNew() allocated
 * for it: the MyFsa line buffer, the MyFsa, the FastaDat, the byte store
 * and the seqid/defline buffers.  The fdbp member is not touched.
 *
 * A NULL fsedip, or one whose tfp/mfp was never set, is accepted.
 */
void FDB_SEDataInfoFree(FDB_SEDataInfoPtr fsedip)
{
    if (fsedip == NULL)
        return;

    if (fsedip->tfp != NULL) {
        if (fsedip->tfp->mfp != NULL) {
            MemFree(fsedip->tfp->mfp->buf);
            MemFree(fsedip->tfp->mfp);
        }
        MemFree(fsedip->tfp);
    }

    BSFree(fsedip->bsp);
    MemFree(fsedip->defline);
    MemFree(fsedip->seqid);

    MemFree(fsedip);
}
10760 
/*
 * SeqEntryExplore() callback used by FDBAddSeqEntry(); "data" is the
 * FDB_SEDataInfo of the run.
 *
 *  - Entries that are not a Bioseq are passed straight to SeqEntryFasta().
 *  - Bioseqs of the wrong molecule kind, and segmented or virtual Bioseqs,
 *    are passed to SeqEntryFasta() with no_sequence switched on.
 *  - Raw and const Bioseqs: SeqEntryFasta() runs with no_sequence on (which
 *    fills seqid and defline through the callback), then the Bioseq's own
 *    seq_data is handed to FDBAddSequence().
 *  - Any other representation (for example delta): SeqEntryFasta() runs
 *    with no_sequence off, so the sequence arrives through the callback.
 */
static void FDBSeqEntry_callback (SeqEntryPtr sep, Pointer data,
                                  Int4 index, Int2 indent)
{
    FDB_SEDataInfoPtr ctx = (FDB_SEDataInfoPtr) data;
    BioseqPtr         bioseq = NULL;
    Boolean           is_na;

    if (ctx == NULL)
        return;

    if (!IS_Bioseq(sep)) {
        SeqEntryFasta(sep, ctx->tfp, index, indent);
        return;
    }

    bioseq = sep->data.ptrvalue;
    is_na = ISA_na(bioseq->mol);

    /* Only one kind of molecule is formatted per database, and segmented
       or virtual sequences are never indexed. */
    if (ctx->fdbp->options->is_protein != !is_na
        || bioseq->repr == Seq_repr_seg
        || bioseq->repr == Seq_repr_virtual) {
        ctx->tfp->mfp->no_sequence = TRUE;
        SeqEntryFasta(sep, ctx->tfp, index, indent);
        return;
    }

    ctx->tfp->last_indent = -1;

    if (bioseq->repr != Seq_repr_raw && bioseq->repr != Seq_repr_const) {
        /* e.g. delta sequences: let the FASTA writer produce the data */
        ctx->seq_data_type = ctx->tfp->mfp->code;
        ctx->length = 0;
        ctx->tfp->mfp->no_sequence = FALSE;
        BSSeek(ctx->bsp, 0, SEEK_SET);
        SeqEntryFasta(sep, ctx->tfp, index, indent);
        return;
    }

    /* Raw/const: collect only seqid and defline ... */
    ctx->tfp->mfp->no_sequence = TRUE;
    SeqEntryFasta(sep, ctx->tfp, index, indent);

    /* ... and add the sequence data the Bioseq already carries */
    FDBAddSequence(ctx->fdbp, NULL, &bioseq->seq_data_type,
                   (ByteStorePtr PNTR) &bioseq->seq_data, bioseq->length,
                   ctx->seqid, ctx->defline, 0, 0, 0, 0, 0);

    return;
}
10823 
FDBAddSeqEntry(FormatDBPtr fdbp,SeqEntryPtr sep)10824 Boolean FDBAddSeqEntry(FormatDBPtr fdbp, SeqEntryPtr sep)
10825 {
10826     FDB_SEDataInfoPtr fsedip;
10827 
10828     fsedip = FDB_SEDataInfoNew();
10829     fsedip->fdbp = fdbp;
10830 
10831     fsedip->tfp->is_na = !fsedip->fdbp->options->is_protein;
10832 
10833     if (fsedip->tfp->is_na){
10834         fsedip->tfp->mfp->code = Seq_code_iupacna;
10835     } else {
10836         fsedip->tfp->mfp->code = Seq_code_ncbistdaa;
10837     }
10838 
10839     SeqEntryExplore(sep, fsedip, FDBSeqEntry_callback);
10840 
10841     FDB_SEDataInfoFree(fsedip);
10842 
10843     return TRUE;
10844 }
10845 
10846 
10847 /* ---------------------------------------------------------------------*/
/* ----------- End of functions that are used in formatdb ------------- */
10849 /* ---------------------------------------------------------------------*/
10850 
10851 
10852 /* ---------------------------------------------------------------------*/
10853 /* ------- Functions used to initialize and access blast taxonomy DB -- */
10854 /* ---------------------------------------------------------------------*/
10855 
10856 
RDBTaxInfoInit()10857 RDBTaxInfoPtr  RDBTaxInfoInit()
10858 {
10859     RDBTaxInfoPtr tip;
10860     Char buffer [1024], *filebuf = NULL;
10861     Uint4 value;
10862     Int4 i;
10863 
10864     tip = MemNew(sizeof(RDBTaxInfo));
10865 
10866     /* We do not suppose, that this database exists, but if it is
10867        exists we will intitialize it properly. So first message is just
10868        INFO, that database does not exists, but then - message will be
10869        ERROR if database is invalid */
10870 
10871     sprintf(buffer, "%s.bti", BLAST_TAXDB_FILENAME);
10872     filebuf = FindBlastDBFile(buffer);
10873     if((tip->taxfp = NlmOpenMFILE(filebuf)) == NULL) {
10874         ErrPostEx(SEV_INFO, 0, 0, "RDBTaxInfoInit: Unable to open %s", filebuf);
10875         MemFree(filebuf);
10876         MemFree(tip);
10877         return NULL;
10878     }
10879 
10880     filebuf[StringLen(filebuf)-1] = 'd';
10881     if((tip->name_fd = NlmOpenMFILE(filebuf)) == NULL) {
10882         ErrPostEx(SEV_ERROR, 0,0, "RDBTaxInfoInit: Unable to open %s", filebuf);
10883         NlmCloseMFILE(tip->taxfp);
10884         MemFree(filebuf);
10885         MemFree(tip);
10886         return NULL;
10887     }
10888     filebuf = MemFree(filebuf);
10889 
10890     /* Last check-up of the database validity */
10891     NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10892     if (Nlm_SwapUint4(value) != TAX_DB_MAGIC_NUMBER) {
10893         ErrPostEx(SEV_ERROR, 0, 0, "RDBTaxInfoInit: Invalid database",
10894                   buffer);
10895         NlmCloseMFILE(tip->taxfp);
10896         NlmCloseMFILE(tip->name_fd);
10897         MemFree(tip);
10898         return NULL;
10899     }
10900 
10901     NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10902     tip->all_taxid_count = Nlm_SwapUint4(value);
10903 
10904     for(i = 0; i < 4; i++) {
10905         NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10906         tip->reserved[i] = Nlm_SwapUint4(value);
10907     }
10908 
10909     /* Load the taxid/file offsets from the remaining of the index file */
10910     if (tip->taxfp->mfile_true) {
10911         tip->taxdata = (RDBTaxIdPtr) tip->taxfp->mmp;
10912         tip->taxdata_alloc = FALSE;
10913     } else {
10914 
10915         tip->taxdata = (RDBTaxIdPtr) MemNew(sizeof(RDBTaxId) *
10916                 tip->all_taxid_count);
10917         if ((tip->taxdata) == NULL) {
10918             ErrPostEx(SEV_ERROR, 0, 0, "RDBTaxInfoInit: Not enough memory to "
10919                     "load index table");
10920             NlmCloseMFILE(tip->taxfp);
10921             NlmCloseMFILE(tip->name_fd);
10922             MemFree(tip);
10923             return NULL;
10924         }
10925 
10926         for (i = 0; i < tip->all_taxid_count; i++) {
10927             NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10928             tip->taxdata[i].taxid = Nlm_SwapUint4(value);
10929             NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10930             tip->taxdata[i].offset = Nlm_SwapUint4(value);
10931         }
10932 
10933         tip->taxfp = NlmCloseMFILE(tip->taxfp);
10934         tip->taxdata_alloc = TRUE;
10935     }
10936 
10937     /* Only this thread will clean up this structure */
10938     tip->taxinfo_alloc = TRUE;
10939 
10940     return tip;
10941 }
10942 
10943 /* Free memory, unmap files etc. related to the taxonomy database */
RDBTaxInfoClose(RDBTaxInfoPtr tip)10944 void RDBTaxInfoClose(RDBTaxInfoPtr tip)
10945 {
10946     if(tip == NULL)
10947         return;
10948 
10949     if (tip->taxinfo_alloc) {
10950         if (tip->taxdata_alloc)
10951             MemFree(tip->taxdata);
10952         else
10953             NlmCloseMFILE(tip->taxfp);
10954 
10955         NlmCloseMFILE(tip->name_fd);
10956         MemFree(tip);
10957     } else {
10958         tip->taxfp = NlmCloseMFILE(tip->taxfp);
10959         tip->name_fd = NlmCloseMFILE(tip->name_fd);
10960         tip = MemFree(tip);
10961     }
10962 
10963     return;
10964 }
10965 
10966 /* Main function to get taxonomy names for given tax_id from
10967    blast taxonomy database. Returns NULL if tax_id is not in the database */
RDBGetTaxNames(RDBTaxInfoPtr tip,Int4 tax_id)10968 RDBTaxNamesPtr RDBGetTaxNames(RDBTaxInfoPtr tip, Int4 tax_id)
10969 {
10970     RDBTaxNamesPtr tnames;
10971     Int4 low_taxid, high_taxid;
10972     RDBTaxIdPtr taxdata;
10973     Int4 low_index, high_index, new_index, old_index, curr_taxid;
10974 
10975     if(tip == NULL)
10976         return NULL;
10977 
10978     taxdata = tip->taxdata;
10979 
10980     low_index = 0;
10981     high_index = tip->all_taxid_count-1;
10982 
10983     low_taxid = Nlm_SwapUint4(taxdata[low_index].taxid);
10984     high_taxid = Nlm_SwapUint4(taxdata[high_index].taxid);
10985 
10986     if(tax_id < low_taxid || tax_id > high_taxid)
10987         return NULL;
10988 
10989     new_index =  (low_index+high_index)/2;
10990     old_index = new_index;
10991 
10992     while(TRUE) {
10993 
10994         curr_taxid = Nlm_SwapUint4(taxdata[new_index].taxid);
10995 
10996         if (tax_id < curr_taxid) {
10997             high_index = new_index;
10998         } else if (tax_id > curr_taxid){
10999             low_index = new_index;
11000         } else { /* Got it ! */
11001             break;
11002         }
11003 
11004         new_index = (low_index+high_index)/2;
11005         if (new_index == old_index) {
11006             if (tax_id > curr_taxid) {
11007                 new_index++;
11008             }
11009             break;
11010         }
11011         old_index = new_index;
11012     }
11013 
11014     if(tax_id == Nlm_SwapUint4(taxdata[new_index].taxid)) {
11015         Char buffer[1024];
11016         CharPtr chptr = NULL, start_ptr = NULL;
11017 
11018         tnames = MemNew(sizeof(RDBTaxNames));
11019         tnames->tax_id = tax_id;
11020 
11021         NlmSeekInMFILE(tip->name_fd, Nlm_SwapUint4(taxdata[new_index].offset),
11022               SEEK_SET);
11023 
11024         NlmReadMFILE((Uint1Ptr)buffer,
11025                 Nlm_SwapUint4(taxdata[new_index+1].offset) -
11026                 Nlm_SwapUint4(taxdata[new_index].offset)+1, 1, tip->name_fd);
11027 
11028         start_ptr = buffer;
11029 
11030         /* Scientific name */
11031 
11032         if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11033             RDBTaxNamesFree(tnames);
11034             return NULL;
11035         }
11036 
11037         *chptr = NULLB;
11038         chptr++;
11039         tnames->sci_name = StringSave(start_ptr);
11040         start_ptr = chptr;
11041 
11042         /* Common name */
11043 
11044         if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11045             RDBTaxNamesFree(tnames);
11046             return NULL;
11047         }
11048 
11049         *chptr = NULLB;
11050         chptr++;
11051         tnames->common_name = StringSave(start_ptr);
11052         start_ptr = chptr;
11053 
11054         /* Blast name */
11055 
11056         if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11057             RDBTaxNamesFree(tnames);
11058             return NULL;
11059         }
11060 
11061         *chptr = NULLB;
11062         chptr++;
11063         tnames->blast_name = StringSave(start_ptr);
11064         start_ptr = chptr;
11065 
11066         /* Super - kingdom */
11067 
11068         tnames->s_king[0] = *start_ptr;
11069 
11070         /* fscanf(tip->name_fd, "%s\t%s\t%s\t%s",
11071            name1, name2, name3, tnames->s_king);
11072            tnames->sci_name = StringSave(name1);
11073            tnames->common_name = StringSave(name2);
11074            tnames->blast_name = StringSave(name3); */
11075 
11076         return tnames;
11077     }
11078 
11079     return NULL;
11080 }
11081 
readdb_get_taxnames(ReadDBFILEPtr rdfp,Int4 tax_id)11082 RDBTaxNamesPtr LIBCALL readdb_get_taxnames(ReadDBFILEPtr rdfp, Int4 tax_id)
11083 {
11084     RDBTaxInfoPtr tip;
11085     RDBTaxNamesPtr tnames = NULL;
11086 
11087     if((tip = rdfp->taxinfo) != NULL) {
11088         tnames = RDBGetTaxNames(tip, tax_id);
11089     }
11090 
11091     return tnames;
11092 }
11093 
11094 /************************************************************************/
11095 /*    The CommonIndex stuff                        */
11096 /************************************************************************/
11097 
11098 /* The function initializes CommonIndexPtr with give filename */
11099 
CommonIndexInit(CharPtr indexfilename)11100 CommonIndexHeadPtr    CommonIndexInit(CharPtr indexfilename)
11101 {
11102 
11103     Nlm_MemMapPtr    mmpindx;
11104     CommonIndexHeadPtr    cihp = (CommonIndexHeadPtr) MemNew(sizeof(CommonIndexHead));
11105     CharPtr        charptr = NULL;
11106 
11107     if (!(mmpindx = Nlm_MemMapInit(indexfilename))) {
11108         ErrPostEx(SEV_ERROR, 0, 0, "Could not open Common Index file.  Probably wrong path specified\n");
11109         CommonIndexDestruct(cihp); /* unable to find or parse config file. */
11110         return NULL;
11111     }
11112 
11113     cihp->maxgi = FileLength(indexfilename) / sizeof(CommonIndex);
11114     cihp->memmap = mmpindx;
11115     cihp->ci = (CommonIndexPtr) mmpindx->mmp_begin;
11116 
11117     /* read list of databases from the configuration file */
11118 
11119     charptr = Nlm_FilePathFind(indexfilename);
11120     if (!(cihp->num_of_DBs = ParseDBConfigFile(&(cihp->dbids), charptr))) {
11121         if (charptr)
11122         MemFree(charptr);
11123         CommonIndexDestruct(cihp); /* unable to find or parse config file. */
11124         return NULL;
11125     }
11126     if (charptr)
11127         MemFree(charptr);
11128 
11129     if (!(cihp->ci)) {
11130         return NULL;
11131     } else
11132         return cihp;
11133 }
11134 
/* Releases everything owned by a CommonIndexHead: the memory mapping, the
   name of each of the num_of_DBs database entries, the dbids array itself
   and finally the head.  A NULL cihp is accepted and ignored. */
void    CommonIndexDestruct(CommonIndexHeadPtr cihp) {

    Int2    i;

    /* nothing to release; also keeps the loop below from reading through
       a NULL pointer */
    if (cihp == NULL)
        return;

    if (cihp->memmap)
        Nlm_MemMapFini(cihp->memmap);

    if (cihp->dbids) {
        for (i = 0; i < cihp->num_of_DBs; i++) {
            if (cihp->dbids[i].name)
                MemFree(cihp->dbids[i].name);
        }
        MemFree(cihp->dbids);
    }

    MemFree(cihp);
}
11151 /* returns shift of bit for specified DB name */
11152 
DBShift(Int2 num_of_DBs,DataBaseIDPtr dbids,CharPtr dbname,Boolean is_prot)11153 Int2    DBShift(Int2 num_of_DBs, DataBaseIDPtr dbids, CharPtr dbname, Boolean is_prot)
11154 {
11155     Int2    i;
11156 
11157     if (!dbname) {
11158     ErrPostEx(SEV_ERROR, 0, 0, "Specified database name is NULL\n");
11159     return 0;
11160     }
11161 
11162     for(i=0; i < num_of_DBs; i++) {
11163     if(!StrCmp(dbname, (dbids+i)->name) && ((dbids+i)->isprot == is_prot)) {
11164         return (dbids+i)->id;
11165     }
11166     }
11167 
11168     return 0;
11169 }
11170 
11171 /* returns name of the database by given bit shift */
11172 
DBName(Int2 num_of_DBs,DataBaseIDPtr dbids,Int2 shift)11173 CharPtr    DBName(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift)
11174 {
11175     Int2      i;
11176 
11177     if (!shift) {
11178     ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift is zero\n");
11179     return NULL;
11180     }
11181 
11182     for(i=0; i < num_of_DBs; i++) {
11183     if((dbids+i)->id == shift) {
11184         return (dbids+i)->name;
11185     }
11186     }
11187     ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift %d is not known\n", shift);
11188     return NULL;
11189 }
11190 
11191 /* say if the database contains proteins */
11192 
DBisProt(Int2 num_of_DBs,DataBaseIDPtr dbids,Int2 shift)11193 Boolean    DBisProt(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift)
11194 {
11195     Int2      i;
11196 
11197     if (!shift) {
11198     ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift is zero\n");
11199     return FALSE;
11200     }
11201 
11202     for(i=0; i < num_of_DBs; i++) {
11203     if((dbids+i)->id == shift) {
11204         return (dbids+i)->isprot;
11205     }
11206     }
11207     ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift %d is not known\n", shift);
11208     return FALSE;
11209 }
11210 
/* Frees every node of the CommonIndexResult list starting at cir.
   A NULL list is accepted.  The list is walked iteratively so that the
   stack depth does not grow with the length of the list. */
void    CommonIndexResultDestruct(CommonIndexResultPtr cir)
{
    CommonIndexResultPtr next;

    while (cir != NULL) {
        next = cir->next;   /* remember the successor before freeing */
        MemFree(cir);
        cir = next;
    }
}
11220 
11221 /* returns OID by given GI */
GI2OID(CommonIndexHeadPtr cih,Int4 gi,Int4 dbmask,Int4 alias_dbmask,Int2Ptr dbid,Int2Ptr alias_dbid,ReadDBFILEPtr rdfp)11222 Int4    GI2OID(CommonIndexHeadPtr cih, Int4 gi, Int4 dbmask, Int4 alias_dbmask,
11223     Int2Ptr dbid, Int2Ptr alias_dbid, ReadDBFILEPtr rdfp)
11224 {
11225     CommonIndexResultPtr    cir, cir_start;
11226     Int4        retval=-1;
11227     Uint4        dbmask_tmp;
11228 
11229     /* gi is not in the database (or even in the common index).
11230        The most probable reason for this is that the gi was released
11231        after the database was built.  Return -1 to indicate that it
11232        is not in the database. */
11233     if (gi < 0 || gi >= cih->maxgi) {
11234         return -1;
11235     }
11236 
11237     cir_start = GIs2OIDs(cih, &gi, 1, dbmask | alias_dbmask, rdfp);
11238 
11239 
11240     /* Get oid in a real database */
11241     cir = cir_start;
11242     while (cir && (retval==-1)) {
11243     if (dbmask & (0x1<<cir->dbid)) {
11244         *dbid = cir->dbid;
11245         retval = cir->oid;
11246     }
11247     cir = cir->next;
11248     }
11249 
11250 
11251     /* now set dbid to correct alias database, if alias database mask specified */
11252     dbmask_tmp = SwapUint4(cih->ci[gi].dbmask);
11253     if (dbmask_tmp - dbmask > 0 && alias_dbid)
11254         *alias_dbid = bit_engine_firstbit(dbmask_tmp & alias_dbmask);
11255 
11256     CommonIndexResultDestruct(cir_start);
11257 
11258     return retval;
11259 }
11260 
11261 /*
11262    gets list of GI's and returns all OID for each database from the mask
11263    the GI belongs to.  dbmask == 0 means all databases.
11264    The list of OID is constructed as list of the CommonIndexResult items
11265    (see readdb.h for definition)
11266    noids - number of found oid on return
11267  */
11268 
GIs2OIDs(CommonIndexHeadPtr cih,Int4Ptr gis,Int4 number_of_gis,Int4 dbmask,ReadDBFILEPtr startrdfp)11269 CommonIndexResultPtr    GIs2OIDs(CommonIndexHeadPtr cih, Int4Ptr gis,
11270     Int4 number_of_gis, Int4 dbmask, ReadDBFILEPtr startrdfp)
11271 {
11272     Int4        i, gi, numDB, mask;
11273     Int2        firstpos, curfirstpos;
11274     CommonIndexPtr    cigi;
11275     CommonIndexResultPtr cir = NULL, cirfirst = NULL;
11276     Boolean        first = TRUE;
11277     ISAMObjectPtr    nisam_opt = NULL;
11278     ISAMErrorCode    error;
11279     Uint4        value;
11280     ReadDBFILEPtr    rdfp;
11281 
11282     /* for each given GI we need to check if this gi is in list */
11283 
11284     for(i=0; i < number_of_gis; i++) {
11285     gi = gis[i];
11286     if (gi < 0)
11287         continue;
11288 
11289     cigi = cih->ci + gi;
11290 
11291     /* mask says what DBs the GI belongs to */
11292         mask = SwapUint4(cigi->dbmask);
11293 
11294     if (dbmask && !(dbmask & mask)) {
11295         /* skip the gi if it is not in dbmask databases */
11296         continue;
11297     }
11298 
11299     numDB = bit_engine_numofbits(mask);
11300 
11301     if (numDB) {
11302         /* Okay, there is at least one database which contains such GI */
11303 
11304         /* Check if this is the "often" database for the GI */
11305         firstpos = bit_engine_firstbit(mask);
11306 
11307         /* dbmask == 0 means that we search for ALL DBs */
11308         if (!dbmask || (dbmask & (0x1 << firstpos))) {
11309         if (first) {
11310             /* create first if needed */
11311             cirfirst = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11312             first = FALSE;
11313             cir = cirfirst;
11314         } else {
11315             cir->next = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11316             cir = cir->next;
11317         }
11318         cir->gi = gi;
11319 
11320         /* we know that for the first database the often field is used */
11321         cir->oid = SwapUint4(cigi->oftenOID);
11322 
11323         cir->dbid = firstpos;
11324         cir->next = NULL;
11325         }
11326             curfirstpos = firstpos;
11327 
11328         /* do for the rest of databases */
11329             while (--numDB) {
11330         /* shift mask to get next database bit shift */
11331                 mask >>= (curfirstpos + 1);
11332                 curfirstpos = bit_engine_firstbit(mask);
11333         /* update absolute bit shift */
11334                 firstpos += curfirstpos + 1;
11335 
11336         if (!dbmask || (dbmask & (0x1 << firstpos))) {
11337 
11338             /* find OID using ISAM old index */
11339 
11340             rdfp = startrdfp;
11341             while (rdfp) {
11342             if (rdfp->filebit == firstpos) {
11343                 nisam_opt = rdfp->nisam_opt;
11344                 break;
11345             }
11346             rdfp = rdfp->next;
11347             }
11348 
11349             if (!nisam_opt) {
11350             /* that means that the database specified by 'firstpos' is mask */
11351             /* skip the database */
11352             continue;
11353             }
11354             if (first) {
11355             cirfirst = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11356             first = FALSE;
11357             cir = cirfirst;
11358             } else {
11359             cir->next = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11360             cir = cir->next;
11361             }
11362 
11363             cir->gi = gi;
11364 
11365             /* Initialize and perform the ISAM search */
11366             if((error = NISAMSearch(nisam_opt, gi, &value, NULL)) < 0) {
11367             ErrPostEx(SEV_ERROR, 0, 0, "Failed to initialize ISAM search");
11368             return NULL;
11369             }
11370 
11371             if(error == ISAMNotFound) {
11372             ErrPostEx(SEV_ERROR, 0, 0, "Internal error inside GIs2OIDs(), we expected to find this GI into the database\n");
11373             }
11374 
11375             cir->oid = (Int4) value;
11376 
11377             cir->dbid = firstpos;
11378             cir->next = NULL;
11379         }
11380             }
11381     }
11382     }
11383     /* return first item of the list */
11384     return cirfirst;
11385 }
11386 
11387 
FindDBbyGI(CommonIndexHeadPtr cih,Int4 gi,Uint1 * is_prot)11388 CharPtr    FindDBbyGI(CommonIndexHeadPtr cih, Int4 gi, Uint1 *is_prot)
11389 {
11390     Int4        numDB, mask;
11391     Int2        firstpos;
11392     CommonIndexPtr    cigi;
11393 
11394    if (gi > cih->maxgi)
11395     return NULL;
11396 
11397     cigi = cih->ci + gi;
11398     mask = SwapUint4(cigi->dbmask);
11399 
11400     numDB = bit_engine_numofbits(mask);
11401 
11402     if (numDB) {
11403     firstpos = bit_engine_firstbit(mask);
11404     *is_prot = DBisProt(cih->num_of_DBs, cih->dbids, firstpos);
11405     return DBName(cih->num_of_DBs, cih->dbids, firstpos);
11406     } else {
11407     return NULL;
11408     }
11409 
11410 }
11411 
/* Returns the zero-based position of the lowest-order set bit of `word`
   (0 for bit value 0x1), or -1 when no bit is set.  The scan is done on an
   unsigned copy so that no signed value is ever shifted into or past the
   sign bit. */
Int2    bit_engine_firstbit (Int4 word)
{
    Uint4   bits = (Uint4) word;
    Int2    pos;

    if (bits == 0)
        return -1;

    /* at least one bit is set, so the loop terminates */
    for (pos = 0; (bits & 0x1) == 0; pos++)
        bits >>= 1;

    return pos;
}
11425 
/* Returns the number of bits that are set in the given "word".
   Counting is done on an unsigned copy, clearing the lowest set bit on
   each pass, so no signed shift is involved. */
Int2    bit_engine_numofbits(Int4 word)
{
    Uint4   bits = (Uint4) word;
    Int2    count = 0;

    while (bits != 0) {
        bits &= bits - 1;   /* drops the lowest set bit */
        count++;
    }

    return count;
}
11444 /* returns:
11445    1. list of dbid shifts
11446    2. number of dbs
11447  */
11448 
bit_engine_arr(Int4 word)11449 Int2Ptr    bit_engine_arr(Int4 word)
11450 {
11451     Int2    i;
11452     Int4    tmpbit = 0x1;
11453     Int2Ptr    retval;
11454     Int2    count = 0;
11455 
11456     retval = (Int2Ptr) MemNew(sizeof(Int2)*8*sizeof(Int4));
11457 
11458     if (!word) {
11459     retval[0] = 0;
11460     return retval;
11461     }
11462 
11463     for (i=0; i < 8*sizeof(Int4); i++, tmpbit <<= 1) {
11464     if (word & tmpbit) {
11465         retval[count+1] = i;
11466         count++;
11467     }
11468     }
11469     retval[0] = count;
11470 
11471     return retval;
11472 }
11473 
11474 /************************************************************************/
11475 /* END    The CommonIndex stuff                        */
11476 /************************************************************************/
11477 
11478 /************************************************************************/
11479 /*    The functions used with ID1 dump stuff                */
11480 /************************************************************************/
11481 
11482 /* This function iterates through the array of function poiters, invoking each
11483  * one and returning the logical AND of each of these function's return
11484  * values. */
DB_Subset(GMSubsetDataPtr gmsdp,DI_Record direc)11485 Boolean    DB_Subset (GMSubsetDataPtr gmsdp, DI_Record direc)
11486 {
11487     Boolean retval;
11488     Int4 i;
11489 
11490     retval = (*gmsdp->criteria[0])((void *)&direc);
11491     for (i = 1; i < gmsdp->count; i++) {
11492         retval = (retval && (*gmsdp->criteria[i])((void *)&direc));
11493     }
11494 
11495     return retval;
11496 }
11497 
is_EST_HUMAN(VoidPtr direc)11498 Boolean   is_EST_HUMAN (VoidPtr direc)
11499 {
11500     return (((DI_RecordPtr)direc)->taxid == 9606);
11501 }
is_EST_MOUSE(VoidPtr direc)11502 Boolean   is_EST_MOUSE (VoidPtr direc)
11503 {
11504     return (((DI_RecordPtr)direc)->taxid == 10090 ||
11505             ((DI_RecordPtr)direc)->taxid == 10091 ||
11506             ((DI_RecordPtr)direc)->taxid == 10092 ||
11507             ((DI_RecordPtr)direc)->taxid == 35531 ||
11508             ((DI_RecordPtr)direc)->taxid == 80274 ||
11509             ((DI_RecordPtr)direc)->taxid == 57486);
11510 }
is_EST_OTHERS(VoidPtr direc)11511 Boolean   is_EST_OTHERS (VoidPtr direc)
11512 {
11513     return (!is_EST_HUMAN(direc) && !is_EST_MOUSE(direc));
11514 }
11515 
is_SWISSPROT(VoidPtr direc)11516 Boolean   is_SWISSPROT (VoidPtr direc)
11517 {
11518     return (((DI_RecordPtr)direc)->owner == 6);
11519 }
11520 
is_MONTH(VoidPtr direc)11521 Boolean   is_MONTH (VoidPtr direc)
11522 {
11523     return (((DI_RecordPtr)direc)->gi_threshold != -1 &&
11524             ((DI_RecordPtr)direc)->gi > ((DI_RecordPtr)direc)->gi_threshold);
11525 }
11526 
is_PDB(VoidPtr direc)11527 Boolean   is_PDB (VoidPtr direc)
11528 {
11529     return (((DI_RecordPtr)direc)->owner == 10);
11530 }
11531 
11532 /* Criteria for determining whether a sequence is refseq:
11533    First 2 characters of the accession are letters, 3rd character is an '_',
11534    and it must be at least kMinAccessionLength characters long.
11535    Updated per suggestion from Misha Kimelman (via email)
11536  */
is_REFSEQ(VoidPtr direc)11537 Boolean is_REFSEQ(VoidPtr direc)
11538 {
11539     const int kMinAccessionLength = 9;
11540     const char* accession = ((DI_RecordPtr)direc)->acc;
11541 
11542     if ((StringLen(accession) >= kMinAccessionLength) &&
11543         IS_ALPHA(accession[0]) &&
11544         IS_ALPHA(accession[1]) &&
11545         (accession[2] == '_')) {
11546         return TRUE;
11547     } else {
11548         return FALSE;
11549     }
11550 }
11551 
is_REFSEQ_GENOMIC(VoidPtr ptr)11552 Boolean is_REFSEQ_GENOMIC(VoidPtr ptr)
11553 {
11554     return (is_REFSEQ(ptr) && !is_REFSEQ_RNA(ptr));
11555 }
11556 
11557 /* Criteria for determining whether a sequence belongs in the refseq_rna
11558    database. This is a subset of the sequences identified by is_REFSEQ with the
11559    additional constraint that the molecule type must be RNA
11560  */
is_REFSEQ_RNA(VoidPtr ptr)11561 Boolean is_REFSEQ_RNA(VoidPtr ptr)
11562 {
11563     DI_RecordPtr direc = (DI_RecordPtr)ptr;
11564     if (!is_REFSEQ(direc)) {
11565         return FALSE;
11566     }
11567 
11568     return (direc->mol == Seq_mol_rna);
11569 }
11570 
is_CONTIG(VoidPtr direc)11571 Boolean is_CONTIG(VoidPtr direc)
11572 {
11573     return (((DI_RecordPtr)direc)->owner == 28);
11574 }
11575 
FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list)11576 CharPtr FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list)
11577 {
11578     SeqIdPtr     sip;
11579     TextSeqIdPtr tsip;
11580     CharPtr      acc;
11581 
11582     if(seqid_list == NULL)
11583         return NULL;
11584 
11585     for(acc = NULL, sip = seqid_list; sip != NULL; sip = sip->next)
11586     {
11587         if(sip->choice != SEQID_GENBANK && sip->choice != SEQID_EMBL &&
11588            sip->choice != SEQID_DDBJ && sip->choice != SEQID_OTHER)
11589             continue;
11590 
11591         tsip = (TextSeqIdPtr) sip->data.ptrvalue;
11592         if(tsip == NULL || tsip->accession == NULL ||
11593            tsip->accession[0] == '\0')
11594             continue;
11595 
11596         acc = StringSave(tsip->accession);
11597         break;
11598     }
11599     return(acc);
11600 }
11601 
11602 
11603 /* Function scans .pdi or .ndi file and do callback() for this gi and oid */
11604 
ScanDIFile(CharPtr difilename,GMSubsetDataPtr gmsubsetdp,Boolean (* callback)(DI_RecordPtr direc,VoidPtr data),VoidPtr data,FILE * out,Int4 gi_threshold)11605 Boolean    ScanDIFile(CharPtr difilename, GMSubsetDataPtr gmsubsetdp,
11606     Boolean (*callback)(DI_RecordPtr direc, VoidPtr data), VoidPtr data,
11607     FILE *out, Int4 gi_threshold)
11608 {
11609     static const int kNumFieldsDiFile = 10;
11610     FILE        *fdi;
11611     DI_Record    direc;
11612     Char        skipstr1[128], accession[128];
11613     long        skipdate = 0;
11614     int            readstat, total=0, progress_count=0, mol_type=0;
11615     int         prev_oid = -1;      /* helps to keep track of sequences which
11616                                        have been merged in a non-redundant
11617                                        database (i.e.: protein dbs) */
11618 #ifdef SHOW_PROGRESS
11619     Int4        progress_chunk = 100;
11620 #endif
11621 
11622 
11623     /* open index file */
11624     fdi = FileOpen(difilename, "r");
11625 
11626     if (!fdi) {
11627         fprintf(out, "\nERROR: cannot open '%s'", difilename);
11628         return FALSE;
11629     }
11630 
11631     /* set gi threshold for month subset */
11632     direc.gi_threshold = gi_threshold;
11633     MemSet(accession, NULLB, 128);
11634 
11635     /* each line in index file looks like: */
11636     /* 2933800 5769963 9606 8 EST 427 -2021038615 38990825 M61958 2 */
11637     while ((readstat = fscanf(fdi, "%ld %ld %ld %ld %s %ld %ld %ld %s %u",
11638                               (long *) &direc.oid, (long *) &direc.gi,
11639                               (long *) &direc.taxid, (long *) &direc.owner,
11640                               skipstr1, (long *) &direc.len,
11641                               (long *) &direc.hash, (long *) &skipdate,
11642                               accession, (unsigned int *) &mol_type)) ==
11643            kNumFieldsDiFile) {
11644         direc.acc = StringSave(accession);
11645         direc.mol = (Uint1)mol_type;
11646         direc.gi_threshold = gi_threshold;
11647         /*direc.oid += *curr_oid;*/
11648         if (DB_Subset(gmsubsetdp, direc)) {
11649             /* In the case of non-redundant databases, identical sequences will
11650              * be merged into the same sequence and have the same oid. Entries
11651              * with the same oid should only be counted once. */
11652             if (prev_oid != direc.oid) {
11653                 callback(&direc, data);
11654                 prev_oid = direc.oid;
11655             }
11656             progress_count++;
11657         }
11658         direc.acc = MemFree(direc.acc);
11659         MemSet((void*) accession, NULLB, sizeof(accession));
11660         MemSet((void*) &direc, NULLB, sizeof(direc));
11661 #ifdef SHOW_PROGRESS
11662         if (!(total % progress_chunk)) {
11663             if (progress_count < progress_chunk/3) {
11664                 printf (".");
11665             } else if (progress_count > 2*progress_chunk/3) {
11666                 printf ("X");
11667             } else {
11668                 printf ("x");
11669             }
11670             progress_count = 0;
11671             fflush(out);
11672         }
11673 #endif
11674 
11675         total++;
11676     }
11677 
11678     if (readstat != EOF) {
11679         fprintf(out, "\nError occurred while parsing line %d, %s "
11680                 "(read %d fields instead of the expected %d)", total+1,
11681                 difilename, readstat, kNumFieldsDiFile);
11682         return FALSE;
11683     }
11684     FILECLOSE(fdi);
11685 
11686     return TRUE;
11687 }
11688 /************************************************************************/
11689 /* END    The functions used with ID1 dump stuff                */
11690 /************************************************************************/
11691 
11692 /************************************************************************/
11693 /*        Fastacmd API                                           */
11694 /************************************************************************/
11695 
GetAccList(CharPtr file,Int4Ptr TotalItems)11696 FCMDAccListPtr LIBCALL GetAccList(CharPtr file, Int4Ptr TotalItems)
11697 {
11698     Char TmpBuff[128];
11699     Int4 i, j, k;
11700     Int4 FileLen = 0;
11701     FCMDAccListPtr AccList = NULL;
11702     FCMDAccListPtr AccListTmp, AccListLast;
11703     Int4 NumNotValid = 0;
11704     Int4 gi = 0;
11705 
11706   if(file == NULL || file[0] == NULLB) {
11707     *TotalItems = 0;
11708     return NULL;
11709   }
11710 
11711   FileLen = StringLen(file);
11712 
11713   for(i = 0; i < FileLen; i++) {
11714 
11715       if(isspace((int)file[i]) || file[i] == ',') /* Rolling spaces */
11716           continue;
11717 
11718       /* This is defence from badly formatted requests */
11719 
11720       if(NumNotValid > 10) {
11721           ErrPostEx(SEV_ERROR, 0, 0, "**** ERROR: Too many invalid Gis/Accessions, "
11722                  "parsing aborted\n");
11723           *TotalItems = 0;
11724           return NULL;
11725       }
11726 
11727       /* Rolling spaces */
11728 
11729       j= 0;
11730       while (j < 128  && i < FileLen) {
11731           TmpBuff[j] = TO_LOWER(file[i]);
11732           j++; i++;
11733           if(isspace((int)file[i]) ||
11734              file[i] == ',' || /* Comma is valid delimiter */
11735              file[i] == '\n')
11736               break;
11737       }
11738       TmpBuff[j] = NULLB;
11739 
11740       /* Is gi/accession too long ??? */
11741 
11742       if(j == 128) {
11743           ErrPostEx(SEV_WARNING, 0, 0, "Gi/Accession \"%s\" is too long\r\n",
11744                  TmpBuff);
11745           NumNotValid++;
11746 
11747           while(!isspace((int)file[i]) ||
11748                 file[i] == ',' ||
11749                 file[i] == NULLB) /* Rolling until spaces */
11750               i++;
11751           continue;  /* Next may be valid ... who knows...?? */
11752       }
11753 
11754       /* Now validating accession/gi */
11755 
11756       for(k =0; k < j; k++) {
11757           if(!IS_DIGIT(TmpBuff[k])) {
11758               break;
11759           }
11760       }
11761 
11762       gi = 0;
11763       if(k == j)
11764           gi = atol(TmpBuff);
11765 
11766        if (gi == 0) {
11767            if (StringChr(TmpBuff, '|') != NULL) {
11768                SeqIdPtr sip = SeqIdParse(TmpBuff);
11769                SeqIdPtr sip_var = sip;
11770                SeqIdPtr best_acc = SeqIdFindBestAccession(sip);
11771                if (best_acc)
11772                {
11773                    switch (best_acc->choice)
11774                    {   /* Only TextSeqIdPtrs */
11775                       case SEQID_GENBANK:
11776                       case SEQID_EMBL:
11777                       case SEQID_DDBJ:
11778                       case SEQID_PIR:
11779                       case SEQID_SWISSPROT:
11780                       case SEQID_PRF:
11781                       case SEQID_OTHER:
11782                       case SEQID_TPG:
11783                       case SEQID_TPE:
11784                       case SEQID_TPD:
11785                       case SEQID_GPIPE:
11786                         SeqIdWrite(best_acc, TmpBuff, PRINTID_TEXTID_ACC_VER, 128);
11787                         break;
11788                       default:
11789                         break;
11790                    }
11791                }
11792                while (sip_var)
11793                {
11794                   if (sip_var->choice == SEQID_GI)
11795                   {
11796                      gi = sip_var->data.intvalue;
11797                      break;
11798                   }
11799                   sip_var = sip_var->next;
11800                }
11801            }
11802       }
11803 
11804       /* If this is valid Accession check and tranfer it to gi */
11805 
11806       /* It we come here - we got valid text ID */
11807 
11808       if(AccList == NULL) { /* first element */
11809           AccList = (FCMDAccListPtr) MemNew(sizeof(FCMDAccList));
11810           AccListTmp = AccList;
11811           AccListTmp->acc = StringSave(TmpBuff);
11812           AccListTmp->gi = gi;
11813           AccListTmp->next = NULL;
11814           AccListLast=AccListTmp;
11815           *TotalItems = *TotalItems +1;
11816       } else {
11817           AccListTmp = (FCMDAccListPtr) MemNew(sizeof(FCMDAccList));
11818           AccListLast->next = AccListTmp;
11819           AccListTmp->acc = StringSave(TmpBuff);
11820           AccListTmp->gi = gi;
11821           AccListTmp->next = NULL;
11822           AccListLast = AccListTmp;
11823           *TotalItems = *TotalItems +1;
11824       }
11825   }
11826   if(NumNotValid) {
11827       ErrPostEx(SEV_ERROR, 0, 0, "**** %d invalid Gi%s/Accession%s present in fastacmd "
11828              "request\r\n",
11829              NumNotValid,
11830              NumNotValid == 1 ? "" : "s",
11831              NumNotValid == 1 ? "" : "s"
11832              );
11833   }
11834   return AccList;
11835 }
11836 
FCMDAccListFree(FCMDAccListPtr falp)11837 void LIBCALL FCMDAccListFree(FCMDAccListPtr falp)
11838 {
11839     FCMDAccListPtr falp_tmp, falp_next;
11840 
11841     if(falp == NULL)
11842         return;
11843 
11844     for(falp_tmp = falp; falp_tmp != NULL; falp_tmp=falp_next) {
11845         falp_next = falp_tmp->next;
11846         MemFree(falp_tmp->acc);
11847         MemFree(falp_tmp);
11848     }
11849 }
11850 
Fastacmd_PrintTaxonomyInfo(ReadDBFILEPtr rdfp,Int4 oid,FILE * fp,Int4 linelen)11851 static Boolean Fastacmd_PrintTaxonomyInfo(ReadDBFILEPtr rdfp, Int4 oid,
11852         FILE *fp, Int4 linelen)
11853 {
11854     RDBTaxNamesPtr  tnames = NULL;
11855     BlastDefLinePtr bdp = NULL, bdp_tmp;
11856     Char buf[128];
11857 
11858     if (rdfp == NULL || fp == NULL)
11859         return FALSE;
11860 
11861     if ((bdp = FDReadDeflineAsn(rdfp, oid)) == NULL)
11862         return FALSE;
11863 
11864     asn2ff_set_output(fp, NULL);
11865     ff_StartPrint(0, 0, linelen, NULL);
11866 
11867     /* Print the taxonomy report for each sequence associated with this oid */
11868     for (bdp_tmp = bdp; bdp_tmp; bdp_tmp = bdp_tmp->next) {
11869 
11870     /* skip irrelevant sequences if a gi target was specified */
11871     SeqIdPtr gi = SeqIdFindBest(bdp_tmp->seqid, SEQID_GI);
11872     if ( gi && (rdfp->gi_target != 0) && (gi->data.intvalue != rdfp->gi_target) )
11873         continue;
11874 
11875         MemSet(buf, 0, sizeof(buf));
11876         SeqIdWrite(bdp_tmp->seqid, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
11877 
11878         if (bdp_tmp->taxid == 0) {
11879             ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information not encoded for "
11880                     "Seq-id '%s'", buf);
11881             continue;
11882         }
11883 
11884         if ((tnames = RDBGetTaxNames(rdfp->taxinfo, bdp_tmp->taxid)) == NULL) {
11885             ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not available "
11886                     "for Seq-id '%s'.\nIf you have not done so already, "
11887                     "please update your copy from: %s\n", buf, TAXDB_ON_FTP);
11888             continue;
11889         }
11890 
11891         ff_AddString("NCBI sequence id: ");
11892         ff_AddString(buf); NewContLine();
11893 
11894         ff_AddString("NCBI taxonomy id: ");
11895         ff_AddInteger("%ld", bdp_tmp->taxid);
11896         NewContLine();
11897 
11898         ff_AddString("Common name: ");
11899         ff_AddString(tnames->common_name); NewContLine();
11900         ff_AddString("Scientific name: ");
11901         ff_AddString(tnames->sci_name); NewContLine();
11902         if (bdp_tmp->next)
11903             NewContLine();
11904         RDBTaxNamesFree(tnames);
11905     }
11906 
11907     ff_EndPrint();
11908     BlastDefLineSetFree(bdp);
11909 
11910     return TRUE;
11911 }
11912 
11913 /* Prints the output for the -I option in fastacmd */
11914 static Boolean
Fastacmd_PrintDbFullInformation(ReadDBFILEPtr rdfp,CharPtr databases,Int4 linelen,FILE * out)11915 Fastacmd_PrintDbFullInformation(ReadDBFILEPtr rdfp, CharPtr databases,
11916                                 Int4 linelen, FILE *out)
11917 {
11918     Boolean is_prot;
11919     CharPtr base_filename;
11920     Char buf[256];
11921     Int4 path_len;
11922 
11923     is_prot = (rdfp->parameters & READDB_IS_PROT) ? TRUE : FALSE;
11924     PrintDbInformationWithRID(databases, is_prot, linelen, out, FALSE, NULL, FALSE);
11925 
11926     asn2ff_set_output(out, NULL);
11927     ff_StartPrint(0, 0, linelen, NULL);
11928 
11929     ff_AddString("File name");
11930     if (rdfp->next)
11931         ff_AddString("s:");
11932     else
11933         ff_AddString(":");
11934     NewContLine();
11935 
11936     for (; rdfp; rdfp = rdfp->next) {
11937 
11938         /** Print file name **/
11939         if (rdfp->aliasfilename && rdfp->oidlist) { /* subset database */
11940             base_filename = StringRChr(rdfp->full_filename,'/');
11941             MemSet(buf, 0, sizeof(buf));
11942             path_len = StringLen(rdfp->full_filename) -
11943                         StringLen(base_filename);
11944             if (path_len > 0 && path_len < sizeof(buf)
11945                     && base_filename != NULL) {
11946                 StringNCpy(buf, rdfp->full_filename, path_len+1);
11947                 StringNCat(buf, rdfp->aliasfilename, sizeof(buf)-1-path_len-1);
11948             } else {
11949                 StringNCpy(buf, rdfp->aliasfilename, sizeof(buf)-1);
11950             }
11951 
11952             ff_AddString(buf);
11953             base_filename = NULL;
11954         } else /* real database */
11955             ff_AddString(rdfp->full_filename);
11956         NewContLine();
11957 
11958         /** Print date **/
11959         TabToColumn(4);
11960         ff_AddString("Date: "); ff_AddString(rdfp->date);
11961 
11962         /** Print database version **/
11963         ff_AddString("    Version: ");
11964         ff_AddString(Ltostr((long)rdfp->formatdb_ver, 1));
11965 
11966         /** Print length of longest sequence **/
11967         ff_AddString("    Longest sequence: ");
11968         ff_AddString(Ltostr((unsigned long)rdfp->maxlen, 1));
11969         if (readdb_is_prot(rdfp))
11970             ff_AddString(" res");
11971         else
11972             ff_AddString(" bp");
11973         NewContLine();
11974         TabToColumn(0);
11975     }
11976 
11977     ff_EndPrint();
11978     return TRUE;
11979 }
11980 
/* Parses a sequence location string of the form "start<delim>stop", where
 * <delim> is a space, comma or semicolon, into locations[0] (start) and
 * locations[1] (stop).
 *
 * Both values default to 0 when str is NULL, when str contains no token,
 * or when the corresponding part is missing.  Negative values are reset
 * to 0 after posting a warning.  The input string is not modified; the
 * parsing is done on a private copy that is released before returning. */
void Fastacmd_ParseLocations(const char* str, Int4 locations[2])
{
    const char* delimiters = " ,;";
    char* seqlocstr = NULL;   /* private copy of str, owned by this function */
    char* rest = NULL;        /* tokenizer state: text after the first token */
    char* first = NULL;       /* first token, if any */

    locations[0] = locations[1] = 0;

    if ( !str ) {
        return;
    }

    seqlocstr = StringSave((char*) str);
    if ( !seqlocstr ) {
        return;
    }

    /* Keep seqlocstr untouched so it can be freed; the tokenizer advances
     * 'rest' instead.  A string with no token yields NULL, which must not
     * be passed to atol. */
    first = StringTokMT(seqlocstr, (char*) delimiters, &rest);
    if (first) {
        locations[0] = atol(first);
    }
    if (locations[0] < 0) {
        ErrPostEx(SEV_WARNING, 0, 0,
                  "Starting location is negative, setting to 0");
        locations[0] = 0;
    }

    if ( !rest ) {
        locations[1] = 0;
    } else {
        locations[1] = atol(rest);
    }

    if (locations[1] < 0) {
        ErrPostEx(SEV_WARNING, 0, 0,
                  "Ending location is negative, setting to 0");
        locations[1] = 0;
    }

    MemFree(seqlocstr);
}
12014 
Fastacmd_ParseSeqLoc(CharPtr str,Uint1 strand,BioseqPtr bsp)12015 static SeqLocPtr Fastacmd_ParseSeqLoc(CharPtr str, Uint1 strand, BioseqPtr bsp)
12016 {
12017     Int4 locations[2];
12018 
12019     if (str == NULL) {
12020         return NULL;
12021     }
12022 
12023     Fastacmd_ParseLocations(str, locations);
12024     ASSERT(locations[0] >= 0);
12025     ASSERT(locations[1] >= 0);
12026 
12027     /* Sanity check */
12028     if (locations[1] > bsp->length) {
12029         ErrPostEx(SEV_ERROR, 0, 0, "From location cannot be greater "
12030                 "than %ld. Ignoring sequence location.\n",
12031                 bsp->length);
12032         locations[0] = 0; locations[1] = bsp->length - 1;
12033     }
12034 
12035     /* Convert locations to zero-offsets... */
12036     if (locations[1] == 0) {
12037         locations[1] = bsp->length - 1;
12038     } else {
12039         locations[1]--;
12040     }
12041 
12042     if (locations[0] > 0) {
12043         locations[0]--;
12044     }
12045 
12046     if (ISA_aa(bsp->mol))  /* for proteins, the strand is irrelevant */
12047         strand = Seq_strand_unknown;
12048 
12049     ASSERT(locations[0] >= 0);
12050     ASSERT(locations[1] >= 0);
12051     return SeqLocIntNew(locations[0], locations[1],
12052                        strand, SeqIdFindBest(bsp->id, SEQID_GI));
12053 
12054 }
12055 
Fastacmd_Search(CharPtr searchstr,CharPtr database,CharPtr batchfile,Boolean dupl,Int4 linelen,FILE * out)12056 Int2 Fastacmd_Search (CharPtr searchstr, CharPtr database,
12057     CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out)
12058 {
12059     return Fastacmd_Search_ex(searchstr, database, READDB_DB_UNKNOWN,
12060             batchfile, dupl, linelen, out, FALSE, FALSE, eNoDump, NULL,
12061             Seq_strand_unknown, FALSE, FALSE, PIG_NONE);
12062 }
12063 
Fastacmd_Search_ex(CharPtr searchstr,CharPtr database,Uint1 is_prot,CharPtr batchfile,Boolean dupl,Int4 linelen,FILE * out,Boolean use_target,Boolean use_ctrlAs,EBlastDbDumpType dump_db,CharPtr seqlocstr,Uint1 strand,Boolean taxonomy_info_only,Boolean dbinfo_only,Int4 pig)12064 Int2 Fastacmd_Search_ex (CharPtr searchstr, CharPtr database, Uint1 is_prot,
12065     CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out,
12066     Boolean use_target, Boolean use_ctrlAs, EBlastDbDumpType dump_db,
12067     CharPtr seqlocstr, Uint1 strand,
12068     Boolean taxonomy_info_only, Boolean dbinfo_only, Int4 pig)
12069 {
12070     BioseqPtr        bsp;
12071     ReadDBFILEPtr    rdfp = NULL, rdfp_tmp;
12072     Int4             i, fid, TotalItems=0, count = 0;
12073     FCMDAccListPtr   falp=NULL, falp_tmp;
12074     CharPtr          buffer = NULL, dbname = database;
12075     FILE             *fd;
12076     Int4Ptr          ids = NULL;
12077     Int4             guess_gi = -1;
12078     SeqLocPtr        slp = NULL;
12079     Uint1            init_state = 0;
12080     Int2             retval = FASTACMD_SUCCESS;
12081 
12082     if (searchstr)
12083         guess_gi = atol(searchstr);
12084 
12085     if (dbname == NULL)
12086         dbname = FASTACMD_DEFAULT_DB;
12087 
12088     ASSERT(dump_db >= eNoDump || dump_db < eDumpTypeMax);
12089 
12090     if (taxonomy_info_only)
12091         init_state = READDB_NEW_DO_TAXDB;
12092     else if (dbinfo_only)
12093         init_state = READDB_NEW_DO_REPORT;
12094     else
12095         init_state = READDB_NEW_INDEX;
12096 
12097     if (!(rdfp = readdb_new_ex2(dbname, is_prot, init_state, NULL, NULL))) {
12098         ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Cannot initialize readdb for "
12099              "%s database\n", dbname);
12100         return FASTACMD_DB_NOT_FOUND;
12101     }
12102 
12103     /* Validation of rdfp */
12104     {
12105         Int4 rv = readdb_validate(rdfp);
12106         ASSERT(rv != READDB_INVALID_NULL_ARG);
12107         if (rv == READDB_INVALID_MIXED_DBS) {
12108             ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Cannot initialize mismatched "
12109                       "protein/nucleotide databases '%s'\n", dbname);
12110             return FASTACMD_ERROR;
12111         }
12112     }
12113 
12114     if (dbinfo_only) {
12115         Fastacmd_PrintDbFullInformation(rdfp, dbname, linelen, out);
12116         readdb_destruct(rdfp);
12117         return retval;
12118     }
12119 
12120     if (pig != PIG_NONE) {
12121         if ( (fid = readdb_pig2oid(rdfp, pig, NULL)) == -1) {
12122             ErrPostEx(SEV_ERROR, 0, 0, "PIG %ld not found", (long) pig);
12123             return FASTACMD_FAILED_SEARCH;
12124         }
12125         bsp = readdb_get_bioseq_ex(rdfp, fid, TRUE, use_ctrlAs);
12126         slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12127         BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12128         bsp = BioseqFree(bsp);
12129         slp = SeqLocFree(slp);
12130         return retval;
12131     }
12132 
12133     /* Taxonomy information is encoded only in the new database format */
12134     if (taxonomy_info_only) {
12135         for (rdfp_tmp = rdfp; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
12136         if (rdfp_tmp->formatdb_ver < FORMATDB_VER) {
12137             ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not supported "
12138                     "in your version of\nthe blast databases (version %d). "
12139                     "Please update your databases and download\nthe taxonomy "
12140                     "blast database files (%s)\n", FORMATDB_VER_TEXT,
12141                     TAXDB_ON_FTP);
12142             readdb_destruct(rdfp);
12143             return FASTACMD_NO_TAXDB;
12144         }
12145         }
12146         if (rdfp->taxinfo == NULL) {
12147             ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not "
12148             "available. Please download it from\n"
12149             "%s\n", TAXDB_ON_FTP);
12150             readdb_destruct(rdfp);
12151             return FASTACMD_NO_TAXDB;
12152         }
12153     }
12154 
12155     if (dump_db == eNoDump) {
12156         if(searchstr != NULL) {
12157             if((falp =  GetAccList(searchstr, &TotalItems)) == NULL) {
12158                 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: No valid Gis/Accessions "
12159                 "found. Exiting...\n");
12160                 return FASTACMD_FAILED_SEARCH;
12161             }
12162         } else if(batchfile != NULL){
12163             if((fd = FileOpen(batchfile, "r")) == NULL) {
12164                 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Could not open %s",
12165                         batchfile);
12166                 return FASTACMD_ERROR;
12167             }
12168 
12169             buffer = WWWReadFileInMemory(fd, 0, TRUE);
12170 
12171             if((falp =  GetAccList(buffer, &TotalItems)) == NULL) {
12172                 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: No valid Gis/Accessions "
12173                 "found. Exiting...\n");
12174                 return FASTACMD_FAILED_SEARCH;
12175             }
12176         }
12177     }
12178 
12179     for (falp_tmp = falp; falp_tmp != NULL; falp_tmp = falp_tmp->next) {
12180 
12181         if(falp_tmp->gi != 0) {
12182             fid = readdb_gi2seq(rdfp, falp_tmp->gi, NULL);
12183         } else {
12184             if(!dupl) {
12185                 fid = readdb_acc2fasta(rdfp, falp_tmp->acc);
12186             } else {
12187                 count = 0;
12188                 fid = readdb_acc2fastaEx(rdfp, falp_tmp->acc, &ids, &count);
12189             }
12190         }
12191 
12192         if (fid < 0 && fid != -1) {
12193             ErrPostEx(SEV_ERROR, 0, 0, "Accesion search failed for \"%s\" "
12194                 "with error code %d\n", falp_tmp->acc, fid);
12195             return FASTACMD_FAILED_SEARCH;
12196         } else if (fid == -1) {
12197             ErrPostEx(SEV_ERROR, 0, 0, "Entry \"%s\" not found\n",
12198                         falp_tmp->acc);
12199             retval = FASTACMD_FAILED_SEARCH;
12200         } else if (ids == NULL) { /* gi or SeqId */
12201             if (use_target) {
12202                 ReadDBFILEPtr rdfp_tmp;
12203                 for (rdfp_tmp = rdfp; rdfp_tmp; rdfp_tmp = rdfp_tmp->next)
12204                     rdfp_tmp->gi_target = falp_tmp->gi;
12205             }
12206             if (taxonomy_info_only) {
12207                 if (!Fastacmd_PrintTaxonomyInfo(rdfp, fid, out, linelen))
12208                     retval = FASTACMD_FAILED_SEARCH;
12209             } else {
12210                 bsp = readdb_get_bioseq_ex(rdfp, fid, TRUE, use_ctrlAs);
12211                 slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12212                 BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12213                 bsp = BioseqFree(bsp);
12214                 slp = SeqLocFree(slp);
12215             }
12216         } else {
12217             for(i = 0; i < count; i++) {
12218                 if (taxonomy_info_only) {
12219                     if (!Fastacmd_PrintTaxonomyInfo(rdfp, ids[i], out,
12220                                 linelen))
12221                         retval = FASTACMD_FAILED_SEARCH;
12222                 } else {
12223                     bsp = readdb_get_bioseq_ex(rdfp, ids[i], TRUE,
12224                             use_ctrlAs);
12225                     slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12226                     BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12227                     bsp = BioseqFree(bsp);
12228                     slp = SeqLocFree(slp);
12229                 }
12230             }
12231             ids = MemFree(ids);
12232         }
12233     }
12234 
12235     /* sanity check */
12236     if (dump_db) {
12237         DumpBlastDB(rdfp, out, linelen, use_ctrlAs, dump_db);
12238     }
12239 
12240     readdb_destruct(rdfp);
12241     MemFree(buffer);
12242     FCMDAccListFree(falp);
12243     return retval;
12244 }
12245 
s_HasGiList(const ReadDBFILEPtr rdfp_list)12246 static Boolean s_HasGiList(const ReadDBFILEPtr rdfp_list)
12247 {
12248     ReadDBFILEPtr rdfp = (ReadDBFILEPtr) rdfp_list;
12249     if ( !rdfp_list ) {
12250         return FALSE;
12251     }
12252 
12253     for (; rdfp; rdfp = rdfp->next) {
12254         if (rdfp->gilist || rdfp->gifile) {
12255             return TRUE;
12256         }
12257     }
12258     return FALSE;
12259 }
12260 
12261 /* #define SHOW_PROGRESS */
12262 
DumpBlastDB(const ReadDBFILEPtr rdfp,FILE * fp,Int4 linelen,Boolean ctrlA,EBlastDbDumpType dump_type)12263 Int2 DumpBlastDB(const ReadDBFILEPtr rdfp, FILE *fp, Int4 linelen,
12264                  Boolean ctrlA, EBlastDbDumpType dump_type)
12265 {
12266     register Uint4 maskidx, bit_shift, dump;
12267     register Int4 i;
12268     Int4 total = 0, dumped = 0, nseqs = 0;
12269     OIDListPtr oidlist = NULL;
12270     Int8 tot_len = 0;
12271     ReadDBFILEPtr rdfp_tmp = rdfp;
12272 #ifdef SHOW_PROGRESS
12273     Int2 progress_chunk = 100;
12274 #endif
12275 
12276     /* Obtain the total length of this database */
12277     if (!(readdb_get_totals(rdfp,&tot_len,&total))) {
12278         ErrPostEx(SEV_ERROR,0,0,"Could not retrieve database length");
12279         return -1;
12280     }
12281     readdb_get_totals_ex(rdfp,&tot_len,&nseqs,TRUE); /* for testing only */
12282 
12283     /* readdb_new returns a sorted list of ReadDBFILEPtr's (real db's
12284      * followed by subset (mask) db's, and we do not support that yet */
12285     if (!rdfp->oidlist) {
12286         for (; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
12287             if (rdfp_tmp->oidlist) {
12288                 ErrPostEx(SEV_ERROR, 0, 0,
12289                 "Feature not available - Cannot dump subset databases and "
12290                 "real databases at the same time.");
12291                 return -1;
12292             }
12293         }
12294     }
12295     if (s_HasGiList(rdfp)) {
12296         ErrPostEx(SEV_ERROR, 0, 0,
12297         "Feature not available - Cannot dump databases with gi files");
12298         return -1;
12299     }
12300     rdfp_tmp = rdfp;
12301 
12302     if (!rdfp->oidlist) {
12303 
12304         for (i = 0; i < total; i++) {
12305 #ifdef SHOW_PROGRESS
12306             if (!(i%progress_chunk)) {
12307                 fprintf(stderr,"\b\b\b\b%3d%%",(int)((100*i)/total));
12308              }
12309 #endif
12310 
12311             if (DumpOneSequence(rdfp, fp, linelen, ctrlA, dump_type, i))
12312 	        dumped++;
12313         }
12314     } else {
12315 
12316         oidlist = rdfp_tmp->oidlist;
12317 
12318         for (i = 0; i < total; i++) {
12319 #ifdef SHOW_PROGRESS
12320             if (!(i%progress_chunk)) {
12321                 fprintf(stderr,"\b\b\b\b%3d%%",(int)((100*i)/total));
12322             }
12323 #endif
12324             /* Retrieve the correct oidlist, as each rdfp has its own */
12325             if (i > rdfp_tmp->stop) {
12326                 rdfp_tmp = rdfp_tmp->next;
12327                 if (rdfp_tmp) {
12328                     oidlist = rdfp_tmp->oidlist;
12329                 } else {
12330                     ErrPostEx(SEV_FATAL, 1,0,
12331                             "BlastDBToFasta: Oid %d is not in this mask");
12332                     return -1;
12333                 }
12334 
12335                 /* Make sure we have an oidlist! */
12336                 if (!oidlist) {
12337                     ErrPostEx(SEV_FATAL, 1,0,
12338                             "This mask database does not have an oidlist!\n"
12339                             "There is probably a wrong ordering problem in "
12340                             "the ReadDBFILEPtrs");
12341                     return -1;
12342                 }
12343             }
12344 
12345             /* Adjust the index i to this rdfp_tmp */
12346             maskidx = (i - rdfp_tmp->start)/MASK_WORD_SIZE;
12347             bit_shift = MASK_WORD_SIZE-1 - (i - rdfp_tmp->start) % MASK_WORD_SIZE;
12348 
12349             /* Make sure we are not addressing an index that's larger
12350              * than the our oidlist */
12351             if ((i - rdfp_tmp->start) > oidlist->total)
12352                 continue;
12353 
12354             /* Mask this index! */
12355             dump = SwapUint4(oidlist->list[maskidx]) & (0x1 << bit_shift);
12356 
12357             if ( !dump ) {
12358                 continue;
12359             }
12360 
12361             if (DumpOneSequence(rdfp, fp, linelen, ctrlA, dump_type, i))
12362                 dumped++;
12363         }
12364     }
12365 
12366 #ifdef SHOW_PROGRESS
12367     fprintf(stderr,"\n");
12368     fprintf(stderr,"Dumped %ld sequences (should be %d)\n", dumped,nseqs);
12369     Beep();
12370 #endif
12371 
12372     return 0;
12373 }
12374 
DumpOneSequence(const ReadDBFILEPtr rdfp,FILE * fp,Int4 linelen,Boolean ctrlA,EBlastDbDumpType dump_type,Int4 i)12375 Int2 DumpOneSequence(const ReadDBFILEPtr rdfp, FILE *fp, Int4 linelen,
12376                      Boolean ctrlA, EBlastDbDumpType dump_type, Int4 i)
12377 {
12378   Int2 retval=0;
12379 
12380   switch (dump_type) {
12381   case eFasta:
12382     {
12383       BioseqPtr bsp = NULL;
12384 
12385       if ((bsp = readdb_get_bioseq_ex(rdfp,i, TRUE, ctrlA)) != NULL) {
12386 	if (BioseqRawToFastaExtra(bsp, fp, linelen))
12387 	  retval = 1;
12388 	else
12389 	  ErrPostEx(SEV_ERROR,0,0, "Could not convert Bioseq to FASTA");
12390       }
12391       BioseqFree(bsp);
12392     }
12393     break;
12394 
12395   case eGi:
12396   case eAccession:
12397     {
12398       Uint4 h = 0;     /* header marker for readdb_get_header_ex */
12399       SeqIdPtr sip = NULL;
12400       while (readdb_get_header(rdfp, i, &h, &sip, NULL)) {
12401 	if (dump_type == eGi) {
12402 	  SeqIdPtr gi = SeqIdFindBest(sip, SEQID_GI);
12403 	  if (gi) {
12404 	    fprintf(fp, "%d\n", gi->data.intvalue);
12405 	    retval=1;
12406 	  }
12407 	}
12408 	if (dump_type == eAccession) {
12409 	  SeqIdPtr accn = SeqIdFindBestAccession(sip);
12410 
12411 	  if (accn) {
12412 	    Int4 gi=0;
12413 	    CharPtr id=NULL;
12414 	    Boolean numeric_id = GetAccessionVersionFromSeqId(accn, &gi, &id, TRUE);
12415 	    if (id)
12416 	      {
12417 	      fprintf(fp, "%s\n", id);
12418 	      retval=1;
12419 	      }
12420 	    else
12421 	      ErrPostEx(SEV_WARNING, 0, 0, "No accession found for oid %d", i);
12422 
12423 	    id = MemFree(id);
12424 	  }
12425 	}
12426 	  sip = SeqIdFree(sip);
12427       }
12428     }
12429     break;
12430 
12431   default:
12432     abort();        /* should never happen */
12433   }
12434 
12435   return retval;
12436 }
12437 
12438 /************************************************************************/
12439 /* END    Fastacmd API                                           */
12440 /************************************************************************/
12441 
12442 
12443 /*************************************************************************
12444     This function reads in a list of gi's from a text file
and makes a binary gilist file.
12446 
12447 The binary gilist format has the following construction:
12448 
12449 1.) 1st 4 bytes: a 'magic' number: UINT4_MAX
12450 2.) 2nd 4 bytes: total number of gi's in the file (call this value 'number').
12451 3.) 'number' set of 4 bytes, allowing 4 bytes for each gi.
12452 
12453 The function GetGisFromFile first checks what the first 4 bytes
12454 of a file are, if they are the 'magic' number, then it proceeds
12455 to read values assuming a binary format.  If they are not the
12456 'magic' number, then a text format is assumed.
12457 
12458 *************************************************************************/
12459 
12460 static int LIBCALLBACK
compare_gis(VoidPtr v1,VoidPtr v2)12461 compare_gis(VoidPtr v1, VoidPtr v2)
12462 {
12463    Uint4 gi1 = *(Uint4Ptr) v1;
12464    Uint4 gi2 = *(Uint4Ptr) v2;
12465 
12466    return ((gi1<gi2) ? -1 : ((gi1>gi2) ? 1 : 0));
12467 }
12468 
12469 
12470 #define    GIFILE_LINE_LEN    1024
12471 Int4 LIBCALL
readdb_MakeGiFileBinary(CharPtr input_file,CharPtr output_file)12472 readdb_MakeGiFileBinary (CharPtr input_file, CharPtr output_file)
12473 {
12474     FILE        *infp=NULL, *outfp=NULL;
12475     Int4        index = 0, value, chunk_size = 24, gilist_size;
12476     Int2        status;
12477     Char        line[GIFILE_LINE_LEN];
12478     long        tmplong;
12479     Uint4Ptr    gi_list;
12480 
12481     if (!(infp = FileOpen(input_file, "r"))) {
12482         ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", input_file);
12483         return -1;
12484     }
12485 
12486     if (!(outfp = FileOpen(output_file, "wb"))) {
12487         ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", output_file);
12488         return -1;
12489     }
12490 
12491     gi_list = MemNew(chunk_size * sizeof(Uint4));
12492 
12493     while (FileGets(line, GIFILE_LINE_LEN, infp))
12494     {
12495         /* do correct casting */
12496         status = sscanf(line, "%ld", &tmplong);
12497         value = tmplong;
12498 
12499         /* skip non-valid lines */
12500         if (status > 0 && value > 0) {
12501         /* do we have enough space in gi_list ? */
12502         if (chunk_size < index + 1) {
12503             chunk_size *= 2;
12504             gi_list = Realloc(gi_list, chunk_size * sizeof(Uint4));
12505         }
12506 
12507         gi_list[index++] = value;
12508         }
12509     }
12510 
12511     FormatDbUint4Write(READDB_MAGIC_NUMBER, outfp);
12512     FormatDbUint4Write(index, outfp);
12513 
12514     gilist_size = index;
12515     HeapSort(gi_list, gilist_size, sizeof(Uint4), compare_gis);
12516 
12517     for (index=0; index<gilist_size; index++)
12518     {
12519         FormatDbUint4Write(gi_list[index], outfp);
12520     }
12521 
12522     gi_list = MemFree(gi_list);
12523 
12524     FILECLOSE(infp);
12525     FILECLOSE(outfp);
12526 
12527     return gilist_size;
12528 }
12529 
/* Formats the FASTA file options->db_file into a BLAST database, starting
 * a new volume whenever the running total of sequence lengths exceeds
 * Bases_In_Volume (no splitting when Bases_In_Volume < 1).
 *
 * Returns 0 on success, or:
 *   2 - FormatDBInit failed
 *   3 - the input file could not be opened
 *   4 - an entry read from the input is not a Bioseq
 *   9 - FormatDBClose failed
 *
 * The input stream and the entry being processed are released on every
 * error path.
 * NOTE(review): options->base_name is replaced by a newly saved string for
 * each new volume, so the suffix is appended to the previous volume's
 * name; the old string is not freed here because its ownership is not
 * visible from this function -- confirm against FormatDBInit's callers. */
Int4 FastaToBlastDB(FDB_optionsPtr options, Int4 Bases_In_Volume)
{
   FILE *fd;
   FormatDBPtr fdbp;
   SeqEntryPtr sep;
   BioseqPtr bsp;
   Char filenamebuf[FILENAME_MAX];
   Int4 count=0, volume=0;
   BlastDefLinePtr bdp = NULL;

   if ((fdbp = FormatDBInit(options)) == NULL)
      return 2;
   if((fd = FileOpen(options->db_file, "r")) == NULL)
      return 3;

   /* Get sequences */
   while ((sep = FastaToSeqEntryEx(fd, (Boolean)!options->is_protein,
                   NULL, options->parse_mode)) != NULL) {

      if(!IS_Bioseq(sep)) { /* Not Bioseq - failure */
         ErrLogPrintf("Error in readind Bioseq Formating failed.\n");
         SeqEntryFree(sep);
         FILECLOSE(fd);
         return 4;
      }

      bsp = (BioseqPtr) sep->data.ptrvalue;

      if(Bases_In_Volume >= 1) {
         if(count > Bases_In_Volume) {
            /* starting new volume ? */
            count = 0;
            if(FormatDBClose(fdbp)) {
               SeqEntryFree(sep);
               FILECLOSE(fd);
               return 9;
            }

            if(Bases_In_Volume > 1) {
               /* The precision bounds the base name so that the name,
                * the '.' and the volume number always fit in
                * filenamebuf. */
               sprintf(filenamebuf, "%.*s.%02ld",
                       (int) (sizeof(filenamebuf) - 16),
                       options->base_name, (long) volume);
               options->base_name = StringSave(filenamebuf);
               volume++;
            }

            if ((fdbp = FormatDBInit(options)) == NULL) {
               SeqEntryFree(sep);
               FILECLOSE(fd);
               return 2;
            }
         }
         count += bsp->length;
      }
      bdp = FDBGetDefAsnFromBioseq(bsp, NULL);
      FDBAddBioseq(fdbp, bsp, bdp);
      bdp = BlastDefLineFree(bdp);

      SeqEntryFree(sep);
   }
   FILECLOSE(fd);

   if(FormatDBClose(fdbp))
      return 9;

   return 0;
}
12588 
/* Writes the alias file "<basename>.pal" (protein) or "<basename>.nal"
 * (nucleotide).
 *
 *   title        - TITLE entry; basename is used when NULL
 *   basename     - name of the alias file without extension and prefix of
 *                  the volume names; must not be NULL
 *   volumes      - number of volumes listed in DBLIST as
 *                  "<basename>.NN"
 *   is_protein   - selects the 'p' or 'n' extension
 *   parent       - written as DBLIST instead of the volumes when
 *                  volumes == 0
 *   first_oid,
 *   last_oid     - if first_oid > 0, written as FIRST_OID/LAST_OID
 *                  together with the NSEQ derived from them
 *   total_length - written as LENGTH when positive (or whenever a gi list
 *                  is given and no oid range is used)
 *   number_seqs  - written as NSEQ when no oid range is given
 *   oidlist,
 *   gilist       - optional OIDLIST / GILIST entries
 *
 * Returns TRUE on success, FALSE if basename is NULL, memory runs out or
 * the file cannot be created. */
Boolean FD_CreateAliasFileEx(CharPtr title, CharPtr basename,
                             Int4 volumes, Boolean is_protein,
                             CharPtr parent,
                             Int4 first_oid, Int4 last_oid,
                             Int8 total_length, Int4 number_seqs,
                 CharPtr oidlist, CharPtr gilist)
{
    CharPtr filename;
    time_t tnow;
    Int4 i;
    FILE *fd;

    if (basename == NULL)
        return FALSE;

    /* basename + ".pal"/".nal" + terminator; sized from the input so that
     * long paths cannot overflow a fixed buffer */
    filename = (CharPtr) MemNew(StringLen(basename) + 5);
    if (filename == NULL)
        return FALSE;
    sprintf(filename, "%s.%cal", basename, is_protein? 'p' : 'n');

    fd = FileOpen(filename, "wb");
    filename = MemFree(filename);
    if (fd == NULL)
        return FALSE;

    tnow = time(NULL);
    /* the string returned by ctime ends with a newline */
    fprintf(fd, "#\n# Alias file created %s#\n#\n", ctime(&tnow));

    if(title != NULL)
        fprintf(fd, "TITLE %s\n#\n", title);
    else
        fprintf(fd, "TITLE %s\n#\n", basename);

    /* Now printing volume databases, or the parent database */
    fprintf(fd, "DBLIST ");

    if (volumes == 0 && parent != NULL)
       fprintf(fd, "%s", parent);
    else {
       for(i = 0; i < volumes; i++) {
          fprintf(fd, "%s.%02ld ", basename, (long) i);
       }
    }
    fprintf(fd, "\n#\n");

    if (gilist)
        fprintf(fd, "GILIST %s\n#\n", gilist);
    else
        fprintf(fd, "#GILIST\n#\n");

    if (oidlist)
        fprintf(fd, "OIDLIST %s\n#\n", oidlist);
    else
        fprintf(fd, "#OIDLIST\n#\n");

    if (first_oid > 0) {
       fprintf(fd, "FIRST_OID %ld\n#\n", (long) first_oid);
       fprintf(fd, "LAST_OID %ld\n#\n", (long) last_oid);
       fprintf(fd, "NSEQ %ld\n", (long) (last_oid - first_oid + 1));
       if (total_length > 0)
          fprintf(fd, "LENGTH %s\n", Nlm_Int8tostr(total_length, 0));
    }
    else if (gilist || number_seqs > 0)
    {
       /* When there is a gi list, print NSEQ and LENGTH even when they
          are 0. */
       fprintf(fd, "NSEQ %ld\n", (long) number_seqs);
       if (gilist || total_length > 0)
          fprintf(fd, "LENGTH %s\n", Nlm_Int8tostr(total_length, 0));
    }
    FILECLOSE(fd);

    return TRUE;
}
12657 
12658 /* Returns the string that must be used in a multi-volume, multi-oidlist
12659  * (or multi-alias file) database. This string should be used as the DBLIST
12660  * field in the wrapper alias file for the multi-volume subset (or mask).
12661  * Caller is responsible to deallocate the return value */
FD_ConstructMultivolumeDBList(CharPtr basename,Int4 nvols)12662 CharPtr FD_ConstructMultivolumeDBList(CharPtr basename, Int4 nvols)
12663 {
12664     CharPtr retval = NULL;
12665     Int4 i, len = 0;
12666     Char numstr[10];
12667 
12668     if (!basename || basename[0] == NULLB || nvols <= 0)
12669         return NULL;
12670 
12671     /* Allocate memory for return value */
12672     len = ((StringLen(basename) + 1) * nvols) + nvols + 1;
12673     len += (4*nvols); /* for the '.NN' extension */
12674     if ((retval = (CharPtr)MemNew(sizeof(Char)*len)) == NULL) {
12675         ErrPostEx(SEV_ERROR, 0, 0,
12676                 "FD_ConstructMultivolumeDBList: out of memory");
12677         return NULL;
12678     }
12679 
12680     for (i = 0; i < nvols; i++) {
12681 
12682         /* convert nvols to a string */
12683         MemSet(numstr, 0, sizeof(numstr));
12684         if (i < 100) {
12685             sprintf(numstr, ".%02ld ", (long) i);
12686         } else {
12687             sprintf(numstr, ".%03ld ", (long) i);
12688         }
12689         retval = StringCat(retval, basename);
12690         retval = StringCat(retval, numstr);
12691     }
12692     retval[StringLen(retval)] = NULLB;
12693 
12694     return retval;
12695 }
12696 
/* Creates a plain multi-volume alias file: same as FD_CreateAliasFileEx
 * with no parent database, no oid range, no length/sequence counts and no
 * oid or gi list.  Returns the result of FD_CreateAliasFileEx. */
Boolean FD_CreateAliasFile(CharPtr title, CharPtr basename,
                           Int4 volumes, Boolean is_protein)
{
   Boolean created;

   created = FD_CreateAliasFileEx(title, basename, volumes, is_protein,
                                  NULL,    /* parent */
                                  0, 0,    /* first_oid, last_oid */
                                  0, 0,    /* total_length, number_seqs */
                                  NULL,    /* oidlist */
                                  NULL);   /* gilist */
   return created;
}
12703 
FD_MakeAliasFile(FDB_optionsPtr options)12704 Boolean FD_MakeAliasFile(FDB_optionsPtr options)
12705 {
12706    if (options == NULL)
12707     return FALSE;
12708 
12709    if (options->volume > 0)
12710        return FD_CreateAliasFileEx(options->db_title, options->alias_file_name, options->volume+1,
12711                                    options->is_protein, NULL, 0, 0, 0, 0, NULL, NULL);
12712    else
12713      return FALSE;
12714 }
12715 
12716 
12717 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
12718 #ifdef  HAVE_MADVISE
12719 
12720 /* IMPORTANT INFO:
12721  *
12722  * If we need to preload file(s) using madvise(), we do it now.
12723  * There are several file chunks that could be preloaded,
12724  * and several considerations to be taken into account.
12725  *
12726  * The file chunks are:
12727  * 1) the portion of the index file containing pointers to needed sections of
12728  *    the header file.
12729  * 2) the portion of the index file containing pointers to needed sections of
12730  *    the sequence file.
12731  * 3) the portion of the index file containing pointers to needed sections of
12732  *    the ambchar.
12733  * 4) the needed portion of the header file.
12734  * 5) the needed portion of the sequence file.
12735  *
 * The preloading considerations are
12737  * 1) whether the file chunk has been memory mapped.
12738  * 2) the size of the individual chunks, and
12739  * 3) the combined size of all chunks.
12740  *
12741  * If the size of an individual chunk is smaller than MADVISE_MIN_SIZE pages,
12742  * there is no obvious benefit to applying madvise, and it could be avoided.
12743  *
12744  * Also, if the total size of all the chunks to be preloaded exceeds
12745  * certain share of RAM cache size, some chunk portions may not be preloaded,
12746  * -- and even then some preloaded chunks won't stay in memory, -- but this is
12747  * the best we can do. We should, however, minimize probability that preloaded
12748  * pages will be pushed out by some other process, so we'll assume that
12749  * given that the same database is likely to be processed again and again on
12750  * the same server, the available portion of RAM is some value greater than 50%,
12751  * and is defined by MADVISE_RAM_SHARE.
12752  *
12753  *
12754  */
12755 #define MADVISE_MIN_SIZE  16
12756 #define MADVISE_RAM_SHARE 90
12757 
12758 /** exclusively for async madvise() */
12759 typedef struct {
12760 	void * mp;
12761 	size_t len;
12762 	EMemMapAdvise advice;
12763 
12764 }
12765 MadviseParam_t;
12766 
12767 /** */
12768 static void*
readdb_do_madvise(void * param)12769 readdb_do_madvise(void *param)
12770 {
12771 #ifdef READDB_DEBUG
12772 	fprintf(stderr, "madvise(%p, %u, %d)\n", ((MadviseParam_t *)param)->mp,
12773 		((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice);
12774 #else
12775 	ErrPostEx(SEV_INFO, 0, 0, "madvise(0x%x, %u, %d)", ((MadviseParam_t *)param)->mp,
12776 		((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice);
12777 
12778 #endif
12779 
12780 	if( !Nlm_MemMapAdvise(((MadviseParam_t *)param)->mp,
12781 			((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice) ) {
12782 
12783 #ifdef READDB_DEBUG
12784 		fprintf(stderr, "Nlm_MemMapAdvise(%p, %u, %d) failed: %s\n",
12785 			((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12786 			((MadviseParam_t *)param)->advice, strerror(errno));
12787 #else
12788 		ErrPostEx(SEV_WARNING, 0, 0, "Nlm_MemMapAdvise(0x%x, %u, %d) failed: %s",
12789 			((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12790 			((MadviseParam_t *)param)->advice, strerror(errno));
12791 #endif
12792 	}
12793 
12794 #ifdef READDB_DEBUG
12795 	fprintf(stderr, "\t\t\t\tdone madvise(%p, %u, %d)\n",
12796 		((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12797 		((MadviseParam_t *)param)->advice);
12798 #endif
12799 
12800 	Nlm_MemFree(param);
12801 	return NULL;
12802 }
12803 
12804 /** */
12805 static void
readdb_madvise(void * mp,size_t len,EMemMapAdvise advice,Boolean sync,EThreadPriority pri)12806 readdb_madvise (void * mp, size_t len,
12807                 EMemMapAdvise advice, Boolean sync, EThreadPriority pri)
12808 {
12809 	MadviseParam_t *param = (MadviseParam_t *)Nlm_MemNew(sizeof(MadviseParam_t));
12810 	if( param ) {
12811 		param->mp = mp;
12812 		param->len = len;
12813 		param->advice = advice;
12814 		if( sync ) {
12815 			readdb_do_madvise(param);
12816 		}
12817 		else {
12818 			NlmThreadCreateEx(readdb_do_madvise, (void *)param,
12819 				THREAD_RUN | THREAD_DETACHED, pri, NULL, NULL);
12820 		}
12821 	}
12822 }
12823 
12824 /** */
12825 static void
readdb_preload_index(ReadDBFILEPtr rdfp,Int4 first_db_seq,Int4 final_db_seq,EMemMapAdvise advice,Boolean sync)12826 readdb_preload_index (ReadDBFILEPtr rdfp, Int4 first_db_seq,
12827 				Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
12828 {
12829 	Uint4 idxHdrOffset = 0;
12830 	Uint4 idxSeqOffset = 0;
12831 	Uint4 idxAmbOffset = 0;
12832 
12833 	Uint4 idxLength = 0;
12834 	Uint4 firstPage = 0;
12835 
12836 	uintptr_t baseOffset = (uintptr_t)rdfp->indexfp->mmp_begin;
12837 
12838 	/* get page size */
12839 	long pagesz = sysconf(_SC_PAGESIZE);
12840 
12841 	/* sanity check */
12842 	if( !rdfp || pagesz < 0 ) {
12843 		return;
12844 	}
12845 
12846 	/* insure that we are within the ordinal id range */
12847 	if( first_db_seq < rdfp->start ) {
12848 		first_db_seq = rdfp->start;
12849 	}
12850 	if( final_db_seq >= rdfp->stop ) {
12851 		final_db_seq = rdfp->stop - 1;
12852 	}
12853 
12854 	/* verify that the index file is memory mapped */
12855 	if( rdfp->indexfp && rdfp->indexfp->mfile_true ) {
12856 
12857 		/* portion of the index file containing pointers to header file. */
12858 		firstPage = (first_db_seq * 4) / pagesz;
12859 		idxHdrOffset = firstPage * pagesz;
12860 		idxLength = (final_db_seq - first_db_seq) * 4;
12861 		idxLength += (pagesz - idxLength % pagesz);
12862 
12863 		/* madvise segments if they are big enough */
12864 		if( idxLength / pagesz > MADVISE_MIN_SIZE ) {
12865 
12866 			/* portion of the index file containing pointers to sequence file. */
12867 			firstPage = ((rdfp->num_seqs + 1 + first_db_seq) * 4) / pagesz;
12868 			idxSeqOffset = firstPage * pagesz;
12869 
12870 			/* portion of the index file containing pointers to ambchars in seq file. */
12871 			firstPage = ((2 * rdfp->num_seqs + 2 + first_db_seq) * 4) / pagesz;
12872 			idxAmbOffset = firstPage * pagesz;
12873 		}
12874 	}
12875 
12876 	/* ensure that madvise() is called on page boundary */
12877 	if( baseOffset % pagesz ) {
12878 		uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
12879 		baseOffset += adjustVal;
12880 		idxHdrOffset -= ((adjustVal && idxHdrOffset > pagesz) ? pagesz : 0);
12881 		idxSeqOffset -= ((adjustVal && idxSeqOffset > pagesz) ? pagesz : 0);
12882 		idxAmbOffset -= ((adjustVal && idxAmbOffset > pagesz) ? pagesz : 0);
12883 	}
12884 
12885 #ifdef READDB_DEBUG
12886 	fprintf(stderr, "MMP        Offset:    %p\n", rdfp->indexfp->mmp_begin);
12887 	fprintf(stderr, "MMP Adjust Offset:    %p\n", baseOffset);
12888 	fprintf(stderr, "Index Head Offset:    %ld\n", idxHdrOffset);
12889 	fprintf(stderr, "Index File Length:    %ld\n", idxLength);
12890 #endif
12891 
12892 	/* finally, preload chunks that are big enough to make it
12893 	 * worth while */
12894 	if( rdfp->indexfp && idxLength / pagesz > MADVISE_MIN_SIZE ) {
12895 		readdb_madvise((char *)(baseOffset + idxHdrOffset), idxLength,
12896 						advice, sync, eTP_Default);
12897 		readdb_madvise((char *)(baseOffset + idxSeqOffset), idxLength,
12898 						advice, sync, eTP_Default);
12899 		readdb_madvise((char *)(baseOffset + idxAmbOffset), idxLength,
12900 						advice, sync, eTP_Default);
12901 	}
12902 }
12903 
12904 /** */
12905 static void
readdb_preload_data(ReadDBFILEPtr rdfp,Int4 first_db_seq,Int4 final_db_seq,EMemMapAdvise advice,Boolean sync)12906 readdb_preload_data (ReadDBFILEPtr rdfp, Int4 first_db_seq,
12907 				Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
12908 {
12909 	Uint4 hdrOffset = 0;
12910 	Uint4 hdrLength = 0;
12911 	Uint4 seqOffset = 0;
12912 	Uint4 seqLength = 0;
12913 
12914 	Uint4 firstPage = 0;
12915 
12916 	long allowPages = 0;
12917 	long needPages = 0;
12918 
12919 	/* get page size */
12920 	long pagesz = sysconf(_SC_PAGESIZE);
12921 	long totalPages = sysconf(_SC_PHYS_PAGES);
12922 
12923 	/* sanity check */
12924 	if( !rdfp || pagesz < 0 || totalPages < 0 ) {
12925 		return;
12926 	}
12927 
12928 	/* insure that we are within the ordinal id range */
12929 	if( first_db_seq < rdfp->start ) {
12930 		first_db_seq = rdfp->start;
12931 	}
12932 	if( final_db_seq >= rdfp->stop ) {
12933 		final_db_seq = rdfp->stop - 1;
12934 	}
12935 
12936 	/** verify that the header file is memory mapped */
12937 	if( rdfp->headerfp && rdfp->headerfp->mfile_true ) {
12938 		long firstOff = Nlm_SwapUint4(rdfp->header_index[first_db_seq]);
12939 		long lastOff = Nlm_SwapUint4(rdfp->header_index[final_db_seq]);
12940 
12941 		firstPage = firstOff / pagesz;
12942 		hdrOffset = firstPage * pagesz;
12943 		hdrLength = lastOff - firstOff;
12944 		hdrLength += (pagesz - hdrLength % pagesz);
12945 	}
12946 
12947 #ifdef READDB_DEBUG
12948 	if( !rdfp->sequencefp ) {
12949 		fprintf(stderr, "rdfp->sequencefp == NULL\n");
12950 	}
12951 #endif
12952 
12953 	/** verify that the sequence file is memory mapped */
12954 	if( rdfp->sequencefp && rdfp->sequencefp->mfile_true ) {
12955 		long firstOff = Nlm_SwapUint4(rdfp->sequence_index[first_db_seq]);
12956 		long lastOff = Nlm_SwapUint4(rdfp->sequence_index[final_db_seq]);
12957 
12958 		firstPage = firstOff / pagesz;
12959 		seqOffset = firstPage * pagesz;
12960 		seqLength = lastOff - firstOff;
12961 		seqLength += (pagesz - seqLength % pagesz);
12962 	}
12963 
12964 	/** before preloading pages, trim sizes so that the total
12965 	 *  is under MADVISE_RAM_SHARE */
12966 	allowPages = totalPages / 100 * MADVISE_RAM_SHARE;
12967 	needPages = (hdrLength + seqLength) / pagesz;
12968 
12969 	if( needPages > allowPages ) {
12970 		int pctTrim = (needPages - allowPages) * 100 / needPages;
12971 
12972 		/* trim proportionately all chunks */
12973 		hdrLength -= hdrLength * pctTrim / 100;
12974 		hdrLength += (pagesz - hdrLength % pagesz);
12975 
12976 		seqLength -= seqLength * pctTrim / 100;
12977 		seqLength += (pagesz - seqLength % pagesz);
12978 	}
12979 
12980 #ifdef READDB_DEBUG
12981 	fprintf(stderr, "Header File:   %ld\n", hdrLength);
12982 	fprintf(stderr, "Sequence File: %ld\n", seqLength);
12983 #endif
12984 
12985 	/* finally, preload chunks that are big enough to make it
12986 	 * worth while */
12987 	if( rdfp->headerfp && hdrLength / pagesz > MADVISE_MIN_SIZE ) {
12988 		uintptr_t baseOffset = (uintptr_t)rdfp->headerfp->mmp_begin;
12989 		if( baseOffset % pagesz ) {
12990 			uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
12991 			baseOffset += adjustVal;
12992 			hdrOffset -= ((adjustVal && hdrOffset > pagesz) ? pagesz : 0);
12993 		}
12994 		readdb_madvise((char *)(baseOffset + hdrOffset), hdrLength,
12995 						advice, sync, eTP_Default);
12996 	}
12997 
12998 	if( rdfp->sequencefp && seqLength / pagesz > MADVISE_MIN_SIZE ) {
12999 		uintptr_t baseOffset = (uintptr_t)rdfp->sequencefp->mmp_begin;
13000 		if( baseOffset % pagesz ) {
13001 			uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
13002 			baseOffset += adjustVal;
13003 			seqOffset -= ((adjustVal && hdrOffset > pagesz) ? pagesz : 0);
13004 		}
13005 		readdb_madvise((char *)(baseOffset + seqOffset), seqLength,
13006 		               advice, sync, eTP_Default);
13007 	}
13008 }
13009 
13010 /** simpler and more efficient approach than the above */
13011 static void
readdb_preload_file(NlmMFILEPtr mFilePtr,Int4 nPages,EMemMapAdvise advice,Boolean sync,EThreadPriority pri)13012 readdb_preload_file (NlmMFILEPtr mFilePtr, Int4 nPages,
13013 					EMemMapAdvise advice, Boolean sync, EThreadPriority pri)
13014 {
13015 	long pagesz;
13016 	size_t len;
13017 
13018 	/* general sanity check */
13019 	if( !mFilePtr || !mFilePtr->mfile_true || !mFilePtr->mmp
13020 		|| !mFilePtr->mmp_madvise_end || !mFilePtr->mmp_end ) {
13021 		return;
13022 	}
13023 
13024 	/* check whether this portion was loaded before */
13025 	if( mFilePtr->mmp < mFilePtr->mmp_madvise_end ||
13026 		mFilePtr->mmp_end <= mFilePtr->mmp_madvise_end ) {
13027 		return;
13028 	}
13029 
13030 	pagesz = sysconf(_SC_PAGESIZE);
13031 	len = madvisePreloadBlock * pagesz;
13032 	if( len > mFilePtr->mmp_end - mFilePtr->mmp_madvise_end ) {
13033 		len = mFilePtr->mmp_end - mFilePtr->mmp_madvise_end;
13034 	}
13035 	readdb_madvise(mFilePtr->mmp_madvise_end, len, advice, sync, pri);
13036 	mFilePtr->mmp_madvise_end += len;
13037 }
13038 
/** Preloads database data for a range of ordinal ids.
 *
 * All arguments are passed unchanged to readdb_preload_data(); the index
 * preload call is deliberately left disabled.
 */
void LIBCALL
readdb_preload (ReadDBFILEPtr rdfp, Int4 first_db_seq,
				Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
{
	/* do not preload index */
	/* readdb_preload_index(rdfp, first_db_seq, final_db_seq, advice, sync); */
	readdb_preload_data(rdfp, first_db_seq, final_db_seq, advice, sync);
}
13048 
/** Stores the given flag in the file-level useMadvise setting. */
void LIBCALL
readdb_madvise_enable (Boolean enable)
{
	useMadvise = enable;
}
13055 
/** Stores the given advice value in the file-level mmapAdvice setting. */
void LIBCALL
readdb_madvise_type (EMemMapAdvise advice)
{
	mmapAdvice = advice;
}
13062 
/** Stores the given flag in the file-level madviseSyncMode setting. */
void LIBCALL
readdb_madvise_sync_mode (Boolean mode)
{
	madviseSyncMode = mode;
}
13069 
/** Stores the given value in the file-level madvisePreloadBlock setting.
 *
 * Note: readdb_preload_file() multiplies madvisePreloadBlock by the page
 * size, so despite the parameter name the value is used there as a number
 * of pages.
 */
void LIBCALL
readdb_madvise_block (Int4 nSeqs)
{
	madvisePreloadBlock = nSeqs;
}
13076 
13077 #endif /* HAVE_MADVISE */
13078 #endif /* SOL || LINUX */
13079 
13080 /*** PIG (Protein Identifier Group) interface ***/
13081 
13082 FDBPigTablePtr LIBCALL
FDBPigTableNew()13083 FDBPigTableNew()
13084 {
13085     FDBPigTablePtr fptp = NULL;
13086 
13087     if ( !(fptp = (FDBPigTablePtr) MemNew(sizeof(FDBPigTable))))
13088         return NULL;
13089 
13090     fptp->count = 0;
13091     fptp->allocated = INDEX_INIT_SIZE*2;
13092 
13093     if ( !(fptp->pop = (Int4Ptr) MemNew(sizeof(Int4)*fptp->allocated)))
13094         return FDBPigTableFree(fptp);
13095 
13096     return fptp;
13097 }
13098 
13099 FDBPigTablePtr LIBCALL
FDBPigTableFree(FDBPigTablePtr fptp)13100 FDBPigTableFree(FDBPigTablePtr fptp)
13101 {
13102     if (!fptp)
13103         return NULL;
13104 
13105     fptp->pop = MemFree(fptp->pop);
13106     return MemFree(fptp);
13107 }
13108 
13109 Boolean LIBCALL
FDBAddPig(FDBPigTablePtr fptp,Int4 pig,Int4 oid)13110 FDBAddPig(FDBPigTablePtr fptp, Int4 pig, Int4 oid)
13111 {
13112     if (!fptp || pig == PIG_NONE || oid < 0)
13113         return FALSE;
13114 
13115     /* Reallocate if necessary */
13116     if (fptp->count + 2 >= fptp->allocated) {
13117         fptp->allocated += (INDEX_ARRAY_CHUNKS*2);
13118         fptp->pop = (Int4Ptr) Realloc(fptp->pop, sizeof(Int4)*fptp->allocated);
13119 
13120         if (!fptp->pop) {
13121             FDBPigTableFree(fptp);
13122             return FALSE;
13123         }
13124     }
13125 
13126     fptp->pop[fptp->count++] = pig;
13127     fptp->pop[fptp->count++] = oid;
13128 
13129     return TRUE;
13130 }
13131 
13132 Int4 LIBCALL
readdb_get_pig(ReadDBFILEPtr rdfp,Int4 oid)13133 readdb_get_pig(ReadDBFILEPtr rdfp, Int4 oid)
13134 {
13135     BlastDefLineSetPtr bdp_set = NULL;
13136     BlastDefLinePtr bdp = NULL;
13137     Int4 pig = PIG_NONE;
13138 
13139     if (rdfp->formatdb_ver < FORMATDB_VER)
13140         return pig;
13141 
13142     if (!(bdp_set = FDReadDeflineAsn(rdfp, oid)))
13143         return pig;
13144 
13145 
13146     for (bdp = bdp_set; bdp; bdp = bdp->next) {
13147         if (bdp->other_info &&
13148             ( (pig = bdp->other_info->data.intvalue) != PIG_NONE)) {
13149             bdp_set = (BlastDefLinePtr) BlastDefLineSetFree(bdp_set);
13150             return pig;
13151         }
13152     }
13153     bdp_set = (BlastDefLinePtr) BlastDefLineSetFree(bdp_set);
13154     return pig;
13155 }
13156 
13157 Int4 LIBCALL
readdb_pig2oid(ReadDBFILEPtr rdfp,Int4 pig,Int4Ptr start)13158 readdb_pig2oid(ReadDBFILEPtr rdfp, Int4 pig, Int4Ptr start)
13159 {
13160     Int4 retval = -1;
13161     ISAMErrorCode error;
13162     Uint4 oid = 0;
13163 
13164     for ( ; rdfp; rdfp = rdfp->next) {
13165 
13166         if (!rdfp->isam_pig)
13167             continue;
13168 
13169         if ( (error = NISAMSearch(rdfp->isam_pig, pig, &oid, NULL)) < 0) {
13170             ErrPostEx(SEV_WARNING, 0, 0, "Failed to initialize PIG search"
13171                     "on %s\nISAM Error code is %d\n", rdfp->filename, error);
13172             continue;
13173         } else if (error != ISAMNotFound) {
13174             if (start)
13175                 *start = rdfp->start;
13176             retval = (Int4)oid + rdfp->start;
13177             break;
13178         }
13179     }
13180 
13181     return retval;
13182 }
13183 
s_IsTextFile(const char * filename)13184 static Boolean s_IsTextFile(const char* filename)
13185 {
13186     FILE* fp = NULL;
13187     Boolean retval = TRUE;
13188     Int4 i = 0;
13189 
13190     if ( !(fp = FileOpen(filename, "r"))) {
13191         return FALSE;
13192     }
13193 
13194     for (i = 0; i < 10 && !feof(fp); i++) {
13195         int c = getc(fp);
13196         if ( ! (isprint(c) || isspace(c)) ) {
13197             retval = FALSE;
13198             break;
13199         }
13200     }
13201     FileClose(fp);
13202     return retval;
13203 }
13204 
13205 /*** TaxidDeflineTable interface ***/
13206 
/* Result of the table search routines when no entry matches. */
const Int4 kTaxidDeflineSearch_NotFound = -1;
/* Placeholder gi passed along when an entry is keyed by Seq-id. */
static const Int4 kNoGi = -1;
/* Placeholder Seq-id passed along when an entry is keyed by gi. */
static const Char* kNoSeqid = NULL;
13210 
/* Discriminates the element type stored in FDBTaxidDeflineTable::data. */
typedef enum EFDBTaxidDeflineDataType {
    eTaxidDefline_Gi = 1,       /* elements are FDBTaxidDeflineData_Gi */
    eTaxidDefline_Seqid = 2     /* elements are FDBTaxidDeflineData_Seqid */
} EFDBTaxidDeflineDataType;
13215 
/* Table element pairing a gi with its taxonomy id. */
typedef struct FDBTaxidDeflineData_Gi {
    Int4 gi;        /* lookup key */
    Int4 taxid;     /* taxonomy id assigned to the gi */
} FDBTaxidDeflineData_Gi;
13220 
/* Table element pairing a Seq-id string with its taxonomy id. */
typedef struct FDBTaxidDeflineData_Seqid {
    Char seqid[ID_MAX_SIZE+1];  /* lookup key, NUL-terminated */
    Int4 taxid;                 /* taxonomy id assigned to the Seq-id */
} FDBTaxidDeflineData_Seqid;
13225 
/** Table of (gi, taxid) or (Seq-id, taxid) pairs read from the file named
 * in the formatdb configuration file; used to set the taxonomy ids of the
 * listed sequences.
 */
struct FDBTaxidDeflineTable {
    EFDBTaxidDeflineDataType type;  /* type of the table below */
    void*       data;               /* either an array of
                                       FDBTaxidDeflineData_Gi or
                                       FDBTaxidDeflineData_Seqid */
    Int4        count, allocated;   /* keep track of table size */
};
13236 
13237 static size_t
s_FDBTaxidDeflineTable_GetDataTypeSize(FDBTaxidDeflineTablePtr taxid_tbl)13238 s_FDBTaxidDeflineTable_GetDataTypeSize(FDBTaxidDeflineTablePtr taxid_tbl)
13239 {
13240     size_t retval = 0;
13241 
13242     ASSERT(taxid_tbl);
13243 
13244     switch (taxid_tbl->type) {
13245     case eTaxidDefline_Gi:
13246         retval = sizeof(FDBTaxidDeflineData_Gi);
13247         break;
13248 
13249     case eTaxidDefline_Seqid:
13250         retval = sizeof(FDBTaxidDeflineData_Seqid);
13251         break;
13252 
13253     default:
13254         abort();
13255     }
13256 
13257     return retval;
13258 }
13259 
13260 /** Encapsulate addition of entries to FDBTaxidDeflineTable structure */
13261 static Boolean
s_FDBTaxidDeflineTableAddEntry(FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi,const char * seqid,Int4 taxid)13262 s_FDBTaxidDeflineTableAddEntry(FDBTaxidDeflineTablePtr taxid_tbl,
13263                                Int4 gi, const char* seqid, Int4 taxid)
13264 {
13265     ASSERT(taxid_tbl);
13266 
13267     if (taxid < 0) {
13268         ErrPostEx(SEV_ERROR, 0, 0, "Cannot add negative taxonomy id");
13269         return FALSE;
13270     }
13271 
13272     /* Reallocate if necessary */
13273     if (taxid_tbl->count + 1 >= taxid_tbl->allocated) {
13274         size_t data_type_size =
13275             s_FDBTaxidDeflineTable_GetDataTypeSize(taxid_tbl);
13276         taxid_tbl->allocated += (INDEX_ARRAY_CHUNKS);
13277         taxid_tbl->data = Realloc(taxid_tbl->data,
13278                                   data_type_size*taxid_tbl->allocated);
13279         if ( !taxid_tbl->data ) {
13280             FDBTaxidDeflineTableFree(taxid_tbl);
13281             return FALSE;
13282         }
13283     }
13284 
13285     switch (taxid_tbl->type) {
13286     case eTaxidDefline_Gi:
13287         {
13288             FDBTaxidDeflineData_Gi* gi_taxid_pairs =
13289                 (FDBTaxidDeflineData_Gi*) taxid_tbl->data;
13290             ASSERT(seqid == NULL);
13291             gi_taxid_pairs[taxid_tbl->count].gi = gi;
13292             gi_taxid_pairs[taxid_tbl->count++].taxid = taxid;
13293         }
13294         break;
13295     case eTaxidDefline_Seqid:
13296         {
13297             FDBTaxidDeflineData_Seqid* seqid_taxid_pairs =
13298                 (FDBTaxidDeflineData_Seqid*) taxid_tbl->data;
13299             ASSERT(seqid != NULL);
13300             StringNCpy_0(seqid_taxid_pairs[taxid_tbl->count].seqid, seqid,
13301                          ID_MAX_SIZE);
13302             seqid_taxid_pairs[taxid_tbl->count++].taxid = taxid;
13303         }
13304         break;
13305     default:
13306         abort();
13307     }
13308 
13309     return TRUE;
13310 }
13311 
13312 /** HeapSort comparison function to sort a TaxidDeflineTable structure
13313  * (sorts by gi)
13314  */
s_TaxidDeflineDataGi_Compare(VoidPtr i,VoidPtr j)13315 static int LIBCALLBACK s_TaxidDeflineDataGi_Compare(VoidPtr i, VoidPtr j)
13316 {
13317     Int4 gi1 = ((FDBTaxidDeflineData_Gi*)i)->gi;
13318     Int4 gi2 = ((FDBTaxidDeflineData_Gi*)j)->gi;
13319 
13320     return BLAST_CMP(gi1, gi2);
13321 }
13322 
13323 /** HeapSort comparison function to sort a TaxidDeflineTable structure
13324  * (sorts by seqid strings)
13325  */
s_TaxidDeflineDataSeqid_Compare(VoidPtr i,VoidPtr j)13326 static int LIBCALLBACK s_TaxidDeflineDataSeqid_Compare(VoidPtr i, VoidPtr j)
13327 {
13328     const Char* seqid1 = ((FDBTaxidDeflineData_Seqid*)i)->seqid;
13329     const Char* seqid2 = ((FDBTaxidDeflineData_Seqid*)j)->seqid;
13330 
13331     return StringCmp(seqid1, seqid2);
13332 }
13333 
13334 static FDBTaxidDeflineTablePtr
s_FDBTaxidDeflineTableNew_Gi(const Char * filename)13335 s_FDBTaxidDeflineTableNew_Gi(const Char* filename)
13336 {
13337     FDBTaxidDeflineTablePtr retval = NULL;
13338     FILE* fp = NULL;
13339     size_t data_type_size = 0;
13340 
13341     if ( !filename )
13342         return NULL;
13343 
13344     if ( !(fp = FileOpen(filename, "r")))
13345         return NULL;
13346 
13347     retval = (FDBTaxidDeflineTablePtr) MemNew(sizeof(FDBTaxidDeflineTable));
13348     if ( !retval ) {
13349         FileClose(fp);
13350         return NULL;
13351     }
13352 
13353     retval->count = 0;
13354     retval->allocated = INDEX_INIT_SIZE;
13355     retval->type = eTaxidDefline_Gi;
13356     data_type_size = s_FDBTaxidDeflineTable_GetDataTypeSize(retval);
13357 
13358     if ( !(retval->data = MemNew(data_type_size*retval->allocated))) {
13359         FileClose(fp);
13360         return FDBTaxidDeflineTableFree(retval);
13361     }
13362 
13363     /* Each line in the input file has the following format:
13364        gi taxid
13365        gi taxid
13366        ...
13367      */
13368     {
13369         Int4 gi = -1, taxid = -1;
13370         Int4 nread = 0; /* number of elements assigned by fscanf */
13371         Boolean success = FALSE;
13372         while ( (nread = fscanf(fp, "%d %d", &gi, &taxid)) != EOF) {
13373             if (nread != 2) {
13374                 break;
13375             }
13376             success = s_FDBTaxidDeflineTableAddEntry(retval, gi,
13377                                                      kNoSeqid, taxid);
13378             if ( !success ) {
13379                 break;
13380             }
13381         }
13382         if ( !feof(fp) || ferror(fp) || nread != EOF ) {
13383             ErrPostEx(SEV_INFO, 0, 0, "Failed to read "
13384                       "gi/taxonomy id pairs from %s", filename);
13385             FileClose(fp);
13386             return FDBTaxidDeflineTableFree(retval);
13387         }
13388         FileClose(fp);
13389     }
13390 
13391     if (retval->count == 0) {
13392         return FDBTaxidDeflineTableFree(retval);
13393     }
13394 
13395     /* Sort the list by gis */
13396     HeapSort(retval->data, retval->count, data_type_size,
13397              s_TaxidDeflineDataGi_Compare);
13398 
13399     ErrLogPrintf("Read %d gi/taxonomy id pairs from %s\n",
13400                  retval->count, filename);
13401 
13402     return retval;
13403 }
13404 
13405 static FDBTaxidDeflineTablePtr
s_FDBTaxidDeflineTableNew_Seqid(const Char * filename)13406 s_FDBTaxidDeflineTableNew_Seqid(const Char* filename)
13407 {
13408     FDBTaxidDeflineTablePtr retval = NULL;
13409     FILE* fp = NULL;
13410     size_t data_type_size = 0;
13411 
13412     if ( !filename )
13413         return NULL;
13414 
13415     if ( !(fp = FileOpen(filename, "r")))
13416         return NULL;
13417 
13418     retval = (FDBTaxidDeflineTablePtr) MemNew(sizeof(FDBTaxidDeflineTable));
13419     if ( !retval ) {
13420         FileClose(fp);
13421         return NULL;
13422     }
13423 
13424     retval->count = 0;
13425     retval->allocated = INDEX_INIT_SIZE;
13426     retval->type = eTaxidDefline_Seqid;
13427     data_type_size = s_FDBTaxidDeflineTable_GetDataTypeSize(retval);
13428 
13429     if ( !(retval->data = MemNew(data_type_size*retval->allocated))) {
13430         FileClose(fp);
13431         return FDBTaxidDeflineTableFree(retval);
13432     }
13433 
13434     /* Each line in the input file has the following format:
13435        seqid taxid
13436        seqid taxid
13437        ...
13438 
13439        N.B.: seqid is a string with ID_MAX_SIZE characters, which does NOT
13440        include a leading '>' character
13441      */
13442     {
13443         Int4 nread = 0; /* number of elements assigned by fscanf */
13444         Char format[ID_MAX_SIZE] = { '\0' }; /* format string for fscanf */
13445         Int4 taxid = -1;
13446         Char seqid_buf[ID_MAX_SIZE+1] = { '\0' };
13447         Boolean success = FALSE;
13448 
13449         StringCat(format, "%");
13450         StringCat(format, Nlm_Int8tostr((Int8)ID_MAX_SIZE, 1));
13451         StringCat(format, "s %d");
13452 
13453         while ( (nread = fscanf(fp, format, &seqid_buf, &taxid)) != EOF) {
13454             if (nread != 2) {
13455                 break;
13456             }
13457             success = s_FDBTaxidDeflineTableAddEntry(retval, kNoGi,
13458                                                      seqid_buf, taxid);
13459             if ( !success ) {
13460                 break;
13461             }
13462         }
13463         if ( !feof(fp) || ferror(fp) || nread != EOF ) {
13464             ErrPostEx(SEV_INFO, 0, 0, "Failed to read "
13465                       "Seq-id/taxonomy id pairs from %s", filename);
13466             FileClose(fp);
13467             return FDBTaxidDeflineTableFree(retval);
13468         }
13469         FileClose(fp);
13470     }
13471 
13472     if (retval->count == 0) {
13473         return FDBTaxidDeflineTableFree(retval);
13474     }
13475 
13476     /* Sort the list by seqids */
13477     HeapSort(retval->data, retval->count, data_type_size,
13478              s_TaxidDeflineDataSeqid_Compare);
13479 
13480     ErrLogPrintf("Read %d Seq-id/taxonomy id pairs from %s\n",
13481                  retval->count, filename);
13482 
13483     return retval;
13484 }
13485 
13486 FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableNew(const Char * filename)13487 FDBTaxidDeflineTableNew PROTO((const Char* filename))
13488 {
13489     FDBTaxidDeflineTablePtr retval = NULL;
13490 
13491     /* Try reading a list of gi/taxid pairs */
13492     retval = s_FDBTaxidDeflineTableNew_Gi(filename);
13493     if ( !retval ) {
13494         /* Try reading a list of seqid/taxid pairs */
13495         retval = s_FDBTaxidDeflineTableNew_Seqid(filename);
13496     }
13497     return retval;
13498 }
13499 
13500 FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableFree(FDBTaxidDeflineTablePtr taxid_tbl)13501 FDBTaxidDeflineTableFree PROTO((FDBTaxidDeflineTablePtr taxid_tbl))
13502 {
13503     if ( !taxid_tbl ) {
13504         return NULL;
13505     }
13506 
13507     /* Use a switch statement in case there's ever a need for a more elaborate
13508      * TaxidDefline_* data type */
13509     switch (taxid_tbl->type) {
13510     case eTaxidDefline_Gi:
13511         taxid_tbl->data = MemFree(taxid_tbl->data);
13512         break;
13513 
13514     case eTaxidDefline_Seqid:
13515         taxid_tbl->data = MemFree(taxid_tbl->data);
13516         break;
13517 
13518     default:
13519         abort();
13520     }
13521 
13522     return MemFree(taxid_tbl);
13523 }
13524 
13525 static Int4
s_FDBTaxidDeflineTableSearch_Gi(const FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi)13526 s_FDBTaxidDeflineTableSearch_Gi(const FDBTaxidDeflineTablePtr taxid_tbl,
13527                                 Int4 gi)
13528 {
13529     FDBTaxidDeflineData_Gi* gi_taxid_pairs =
13530         (FDBTaxidDeflineData_Gi*) taxid_tbl->data;
13531     Int4 retval = kTaxidDeflineSearch_NotFound;
13532 
13533     /* perform binary search */
13534     {
13535         Int4 m, b, e;
13536         b = 0;
13537         e = taxid_tbl->count;
13538         while (b <= e) {
13539             m = (b + e) / 2;
13540             if (gi_taxid_pairs[m].gi > gi) {
13541                 e = m - 1;
13542             } else if (gi_taxid_pairs[m].gi < gi) {
13543                 b = m + 1;
13544             } else {
13545                 retval = gi_taxid_pairs[m].taxid;
13546                 break;
13547             }
13548         }
13549     }
13550 
13551     return retval;
13552 }
13553 
13554 static Int4
s_FDBTaxidDeflineTableSearch_Seqid(const FDBTaxidDeflineTablePtr taxid_tbl,const Char * seqid)13555 s_FDBTaxidDeflineTableSearch_Seqid(const FDBTaxidDeflineTablePtr taxid_tbl,
13556                                    const Char* seqid)
13557 {
13558     FDBTaxidDeflineData_Seqid* seqid_taxid_pairs =
13559         (FDBTaxidDeflineData_Seqid*) taxid_tbl->data;
13560     Int4 retval = kTaxidDeflineSearch_NotFound;
13561 
13562     if ( !seqid ) {
13563         return retval;
13564     }
13565 
13566     /* perform binary search */
13567     {
13568         Int4 m, b, e, rv;
13569         b = 0;
13570         e = taxid_tbl->count;
13571         while (b <= e) {
13572             m = (b + e) / 2;
13573             rv = StringCmp(seqid_taxid_pairs[m].seqid, seqid);
13574             if (rv > 0) {
13575                 e = m - 1;
13576             } else if (rv < 0) {
13577                 b = m + 1;
13578             } else {
13579                 retval = seqid_taxid_pairs[m].taxid;
13580                 break;
13581             }
13582         }
13583     }
13584 
13585     return retval;
13586 }
13587 
13588 static Int4
s_FDBTaxidDeflineTableSearch(const FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi,const Char * seqid)13589 s_FDBTaxidDeflineTableSearch(const FDBTaxidDeflineTablePtr taxid_tbl,
13590                              Int4 gi, const Char* seqid)
13591 {
13592     Int4 retval = kTaxidDeflineSearch_NotFound;
13593 
13594     if ( !taxid_tbl ) {
13595         return retval;
13596     }
13597 
13598     switch (taxid_tbl->type) {
13599     case eTaxidDefline_Gi:
13600         retval = s_FDBTaxidDeflineTableSearch_Gi(taxid_tbl, gi);
13601         break;
13602 
13603     case eTaxidDefline_Seqid:
13604         retval = s_FDBTaxidDeflineTableSearch_Seqid(taxid_tbl, seqid);
13605         break;
13606 
13607     default:
13608         abort();
13609     }
13610 
13611     return retval;
13612 }
13613 
/** Looks up the taxid for a gi.
 *
 * Forwards to s_FDBTaxidDeflineTableSearch() with kNoSeqid as the Seq-id,
 * and returns its result unchanged.
 */
Int4 LIBCALL
FDBTaxidDeflineTableSearchGi PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
                                    Int4 gi))
{
    return s_FDBTaxidDeflineTableSearch(taxid_tbl, gi, kNoSeqid);
}
13620 
/** Looks up the taxid for a Seq-id string.
 *
 * Forwards to s_FDBTaxidDeflineTableSearch() with kNoGi as the gi, and
 * returns its result unchanged.
 */
Int4 LIBCALL
FDBTaxidDeflineTableSearchSeqid PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
                                       const Char* seqid))
{
    return s_FDBTaxidDeflineTableSearch(taxid_tbl, kNoGi, seqid);
}
13627 
13628 static void
s_FDBUpdateTaxIdInSingleBdp(BlastDefLinePtr bdp,const FDBTaxidDeflineTablePtr taxid_tbl)13629 s_FDBUpdateTaxIdInSingleBdp(BlastDefLinePtr bdp,
13630                             const FDBTaxidDeflineTablePtr taxid_tbl)
13631 {
13632     Int4 taxid = kTaxidDeflineSearch_NotFound;
13633 
13634     if ( !taxid_tbl ) {
13635         return;
13636     }
13637 
13638     /* Retrieve the tax id */
13639     switch (taxid_tbl->type) {
13640     case eTaxidDefline_Gi:
13641         {
13642             SeqIdPtr sip = NULL;
13643             if( (sip = SeqIdFindBest(bdp->seqid, SEQID_GI)) != NULL) {
13644                 Int4 gi = sip->data.intvalue;
13645 #ifdef TAX_CS_LOOKUP
13646                 taxid = tax1_getTaxId4GI(gi);
13647 #else
13648                 taxid = FDBTaxidDeflineTableSearchGi(taxid_tbl, gi);
13649 #endif
13650             }
13651         }
13652         break;
13653 
13654     case eTaxidDefline_Seqid:
13655         {
13656             Char buf[ID_MAX_SIZE+1] = { '\0' };
13657             SeqIdWrite(bdp->seqid, buf, PRINTID_FASTA_LONG, sizeof(buf));
13658             taxid = FDBTaxidDeflineTableSearchSeqid(taxid_tbl, buf);
13659         }
13660         break;
13661 
13662     default:
13663         abort();
13664     }
13665 
13666     /* Assign the tax id */
13667     if (taxid != kTaxidDeflineSearch_NotFound) {
13668         bdp->taxid = taxid;
13669     }
13670 }
13671 
13672 static void
s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,const FDBTaxidDeflineTablePtr taxid_tbl)13673 s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,
13674                           const FDBTaxidDeflineTablePtr taxid_tbl)
13675 {
13676     for (; bdp; bdp = bdp->next) {
13677         s_FDBUpdateTaxIdInSingleBdp(bdp, taxid_tbl);
13678     }
13679 }
13680 
13681