1 static char const rcsid[] = "$Id: readdb.c,v 6.549 2016/09/02 15:04:59 ucko Exp $"; /* RCS identification string recording this file's name, revision, date and last committer */
2
3 /* $Id: readdb.c,v 6.549 2016/09/02 15:04:59 ucko Exp $ */
4 /*
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 */
29 /*****************************************************************************
30
31 File name: readdb.c
32
33 Author: Tom Madden
34
35 Contents: Reads Databases formatted by formatdb.
36
37 Detailed Contents:
38
39 - memory maps files.
40
41 - database sequences are identified (by these routines) by their
42 order in the files. this is based on a zero-offset.
43
44
45 ******************************************************************************/
46
47 /* File Name: readdb.c
48 *
49 * Author: Tom Madden
50 *
51 * Version Creation Date: 3/22/95
52 *
53 * $Revision: 6.549 $
54 *
55 * File Description:
56 * Functions to rapidly read databases from files produced by formatdb.
57 *
58 * Modifications:
59 * --------------------------------------------------------------------------
60 * Date Name Description of modification
61 * ------- ---------- -----------------------------------------------------
62 *
63 * ==========================================================================
64 *
65 *
66 * RCS Modification History:
67 * $Log: readdb.c,v $
68 * Revision 6.549 2016/09/02 15:04:59 ucko
69 * readdb.c: accommodate systems that use GNU libc with non-Linux kernels
70 * (such as the Hurd or the FreeBSD kernel), as already done in
71 * Debian/Ubuntu packages.
72 *
73 * Revision 6.548 2012/03/14 20:09:35 camacho
74 * Fix buffer overrun JIRA BD-348
75 *
76 * Revision 6.547 2012/03/14 20:02:52 camacho
77 * Fix buffer overrun JIRA BD-348
78 *
79 * Revision 6.546 2011/12/19 18:37:35 gouriano
80 * Corrected printf formatting. NOJIRA
81 *
82 * Revision 6.545 2011/11/28 20:24:51 camacho
83 * Add support for setting membership bits in the refseq_chromosome subset JIRA BD-333, BD-308
84 *
85 * Revision 6.544 2009/12/22 12:54:28 madden
86 * Try stripped off path for microbial db, JIRA WB-313
87 *
88 * Revision 6.543 2009/09/17 15:41:43 madden
89 * Consolidate checking of paths, JIRA SB-368
90 *
91 * Revision 6.542 2009/08/25 13:45:23 madden
92 * Work with database with no GI ISAM, JIRA SB-367
93 *
94 * Revision 6.541 2009/02/24 18:19:36 coulouri
95 * correct Nlm_StringTokMT invocation; fixes JIRA SB-181
96 *
97 * Revision 6.540 2009/01/14 21:50:47 madden
98 * Enable REDUCED_E2INDEX_SET
99 *
100 * Revision 6.539 2008/12/24 13:58:10 maning
101 * Quote path containing spaces. Attempt to fix JIRA SB136.
102 *
103 * Revision 6.538 2008/10/31 18:50:52 madden
104 * Add readdb_check_oid to be used with pseed (SB-109)
105 *
106 * Revision 6.537 2008/06/25 18:32:37 merezhuk
107 * use portable Nlm_StringTokMT.
108 *
109 * Revision 6.536 2008/06/25 14:28:55 merezhuk
110 * support for multiple BLAST DB locations as in CSeqDB
111 *
112 * Revision 6.535 2008/02/26 18:34:20 kans
113 * use SeqDescrAddPointer instead of ValNodeAddPointer/Str
114 *
115 * Revision 6.534 2007/12/04 19:43:50 madden
116 * Index accession.version for swissprot
117 *
118 * Revision 6.533 2007/11/27 18:51:54 madden
119 * More efficient retrievals on string isam indices
120 *
121 * Revision 6.532 2007/11/15 21:10:49 madden
122 * New version of SeqIdE2Index ifdef by REDUCED_E2INDEX_SET
123 *
124 * Revision 6.531 2007/11/06 20:09:39 coulouri
125 * when printing taxonomy info, skip irrelevant sequences if a gi target was specified; fixes blast-rt#15347680
126 *
127 * Revision 6.530 2007/09/27 17:20:54 madden
128 * Add readdb_get_full_filename
129 *
130 * Revision 6.529 2007/08/17 15:56:10 papadopo
131 * 1. Make increment of reference count in readdb_attach atomic
132 * 2. Never initialize the reference count to a fixed value when
133 * memory-mapping database files, only increment it (fixes RT 15280141)
134 *
135 * Revision 6.528 2007/07/12 20:44:07 papadopo
136 * open .nsd as a binary file for writing
137 *
138 * Revision 6.527 2007/05/16 18:47:56 camacho
139 * Fix RT#15284902
140 *
141 * Revision 6.526 2007/05/08 13:09:39 madden
142 * Add ability to read STATS_NSEQ and STATS_TOTLEN from alias file with function readdb_get_stats_numbers
143 *
144 * Revision 6.525 2007/05/07 13:30:54 kans
145 * added casts for Seq-data.gap (SeqDataPtr, SeqGapPtr, ByteStorePtr)
146 *
147 * Revision 6.524 2007/05/03 15:51:53 madden
148 * Do not require title to use alias file
149 *
150 * Revision 6.523 2007/04/12 20:19:06 camacho
151 * Remove informational messages about membership/links bits
152 *
153 * Revision 6.522 2007/02/27 15:16:24 camacho
154 * DBLIST field is mandatory
155 *
156 * Revision 6.521 2007/01/05 16:01:22 camacho
157 * Force munmap of string ISAM files after exceeding kSISAM_MaxNumVolumes volumes
158 * to avoid running out of memory on accession lookups. Fixes rt #15235977.
159 *
160 * Revision 6.520 2006/10/17 15:24:31 camacho
161 * Fix memory leak when printing accession list in fastacmd
162 *
163 * Revision 6.519 2006/09/27 18:51:01 camacho
164 * Bug fix in readdb_read_alias_file
165 *
166 * Revision 6.518 2006/09/27 14:19:12 camacho
167 * Bug fix in OID_GI_BelongsToMaskDB
168 *
169 * Revision 6.517 2006/09/19 19:37:28 kans
170 * readdb_parse_db_names has quote_mode (TM) to allow spaces in paths if bounded by double-quote marks
171 *
172 * Revision 6.516 2006/08/10 17:47:27 camacho
173 * Bug fix in readdb_acc2fasta: added quick check in oidlist in OID_GI_BelongsToMaskDB
174 *
175 * Revision 6.515 2006/08/07 15:19:12 camacho
176 * + is_REFSEQ_GENOMIC to FDBLoadMembershipsTable
177 *
178 * Revision 6.514 2006/08/07 15:03:57 camacho
179 * +is_REFSEQ_GENOMIC
180 *
181 * Revision 6.513 2006/07/13 20:10:06 camacho
182 * Bug fix in ScanDIFile
183 *
184 * Revision 6.512 2006/07/06 19:48:19 camacho
185 * Fix to previous commit
186 *
187 * Revision 6.511 2006/07/06 19:37:30 camacho
188 * Add extra sanity checks in ScanDIFile
189 *
190 * Revision 6.510 2006/07/05 18:24:08 camacho
191 * Fixes to ScanDIFile to read molecule type
192 *
193 * Revision 6.509 2006/07/05 16:07:48 camacho
194 * Minor changes in FDBLoadLinksTable
195 *
196 * Revision 6.508 2006/07/03 18:27:22 coulouri
197 * correct volume size defaults for protein databases
198 *
199 * Revision 6.507 2006/06/19 18:37:08 coulouri
200 * improve default handling for non-formatdb clients
201 *
202 * Revision 6.506 2006/06/19 17:20:14 coulouri
203 * Extend 1GB default volume size to all platforms and impose a hard limit of 4G. rt#15171398
204 *
205 * Revision 6.505 2006/06/05 19:59:53 camacho
206 * Changes to ScanDIFile, is_REFSEQ_RNA, and FDBAddSequence2 to handle the new
207 * DI_Record::mol field.
208 *
209 * Revision 6.504 2006/05/30 20:27:51 jianye
210 * fixing memory leak in FDBuildOldStyleDefline
211 *
212 * Revision 6.503 2006/05/11 13:51:45 kans
213 * made is_REFSEQ_RNA compatible with C compiler conventions
214 *
215 * Revision 6.502 2006/05/10 20:47:16 camacho
216 * From Ilya Dondoshansky: 1. Several FDB functions made public - needed for incremental dump efficiency; 2. Added mol field to SI_Record and DI_Record, check for mol = rna in is_REFSEQ_RNA; 3. Avoid redundant sorting of ISAM files; 4. In readdb_get_pig: look for PIG in all deflines in a set, until found.
217 *
218 * Revision 6.501 2006/05/04 20:07:27 camacho
219 * Report fatal error in case of failure to add sequence to BLAST database because
220 * of zero-length sequence and clean up the database that was being created.
221 *
222 * Revision 6.500 2006/04/24 15:50:19 camacho
223 * + is_REFSEQ_RNA
224 *
225 * Revision 6.499 2006/03/16 14:14:23 camacho
226 * Fix parsing of locations for fastacmd command line argument (rt # 15151399)
227 *
228 * Revision 6.498 2006/03/09 21:56:02 camacho
229 * Refactored sequence hash function
230 *
231 * Revision 6.497 2006/03/08 19:06:15 camacho
232 * Added definition for maximum number of volumes and FDBCleanUpInProgress, fixes rt ticket 15147600
233 *
234 * Revision 6.496 2006/02/15 21:07:28 camacho
235 * Add validation to fastacmd to reject mixed protein/nucleotide databases
236 *
237 * Revision 6.495 2006/01/11 16:24:45 camacho
238 * Fix bug in Fastacmd_PrintTaxonomyInfo
239 *
240 * Revision 6.494 2005/12/23 16:30:57 camacho
241 * Remove assertion no longer needed
242 *
243 * Revision 6.493 2005/12/02 14:04:07 camacho
244 * Minor fix in ScanDIFile
245 *
246 * Revision 6.492 2005/11/22 21:23:05 madden
247 * Fix in FDLCreateAsnDF for multiple volumes if input FASTA not parsed
248 *
249 * Revision 6.491 2005/10/04 20:40:42 madden
250 * Make PrintDbInformationBasicEx public, minor optimization to PrintDbInfoWithRID
251 *
252 * Revision 6.490 2005/10/04 16:40:25 madden
253 * Fix nit found by C++ compiler
254 *
255 * Revision 6.489 2005/10/04 15:44:54 madden
256 * Workaround to time-out problem of PrintDbInformationWithRID
257 *
258 * Revision 6.488 2005/09/30 14:54:32 camacho
259 * Enable recognition of the formatdb configuration file to allow users to set the
260 * membership and link bits in the ASN.1 deflines.
261 *
262 * Revision 6.487 2005/09/20 14:08:29 camacho
263 * Add error message when trying to dump subset database with gi file
264 *
265 * Revision 6.486 2005/09/08 13:19:11 camacho
266 * Remove unneeded assertion
267 *
268 * Revision 6.485 2005/09/02 21:52:13 camacho
269 * Correct buffer overflow on sparc
270 *
271 * Revision 6.484 2005/08/16 17:51:14 dondosha
272 * Decrement thread count in shared_info only if sequence/header files are open for this instance of readdb
273 *
274 * Revision 6.483 2005/08/07 01:55:32 camacho
275 * Bug fix to FDBAddSequence2
276 *
277 * Revision 6.482 2005/08/04 16:08:29 coulouri
278 * correct buffer overflow on sparc
279 *
280 * Revision 6.481 2005/08/04 15:29:47 camacho
281 * Fix to SI_RecordAddFormatdb_ver
282 *
283 * Revision 6.480 2005/07/28 14:57:10 coulouri
284 * remove dead code
285 *
286 * Revision 6.479 2005/07/27 21:30:02 camacho
287 * 1) Replaces is_REFSEQ_* functions by a single function (is_REFSEQ), to be
288 * used by genmask and ID1 group's BLAST database dumper.
289 * 2) Removed out-of-date is_WGS* functions.
290 *
291 * Revision 6.478 2005/07/27 17:48:57 coulouri
292 * remove hardcoded paths
293 *
294 * Revision 6.477 2005/06/22 13:55:22 coulouri
295 * add support for dumping accessions
296 *
297 * Revision 6.476 2005/06/21 19:15:50 dondosha
298 * In FD_CreateAliasFileEx, if there is a gi list, always add NSEQ and LENGTH lines, even with 0 values
299 *
300 * Revision 6.475 2005/06/08 19:25:36 camacho
301 * New feature to allow formatdb to add taxonomy ids to BLAST databases
302 * generated from FASTA input
303 * BugzID: 6
304 *
305 * Revision 6.474 2005/05/16 16:12:45 camacho
306 * Added auxiliary function for the SI_Record structure to fix a bug in
307 * FDBAddSequence, which caused all but the first BlastDefLine structure in
308 * a linked list to be ignored.
309 *
310 * Revision 6.473 2005/04/26 21:34:39 kans
311 * added SEQID_GPIPE
312 *
313 * Revision 6.472 2005/04/20 19:02:15 lavr
314 * +<assert.h>
315 *
316 * Revision 6.471 2005/04/11 18:55:16 coulouri
317 * Make BLASTDB environment variable usage consistent across platforms
318 *
319 * Revision 6.470 2005/04/11 18:04:56 madden
320 * Fix for alignment issue in readdb_get_sequence_ex
321 *
322 * Revision 6.469 2005/04/07 12:19:35 madden
323 * Refactor readdb_get_sequence_ex to eliminate unnecessary allocations
324 *
325 * Revision 6.468 2005/04/06 16:01:25 camacho
326 * Return -1 in case of memory allocation failures in readdb_get_sequence_ex
327 *
328 * Revision 6.467 2005/02/24 14:34:05 camacho
329 * Fix invocation of FDBAddSequence
330 *
331 * Revision 6.466 2005/02/22 14:15:48 camacho
332 * Pass bioseq data type by reference to FDBAddBioseq
333 *
334 * Revision 6.465 2004/12/07 15:14:14 kans
335 * third parameter to readdb_get_header_ex needs to be pointer to Uint4, not Int4 - CodeWarrior error
336 *
337 * Revision 6.464 2004/12/04 03:41:09 camacho
338 * Add extra enum for fastacmd -D option for error checking
339 *
340 * Revision 6.463 2004/12/03 04:57:57 camacho
341 * Fix name conflict in enumeration for fastacmd dump types
342 *
343 * Revision 6.462 2004/12/02 20:37:31 camacho
344 * + fastacmd feature to dump list of gis
345 *
346 * Revision 6.461 2004/11/22 20:54:58 coulouri
347 * optimization for subset database searches restricted by gi list
348 *
349 * Revision 6.460 2004/10/28 15:39:37 camacho
350 * Fixes to previous commit
351 *
352 * Revision 6.459 2004/10/04 18:00:00 madden
353 * Further fixes for SI_Record.title
354 *
355 * Revision 6.458 2004/09/27 16:29:34 madden
356 * Make title on SI_Record dynamically allocated
357 *
358 * Revision 6.457 2004/09/21 21:42:57 dondosha
359 * Initialize BlastDefLine before call to FDBAddBioseq in FastaToBlastDB
360 *
361 * Revision 6.456 2004/09/09 20:58:26 camacho
362 * Add sanity checks in readdb_read_alias_file
363 *
364 * Revision 6.455 2004/08/25 14:45:23 camacho
365 * Refactorings to allow formatdb process multiple deflines
366 *
367 * Revision 6.454 2004/08/06 17:55:35 madden
368 * Add new owners to is_REFSEQ_RNA
369 *
370 * Revision 6.453 2004/08/06 13:56:11 madden
371 * Add owner 45 to is_REFSEQ_PROTEIN
372 *
373 * Revision 6.452 2004/08/05 19:33:32 madden
374 * Add ownership 38 and 52 to Refseq for proteins
375 *
376 * Revision 6.451 2004/07/26 20:51:38 camacho
377 * Fix mismatched data type
378 *
379 * Revision 6.450 2004/07/22 16:16:41 camacho
380 * Guard against arguments longer than PATH_MAX to FindBlastDBFile
381 *
382 * Revision 6.449 2004/07/19 22:37:46 dondosha
383 * Added mutex lock/unlock around shared info manipulation in readdb_destruct_element
384 *
385 * Revision 6.448 2004/07/14 18:35:33 camacho
386 * Remove unneeded error message in readdb_get_header_ex
387 *
388 * Revision 6.447 2004/07/13 19:57:00 dondosha
389 * Tiny memory leak fix
390 *
391 * Revision 6.446 2004/07/13 17:31:33 camacho
392 * Fix for genmask to count only non-redundant sequences added to the masked
393 * databases instead of all sequences.
394 *
395 * Revision 6.445 2004/07/09 15:40:22 dondosha
396 * Fix in ReadDBOpenMHdrAndSeqFiles: increment nthreads if at least one of header or sequence files is already mapped
397 *
398 * Revision 6.444 2004/07/08 21:25:48 kans
399 * fixed Mac compiler error in FDBExtend4Sequence
400 *
401 * Revision 6.443 2004/07/08 19:49:02 camacho
402 * Contributions from ID1 Group:
403 * 1) SI_Record structure.
404 * 2) Refactoring of FDBAddSequence2 to allow addition of non-redundant sequences
405 * when creating BLAST databases.
406 *
407 * Revision 6.442 2004/06/30 13:42:27 kans
408 * include <blfmtutl.h> to clear up Mac compiler missing prototype errors
409 *
410 * Revision 6.441 2004/05/04 17:07:20 kans
411 * ReadDBBioseqFetchFunc checks result of ReadDBFindFetchStruct call for NULL before attempting to dereference - picked up by trying to use multiple threads
412 *
413 * Revision 6.440 2004/04/21 16:54:52 camacho
414 * Added removal for PIG files
415 *
416 * Revision 6.439 2004/04/13 17:22:46 camacho
417 * Optimization to Int4ListReadFromFile
418 *
419 * Revision 6.438 2004/04/01 13:43:08 lavr
420 * Spell "occurred", "occurrence", and "occurring"
421 *
422 * Revision 6.437 2004/03/29 05:17:55 camacho
423 * Fix to Int4ListConcat
424 *
425 * Revision 6.436 2004/03/15 18:45:05 coulouri
426 * Throw fatal error if BSRebuildDNA_4na() fails
427 *
428 * Revision 6.435 2004/02/24 16:32:52 camacho
429 * Use correct calling convention for win32
430 *
431 * Revision 6.434 2004/02/24 14:06:00 camacho
432 * Added support for approximate sequence length calculation for nucleotide
433 * sequences.
434 *
435 * Revision 6.433 2004/02/09 20:53:20 camacho
436 * Add FDBAddPig call from FDBAddSequence2
437 *
438 * Revision 6.432 2004/02/04 15:35:04 camacho
439 * Rollback to fix problems in release 2.2.7
440 *
441 * Revision 6.429 2004/01/29 20:48:07 coulouri
442 * Only limit volume sizes on 32-bit platforms
443 *
444 * Revision 6.428 2004/01/28 19:34:51 camacho
445 * Added sanity check for alias files
446 *
447 * Revision 6.427 2004/01/26 13:52:31 camacho
448 * Do not use snprintf
449 *
450 * Revision 6.426 2004/01/23 21:13:54 camacho
451 * 1. Refactored code to create multiple volumes.
452 * 2. Set the maximum sequence file size to 1GB.
453 *
454 * Revision 6.425 2004/01/12 23:06:36 camacho
455 * Sort link bit gi lists
456 *
457 * Revision 6.424 2003/10/01 19:03:50 camacho
458 * Fix in readdb_get_totals_ex2 to use the alias file length/number of entries when
459 * gilist is populated.
460 *
461 * Revision 6.423 2003/09/02 18:32:10 dondosha
462 * Changed http link for completed microbial genomes at genomes group request
463 *
464 * Revision 6.422 2003/08/08 19:31:37 camacho
465 * Minor fix for formatdb
466 *
467 * Revision 6.421 2003/07/28 13:59:17 camacho
468 * Bug fix
469 *
470 * Revision 6.420 2003/07/15 16:49:32 camacho
471 * Skip whitespace in alias files
472 *
473 * Revision 6.419 2003/07/10 14:00:59 camacho
474 * Fixed some memory leaks
475 *
476 * Revision 6.418 2003/07/08 18:42:39 camacho
477 * Elaborated fastacmd return values
478 *
479 * Revision 6.417 2003/07/02 19:22:10 camacho
480 * formatdb fix to remove stdin from database title
481 *
482 * Revision 6.416 2003/06/13 19:56:26 dondosha
483 * Removed call to SeqEntrySetScope in FastaToBlastDB that caused purify errors
484 *
485 * Revision 6.415 2003/05/30 17:25:37 coulouri
486 * add rcsid
487 *
488 * Revision 6.414 2003/05/21 21:33:36 camacho
489 * Deprecated isCommonIndex global
490 *
491 * Revision 6.413 2003/05/15 14:45:48 dondosha
492 * readdb_get_sequence_number returns -1 if rdfp is NULL
493 *
494 * Revision 6.412 2003/05/15 14:14:18 dondosha
495 * Check if offset is larger than total length of all rdfps in readdb_get_sequence_number
496 *
497 * Revision 6.411 2003/05/13 16:02:53 coulouri
498 * make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
499 *
500 * Revision 6.410 2003/05/12 12:24:02 camacho
501 * Fixed readdb_get_totals_ex2
502 *
503 * Revision 6.409 2003/05/01 14:10:03 camacho
504 * 1. Fixed readdb_get_totals_ex2 to use alias length and number of sequences
505 * without an oidlist
506 * 2. Fixed readdb_merge_gifiles to properly sort the rdfp linked list (rdfp_chain)
507 * 3. Fixed readdb_gi2seq to look into subsequent rdfps if no isam indices are
508 * found in the first rdfp
509 *
510 * Revision 6.408 2003/04/28 19:50:10 camacho
511 * Fixes to readdb_merge_gifiles
512 *
513 * Revision 6.407 2003/04/27 02:43:25 vakatov
514 * Added missing LIBCALL -- for MS-Win compilation
515 *
516 * Revision 6.406 2003/04/25 18:55:27 camacho
517 * 1. Added readdb_merge_gifiles to deal with Microbial blast database issues.
518 * 2. Minor fixes to Int4List functions.
519 *
520 * Revision 6.405 2003/04/24 15:44:42 camacho
521 * Fixes for windows build
522 *
523 * Revision 6.404 2003/04/24 13:16:25 camacho
524 * Minor fix
525 *
526 * Revision 6.403 2003/04/23 15:15:36 camacho
527 * Moved reading of gi list to readdb
528 *
529 * Revision 6.402 2003/04/22 21:30:13 camacho
530 * Added Int4 list utilities
531 *
532 * Revision 6.401 2003/04/22 19:04:57 camacho
533 * Moved GiList structure to generic list of 4-byte integers
534 *
535 * Revision 6.400 2003/04/17 21:10:54 camacho
536 * Add PIGs only when removing redundancy
537 *
538 * Revision 6.399 2003/04/15 19:09:13 camacho
539 * Completed implementation of PIG interface
540 *
541 * Revision 6.398 2003/04/14 19:53:30 camacho
542 * Fixed memory leak
543 *
544 * Revision 6.397 2003/04/09 21:46:00 camacho
545 * Added basic PIG interface
546 *
547 * Revision 6.396 2003/04/09 20:16:17 camacho
548 * Use #defined value for location of taxdb.tar.gz
549 *
550 * Revision 6.395 2003/04/08 15:45:02 camacho
551 * Minor fix to previous commit
552 *
553 * Revision 6.394 2003/04/08 15:37:14 camacho
554 * Extended FDBAddSequence2 to take pig
555 *
556 * Revision 6.393 2003/04/04 17:56:33 camacho
557 * fastacmd fix when retrieving repeated identifiers(-a)
558 *
559 * Revision 6.392 2003/04/03 17:57:00 vakatov
560 * Added missing LIBCALL
561 *
562 * Revision 6.391 2003/04/01 21:51:36 camacho
563 * Made fastacmd functions & structure non-static
564 *
565 * Revision 6.390 2003/03/27 22:51:16 camacho
566 * Minor change to previous commit
567 *
568 * Revision 6.389 2003/03/27 22:26:04 camacho
569 * Add error messages and non-zero return value on error for fastacmd
570 *
571 * Revision 6.388 2003/03/26 18:50:07 camacho
572 * Added eFDBCleanOpt to formatdb API
573 *
574 * Revision 6.387 2003/03/21 22:14:32 camacho
575 * Allow C ObjMgr & application to load taxonomy dbs
576 *
577 * Revision 6.386 2003/03/20 14:03:21 camacho
578 * Allow users to set the membership and link bits
579 *
580 * Revision 6.385 2003/03/14 21:39:08 camacho
581 * Fix bug in readdb_get_totals_ex2
582 *
583 * Revision 6.384 2003/03/08 23:02:35 camacho
584 * Bug fix in FDBFinish
585 *
586 * Revision 6.383 2003/03/07 13:16:42 madden
587 * Check for NULL rdfp before dereferencing
588 *
589 * Revision 6.382 2003/02/26 17:47:31 kimelman
590 * bugfix: duplicate close of Files and AsnIo
591 *
592 * Revision 6.381 2003/02/20 17:29:31 camacho
593 * Added support for the creation of empty databases
594 *
595 * Revision 6.380 2003/02/11 17:46:25 camacho
596 * Fix to FDBAddSequence2
597 *
598 * Revision 6.379 2003/01/31 17:58:29 camacho
599 * Eliminate unnecessary checks for redundant databases
600 *
601 * Revision 6.378 2003/01/31 14:39:21 camacho
602 * Use init_state argument to readdb_new_ex2
603 *
604 * Revision 6.377 2003/01/22 20:21:01 bealer
605 * - Handle error case better.
606 *
607 * Revision 6.376 2003/01/22 19:41:20 camacho
608 * Added function to build multi-volume db list for creating alias files
609 *
610 * Revision 6.375 2003/01/07 17:18:15 camacho
611 * Remove warning message when file is not found by FindBlastDBFile
612 *
613 * Revision 6.374 2003/01/02 22:17:54 madden
614 * Print name of database when wrong version used
615 *
616 * Revision 6.373 2002/12/19 14:25:07 camacho
617 * Minor change
618 *
619 * Revision 6.372 2002/12/17 20:33:25 camacho
620 * Removed unnecessary function attribute
621 *
622 * Revision 6.371 2002/12/17 20:01:18 madden
623 * Fix for oidlist when no memory-mapping available
624 *
625 * Revision 6.370 2002/12/17 17:46:01 madden
626 * readdb_get_sequence_number does not check for oidlist
627 *
628 * Revision 6.369 2002/12/16 20:22:48 camacho
629 * Removed unused options in formatdb options structure
630 *
631 * Revision 6.368 2002/12/16 05:01:54 camacho
632 * Fixes to previous commit
633 *
634 * Revision 6.367 2002/12/13 16:01:25 kans
635 * fixed mac compiler complaints
636 *
637 * Revision 6.366 2002/12/13 13:43:22 camacho
638 * Changes to set links and membership bits in formatdb API
639 *
640 * Revision 6.365 2002/12/11 17:05:53 camacho
641 * Added code to handle mmap failures in MT mode
642 *
643 * Revision 6.364 2002/12/10 18:31:44 camacho
644 * Added taxonomy database loading when using ReadDBBioseqFetchEnable
645 *
646 * Revision 6.363 2002/11/27 20:06:01 camacho
647 * Fix to deal with non-parseable seqids in new database format
648 *
649 * Revision 6.362 2002/11/25 17:23:28 camacho
650 * 1) Changed file access to blast taxonomy databases: only 2 files are loaded
651 * for an entire chain of rdfp's.
652 * 2) Fixed memory leak in FindBlastDBFile.
653 * 3) Protect NlmOpenMFILE against NULL argument.
654 *
655 * Revision 6.361 2002/11/12 20:42:02 camacho
656 * Fixed problem with long deflines in FDLCreateAsnDF
657 *
658 * Revision 6.360 2002/11/06 21:31:08 ucko
659 * Make sure MADV_NORMAL is actually defined before trying to use madvise.
660 *
661 * Revision 6.359 2002/11/04 16:44:08 camacho
662 * Prevent MT problems by loading BlastDefLine ASN module in readdb_new_internal
663 *
664 * Revision 6.358 2002/10/25 16:49:45 camacho
665 * Added Michael Kimelman's FDBAddSequence2
666 *
667 * Revision 6.357 2002/10/17 17:47:46 camacho
668 * Added longest sequence length to fastacmd -I option
669 *
670 * Revision 6.356 2002/10/03 14:13:43 camacho
671 * Added support for gilist field in alias file in multivolume databases
672 *
673 * Revision 6.355 2002/09/30 15:05:07 camacho
674 * Added check for zero-length sequences in FDBAddSequence
675 *
676 * Revision 6.354 2002/09/26 17:54:56 camacho
677 * Fix for using -t option with multiple databases
678 *
679 * Revision 6.353 2002/09/26 02:14:42 camacho
680 * Allow limiting the number of sequences per volume
681 *
682 * Revision 6.352 2002/09/25 20:14:20 camacho
683 * Fix for multivolume databases with non-parseable seqids
684 *
685 * Revision 6.351 2002/09/24 19:08:31 camacho
686 * Removed unnecessary loop around SeqId2OrdinalId in ReadDBBioseqFetchFunc
687 *
688 * Revision 6.350 2002/09/20 14:42:12 camacho
689 * Changed order of precedence when reading looking for index/alias files
690 *
691 * Revision 6.349 2002/08/21 17:51:47 camacho
692 * Added taxonomy id to Fastacmd_PrintTaxonomyInfo
693 *
694 * Revision 6.348 2002/07/30 15:28:49 camacho
695 * Added fastacmd function to parse SeqLocs
696 *
697 * Revision 6.347 2002/07/29 15:45:18 camacho
698 * Made readdb_get_taxnames a LIBCALL function
699 *
700 * Revision 6.346 2002/07/26 15:33:42 raytseli
701 * for async searches use the highest priority for all databases other than "est", use default priority for "est".
702 *
703 * Revision 6.345 2002/07/25 13:45:07 raytseli
704 * added a couple sanity checks.
705 * .
706 *
707 * Revision 6.344 2002/07/24 21:11:39 kans
708 * reverted ncbi URL
709 *
710 * Revision 6.343 2002/07/24 19:57:46 raytseli
711 * removed special provisions for preloading the est database, since all preloads are access driven only.
712 * .
713 *
714 * Revision 6.342 2002/07/24 19:31:47 raytseli
715 * much simpler and more efficient approach to using madvise()
716 * .
717 *
718 * Revision 6.341 2002/07/23 16:50:04 kans
719 * changed www.ncbi.nlm.nih.gov to www.ncbi.nih.gov
720 *
721 * Revision 6.340 2002/07/22 18:34:34 raytseli
722 * run madvise() thread at the highest priority.
723 * .
724 *
725 * Revision 6.339 2002/07/22 13:06:42 raytseli
726 * explicitly allow setting of the advice type for madvise()
727 * .
728 *
729 * Revision 6.338 2002/07/19 19:59:48 raytseli
730 * madvise()-related refinements.
731 *
732 * Revision 6.337 2002/07/19 17:15:59 madden
733 * MemSet for MyFsa, use BioseqRawToFastaExtraEx again
734 *
735 * Revision 6.336 2002/07/19 13:35:58 raytseli
736 * decided that preloading index is not advantageous in some cases, -- removed for now.
737 *
738 * Revision 6.335 2002/07/18 18:49:14 madden
739 * Use BioseqRawToFastaExtra as BioseqRawToFastaExtraEx still has problems
740 *
741 * Revision 6.334 2002/07/18 15:54:26 raytseli
742 * added function to explicitly set madvise() block size, and madvise() sync mode.
743 *
744 * Revision 6.333 2002/07/18 15:01:52 raytseli
745 * correct problem with pointer format "%p" ErrPostEx() handling on linux.
746 * Add extern func to allow explicit madvise() functionality activation.
747 *
748 * Revision 6.332 2002/07/17 19:41:29 raytseli
749 * solaris exotics and other refinements.
750 * .
751 *
752 * Revision 6.331 2002/07/17 17:52:40 raytseli
753 * dealt with linux idiosyncrasies
754 * .
755 *
756 * Revision 6.328 2002/07/17 16:46:27 raytseli
757 * Exclude Windows from madvise()-related stuff, -- Provisional version
758 * to allow Win build.
759 *
760 * Revision 6.327 2002/07/17 15:46:03 raytseli
761 * temporarily disable madvise on linux
762 * .
763 *
764 * Revision 6.326 2002/07/17 15:20:50 raytseli
765 * use async madvise on linux, sync on solaris; other minor changes.
766 * .
767 *
768 * Revision 6.325 2002/07/17 14:36:54 raytseli
769 * incorporated madvise into readdb
770 * .
771 *
772 * Revision 6.324 2002/07/15 17:01:33 camacho
773 * Replaced call to snprintf with StringNCpy
774 *
775 * Revision 6.323 2002/07/14 21:02:08 camacho
776 * Added extra features to fastacmd
777 *
778 * Revision 6.322 2002/07/12 15:19:18 camacho
779 * Updated comment explaining order to search blast databases
780 *
781 * Revision 6.321 2002/07/11 18:37:40 camacho
782 * BLASTDB env. variable has higher precedence over .ncbirc file config value
783 *
784 * Revision 6.320 2002/07/09 16:41:52 camacho
785 * Made taxonomy databases multi-thread safe
786 *
787 * Revision 6.319 2002/07/07 20:43:45 camacho
788 * Pointer initialization in RDBGetTaxNames
789 *
790 * Revision 6.318 2002/06/26 00:45:37 camacho
791 *
792 * Added readdb_get_totals_ex2 to allow recalculation of database length as
793 * well as total number of sequences after the virtual oidlist has been
794 * created.
795 *
796 * Revision 6.317 2002/06/21 21:39:56 camacho
797 * Eliminated check for obsolete flag
798 *
799 * Revision 6.316 2002/06/18 18:06:27 dondosha
800 * Added comment to the readdb_get_sequence_number function
801 *
802 * Revision 6.315 2002/06/04 21:45:39 dondosha
803 * Corrected the readdb_get_sequence_number function in case of multiple-volume databases
804 *
805 * Revision 6.314 2002/06/04 20:22:56 camacho
806 * Fixed taxonomy databases to work w/o mmap
807 *
808 * Revision 6.313 2002/05/29 22:52:31 dondosha
809 * Removed debug printouts accidentally added in last change
810 *
811 * Revision 6.312 2002/05/29 22:50:58 dondosha
812 * Correction in readdb_get_sequence_number
813 *
814 * Revision 6.311 2002/05/15 20:23:46 camacho
815 * Added wgs_{mouse,anthrax} criteria functions
816 *
817 * Revision 6.310 2002/05/07 18:10:42 camacho
818 * Fixed memory leak in FDBAddSequence
819 *
820 * Revision 6.309 2002/05/02 21:58:42 camacho
821 * Removed fastacmd dependency on the common index
822 *
823 * Revision 6.308 2002/05/02 21:52:06 camacho
824 * Support for genmask's new month/subset mask combinations
825 *
826 * Revision 6.307 2002/04/26 16:31:36 camacho
827 * Byte order fix to BlastDBToFasta
828 *
829 * Revision 6.306 2002/04/24 22:26:42 dondosha
830 * First and last oid in alias files are one-offset
831 *
832 * Revision 6.305 2002/04/18 19:35:05 camacho
833 * 1. Added fdfilter/genmask callbacks for wgs subsets
834 * 2. Modified fdfilter/genmask refseq_protein callback function
835 * 3. Fixed problem in readdb_read_alias_file to read multiple oidlists
836 *
837 * Revision 6.304 2002/04/09 20:15:15 camacho
838 * Fixed FDBAddSequence to correctly handle the dump_info files when using volumes
839 *
840 * Revision 6.303 2002/03/26 15:32:50 camacho
841 * Allow space delimited GIs/accessions in fastacmd
842 *
843 * Revision 6.302 2002/03/18 17:58:17 camacho
844 * Added detailed error messages
845 *
846 * Revision 6.301 2002/03/08 16:58:50 camacho
847 * Added accessions to dump info files *.[pn]di
848 *
849 * Revision 6.300 2002/02/15 20:50:24 beloslyu
850 * fix from HP
851 *
852 * Revision 6.299 2002/01/31 21:29:50 camacho
853 * Fixed bug in readdb_get_asn1_defline
854 *
855 * Revision 6.298 2002/01/25 17:06:57 camacho
856 * Added new criteria to create new refseq databases
857 *
858 * Revision 6.297 2002/01/24 18:47:48 camacho
859 * Moved RDBTaxNamesFree from readdb.[ch] to txalign.[ch]
860 *
861 * Revision 6.296 2002/01/11 19:22:26 camacho
862 * 1. Added preferred_gi field to ReadDBFILE structure.
863 * 2. Modified FDReadDeflineAsn to return the preferred gi as the
864 * first element of the list of BlastDefLine structures (if set).
865 *
866 * Revision 6.295 2002/01/10 20:55:37 camacho
867 * Modified OIDBelongsToMaskDB to accept a gi as a parameter
868 *
869 * Revision 6.294 2002/01/09 20:19:14 camacho
870 * Fix to previous commit
871 *
872 * Revision 6.293 2002/01/09 19:47:49 camacho
873 * Added call to SeqEntryLoad in readdb_new_internal
874 *
875 * Revision 6.292 2002/01/09 14:45:30 camacho
876 * Fixed some memory leaks, fix to BlastDBToFasta
877 *
878 * Revision 6.291 2001/12/19 21:14:24 camacho
879 * Guard against a bad pointer in readdb_get_taxonomy_names
880 *
881 * Revision 6.290 2001/12/18 13:01:51 camacho
882 * Added new flag -D to dump blast database in FASTA format
883 *
884 * Revision 6.289 2001/12/13 21:50:25 camacho
885 * Fixed little endian/big endian issue in RDBGetTaxNames
886 *
887 * Revision 6.288 2001/12/10 19:17:13 camacho
888 * Added option to allow fastacmd to use Ctrl-As as defline separators.
889 *
890 * Revision 6.287 2001/12/06 21:20:33 camacho
891 * 1. Enabled fastacmd to dump multiple mask databases.
892 * 2. Made genmask show progress if SHOW_PROGRESS is defined.
893 *
894 * Revision 6.286 2001/12/04 21:21:19 camacho
895 * Eliminated unnecessary condition in readdb_gi2seq
896 *
897 * Revision 6.285 2001/11/28 20:17:33 camacho
898 * Fixed trailing semicolon problem in PrintDbInformationWithRID
899 *
900 * Revision 6.284 2001/11/27 18:08:09 camacho
901 * 1. Corrected readdb_gi2seq to retrieve the correct oid's in the new
902 * database format (FORMATDB_VER) even if the CommonIndex is present.
903 * 2. Updated a few conditionals.
904 *
905 * Revision 6.283 2001/11/19 22:18:03 camacho
906 * Fixed invocation to OIDBelongsToMaskDB to ensure that the right offset
907 * in the database is retrieved.
908 *
909 * Revision 6.282 2001/11/16 17:15:26 madden
910 * Fix for multi-volume searches
911 *
912 * Revision 6.281 2001/11/15 16:11:29 dondosha
913 * Changed genome view link from neptune to public page
914 *
915 * Revision 6.280 2001/11/14 17:29:07 camacho
916 * Fixed PrintDbInformationWithRID to print semicolons only
917 * when searching multiple databases.
918 *
919 * Revision 6.279 2001/11/13 20:32:56 dondosha
920 * Removed a tiny bit of garbage code
921 *
922 * Revision 6.278 2001/11/13 17:01:43 dondosha
923 * Correction of previous change
924 *
925 * Revision 6.277 2001/11/09 23:11:45 dondosha
926 * Correction for links from completed genomes databases to genome view
927 *
928 * Revision 6.276 2001/11/09 19:05:35 dondosha
929 * ReadDBFreeSharedInfo and ReadDBOpenMHdrAndSeqFiles made static in readdb.c
930 *
931 * Revision 6.275 2001/11/09 19:04:21 dondosha
932 * Check shared_info->nthreads for 0 outside of mutex to avoid huge number of mutex locks; check again once inside mutex
933 *
934 * Revision 6.274 2001/11/05 23:00:40 dondosha
935 * Put back changes from revision 6.270 that were accidentally removed
936 *
937 * Revision 6.273 2001/11/02 20:18:09 camacho
938 * Fixed a small memory leak in OIDListFree
939 *
940 * Revision 6.272 2001/11/02 19:56:56 camacho
941 * Corrected a source for memory leaks in OIDBelongsToMaskDB
942 *
943 * Revision 6.271 2001/11/02 19:45:16 camacho
944 * 1. Modified FDReadDeflineAsn to return the correct
945 * BlastDefLine structure when dealing with subset
946 * (mask) databases.
947 * 2. Added readdb_encode_subset_asn1_defline to
948 * add the BlastDefLine structure to the Bioseq
949 * as a UserObject (when dealing with subset db's).
950 * 3. Updated readdb_get_defline_ex, readdb_get_descriptor,
951 * and OIDBelongsToMaskDB to use the changes introduced
952 * above.
953 *
954 * Revision 6.270 2001/11/02 18:33:00 dondosha
955 * 1. Added function readdb_get_sequence_number (by position in database)
956 * 2. Added function PrintDbInformationWithRID for Microbial genomes page
957 *
958 * Revision 6.269 2001/10/19 13:46:50 camacho
959 * Added membership_bit field to ReadDBFILE structure for FORMATDB_VER subset
960 * databases.
961 * Added OIDBelongsToMaskDB and modified readdb_gi2seq and readdb_acc2fasta
962 * to return the proper sequence when dealing with a subset database in the FORMATDB_VER format.
963 * Updated readdb_read_alias_file to read the new MEMB_BIT field.
964 * Updated readdb_get_defline_ex and FDBuildOldStyleDefline to build the proper defline when dealing with a subset database in the FORMATDB_VER format.
965 *
966 * Revision 6.268 2001/10/01 18:44:22 camacho
967 * Added BlastDBToFasta function
968 * Added readdb_get_header_ex function
969 *
970 * Revision 6.267 2001/10/01 18:37:31 camacho
971 * readdb.h
972 *
973 * Revision 6.266 2001/09/28 14:28:37 madden
974 * Fixes for ambiguity problem for sequences longer than 16 million bps.
975 *
976 * Revision 6.265 2001/09/26 16:36:52 dondosha
977 * Previous fix still wrong - corrected
978 *
979 * Revision 6.264 2001/09/20 18:30:04 dondosha
980 * Correction to change in revision 6.262
981 *
982 * Revision 6.263 2001/08/29 21:12:59 dondosha
983 * Do not check ISAM indices for non-gi seqid if gifile is provided in rdfp
984 *
985 * Revision 6.262 2001/08/24 22:30:32 dondosha
986 * Correction for alias databases with dblists containing databases with and without OID lists
987 *
988 * Revision 6.261 2001/08/16 13:52:28 madden
989 * Reinit gi to zero for every try
990 *
991 * Revision 6.260 2001/08/08 13:13:57 madden
992 * Add third-party annotation IDs
993 *
994 * Revision 6.259 2001/08/02 20:13:28 madden
995 * Close sequence and header files for all non-used rdfps
996 *
997 * Revision 6.258 2001/08/02 17:55:00 madden
998 * Fix for length and non-mmapped file
999 *
1000 * Revision 6.257 2001/07/26 12:53:12 madden
1001 * Fix for non memory-mapped mode
1002 *
1003 * Revision 6.256 2001/07/16 20:25:07 madden
1004 * Do not init ISAM string indices until needed
1005 *
1006 * Revision 6.255 2001/07/12 19:27:30 madden
1007 * Increase volume by one in call to FD_CreateAliasFileEx
1008 *
1009 * Revision 6.254 2001/07/09 14:17:24 madden
1010 * Fix PC-lint complaints from R. Williams
1011 *
1012 * Revision 6.253 2001/07/06 13:59:02 madden
1013 * Fixed compiler and lint warnings
1014 *
1015 * Revision 6.252 2001/06/25 18:30:24 madden
1016 * Add define for NLM_GENERATED_CODE_PROTO to get prototypes in fdlobj.h
1017 *
1018 * Revision 6.251 2001/06/22 19:13:59 dondosha
1019 * Fixed a thread race condition
1020 *
1021 * Revision 6.250 2001/06/21 19:43:12 shavirin
1022 * Moved to txalign.h the definitions related to Taxonomy names.
1023 *
1024 * Revision 6.249 2001/06/21 18:27:27 shavirin
1025 * Moved into files txalign.[c,h] functions returning taxonomy names
1026 * from Bioseq created from Blast database.
1027 *
1028 * Revision 6.248 2001/06/20 19:46:04 madden
1029 * Replace Int2 by Int4 for readdb_get_bioseq_ex
1030 *
1031 * Revision 6.247 2001/06/15 20:57:06 shavirin
1032 * Fixed problem when bsp->descr == NULL in the function readdb_get_bioseq_ex().
1033 *
1034 * Revision 6.246 2001/06/14 14:17:52 madden
1035 * Add FD_MakeAliasFile
1036 *
1037 * Revision 6.245 2001/06/12 18:50:56 shavirin
1038 * Fixed function FDReadDeflineAsn to get correct rdfp structure.
1039 *
1040 * Revision 6.244 2001/06/12 17:33:26 egorov
1041 * Print an error message if DI file could not be found
1042 *
1043 * Revision 6.243 2001/06/08 20:30:24 madden
1044 * Fix problem with not searching all databases in a list for identifier lookups
1045 *
1046 * Revision 6.242 2001/06/08 12:49:31 madden
1047 * Use gi if possible in readdb_seqid2fasta, make readdb_find_best_id static
1048 *
1049 * Revision 6.241 2001/06/04 16:20:20 shavirin
1050 * Fixed problem with retrieve of PDB accessions using fastacmd program.
1051 *
1052 * Revision 6.240 2001/05/29 16:03:41 shavirin
1053 * Adjusted return codes of the function FDBAddSequence().
1054 *
1055 * Revision 6.239 2001/05/21 15:27:18 dondosha
1056 * Change stat call to FileLength
1057 *
1058 * Revision 6.238 2001/05/17 20:21:46 dondosha
1059 * Do not add .00 extension when only one volume created
1060 *
1061 * Revision 6.237 2001/05/14 17:39:07 shavirin
1062 * Changes related to possibility to manipulate with BLAST databases with
1063 * ASN.1 structured deflines.
1064 *
1065 * Revision 6.236 2001/05/11 19:59:40 madden
1066 * Add gi_file_bin to FDOptions, oidlist and gifile to FD_CreateAliasFileEx
1067 *
1068 * Revision 6.235 2001/05/11 18:18:12 madden
1069 * Add error message if db_file is NULL
1070 *
1071 * Revision 6.234 2001/05/10 17:19:53 madden
1072 * Add number_seqs arg to FD_CreateAliasFileEx
1073 *
1074 * Revision 6.233 2001/05/08 21:58:27 shavirin
1075 * Added possibility to generate tax_id for every definition in Blast FASTA
1076 * definition set in ASN.1 structured definition lines.
1077 *
1078 * Revision 6.232 2001/05/02 16:22:05 dondosha
1079 * Add NSEQ and LENGTH to alias files in case of multiple inputs to formatdb
1080 *
1081 * Revision 6.231 2001/04/30 19:29:47 madden
1082 * Remove intermediate buffer in readdb_get_bioseq_ex
1083 *
1084 * Revision 6.230 2001/04/27 15:26:37 madden
1085 * Use RebuildDNA_4na rather than BSRebuildDNA_4na_core
1086 *
1087 * Revision 6.229 2001/04/27 15:18:29 madden
1088 * Use BSRebuildDNA_4na_core, remove unnecessary memset
1089 *
1090 * Revision 6.228 2001/04/23 17:08:52 madden
1091 * Do not delete gifile memory if readdb is only attached
1092 *
1093 * Revision 6.227 2001/04/19 14:41:08 madden
1094 * Fix for subset database deflines
1095 *
1096 * Revision 6.226 2001/04/16 20:42:59 madden
1097 * Fix readdb_adjust_local_id to only work on BL_ORD_ID
1098 *
1099 * Revision 6.225 2001/04/13 22:17:06 dondosha
1100 * Fixed formatdb bug if one of multiple FASTA file inputs is empty
1101 *
1102 * Revision 6.224 2001/04/11 21:00:52 dondosha
1103 * Made functions FD_CreateAliasFile(Ex) public
1104 *
1105 * Revision 6.223 2001/04/11 20:45:35 dondosha
1106 * Moved appending of .00 for the first volume to FormatDBInit function
1107 *
1108 * Revision 6.222 2001/04/11 20:14:40 dondosha
1109 * Processing of volumes moved to lower level
1110 *
1111 * Revision 6.221 2001/03/29 20:15:40 madden
1112 * Int4 to Uint4 where needed
1113 *
1114 * Revision 6.220 2001/03/27 21:16:02 dondosha
1115 * Allow FIRST_OID and LAST_OID parameters in alias database file
1116 *
1117 * Revision 6.219 2001/03/26 14:42:01 madden
1118 * Fix number warnings and two bugs found by PC compiler
1119 *
1120 * Revision 6.218 2001/03/23 17:23:54 madden
1121 * Move FDGetDeflineAsnFromBioseq to txalign.[ch]
1122 *
1123 * Revision 6.217 2001/03/21 22:14:21 shavirin
1124 * Fixed problem with using ASN.1 structured deflines in non-parse seq-id
1125 * database.
1126 *
1127 * Revision 6.216 2001/03/13 21:49:11 madden
1128 * Remove extra &
1129 *
1130 * Revision 6.215 2001/03/08 14:08:06 madden
1131 * Use ByteStorePtr PNTR rather than ByteStorePtr for User-field
1132 *
1133 * Revision 6.214 2001/02/21 14:53:40 madden
1134 * Protection against -1 gi
1135 *
1136 * Revision 6.213 2001/02/12 17:42:50 madden
1137 * Replace another OLD_INT4_DB_SIZE_TO_BE_REMOVED with check for FORMATDB_VER_TEXT
1138 *
1139 * Revision 6.212 2001/02/06 18:47:48 madden
1140 * replace OLD_UIN4_DB_LEN_TO_BE_REMOVED with version check
1141 *
1142 * Revision 6.211 2001/02/05 18:52:00 shavirin
1143 * Blast database size was changed from Uint4 to Uint8 - this corrected
1144 * invalidly printed database size for large databases.
1145 *
1146 * Revision 6.210 2001/01/06 21:21:27 kans
1147 * Mac compiler complained about return NULL for Int2 return value
1148 *
1149 * Revision 6.209 2001/01/05 16:37:53 egorov
1150 * 1. Initialize OffsetAllocated=1024
1151 * 2. Add more diagnostic messages
1152 *
1153 * Revision 6.208 2001/01/02 22:28:14 dondosha
1154 * Check for partial duplication of databases when a whole database and its part with oidlist are provided for search
1155 *
1156 * Revision 6.207 2000/12/15 21:47:35 shavirin
1157 * Added set of functions to encode taxonomy names information into
1158 * Bioseq and retrieval of specific information from it.
1159 *
1160 * Revision 6.206 2000/12/12 23:14:41 shavirin
1161 * Added functions to initialize taxonomy names database and search functions
1162 * to get all taxonomy names given tax_id using this database.
1163 *
1164 * Revision 6.205 2000/12/08 22:25:00 shavirin
1165 * Added code for creation Taxonomy lookup database using formatdb API.
1166 *
1167 * Revision 6.204 2000/11/22 20:51:12 shavirin
1168 * Added new parameter tax_id into function FDBAddBioseq() for creation
1169 * ASN.1 structured deflines in BLAST databases.
1170 *
1171 * Revision 6.203 2000/11/22 19:54:48 shavirin
1172 * Added creation of the special user object with ASN.1 structured deflines
1173 * in the function readdb_get_bioseq()
1174 *
1175 * Revision 6.202 2000/11/13 21:33:59 madden
1176 * Add warning for zero-length sequence
1177 *
1178 * Revision 6.201 2000/11/07 20:56:14 egorov
1179 * Few improvements by Michael Kimelman
1180 *
1181 * Revision 6.200 2000/11/03 19:49:47 madden
1182 * Add final return value to FastaToBlastDb to silence compiler
1183 *
1184 * Revision 6.199 2000/11/03 15:46:04 madden
1185 * Save gifile from alias file for nucleotides
1186 *
1187 * Revision 6.198 2000/10/30 21:02:07 madden
1188 * Fix memory leak and FUM for formatdb
1189 *
1190 * Revision 6.197 2000/10/26 18:32:55 dondosha
1191 * Fill the gifile string from alias structure when creating ReadDBFILE
1192 *
1193 * Revision 6.196 2000/10/24 19:11:45 madden
1194 * Add function CheckForRecursion that checks all dbs in string, issues warning if recursion found
1195 *
1196 * Revision 6.195 2000/10/20 19:27:09 madden
1197 * Fix UMR (bdfp_head) in readdb_get_descriptor
1198 *
1199 * Revision 6.194 2000/10/13 17:31:51 shavirin
1200 * Adjusted calls to readdb_get_header for ASN.1 structured deflines.
1201 *
1202 * Revision 6.193 2000/10/13 16:05:43 shavirin
1203 * Fixed minor bug with reporting database name.
1204 *
1205 * Revision 6.192 2000/10/03 16:12:37 madden
1206 * Replace atol with sscanf for large numbers
1207 *
1208 * Revision 6.191 2000/09/29 16:38:28 shavirin
1209 * Added new function FDB_FreeCLOptions(FDB_optionsPtr options).
1210 *
1211 * Revision 6.190 2000/09/27 14:06:51 shavirin
1212 * Fixed minor bug in FormatDBInit() function.
1213 *
1214 * Revision 6.189 2000/09/25 20:39:32 dondosha
1215 * Call ReadDBCloseMHdrAndSeqFiles from readdb_destruct only when contents allocated
1216 *
1217 * Revision 6.188 2000/09/19 20:12:59 shavirin
1218 * Empty log message
1219 *
1220 * Revision 6.187 2000/09/19 20:10:27 shavirin
1221 * Attempt to fix NT bug related to improper defines generated by asntool.
1222 *
1223 * Revision 6.186 2000/09/18 01:15:50 shavirin
1224 * Changed definition BlastDefline -> BlastDefLine do not conflict with
1225 * Blast network definitions.
1226 *
1227 * Revision 6.185 2000/09/15 20:43:22 shavirin
1228 * Empty log message.
1229 *
1230 * Revision 6.184 2000/09/15 20:40:03 shavirin
1231 * Many changes to allow dump and retrieval of ASN.1 structured deflines.
1232 *
1233 * Revision 6.183 2000/09/07 20:49:57 shavirin
1234 * Added parameters to support ASN.1 defline dump for blast db. FORMATDB_VER 3->4
1235 * Added parameter FORMATDB_VER_TEXT for backward compatibility.
1236 *
1237 * Revision 6.182 2000/09/05 17:24:59 shavirin
1238 * Fixed problem with initialization of sparse_idx information.
1239 *
1240 * Revision 6.181 2000/09/01 18:28:12 dondosha
1241 * Call ReadDBFreeSharedInfo and ReadDBCloseMHdrAndSeqFiles from readdb_destruct
1242 *
1243 * Revision 6.180 2000/08/31 15:56:38 dondosha
1244 * Change allowing to pass rdfp from higher level to search
1245 *
1246 * Revision 6.179 2000/08/30 20:29:00 shavirin
1247 * Fixed GCC compiler warnings.
1248 *
1249 * Revision 6.178 2000/08/07 20:43:04 madden
1250 * Proper casting of int to long for printf
1251 *
1252 * Revision 6.177 2000/07/19 14:01:47 madden
1253 * Call CommonIndexDestruct if opening of CommonIndex does not succeed
1254 *
1255 * Revision 6.176 2000/07/18 19:29:28 shavirin
1256 * Added new parameter test_non_unique to suppress check for non-unique
1257 * strings ids in the database - default - TRUE.
1258 *
1259 * Revision 6.175 2000/06/30 18:20:30 madden
1260 * Elaborate on SORTFiles error message
1261 *
1262 * Revision 6.174 2000/06/30 16:40:11 madden
1263 * Changed error message if unable to initialize readdb
1264 *
1265 * Revision 6.173 2000/06/28 16:55:49 madden
1266 * Add function Fastacmd_Search_ex, gi_target to ReadDBFILEPtr
1267 *
1268 * Revision 6.172 2000/06/22 18:59:33 egorov
1269 * Allow absolute paths to databases in alias files.
1270 * The change is provided by Maxim Shemanarev (Informax Inc).
1271 *
1272 * Revision 6.171 2000/06/19 20:06:42 madden
1273 * Add ready Boolean to readdb_get_sequence_ex, for nucl. sequence the data is then in blastna format with sentinel bytes
1274 *
1275 * Revision 6.170 2000/06/19 16:53:21 madden
1276 * Remove unneeded memcpy
1277 *
1278 * Revision 6.169 2000/06/16 16:43:33 madden
1279 * Replace MemNew with Nlm_Malloc
1280 *
1281 * Revision 6.168 2000/06/08 19:02:26 madden
1282 * Return file-name if no title found
1283 *
1284 * Revision 6.167 2000/05/25 20:31:24 madden
1285 * Do not change aliasfilebit unless it is zero
1286 *
1287 * Revision 6.166 2000/05/23 21:22:37 dondosha
1288 * Do not open sequence files in shared_info when flag is set to not do it - correction to previous change
1289 *
1290 * Revision 6.165 2000/05/22 18:46:43 dondosha
1291 * Merged all Boolean members in ReadDBFILE structure into a single Int4
1292 *
1293 * Revision 6.164 2000/05/09 15:54:19 shavirin
1294 * Added function ReadDBBioseqSetDbGeneticCode().
1295 *
1296 * Revision 6.163 2000/05/03 17:41:21 madden
1297 * Fix for readdb_get_descriptor problem when searching subset database
1298 *
1299 * Revision 6.162 2000/05/03 16:19:01 dondosha
1300 * Added function FastaToBlastDB
1301 *
1302 * Revision 6.161 2000/05/03 12:49:45 madden
1303 * Do not add > if not first definition
1304 *
1305 * Revision 6.160 2000/05/01 20:01:11 madden
1306 * Protection against too large gis
1307 *
1308 * Revision 6.159 2000/04/19 17:59:23 madden
1309 * Move setting of start and stop, adjust of indices to end, do every time in case of recursive calls or multiple databases
1310 *
1311 * Revision 6.158 2000/04/14 21:16:36 madden
1312 * Fix for non-NULL aliasfilename
1313 *
1314 * Revision 6.157 2000/04/11 19:56:48 madden
1315 * Set aliasfilename even if oidlist does not exist
1316 *
1317 * Revision 6.156 2000/04/10 18:01:46 dondosha
1318 * Fixed FindBlastDBFile when file exists in current directory
1319 *
1320 * Revision 6.155 2000/04/05 19:25:09 madden
1321 * Check for NULL searchstr in Fastacmd_Search, allow line break to be a valid delimiter for a file that is read in
1322 *
1323 * Revision 6.154 2000/04/03 21:17:57 dondosha
1324 * readdb_MakeGiFileBinary will sort gis in increasing order
1325 *
1326 * Revision 6.153 2000/04/03 17:34:27 shavirin
1327 * Fixed case when indexed and regular databases are mixed in multiple
1328 * database set.
1329 *
1330 * Revision 6.152 2000/03/28 04:38:20 egorov
1331 * Bug seen on Malaria page fixed
1332 *
1333 * Revision 6.151 2000/03/24 14:36:43 egorov
1334 * Allow NULL alias_dbid on input of GI2OID
1335 *
1336 * Revision 6.150 2000/03/24 14:34:33 egorov
1337 * Add support for month.sts, month.pataa, month.patnt month subsets
1338 *
1339 * Revision 6.149 2000/03/20 22:03:34 egorov
1340 * bug with multiple alias databases mask is fixed
1341 *
1342 * Revision 6.148 2000/03/20 17:03:19 dondosha
1343 * Return NULL from readdb_get_link and readdb_get_bioseq if cannot mem-map files
1344 *
1345 * Revision 6.147 2000/03/20 14:36:54 egorov
1346 * Add protection from the reading out of the ISAM index file boundary when update CommonIndex.
1347 *
1348 * Revision 6.146 2000/03/16 19:47:18 egorov
1349 * Db mask should be Uint4, not Int2. Also previous change about FreeOIDList is rolled back.
1350 *
1351 * Revision 6.145 2000/03/16 18:09:50 dondosha
1352 * Fixes memory leak in OIDListFree; corrects ReadDBCloseMHdrAndSeqFiles
1353 *
1354 * Revision 6.144 2000/03/15 21:34:30 egorov
1355 * 1. Fix bug with using alias databases.
1356 * 2. Initialize new_defline variable.
1357 *
1358 * Revision 6.143 2000/03/13 18:36:37 madden
1359 * Added insert_ctrlA Boolean to readdb_get_bioseq_ex
1360 *
1361 * Revision 6.142 2000/03/13 13:53:50 madden
1362 * Check for non-NULL rdfp before dereference
1363 *
1364 * Revision 6.141 2000/03/10 19:16:30 shavirin
1365 * Added multi-thread support for the function ReadDBBioseqFetchEnable().
1366 *
1367 * Revision 6.140 2000/03/10 18:51:33 madden
1368 * Add prototype for readdb_get_filebits
1369 *
1370 * Revision 6.139 2000/03/08 22:03:32 madden
1371 * added readdb_get_filebits
1372 *
1373 * Revision 6.138 2000/03/08 20:52:37 madden
1374 * readdb_get_bioseq_ex only returns gis for subset database
1375 *
1376 * Revision 6.137 2000/02/28 21:50:13 egorov
1377 * All month subsets use same criteria.
1378 *
1379 * Revision 6.136 2000/02/24 19:02:37 egorov
1380 * Add support for PDB subset of nr
1381 *
1382 * Revision 6.135 2000/02/16 18:39:59 madden
1383 * Fix check for nucl. alias file
1384 *
1385 * Revision 6.134 2000/02/11 19:59:29 shavirin
1386 * Increased nthreads when attaching to the rdfp structure.
1387 *
1388 * Revision 6.133 2000/02/09 19:35:51 madden
1389 * Added readdb_MakeGiFileBinary
1390 *
1391 * Revision 6.132 2000/02/07 21:15:15 madden
1392 * Issue warning before stripping zero gi
1393 *
1394 * Revision 6.131 2000/02/07 20:56:08 madden
1395 * Strip off gi|0 identifiers for formatdb
1396 *
1397 * Revision 6.130 2000/01/26 15:38:34 madden
1398 * Fix for fastacmd and alias files
1399 *
1400 * Revision 6.129 2000/01/26 15:19:59 madden
1401 * Return aliasfilename if present in readdb_get_filename
1402 *
1403 * Revision 6.128 2000/01/20 20:26:05 egorov
1404 * Use "est_" prefix for subsets 'human', 'mouse', and 'others'
1405 *
1406 * Revision 6.127 2000/01/20 18:57:24 madden
1407 * Check whether rdfp is NULL before dereference
1408 *
1409 * Revision 6.126 2000/01/12 21:51:50 madden
1410 * Check for oidlist before setting aliasfilename
1411 *
1412 * Revision 6.125 2000/01/12 21:46:35 dondosha
1413 * Fixed memory leak (rdfp->aliasfilename)
1414 *
1415 * Revision 6.124 2000/01/12 21:03:52 egorov
1416 * 1. Introduce Fastacmd API function - Fastacmd_Search
1417 * 2. Rearrange order of functions to have Fastacmd, ID1, and CommonIndex stuff separate.
1418 *
1419 * Revision 6.123 2000/01/12 20:28:31 dondosha
1420 * Fixed readdb_new_ex2 behavior with multiple volume database
1421 *
1422 * Revision 6.122 2000/01/12 18:06:03 egorov
1423 * Fix memory leak. Remove debug stuff.
1424 *
1425 * Revision 6.121 2000/01/12 17:39:31 madden
1426 * Fix readdb_parse_db_names so done is TRUE on last db
1427 *
1428 * Revision 6.120 2000/01/11 15:32:46 dondosha
1429 * Fixed memory leaks in opening shared header and sequence file memory maps
1430 *
1431 * Revision 6.119 2000/01/07 16:00:25 madden
1432 * Alias db length is Int8 instead of Uint4
1433 *
1434 * Revision 6.118 1999/12/31 14:23:20 egorov
1435 * Add support for using mixture of real and mask databases with gi-list files:
1436 * 1. Change logic of creating rdfp list.
1437 * 2. BlastGetDbChunk gets real databases first, then masks.
1438 * 3. Proper calculation of database sizes using alias files.
1439 * 4. Change to CommonIndex to support using of mask databases.
1440 * 5. Use correct gis in formatted output (BlastGetAllowedGis()).
1441 * 6. Other small changes
1442 *
1443 * Revision 6.117 1999/12/29 13:46:42 madden
1444 * Fix for moving virtual rdfp to end, remove bad fix for infinite recursion
1445 *
1446 * Revision 6.116 1999/12/23 18:15:37 madden
1447 * Move mask databases to end of all databases
1448 *
1449 * Revision 6.115 1999/12/22 21:54:41 dondosha
1450 * Open header and sequence files consecutively as needed, close them when all threads have finished working with the database
1451 *
1452 * Revision 6.114 1999/12/21 20:02:16 egorov
1453 * Set proper 'start' and 'stop' values for mask's rdfp.
1454 * Add 'start' parameter into readdb_gi2seq. This is return
1455 * value which is set to rdfp->start where given gi was found.
1456 *
1457 * Revision 6.113 1999/12/17 21:33:01 egorov
1458 * Add support for the 'month' subset.
1459 *
1460 * Revision 6.112 1999/12/17 20:47:05 egorov
1461 * Fix 'gcc -Wall' warnings
1462 *
1463 * Revision 6.111 1999/12/15 21:57:58 egorov
1464 * Initialize extra_bytes variable
1465 *
1466 * Revision 6.110 1999/12/15 17:40:07 egorov
1467 * 1. Fix bug with path to CommonIndexFile.
1468 * 2. Add ScanDIFile() function for scanning DI index file and perform
1469 * callback-specified action for each record which meets database
1470 * subset criteria.
1471 * 3. Change UpdateCommonIndexFile() function to use ScanDIFile.
1472 * 4. Criteria for est_others, est_human, est_mouse, swissprot added.
1473 *
1474 * Revision 6.109 1999/12/14 19:27:09 dondosha
1475 * Test against infinite recursion in IndexFileExists
1476 *
1477 * Revision 6.108 1999/11/30 17:07:15 egorov
1478 * Fix problem with parsing database names when file_path is not NULL.
1479 * Add prefix path prefix to OIDLIST and GILIST values, if any.
1480 *
1481 * Revision 6.107 1999/11/29 14:45:47 egorov
1482 * Bug fixed.
1483 *
1484 * Revision 6.106 1999/11/24 21:43:34 madden
1485 * Added Nlm_SwapUint4 call to make database masks work with both big and small endian systems
1486 *
1487 * Revision 6.105 1999/11/24 18:42:25 egorov
1488 * It was reported by Andrei Shkeda and observed by us in neighboring software
1489 * that using ReadDbFile structure was not MT-safe, and, as a result,
1490 * it was impossible to format seqalign from different threads.
1491 * Now it is fixed. Mutexes shared the same ISAM structures, so I had to add an additional mutex.
1492 *
1493 * Revision 6.104 1999/11/24 18:01:38 egorov
1494 * Bug fixed: it truncated full database name to just file name if BLASTDB was specified.
1495 * So it was impossible to have BLASTDB=/blast/db/blast and filename = "subdir/database".
1496 * Now it works and makes it possible to use subdirectories for organism-specific databases.
1497 *
1498 * Revision 6.103 1999/11/23 22:02:26 madden
1499 * Added readdb_get_totals_ex that may use alias file values
1500 *
1501 * Revision 6.102 1999/11/23 21:30:10 madden
1502 * Deallocate OID list
1503 *
1504 * Revision 6.101 1999/11/22 16:15:36 egorov
1505 * Remove correct return code in readdb_get_header function
1506 *
1507 * Revision 6.100 1999/11/15 17:42:48 egorov
1508 * Fix bug when CommonIndex finds wrong Gi if database is not the first
1509 * or the second in the CommonIndex list
1510 *
1511 * Revision 6.99 1999/11/12 14:15:54 madden
1512 * Allow NlmOpenMFILE to simply open a file if it cannot be memory-mapped, allow other initialization states in readdb_new_ex2
1513 *
1514 * Revision 6.98 1999/10/07 20:40:48 madden
1515 * Remove calls and function readdb_get_index
1516 *
1517 * Revision 6.97 1999/10/07 13:40:37 madden
1518 * remove extra call to Nlm_SwapUint4
1519 *
1520 * Revision 6.96 1999/10/06 21:08:36 shavirin
1521 * Cleared last bits in last byte written in function FDBAddSequence()
1522 * These bits may be dirty in case of ASN.1 coming directly from ID.
1523 *
1524 * Revision 6.95 1999/10/01 18:25:07 shavirin
1525 * Fixed bug in the function FDBAddSequence
1526 *
1527 * Revision 6.94 1999/09/30 20:48:24 madden
1528 * Change static buffer to dynamically allocated
1529 *
1530 * Revision 6.93 1999/09/29 17:20:34 shavirin
1531 * Fixed minor memory leak.
1532 *
1533 * Revision 6.92 1999/09/29 13:30:51 shavirin
1534 * Changed sequence of allocating/deleting of oidlist structure.
1535 *
1536 * Revision 6.91 1999/09/28 20:45:07 shavirin
1537 * Passed oidlist info when cloning rdfp in readdb_attach() function.
1538 *
1539 * Revision 6.90 1999/09/28 13:41:57 shavirin
1540 * Freed memory of OID list in readdb_destruct().
1541 *
1542 * Revision 6.89 1999/09/24 16:30:25 egorov
1543 * Remove Mac incompatible stuff. Add two more functions for CommonIndex API.
1544 *
1545 * Revision 6.88 1999/09/23 18:22:30 egorov
1546 * Do not keep private copy of index arrays (sequence_index, header_index, ambchar_index),
1547 * but just use as it is in memory mapped file. Big and small endian stuff is not forgotten.
1548 *
1549 * Revision 6.87 1999/09/23 15:17:24 egorov
1550 * Add CommonIndex API function - UpdateCommonIndexFile
1551 *
1552 * Revision 6.86 1999/09/23 15:10:52 egorov
1553 * Add new fields into OIDList structure.
1554 * Add new keywords into alias file: NSEQ and LENGTH.
1555 * Use Nlm_Malloc instead of MemNew where MemSet is not needed.
1556 * Create ReadOIDList function.
1557 *
1558 * Revision 6.85 1999/09/23 15:03:43 egorov
1559 * Close alias file; change name of index file; add comments
1560 *
1561 * Revision 6.84 1999/09/22 21:58:07 egorov
1562 * fix compilation bug
1563 *
1564 * Revision 6.83 1999/09/13 16:18:37 shavirin
1565 * Added function readdb_get_bioseq_ex, which has possibility
1566 * to bypass ObjMgr registration.
1567 *
1568 * Revision 6.82 1999/09/10 16:30:17 shavirin
1569 * Fixed problems with formatting proteins by formatdb
1570 *
1571 * Revision 6.81 1999/09/09 18:25:04 shavirin
1572 * Added functions to parse ASN.1 with formatdb
1573 *
1574 * Revision 6.80 1999/09/02 18:02:33 madden
1575 * No spaces after date
1576 *
1577 * Revision 6.79 1999/09/02 12:56:52 egorov
1578 * Change format of the BLAST index file to set proper alignment
1579 * for memory map.
1580 *
1581 * Revision 6.78 1999/08/30 18:21:29 shavirin
1582 * Temporary return of full dumping set in SeqidE2Index() function.
1583 *
1584 * Revision 6.77 1999/08/26 20:55:55 shavirin
1585 * Changed way to look for seqids.
1586 *
1587 * Revision 6.76 1999/08/26 14:12:50 shavirin
1588 * Reduced amount of information dumped for string indexes in regular case.
1589 *
1590 * Revision 6.75 1999/08/25 20:17:38 shavirin
1591 * Added option to create and retrieve from sparse indexes.
1592 *
1593 * Revision 6.74 1999/08/04 18:26:41 madden
1594 * Change databases in alias file for file path
1595 *
1596 * Revision 6.72 1999/08/03 19:21:44 shavirin
1597 * Changed to dynamically allocated memory in function readdb_read_alias_file()
1598 *
1599 * Revision 6.71 1999/08/02 13:36:01 shavirin
1600 * Rolled back last changes.
1601 *
1602 * Revision 6.69 1999/06/29 19:26:59 madden
1603 * Took SeqIdWrite out of loop for efficiency
1604 *
1605 * Revision 6.68 1999/06/10 20:53:22 egorov
1606 * Few changes to make it possible to perform multiple searches against different db's.
1607 *
1608 * Revision 6.67 1999/05/28 14:30:38 yaschenk
1609 * rolling back fixes of 6.63 by shavirin, since they lead to coredump
1610 *
1611 * Revision 6.66 1999/05/27 21:47:12 yaschenk
1612 * fix to the previous change
1613 *
1614 * Revision 6.65 1999/05/27 21:41:44 yaschenk
1615 * dump_info file should be created for nucleotides
1616 *
1617 * Revision 6.64 1999/05/27 15:51:29 shavirin
1618 * Added function readdb_get_defline ()
1619 *
1620 * Revision 6.63 1999/05/27 14:40:17 shavirin
1621 * Fixed some memory leaks.
1622 *
1623 * Revision 6.62 1999/05/21 17:36:52 madden
1624 * Minor efficiencies
1625 *
1626 * Revision 6.61 1999/05/18 20:35:30 madden
1627 * Changes to read an alias file for multiple db searches and ordinal ID lists
1628 *
1629 * Revision 6.60 1999/05/17 15:28:30 egorov
1630 * First check that gi belongs to correct database and only then do all CommonIndex stuff
1631 *
1632 * Revision 6.59 1999/05/13 19:31:13 shavirin
1633 * More changes toward dump from ID.
1634 *
1635 * Revision 6.58 1999/05/12 15:48:33 shavirin
1636 * Many changes to fit new dump from ID.
1637 *
1638 * Revision 6.57 1999/05/10 13:47:44 madden
1639 * NULL database not a fatal error
1640 *
1641 * Revision 6.56 1999/05/04 13:12:19 egorov
1642 * Declare parse* functions as static and remove unused argument
1643 *
1644 * Revision 6.55 1999/05/03 21:44:33 chappey
1645 * getline is now static function
1646 *
1647 * Revision 6.54 1999/04/27 17:28:17 shavirin
1648 * Fixed few problems in the function FDBAddSequence().
1649 *
1650 * Revision 6.53 1999/04/26 14:55:23 shavirin
1651 * Checked variable for not NULL.
1652 *
1653 * Revision 6.52 1999/04/26 14:36:04 shavirin
1654 * Added ability to dump statistics.
1655 *
1656 * Revision 6.50 1999/04/21 22:59:41 kans
1657 * added includes
1658 *
1659 * Revision 6.49 1999/04/21 21:43:28 shavirin
1660 * Added set of functions, which used in "formatdb".
1661 *
1662 * Revision 6.48 1999/04/14 14:53:49 madden
1663 * Correction for databases over 2 Gig
1664 *
1665 * Revision 6.47 1999/03/23 14:38:28 egorov
1666 * Destruct CommonIndex structures only by thread it belongs to.
1667 *
1668 * Revision 6.46 1999/03/19 19:29:47 egorov
1669 * Bug fixed. Initialize cih.
1670 *
1671 * Revision 6.45 1999/03/18 16:55:22 egorov
1672 * Previous fix was incomplete.
1673 *
1674 * Revision 6.44 1999/03/18 16:36:16 egorov
1675 * Check if rdfp is not NULL before dereferencing it.
1676 *
1677 * Revision 6.43 1999/03/17 16:57:21 egorov
1678 * Previously each element in the rdfp list had its own CommonIndexHeadPtr
1679 * initialized with MemMap. But when we search against many databases,
1680 * as in the case of unfinished genomes, we hit the limit for doing MemMap
1681 * on SGI machines. So now we initialize 'rdfp->cih' only for the first
1682 * element in the list and reuse it for the others.
1683 * Also the change contains proper freeing memory after the above change.
1684 *
1685 * Revision 6.42 1999/03/12 23:02:49 madden
1686 * initialize memory in buffer_2na first
1687 *
1688 * Revision 6.41 1999/03/12 18:36:16 madden
1689 * formatting fix
1690 *
1691 * Revision 6.40 1999/02/22 21:49:08 egorov
1692 * Optimize GIs2OIDs using already initialized ISAM indices from rdfp. Use SwapUint4 function to use common index file
1693 * on Solaris/Intel machines
1694 *
1695 * Revision 6.39 1999/02/18 21:19:12 madden
1696 * ignore GIs not in common index
1697 *
1698 * Revision 6.38 1999/02/17 13:23:40 madden
1699 * use MapNa2ByteToNa4String
1700 *
1701 * Revision 6.37 1999/01/07 14:35:01 madden
1702 * Fix for readdb_acc2fasta for multiple databases
1703 *
1704 * Revision 6.36 1998/12/14 21:50:15 egorov
1705 * new max gi number member in CommonIndexHead structure and therefore no need for COMMON_INDEX_TABLE_SIZE
1706 *
1707 * Revision 6.35 1998/09/24 15:26:41 egorov
1708 * Fix lint complaints
1709 *
1710 * Revision 6.34 1998/09/14 15:11:20 egorov
1711 * Add support for Int8 length databases; remove unused variables
1712 *
1713 * Revision 6.33 1998/09/03 18:43:09 egorov
1714 * Close db config file
1715 *
1716 * Revision 6.32 1998/08/29 20:05:47 madden
1717 * Fixed MemCpy length problem
1718 *
1719 * Revision 6.31 1998/08/24 14:59:56 madden
1720 * readdb_get_sequence_ex function
1721 *
1722 * Revision 6.30 1998/07/31 19:30:11 egorov
1723 * Fix bug when OID=0 treated as bad in common index
1724 *
1725 * Revision 6.29 1998/07/09 13:35:16 egorov
1726 * remove platform dependent statement
1727 *
1728 * Revision 6.28 1998/07/08 14:10:53 madden
1729 * Fix for multiple db search, use of more efficient readdb_new_ex
1730 *
1731 * Revision 6.27 1998/07/01 16:45:25 egorov
1732 * Remove debug messages
1733 *
1734 * Revision 6.26 1998/07/01 14:14:49 egorov
1735 * Move FilePathFind function into ncbitoolkit remove its definition here
1736 *
1737 * Revision 6.25 1998/07/01 14:03:04 egorov
1738 * Fix bug with a thread freeing CommonIndex: add new flag to rdfp
1739 *
1740 * Revision 6.24 1998/06/26 16:51:13 egorov
1741 * Fix CommonIndex bugs
1742 *
1743 * Revision 6.23 1998/06/24 21:03:35 egorov
1744 * Remove memory leaks
1745 *
1746 * Revision 6.20 1998/05/22 20:19:53 madden
1747 * Changes to fix multi-db search bug
1748 *
1749 * Revision 6.19 1998/02/26 22:49:23 kans
1750 * needed to include ffprint.h
1751 *
1752 * Revision 6.18 1998/02/26 22:34:21 madden
1753 * Changes for 16 bit windows
1754 *
1755 * Revision 6.17 1998/01/16 22:02:03 madden
1756 * Added readdb_new_ex with init_indices Boolean to allow faster retrieval of one sequence
1757 *
1758 * Revision 6.16 1997/12/12 20:39:25 madden
1759 * Added parens for if
1760 *
1761 * Revision 6.15 1997/12/11 22:21:05 madden
1762 * Removed unused variables
1763 *
1764 * Revision 6.14 1997/12/03 21:48:01 madden
1765 * Check for duplicate database names
1766 *
1767 * Revision 6.13 1997/12/02 22:18:09 madden
1768 * Fixed UMR
1769 *
1770 * Revision 6.12 1997/11/26 22:48:35 madden
1771 * Added readdb_parse_db_names for multiple db searches
1772 *
1773 * Revision 6.11 1997/11/07 16:16:14 shavirin
1774 * Added new function readdb_acc2fastaEx(), that retrieve array of hits
1775 *
1776 * Revision 6.10 1997/11/07 14:44:53 madden
1777 * Sped up start up
1778 *
1779 * Revision 6.9 1997/11/06 21:27:19 madden
1780 * Speeded up initialization
1781 *
1782 * Revision 6.8 1997/10/30 18:16:12 madden
1783 * Change to readdb_acc2fasta to allow lookups by accession strings
1784 *
1785 * Revision 6.7 1997/10/24 19:08:13 madden
1786 * Added ReadDBGetDb and ReadDBGetDbId
1787 *
1788 * Revision 6.6 1997/10/24 14:10:30 madden
1789 * Changed Fetch function to speed up retrieval of cached sequences
1790 *
1791 * Revision 6.5 1997/09/24 22:37:03 madden
1792 * Added readdb_destruct_element
1793 *
1794 * Revision 6.4 1997/09/16 16:31:36 madden
1795 * More changes for multiple db runs
1796 *
1797 * Revision 6.3 1997/09/12 19:55:35 madden
1798 * Added readdb_compare
1799 *
1800 * Revision 6.2 1997/09/11 18:49:37 madden
1801 * Changes to enable searches against multiple databases.
1802 *
1803 * Revision 6.1 1997/08/27 14:46:56 madden
1804 * Changes to enable multiple DB searches
1805 *
1806 * Revision 6.0 1997/08/25 18:53:55 madden
1807 * Revision changed to 6.0
1808 *
1809 * Revision 1.52 1997/07/14 20:11:21 madden
1810 * Removed unused variables
1811 *
1812 * Revision 1.51 1997/06/26 20:32:55 madden
1813 * Only convert sequence if ambig. chars
1814 *
1815 * Revision 1.50 1997/05/20 14:33:32 shavirin
1816 * Fixed retrieval by LOCUS in function readdb_acc2fasta()
1817 *
1818 * Revision 1.49 1997/05/19 21:14:56 shavirin
1819 * Changed function readdb_acc2fasta() as required by E2Index() functions
1820 * family
1821 *
1822 * Revision 1.48 1997/05/16 13:50:42 madden
1823 * Fixed bug, wrong type of database opened
1824 *
1825 * Revision 1.47 1997/05/12 21:33:57 madden
1826 * readdb_new allows indeterminate database type
1827 *
1828 * Revision 1.46 1997/05/12 21:10:31 shavirin
1829 * Added new function readdb_acc2fasta()
1830 *
1831 * Revision 1.44 1997/05/07 21:03:11 madden
1832 * Added function SeqId2OrdinalId
1833 *
1834 * Revision 1.43 1997/05/01 17:27:31 shavirin
1835 * Added new function readdb_seqid2fasta()
1836 *
1837 * Revision 1.42 1997/03/31 17:06:40 shavirin
1838 * Changed function readdb_get_bioseq to use BSRebuildDNA_4na()
1839 * function.
1840 *
1841 * Revision 1.41 1997/03/26 14:01:34 madden
1842 * Changes to Fetch function to allow cached-out structures to be read back in.
1843 *
1844 * Revision 1.40 1997/03/05 18:24:17 madden
1845 * Fixed MT problem introduced with use of ISAM code.
1846 *
1847 * Revision 1.39 1997/02/26 23:39:54 madden
1848 * Removed unused variables.
1849 *
1850 * Revision 1.38 1997/02/26 20:37:31 madden
1851 * Added protection against MT use to fetch function.
1852 *
1853 * Revision 1.37 1997/02/25 23:52:05 madden
1854 * Added readdb_gi2seq call to ReadDBBioseqFetchFunc.
1855 *
1856 * Revision 1.36 1997/02/25 22:15:33 shavirin
1857 * Changes in accordance to ISAM API changes
1858 *
1859 * Revision 1.35 1997/02/25 16:28:05 shavirin
1860 * Added function readdb_gi2seq() - returns sequence number from gi
1861 *
1862 * Revision 1.34 1997/02/14 17:17:59 madden
1863 * Checked for NULL return from MemNew.
1864 *
1865 * Revision 1.33 1997/02/07 22:32:40 madden
1866 * Fixed bug.
1867 *
1868 * Revision 1.32 1997/01/14 23:11:27 madden
1869 * Cleaned ctrl-A's out of defline in readdb_get_bioseq.
1870 *
1871 * Revision 1.31 1996/12/20 00:30:20 madden
1872 * Protected ambiguity data against big/little endian changes.
1873 *
1874 * Revision 1.30 1996/12/19 16:29:56 madden
1875 * Changes to eliminate ".nac" file for nucl.
1876 *
1877 * Revision 1.29 1996/12/17 21:34:46 madden
1878 * Changes to allow deflines for individual entries to be retrieved.
1879 *
1880 * Revision 1.28 1996/12/11 18:42:36 madden
1881 * Added BioseqFetch functions.
1882 *
1883 * Revision 1.27 1996/12/11 17:59:42 madden
1884 * Fixed purify leaks.
1885 *
1886 * Revision 1.26 1996/12/08 15:19:59 madden
1887 * Checked for NULL pointer.
1888 *
1889 * Revision 1.25 1996/11/27 16:39:11 madden
1890 * Added functions to return filename and date.
1891 *
1892 * Revision 1.24 1996/11/26 19:54:27 madden
1893 * Added check for database in standard places.
1894 *
1895 * Revision 1.23 1996/11/22 19:05:48 madden
1896 * removed ifdef for OLD_BIT_ORDER.
1897 *
1898 * Revision 1.22 1996/11/18 17:28:13 madden
1899 * properly set contents_allocated flag for ambig. char. in readdb_attach.
1900 *
1901 * Revision 1.21 1996/11/08 21:45:03 madden
1902 * Removed function readdb_get_partial_unpacked_sequence.
1903 *
1904 * Revision 1.20 1996/11/07 22:31:15 madden
1905 * Added function readdb_ambchar_present to check for the presence
1906 * of ambig. characters in a db sequence.
1907 *
1908 * Revision 1.19 1996/11/04 18:48:53 shavirin
1909 * Added possibility to reconstruct Nucleotide sequence using function
1910 * readdb_get_bioseq. Added new function readdb_get_ambchar() to retrieve
1911 * ambiguity information.
1912 *
1913 * Revision 1.18 1996/10/31 16:29:18 shavirin
1914 * Multiple changes due to reversal of residues in BLAST database
1915 * for nucleotide sequences from (4321) to (1234)
1916 * New dumper now required to create BLAST databases.
1917 *
1918 * Revision 1.17 1996/09/27 19:12:17 madden
1919 * Added function readdb_get_bioseq to obtain a BioseqPtr from the BLAST databases.
1920 *
1921 * Revision 1.16 1996/09/26 20:18:43 madden
1922 * Saved filename.
1923 *
1924 * Revision 1.15 1996/09/23 17:36:20 madden
1925 * Removed unused variable.
1926 *
1927 * Revision 1.14 1996/09/23 14:37:35 madden
1928 * Replaced CharPtr (for sequence) with Uint1Ptr.
1929 *
1930 * Revision 1.13 1996/09/20 21:58:14 madden
1931 * Changed CharPtr's to Uint1Ptr, got remainder length out of top order bits.
1932 *
1933 * Revision 1.12 1996/09/16 13:48:51 madden
1934 * Removed extra increment of counter in readdb_get_partial_unpacked_sequence.
1935 *
1936 * Revision 1.11 1996/09/15 17:35:48 madden
1937 * readdb_get_partial_unpacked_sequence now packages ncbi4na properly.
1938 *
1939 * Revision 1.10 1996/09/13 18:55:04 madden
1940 * Added function readdb_get_partial_unpacked_sequence.
1941 *
1942 * Revision 1.9 1996/09/11 21:31:11 shavirin
1943 * Added check for NULL from function Nlm_MemMapInit(name)
1944 *
1945 * Revision 1.8 1996/08/29 20:42:01 madden
1946 * memory mapping moved to the corelib (in ncbimem.[ch]).
1947 *
1948 * Revision 1.7 1996/08/23 15:32:02 shavirin
1949 * Fixed a lot of NT compiler warnings about type mismatch
1950 *
1951 * Revision 1.6 1996/08/21 21:25:25 madden
1952 * Changes for reading nt. db's.
1953 *
1954 * Revision 1.5 1996/08/14 14:31:28 madden
1955 * Added efficiencies in readdb_get_sequence_length.
1956 *
1957 * Revision 1.4 1996/08/13 22:04:36 madden
1958 * Changed readdb_get_sequence to report the uncompressed length of
1959 * a nucl. sequence.
1960 *
1961 * Revision 1.3 1996/08/08 21:39:48 madden
1962 * Added code to read in nucleotide databases.
1963 *
1964 * Revision 1.2 1996/08/07 18:32:05 madden
1965 * Moved define of MMAP_AVAIL from readdb.h to readdb.c
1966 *
1967 * Revision 1.1 1996/08/05 19:48:21 madden
1968 * Initial revision
1969 *
1970 * Revision 1.14 1996/08/02 14:20:06 madden
1971 * Added readdb_attach function.
1972 *
1973 * Revision 1.13 1996/07/31 13:09:17 madden
1974 * Changes for partial copy of ReadDB structure.
1975 *
1976 * Revision 1.12 1996/07/29 19:43:35 madden
1977 * Changes to make BLAST big/little endian independent.
1978 *
1979 * Revision 1.11 1996/07/25 20:45:20 madden
1980 * Change to arguments of readdb_get_sequence.
1981 *
1982 * Revision 1.10 1996/07/25 12:56:15 madden
1983 * readdb_get_sequence changed to allow for systems w/o mmap.
1984 *
1985 * Revision 1.9 1996/06/20 16:16:36 madden
1986 * Replaced int's with Int4's.
1987 *
1988 * Revision 1.8 1996/06/07 15:05:21 madden
1989 * MemCpy used instead of a while loop.
1990 *
1991 * Revision 1.7 1996/05/16 21:07:33 madden
1992 * Added protections against missing input files.
1993 *
1994 * Revision 1.6 1996/05/16 19:50:15 madden
1995 * Added documentation block.
1996 *
1997 * Revision 1.5 1996/04/22 21:41:13 madden
1998 * memory mapping added.
1999 *
2000 * Revision 1.4 1996/04/11 14:30:06 madden
2001 * Memory-mapping added.
2002 *
2003 * Revision 1.3 1996/03/29 21:28:30 madden
2004 * Added function readdb_get_sequence_length.
2005 *
2006 * Revision 1.2 1996/03/28 20:42:36 madden
2007 * Added functions readdb_get_title, readdb_is_prot and
2008 * readdb_get_formatdb_version.
2009 *
2010 * Revision 1.1 1996/03/26 19:38:08 madden
2011 * Initial revision
2012 *
2013 *
2014 */
2015
2016
2017 /* Description of the concept:
2018
2019 * BLAST uses the concept of a virtual database, meaning that
2020 * we may have a few databases searched together as one. For
2021 * a virtual database BLAST numbers the sequence from zero to
2022 * the total number in all databases minus one (the numbers
2023 * are called ordinal ID's or OID's):
2024 *
2025 * 0 <= OID < total in all db's
2026 *
2027 * Readdb is aware of these virtual OID's and handles them properly.
2028 * The situation has grown rather confused as we also allow
2029 * the specification of a gilist (that determines a subset of the
2030 * sequences in the database to be searched) as well as the recent
2031 * addition of the 'mask' database (which specifies that a subset of
2032 * a real database is to be searched, e.g., est_human is a mask and est is
2033 * the real database). To clarify the situation Alexey and I have written
2034 * down some rules that describe how virtual databases should be used.
2035 *
2036 * 1.) Ordinal ID (OID) numbering is from zero to total-1, where total
2037 * is the total number of sequences in all databases in the virtual
2038 * database.
2039 *
2040 * 2.) OID's of 'mask' databases refer to the real (i.e., underlying)
2041 * database - not the mask.
2042 *
2043 * 3.) If a gilist is used, then one virtual mask is used for all
2044 * databases - regardless of whether any database being searched is real.
2045 *
2046 * 4.) If there is a mixture of real and mask databases and no gilist
2047 * is being used, then the mask databases should go to the end of the
2048 * virtual database and one virtual mask will be created for this subsection
2049 * of the virtual database. (readdb_new_ex2 will be changed to move
2050 * the 'mask' databases to the end).
2051 */
2052
/* Close a FILE pointer with FileClose() if it is non-NULL, then reset it to
   NULL.  The body is wrapped in do { } while (0) so the macro expands to a
   single statement and can be used safely as the body of an unbraced
   if/else; the caller supplies the trailing semicolon.  The argument is
   evaluated more than once and is assigned to, so it must be a
   side-effect-free lvalue. */
#define FILECLOSE(x) do { if (x) { FileClose(x); (x) = NULL; } } while (0)

/* Close an AsnIo handle with AsnIoClose() if it is non-NULL, then reset it
   to NULL.  Single-statement form; the argument must be a side-effect-free
   lvalue and the caller supplies the trailing semicolon. */
#define ASNIOCLOSE(x) do { if (x) { AsnIoClose(x); (x) = NULL; } } while (0)
2055
2056 #define NLM_GENERATED_CODE_PROTO
2057 #include <readdb.h>
2058 #include <ncbithr.h>
2059 #include <ffprint.h>
2060 #include <ncbisami.h>
2061 #include <blast.h>
2062 #include <ncbisort.h>
2063 #include <tofasta.h>
2064 #include <assert.h>
2065 #include <errno.h>
2066 #include <txalign.h>
2067 #include <sqnutils.h>
2068 #include <blfmtutl.h>
2069 #ifdef FDB_TAXONOMYDB
2070 #include <taxblast.h>
2071 #endif
2072
2073 #ifdef __GLIBC__ /* not just __linux */
2074 #ifndef __USE_BSD
2075 #define __USE_BSD
2076 #endif
2077 #include <fcntl.h>
2078 #include <sys/types.h>
2079 #include <sys/mman.h>
2080 #endif
2081
2082 /* Used by fetch functions. */
2083 #define READDB_BUF_SIZE 255 /* NOTE(review): presumably a text buffer length -- confirm at the use sites */
/* NOTE(review): the READDBBF_* values below look like the states stored in
   ReadDBFetchStruct.ReadDBFetchState -- confirm against the fetch code,
   which is outside this section. */
2084 #define READDBBF_INIT 0
2085 #define READDBBF_READY 1
2086 #define READDBBF_DISABLE 2
2087
2088 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
2089 #ifndef MADV_NORMAL
2090 #undef HAVE_MADVISE
2091 #endif
2092 #ifdef HAVE_MADVISE
2093
2094 /* default size of preload (in sequences) in a single madvise operation */
2095 #define MADVISE_SEQ_PRELOAD 1024
2096
2097 /* by default use async madvise for linux, sync for solaris, etc */
2098 #ifdef OS_UNIX_LINUX
2099 #define MADVISE_SYNC_MODE FALSE
2100 #else
2101 #define MADVISE_SYNC_MODE TRUE
2102 #endif
2103
2104
2105 /* flag enabling madvise functionality */
2106 static Boolean useMadvise = FALSE;
2107
2108 /* advice to use */
2109 static EMemMapAdvise mmapAdvice = eMMA_Normal;
2110
2111
2112 /* flag that determines whether madvise is sync or async */
2113 static Boolean madviseSyncMode = MADVISE_SYNC_MODE;
2114
2115 /* size of the block preloaded (in sequences) in a single madvise operation */
2116 static Int4 madvisePreloadBlock = MADVISE_SEQ_PRELOAD;
2117
2118 #endif /* HAVE_MADVISE */
2119 #endif /* OS_UNIX_SOL || OS_UNIX_LINUX */
2120
2121 #if defined(OS_UNIX_SOL)
2122 #include <sys/int_types.h>
2123 #elif defined(OS_UNIX_LINUX) || defined(__GLIBC__)
2124 #include <stdint.h>
2125 #endif
/* Fetch-state record.  The 'next' member lets records be chained into a
   singly linked list.
   NOTE(review): presumably this is the per-database state handed to
   ReadDBBioseqFetchFunc -- confirm against the users of this struct, which
   are outside this section. */
2126 typedef struct readdbbioseqfetch {
2127 struct readdbbioseqfetch PNTR next; /* next record in the chain, or NULL */
2128 Uint1 ReadDBFetchState; /* presumably one of the READDBBF_* values -- TODO confirm */
2129 CharPtr dbname; /* Name of the database. */
2130 Uint2 ctr; /* NOTE(review): counter; what it counts is not visible here */
2131 Boolean is_prot; /* Is it a protein or not. */
2132 ReadDBFILEPtr rdfp; /* readdb handle associated with this record */
2133 Int4 db_genetic_code; /* genetic code of the database (per the name -- confirm usage) */
2134 TNlmThread thread_id; /* thread associated with this record (per the name -- confirm usage) */
2135 } ReadDBFetchStruct, PNTR ReadDBFetchStructPtr;
2136
/* Identifies one database sequence: its ordinal number plus the ID of the
   database it lives in.
   NOTE(review): presumably attached as user data by the fetch functions --
   confirm against the users of this struct. */
2137 typedef struct readdbfetchuserdata {
2138 Int4 ordinal_number; /* ordinal number of db sequence. */
2139 Int2 db_id; /* database ID, for multiple databases. */
2140 } ReadDBFetchUserData, PNTR ReadDBFetchUserDataPtr;
2141
2142 static Int2 LIBCALLBACK ReadDBBioseqFetchFunc PROTO((Pointer data));
2143 static ReadDBFILEPtr ReadDBFILENew(void);
2144 static Boolean FormatDbUint8Write(Uint8 value, FILE *fp);
2145 static Int8 FormatDbUint8Read(NlmMFILEPtr mfp);
2146 static ValNodePtr readdb_encode_subset_asn1_defline(ReadDBFILEPtr, Int4);
2147 static ValNodePtr IntValNodeCopy(ValNodePtr vnp);
2148 static int LIBCALLBACK ID_Compare(VoidPtr i, VoidPtr j);
2149 static ReadDBFILEPtr readdb_merge_gifiles (ReadDBFILEPtr rdfp_chain);
2150 static Boolean s_IsTextFile(const char* filename);
2151
2152 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
2153 #ifdef HAVE_MADVISE
2154 static void readdb_preload_index (ReadDBFILEPtr rdfp, Int4 first_db_seq,
2155 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync);
2156 static void readdb_preload_data (ReadDBFILEPtr rdfp, Int4 first_db_seq,
2157 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync);
2158 static void readdb_preload_file ( NlmMFILEPtr mFilePtr, Int4 nPages,
2159 EMemMapAdvise advice, Boolean sync, EThreadPriority pri);
2160 static void readdb_madvise (void * mp, size_t len,
2161 EMemMapAdvise advice, Boolean sync, EThreadPriority pri);
2162 #endif /* HAVE_MADVISE */
2163 #endif /* SOL || LINUX */
2164
2165 static TNlmMutex isamsearch_mutex; /* Mutex to regulate using ISAM;
2166 rdfp->isam is common for all threads */
/* NOTE(review): judging by the name, hdrseq_mutex guards the shared
   header/sequence file handles -- confirm at the lock sites, which are
   outside this section. */
2167 static TNlmMutex hdrseq_mutex;
2168
2169 /* Common index global variables */
2170 Boolean isCommonIndex = FALSE; /* deprecated 05/21/2003 */
2171
2172 /* Global to load the taxonomy databases only once per readdb_new invocation */
2173 static Boolean taxonomyDbLoaded = FALSE;
2174
/* NOTE(review): judging by the name, an upper bound on the number of
   database volumes -- confirm at the use sites.  Non-static, so it has
   external linkage. */
2175 const Uint4 kFDBMaxNumVolumes = 100;
2176
2177 /**************************************************************************
2178 *
2179 * Functions to perform memory mapping.
2180 *
2181 * If memory mapping is not available, then these functions should
2182 * default to normal FILE pointers.
2183 *
2184 * This is allowed with "read-only" files right now.
2185 *
2186 **************************************************************************/
2187
2188 /*
2189 Initialize the memory-mapping.
2190 */
2191 NlmMFILEPtr LIBCALL
NlmOpenMFILE(CharPtr name)2192 NlmOpenMFILE (CharPtr name)
2193
2194 {
2195 NlmMFILEPtr mfp;
2196
2197 if (!name || name[0] == '\0')
2198 return NULL;
2199
2200 if ((mfp=(NlmMFILEPtr) MemNew(sizeof(NlmMFILE))) == NULL)
2201 return NULL;
2202
2203 /* Default is FALSE. */
2204 mfp->mfile_true = FALSE;
2205
2206 mfp->mmp_begin = NULL;
2207
2208 if (Nlm_MemMapAvailable() == TRUE)
2209 { /* IF mem-map fails, open as a regular file. */
2210 if((mfp->mem_mapp = Nlm_MemMapInit(name)) != NULL)
2211 { /* copy this pointer to where it's convenient. */
2212 mfp->mmp_madvise_end = mfp->mmp_begin = mfp->mmp = (Uint1Ptr) mfp->mem_mapp->mmp_begin;
2213 if (mfp->mmp_begin != NULL)
2214 {
2215 mfp->mfile_true = TRUE;
2216 mfp->mmp_end = mfp->mmp_begin + mfp->mem_mapp->file_size;
2217 }
2218 }
2219 }
2220
2221 if (mfp->mmp_begin == NULL)
2222 {
2223 mfp->fp = FileOpen(name, "rb");
2224 if (mfp->fp == NULL)
2225 {
2226 mfp = (NlmMFILEPtr) MemFree(mfp);
2227 return NULL;
2228 }
2229 }
2230
2231 /* contents have been allocated. */
2232 mfp->contents_allocated = TRUE;
2233
2234 return mfp;
2235
2236 } /* NlmOpenMFILE */
2237
2238 /*
2239 Open the shared sequence and header files for memory mapping, if this hasn't
2240 already been done; duplicate this in headerfp and sequencefp
2241 */
ReadDBOpenMHdrAndSeqFiles(ReadDBFILEPtr rdfp)2242 static Boolean ReadDBOpenMHdrAndSeqFiles(ReadDBFILEPtr rdfp)
2243 {
2244 Char buffer[PATH_MAX];
2245 Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
2246
2247 /* The check for nthreads == 0 was done outside of a mutex in
2248 readdb_get_link, hence repeat it here */
2249
2250 if (rdfp->shared_info == NULL) {
2251 if(!((Boolean)(rdfp->parameters & READDB_NO_SEQ_FILE))) {
2252 sprintf(buffer, "%s.%csq", rdfp->full_filename, is_prot? 'p':'n');
2253 if ((rdfp->sequencefp = NlmOpenMFILE(buffer)) == NULL) {
2254 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2255 rdfp = readdb_destruct(rdfp);
2256 return FALSE;
2257 }
2258 }
2259 sprintf(buffer, "%s.%chr", rdfp->full_filename, is_prot? 'p':'n');
2260 if((rdfp->headerfp = NlmOpenMFILE(buffer)) == NULL) {
2261 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2262 rdfp = readdb_destruct(rdfp);
2263 return FALSE;
2264 }
2265 return TRUE;
2266 }
2267
2268 /* the reference count can be incremented either here or
2269 in readdb_attach, and may be nonzero even if the database
2270 has not been memory-mapped. Hence, never initialize the
2271 reference count, only increment it */
2272
2273 rdfp->shared_info->nthreads++;
2274
2275 if (!((Boolean)(rdfp->parameters & READDB_NO_SEQ_FILE)) &&
2276 rdfp->shared_info->sequencefp == NULL) {
2277 sprintf(buffer, "%s.%csq", rdfp->full_filename, is_prot? 'p':'n');
2278 if((rdfp->shared_info->sequencefp = NlmOpenMFILE(buffer)) == NULL) {
2279 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2280 rdfp = readdb_destruct(rdfp);
2281 return FALSE;
2282 }
2283 }
2284 rdfp->sequencefp = NlmCloseMFILE(rdfp->sequencefp);
2285
2286 rdfp->sequencefp =
2287 (NlmMFILEPtr) MemDup(rdfp->shared_info->sequencefp, sizeof(NlmMFILE));
2288 if (!rdfp->shared_info->sequencefp->mfile_true) {
2289 rdfp->shared_info->sequencefp = MemFree(rdfp->shared_info->sequencefp);
2290 rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
2291 } else {
2292 rdfp->sequencefp->contents_allocated = FALSE;
2293 }
2294
2295 if (rdfp->shared_info->headerfp == NULL) {
2296 sprintf(buffer, "%s.%chr", rdfp->full_filename, is_prot? 'p':'n');
2297 if((rdfp->shared_info->headerfp = NlmOpenMFILE(buffer)) == NULL) {
2298 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
2299 rdfp = readdb_destruct(rdfp);
2300 return FALSE;
2301 }
2302 }
2303 rdfp->headerfp = NlmCloseMFILE(rdfp->headerfp);
2304 rdfp->headerfp = (NlmMFILEPtr) MemDup(rdfp->shared_info->headerfp,
2305 sizeof(NlmMFILE));
2306 if (rdfp->shared_info->headerfp->mfile_true == FALSE) {
2307 rdfp->shared_info->headerfp = MemFree(rdfp->shared_info->headerfp);
2308 rdfp->parameters |= READDB_KEEP_HDR_AND_SEQ;
2309 } else {
2310 rdfp->headerfp->contents_allocated = FALSE;
2311 }
2312
2313 return TRUE;
2314 }
2315
ReadDBCloseMHdrAndSeqFiles(ReadDBFILEPtr rdfp)2316 ReadDBFILEPtr ReadDBCloseMHdrAndSeqFiles(ReadDBFILEPtr rdfp)
2317 {
2318 ReadDBFILEPtr start = rdfp;
2319
2320 while (rdfp != NULL) {
2321 if (rdfp->shared_info) {
2322 rdfp->shared_info->sequencefp =
2323 NlmCloseMFILE(rdfp->shared_info->sequencefp);
2324 rdfp->shared_info->headerfp =
2325 NlmCloseMFILE(rdfp->shared_info->headerfp);
2326 rdfp->shared_info->nthreads = 0;
2327 }
2328 if (rdfp->sequencefp && rdfp->sequencefp->mfile_true)
2329 rdfp->sequencefp = MemFree(rdfp->sequencefp);
2330 else
2331 rdfp->sequencefp = NlmCloseMFILE(rdfp->sequencefp);
2332
2333 if (rdfp->headerfp && rdfp->headerfp->mfile_true)
2334 rdfp->headerfp = MemFree(rdfp->headerfp);
2335 else
2336 rdfp->headerfp = NlmCloseMFILE(rdfp->headerfp);
2337
2338 rdfp = rdfp->next;
2339 }
2340 return start;
2341 }
2342
/*
    Walk the rdfp chain and free the shared_info struct of every element
    that has one and is flagged READDB_CONTENTS_ALLOCATED; the pointer is
    replaced by the value MemFree() returns.  Elements without the flag keep
    their shared_info pointer untouched.

    Returns the head of the chain that was passed in.
*/
static ReadDBFILEPtr ReadDBFreeSharedInfo(ReadDBFILEPtr rdfp)
{
    ReadDBFILEPtr cur;

    for (cur = rdfp; cur != NULL; cur = cur->next) {
        if (!(cur->parameters & READDB_CONTENTS_ALLOCATED))
            continue;
        if (!cur->shared_info)
            continue;
        cur->shared_info = (ReadDBSharedInfoPtr) MemFree(cur->shared_info);
    }

    return rdfp;
}
2355
2356 /****************************************************************************
2357 *
2358 * Undo the memory-mapping.
2359 *
2360 *****************************************************************************/
2361 NlmMFILEPtr LIBCALL
NlmCloseMFILE(NlmMFILEPtr mfp)2362 NlmCloseMFILE (NlmMFILEPtr mfp)
2363
2364 {
2365 if (mfp == NULL)
2366 return NULL;
2367
2368 /* Have the contents been allocated, or is this just an attachemnt? */
2369 if (mfp->contents_allocated)
2370 {
2371
2372 if (mfp->mfile_true == TRUE)
2373 {
2374 Nlm_MemMapFini(mfp->mem_mapp);
2375 }
2376
2377 FILECLOSE(mfp->fp);
2378 }
2379
2380 mfp = (NlmMFILEPtr) MemFree(mfp);
2381 return mfp;
2382
2383 } /* NlmCloseMFILE */
2384
2385 /***********************************************************************
2386 *
2387 * Analogous to ANSI-C fread.
2388 *
2389 ************************************************************************/
2390 Int4 LIBCALL
NlmReadMFILE(Uint1Ptr buffer,size_t size,Int4 nitems,NlmMFILEPtr mfp)2391 NlmReadMFILE (Uint1Ptr buffer, size_t size, Int4 nitems, NlmMFILEPtr mfp)
2392
2393 {
2394 register size_t diff, len;
2395
2396 if (mfp == NULL)
2397 return 0;
2398
2399 if (mfp->mfile_true == TRUE)
2400 {
2401 len = size * nitems;
2402 diff = mfp->mmp_end - mfp->mmp;
2403 if (len > diff)
2404 {
2405 nitems = diff / size;
2406 len = nitems * size;
2407 }
2408 MemCpy((VoidPtr) buffer, (VoidPtr) mfp->mmp, len);
2409 mfp->mmp += len;
2410 return nitems;
2411 }
2412
2413 return FileRead(buffer, size, nitems, mfp->fp);
2414
2415 } /* NlmReadMFILE */
2416
2417 /*
2418 Seeks to a point in the file, analogous to fseek.
2419 */
2420 Int4 LIBCALL
NlmSeekInMFILE(NlmMFILEPtr mfp,long offset,Int4 ptrname)2421 NlmSeekInMFILE (NlmMFILEPtr mfp, long offset, Int4 ptrname)
2422
2423 {
2424 Uint1Ptr cp;
2425
2426 if (mfp->mfile_true == TRUE)
2427 {
2428 switch (ptrname) {
2429 case SEEK_SET: /* relative to beginning */
2430 cp = mfp->mmp_begin + offset;
2431 if (offset < 0 || cp >= mfp->mmp_end)
2432 return -1;
2433 mfp->mmp = cp;
2434 break;
2435 case SEEK_CUR: /* relative to current position */
2436 cp = mfp->mmp + offset;
2437 if (cp >= mfp->mmp_end || cp < mfp->mmp_begin)
2438 return -1;
2439 mfp->mmp = cp;
2440 break;
2441 case SEEK_END: /* relative to end of file */
2442 if (offset > 0 || mfp->mem_mapp->file_size < -offset)
2443 return -1;
2444 mfp->mmp = mfp->mmp_begin + (mfp->mem_mapp->file_size + offset);
2445 break;
2446 default:
2447 return -1;
2448 }
2449 return 0;
2450 }
2451
2452 return (Int4) fseek(mfp->fp, offset, ptrname);
2453
2454 } /* NlmSeekInMFILE */
2455
2456 /*
2457 What is the offset (in bytes) to the beginning of the file.
2458 Analog to ftell.
2459 */
2460 Int4 LIBCALL
NlmTellMFILE(NlmMFILEPtr mfp)2461 NlmTellMFILE (NlmMFILEPtr mfp)
2462
2463 {
2464 if (mfp->mfile_true == TRUE)
2465 {
2466 return (mfp->mmp - mfp->mmp_begin);
2467 }
2468 else
2469 {
2470 return (Int4) ftell(mfp->fp);
2471 }
2472
2473 } /* NlmTellMFILE */
2474
ReadDBFILENew(void)2475 static ReadDBFILEPtr ReadDBFILENew(void)
2476 {
2477 ReadDBFILEPtr new_t;
2478
2479 new_t = (ReadDBFILEPtr) MemNew(sizeof(ReadDBFILE));
2480 return new_t;
2481 }
2482
2483 /*
2484 Parses the databases names (if more than one) from
2485 'filenames' into buffer. buffer should already be
2486 long enough and allocated. The funciton should be
2487 repeatedly called until TRUE is returned.
2488 */
2489 Boolean LIBCALL
readdb_parse_db_names(CharPtr PNTR filenames,CharPtr buffer)2490 readdb_parse_db_names (CharPtr PNTR filenames, CharPtr buffer)
2491
2492 {
2493 Boolean done = FALSE;
2494 Boolean quote_mode = FALSE;
2495
2496 while (**filenames == ' ')
2497 {
2498 (*filenames)++;
2499 }
2500
2501 while (**filenames != NULLB)
2502 {
2503 if (**filenames == '"')
2504 if (quote_mode == FALSE)
2505 quote_mode = TRUE;
2506 else
2507 quote_mode = FALSE;
2508
2509 if (!quote_mode && **filenames == ' ')
2510 {
2511 *buffer = NULLB;
2512 break;
2513 }
2514
2515 if (**filenames != '"')
2516 {
2517 *buffer = **filenames;
2518 buffer++;
2519 }
2520 (*filenames)++;
2521 }
2522
2523 while (**filenames == ' ')
2524 {
2525 (*filenames)++;
2526 }
2527
2528 if (**filenames == NULLB)
2529 {
2530 *buffer = NULLB;
2531 done = TRUE;
2532 }
2533
2534 return done;
2535 }
2536
2537 /********** Auxiliary gi list structure *************/
2538 #define GI_ALLOC_CHUNK 4096
2539
2540 Int4ListPtr LIBCALL
Int4ListNew(void)2541 Int4ListNew PROTO((void))
2542 {
2543 return Int4ListNewEx(GI_ALLOC_CHUNK);
2544 }
2545
2546 Int4ListPtr LIBCALL
Int4ListNewEx(Int4 init_size)2547 Int4ListNewEx PROTO((Int4 init_size))
2548 {
2549 Int4ListPtr lp = NULL;
2550
2551 if ((lp = MemNew(sizeof(Int4List))) == NULL)
2552 return NULL;
2553 lp->allocated = init_size;
2554 if ((lp->i = MemNew(sizeof(Int4) * lp->allocated)) == NULL) {
2555 return MemFree(lp);
2556 }
2557 lp->count = 0;
2558
2559 return lp;
2560 }
2561
2562 Int4ListPtr LIBCALL
Int4ListFree(Int4ListPtr lp)2563 Int4ListFree PROTO((Int4ListPtr lp))
2564 {
2565 if(lp == NULL)
2566 return NULL;
2567
2568 MemFree(lp->i);
2569 return MemFree(lp);
2570 }
2571
2572 Boolean LIBCALL
Int4ListAdd(Int4ListPtr lp,Int4 i)2573 Int4ListAdd PROTO((Int4ListPtr lp, Int4 i))
2574 {
2575 if (!lp)
2576 return FALSE;
2577
2578 if (lp->count >= lp->allocated) {
2579 lp->allocated *= 2;
2580 if ( !(lp->i = Realloc(lp->i, sizeof(Int4) * lp->allocated)))
2581 return FALSE;
2582 }
2583
2584 lp->i[lp->count++] = i;
2585
2586 return TRUE;
2587 }
2588
2589 Int4ListPtr LIBCALL
Int4ListResize(Int4ListPtr listp,Int4 new_size)2590 Int4ListResize(Int4ListPtr listp, Int4 new_size)
2591 {
2592 if (!listp || new_size < 0)
2593 return NULL;
2594
2595 if (new_size == listp->allocated)
2596 return listp;
2597
2598 if ( !(listp->i = (Int4Ptr) Realloc(listp->i, sizeof(Int4)*new_size)))
2599 return NULL;
2600 listp->allocated = new_size;
2601
2602 return listp;
2603 }
2604
2605 /*
2606 This function reads in a list of gi's from a file.
2607 The file may be either in binary or text format.
2608
2609 The binary gilist format has the following construction:
2610
2611 1.) 1st 4 bytes: a 'magic' number: UINT4_MAX
2612 2.) 2nd 4 bytes: total number of gi's in the file (call this value 'number').
2613 3.) 'number' set of 4 bytes, allowing 4 bytes for each gi.
2614
2615 The function GetGisFromFile first checks what the first 4 bytes
2616 of a file are, if they are the 'magic' number, then it proceeds
2617 to read values assuming a binary format. If they are not the
2618 'magic' number, then a text format is assumed.
2619
2620 The binary gilist can be produced from a text gilist using the
2621 function readdb_MakeGiFileBinary.
2622
2623 */
2624
2625 #define LINE_LEN 1024
2626
2627 static Int4ListPtr
Int4ListReadFromFileEx(CharPtr lookup_dir,CharPtr fname)2628 Int4ListReadFromFileEx (CharPtr lookup_dir,CharPtr fname)
2629 {
2630 Int4ListPtr listp = NULL;
2631 FILE *fp = NULL;
2632 Int4 index = 0, value, number;
2633 Int2 status;
2634 Char line[LINE_LEN];
2635 long tmplong;
2636 NlmMFILEPtr mfp;
2637 Uint4 tmp_value;
2638 Char file_name[PATH_MAX], blast_dir[PATH_MAX];
2639
2640 /**
2641 * first looking in current directory, then checking .ncbirc,
2642 * then $BLASTDB
2643 */
2644 if (FileLength(fname) > 0) {
2645 char *path = Nlm_FilePathFind(fname);
2646 if (StringLen(path) > 0) {
2647 StringCpy(blast_dir, path);
2648 } else {
2649 StringCpy(blast_dir, ".");
2650 }
2651 MemFree(path);
2652 } else {
2653 if( !lookup_dir) return NULL;
2654 StringCpy(blast_dir,lookup_dir);
2655 }
2656 sprintf(file_name, "%s%s%s", blast_dir, DIRDELIMSTR, FileNameFind(fname));
2657
2658 mfp = NlmOpenMFILE(file_name);
2659 if (mfp == NULL) {
2660 return NULL;
2661 }
2662
2663 NlmReadMFILE((Uint1Ptr)&tmp_value, sizeof(Uint4), 1, mfp);
2664 if (SwapUint4(tmp_value) == READDB_MAGIC_NUMBER) {
2665
2666 /*** Binary gi list ***/
2667
2668 /* Use a 32 kb buffer to read the file */
2669 const Int4 BUFFER_SIZE = (0x1<<15);
2670
2671 /* Number of gis per BUFFER_SIZE byte chunk */
2672 const Int4 NGIS = BUFFER_SIZE/sizeof(Int4);
2673
2674 /* Buffer to read the gi list in BUFFER_SIZE byte chunks */
2675 Int4Ptr buffer = (Int4Ptr) Malloc(BUFFER_SIZE);
2676
2677 if ( !buffer ) {
2678 ErrPostEx(SEV_ERROR, 0, 0, "Not enough memory to read %s\n",
2679 file_name);
2680 return NULL;
2681 }
2682
2683 /* Read the number of gis in this file */
2684 NlmReadMFILE((Uint1Ptr)&tmp_value, sizeof(Uint4), 1, mfp);
2685 number = SwapUint4(tmp_value);
2686 listp = Int4ListNewEx(number);
2687
2688 for (index = 0; index < number; ) {
2689 Int4 bytes_read = NlmReadMFILE((Uint1Ptr)buffer, sizeof(Uint1),
2690 BUFFER_SIZE, mfp);
2691 Uint4 idx = 0;
2692
2693 for (idx = 0; idx < bytes_read/sizeof(Int4) && idx < NGIS; idx++) {
2694 Int4ListAdd(listp, SwapUint4(buffer[idx]));
2695 index++;
2696 }
2697 }
2698
2699 buffer = MemFree(buffer);
2700 mfp = NlmCloseMFILE(mfp);
2701
2702 } else {
2703
2704 /*** Text gi list ***/
2705 mfp = NlmCloseMFILE(mfp);
2706 if (!(fp = FileOpen(file_name, "r"))) {
2707 return NULL;
2708 }
2709
2710 listp = Int4ListNew();
2711
2712 while (FileGets(line, LINE_LEN, fp)) {
2713
2714 /* do correct casting */
2715 status = sscanf(line, "%ld", &tmplong);
2716 value = tmplong;
2717
2718 /* skip non-valid lines */
2719 if (status > 0)
2720 Int4ListAdd(listp, value);
2721 }
2722
2723 FileClose(fp);
2724 }
2725
2726 return listp;
2727 }
2728
2729 /* read configuration and check file in all places */
s_GetBlastDirInfo(char * blast_dir,char * path_delim)2730 static void s_GetBlastDirInfo(char *blast_dir, char *path_delim)
2731 {
2732 CharPtr envp = NULL;
2733 /* read configuration... */
2734 memset(blast_dir, 0, sizeof(blast_dir));
2735 if ((envp = getenv("BLASTDB")) == NULL) {
2736 /* This checks current directory, user home directory, then path pointed to by $NCBI. */
2737 Nlm_GetAppParam ("NCBI", "BLAST", "BLASTDB", NULL, blast_dir, PATH_MAX);
2738 }
2739 else {
2740 StringCpy(blast_dir, envp);
2741 }
2742
2743 StringCpy(path_delim,":");
2744
2745 return;
2746 }
2747
2748 Int4ListPtr LIBCALL
Int4ListReadFromFile(CharPtr fname)2749 Int4ListReadFromFile PROTO((CharPtr fname))
2750 {
2751 Int4ListPtr listp = NULL;
2752 Char *wrk_buf;
2753 char blast_dir[PATH_MAX];
2754 Char *one_blast_dir ;
2755 char path_delim[2];
2756
2757 s_GetBlastDirInfo(blast_dir, path_delim);
2758
2759 /* parse paths and check files*/
2760 for( one_blast_dir = Nlm_StringTokMT(blast_dir,(char*)path_delim, (char **)&wrk_buf);
2761 one_blast_dir != NULL;
2762 one_blast_dir = Nlm_StringTokMT (NULL,(char*)path_delim, (char **)&wrk_buf) )
2763 {
2764 listp = Int4ListReadFromFileEx(one_blast_dir,fname);
2765 if( listp ) return listp;
2766 }
2767 ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", fname);
2768 return NULL;
2769 }
2770 Int4ListPtr LIBCALL
Int4ListMakeUnique(Int4ListPtr list)2771 Int4ListMakeUnique PROTO((Int4ListPtr list))
2772 {
2773 Int4 idx, i;
2774
2775 if (!list || list->count <= 0)
2776 return list;
2777
2778 HeapSort(list->i, list->count, sizeof(Int4), ID_Compare);
2779
2780 for (i = 0, idx = 0; i < list->count - 1; i++) {
2781 if (list->i[i] == list->i[i+1])
2782 continue;
2783 list->i[idx++] = list->i[i];
2784 }
2785 /* check the last element */
2786 if (list->i[i] != list->i[idx-1])
2787 list->i[idx++] = list->i[i];
2788
2789 list->count = idx;
2790
2791 return list;
2792 }
2793
2794 Int4ListPtr LIBCALL
Int4ListConcat(Int4ListPtr * list1,Int4ListPtr * list2)2795 Int4ListConcat PROTO((Int4ListPtr *list1, Int4ListPtr *list2))
2796 {
2797 Int4ListPtr retval = NULL;
2798 Int4 size;
2799
2800 if ((*list1) && !(*list2)) {
2801 retval = (*list1);
2802 (*list1) = NULL;
2803 return retval;
2804 }
2805
2806 if ((*list2) && !(*list1)) {
2807 retval = (*list2);
2808 (*list2) = NULL;
2809 return retval;
2810 }
2811
2812 if ( (size = (*list1)->count + (*list2)->count) <= 0) {
2813 (*list1) = Int4ListFree((*list1));
2814 (*list2) = Int4ListFree((*list2));
2815 return NULL;
2816 }
2817
2818 if ( !(retval = Int4ListNewEx(size)))
2819 return NULL;
2820
2821 MemCpy(retval->i, (*list1)->i, sizeof(Int4)* (*list1)->count);
2822 retval->count = (*list1)->count;
2823 MemCpy(retval->i+retval->count, (*list2)->i, sizeof(Int4)* (*list2)->count);
2824 retval->count += (*list2)->count;
2825
2826 (*list1) = Int4ListFree((*list1));
2827 (*list2) = Int4ListFree((*list2));
2828
2829 return retval;
2830 }
2831
2832 Int4ListPtr LIBCALL
Int4ListIntersect(Int4ListPtr * list1,Int4ListPtr * list2)2833 Int4ListIntersect PROTO((Int4ListPtr *list1, Int4ListPtr *list2))
2834 {
2835 Int4 i, j, size, value;
2836 Int4ListPtr retval = NULL;
2837
2838 if ((*list1) && !(*list2))
2839 return (*list1);
2840
2841 if ((*list2) && !(*list1))
2842 return (*list2);
2843
2844 (*list1) = Int4ListMakeUnique((*list1));
2845 (*list2) = Int4ListMakeUnique((*list2));
2846 size = MIN(((*list1))->count, ((*list2))->count);
2847
2848 if (size == 0) {
2849 (*list1) = Int4ListFree((*list1));
2850 (*list2) = Int4ListFree((*list2));
2851 return NULL;
2852 }
2853
2854 if ( !(retval = Int4ListNewEx(size)))
2855 return NULL;
2856
2857 for (i = 0, j = 0; i < (*list1)->count; i++) {
2858 value = (*list1)->i[i];
2859
2860 for (; j < (*list2)->count && (*list2)->i[j] < value; j++);
2861
2862 if (j < (*list2)->count && (*list2)->i[j] == value)
2863 retval->i[retval->count++] = (*list1)->i[i];
2864 }
2865
2866 if (retval->count == 0)
2867 retval = Int4ListFree(retval);
2868
2869 (*list1) = Int4ListFree((*list1));
2870 (*list2) = Int4ListFree((*list2));
2871
2872 return retval;
2873 }
2874
2875 typedef struct _readdb_alias_file {
2876 CharPtr title, /* title of the database. */
2877 dblist, /* list of databases. */
2878 gilist, /* a gilist to be used with the database. */
2879 oidlist; /* an ordinal id list to be used with this database. */
2880 Int8 len; /* length of the database */
2881 Uint4 nseq; /* number of seqs of the database */
2882 Int8 len_stats; /* length of the database for statistical purposes */
2883 Uint4 nseq_stats; /* number of seqs of the database for statistical purposes */
2884 Int4 first_oid; /* first ordinal id in a range */
2885 Int4 last_oid; /* last ordinal id in a range */
2886 Int4 membership;/* membership bit */
2887 Int4 maxlen; /* maximal length of seqs in the database */
2888 } ReadDBAlias, PNTR ReadDBAliasPtr;
2889 /*
2890 This function frees the 'alias' file for the BLAST databases.
2891 */
2892
ReadDBAliasFree(ReadDBAliasPtr rdbap)2893 static ReadDBAliasPtr ReadDBAliasFree(ReadDBAliasPtr rdbap)
2894 {
2895
2896 if (rdbap == NULL)
2897 return NULL;
2898
2899 MemFree(rdbap->title);
2900 MemFree(rdbap->dblist);
2901 MemFree(rdbap->gilist);
2902 MemFree(rdbap->oidlist);
2903 MemFree(rdbap);
2904 return NULL;
2905 }
2906
2907 /*
2908 This function reads the 'alias' file for the BLAST databases.
2909 */
2910 static ReadDBAliasPtr
readdb_read_alias_file(CharPtr filename)2911 readdb_read_alias_file(CharPtr filename)
2912
2913 {
2914 CharPtr buffer;
2915 CharPtr file_path, ptr;
2916 Char file_buffer[PATH_MAX], full_buffer[PATH_MAX];
2917 ReadDBAliasPtr rdbap;
2918 FILE *fp;
2919 Int4 buflen, buffer_length, total_length=PATH_MAX, length;
2920 long tmplong;
2921
2922 if (filename == NULL || (buflen = FileLength(filename)) <= 0)
2923 return NULL;
2924
2925 fp = FileOpen(filename, "r");
2926 if (fp == NULL)
2927 return NULL;
2928
2929 if (!s_IsTextFile(filename)) {
2930 ErrPostEx(SEV_ERROR, 0, 1, "%s is not a valid alias file\n", filename);
2931 return NULL;
2932 }
2933
2934 file_path = Nlm_FilePathFind(filename);
2935
2936 buffer = MemNew(buflen + 1);
2937
2938 rdbap = (ReadDBAliasPtr) MemNew(sizeof(ReadDBAlias));
2939
2940 while (Nlm_FileGets(buffer, buflen + 1, fp) != NULL) {
2941
2942 Char* newline_ptr = NULL; /* pointer to newline character */
2943
2944 if (buffer[0] == '#') /* ignore comments. */
2945 continue;
2946
2947 if (StringNCmp(buffer, "TITLE", 5) == 0) {
2948 ptr = buffer;
2949 ptr += 5;
2950 while (isspace((int)*ptr)) /* skip whitespace */
2951 ptr++;
2952
2953 newline_ptr = Nlm_StrChr(ptr, '\n');
2954 if (newline_ptr != NULL) {
2955 *newline_ptr = NULLB;
2956 } else {
2957 *ptr = NULLB;
2958 }
2959
2960 if (*ptr != NULLB)
2961 rdbap->title = StringSave(ptr);
2962 /* empty title is okay? */
2963
2964 continue;
2965 }
2966
2967 if (StringNCmp(buffer, "DBLIST", 6) == 0) {
2968 ptr = buffer;
2969 ptr += 6;
2970 while (isspace((int)*ptr)) /* skip whitespace */
2971 ptr++;
2972
2973 newline_ptr = Nlm_StrChr(ptr, '\n');
2974 if (newline_ptr != NULL) {
2975 *newline_ptr = NULLB;
2976 } else {
2977 *ptr = NULLB;
2978 }
2979
2980 if (*ptr != NULLB)
2981 {
2982 Boolean done = FALSE, first = TRUE;
2983 if (file_path && *file_path != NULLB)
2984 { /* Prepend file_path if it exists. */
2985 rdbap->dblist = MemNew(total_length*sizeof(Char));
2986 length=0;
2987 while (!done)
2988 {
2989 done = readdb_parse_db_names(&ptr, file_buffer);
2990
2991 if(*file_buffer == DIRDELIMCHR)
2992 StringCpy(full_buffer, file_buffer);
2993 else
2994 sprintf(full_buffer, "%s%c%s", file_path,
2995 DIRDELIMCHR, file_buffer);
2996
2997 buffer_length = StringLen(full_buffer);
2998 /* + 1 for the extra space in between multiple paths */
2999 if (buffer_length+length+3 >= total_length)
3000 {
3001 rdbap->dblist = Realloc(rdbap->dblist,
3002 2*total_length);
3003 total_length *= 2;
3004 }
3005 if (!first)
3006 {
3007 StringCpy(rdbap->dblist+length, " ");
3008 length++;
3009 }
3010 else
3011 first = FALSE;
3012 StringCpy(rdbap->dblist+length, "\"");
3013 length++;
3014 StringCpy(rdbap->dblist+length, full_buffer);
3015 length += buffer_length;
3016 StringCpy(rdbap->dblist+length, "\"");
3017 length++;
3018 }
3019
3020 }
3021 else
3022 rdbap->dblist = StringSave(ptr);
3023 }
3024 if (rdbap->dblist == NULL) {
3025 ErrPostEx(SEV_ERROR, 0, 0, "DBLIST field in %s is empty",
3026 filename);
3027 return NULL;
3028 }
3029
3030 continue;
3031 }
3032
3033 if (StringNCmp(buffer, "GILIST", 6) == 0) {
3034 ptr = buffer;
3035 ptr += 6;
3036 while (isspace((int)*ptr)) /* skip whitespace */
3037 ptr++;
3038
3039 newline_ptr = Nlm_StrChr(ptr, '\n');
3040 if (newline_ptr != NULL) {
3041 *newline_ptr = NULLB;
3042 } else {
3043 *ptr = NULLB;
3044 }
3045
3046 if (*ptr != NULLB) {
3047 if (file_path && StrCmp(file_path,"")) {
3048 /* add directory prefix, if any */
3049 sprintf(full_buffer, "%s%c%s", file_path, DIRDELIMCHR, ptr);
3050 rdbap->gilist = StringSave(full_buffer);
3051 } else {
3052 rdbap->gilist = StringSave(ptr);
3053 }
3054 }
3055 if (rdbap->gilist == NULL) {
3056 ErrPostEx(SEV_WARNING, 0, 0, "GILIST field in %s is empty",
3057 filename);
3058 }
3059
3060 continue;
3061 }
3062
3063 if (StringNCmp(buffer, "OIDLIST", 7) == 0) {
3064 ptr = buffer;
3065 ptr += 7;
3066 while (isspace((int)*ptr)) /* skip whitespace */
3067 ptr++;
3068
3069 newline_ptr = Nlm_StrChr(ptr, '\n');
3070 if (newline_ptr != NULL) {
3071 *newline_ptr = NULLB;
3072 } else {
3073 *ptr = NULLB;
3074 }
3075
3076 if (*ptr != NULLB) {
3077 Boolean done=FALSE, first=TRUE;
3078 if (file_path && StrCmp(file_path, "")) {
3079 /* add directory prefix, if any */
3080 rdbap->oidlist = MemNew(total_length*sizeof(Char));
3081 length = 0;
3082 while (!done) {
3083 done = readdb_parse_db_names(&ptr, file_buffer);
3084 sprintf(full_buffer, "%s%c%s", file_path,
3085 DIRDELIMCHR, file_buffer);
3086
3087 if (*file_buffer == DIRDELIMCHR)
3088 StringCpy(full_buffer, file_buffer);
3089 else
3090 sprintf(full_buffer, "%s%c%s", file_path,
3091 DIRDELIMCHR, file_buffer);
3092
3093 buffer_length = StringLen(full_buffer);
3094 if (buffer_length+length > total_length) {
3095 rdbap->oidlist = Realloc(rdbap->oidlist,
3096 2*total_length);
3097 total_length *= 2;
3098 }
3099 if (!first) {
3100 StringCpy(rdbap->oidlist+length, " ");
3101 length++;
3102 } else {
3103 first = FALSE;
3104 }
3105 StringCpy(rdbap->oidlist+length, full_buffer);
3106 length += buffer_length;
3107 }
3108
3109 } else {
3110 rdbap->oidlist = StringSave(ptr);
3111 }
3112 }
3113 if (rdbap->oidlist == NULL) {
3114 ErrPostEx(SEV_WARNING, 0, 0, "OIDLIST field in %s is empty",
3115 filename);
3116 }
3117
3118 continue;
3119 }
3120
3121 if (StringNCmp(buffer, "FIRST_OID", 9) == 0) {
3122 ptr = buffer + 9;
3123 while (isspace((int)*ptr)) /* skip whitespace */
3124 ptr++;
3125 newline_ptr = Nlm_StrChr(ptr, '\n');
3126 if (newline_ptr != NULL) {
3127 *newline_ptr = NULLB;
3128 } else {
3129 *ptr = NULLB;
3130 }
3131 if (*ptr != NULLB) {
3132 sscanf(ptr, "%ld", &tmplong);
3133 rdbap->first_oid = tmplong;
3134 }
3135 continue;
3136 }
3137 if (StringNCmp(buffer, "LAST_OID", 8) == 0) {
3138 ptr = buffer + 8;
3139 while (isspace((int)*ptr)) /* skip whitespace */
3140 ptr++;
3141 newline_ptr = Nlm_StrChr(ptr, '\n');
3142 if (newline_ptr != NULL) {
3143 *newline_ptr = NULLB;
3144 } else {
3145 *ptr = NULLB;
3146 }
3147 if (*ptr != NULLB) {
3148 sscanf(ptr, "%ld", &tmplong);
3149 rdbap->last_oid = tmplong;
3150 }
3151 continue;
3152 }
3153
3154 if (StringNCmp(buffer, "LENGTH", 6) == 0) {
3155 ptr = buffer;
3156 ptr += 6;
3157 while (isspace((int)*ptr)) /* skip whitespace */
3158 ptr++;
3159
3160 newline_ptr = Nlm_StrChr(ptr, '\n');
3161 if (newline_ptr != NULL) {
3162 *newline_ptr = NULLB;
3163 } else {
3164 *ptr = NULLB;
3165 }
3166 if (*ptr != NULLB)
3167 sscanf(ptr, "%lld", &(rdbap->len));
3168
3169 continue;
3170 }
3171 if (StringNCmp(buffer, "MEMB_BIT", 8) == 0) {
3172 ptr = buffer;
3173 ptr += 8;
3174 while (isspace((int)*ptr)) /* skip whitespace */
3175 ptr++;
3176 newline_ptr = Nlm_StrChr(ptr, '\n');
3177 if (newline_ptr != NULL) {
3178 *newline_ptr = NULLB;
3179 } else {
3180 *ptr = NULLB;
3181 }
3182 if (*ptr != NULLB) {
3183 tmplong = 0;
3184 sscanf(ptr, "%ld", &tmplong);
3185 rdbap->membership = tmplong;
3186 }
3187 continue;
3188 }
3189 if (StringNCmp(buffer, "NSEQ", 4) == 0) {
3190 ptr = buffer;
3191 ptr += 4;
3192 while (isspace((int)*ptr)) /* skip whitespace */
3193 ptr++;
3194
3195 newline_ptr = Nlm_StrChr(ptr, '\n');
3196 if (newline_ptr != NULL) {
3197 *newline_ptr = NULLB;
3198 } else {
3199 *ptr = NULLB;
3200 }
3201 if (*ptr != NULLB)
3202 rdbap->nseq = atol(ptr);
3203
3204 continue;
3205 }
3206 if (StringNCmp(buffer, "STATS_NSEQ", 10) == 0) {
3207 ptr = buffer;
3208 ptr += 10;
3209 while (isspace((int)*ptr)) /* skip whitespace */
3210 ptr++;
3211
3212 newline_ptr = Nlm_StrChr(ptr, '\n');
3213 if (newline_ptr != NULL) {
3214 *newline_ptr = NULLB;
3215 } else {
3216 *ptr = NULLB;
3217 }
3218 if (*ptr != NULLB)
3219 rdbap->nseq_stats = atol(ptr);
3220
3221 continue;
3222 }
3223 if (StringNCmp(buffer, "STATS_TOTLEN", 12) == 0) {
3224 ptr = buffer;
3225 ptr += 12;
3226 while (isspace((int)*ptr)) /* skip whitespace */
3227 ptr++;
3228
3229 newline_ptr = Nlm_StrChr(ptr, '\n');
3230 if (newline_ptr != NULL) {
3231 *newline_ptr = NULLB;
3232 } else {
3233 *ptr = NULLB;
3234 }
3235 if (*ptr != NULLB)
3236 rdbap->len_stats = atol(ptr);
3237
3238 continue;
3239 }
3240 if (StringNCmp(buffer, "MAXLEN", 6) == 0) {
3241 ptr = buffer;
3242 ptr += 6;
3243 while (isspace((int)*ptr)) /* skip whitespace */
3244 ptr++;
3245
3246 newline_ptr = Nlm_StrChr(ptr, '\n');
3247 if (newline_ptr != NULL) {
3248 *newline_ptr = NULLB;
3249 } else {
3250 *ptr = NULLB;
3251 }
3252 if (*ptr != NULLB)
3253 rdbap->maxlen = atol(ptr);
3254
3255 continue;
3256 }
3257 }
3258
3259 MemFree(file_path);
3260 MemFree(buffer);
3261 FILECLOSE(fp);
3262
3263 if (rdbap->dblist == NULL) {
3264 ErrPostEx(SEV_ERROR, 0, 0, "Alias file (%s) is missing DBLIST field\n",
3265 filename);
3266 return ReadDBAliasFree(rdbap);
3267 }
3268
3269 return rdbap;
3270 }
3271
3272 /*
3273 Check if an alias file contains a database by the same name. This
3274 situation will lead to an infinite recursion and we do not allow it.
3275 TRUE is returned if recursive situation found, otherwise FALSE.
3276 */
3277
CheckForRecursion(CharPtr alias_filename,CharPtr db_list)3278 static Boolean CheckForRecursion(CharPtr alias_filename, CharPtr db_list)
3279
3280 {
3281 Boolean done=FALSE;
3282 Char buffer[PATH_MAX];
3283
3284 while (!done) {
3285 done = readdb_parse_db_names(&db_list, buffer);
3286 if (*buffer == NULLB)
3287 break;
3288
3289 if (StringCmp(buffer, alias_filename) == 0)
3290 {
3291 ErrPostEx(SEV_WARNING, 0, 0,
3292 "Recursive situation detected with %s, ignoring alias file", buffer);
3293 return TRUE;
3294 }
3295 }
3296
3297 return FALSE;
3298
3299 }
3300 /* Check if .?in file exists for specified database
3301 and assign proper is_prot to rdfp->is_prot */
3302
IndexFileExists(CharPtr full_filename,ReadDBFILEPtr PNTR rdfpp,Boolean PNTR is_prot,Uint1 init_state)3303 static Int2 IndexFileExists(CharPtr full_filename, ReadDBFILEPtr PNTR rdfpp, Boolean PNTR is_prot, Uint1 init_state)
3304 {
3305 Char buffer[PATH_MAX];
3306 Int4 length = 0, i;
3307 ReadDBAliasPtr rdbap;
3308 ReadDBFILEPtr rdfp=NULL;
3309
3310 /* Check for protein and nucl. alias files first. */
3311 if (*is_prot == READDB_DB_UNKNOWN || *is_prot == READDB_DB_IS_PROT) {
3312
3313 sprintf(buffer, "%s.pal", full_filename);
3314 rdbap = readdb_read_alias_file(buffer);
3315 if (rdbap && CheckForRecursion(full_filename, rdbap->dblist) == FALSE &&
3316 (rdfp=readdb_new_ex2(rdbap->dblist, TRUE, init_state, rdbap->oidlist, rdbap->gilist)))
3317 {
3318 MemFree(rdfp->aliasfilename);
3319 rdfp->aliasfilename = StringSave(Nlm_FileNameFind(full_filename));
3320 if (rdfp->cih)
3321 rdfp->aliasfilebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids, rdfp->aliasfilename, TRUE);
3322
3323 /* In case first_oid and last_oid are given in the alias file, we
3324 * create a mask in the following lines that only selects those
3325 * ordinal id's that are in the range of first_oid and last_oid */
3326 if (rdbap->first_oid > 0) {
3327 OIDListPtr oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
3328 Int4 total, mask_index, oid, oid_bit;
3329 oidlist->total = rdbap->last_oid + 1;
3330 total = rdbap->last_oid/MASK_WORD_SIZE + 2;
3331 oidlist->list = (Uint4Ptr) MemNew (total*sizeof(Int4));
3332 oidlist->memory = oidlist->list;
3333 for (oid=rdbap->first_oid-1; oid<rdbap->last_oid; oid++) {
3334 mask_index = oid / MASK_WORD_SIZE;
3335 oid_bit =
3336 0x1 << (MASK_WORD_SIZE - 1 - oid % MASK_WORD_SIZE);
3337 oidlist->list[mask_index] |= oid_bit;
3338 }
3339 for (i=0; i<total; i++) {
3340 oidlist->list[i] = Nlm_SwapUint4(oidlist->list[i]);
3341 }
3342 oidlist->filename = StringSave(rdfp->aliasfilename);
3343 rdfp->oidlist = oidlist;
3344 }
3345
3346 *rdfpp = rdfp;
3347 /* replace standard title with new one. */
3348 if (rdbap->title) {
3349 ReadDBFILEPtr rdfp_var = NULL;
3350 if (rdfp->title) {
3351 MemFree(rdfp->title);
3352 }
3353 rdfp->title = rdbap->title;
3354 rdbap->title = NULL;
3355 /* Free all other titles since we use one from alias file. */
3356 rdfp_var = rdfp->next;
3357 while (rdfp_var) {
3358 rdfp_var->title = MemFree(rdfp_var->title);
3359 rdfp_var = rdfp_var->next;
3360 }
3361 }
3362 /* Length of the database is already calculated in alias file */
3363 if (rdbap->len) {
3364 rdfp->aliaslen = rdbap->len;
3365 }
3366 if (rdbap->nseq) {
3367 rdfp->aliasnseq = rdbap->nseq;
3368 }
3369 if (rdbap->maxlen) {
3370 rdfp->maxlen = rdbap->maxlen;
3371 }
3372 if (rdbap->nseq_stats) {
3373 rdfp->nseq_stats = rdbap->nseq_stats;
3374 }
3375 if (rdbap->len_stats) {
3376 rdfp->totlen_stats = rdbap->len_stats;
3377 }
3378
3379 rdfp->membership_bit = rdbap->membership;
3380
3381 rdbap = ReadDBAliasFree(rdbap);
3382 return 1;
3383 }
3384 rdbap = ReadDBAliasFree(rdbap);
3385 /* Try finding an index file */
3386 sprintf(buffer, "%s.pin", full_filename);
3387 length = FileLength(buffer);
3388 if (length > 0) {
3389 *is_prot = READDB_DB_IS_PROT;
3390 }
3391 }
3392 if ((*is_prot == READDB_DB_IS_NUC) || (rdfp == NULL && *is_prot == READDB_DB_UNKNOWN)) {
3393 sprintf(buffer, "%s.nal", full_filename);
3394 rdbap = readdb_read_alias_file(buffer);
3395 if (rdbap && CheckForRecursion(full_filename, rdbap->dblist) == FALSE &&
3396 (rdfp=readdb_new_ex2(rdbap->dblist, FALSE, init_state, rdbap->oidlist, rdbap->gilist)))
3397 {
3398 MemFree(rdfp->aliasfilename);
3399 rdfp->aliasfilename = StringSave(Nlm_FileNameFind(full_filename));
3400 if (rdfp->cih && rdfp->aliasfilebit == 0)
3401 rdfp->aliasfilebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids, rdfp->aliasfilename, FALSE);
3402
3403 /* In case first_oid and last_oid are given in the alias file, we
3404 * create a mask in the following lines that only selects those
3405 * ordinal id's that are in the range of first_oid and last_oid */
3406 if (rdbap->first_oid > 0) {
3407 OIDListPtr oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
3408 Int4 total, mask_index, oid, oid_bit;
3409 oidlist->total = rdbap->last_oid + 1;
3410 total = rdbap->last_oid/MASK_WORD_SIZE + 2;
3411 oidlist->list = (Uint4Ptr) MemNew (total*sizeof(Int4));
3412 oidlist->memory = oidlist->list;
3413 for (oid=rdbap->first_oid-1; oid<rdbap->last_oid; oid++) {
3414 mask_index = oid / MASK_WORD_SIZE;
3415 oid_bit =
3416 0x1 << (MASK_WORD_SIZE - 1 - oid % MASK_WORD_SIZE);
3417 oidlist->list[mask_index] |= oid_bit;
3418 }
3419 for (i=0; i<total; i++) {
3420 oidlist->list[i] = Nlm_SwapUint4(oidlist->list[i]);
3421 }
3422 oidlist->filename = StringSave(rdfp->aliasfilename);
3423 rdfp->oidlist = oidlist;
3424 }
3425
3426 *rdfpp = rdfp;
3427 /* replace standard title with new one. */
3428 if (rdbap->title) {
3429 if (rdfp->title) {
3430 MemFree(rdfp->title);
3431 }
3432 rdfp->title = rdbap->title;
3433 rdbap->title = NULL;
3434 /* Length of the database is already calculated in alias file */
3435 if (rdbap->len) {
3436 rdfp->aliaslen = rdbap->len;
3437 }
3438 if (rdbap->nseq) {
3439 rdfp->aliasnseq = rdbap->nseq;
3440 }
3441 if (rdbap->maxlen) {
3442 rdfp->maxlen = rdbap->maxlen;
3443 }
3444 if (rdbap->nseq_stats) {
3445 rdfp->nseq_stats = rdbap->nseq_stats;
3446 }
3447 if (rdbap->len_stats) {
3448 rdfp->totlen_stats = rdbap->len_stats;
3449 }
3450 rdfp->membership_bit = rdbap->membership;
3451
3452 rdfp = rdfp->next;
3453 while (rdfp) {
3454 rdfp->title = MemFree(rdfp->title);
3455 rdfp = rdfp->next;
3456 }
3457 }
3458
3459 rdbap = ReadDBAliasFree(rdbap);
3460 return 1;
3461 }
3462 rdbap = ReadDBAliasFree(rdbap);
3463 /* Try finding an index file */
3464 sprintf(buffer, "%s.nin", full_filename);
3465 length = FileLength(buffer);
3466 if (length > 0) {
3467 *is_prot = READDB_DB_IS_NUC;
3468 }
3469 }
3470
3471 if (length > 0)
3472 return 0;
3473 else
3474 return -1;
3475 }
3476
FindBlastDBFileEx(CharPtr lookup_dir,CharPtr filename)3477 static CharPtr FindBlastDBFileEx (CharPtr lookup_dir, CharPtr filename)
3478 {
3479
3480 CharPtr buffer, buffer1, envp = NULL;
3481 Int4 len;
3482
3483 /* We cannot deal with strings larger than PATH_MAX in a platform
3484 * independent manner */
3485 if (StringLen(filename) > PATH_MAX-1) {
3486 ErrPostEx(SEV_WARNING, 0, 0, "Argument to FindBlastDBFile is "
3487 "longer than PATH_MAX");
3488 return NULL;
3489 }
3490
3491 /* check current directory */
3492 len = FileLength(filename);
3493 if (len)
3494 return StringSave(filename);
3495
3496 buffer = MemNew(PATH_MAX);
3497 buffer1 = MemNew(PATH_MAX);
3498
3499 if( !lookup_dir) return NULL;
3500 StringCpy(buffer,lookup_dir);
3501
3502 sprintf(buffer1, "%s%s%s", buffer, DIRDELIMSTR, filename);
3503
3504 /* see if the file is not empty */
3505
3506 len = FileLength(buffer1);
3507
3508 MemFree(buffer);
3509
3510 if (len)
3511 return buffer1;
3512 else
3513 MemFree(buffer1);
3514
3515 /* give up */
3516 return NULL;
3517 }
3518
FindBlastDBFile(CharPtr filename)3519 CharPtr FindBlastDBFile (CharPtr filename)
3520 {
3521 Char *wrk_buf;
3522 char blast_dir[PATH_MAX];
3523 Char *one_blast_dir ;
3524 char path_delim[2];
3525 CharPtr found_name = NULL;
3526
3527 s_GetBlastDirInfo(blast_dir, path_delim);
3528
3529 /* parse paths and lookup filename */
3530 for( one_blast_dir = Nlm_StringTokMT (blast_dir,(char*)path_delim, (char **)&wrk_buf);
3531 one_blast_dir != NULL;
3532 one_blast_dir = Nlm_StringTokMT (NULL,(char*)path_delim, (char **)&wrk_buf) )
3533 {
3534 found_name = FindBlastDBFileEx(one_blast_dir,filename);
3535 if( found_name ) return found_name;
3536 }
3537 return NULL;
3538 }
3539 /*
    filename: name of the file to be opened.
3541 is_prot: three choices: protein, nucleotide, or either one.
3542 init_state: how much should be initialized.
3543 READDB_NEW_DO_ALL : initialize everything possible
3544 READDB_NEW_DO_REPORT : init enough for a report on db size etc.
3545 cih: common index
3546 */
3547
3548 static ReadDBFILEPtr
readdb_new_internalEx(CharPtr lookup_dir,CharPtr filename,Uint1 is_prot,Uint1 init_state,CommonIndexHeadPtr cih)3549 readdb_new_internalEx(CharPtr lookup_dir, CharPtr filename, Uint1 is_prot, Uint1 init_state, CommonIndexHeadPtr cih)
3550 {
3551 ReadDBFILEPtr rdfp=NULL;
3552 Char buffer[PATH_MAX], buffer1[PATH_MAX];
3553 Char commonindex_full_filename[PATH_MAX];
3554 Char database_dir[PATH_MAX] = "";
3555 Uint4 seq_type, formatdb_ver, date_length, title_length, value;
3556 Int2 status;
3557 Int4 length, num_seqs;
3558 CharPtr charptr, envp = NULL;
3559 Boolean localdb = FALSE;
3560
3561 if (filename == NULL)
3562 return NULL;
3563
3564
3565 /* We need to find out what directory to use and which index system will
3566 be used for searching OID by give GI. The algorithm is:
3567 Define blast database directory by present database searching
3568 in the following order (and stopping when it is found):
3569 1) If absolute path was given, only this file is attempted.
3570 2) Current working directory
3571 3) getenv("BLASTDB")
3572 4) .ncbirc file: a) current working directory, b) home directory, c)
3573 NCBI directory (obtained from the environment)
3574 Then defind which index system to use. If "CommonIndex" then
3575 we need all CommonIndex and ISAM files to be present in database directory,
3576 if ISAM, them the only ISAM index files should be present in the
3577 current directory
3578 */
3579
3580
3581 /* first see in the current directory */
3582
3583 if ((status=IndexFileExists(filename, &rdfp, &is_prot,
3584 init_state)) >= 0) {
3585
3586 if (status > 0)
3587 return rdfp;
3588
3589 /* use current directory */
3590 charptr = Nlm_FilePathFind(filename);
3591 StringCpy(database_dir, charptr);
3592 MemFree(charptr);
3593 localdb = TRUE;
3594 rdfp = readdb_destruct(rdfp);
3595 } else {
3596 /* set passed directory location */
3597 if( !lookup_dir ) return NULL;
3598 StringCpy(buffer, lookup_dir);
3599
3600 sprintf(buffer1, "%s%s%s", buffer, DIRDELIMSTR, filename);
3601 if ((status=IndexFileExists(buffer1, &rdfp, &is_prot,
3602 init_state)) >= 0) {
3603 if (status > 0){
3604 return rdfp;
3605 }
3606 /* database file is in directory 'buffer' */
3607 StringCpy(database_dir, buffer);
3608 rdfp = readdb_destruct(rdfp);
3609 }
3610 }
3611
3612 /* ATTENTION: at this point database_dir contains file directory name */
3613
3614 rdfp = readdb_destruct(rdfp);
3615 rdfp = ReadDBFILENew();
3616
3617 if (rdfp == NULL)
3618 return NULL;
3619
3620 rdfp->filename = StringSave(filename);
3621
3622 /*rdfp->is_prot = is_prot;*/
3623 if (is_prot)
3624 rdfp->parameters |= READDB_IS_PROT;
3625
3626
3627 /* Here we know that database is in database_dir directory */
3628
3629 /* constract full file name */
3630 if (!StringCmp(database_dir, "")) {
3631 sprintf(rdfp->full_filename, "%s", Nlm_FileNameFind(filename));
3632 } else if (!localdb) {
3633 sprintf(rdfp->full_filename, "%s%s%s", database_dir, DIRDELIMSTR, filename);
3634 } else {
3635 sprintf(rdfp->full_filename, "%s%s%s", database_dir, DIRDELIMSTR, Nlm_FileNameFind(filename));
3636 }
3637
3638 /* Now let's find out which index system to use */
3639
3640 /* First see if user has preferences */
3641 StringCpy(buffer1, "CommonIndex");
3642
3643 if (getenv("INDEX_SYSTEM") &&
3644 StringCmp(getenv("INDEX_SYSTEM"), "CommonIndex"))
3645 StringCpy(buffer1, "ISAM");
3646
3647 Nlm_GetAppParam ("NCBI", "BLAST", "INDEX_SYSTEM", buffer1,
3648 buffer, PATH_MAX);
3649
3650 isCommonIndex = !StrCmp("CommonIndex", buffer);
3651
3652 /* now we know that if isCommonIndex == TRUE, than it is
3653 prefered to use CommonIndex */
3654
3655 /* test if there exist common index file */
3656 if (isCommonIndex) {
3657 if (!StringCmp(database_dir, "")) {
3658 sprintf(commonindex_full_filename, "%s", COMMONINDEX_FN);
3659 } else {
3660 sprintf(commonindex_full_filename, "%s%s%s", database_dir, DIRDELIMSTR, COMMONINDEX_FN);
3661 }
3662
3663 if (!(length = FileLength(commonindex_full_filename))) {
3664 /* no CommonIndex files in this directory, try to use ISAM only */
3665 isCommonIndex = FALSE;
3666 }
3667 }
3668
3669 /* check if present main three files: index, sequences, headers */
3670
3671 sprintf(buffer, "%s.%cin", rdfp->full_filename, is_prot? 'p':'n');
3672 if((rdfp->indexfp = NlmOpenMFILE(buffer)) == NULL) {
3673 ErrPostEx(SEV_WARNING, 0, 0, "Unable to open %s", buffer);
3674 rdfp = readdb_destruct(rdfp);
3675 return rdfp;
3676 }
3677
3678 if (init_state & READDB_NEW_DO_ALL)
3679 if (ReadDBOpenMHdrAndSeqFiles(rdfp) == FALSE)
3680 ErrPostEx(SEV_ERROR, 0, 0,
3681 "ReadDBOpenMHdrAndSeqFiles: failed to map files\n");
3682
3683 /* fill in other fields of rdfp-> */
3684 NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3685 formatdb_ver = Nlm_SwapUint4(value);
3686
3687 /* Here we will handle version of formatdb program */
3688
3689 if (formatdb_ver != FORMATDB_VER && formatdb_ver != FORMATDB_VER_TEXT) {
3690 ErrPostEx(SEV_WARNING, 0, 0, "readdb: wrong version of formatdb "
3691 "was used to make database %s.", filename);
3692 rdfp = readdb_destruct(rdfp);
3693 return NULL;
3694 }
3695 rdfp->formatdb_ver = formatdb_ver;
3696
3697 NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3698 seq_type = Nlm_SwapUint4(value);
3699 if ((is_prot && seq_type == 0) || (!is_prot && seq_type == 1)) {
3700 rdfp = readdb_destruct(rdfp);
3701 return rdfp;
3702 }
3703 NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3704 title_length = Nlm_SwapUint4(value);
3705
3706 if (title_length) {
3707 rdfp->title = (CharPtr)Nlm_Malloc((title_length+1)*sizeof(Char));
3708 NlmReadMFILE((Uint1Ptr) rdfp->title, title_length, 1, rdfp->indexfp);
3709 rdfp->title[title_length] = NULLB;
3710 } else { /* Use the filename, if there is no title. */
3711 rdfp->title = StringSave(rdfp->filename);;
3712 }
3713
3714 NlmReadMFILE((Uint1Ptr) &value, 4, 1, rdfp->indexfp);
3715 date_length = Nlm_SwapUint4(value);
3716
3717 rdfp->date = (CharPtr)Nlm_Malloc((date_length+1)*sizeof(Char));
3718 NlmReadMFILE((Uint1Ptr) rdfp->date, date_length, 1, rdfp->indexfp);
3719 rdfp->date[date_length] = NULLB;
3720
3721 NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3722 num_seqs = rdfp->num_seqs = Nlm_SwapUint4(value);
3723
3724 if (formatdb_ver == FORMATDB_VER_TEXT)
3725 {
3726 NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3727 rdfp->totlen = Nlm_SwapUint4(value);
3728 }
3729 else
3730 {
3731 rdfp->totlen = FormatDbUint8Read(rdfp->indexfp);
3732 }
3733
3734 NlmReadMFILE((Uint1Ptr) &(value), 4, 1, rdfp->indexfp);
3735 rdfp->maxlen = Nlm_SwapUint4(value);
3736
3737 /* Initializing taxonomy names database if it exists (only once!) */
3738 if (rdfp->formatdb_ver > FORMATDB_VER_TEXT &&
3739 init_state & READDB_NEW_DO_TAXDB && taxonomyDbLoaded == FALSE) {
3740 rdfp->taxinfo = RDBTaxInfoInit();
3741 taxonomyDbLoaded = TRUE;
3742 }
3743
3744 if (init_state & READDB_NEW_DO_REPORT) {
3745 rdfp->parameters |= READDB_CONTENTS_ALLOCATED;
3746 /*rdfp->contents_allocated = TRUE; */
3747 /* Some was allocated, but index pointers are NULLs - that's OK */
3748 return rdfp;
3749 }
3750
3751 if (!((title_length + date_length)%4) && rdfp->indexfp->mfile_true) {
3752 rdfp->header_index = (Uint4Ptr) rdfp->indexfp->mmp;
3753 rdfp->indexfp->mmp += 4 * (num_seqs+1);
3754
3755 rdfp->sequence_index = (Uint4Ptr) rdfp->indexfp->mmp;
3756 rdfp->indexfp->mmp += 4 * (num_seqs+1);
3757
3758 rdfp->ambchar_index = (Uint4Ptr) rdfp->indexfp->mmp;
3759 rdfp->indexfp->mmp += 4 * (num_seqs+1);
3760 } else {
3761 /* Use old stuff */
3762
3763 if((rdfp->header_index =
3764 (Uint4Ptr) Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3765 rdfp = readdb_destruct(rdfp);
3766 return rdfp;
3767 }
3768
3769 rdfp->header_index_start = rdfp->header_index;
3770 rdfp->header_index_offset = NlmTellMFILE(rdfp->indexfp);
3771 NlmReadMFILE((Uint1Ptr) rdfp->header_index, 4, num_seqs+1,
3772 rdfp->indexfp);
3773
3774 if((rdfp->sequence_index =
3775 (Uint4Ptr)Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3776 rdfp = readdb_destruct(rdfp);
3777 return rdfp;
3778 }
3779 rdfp->sequence_index_start = rdfp->sequence_index;
3780 NlmReadMFILE((Uint1Ptr) rdfp->sequence_index, 4, num_seqs+1,
3781 rdfp->indexfp);
3782
3783 /* For nucleotide sequence we will process ambiguity file */
3784 if(!is_prot) {
3785 if((rdfp->ambchar_index = (Uint4Ptr)Nlm_Malloc((num_seqs+1)*sizeof(Uint4))) == NULL) {
3786 rdfp = readdb_destruct(rdfp);
3787 return rdfp;
3788 }
3789 rdfp->ambchar_index_start = rdfp->ambchar_index;
3790 NlmReadMFILE((Uint1Ptr) rdfp->ambchar_index, 4, num_seqs+1, rdfp->indexfp);
3791 }
3792 }
3793
3794
3795 /* Contents were allocated above. */
3796 /*rdfp->contents_allocated = TRUE;*/
3797 rdfp->parameters |= READDB_CONTENTS_ALLOCATED;
3798
3799 /* mmap is not being used, allocate a buffer 2 longer (for sentinel bytes)
3800 than the longest subject length. */
3801 if (rdfp->sequencefp && rdfp->sequencefp->mfile_true == FALSE) {
3802 rdfp->buffer = (UcharPtr)Nlm_Malloc((2+rdfp->maxlen)*sizeof(Uint1));
3803 if (rdfp->buffer == NULL) {
3804 rdfp = readdb_destruct(rdfp);
3805 return rdfp;
3806 }
3807 rdfp->allocated_length = 2 + rdfp->maxlen;
3808 }
3809
3810 /* Now initializing Numeric ISAM indexes */
3811 sprintf(buffer, "%s.%cnd", rdfp->full_filename, is_prot? 'p':'n');
3812 sprintf(buffer1, "%s.%cni", rdfp->full_filename, is_prot? 'p':'n');
3813
3814 if(FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3815 if((rdfp->nisam_opt = ISAMObjectNew(ISAMNumeric,
3816 buffer, buffer1)) == NULL) {
3817 ErrPostEx(SEV_WARNING, 0, 0, "Failed to create NISAM object");
3818 rdfp = readdb_destruct(rdfp);
3819 return rdfp;
3820 }
3821 }
3822
3823 /* Now initializing string ISAM indexes */
3824
3825 sprintf(buffer, "%s.%csd", rdfp->full_filename, is_prot? 'p':'n');
3826 sprintf(buffer1, "%s.%csi", rdfp->full_filename, is_prot? 'p':'n');
3827
3828 if(FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3829
3830 if((rdfp->sisam_opt = ISAMObjectNew(ISAMString,
3831 buffer, buffer1)) == NULL) {
3832 ErrPostEx(SEV_WARNING, 0, 0, "Failed to create SISAM object");
3833 rdfp = readdb_destruct(rdfp);
3834 return rdfp;
3835 }
3836
3837 /* This line may be given only for information - how to access
3838 this parameter. We need to intialize ISAM database before
3839 this parameter is available using function above */
3840 rdfp->sparse_idx = ((ISAMDataPtr) rdfp->sisam_opt)->idx_option;
3841 }
3842
3843 /* Now initializing PIG ISAM indexes */
3844 if (is_prot) {
3845 sprintf(buffer, "%s.ppd", rdfp->full_filename);
3846 sprintf(buffer1, "%s.ppi", rdfp->full_filename);
3847
3848 if (FileLength(buffer) != 0 && FileLength(buffer1) != 0) {
3849 if ( !(rdfp->isam_pig = ISAMObjectNew(ISAMNumeric,
3850 buffer, buffer1))) {
3851 ErrPostEx(SEV_WARNING, 0, 0, "Failed to read PIG ISAM object");
3852 rdfp = readdb_destruct(rdfp);
3853 return rdfp;
3854 }
3855 }
3856 }
3857
3858
3859 /* Now initializing Common index files */
3860 if (isCommonIndex) {
3861 if (cih) {
3862 rdfp->cih = cih;
3863 /*rdfp->handle_common_index = FALSE;*/
3864 rdfp->parameters &= ~READDB_HANDLE_COMMON_INDEX;
3865 } else {
3866 rdfp->cih = CommonIndexInit(commonindex_full_filename);
3867 /*rdfp->handle_common_index = TRUE;*/
3868 rdfp->parameters |= READDB_HANDLE_COMMON_INDEX;
3869 }
3870 if (!(rdfp->cih)) {
3871 isCommonIndex = FALSE;
3872 /*rdfp->handle_common_index = FALSE;*/
3873 rdfp->parameters &= ~READDB_HANDLE_COMMON_INDEX;
3874 } else {
3875 rdfp->filebit = DBShift(rdfp->cih->num_of_DBs, rdfp->cih->dbids,
3876 Nlm_FileNameFind(filename), is_prot);
3877 }
3878 }
3879
3880 /* Initialize shared information structure */
3881 rdfp->shared_info = (ReadDBSharedInfoPtr) MemNew(sizeof(ReadDBSharedInfo));
3882
3883 /* Without this, FDReadDeflineAsn will fail in multi-threaded mode! */
3884 if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
3885 SeqEntryLoad();
3886 fdlobjAsnLoad();
3887 }
3888
3889 return rdfp;
3890 }
3891
3892 /*
3893 filename: name of the file to be openend.
3894 is_prot: three choices: protein, nucleotide, or either one.
3895 init_state: how much should be initialized.
3896 READDB_NEW_DO_ALL : initialize everything possible
3897 READDB_NEW_DO_REPORT : init enough for a report on db size etc.
3898 cih: common index
3899 */
3900 static ReadDBFILEPtr
readdb_new_internal(CharPtr filename,Uint1 is_prot,Uint1 init_state,CommonIndexHeadPtr cih)3901 readdb_new_internal(CharPtr filename, Uint1 is_prot, Uint1 init_state, CommonIndexHeadPtr cih)
3902 {
3903 ReadDBFILEPtr ret_rdfp = NULL;
3904 Char *wrk_buf;
3905 char blast_dir[PATH_MAX];
3906 Char *one_blast_dir;
3907 char path_delim[2];
3908 if( (ret_rdfp = readdb_new_internalEx(NULL,filename,is_prot,init_state,cih)) ){
3909 return ret_rdfp;
3910 }
3911
3912 s_GetBlastDirInfo(blast_dir, path_delim);
3913
3914 for( one_blast_dir = Nlm_StringTokMT (blast_dir,(char*)path_delim, (char **)&wrk_buf);
3915 one_blast_dir != NULL;
3916 one_blast_dir = Nlm_StringTokMT (NULL,(char*)path_delim, (char **)&wrk_buf) )
3917 {
3918 ret_rdfp = readdb_new_internalEx( one_blast_dir,filename,is_prot,init_state,cih);
3919 if( ret_rdfp ) return ret_rdfp;
3920
3921 /* Try it again, stripping any path off the filename. */
3922 ret_rdfp = readdb_new_internalEx( one_blast_dir, Nlm_FileNameFind(filename),is_prot,init_state,cih);
3923 if( ret_rdfp ) return ret_rdfp;
3924 }
3925 return NULL;
3926 }
3927
/*
   Releases an OIDList and what it holds.

   When oidlist->memory is set it is freed with MemFree; otherwise
   oidlist->mfp is handed to NlmCloseMFILE.  The filename (if any) and the
   structure itself are then freed.

   Always returns NULL, so callers can write "p = OIDListFree(p);".
   A NULL argument is accepted and ignored.
*/
OIDListPtr OIDListFree (OIDListPtr oidlist)
{
    if (oidlist != NULL) {
        if (!oidlist->memory)
            NlmCloseMFILE(oidlist->mfp);
        else
            MemFree(oidlist->memory);

        if (oidlist->filename)
            MemFree(oidlist->filename);

        MemFree(oidlist);
    }

    return NULL;
}
3946
3947
ReadOIDList(OIDListPtr oidlist)3948 Boolean ReadOIDList (OIDListPtr oidlist)
3949 {
3950 NlmMFILEPtr mmfile;
3951 Int4 length;
3952
3953
3954
3955 /* format of the file:
3956 <number of OID's - N>
3957 <oid1>
3958 <oid2>
3959 ...
3960 <oidN>
3961 */
3962
3963 /* use memmap */
3964 mmfile = NlmOpenMFILE(oidlist->filename);
3965 if (!mmfile) {
3966 ErrPostEx(SEV_ERROR, 0, 0, "Could not open OID mask %s\n",
3967 oidlist->filename);
3968 return FALSE;
3969 }
3970
3971 if (mmfile->mfile_true == FALSE)
3972 {
3973 length = FileLength(oidlist->filename);
3974 oidlist->memory = MemNew(length);
3975 if (oidlist->memory == NULL)
3976 return FALSE;
3977 FileRead(oidlist->memory, length, 1, mmfile->fp);
3978 oidlist->list = oidlist->memory + 1;
3979 oidlist->total = Nlm_SwapUint4(*((Int4Ptr) oidlist->memory));
3980 NlmCloseMFILE(mmfile);
3981 }
3982 else
3983 {
3984 oidlist->list = (Uint4Ptr) mmfile->mmp_begin + 1;
3985 oidlist->mfp = mmfile;
3986 oidlist->total = Nlm_SwapUint4(*((Int4Ptr) mmfile->mmp_begin));
3987 }
3988
3989 return TRUE;
3990 }
3991
3992 Int4 LIBCALL
readdb_validate(ReadDBFILEPtr rdfp)3993 readdb_validate (ReadDBFILEPtr rdfp)
3994 {
3995 Int4 retval = READDB_VALID;
3996
3997 if ( !rdfp ) {
3998 return READDB_INVALID_NULL_ARG;
3999 }
4000
4001 /* Verify that all elements of the rdfp linked list are either protein or
4002 * nucleotide */
4003 {
4004 Boolean is_prot = (rdfp->parameters & READDB_IS_PROT) ? TRUE : FALSE;
4005 for (; rdfp; rdfp = rdfp->next) {
4006 if ((rdfp->parameters & READDB_IS_PROT) && !is_prot) {
4007 retval = READDB_INVALID_MIXED_DBS;
4008 break;
4009 }
4010 }
4011 }
4012
4013 return retval;
4014 }
4015
4016 ReadDBFILEPtr LIBCALL
readdb_new_ex(CharPtr filename,Uint1 is_prot,Boolean init_indices)4017 readdb_new_ex (CharPtr filename, Uint1 is_prot, Boolean init_indices)
4018
4019 {
4020 return readdb_new_ex2(filename, is_prot, READDB_NEW_INDEX, NULL, NULL);
4021 }
4022
4023 /* Maximum number of rdfp structures during calls to readdb_new_ex2 before we
4024 * call readdb_merge_gifiles */
4025 #define RDFP_THRESHOLD 10
4026
4027 ReadDBFILEPtr LIBCALL
readdb_new_ex2(CharPtr filename,Uint1 is_prot,Uint1 init_state,CharPtr oidlist,CharPtr gilist)4028 readdb_new_ex2 (CharPtr filename, Uint1 is_prot, Uint1 init_state, CharPtr oidlist, CharPtr gilist)
4029
4030 {
4031 Boolean done = FALSE, duplicate_db;
4032 Char buffer[PATH_MAX], buffer_oidlist[PATH_MAX];
4033 Int4 start=0, old_start = 0;
4034 ReadDBFILEPtr new, tmp, var, var1, rdfp_w_oidlist, var2;
4035 CommonIndexHeadPtr cih = NULL;
4036 Int4 num_whole_db = 0, i, rdfp_ctr = 0;
4037
4038 new = NULL;
4039 rdfp_w_oidlist = NULL;
4040 buffer_oidlist[0] = NULLB;
4041
4042 while (!done) {
4043 done = readdb_parse_db_names(&filename, buffer);
4044 if (*buffer == NULLB)
4045 break;
4046 if (oidlist) { /* NOTE: no account taken of duplicate databases?? */
4047 readdb_parse_db_names(&oidlist, buffer_oidlist);
4048 if (*buffer_oidlist == NULLB)
4049 break;
4050 }
4051 /* Look for duplicates of the database names. */
4052 duplicate_db = FALSE;
4053 var1 = new;
4054 while (var1) {
4055 if (StringCmp(readdb_get_filename(var1), buffer) == 0) {
4056 duplicate_db = TRUE;
4057 break;
4058 }
4059 var1 = var1->next;
4060 }
4061 if (duplicate_db)
4062 continue;
4063
4064 /* 'continue' if return is NULL in case only one of many databases can't
4065 be found. Warning issued by readdb_new_internal. */
4066 if(!(tmp = readdb_new_internal(buffer, is_prot, init_state, cih)))
4067
4068 continue;
4069
4070 if (tmp->cih) {
4071 cih = tmp->cih;
4072 }
4073
4074 while (tmp) {
4075 if (tmp->oidlist) {
4076 /* Save these separately. */
4077 if (rdfp_w_oidlist == NULL) {
4078 rdfp_w_oidlist = tmp;
4079 } else {
4080 var = rdfp_w_oidlist;
4081 while (var->next)
4082 var = var->next;
4083 var->next = tmp;
4084 }
4085 } else {
4086 if (!(tmp->parameters & READDB_NOT_FIRST_TIME)) {
4087 tmp->parameters |= READDB_NOT_FIRST_TIME;
4088 if (buffer_oidlist[0] != NULLB) {
4089
4090 /* read this OID list */
4091 tmp->oidlist = (OIDListPtr) MemNew (sizeof(OIDList));
4092 tmp->oidlist->filename = StringSave(buffer_oidlist);
4093 if (!ReadOIDList(tmp->oidlist)) {
4094 return NULL;
4095 }
4096 }
4097 }
4098
4099 if (new == NULL) {
4100 new = tmp;
4101 } else {
4102 var = new;
4103 while(var->next)
4104 var = var->next;
4105 var->next = tmp;
4106 }
4107 }
4108 if (gilist) {
4109 /*tmp->gifile = StringSave(gilist); CC: No need for this */
4110 tmp->gilist = Int4ListReadFromFile(gilist);
4111 }
4112 var = tmp->next;
4113 tmp->next = NULL;
4114 tmp = var;
4115 }
4116
4117 /* If we have more than RDFP_THRESHOLD elements in new, try to
4118 * compress merge rdfp's that have the same underlying blast database
4119 * so that we don't mmap too many index files. This is not an issue if
4120 * init_state is READDB_NEW_DO_REPORT. */
4121 for (var2 = new, rdfp_ctr = 0; var2; var2 = var2->next, rdfp_ctr++) ;
4122 if (rdfp_ctr > RDFP_THRESHOLD && !(init_state & READDB_NEW_DO_REPORT))
4123 new = readdb_merge_gifiles(new);
4124 }
4125
4126 /* Attach the RDFP's with an OID.
4127 Check if any of them are already present as complete databases */
4128 {{
4129 if (rdfp_w_oidlist) {
4130 if (new == NULL) {
4131 num_whole_db = 0;
4132 new = rdfp_w_oidlist;
4133 var = NULL;
4134 } else {
4135 num_whole_db = 1;
4136 var = new;
4137 while(var->next) {
4138 num_whole_db++;
4139 var = var->next;
4140 }
4141 var->next = rdfp_w_oidlist;
4142 }
4143 }
4144
4145 if (num_whole_db > 0) {
4146 var1 = var;
4147 while (rdfp_w_oidlist) {
4148 for (i=0, var = new; i<num_whole_db; i++, var = var->next) {
4149 if (StringCmp(var->full_filename, rdfp_w_oidlist->full_filename)
4150 == 0) {
4151 var1->next = rdfp_w_oidlist->next;
4152 rdfp_w_oidlist->next = NULL;
4153 readdb_destruct(rdfp_w_oidlist);
4154 rdfp_w_oidlist = var1->next;
4155 break;
4156 }
4157 }
4158 if (i==num_whole_db) {
4159 var1 = rdfp_w_oidlist;
4160 rdfp_w_oidlist = rdfp_w_oidlist->next;
4161 }
4162 }
4163 }
4164 }}
4165
4166 /* For databases such as the Microbial blast databases, where the list of
4167 * databases includes many of the same underlying database and different
4168 * gi lists to specify a subset of the database, concatenate the gi lists
4169 * and keep only one copy of the underlying rdfp structure (avoid mmap'ing
4170 * the same index file multiple times). This is not an issue if init_state
4171 * is READDB_NEW_DO_REPORT. */
4172 if (!(init_state & READDB_NEW_DO_REPORT))
4173 new = readdb_merge_gifiles(new);
4174
4175 /* adjust all the RDFP's. */
4176 tmp = new;
4177 start = 0;
4178 while (tmp) {
4179 /* this may have been adjusted on a previous call to this function,
4180 readjust for indices. */
4181 old_start = tmp->start;
4182 tmp->start = start;
4183 tmp->stop = tmp->num_seqs-1+start;
4184 tmp->ambchar_index -= (start-old_start);
4185 tmp->header_index -= (start-old_start);
4186 tmp->sequence_index -= (start-old_start);
4187
4188 start = tmp->stop+1;
4189 tmp = tmp->next;
4190 }
4191
4192 if (new)
4193 /*new->not_first_time = FALSE;*/
4194 new->parameters &= ~READDB_NOT_FIRST_TIME;
4195 return new;
4196 }
4197
4198 ReadDBFILEPtr LIBCALL
readdb_new(CharPtr filename,Uint1 is_prot)4199 readdb_new (CharPtr filename, Uint1 is_prot)
4200
4201 {
4202
4203 return readdb_new_ex(filename, is_prot, TRUE);
4204 }
4205
4206 /*
4207 Get total length and number of sequences in multiple databases.
4208 */
4209
4210 Boolean LIBCALL
readdb_get_totals(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num)4211 readdb_get_totals(ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num)
4212
4213 {
4214 return readdb_get_totals_ex(rdfp_list, total_len, total_num, FALSE);
4215 }
4216
4217
4218
4219 /*
4220 Get total length and number of sequences in multiple databases.
4221 if 'use_alias' is TRUE, values from the alias file will be used
4222 if non-zero.
4223 */
4224
4225 Boolean LIBCALL
readdb_get_totals_ex(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias)4226 readdb_get_totals_ex(ReadDBFILEPtr rdfp_list, Int8Ptr total_len, Int4Ptr total_num, Boolean use_alias)
4227
4228 {
4229 return readdb_get_totals_ex2(rdfp_list, total_len, total_num, use_alias,
4230 FALSE);
4231 }
4232
4233 /* retrieves the total number of sequences and database length in the
4234 * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
4235 * options: use_virtual_oidlist assumes this rdfp_list has been processed by
4236 * BlastProcessGiLists. */
4237 Boolean LIBCALL
readdb_get_totals_ex2(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias,Boolean use_virtual_oidlist)4238 readdb_get_totals_ex2 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr total_len,
4239 Int4Ptr total_num, Boolean use_alias, Boolean use_virtual_oidlist))
4240 {
4241 return readdb_get_totals_ex3(rdfp_list, total_len, total_num, use_alias,
4242 use_virtual_oidlist, eExact);
4243 }
4244
4245 /* retrieves the total number of sequences and database length in the
4246 * rdfp_list. use_alias and use_virtual_oidlist are mutually exclusive
4247 * options: use_virtual_oidlist assumes this rdfp_list has been processed by
4248 * BlastProcessGiLists. */
4249 Boolean LIBCALL
readdb_get_totals_ex3(ReadDBFILEPtr rdfp_list,Int8Ptr total_len,Int4Ptr total_num,Boolean use_alias,Boolean use_virtual_oidlist,EAccountingMode acc_mode)4250 readdb_get_totals_ex3 PROTO ((ReadDBFILEPtr rdfp_list, Int8Ptr total_len,
4251 Int4Ptr total_num, Boolean use_alias, Boolean use_virtual_oidlist,
4252 EAccountingMode acc_mode))
4253 {
4254 ReadDBFILEPtr rdfp;
4255 OIDListPtr virtual_oidlist = NULL;
4256 Uint4 maskindex, i, base = 0, total_mask;
4257 Uint4 mask;
4258 typedef Int4 (LIBCALL *fun_ptr) (ReadDBFILEPtr, Int4);
4259
4260 fun_ptr get_sequence_length = (acc_mode == eExact ?
4261 &readdb_get_sequence_length :
4262 &readdb_get_sequence_length_approx);
4263 *total_len = 0;
4264 *total_num = 0;
4265
4266 if (rdfp_list == NULL || total_len == NULL || total_num == NULL)
4267 return FALSE;
4268
4269 if (use_alias && use_virtual_oidlist)
4270 return FALSE;
4271
4272 if (use_virtual_oidlist) {
4273
4274 for (rdfp = rdfp_list; rdfp; rdfp = rdfp->next) {
4275
4276 if ((virtual_oidlist = rdfp->oidlist)) {
4277 total_mask = virtual_oidlist->total/MASK_WORD_SIZE + 1;
4278 maskindex = 0;
4279
4280 while (maskindex < total_mask){
4281 mask = SwapUint4(virtual_oidlist->list[maskindex]);
4282 i = 0;
4283 while (mask) {
4284 if ((mask & (((Uint4)0x1) << (MASK_WORD_SIZE-1)))) {
4285 (*total_num)++;
4286 *total_len += (*get_sequence_length)(rdfp_list,
4287 base+i);
4288 }
4289 mask <<= 1;
4290 i++;
4291 }
4292 maskindex++;
4293 base += MASK_WORD_SIZE;
4294 }
4295 break; /* virtual oidlist is always the last oidlist */
4296 }
4297 *total_len += readdb_get_dblen(rdfp);
4298 *total_num += readdb_get_num_entries(rdfp);
4299 }
4300 } else {
4301
4302 while (rdfp_list) {
4303
4304 /* Note well: This assumes that the information in the alias file is
4305 * accurate, if the aliaslen and aliasnseq fields are inaccurate, so
4306 * will be the total number of sequences and database length
4307 * returned */
4308 if (use_alias && rdfp_list->aliasfilename) {
4309 if (rdfp_list->aliaslen >= 0
4310 && (rdfp_list->oidlist || rdfp_list->gifile ||
4311 rdfp_list->gilist))
4312 *total_len += rdfp_list->aliaslen;
4313 else
4314 *total_len += readdb_get_dblen(rdfp_list);
4315
4316 if (rdfp_list->aliasnseq >= 0
4317 && (rdfp_list->oidlist || rdfp_list->gifile ||
4318 rdfp_list->gilist))
4319 *total_num += rdfp_list->aliasnseq;
4320 else
4321 *total_num += readdb_get_num_entries(rdfp_list);
4322 } else {
4323 *total_len += readdb_get_dblen(rdfp_list);
4324 *total_num += readdb_get_num_entries(rdfp_list);
4325 }
4326 rdfp_list = rdfp_list->next;
4327 }
4328 }
4329
4330 return TRUE;
4331
4332 }
4333
4334
4335 /*
4336 Gets the number to be used for statistical purposes. Should be set in
4337 alias file as STATS_NSEQ and STATS_TOTLEN.
4338 */
4339 Boolean LIBCALL
readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list,Int4 * num_seq_stats,Int8 * tot_len_stats)4340 readdb_get_stats_numbers(ReadDBFILEPtr rdfp_list, Int4* num_seq_stats, Int8* tot_len_stats)
4341 {
4342 Int4 num_seqs=0;
4343 Int8 tot_len=0;
4344
4345 if (rdfp_list == NULL)
4346 return FALSE;
4347
4348 while (rdfp_list)
4349 {
4350 num_seqs += rdfp_list->nseq_stats;
4351 tot_len += rdfp_list->totlen_stats;
4352 rdfp_list = rdfp_list->next;
4353 }
4354 *num_seq_stats = num_seqs;
4355 *tot_len_stats = tot_len;
4356 return TRUE;
4357 }
4358
4359
4360
4361 /*
4362 Checks whether a ReadDBFILEPtr is the original, or just attaced.
4363 It does this by checking the rdfp->contents_allocated flag.
4364 */
4365 Boolean LIBCALL
readdb_copy(ReadDBFILEPtr rdfp)4366 readdb_copy (ReadDBFILEPtr rdfp)
4367
4368 {
4369 if (rdfp == NULL)
4370 return FALSE;
4371
4372 /* if allocated, this is not a copy. */
4373 /*if (rdfp->contents_allocated)*/
4374 if (rdfp->parameters & READDB_CONTENTS_ALLOCATED)
4375 return FALSE;
4376
4377 return TRUE;
4378 }
4379
4380 /* Compare rdfp1 with rdfp2 for identical:
4381 molecule type (prot/nucl)
4382 total number of bases/residues
4383 maximum sequence length
4384 file name
4385 date of creation
4386 membership_bit
4387 oidlist
4388 */
4389 Boolean
readdb_compare_basic(ReadDBFILEPtr rdfp1,ReadDBFILEPtr rdfp2)4390 readdb_compare_basic(ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2)
4391 {
4392 if (rdfp1 == NULL || rdfp2 == NULL)
4393 return FALSE;
4394
4395 if (rdfp1 == rdfp2)
4396 return TRUE;
4397
4398 /*if (rdfp1->is_prot != rdfp2->is_prot)*/
4399 if ((rdfp1->parameters & READDB_IS_PROT) !=
4400 (rdfp2->parameters & READDB_IS_PROT))
4401 return FALSE;
4402
4403 if (rdfp1->totlen != rdfp2->totlen)
4404 return FALSE;
4405
4406 if (rdfp1->maxlen != rdfp2->maxlen)
4407 return FALSE;
4408
4409 if (StringCmp(rdfp1->filename, rdfp2->filename) != 0)
4410 return FALSE;
4411
4412 if (StringCmp(rdfp1->date, rdfp2->date) != 0)
4413 return FALSE;
4414
4415 if (rdfp1->membership_bit != rdfp2->membership_bit)
4416 return FALSE;
4417
4418 if ((rdfp1->oidlist!=NULL && rdfp2->oidlist==NULL) ||
4419 (rdfp1->oidlist==NULL && rdfp2->oidlist!=NULL))
4420 return FALSE;
4421
4422 /* If both have a valid oidlist ... */
4423 if ((rdfp1->oidlist && rdfp2->oidlist) &&
4424 (rdfp1->oidlist->filename && rdfp2->oidlist->filename) &&
4425 /* but different filenames, then they must have different oidlists */
4426 (StringCmp(rdfp1->oidlist->filename, rdfp2->oidlist->filename) != 0))
4427 return FALSE;
4428
4429 return TRUE;
4430 }
4431
4432 /*
4433 Check whether two different ReadDBFILEPtr refer to the
4434 same database.
4435
4436 If they are, then TRUE is returned.
4437 */
4438 Boolean LIBCALL
readdb_compare(ReadDBFILEPtr rdfp1,ReadDBFILEPtr rdfp2)4439 readdb_compare(ReadDBFILEPtr rdfp1, ReadDBFILEPtr rdfp2)
4440 {
4441 Boolean same_title = (StringCmp(rdfp1->title, rdfp2->title) == 0);
4442
4443 return (same_title && readdb_compare_basic(rdfp1, rdfp2));
4444 }
4445
4446
4447 /* This function attempts to merge the contents of rdfp->gilist(s) of those
4448 * rdfp's in rdfp_chain that have the same underlying blast database. This is
4449 * done so that we don't mmap the same index files multiple times. */
readdb_merge_gifiles(ReadDBFILEPtr rdfp_chain)4450 static ReadDBFILEPtr readdb_merge_gifiles (ReadDBFILEPtr rdfp_chain)
4451 {
4452 register ReadDBFILEPtr rdfp = NULL, temp = NULL, prev = NULL;
4453 CharPtr title = NULL;
4454 Int4 title_len = 0;
4455
4456 for (rdfp = prev = rdfp_chain; rdfp; rdfp = rdfp->next, prev = rdfp) {
4457
4458 for (temp = rdfp->next; temp; prev = temp, temp = temp->next) {
4459
4460 if (!readdb_compare_basic(rdfp, temp))
4461 continue;
4462 /* rdfp and temp have the same underlying database, so we combine
4463 them */
4464 prev->next = temp->next;
4465 temp->next = NULL;
4466
4467 /*** Merge the gilists, if any ***/
4468 if (temp->gilist) {
4469 rdfp->gilist = Int4ListConcat(&rdfp->gilist, &temp->gilist);
4470 ASSERT(rdfp->gifile == NULL && temp->gifile == NULL);
4471 }
4472
4473 /*** Keep track of the length and number of sequences according to
4474 * the gi lists ***/
4475 rdfp->aliaslen += temp->aliaslen;
4476 rdfp->aliasnseq += temp->aliasnseq;
4477
4478 /*** Concatenate the titles ***/
4479 if (temp->title) {
4480 title_len = StringLen(rdfp->title) + StringLen(temp->title) + 3;
4481 title = (CharPtr) MemNew(sizeof(Char)*title_len);
4482 if (rdfp->title) {
4483 title = StringCat(title, rdfp->title);
4484 title = StringCat(title, "; ");
4485 }
4486 title = StringCat(title, temp->title);
4487 rdfp->title = MemFree(rdfp->title);
4488 rdfp->title = title;
4489 }
4490
4491 /*** Free temp ***/
4492 temp = readdb_destruct(temp);
4493 temp = prev;
4494 }
4495
4496 }
4497
4498 /* In case new real databases have been found (i.e.: alias file referring to
4499 * another alias file(s) along with real database(s)), arrange them so that
4500 * the real databases are at the front of the rdfp_chain */
4501 {
4502 ReadDBFILEPtr rdfp_w_gilist = NULL;
4503
4504 rdfp = rdfp_chain;
4505 rdfp_chain = NULL;
4506
4507 /* separate rdfp's w/ gilists and real databases */
4508 while (rdfp) {
4509
4510 if (rdfp->gilist) {
4511 if (rdfp_w_gilist == NULL) {
4512 rdfp_w_gilist = rdfp;
4513 } else {
4514 temp = rdfp_w_gilist;
4515 while (temp->next)
4516 temp = temp->next;
4517 temp->next = rdfp;
4518 }
4519 } else {
4520 if (rdfp_chain == NULL) {
4521 rdfp_chain = rdfp;
4522 } else {
4523 temp = rdfp_chain;
4524 while (temp->next)
4525 temp = temp->next;
4526 temp->next = rdfp;
4527 }
4528 }
4529 temp = rdfp->next;
4530 rdfp->next = NULL;
4531 rdfp = temp;
4532 }
4533
4534 /* append the rdfp_w_gilist to the rdfp_chain */
4535 if ( (temp = rdfp_chain)) {
4536 while (temp->next)
4537 temp = temp->next;
4538 temp->next = rdfp_w_gilist;
4539 } else
4540 rdfp_chain = rdfp_w_gilist;
4541 }
4542
4543 return rdfp_chain;
4544 }
4545
4546 /*
4547 Attach to an already open ReadDBFILEPtr. Duplicate the
4548 indexfp, sequencefp, and headerfp structures as the pointers
4549 there (i.e., mmp) will need to be manipulated. Do not
4550 change the FILE PNTR fp.
4551 */
4552
4553 ReadDBFILEPtr LIBCALL
readdb_attach(ReadDBFILEPtr rdfp)4554 readdb_attach (ReadDBFILEPtr rdfp)
4555
4556 {
4557 ReadDBFILEPtr head, last, new_t;
4558
4559 if (rdfp == NULL)
4560 return NULL;
4561
4562 head = NULL;
4563 last = NULL;
4564 while (rdfp)
4565 {
4566 new_t = (ReadDBFILEPtr) MemDup(rdfp, sizeof(ReadDBFILE));
4567
4568 /*
4569 The contents_allocated flag DOES NOT apply to the actual
4570 structures indexfp, headerfp, or sequencefp. These must always
4571 be duplicated, as their pointers need to be independently
4572 manipulated by threads. They have their own allocation flags.
4573 */
4574 /*new_t->contents_allocated = FALSE;*/
4575 new_t->parameters &= ~READDB_CONTENTS_ALLOCATED;
4576 new_t->indexfp = (NlmMFILEPtr) MemDup(rdfp->indexfp,
4577 sizeof(NlmMFILE));
4578 new_t->indexfp->contents_allocated = FALSE;
4579 if (rdfp->headerfp != NULL) {
4580 new_t->headerfp = (NlmMFILEPtr) MemDup(rdfp->headerfp,
4581 sizeof(NlmMFILE));
4582 new_t->headerfp->contents_allocated = FALSE;
4583 }
4584 if (rdfp->sequencefp != NULL) {
4585 new_t->sequencefp = (NlmMFILEPtr) MemDup(rdfp->sequencefp,
4586 sizeof(NlmMFILE));
4587 new_t->sequencefp->contents_allocated = FALSE;
4588 }
4589
4590 if (new_t->taxinfo != NULL) {
4591 new_t->taxinfo = (RDBTaxInfoPtr)
4592 MemDup(rdfp->taxinfo, sizeof(RDBTaxInfo));
4593
4594 if (new_t->taxinfo->taxfp != NULL) {
4595 new_t->taxinfo->taxfp = (NlmMFILEPtr)
4596 MemDup(rdfp->taxinfo->taxfp, sizeof(NlmMFILE));
4597 new_t->taxinfo->taxfp->contents_allocated = FALSE;
4598 }
4599
4600 new_t->taxinfo->name_fd = (NlmMFILEPtr)
4601 MemDup(rdfp->taxinfo->name_fd, sizeof(NlmMFILE));
4602 new_t->taxinfo->name_fd->contents_allocated = FALSE;
4603
4604 new_t->taxinfo->taxinfo_alloc = FALSE;
4605 new_t->taxinfo->taxdata_alloc = FALSE;
4606 }
4607
4608 /*new_t->handle_common_index = FALSE;*/
4609 new_t->parameters &= ~READDB_HANDLE_COMMON_INDEX;
4610
4611 new_t->oidlist = rdfp->oidlist;
4612
4613 /* Copy address of shared information */
4614 new_t->shared_info = rdfp->shared_info;
4615
4616 /* increment the reference count atomically */
4617
4618 if(new_t->shared_info != NULL) {
4619 NlmMutexLockEx(&hdrseq_mutex);
4620 rdfp->shared_info->nthreads++;
4621 NlmMutexUnlock(hdrseq_mutex);
4622 }
4623
4624 /* Contents_allocated also does not apply to buffer, this is
4625 determined by allocated_length. */
4626 if (new_t->allocated_length > 0)
4627 {
4628 new_t->buffer = (UcharPtr) MemNew((new_t->allocated_length)*sizeof(Uint1));
4629 }
4630
4631 if (head == NULL)
4632 {
4633 head = new_t;
4634 }
4635 else
4636 {
4637 last->next = new_t;
4638 }
4639
4640 last = new_t;
4641 rdfp = rdfp->next;
4642 }
4643
4644 return head;
4645 }
4646
4647 ReadDBFILEPtr LIBCALL
readdb_destruct(ReadDBFILEPtr rdfp)4648 readdb_destruct (ReadDBFILEPtr rdfp)
4649
4650 {
4651 ReadDBFILEPtr next;
4652
4653 if (!rdfp)
4654 return NULL;
4655
4656 if (rdfp->parameters & READDB_CONTENTS_ALLOCATED) {
4657 rdfp = ReadDBCloseMHdrAndSeqFiles(rdfp);
4658 taxonomyDbLoaded = FALSE;
4659 }
4660 rdfp = ReadDBFreeSharedInfo(rdfp);
4661 while (rdfp) {
4662 next = rdfp->next;
4663 rdfp = readdb_destruct_element(rdfp);
4664 rdfp = next;
4665 }
4666
4667 return NULL;
4668 }
4669
4670 /*
4671 Destroys a single element.
4672 */
4673 ReadDBFILEPtr LIBCALL
readdb_destruct_element(ReadDBFILEPtr rdfp)4674 readdb_destruct_element (ReadDBFILEPtr rdfp)
4675
4676 {
4677
4678 if (rdfp == NULL)
4679 return NULL;
4680
4681 /* Deallocate if contents were allocated. */
4682 /*if (rdfp->contents_allocated) {*/
4683 if (rdfp->parameters & READDB_CONTENTS_ALLOCATED) {
4684 rdfp->filename = (CharPtr)MemFree(rdfp->filename);
4685 rdfp->aliasfilename = (CharPtr)MemFree(rdfp->aliasfilename);
4686 rdfp->title = (CharPtr)MemFree(rdfp->title);
4687 rdfp->date = (CharPtr)MemFree(rdfp->date);
4688 /* free array if they were allocated, ie no memmap */
4689 if (rdfp->header_index_start)
4690 rdfp->header_index_start = (Uint4Ptr)MemFree(rdfp->header_index_start);
4691 if (rdfp->sequence_index_start)
4692 rdfp->sequence_index_start = (Uint4Ptr)MemFree(rdfp->sequence_index_start);
4693 if (rdfp->ambchar_index_start)
4694 rdfp->ambchar_index_start =(Uint4Ptr) MemFree(rdfp->ambchar_index_start);
4695 /* is it completely safe to have one rdfp->nisam_opt for all threads. */
4696 ISAMObjectFree(rdfp->nisam_opt); /* Terminating NISAM */
4697 ISAMObjectFree(rdfp->sisam_opt); /* Terminating NISAM */
4698 ISAMObjectFree(rdfp->isam_pig); /* Terminating PIG ISAM */
4699 OIDListFree(rdfp->oidlist);
4700 rdfp->gifile = MemFree(rdfp->gifile);
4701 rdfp->gilist = Int4ListFree(rdfp->gilist);
4702
4703 }
4704 rdfp->indexfp = NlmCloseMFILE(rdfp->indexfp);
4705 NlmMutexLockEx(&hdrseq_mutex);
4706 if (rdfp->shared_info && (rdfp->sequencefp || rdfp->headerfp)) {
4707 if (--(rdfp->shared_info->nthreads) == 0) {
4708 rdfp->shared_info->sequencefp =
4709 NlmCloseMFILE(rdfp->shared_info->sequencefp);
4710 rdfp->shared_info->headerfp =
4711 NlmCloseMFILE(rdfp->shared_info->headerfp);
4712 } else if (rdfp->shared_info->nthreads == -1) {
4713 rdfp->shared_info->nthreads = 0;
4714 rdfp->shared_info = NULL;
4715 }
4716 }
4717 NlmMutexUnlock(hdrseq_mutex);
4718 rdfp->shared_info = NULL;
4719 rdfp->sequencefp = NlmCloseMFILE(rdfp->sequencefp);
4720 rdfp->headerfp = NlmCloseMFILE(rdfp->headerfp);
4721
4722 RDBTaxInfoClose(rdfp->taxinfo); /* Closing taxonomy names database */
4723
4724 if (rdfp->allocated_length > 0) {
4725 rdfp->buffer = (UcharPtr)MemFree(rdfp->buffer);
4726 }
4727
4728 if (rdfp->blast_deflinep != NULL)
4729 rdfp->blast_deflinep = BlastDefLineSetFree(rdfp->blast_deflinep);
4730
4731 /* destruct common index only if it is permited to do it for this thread */
4732
4733 if (rdfp->cih && /*rdfp->handle_common_index*/
4734 (rdfp->parameters & READDB_HANDLE_COMMON_INDEX))
4735 CommonIndexDestruct(rdfp->cih);
4736
4737 rdfp = (ReadDBFILEPtr) MemFree(rdfp);
4738
4739 return NULL;
4740 }
4741
4742 /*
        Goes through a chain of ReadDBFILEPtr's, looking for the one
4744 that contains the specified ordinal ID.
4745 */
4746
4747 static ReadDBFILEPtr
readdb_get_link(ReadDBFILEPtr rdfp,Int4 ordinal_id)4748 readdb_get_link(ReadDBFILEPtr rdfp, Int4 ordinal_id)
4749
4750 {
4751 ReadDBFILEPtr last, last_used, rdfp_var;
4752 Boolean loaded_new = FALSE;
4753
4754 last_used = last = rdfp;
4755
4756 while (rdfp) {
4757 if (rdfp->start <=ordinal_id && rdfp->stop >= ordinal_id)
4758 break;
4759 rdfp = rdfp->next;
4760 }
4761 if (! rdfp)
4762 return 0;
4763 if (!(last->parameters & READDB_KEEP_HDR_AND_SEQ)) {
4764 while (rdfp != last) {
4765 if (last->sequencefp != NULL || last->headerfp != NULL) {
4766 if (last->shared_info) {
4767 NlmMutexLockEx(&hdrseq_mutex);
4768 if (--(last->shared_info->nthreads) == 0) {
4769 last->shared_info->sequencefp =
4770 NlmCloseMFILE(last->shared_info->sequencefp);
4771 last->shared_info->headerfp =
4772 NlmCloseMFILE(last->shared_info->headerfp);
4773 } else if (last->shared_info->nthreads < 0) {
4774 last->sequencefp = NULL;
4775 last->headerfp = NULL;
4776 last->shared_info->nthreads = 0;
4777 }
4778 NlmMutexUnlock(hdrseq_mutex);
4779 }
4780 last->sequencefp = NlmCloseMFILE(last->sequencefp);
4781 last->headerfp = NlmCloseMFILE(last->headerfp);
4782 }
4783 last = last->next;
4784 }
4785
4786 rdfp_var = rdfp->next;
4787 while (rdfp_var != NULL) {
4788 if (rdfp_var->sequencefp != NULL || rdfp_var->headerfp != NULL) {
4789 if (rdfp_var->shared_info) {
4790 NlmMutexLockEx(&hdrseq_mutex);
4791 if (--(rdfp_var->shared_info->nthreads) == 0) {
4792 rdfp_var->shared_info->sequencefp =
4793 NlmCloseMFILE(rdfp_var->shared_info->sequencefp);
4794 rdfp_var->shared_info->headerfp =
4795 NlmCloseMFILE(rdfp_var->shared_info->headerfp);
4796 } else if (rdfp_var->shared_info->nthreads < 0) {
4797 rdfp_var->sequencefp = NULL;
4798 rdfp_var->headerfp = NULL;
4799 rdfp_var->shared_info->nthreads = 0;
4800 }
4801 NlmMutexUnlock(hdrseq_mutex);
4802 }
4803 rdfp_var->sequencefp = NlmCloseMFILE(rdfp_var->sequencefp);
4804 rdfp_var->headerfp = NlmCloseMFILE(rdfp_var->headerfp);
4805 }
4806 rdfp_var = rdfp_var->next;
4807 }
4808 }
4809
4810 /* Check for nthreads == 0 is needed because rdfp->sequencefp and
4811 rdfp->headerfp might be already freed by another thread, but still
4812 not NULL here. The check is done outside the mutex to avoid a huge number
4813 of mutex locks. It will be repeated once in the mutex */
4814 if ((rdfp->sequencefp==NULL && rdfp->headerfp==NULL) ||
4815 (rdfp->shared_info && rdfp->shared_info->nthreads==0)) {
4816 NlmMutexLockEx(&hdrseq_mutex);
4817 if ((rdfp->sequencefp==NULL && rdfp->headerfp==NULL) ||
4818 (rdfp->shared_info && rdfp->shared_info->nthreads==0)) {
4819
4820 if (ReadDBOpenMHdrAndSeqFiles(rdfp) == FALSE) {
4821 ErrPostEx(SEV_ERROR, 0, 0,
4822 "ReadDBOpenMHdrAndSeqFiles: failed to map files\n");
4823 rdfp = NULL;
4824 }
4825 else {
4826 loaded_new = TRUE;
4827 }
4828 }
4829 NlmMutexUnlock(hdrseq_mutex);
4830 }
4831
4832 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
4833 #ifdef HAVE_MADVISE
4834 if( useMadvise && rdfp != NULL ) {
4835 EThreadPriority pri = eTP_Highest;
4836
4837 /* est database requires special treatment */
4838 if( rdfp->filename && !strncmp(rdfp->filename, "est", 3) ) {
4839 pri = eTP_Default;
4840 }
4841
4842 readdb_preload_file(rdfp->indexfp, madvisePreloadBlock,
4843 mmapAdvice, madviseSyncMode, pri);
4844
4845 readdb_preload_file(rdfp->sequencefp, madvisePreloadBlock,
4846 mmapAdvice, madviseSyncMode, pri);
4847
4848 readdb_preload_file(rdfp->headerfp, madvisePreloadBlock,
4849 mmapAdvice, madviseSyncMode, pri);
4850 }
4851 #endif /* HAVE_MADVISE */
4852 #endif /* SOL || LINUX */
4853
4854 return rdfp;
4855 }
4856
4857 /*** This function checks whether the oid passed as 2nd argument to this
4858 * function is part of the ordinal id list, if it is, an extra check should be
4859 * done by loading the ASN.1 defline, but if this check fails, there's no need
4860 * to load the defline, as we know it is not part of this subset database.
4861 */
4862 static Int4
s_SearchOidInLocalOidList(const OIDListPtr oidlist,Uint4 oid)4863 s_SearchOidInLocalOidList(const OIDListPtr oidlist, Uint4 oid)
4864 {
4865 /* which word in the array? */
4866 Uint4 oidmask_index = oid / MASK_WORD_SIZE;
4867 /* which bit in the word? */
4868 Uint4 oidmask_bit = 0x1 << ( (MASK_WORD_SIZE-1) - (oid % MASK_WORD_SIZE));
4869
4870 /* No oid list? Then we need to load the defline... */
4871 if ( !oidlist ) {
4872 return 0;
4873 }
4874
4875 /* If the OID is past the end of the mask, bail out. */
4876 if (oid > oidlist->total) return -1;
4877
4878 /* If the bit isn't set, bail out early. */
4879 if (!(SwapUint4(oidlist->list[oidmask_index]) & oidmask_bit))
4880 return -1;
4881
4882 return 0;
4883 }
4884
4885 Boolean
readdb_check_oid(ReadDBFILEPtr rdfp_head,Int4 oid)4886 readdb_check_oid(ReadDBFILEPtr rdfp_head, Int4 oid)
4887 {
4888 ReadDBFILEPtr rdfp_var = rdfp_head;
4889
4890 while (rdfp_var && rdfp_var->start < oid)
4891 {
4892 if (rdfp_var->oidlist) {
4893 if (s_SearchOidInLocalOidList(rdfp_var->oidlist, oid-rdfp_var->start) == 0)
4894 return TRUE;
4895 } else {
4896 if (rdfp_var->start <= oid <= rdfp_var->stop)
4897 return TRUE;
4898 }
4899 rdfp_var = rdfp_var->next;
4900 }
4901 return FALSE;
4902 }
4903
4904 /* This function verifies if a given ordinal id (or gi) belongs
4905 to a mask database, based on the membership bit stored in the
4906 BlastDefLine structure of the new ASN.1 deflines.
4907 @param oidlist OID list where the current oid is presumed to be found [in]
4908 @param oid OID as returned by the ISAM functions, i.e.: relative to a single
4909 rdfp element in the linked list [in]
4910 @param rdfp_head Head of the linked list of ReadDBFILE structures [in]
4911 @param oid_offset offset to be added to the oid so that it can be searched
4912 from rdfp_head [in]
4913 @param gi gi to be found [in]
4914 Note: If gi is -1, then only the oid will be verified to belong
4915 to the mask database. This will matter only on non-redundant
4916 databases, where there can be many gi's associated with the same
4917 oid */
OID_GI_BelongsToMaskDB(OIDListPtr oidlist,Int4 oid,ReadDBFILEPtr rdfp_head,Int4 oid_offset,Int4 gi)4918 static Boolean OID_GI_BelongsToMaskDB(OIDListPtr oidlist,
4919 Int4 oid,
4920 ReadDBFILEPtr rdfp_head,
4921 Int4 oid_offset,
4922 Int4 gi)
4923 {
4924 BlastDefLinePtr bdp = NULL, bdp_tmp = NULL;
4925 SeqIdPtr seqid_gi = NULL;
4926 Boolean retval = FALSE;
4927
4928 /*
4929 * For performance reasons, check to see if the OID corresponding
4930 * to the GI in the GI list exists in the oid mask.
4931 */
4932 if (s_SearchOidInLocalOidList(oidlist, oid) != 0) {
4933 return FALSE;
4934 }
4935
4936 /*
4937 * Otherwise, load the GI's defline to verify it belongs to
4938 * the subset database, since multiple GIs may resolve
4939 * to a single OID.
4940 */
4941
4942 if ((bdp = FDReadDeflineAsn(rdfp_head, oid+oid_offset)) != NULL &&
4943 gi != -1) {
4944
4945 ValNodeAddInt(&seqid_gi, SEQID_GI, gi);
4946
4947 for (bdp_tmp = bdp; bdp_tmp; bdp_tmp = bdp_tmp->next) {
4948 /* FIXME: should do Seq-id comparison to avoid missing accessions
4949 * and not depend on gi values */
4950 if (SeqIdIn(bdp_tmp->seqid, seqid_gi)) {
4951 retval = TRUE;
4952 break;
4953 }
4954 }
4955 bdp = BlastDefLineSetFree(bdp);
4956 seqid_gi = SeqIdFree(seqid_gi);
4957 }
4958
4959 return retval;
4960 }
4961
4962
4963 /*
4964 Returnes Int4 sequence_number by gi using NISAM indexes:
4965
4966 ReadDBFILEPtr rdfp: the main ReadDB reference,
4967 Int4 gi - input gi number to find
4968 Int4 sequence_number: which number is this sequence,
4969 Returned 0 indicates, that gi was found
4970 Returned -1 indicates, that gi was not found
4971 Returned negative value mean fault of NISAM library
4972 */
4973
4974 Int4 LIBCALL
readdb_gi2seq(ReadDBFILEPtr rdfp,Int4 gi,Int4Ptr start)4975 readdb_gi2seq(ReadDBFILEPtr rdfp, Int4 gi, Int4Ptr start)
4976 {
4977
4978 Boolean thereis_unknown_database = FALSE;
4979 ReadDBFILEPtr rdfp_head = rdfp;
4980
4981 if (start)
4982 *start = 0;
4983
4984 while(rdfp) {
4985 if (!rdfp->filebit) {
4986 thereis_unknown_database = TRUE;
4987 break;
4988 }
4989 rdfp = rdfp->next;
4990 }
4991
4992 rdfp = rdfp_head;
4993
4994 if (thereis_unknown_database || (!isCommonIndex)) {
4995 ISAMErrorCode error;
4996 Uint4 Value;
4997
4998 while (rdfp)
4999 {
5000 if(rdfp->nisam_opt == NULL) {
5001 rdfp = rdfp->next;
5002 continue;
5003 }
5004
5005 /* Resolve GI to OID. */
5006 if((error = NISAMSearch(rdfp->nisam_opt, gi,
5007 &Value, NULL)) < 0) {
5008 ErrPostEx(SEV_WARNING, 0, 0, "Failed to initialize search. "
5009 "ISAM Error code is %d\n", error);
5010 return error;
5011 }
5012
5013 if(error != ISAMNotFound) {
5014 if (start)
5015 *start = rdfp->start;
5016
5017 /* Before returning, make sure that this gi belongs to
5018 * the subset (mask) database, if we are dealing with one */
5019 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5020
5021 if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value, rdfp_head,
5022 rdfp->start, gi))
5023 return -1;
5024 }
5025
5026
5027 return (Int4) (Value+rdfp->start);
5028 }
5029
5030 rdfp = rdfp->next;
5031 }
5032 return -1;
5033 } else {
5034 Int4 retval = 0;
5035 Int4 mask = 0, alias_mask = 0;
5036 CommonIndexHeadPtr cih = rdfp->cih;
5037 Int2 dbid=0, alias_dbid=0;
5038
5039 /* create common mask for all databases */
5040 while (rdfp) {
5041 if (rdfp->aliasfilebit) {
5042 alias_mask |= (0x1 << rdfp->aliasfilebit);
5043 };
5044 mask |= (0x1 << rdfp->filebit);
5045 rdfp = rdfp->next;
5046 }
5047
5048 /* get OID and database id (dbid) of this OID */
5049 if (cih)
5050 retval = GI2OID(cih, gi, mask, alias_mask, &dbid, &alias_dbid, rdfp_head);
5051
5052 if (retval >= 0) {
5053 /* find correct rdfp in the list */
5054 rdfp = rdfp_head;
5055 while (rdfp) {
5056 /* if the oid found in mask database */
5057 if (alias_mask && rdfp->aliasfilebit == alias_dbid)
5058 break;
5059 /* if the oid found in real database */
5060 if (!alias_mask && (rdfp->filebit == dbid))
5061 break;
5062 /* if version is greater than FORMATDB_VER and we have a
5063 * CommonIndex, rely on the membership information on
5064 * the BlastDefLine structure */
5065 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5066 if (OID_GI_BelongsToMaskDB(rdfp->oidlist, retval,
5067 rdfp_head, rdfp->start, gi))
5068 break;
5069 }
5070
5071 rdfp = rdfp->next;
5072 }
5073
5074 if (!rdfp) {
5075 /* we did not find the gi where we were trying */
5076 return -1;
5077 }
5078
5079 if (start)
5080 *start = rdfp->start;
5081
5082 return retval+rdfp->start;
5083 }
5084 else
5085 return -1;
5086 }
5087 }
5088
5089 /*
5090 Used for sparse indices.
5091
5092 objective to get not SeqId, but Int4 gi number or CharPtr printed Seqid
5093 ISAM indexes may use or numeric search or string search.
5094
5095 */
5096
readdb_find_best_id(SeqIdPtr sip,Int4Ptr gi,CharPtr tmpbuf)5097 static Boolean readdb_find_best_id(SeqIdPtr sip, Int4Ptr gi, CharPtr tmpbuf)
5098 {
5099 TextSeqIdPtr tsip = NULL;
5100 ObjectIdPtr oid;
5101 PDBSeqIdPtr psip;
5102 DbtagPtr dbt;
5103 SeqIdPtr sip_tmp;
5104
5105 if (sip == NULL)
5106 return FALSE;
5107
5108 for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
5109 if(sip_tmp->choice == SEQID_GI) {
5110 *gi = sip_tmp->data.intvalue;
5111 break;
5112 }
5113 }
5114
5115 if(*gi != 0) return TRUE;
5116
5117 for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
5118
5119 switch (sip_tmp->choice) {
5120 case SEQID_LOCAL: /* local */
5121 oid = (ObjectIdPtr)(sip_tmp->data.ptrvalue);
5122 StringCpy(tmpbuf, oid->str);
5123 break;
5124 case SEQID_GIBBSQ: /* gibbseq */
5125 sprintf(tmpbuf, "%ld", (long)(sip_tmp->data.intvalue));
5126 break;
5127 case SEQID_EMBL: /* embl */
5128 case SEQID_DDBJ: /* ddbj */
5129 case SEQID_GENBANK: /* genbank */
5130 case SEQID_TPG: /* Third Party Annot/Seq Genbank */
5131 case SEQID_TPE: /* Third Party Annot/Seq EMBL */
5132 case SEQID_TPD: /* Third Party Annot/Seq DDBJ */
5133 case SEQID_OTHER: /* other */
5134 case SEQID_PIR: /* pir */
5135 case SEQID_SWISSPROT: /* swissprot */
5136 case SEQID_PRF: /* prf */
5137 case SEQID_GPIPE: /* genome pipeline */
5138 tsip = (TextSeqIdPtr)(sip_tmp->data.ptrvalue);
5139 break;
5140 case SEQID_GENERAL: /* general */
5141 dbt = (DbtagPtr)(sip_tmp->data.ptrvalue);
5142 StringCpy(tmpbuf, dbt->tag->str);
5143 break;
5144 case SEQID_PDB: /* pdb */
5145 psip = (PDBSeqIdPtr)(sip_tmp->data.ptrvalue);
5146 StringCpy(tmpbuf, psip->mol);
5147 break;
5148 }
5149 }
5150
5151 if(tsip != NULL) {
5152 if(tsip->accession != NULL)
5153 StringCpy(tmpbuf, tsip->accession);
5154 else
5155 StringCpy(tmpbuf, tsip->name);
5156 }
5157
5158 return TRUE;
5159 }
5160
5161 #define READDB_TMPBUFF_SIZE 81
5162 /*
5163 Returnes Int4 sequence_number by SeqIdPtr using SISAM indexes:
5164
5165 ReadDBFILEPtr rdfp: the main ReadDB reference,
5166 SeqIdPtr sip - input SeqIdPtr to find
5167 Int4 sequence_number: which number is this sequence,
5168 Returned 0 indicates, that gi was found
5169 Returned -1 indicates, that gi was not found
5170 Returned negative value mean fault of NISAM library
5171 */
5172 Int4 LIBCALL
readdb_seqid2fasta(ReadDBFILEPtr rdfp,SeqIdPtr sip)5173 readdb_seqid2fasta(ReadDBFILEPtr rdfp, SeqIdPtr sip)
5174 {
5175 ISAMErrorCode error;
5176 Int4 Value;
5177 CharPtr key_out = NULL, data = NULL;
5178 Uint4 index;
5179 Int4 gi = 0;
5180 CharPtr chptr = NULL;
5181 SeqIdPtr bestid;
5182 TextSeqIdPtr tsip = NULL;
5183
5184 Char tmpbuff[READDB_TMPBUFF_SIZE];
5185 CharPtr seqid_buff_ptr = tmpbuff;
5186
5187 if(rdfp->sisam_opt == NULL || sip == NULL)
5188 return -1;
5189
5190 /* Use a gi if present to do a numerical lokup. */
5191 bestid = SeqIdFindBest(sip, SEQID_GI);
5192 if (bestid && bestid->choice == SEQID_GI)
5193 {
5194 return readdb_gi2seq(rdfp, bestid->data.intvalue, NULL);
5195 }
5196
5197 while (rdfp)
5198 {
5199 if (rdfp->gifile) {
5200 rdfp = rdfp->next;
5201 continue;
5202 }
5203 if((error = ISAMGetIdxOption(rdfp->sisam_opt, &rdfp->sparse_idx)) < 0) {
5204 ErrPostEx(SEV_WARNING, 0, 0, "Failed to access string index "
5205 "ISAM Error code is %d\n", error);
5206 return -1;
5207 }
5208
5209 if(rdfp->sparse_idx) {
5210 readdb_find_best_id(sip, &gi, seqid_buff_ptr);
5211 if(gi != 0) {
5212 return readdb_gi2seq(rdfp, gi, NULL);
5213 }
5214 } else {
5215 Int4 i;
5216
5217 switch (sip->choice) {
5218 case SEQID_EMBL: /* embl */
5219 case SEQID_DDBJ: /* ddbj */
5220 case SEQID_GENBANK: /* genbank */
5221 case SEQID_TPG: /* Third Party Annot/Seq Genbank */
5222 case SEQID_TPE: /* Third Party Annot/Seq EMBL */
5223 case SEQID_TPD: /* Third Party Annot/Seq DDBJ */
5224 case SEQID_OTHER: /* other */
5225 case SEQID_PIR: /* pir */
5226 case SEQID_SWISSPROT: /* swissprot */
5227 case SEQID_PRF: /* prf */
5228 case SEQID_GPIPE: /* genome pipeline */
5229 tsip = (TextSeqIdPtr)(sip->data.ptrvalue);
5230 break;
5231 default:
5232 break;
5233 }
5234
5235 if(tsip != NULL) {
5236 Int4 dummy_gi = 0; /* Not used, should have been handled above. */
5237 GetAccessionVersionFromSeqId(sip, &gi, &seqid_buff_ptr, TRUE);
5238 } else {
5239 if((SeqIdWrite(sip, seqid_buff_ptr,
5240 PRINTID_FASTA_SHORT, READDB_TMPBUFF_SIZE-1)) == NULL)
5241 return -1;
5242 }
5243
5244 for(i = 0; seqid_buff_ptr[i] != '\0'; i++)
5245 seqid_buff_ptr[i] = TO_LOWER(seqid_buff_ptr[i]);
5246 }
5247
5248 NlmMutexLockEx(&isamsearch_mutex);
5249 if((error = SISAMSearch(rdfp->sisam_opt, seqid_buff_ptr, 0, &key_out,
5250 &data, &index)) < 0) {
5251 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5252 "ISAM Error code is %d\n", error);
5253 return error;
5254 }
5255 NlmMutexUnlock(isamsearch_mutex);
5256
5257 if (tmpbuff != seqid_buff_ptr)
5258 MemFree(seqid_buff_ptr); /* seqid_buff_ptr allocated in GetAccessionVersionFromSeqId. */
5259
5260 MemFree(key_out); /* We need no this for now */
5261
5262 if(data && error != ISAMNotFound) {
5263 Value = atol(data);
5264 MemFree(data);
5265 return Value + rdfp->start;
5266 }
5267 rdfp = rdfp->next;
5268 }
5269 return -1;
5270 }
5271
5272 /** Maximum number of volumes in a ReadDBFILEPtr linked list after which we
5273 * start munmap'ing the ISAM files to avoid running out of memory */
5274 static const size_t kSISAM_MaxNumVolumes = 10;
5275
5276 /*
5277 Returns array of sequence numbers by accession using SISAM indexes:
5278
5279 ReadDBFILEPtr rdfp: the main ReadDB reference,
5280 CharPtr string - input accession to find
5281 Int4Ptr PNTR ids - array of sequence numbers
5282 Int4Ptr count - number of hits
5283 Returned non-negative value indicates, that hits were found
5284 Returned -1 indicates, that hit(s) were not found
5285 Returned negative value mean fault of ISAM library
5286 */
5287
5288 Int4 LIBCALL
readdb_acc2fastaEx(ReadDBFILEPtr rdfp,CharPtr string,Int4Ptr PNTR ids,Int4Ptr count)5289 readdb_acc2fastaEx(ReadDBFILEPtr rdfp, CharPtr string, Int4Ptr PNTR ids,
5290 Int4Ptr count)
5291 {
5292 ISAMErrorCode error;
5293 size_t vol_counter = 0;
5294 SeqIdPtr sip;
5295
5296 if(rdfp->sisam_opt == NULL || string == NULL)
5297 return -1;
5298
5299 if (StringChr(string, '|') != NULL) {
5300
5301 if((sip = SeqIdParse(string)) != NULL) {
5302 *ids = MemNew(sizeof(Int4));
5303 **ids = readdb_seqid2fasta(rdfp, sip);
5304 SeqIdFree(sip);
5305
5306 if(**ids >= 0) {
5307 *count = 1;
5308 return 1;
5309 } else {
5310 return -1;
5311 }
5312 }
5313 }
5314
5315 for (vol_counter = 0; rdfp; rdfp = rdfp->next, vol_counter++) {
5316 error = SISAMFindAllData(rdfp->sisam_opt, string, ids, count);
5317
5318 if(error != ISAMNotFound) {
5319 Int4 index=0;
5320 while (index < *count)
5321 {
5322 (*ids)[index] += rdfp->start;
5323 index++;
5324 }
5325 return 1;
5326 }
5327 if (vol_counter >= kSISAM_MaxNumVolumes) {
5328 ISAMUninitSearch(rdfp->sisam_opt);
5329 }
5330 }
5331 return -1;
5332 }
5333 /*
5334 Returns the first (*) Int4 sequence_number found by accession/locus using
5335 SISAM indexes:
5336
5337 ReadDBFILEPtr rdfp: the main ReadDB reference,
5338 CharPtr string - input accession to find
5339 Int4 sequence_number: which number is this sequence,
5340 Returned 0 indicates, that gi was found
5341 Returned -1 indicates, that gi was not found
5342 Returned negative value mean fault of ISAM library
5343
5344 (*): This means that in multi-volume databases (which potentially join
5345 databases which might contain the same sequence), only the first match will be
5346 returned.
5347 */
5348
5349 Int4 LIBCALL
readdb_acc2fasta(ReadDBFILEPtr rdfp,CharPtr string)5350 readdb_acc2fasta(ReadDBFILEPtr rdfp, CharPtr string)
5351 {
5352 ISAMErrorCode error;
5353 ReadDBFILEPtr rdfp_head = rdfp;
5354 Int4 Value;
5355 CharPtr key_out = NULL, data = NULL;
5356 Uint4 index;
5357 Char tmp_str[64];
5358 size_t vol_counter = 0;
5359 SeqIdPtr sip;
5360
5361 if(rdfp->sisam_opt == NULL || string == NULL)
5362 return -1;
5363
5364 if (StringChr(string, '|') != NULL)
5365 {
5366 sip = SeqIdParse(string);
5367 Value = readdb_seqid2fasta(rdfp, sip);
5368 SeqIdFree(sip);
5369 return Value;
5370 }
5371
5372 for (vol_counter = 0; rdfp; rdfp = rdfp->next, vol_counter++)
5373 {
5374 if((error = ISAMGetIdxOption(rdfp->sisam_opt, &rdfp->sparse_idx)) < 0) {
5375 ErrPostEx(SEV_WARNING, 0, 0, "Failed to access string index "
5376 "ISAM Error code is %d\n", error);
5377 return -1;
5378 }
5379
5380 if(rdfp->sparse_idx) {
5381
5382 Int4 seq_num, count;
5383 Int4Ptr ids;
5384
5385 readdb_acc2fastaEx(rdfp, string, &ids, &count);
5386 if(count > 0) {
5387 seq_num = *ids;
5388 MemFree(ids);
5389 return seq_num;
5390 }
5391 }
5392 else
5393 {
5394 /* Trying accession first */
5395
5396 sprintf(tmp_str, "gb|%s|", string);
5397
5398 if((error = SISAMSearch(rdfp->sisam_opt, tmp_str, 0, &key_out, &data, &index)) < 0) {
5399 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index " "ISAM Error code is %d\n", error);
5400 return error;
5401 }
5402
5403 MemFree(key_out); /* We need no this for now */
5404
5405 if(error != ISAMNotFound) {
5406 Value = atol(data);
5407 MemFree(data);
5408 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5409 if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5410 rdfp_head, rdfp->start, -1))
5411 return -1;
5412 }
5413
5414 return Value + rdfp->start;
5415 }
5416
5417 /* Now trying LOCUS */
5418
5419 sprintf(tmp_str, "gb||%s", string);
5420
5421 if((error = SISAMSearch(rdfp->sisam_opt, tmp_str, 0, &key_out,
5422 &data, &index)) < 0) {
5423 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5424 "ISAM Error code is %d\n", error);
5425 return error;
5426 }
5427
5428 MemFree(key_out); /* We need no this for now */
5429
5430 if(error != ISAMNotFound) {
5431 Value = atol(data);
5432 MemFree(data);
5433 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5434 if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5435 rdfp_head, rdfp->start, -1))
5436 return -1;
5437 }
5438
5439 return Value + rdfp->start;
5440 }
5441
5442 /* Now trying string */
5443
5444
5445 if((error = SISAMSearch(rdfp->sisam_opt, string, 0, &key_out,
5446 &data, &index)) < 0) {
5447 ErrPostEx(SEV_WARNING, 0, 0, "Failed to search string index "
5448 "ISAM Error code is %d\n", error);
5449 return error;
5450 }
5451
5452 MemFree(key_out); /* We need no this for now */
5453
5454 if(error != ISAMNotFound) {
5455 Value = atol(data);
5456 MemFree(data);
5457 if (rdfp->oidlist && rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5458 if (!OID_GI_BelongsToMaskDB(rdfp->oidlist, Value,
5459 rdfp_head, rdfp->start, -1))
5460 return -1;
5461 }
5462
5463 return Value + rdfp->start;
5464 } else {
5465 MemFree(data);
5466 }
5467 }
5468 if (vol_counter >= kSISAM_MaxNumVolumes) {
5469 ISAMUninitSearch(rdfp->sisam_opt);
5470 }
5471 }
5472
5473 return -1;
5474 }
5475
5476 /*
5477 This function returnes "Seq-descr" as ValNode. This valnode then may be
5478 simply linked to set of descriptors in Bioseq: bsp->descr
5479 */
readdb_get_asn1_defline(ReadDBFILEPtr rdfp,Int4 sequence_number)5480 ValNodePtr readdb_get_asn1_defline(ReadDBFILEPtr rdfp, Int4 sequence_number)
5481 {
5482 ValNodePtr vnp = NULL;
5483 Int4 size;
5484 ByteStorePtr bsp;
5485 ByteStorePtr PNTR bspp;
5486 CharPtr buffer;
5487 UserFieldPtr ufp;
5488 UserObjectPtr uop;
5489 ObjectIdPtr oidp;
5490
5491 /* If we're dealing with a subset (mask) database, encode the
5492 * proper defline, which is dictated by looking at the
5493 * membership bits of the BlastDefLinePtr
5494 * If readdb_encode_subset_asn1_defline fails, encode the
5495 * BlastDefLinePtr as found in the blast database [pn]hr files */
5496 if (rdfp->oidlist && rdfp->membership_bit != 0) {
5497 vnp = readdb_encode_subset_asn1_defline(rdfp, sequence_number);
5498 if (vnp != NULL)
5499 return vnp;
5500 }
5501
5502 size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
5503 Nlm_SwapUint4(rdfp->header_index[sequence_number]);
5504
5505 bsp = BSNew(size+1);
5506
5507 if (rdfp->headerfp->mfile_true == TRUE) {
5508 NlmSeekInMFILE(rdfp->headerfp,
5509 Nlm_SwapUint4(rdfp->header_index[sequence_number]),
5510 SEEK_SET);
5511
5512 BSWrite(bsp, rdfp->headerfp->mmp, size);
5513 BSSeek(bsp, 0, SEEK_SET);
5514 } else {
5515 NlmSeekInMFILE(rdfp->headerfp,
5516 Nlm_SwapUint4(rdfp->header_index[sequence_number]),
5517 SEEK_SET);
5518
5519 buffer = MemNew(size+1);
5520 FileRead(buffer, size, 1, rdfp->headerfp->fp);
5521 BSWrite(bsp, buffer, size);
5522 MemFree(buffer);
5523 }
5524
5525 /* Creating user field */
5526 ufp = UserFieldNew();
5527 ufp->num = 1;
5528 bspp = (ByteStorePtr PNTR) MemNew((ufp->num)*sizeof(ByteStorePtr));
5529 bspp[0] = bsp;
5530 ufp->data.ptrvalue = bspp;
5531
5532 /* And object Id type for this object */
5533 oidp = ObjectIdNew();
5534 oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5535 ufp->label = oidp;
5536
5537 /* SEQUENCE OF OCTET STRING , ptrvalue = ByteStorePtr PNTR */
5538 ufp->choice = 10;
5539
5540 /* Creating user object */
5541 uop = UserObjectNew();
5542 uop->data = ufp;
5543
5544 /* Create a new ObjectId for the UserObject */
5545 oidp = ObjectIdNew();
5546 oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5547 uop->type = oidp;
5548
5549 /* Finaly descriptor is created as ... */
5550 vnp = NULL;
5551 vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5552
5553 return vnp;
5554 }
5555
readdb_encode_subset_asn1_defline(ReadDBFILEPtr rdfp,Int4 sequence_number)5556 ValNodePtr readdb_encode_subset_asn1_defline(ReadDBFILEPtr rdfp,
5557 Int4 sequence_number)
5558 {
5559 ValNodePtr vnp;
5560 Int4 size = 0;
5561 ByteStorePtr bsp;
5562 ByteStorePtr PNTR bspp;
5563 BytePtr buffer;
5564 UserFieldPtr ufp;
5565 UserObjectPtr uop;
5566 ObjectIdPtr oidp;
5567 BlastDefLinePtr bdsp = NULL;
5568 AsnIoMemPtr aimp;
5569
5570 if ((bdsp = FDReadDeflineAsn(rdfp, sequence_number)) == NULL)
5571 return NULL;
5572
5573 size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
5574 Nlm_SwapUint4(rdfp->header_index[sequence_number]);
5575 bsp = BSNew(size+1);
5576 buffer = MemNew(size+1);
5577
5578 aimp = AsnIoMemOpen("wb",buffer,size+1);
5579 BlastDefLineSetAsnWrite(bdsp,aimp->aip, NULL);
5580 AsnIoFlush(aimp->aip);
5581 BSWrite(bsp,buffer,size+1);
5582
5583 bdsp = BlastDefLineSetFree(bdsp);
5584 buffer = MemFree(buffer);
5585 aimp = AsnIoMemClose(aimp);
5586
5587 /* Creating user field */
5588 ufp = UserFieldNew();
5589 ufp->num = 1;
5590 bspp = (ByteStorePtr PNTR) MemNew((ufp->num)*sizeof(ByteStorePtr));
5591 bspp[0] = bsp;
5592 ufp->data.ptrvalue = bspp;
5593
5594 /* And object Id type for this object */
5595 oidp = ObjectIdNew();
5596 oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5597 ufp->label = oidp;
5598
5599 /* SEQUENCE OF OCTET STRING , ptrvalue = ByteStorePtr PNTR */
5600 ufp->choice = 10;
5601
5602 /* Creating user object */
5603 uop = UserObjectNew();
5604 uop->data = ufp;
5605
5606 /* Create a new ObjectId for this UserObject */
5607 oidp = ObjectIdNew();
5608 oidp->str = StringSave(ASN_DEFLINE_OBJ_LABEL);
5609 uop->type = oidp;
5610
5611 /* Finaly descriptor is created as ... */
5612 vnp = NULL;
5613 vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5614
5615 return vnp;
5616 }
5617
5618 /*
5619 This function returnes "Seq-descr" as ValNode. This valnode then may be
5620 simply linked to set of descriptors in Bioseq: bsp->descr
5621 */
5622
readdb_get_taxonomy_names(ReadDBFILEPtr rdfp,Int4 sequence_number)5623 ValNodePtr readdb_get_taxonomy_names(ReadDBFILEPtr rdfp, Int4 sequence_number)
5624 {
5625 BlastDefLinePtr bdp, tbdp;
5626 RDBTaxNamesPtr tnames;
5627 UserFieldPtr ufp, ufp_last;
5628 UserObjectPtr uop;
5629 ObjectIdPtr oidp;
5630 CharPtr PNTR cpp;
5631 ValNodePtr vnp;
5632
5633 if(rdfp == NULL || rdfp->taxinfo == NULL)
5634 return NULL;
5635
5636 if((bdp = FDReadDeflineAsn(rdfp, sequence_number)) == NULL)
5637 return NULL;
5638
5639 /* Creating user object */
5640 uop = UserObjectNew();
5641
5642 /* And object Id type for this object */
5643 oidp = ObjectIdNew();
5644 oidp->str = StringSave(TAX_DATA_OBJ_LABEL);
5645 uop->type = oidp;
5646
5647 for(tbdp = bdp; tbdp != NULL; tbdp = tbdp->next) {
5648
5649 /* Make sure we have the taxonomy information for this
5650 * tbdp->taxid */
5651 if ((tnames = RDBGetTaxNames(rdfp->taxinfo, tbdp->taxid)) == NULL )
5652 continue;
5653
5654 /* Creating user field */
5655 ufp = UserFieldNew();
5656 ufp->choice = 7; /* strs */
5657
5658 /* Label of every User-field will contain taxonomy Id and
5659 taxonomy names will be located in Visible Strings in
5660 pre-defined sequence */
5661
5662 oidp = ObjectIdNew();
5663 oidp->id = tbdp->taxid;
5664 ufp->label = oidp;
5665
5666 ufp->num = NUM_TAX_NAMES;
5667 cpp = MemNew(sizeof(CharPtr)*NUM_TAX_NAMES);
5668
5669 cpp[SCI_NAME_POS] = StringSave(tnames->sci_name);
5670 cpp[COMMON_NAME_POS] = StringSave(tnames->common_name);
5671 cpp[BLAST_NAME_POS] = StringSave(tnames->blast_name);
5672 cpp[S_KING_POS] = StringSave(tnames->s_king);
5673
5674 ufp->data.ptrvalue = cpp;
5675
5676 if(uop->data == NULL)
5677 uop->data = ufp;
5678 else
5679 ufp_last->next = ufp;
5680
5681 ufp_last = ufp;
5682 RDBTaxNamesFree(tnames);
5683 }
5684
5685 /* Finaly descriptor is created as ... */
5686 vnp = NULL;
5687 if (uop->data != NULL)
5688 vnp = SeqDescrAddPointer(&vnp, Seq_descr_user, uop);
5689 else {
5690 UserObjectFree(uop);
5691 }
5692 BlastDefLineSetFree(bdp);
5693
5694 return vnp;
5695 }
5696
5697 /*
5698 Obtains a BioseqPtr from readdb:
5699
5700 ReadDBFILEPtr rdfp: the main ReadDB reference,
5701 Int4 sequence_number: which number is this sequence,
5702 */
5703 BioseqPtr LIBCALL
readdb_get_bioseq(ReadDBFILEPtr rdfp,Int4 sequence_number)5704 readdb_get_bioseq(ReadDBFILEPtr rdfp, Int4 sequence_number)
5705 {
5706 return readdb_get_bioseq_ex(rdfp, sequence_number, TRUE, FALSE);
5707 }
5708
5709 BioseqPtr LIBCALL
readdb_get_bioseq_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Boolean use_objmgr,Boolean insert_ctrlA)5710 readdb_get_bioseq_ex(ReadDBFILEPtr rdfp, Int4 sequence_number,
5711 Boolean use_objmgr, Boolean insert_ctrlA)
5712
5713 {
5714 BioseqPtr bsp;
5715 ByteStorePtr byte_store;
5716 CharPtr defline, new_defline = NULL, defline_ptr, new_defline_ptr;
5717 Int2 byte_value;
5718 Int4 length, compressed_length, count;
5719 SeqIdPtr sip;
5720 Uint1Ptr buffer, buffer_4na;
5721 Uint4Ptr ambchar = NULL;
5722 Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
5723
5724 if ((rdfp = readdb_get_link(rdfp, sequence_number)) == NULL)
5725 return NULL;
5726
5727 defline = NULL;
5728
5729 readdb_get_descriptor(rdfp, sequence_number, &sip, &defline);
5730
5731 if (insert_ctrlA == FALSE)
5732 {
5733 count = 0;
5734 new_defline = NULL;
5735 if (defline != NULL) {
5736 defline_ptr = defline;
5737
5738 while (*defline_ptr != NULLB) {
5739 count++;
5740 if (*defline_ptr == READDB_DEF_SEPARATOR) {
5741 /* Two spaces for every ctrl-A as it will be replaced by 2. */
5742 count++;
5743 }
5744 defline_ptr++;
5745 }
5746
5747 if (count != 0) {
5748 new_defline = (CharPtr)Nlm_Malloc((count+1)*sizeof(Char));
5749 new_defline_ptr = new_defline;
5750 defline_ptr = defline;
5751 while (*defline_ptr != NULLB) {
5752 if (*defline_ptr == READDB_DEF_SEPARATOR) {
5753 *new_defline_ptr = ' ';
5754 new_defline_ptr++;
5755 *new_defline_ptr = '>';
5756 new_defline_ptr++;
5757 } else {
5758 *new_defline_ptr = *defline_ptr;
5759 new_defline_ptr++;
5760 }
5761 defline_ptr++;
5762 }
5763 *new_defline_ptr = NULLB;
5764 defline = (CharPtr)MemFree(defline);
5765 }
5766 }
5767 }
5768 else
5769 new_defline = defline;
5770
5771 if((length = readdb_get_sequence(rdfp, sequence_number, &buffer)) < 1)
5772 return NULL;
5773
5774 if(use_objmgr) {
5775 if((bsp = BioseqNew()) == NULL)
5776 return NULL;
5777 } else {
5778 bsp = (BioseqPtr)MemNew(sizeof(Bioseq));
5779 if (bsp == NULL) return bsp;
5780 bsp->length = -1; /* not set */
5781 bsp->topology = 1; /* DEFAULT = linear */
5782 }
5783
5784 byte_store = BSNew(0);
5785 if (is_prot) {
5786 bsp->mol = Seq_mol_aa;
5787 bsp->seq_data_type = Seq_code_ncbistdaa;
5788 BSWrite(byte_store, (VoidPtr) buffer, length);
5789 } else {
5790 /* Nucleotide sequence require more attention */
5791 if(!readdb_get_ambchar(rdfp, sequence_number, &ambchar)) {
5792 ErrPostEx(SEV_WARNING, 0, 0,
5793 "Failure to read ambiguity information");
5794 return NULL;
5795 }
5796 /* Convert sequence if ambiguities. */
5797 if(ambchar != NULL) {/* are there any ambiguity ? */
5798 compressed_length = (length+3)/4; /* enough bytes for all bases. */
5799 buffer_4na = Nlm_Malloc((2*compressed_length)*sizeof(Uint1));
5800 MapNa2ByteToNa4String(buffer, (Uint2Ptr) buffer_4na, length/4);
5801 if (length%4 != 0)
5802 {
5803 Uint1 bytes[2];
5804 bytes[0] = *(buffer+length/4);
5805 bytes[0] &= 252;
5806 MapNa2ByteToNa4String(bytes, (Uint2Ptr) (buffer_4na+2*(compressed_length-1)), 1);
5807 }
5808 RebuildDNA_4na(buffer_4na, compressed_length*2, ambchar);
5809 BSWrite(byte_store, (VoidPtr) buffer_4na, compressed_length*2);
5810 MemFree(buffer_4na);
5811 MemFree(ambchar);
5812 bsp->seq_data_type = Seq_code_ncbi4na;
5813 }
5814 else
5815 {
5816 BSWrite(byte_store, (VoidPtr) buffer, length/4);
5817 if (length%4 != 0) {
5818 byte_value = *(buffer+length/4);
5819 byte_value &= 252;
5820 BSPutByte(byte_store, byte_value);
5821 }
5822 bsp->seq_data_type = Seq_code_ncbi2na;
5823 }
5824
5825 bsp->mol = Seq_mol_na;
5826 }
5827
5828 bsp->seq_data = (SeqDataPtr) byte_store;
5829
5830 bsp->length = length;
5831 bsp->id = sip;
5832 bsp->repr = Seq_repr_raw;
5833
5834 if (new_defline != NULL) {
5835 bsp->descr = SeqDescrAddPointer(NULL, Seq_descr_title, new_defline);
5836 }
5837
5838 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
5839 ValNodePtr vnp, vnp_tmp;
5840
5841 /* First we encode complete ASN.1 definition line */
5842
5843 vnp = readdb_get_asn1_defline(rdfp, sequence_number);
5844
5845 if(bsp->descr != NULL) {
5846 for (vnp_tmp = bsp->descr; vnp_tmp->next != NULL;
5847 vnp_tmp = vnp_tmp->next)
5848 continue;
5849 vnp_tmp->next = vnp;
5850 vnp_tmp = vnp;
5851 } else {
5852 bsp->descr = vnp;
5853 vnp_tmp = bsp->descr;
5854 }
5855
5856 /* Then encoding taxonomy names information from the
5857 taxonomy names database */
5858
5859 vnp = readdb_get_taxonomy_names(rdfp, sequence_number);
5860 vnp_tmp->next = vnp;
5861 }
5862
5863 return bsp;
5864 }
5865
5866 /*
5867 returns the 'filebits' associated with a certain ordinal number.
5868 This is done by going to the rdfp for that ordinal id and
5869 gathering the filebits.
5870 */
5871 Boolean LIBCALL
readdb_get_filebits(ReadDBFILEPtr rdfp,Int4 ordinal_id,Uint2Ptr filebit,Uint2Ptr aliasfilebit)5872 readdb_get_filebits (ReadDBFILEPtr rdfp, Int4 ordinal_id, Uint2Ptr filebit, Uint2Ptr aliasfilebit)
5873
5874 {
5875 rdfp = readdb_get_link(rdfp, ordinal_id);
5876
5877 if (rdfp == NULL)
5878 return FALSE;
5879
5880 if (filebit)
5881 *filebit = rdfp->filebit;
5882
5883 if (aliasfilebit)
5884 *aliasfilebit = rdfp->aliasfilebit;
5885
5886 return TRUE;
5887 }
5888
5889 /* The following function performs a binary search to return an ordinal id of
5890 the last sequence in the BLAST database, whose offset in the sequence index
5891 is less than the offset argument. The offset is in bases, nucleotide or
5892 protein. For the former, the actual offset in the sequence index is
5893 computed inside the function (i.e. the offset argument is divided by 4).
5894 The first_seq argument tells the function not to look at ordinal ids smaller
5895 than the argument value.
5896 */
5897 Int4 LIBCALL
readdb_get_sequence_number(ReadDBFILEPtr rdfp,Int4 first_seq,Int8 offset)5898 readdb_get_sequence_number(ReadDBFILEPtr rdfp, Int4 first_seq, Int8 offset)
5899 {
5900 Int4 m, b, e, val;
5901 Int2 compression_ratio;
5902
5903 if (!rdfp)
5904 return -1;
5905
5906 if (rdfp->parameters & READDB_IS_PROT)
5907 compression_ratio = 1;
5908 else
5909 compression_ratio = READDB_COMPRESSION_RATIO;
5910
5911 while (rdfp && rdfp->totlen <= offset) {
5912 offset -= rdfp->totlen;
5913 rdfp = rdfp->next;
5914 }
5915
5916 if (!rdfp)
5917 return -1;
5918
5919 e = rdfp->stop;
5920 b = MAX(first_seq, rdfp->start);
5921 offset /= compression_ratio;
5922
5923 while (b < e - 1) {
5924 m = (b + e) / 2;
5925 if ((val = Nlm_SwapUint4(rdfp->sequence_index[m])) > offset)
5926 e = m;
5927 else if (val == offset)
5928 return m;
5929 else
5930 b = m;
5931 }
5932
5933 return b;
5934 }
5935
5936 /*
5937 Gets the sequence number "sequence_number". If memory-mapped
5938 files are enabled, then *buffer points to the appropriate place
5939 in the memory-mapped file. If memory-mapped files are not enabled,
5940 then sufficient space in *buffer is allocated (if this is not already
5941 the case) and this length is stored in *buffer_length.
5942
5943 The length of the sequence requested is the return value; for memory-
5944 mapped files this is different than *buffer_length, which is always
5945 zero.
5946 */
5947
5948 Int4 LIBCALL
readdb_get_sequence(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint1Ptr PNTR buffer)5949 readdb_get_sequence (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer)
5950
5951 {
5952 Uint4 diff, length, nitems=0;
5953 Uint1 remainder;
5954 Boolean is_prot = (Boolean) (rdfp->parameters & READDB_IS_PROT);
5955
5956 rdfp = readdb_get_link(rdfp, sequence_number);
5957
5958 if (rdfp == NULL || rdfp->sequencefp == NULL)
5959 return 0;
5960
5961 if (is_prot == FALSE)
5962 {
5963 nitems = Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]) -
5964 Nlm_SwapUint4(rdfp->sequence_index[sequence_number]);
5965 }
5966 else
5967 {
5968 nitems = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
5969 Nlm_SwapUint4(rdfp->sequence_index[sequence_number]) - 1;
5970 }
5971
5972 NlmSeekInMFILE(rdfp->sequencefp,
5973 Nlm_SwapUint4(rdfp->sequence_index[sequence_number]),
5974 SEEK_SET);
5975
5976 length = sizeof(Uint1) * nitems;
5977 /* Use memory-mapped file, don't allocate buffer. */
5978 if (rdfp->sequencefp->mfile_true == TRUE)
5979 {
5980 diff = rdfp->sequencefp->mmp_end - rdfp->sequencefp->mmp;
5981
5982 if (length > diff)
5983 {
5984 nitems = diff / sizeof(Uint1);
5985 length = nitems * sizeof(Uint1);
5986 }
5987 *buffer = rdfp->sequencefp->mmp;
5988 }
5989 else
5990 {
5991 /* No mem-mapping, allocate a buffer for the subject sequence. */
5992 if (length+2 > rdfp->allocated_length)
5993 {
5994 if (rdfp->buffer != NULL)
5995 rdfp->buffer = (UcharPtr)MemFree(rdfp->buffer);
5996 rdfp->allocated_length = rdfp->maxlen+2;
5997 rdfp->buffer = (UcharPtr)MemNew((rdfp->allocated_length)*sizeof(Uint1));
5998 }
5999 /* For protein db's the first and last byte is the NULLB, which is a sentinel byte
6000 used by the extension functions. For nucl. db's there are no sentinel bytes. */
6001 if (is_prot)
6002 {
6003 rdfp->buffer[0] = NULLB;
6004 *buffer = rdfp->buffer+1;
6005 FileRead(*buffer, sizeof(Uint1), nitems+1, rdfp->sequencefp->fp);
6006 }
6007 else
6008 {
6009 *buffer = rdfp->buffer;
6010 FileRead(*buffer, sizeof(Uint1), nitems, rdfp->sequencefp->fp);
6011 }
6012 }
6013
6014 /* For nucl. return "unpacked" length and get the remainder out
6015 of the last byte. */
6016 if (is_prot == FALSE)
6017 {
6018 /* The first six bits in the byte holds the "remainder" (not a multiple of 4)
6019 and the last two bits of the byte holds the size of the remainder (0-3). */
6020 remainder = *(*buffer+length-1);
6021 remainder &= 0x3;
6022 length--;
6023 /* 4 bases per byte. */
6024 length *= 4;
6025 length += remainder;
6026 }
6027
6028 return length;
6029 }
6030
6031 /*
6032 Gets the sequence number "sequence_number". The sequence returned includes
6033 all ambiguity information. THis funciton should only be used for nucleic
6034 acid sequences, for proteins use readdb_get_sequence.
6035
6036 buffer contains the sequence and is reallocated if *buffer_length is not long enough.
6037
6038 The length of the sequence requested is the return value.
6039 protein sequences are always returned as Seq_code_ncbistdaa,
6040 nucleotide sequences as Seq_code_ncbi4na.
6041
6042 In case of memory allocation failure, buffer is free'd and points to NULL,
6043 buffer_length is set to 0 and -1 is returned
6044 */
6045
6046 Int4 LIBCALL
readdb_get_sequence_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint1Ptr PNTR buffer,Int4 * buffer_length,Boolean ready)6047 readdb_get_sequence_ex (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint1Ptr PNTR buffer, Int4 *buffer_length, Boolean ready)
6048
6049 {
6050 Int4 length; /* Uncompressed length of sequence to be fetched */
6051 Uint1Ptr readdb_buffer; /* Pointer to (read-only) data returned by readdb. */
6052
6053 length = readdb_get_sequence(rdfp, sequence_number, &readdb_buffer);
6054
6055 /* Check the length, make it one longer for ALIGN. */
6056 if ((length+2) > *buffer_length || *buffer == NULL)
6057 {
6058 if (*buffer)
6059 MemFree(*buffer);
6060
6061 *buffer = Nlm_Malloc((length+2)*sizeof(Uint1));
6062 if (*buffer == NULL) {
6063 *buffer_length = 0;
6064 return -1;
6065 }
6066 *buffer_length = length+2;
6067 }
6068
6069 /* Copy sequence into allocated buffer. */
6070 if (rdfp->parameters & READDB_IS_PROT) /* Protein */
6071 {
6072 MemCpy((VoidPtr) *buffer, readdb_buffer, length);
6073 }
6074 else /* Nucleotide. */
6075 {
6076 Int4 copy_length; /* compressed (4-to-1) length of sequence being fetched. */
6077 Uint4Ptr ambchar = NULL; /* Used below for fetching ambiguity information. */
6078 Uint1* buffer_ptr = (*buffer); /* Used for calls to MapNa2ByteToNa4String and RebuildDNA_4na */
6079
6080 copy_length = length/4;
6081 MapNa2ByteToNa4String(readdb_buffer, (Uint2*) buffer_ptr, copy_length);
6082
6083 if (length%4 != 0)
6084 { /* Sets letters in last (incomplete) byte. */
6085 Uint1 byte_value = *(readdb_buffer+length/4);
6086 byte_value &= 252;
6087 MapNa2ByteToNa4String(&byte_value, (Uint2*) (buffer_ptr+(2*copy_length)), 1);
6088 copy_length++;
6089 }
6090
6091 if(!readdb_get_ambchar(rdfp, sequence_number, &ambchar)) {
6092 ErrPostEx(SEV_WARNING, 0, 0,
6093 "Failure to read ambiguity information");
6094 return -1;
6095 }
6096 /* Convert sequence if ambiguities. */
6097 if(ambchar != NULL) /* are there any ambiguity ? */
6098 {
6099 Boolean status = RebuildDNA_4na(buffer_ptr, copy_length*2, ambchar);
6100 ambchar = MemFree(ambchar);
6101 if (status == FALSE)
6102 {
6103 ErrPostEx(SEV_WARNING, 0, 0,
6104 "Failure to rebuild DNA in readdb_get_seqeuence_ex");
6105 return -1;
6106 }
6107
6108 }
6109
6110 if (ready)
6111 {
6112 Int4 index, index2; /* Loop indices. */
6113 Uint1* private_buffer = (*buffer) + 1;
6114 index = length/2 - 2;
6115 index2 = length-1;
6116 if (length%2 != 0)
6117 {
6118 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index+1] >> 4)];
6119 index2--;
6120 }
6121 while (index2 > 0)
6122 {
6123 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index] & 15)];
6124 index2--;
6125 private_buffer[index2] = ncbi4na_to_blastna[(private_buffer[index] >> 4)];
6126 index2--; index--;
6127 }
6128 private_buffer[length] = ncbi4na_to_blastna[0];
6129 (*buffer)[0] = ncbi4na_to_blastna[0];
6130 }
6131 else
6132 {
6133 Int4 index, index2; /* Loop indices. */
6134 Uint1* private_buffer = (*buffer);
6135 index = length/2 - 1;
6136 index2 = length-1;
6137 if (length%2 != 0)
6138 {
6139 private_buffer[index2] = (private_buffer[index+1] >> 4);
6140 index2--;
6141 }
6142 while (index2 > 0)
6143 {
6144 private_buffer[index2] = (private_buffer[index] & 15);
6145 index2--;
6146 private_buffer[index2] = (private_buffer[index] >> 4);
6147 index2--; index--;
6148 }
6149 }
6150 }
6151
6152 return length;
6153 }
6154
6155 Int4 LIBCALL
readdb_get_sequence_length_approx(ReadDBFILEPtr rdfp,Int4 sequence_number)6156 readdb_get_sequence_length_approx(ReadDBFILEPtr rdfp, Int4 sequence_number)
6157 {
6158 Uint4 length = 0;
6159
6160 rdfp = readdb_get_link(rdfp, sequence_number);
6161
6162 if (rdfp == NULL)
6163 return 0;
6164
6165 if (readdb_is_prot(rdfp) == FALSE)
6166 {
6167 length = Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]) -
6168 Nlm_SwapUint4(rdfp->sequence_index[sequence_number]);
6169 length *= READDB_COMPRESSION_RATIO;
6170 }
6171 else
6172 {
6173 length = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6174 Nlm_SwapUint4(rdfp->sequence_index[sequence_number]) - 1;
6175 }
6176 return (Int4)length;
6177 }
6178 /*
6179 Gets the length of sequence number "sequence_number".
6180 */
6181
6182 Int4 LIBCALL
readdb_get_sequence_length(ReadDBFILEPtr rdfp,Int4 sequence_number)6183 readdb_get_sequence_length (ReadDBFILEPtr rdfp, Int4 sequence_number)
6184
6185 {
6186 Int4 length = readdb_get_sequence_length_approx(rdfp, sequence_number);
6187
6188 /* For nucl. return "unpacked" length and get the remainder out
6189 of the last byte. */
6190 if (readdb_is_prot(rdfp) == FALSE)
6191 {
6192 Uint1 remainder = 0;
6193 rdfp = readdb_get_link(rdfp, sequence_number);
6194 if (rdfp->sequencefp->mfile_true == TRUE)
6195 {
6196 NlmSeekInMFILE(rdfp->sequencefp,
6197 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])-1, SEEK_SET);
6198 remainder = *(rdfp->sequencefp->mmp);
6199 }
6200 else
6201 {
6202 NlmSeekInMFILE(rdfp->sequencefp,
6203 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])-1, SEEK_SET);
6204 NlmReadMFILE((Uint1Ptr) &remainder, 1, 1, rdfp->sequencefp);
6205 }
6206 /* The first six bits in the byte holds the "remainder" (not a
6207 multiple of 4) and the last two bits of the byte holds the size of
6208 the remainder (0-3). Note that length (as returned from
6209 readdb_get_sequence_length_approx) is the "unpacked" approximate
6210 length, that is, it assumes the last byte has 4 bases in it.
6211 Therefore, the next 3 lines correct that calculation with the exact
6212 sequence length.
6213 */
6214 remainder &= 3; /* number of bases stored in the last byte */
6215 length -= READDB_COMPRESSION_RATIO; /* subtract the last byte */
6216 length += remainder; /* this is the exact "unpacked" sequence length */
6217 }
6218
6219 return length;
6220 }
#ifdef FASTA_ASN
/*
  Gets the FdbFastaPtr (ASN.1) for the sequence with sequence_number.

  The header record is decoded either straight from the memory-mapped
  header file or through a binary ASN.1 stream opened on the header file
  pointer.

  Returns NULL if no volume contains sequence_number, if the ASN.1 stream
  cannot be opened, or if FdbFastaAsnRead returns NULL.
  It is the caller's RESPONSIBILITY to DEALLOCATE the returned Fasta ASN.1.
*/
FdbFastaPtr LIBCALL readdb_get_fastaid PROTO((ReadDBFILEPtr rdfp,
                                              Int4 sequence_number))
{
    FdbFastaPtr fasta = NULL;
    AsnIoPtr aip;
    AsnIoMemPtr aimp;
    Int4 size;

    rdfp = readdb_get_link(rdfp, sequence_number);

    if (rdfp == NULL)
        return NULL;

    /* Size in bytes of this sequence's header record. */
    size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
        Nlm_SwapUint4(rdfp->header_index[sequence_number]);

    if (rdfp->headerfp->mfile_true == TRUE) {
        NlmSeekInMFILE(rdfp->headerfp,
                       Nlm_SwapUint4(rdfp->header_index[sequence_number]),
                       SEEK_SET);
        aimp = AsnIoMemOpen("rb", rdfp->headerfp->mmp, size);
        if (aimp != NULL) {
            fasta = FdbFastaAsnRead(aimp->aip, NULL);
            AsnIoMemClose(aimp);
        }
    } else {
        aip = AsnIoNew(ASNIO_BIN_IN, rdfp->headerfp->fp, NULL, NULL, NULL);
        if (aip != NULL) {
            NlmSeekInMFILE(rdfp->headerfp,
                           Nlm_SwapUint4(rdfp->header_index[sequence_number]),
                           SEEK_SET);
            fasta = FdbFastaAsnRead(aip, NULL);
            /* FALSE: release the stream object only; the header file
               pointer stays owned by rdfp->headerfp. */
            AsnIoFree(aip, FALSE);
        }
    }
    return fasta;
}
#endif
6260 Boolean LIBCALL
readdb_get_ambchar(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr PNTR ambchar_return)6261 readdb_get_ambchar (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr PNTR ambchar_return)
6262 {
6263 Uint4Ptr ambchar;
6264 Int4 length, index;
6265 Uint4 total;
6266
6267 rdfp = readdb_get_link(rdfp, sequence_number);
6268
6269 if((length = Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6270 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])) == 0) {
6271 *ambchar_return = NULL;
6272 return TRUE; /* no ambiguous characters available */
6273 }
6274
6275 /* Each ambig. residue is represented by a Uint4,
6276 but length is in bytes. */
6277
6278 total = length/4;
6279 if((ambchar = (Uint4Ptr)MemNew(total*sizeof(Uint4))) == NULL)
6280 return FALSE;
6281
6282 NlmSeekInMFILE(rdfp->sequencefp,
6283 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number]), SEEK_SET);
6284
6285 NlmReadMFILE((Uint1Ptr) ambchar, 4, total, rdfp->sequencefp);
6286 total &= 0x7FFFFFFF; /* mask off everything but the highest order bit. */
6287 for (index=0; index<total; index++) {
6288 ambchar[index] = Nlm_SwapUint4(ambchar[index]);
6289 }
6290
6291 *ambchar_return = ambchar;
6292 return TRUE;
6293 }
6294
6295 /*
6296 Check if ambiguity characters are present in the sequence.
6297 */
6298
6299 Boolean LIBCALL
readdb_ambchar_present(ReadDBFILEPtr rdfp,Int4 sequence_number)6300 readdb_ambchar_present (ReadDBFILEPtr rdfp, Int4 sequence_number)
6301
6302 {
6303 rdfp = readdb_get_link(rdfp, sequence_number);
6304 if (rdfp == NULL)
6305 return FALSE;
6306
6307 if (rdfp->ambchar_index == NULL)
6308 return FALSE;
6309
6310 if((Nlm_SwapUint4(rdfp->sequence_index[sequence_number+1]) -
6311 Nlm_SwapUint4(rdfp->ambchar_index[sequence_number])) == 0)
6312 {
6313 return FALSE;
6314 }
6315
6316 return TRUE;
6317 }
6318
6319 static Boolean
readdb_adjust_local_id(ReadDBFILEPtr rdfp,SeqIdPtr sip)6320 readdb_adjust_local_id(ReadDBFILEPtr rdfp, SeqIdPtr sip)
6321
6322 {
6323 DbtagPtr dbtag;
6324 ObjectIdPtr oid;
6325
6326 if (sip == NULL || sip->choice != SEQID_GENERAL)
6327 return FALSE;
6328
6329 if (rdfp->start == 0)
6330 return TRUE;
6331
6332 dbtag = sip->data.ptrvalue;
6333 if (dbtag && StringCmp(dbtag->db, "BL_ORD_ID") == 0)
6334 {
6335 oid = dbtag->tag;
6336 oid->id += rdfp->start;
6337 }
6338
6339 return TRUE;
6340
6341
6342
6343 }
6344
FDBuildOldStyleDefline(ReadDBFILEPtr rdfp,BlastDefLinePtr bdsp)6345 static CharPtr FDBuildOldStyleDefline(ReadDBFILEPtr rdfp, BlastDefLinePtr bdsp)
6346 {
6347 CharPtr defline;
6348 Char id_buffer[128];
6349 Int4 length, count;
6350 BlastDefLinePtr bdsp_tmp;
6351 Boolean first;
6352 ValNodePtr memb = NULL;
6353 Uint4 membership_mask = 0;
6354
6355 count = 0;
6356 length = 0;
6357 membership_mask = (0x1 << (rdfp->membership_bit-1));
6358
6359 /* First calculating - how much memory do we need ? */
6360 for(bdsp_tmp = bdsp; bdsp_tmp != NULL; bdsp_tmp = bdsp_tmp->next) {
6361 length += StringLen(bdsp_tmp->title);
6362 count++;
6363 }
6364
6365 defline = MemNew(count*128 + length);
6366 MemSet(defline, '\0', sizeof(defline));
6367 first = TRUE;
6368 for(bdsp_tmp = bdsp; bdsp_tmp != NULL; bdsp_tmp = bdsp_tmp->next) {
6369
6370 if (rdfp->membership_bit == 0) { /* real database */
6371 if(!first) {
6372 StringCat(defline, "\1");
6373 SeqIdWrite(bdsp_tmp->seqid, id_buffer,
6374 PRINTID_FASTA_LONG, sizeof(id_buffer));
6375 StringCat(defline, id_buffer);
6376 StringCat(defline, " ");
6377 } else {
6378 first = FALSE;
6379 }
6380
6381 StringCat(defline, bdsp_tmp->title);
6382
6383 } else { /* subset database, verify the membership bit */
6384
6385 memb = bdsp_tmp->memberships;
6386 if (memb && (membership_mask & memb->data.intvalue)) {
6387 if (!first) {
6388 StringCat(defline, "\1");
6389 SeqIdWrite(bdsp_tmp->seqid, id_buffer,
6390 PRINTID_FASTA_LONG, sizeof(id_buffer));
6391 StringCat(defline, id_buffer);
6392 StringCat(defline, " ");
6393 } else {
6394 first = FALSE;
6395 }
6396 StringCat(defline, bdsp_tmp->title);
6397 }
6398 memb = NULL;
6399 }
6400 }
6401 if(*defline == '\0'){
6402 MemFree(defline);
6403 defline = NULL;
6404 }
6405 return defline;
6406 }
6407
FDReadDeflineAsn(ReadDBFILEPtr rdfp,Int4 sequence_number)6408 BlastDefLinePtr FDReadDeflineAsn(ReadDBFILEPtr rdfp, Int4 sequence_number)
6409 {
6410 BlastDefLinePtr bdsp, bdsp_tmp, bdsp_prev;
6411 AsnIoPtr aip;
6412 AsnIoMemPtr aimp;
6413 Int4 size;
6414 SeqIdPtr seqid = NULL;
6415
6416 if ((rdfp = readdb_get_link(rdfp, sequence_number)) == NULL)
6417 return NULL;
6418
6419 size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
6420 Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6421
6422 if (rdfp->headerfp->mfile_true == TRUE) {
6423 NlmSeekInMFILE(rdfp->headerfp,
6424 Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6425 SEEK_SET);
6426 aimp = AsnIoMemOpen("rb", rdfp->headerfp->mmp, size);
6427 bdsp = (BlastDefLinePtr) BlastDefLineSetAsnRead(aimp->aip, NULL);
6428 AsnIoMemClose(aimp);
6429 } else {
6430 aip = AsnIoNew(ASNIO_BIN_IN, rdfp->headerfp->fp, NULL, NULL, NULL);
6431 NlmSeekInMFILE(rdfp->headerfp,
6432 Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6433 SEEK_SET);
6434 bdsp = (BlastDefLinePtr) BlastDefLineSetAsnRead(aip, NULL);
6435 AsnIoFree(aip, FALSE);
6436 }
6437
6438 /* If dealing with a subset (mask) database, filter the
6439 * BlastDefLinePtr from entries that are not relevant
6440 * (this applies only to non-redundant databases) */
6441 if (rdfp->oidlist && rdfp->membership_bit != 0) {
6442 ValNodePtr memb = NULL;
6443 Uint4 memb_mask = 0;
6444 BlastDefLinePtr bdsp_last, bdsp_rv_tmp = NULL, bdsp_head = NULL;
6445 Boolean first = TRUE;
6446
6447 /* create the memberships mask (this should be fixed to allow membership
6448 * bits greater than 32) */
6449 memb_mask = 0x1 << (rdfp->membership_bit-1);
6450
6451 /* build the new adjusted BlastDefLine structure */
6452 for (bdsp_tmp = bdsp; bdsp_tmp; bdsp_tmp = bdsp_tmp->next) {
6453 memb = bdsp_tmp->memberships;
6454 if (memb && (memb_mask & memb->data.intvalue)) {
6455
6456 if (first) {
6457 bdsp_rv_tmp = BlastDefLineNew();
6458 bdsp_head = bdsp_last = bdsp_rv_tmp;
6459 if (!bdsp_rv_tmp) {
6460 ErrPostEx(SEV_ERROR,0,0,
6461 "Not enough memory in FDReadDeflineAsn");
6462 return bdsp;
6463 }
6464 first = FALSE;
6465 } else {
6466 bdsp_rv_tmp = BlastDefLineNew();
6467 if (!bdsp_rv_tmp) {
6468 ErrPostEx(SEV_ERROR,0,0,
6469 "Not enough memory in FDReadDeflineAsn");
6470 bdsp_head = BlastDefLineSetFree(bdsp_head);
6471 return bdsp;
6472 }
6473 }
6474
6475 bdsp_rv_tmp->seqid = SeqIdSetDup(bdsp_tmp->seqid);
6476 bdsp_rv_tmp->title = StringSave(bdsp_tmp->title);
6477 bdsp_rv_tmp->taxid = bdsp_tmp->taxid;
6478 bdsp_rv_tmp->memberships=IntValNodeCopy(bdsp_tmp->memberships);
6479 bdsp_rv_tmp->links = IntValNodeCopy(bdsp_tmp->links);
6480 bdsp_rv_tmp->other_info = IntValNodeCopy(bdsp_tmp->other_info);
6481 bdsp_last->next = bdsp_rv_tmp;
6482 bdsp_rv_tmp->next = NULL;
6483 bdsp_last = bdsp_rv_tmp;
6484 }
6485 }
6486 bdsp = BlastDefLineSetFree(bdsp);
6487 bdsp = bdsp_head;
6488 }
6489
6490 /* If the preferred gi is set, then put the BlastDefLine structure that
6491 * contains it first in the chain of BlastDefLinePtr's */
6492 if (rdfp->preferred_gi != 0) {
6493
6494 ValNodeAddInt(&seqid, SEQID_GI, rdfp->preferred_gi);
6495 bdsp_prev = NULL;
6496
6497 for (bdsp_tmp = bdsp; bdsp_tmp; bdsp_tmp = bdsp_tmp->next) {
6498
6499 if (SeqIdIn(bdsp_tmp->seqid, seqid)) {
6500 if (bdsp_prev != NULL)
6501 bdsp_prev->next = bdsp_tmp->next;
6502 if (bdsp_tmp != bdsp) {
6503 bdsp_tmp->next = bdsp;
6504 bdsp = bdsp_tmp;
6505 }
6506 break;
6507 }
6508 bdsp_prev = bdsp_tmp;
6509 }
6510 SeqIdFree(seqid);
6511 }
6512
6513 return bdsp;
6514 }
6515
6516 /* This function suppose, that gived rdfp is correct - it contains given
6517 sequence number */
6518 static Boolean
readdb_get_defline_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,CharPtr PNTR description,SeqIdPtr PNTR seqidp)6519 readdb_get_defline_ex (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description, SeqIdPtr PNTR seqidp)
6520
6521 {
6522 Char buffer[READDB_BUF_SIZE], id_buf[READDB_BUF_SIZE];
6523 CharPtr buf_ptr;
6524 Int4 new_size, index;
6525 BlastDefLinePtr bdsp;
6526
6527 if(rdfp == NULL)
6528 return FALSE;
6529
6530 SeqLocAsnLoad();
6531
6532 new_size = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]) -
6533 Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6534
6535 if (new_size > READDB_BUF_SIZE){
6536 buf_ptr = (CharPtr)Nlm_Malloc(new_size*sizeof(Char) + 1);
6537 } else {
6538 buf_ptr = &buffer[0];
6539 }
6540
6541 NlmSeekInMFILE(rdfp->headerfp, Nlm_SwapUint4(rdfp->header_index[sequence_number]),
6542 SEEK_SET);
6543 if (NlmReadMFILE((Uint1Ptr) buf_ptr, sizeof(Char), new_size,
6544 rdfp->headerfp) != new_size)
6545 {
6546 if (buf_ptr != &buffer[0])
6547 buf_ptr = (CharPtr)MemFree(buf_ptr);
6548 return FALSE;
6549 }
6550
6551 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6552
6553 bdsp = FDReadDeflineAsn(rdfp, sequence_number);
6554
6555 if(bdsp == NULL) {
6556 ErrPostEx(SEV_ERROR, 0, 0, "readdb_get_defline_ex: "
6557 "Failure to read defline ASN for %d", sequence_number);
6558 if (seqidp) *seqidp = NULL;
6559 if (description) *description = NULL;
6560 if (buf_ptr != &buffer[0])
6561 buf_ptr = (CharPtr)MemFree(buf_ptr);
6562 return FALSE;
6563 }
6564
6565 if(seqidp != NULL) {
6566 *seqidp = SeqIdSetDup(bdsp->seqid);
6567 readdb_adjust_local_id(rdfp, *seqidp);
6568 }
6569
6570 if(description != NULL)
6571 *description = FDBuildOldStyleDefline(rdfp, bdsp);
6572
6573 BlastDefLineSetFree(bdsp);
6574
6575 if (buf_ptr != &buffer[0])
6576 buf_ptr = (CharPtr)MemFree(buf_ptr);
6577
6578 return TRUE;
6579 }
6580
6581
6582 buf_ptr[new_size] = NULLB; /* defline saved w/o NULLB. */
6583
6584 if(seqidp != NULL) { /* SeqId requested separate from descriptor */
6585
6586 for (index=0; index<READDB_BUF_SIZE; index++) {
6587 if (buf_ptr[index] == ' ' || buf_ptr[index] == NULLB) {
6588 id_buf[index] = NULLB;
6589 index++;
6590 break;
6591 }
6592 id_buf[index] = buf_ptr[index];
6593 }
6594
6595 *seqidp = SeqIdParse(id_buf);
6596 readdb_adjust_local_id(rdfp, *seqidp);
6597
6598 if (description != NULL)
6599 *description = StringSave(&buf_ptr[index]);
6600 } else {
6601 if (description != NULL)
6602 *description = StringSave(buf_ptr);
6603 }
6604
6605 if (buf_ptr != &buffer[0])
6606 buf_ptr = (CharPtr)MemFree(buf_ptr);
6607
6608 return TRUE;
6609 }
6610
readdb_get_descriptor(ReadDBFILEPtr rdfp,Int4 sequence_number,SeqIdPtr PNTR id,CharPtr PNTR description)6611 Boolean LIBCALL readdb_get_descriptor (ReadDBFILEPtr rdfp,
6612 Int4 sequence_number,
6613 SeqIdPtr PNTR id,
6614 CharPtr PNTR description)
6615
6616 {
6617 Boolean not_done;
6618 Char id_buf[READDB_BUF_SIZE];
6619 CharPtr defline, new_defline=NULL, tmp_defline;
6620 CommonIndexPtr cigi;
6621 Int4 alias_mask=0, gi;
6622 Int4 defline_length, new_defline_length;
6623 SeqIdPtr bestid, seqid;
6624 Uint2 aliasfilebit=0;
6625 Uint4 header_index;
6626 BlastDefLinePtr bdfp=NULL, bdfp_head=NULL;
6627
6628 rdfp = readdb_get_link(rdfp, sequence_number);
6629 if (rdfp == NULL)
6630 return FALSE;
6631
6632 if (rdfp->oidlist) {
6633 readdb_get_filebits(rdfp, sequence_number, NULL, &aliasfilebit);
6634 }
6635
6636 if (aliasfilebit != 0) {
6637 alias_mask |= (0x1 << aliasfilebit);
6638
6639 *id = NULL;
6640 not_done = TRUE;
6641 header_index = 0;
6642
6643 bdfp = NULL;
6644 if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6645 return readdb_get_defline_ex(rdfp, sequence_number, description, id);
6646 } else {
6647
6648 while (not_done) {
6649
6650 not_done = readdb_get_header(rdfp, sequence_number, &header_index, &seqid, &defline);
6651 if (not_done == FALSE)
6652 break;
6653
6654 bestid = SeqIdFindBest(seqid, SEQID_GI);
6655 gi = bestid->data.intvalue;
6656 cigi = rdfp->cih->ci + gi;
6657 if (alias_mask & SwapUint4(cigi->dbmask)) {
6658 if (*id == NULL) {
6659 *id = seqid;
6660 seqid = NULL;
6661 new_defline = defline;
6662 new_defline_length = StringLen(new_defline);
6663 defline = NULL;
6664 } else {
6665 SeqIdWrite(seqid, id_buf, PRINTID_FASTA_LONG, READDB_BUF_SIZE);
6666 seqid = SeqIdSetFree(seqid);
6667 defline_length = new_defline_length;
6668 new_defline_length += StringLen(defline) + StringLen(id_buf);
6669 new_defline_length += 2;
6670 tmp_defline = MemNew(new_defline_length+1);
6671 MemCpy(tmp_defline, new_defline, defline_length);
6672 sprintf(tmp_defline+defline_length, "%c%s %s", READDB_DEF_SEPARATOR, id_buf, defline);
6673 defline = MemFree(defline);
6674 new_defline = MemFree(new_defline);
6675 new_defline = tmp_defline;
6676 }
6677 } else {
6678 seqid = SeqIdSetFree(seqid);
6679 defline = MemFree(defline);
6680 }
6681 }
6682
6683 if (seqid != NULL)
6684 seqid = SeqIdSetFree(seqid);
6685 if (defline != NULL)
6686 defline = MemFree(defline);
6687
6688 if (description != NULL)
6689 *description = new_defline;
6690 else
6691 new_defline = MemFree(new_defline);
6692 }
6693 } else if (rdfp->gi_target != 0) {
6694 *id = NULL;
6695 not_done = TRUE;
6696 header_index = 0;
6697 new_defline = NULL;
6698
6699 bdfp = NULL;
6700 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6701 bdfp = FDReadDeflineAsn(rdfp, sequence_number);
6702 if(bdfp == NULL) {
6703 ErrPostEx(SEV_ERROR, 0, 0, "readdb_get_descriptor: "
6704 "Failure to read defline ASN for %d", sequence_number);
6705 *id = NULL;
6706 *description = NULL;
6707 return FALSE;
6708 }
6709 bdfp_head = bdfp;
6710 }
6711
6712 while (not_done) {
6713
6714 if(rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6715 seqid = SeqIdSetDup(bdfp->seqid);
6716 defline = StringSave(bdfp->title);
6717 if((bdfp = bdfp->next) == NULL)
6718 not_done = FALSE;
6719 } else {
6720 not_done = readdb_get_header(rdfp, sequence_number, &header_index, &seqid, &defline);
6721 if (not_done == FALSE)
6722 break;
6723 }
6724
6725 bestid = SeqIdFindBest(seqid, SEQID_GI);
6726 gi = bestid->data.intvalue;
6727 if (gi == rdfp->gi_target) {
6728 *id = seqid;
6729 seqid = NULL;
6730 new_defline = defline;
6731 defline = NULL;
6732 } else {
6733 seqid = SeqIdSetFree(seqid);
6734 defline = MemFree(defline);
6735 }
6736 }
6737
6738 if (seqid != NULL)
6739 seqid = SeqIdSetFree(seqid);
6740 if (defline != NULL)
6741 defline = MemFree(defline);
6742
6743 BlastDefLineSetFree(bdfp_head);
6744
6745 if (description != NULL)
6746 *description = new_defline;
6747 else
6748 new_defline = MemFree(new_defline);
6749 } else {
6750 return readdb_get_defline_ex(rdfp, sequence_number, description, id);
6751 }
6752
6753 return TRUE;
6754 }
6755
6756 Boolean
readdb_get_defline(ReadDBFILEPtr rdfp,Int4 sequence_number,CharPtr PNTR description)6757 readdb_get_defline (ReadDBFILEPtr rdfp, Int4 sequence_number, CharPtr PNTR description)
6758 {
6759 rdfp = readdb_get_link(rdfp, sequence_number);
6760
6761 if (rdfp == NULL)
6762 return FALSE;
6763
6764 return readdb_get_defline_ex(rdfp, sequence_number, description, NULL);
6765 }
6766
6767
6768
6769 /*
6770 A single sequence may be attched to several entries (as they all
6771 have the same sequence). This function gets the ID and deflines for
6772 each entry attched to one sequence. On the first call the Uint4
6773 (*header_index) should be zero; it will be filled in by readdb_get_header.
6774 Subsequent calls will use this information to know which ID and
6775 defline to retrieve next. When all are retrieved, FALSE will be returned.
6776 Caller is responsible for deallocating the out-parameters.
6777 */
6778 Boolean LIBCALL
readdb_get_header(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr header_index,SeqIdPtr PNTR id,CharPtr PNTR description)6779 readdb_get_header (ReadDBFILEPtr rdfp, Int4 sequence_number, Uint4Ptr header_index,
6780 SeqIdPtr PNTR id, CharPtr PNTR description)
6781 {
6782 return readdb_get_header_ex (rdfp, sequence_number, header_index, id, description,
6783 NULL, NULL, NULL);
6784 }
6785
6786 /*
6787 * Simple function to copy a linked list of ints
6788 * (shouldn't this go somewhere else?)
6789 *
6790 */
IntValNodeCopy(ValNodePtr src)6791 static ValNodePtr IntValNodeCopy(ValNodePtr src)
6792 {
6793 ValNodePtr retval = NULL;
6794
6795 if (!src)
6796 return NULL;
6797
6798 if ((retval = ValNodeAddInt(NULL,0,src->data.intvalue)) == NULL)
6799 return NULL;
6800
6801 for (src = src->next ; src; src = src->next) {
6802 ValNodeAddInt(&retval,0,src->data.intvalue);
6803 }
6804
6805 return retval;
6806 }
6807
6808 Boolean LIBCALL
readdb_get_header_ex(ReadDBFILEPtr rdfp,Int4 sequence_number,Uint4Ptr header_index,SeqIdPtr PNTR id,CharPtr PNTR description,Int4 PNTR taxid,ValNodePtr PNTR memberships,ValNodePtr PNTR links)6809 readdb_get_header_ex (ReadDBFILEPtr rdfp, Int4 sequence_number,
6810 Uint4Ptr header_index, SeqIdPtr PNTR id,
6811 CharPtr PNTR description, Int4 PNTR taxid,
6812 ValNodePtr PNTR memberships, ValNodePtr PNTR links)
6813
6814 {
6815 Boolean retval = FALSE;
6816 Char id_buf[READDB_BUF_SIZE];
6817 CharPtr buf_ptr, buf_defline_start;
6818 Int4 index, size, i;
6819 Uint4 header_index_end;
6820 BlastDefLinePtr bdlp = NULL;
6821
6822 rdfp = readdb_get_link(rdfp, sequence_number);
6823
6824 if (!rdfp)
6825 return retval;
6826
6827 if (rdfp->formatdb_ver > FORMATDB_VER_TEXT) {
6828
6829 if (*header_index == 0) {
6830 bdlp = FDReadDeflineAsn(rdfp, sequence_number);
6831 if (bdlp == NULL) {
6832 if (id != NULL) *id = NULL;
6833 if (description != NULL) *description = NULL;
6834 if (memberships != NULL) *memberships = NULL;
6835 if (links != NULL) *links = NULL;
6836 return retval;
6837 }
6838 if (rdfp->blast_deflinep)
6839 BlastDefLineSetFree(rdfp->blast_deflinep);
6840 rdfp->blast_deflinep = bdlp; /* cache the BlastDefLinePtr */
6841
6842 } else if (*header_index == UINT4_MAX) {
6843 if (id != NULL) *id = NULL;
6844 if (description != NULL) *description = NULL;
6845 if (memberships != NULL) *memberships = NULL;
6846 if (links != NULL) *links = NULL;
6847 rdfp->blast_deflinep = BlastDefLineSetFree(rdfp->blast_deflinep);
6848 return retval;
6849
6850 } else {
6851 bdlp = rdfp->blast_deflinep;
6852 for (i = 0; i < *header_index; i++) {
6853 if (bdlp == NULL) { /* sanity check */
6854 ErrPostEx(SEV_ERROR,0,0,"There is no BlastDefLinePtr in rdfp!");
6855 return retval;
6856 }
6857 bdlp = bdlp->next;
6858 }
6859 }
6860
6861 /* Assign the values */
6862 if (id != NULL) *id = SeqIdSetDup(bdlp->seqid);
6863 if (description != NULL) *description = StringSave(bdlp->title);
6864 if (taxid != NULL) *taxid = bdlp->taxid;
6865 if (memberships != NULL) *memberships = IntValNodeCopy(bdlp->memberships);
6866 if (links != NULL) *links = IntValNodeCopy(bdlp->links);
6867
6868 /* At the end of the deflines, set *header_index to a sentinel value */
6869 if (bdlp->next == NULL)
6870 *header_index = UINT4_MAX;
6871 else
6872 (*header_index)++;
6873
6874 retval = TRUE;
6875
6876 } else { /* Provide old version for backwards compatibility */
6877
6878 rdfp = readdb_get_link(rdfp, sequence_number);
6879 if (rdfp == NULL)
6880 return FALSE;
6881
6882 if (*header_index == 0)
6883 *header_index = Nlm_SwapUint4(rdfp->header_index[sequence_number]);
6884
6885 header_index_end = Nlm_SwapUint4(rdfp->header_index[sequence_number+1]);
6886
6887 if (*header_index >= header_index_end) {
6888 *header_index = 0;
6889 return FALSE;
6890 }
6891
6892 size = header_index_end-(*header_index);
6893 buf_ptr = MemNew((size+1)*sizeof(Char));
6894
6895 NlmSeekInMFILE(rdfp->headerfp, (long) *header_index, SEEK_SET);
6896 if (NlmReadMFILE((Uint1Ptr) buf_ptr, sizeof(Char), size, rdfp->headerfp) != size)
6897 return FALSE;
6898
6899 for (index=0; index<size; index++) {
6900 if (buf_ptr[index] == ' ') {
6901 id_buf[index] = NULLB;
6902 index++;
6903 break;
6904 }
6905 id_buf[index] = buf_ptr[index];
6906 }
6907 if (id) *id = SeqIdParse(id_buf);
6908
6909 buf_defline_start = &buf_ptr[index];
6910 while (index < size) {
6911 if (buf_ptr[index] == READDB_DEF_SEPARATOR) {
6912 break;
6913 }
6914 index++;
6915 }
6916 buf_ptr[index] = NULLB;
6917 index++;
6918 if (description != NULL) {
6919 *description = StringSave(buf_defline_start);
6920 }
6921 buf_ptr = MemFree(buf_ptr);
6922 *header_index += index;
6923
6924 retval = TRUE;
6925 }
6926
6927 return retval;
6928 }
6929
6930 /*
6931 Obtains the total database length from the ReadDBFILE structure.
6932 */
6933 Int8 LIBCALL
readdb_get_dblen(ReadDBFILEPtr rdfp)6934 readdb_get_dblen (ReadDBFILEPtr rdfp)
6935
6936 {
6937 if (rdfp == NULL)
6938 return 0;
6939
6940 return rdfp->totlen;
6941 }
6942
6943 /*
6944 Obtains the total number of database sequences from all the ReadDBFILE structures.
6945 */
6946 Int4 LIBCALL
readdb_get_num_entries_total(ReadDBFILEPtr rdfp)6947 readdb_get_num_entries_total (ReadDBFILEPtr rdfp)
6948
6949 {
6950 Int4 total=0;
6951 if (rdfp == NULL)
6952 return 0;
6953
6954 while (rdfp) {
6955 total += rdfp->num_seqs;
6956 rdfp = rdfp->next;
6957 }
6958 return total;
6959 }
6960
6961 /*
6962 Obtains the total number of real database sequences from all the ReadDBFILE structures.
6963 */
6964 Int4 LIBCALL
readdb_get_num_entries_total_real(ReadDBFILEPtr rdfp)6965 readdb_get_num_entries_total_real (ReadDBFILEPtr rdfp)
6966
6967 {
6968 Int4 total=0;
6969 if (rdfp == NULL)
6970 return 0;
6971
6972 while (rdfp && !rdfp->oidlist)
6973 {
6974 total += rdfp->num_seqs;
6975 rdfp = rdfp->next;
6976 }
6977 return total;
6978 }
6979
6980 /*
6981 Obtains the number of database sequences from the ReadDBFILE structure.
6982 */
6983 Int4 LIBCALL
readdb_get_num_entries(ReadDBFILEPtr rdfp)6984 readdb_get_num_entries (ReadDBFILEPtr rdfp)
6985
6986 {
6987 if (rdfp == NULL)
6988 return 0;
6989
6990 return rdfp->num_seqs;
6991 }
6992
6993 /*
6994 Obtains the length of the longest database seq from the ReadDBFILE structure.
6995 */
6996 Int4 LIBCALL
readdb_get_maxlen(ReadDBFILEPtr rdfp)6997 readdb_get_maxlen (ReadDBFILEPtr rdfp)
6998
6999 {
7000 if (rdfp == NULL)
7001 return 0;
7002
7003 return rdfp->maxlen;
7004 }
7005
7006 /*
7007 Obtains the title of the database. Note that the return CharPtr is not
7008 owned by the caller. It should be copied if the user wishes to modify it.
7009 */
7010 CharPtr LIBCALL
readdb_get_filename(ReadDBFILEPtr rdfp)7011 readdb_get_filename (ReadDBFILEPtr rdfp)
7012
7013 {
7014 if (rdfp == NULL)
7015 return NULL;
7016
7017 if (rdfp->aliasfilename)
7018 return rdfp->aliasfilename;
7019
7020 return rdfp->filename;
7021 }
7022
7023 /*
7024 Obtains the title of the database. Note that the return CharPtr is not
7025 owned by the caller. It should be copied if the user wishes to modify it.
7026 */
7027 CharPtr LIBCALL
readdb_get_full_filename(ReadDBFILEPtr rdfp)7028 readdb_get_full_filename (ReadDBFILEPtr rdfp)
7029
7030 {
7031 char* retval = NULL;
7032
7033 if (rdfp == NULL)
7034 return NULL;
7035
7036 if (!rdfp->aliasfilename)
7037 retval = StringSave(rdfp->filename);
7038 else {
7039 char* path = Nlm_FilePathFind(rdfp->filename);
7040 char buffer[PATH_MAX];
7041 sprintf(buffer, "%s/%s", path, rdfp->aliasfilename);
7042 retval = StringSave(buffer);
7043 }
7044 return retval;
7045 }
7046
7047 /*
7048 Obtains the title of the database. Note that the return CharPtr is not
7049 owned by the caller. It should be copied if the user wishes to modify it.
7050 */
7051 CharPtr LIBCALL
readdb_get_title(ReadDBFILEPtr rdfp)7052 readdb_get_title (ReadDBFILEPtr rdfp)
7053
7054 {
7055 if (rdfp == NULL)
7056 return NULL;
7057
7058 if (rdfp->title)
7059 return rdfp->title;
7060
7061 /* return the file-name if no title found. */
7062
7063 return NULL;
7064 /* return readdb_get_filename(rdfp); */
7065 }
7066
7067 /*
7068 Obtains the date and time the database was formatted with formatdb.
7069 Note that the return CharPtr is not owned by the caller. It should
7070 be copied if the user wishes to modify it.
7071 */
7072 CharPtr LIBCALL
readdb_get_date(ReadDBFILEPtr rdfp)7073 readdb_get_date (ReadDBFILEPtr rdfp)
7074
7075 {
7076 if (rdfp == NULL)
7077 return NULL;
7078
7079 return rdfp->date;
7080 }
7081
7082 /*
7083 Queries readdb whether the sequence is protein.
7084 */
7085 Boolean LIBCALL
readdb_is_prot(ReadDBFILEPtr rdfp)7086 readdb_is_prot (ReadDBFILEPtr rdfp)
7087
7088 {
7089 if (rdfp == NULL)
7090 return FALSE;
7091
7092 return /*rdfp->is_prot*/(Boolean) (rdfp->parameters & READDB_IS_PROT);
7093 }
7094
7095 /*
7096 Obtains the formatdb version used to format the database.
7097 */
7098 Int4 LIBCALL
readdb_get_formatdb_version(ReadDBFILEPtr rdfp)7099 readdb_get_formatdb_version (ReadDBFILEPtr rdfp)
7100
7101 {
7102 if (rdfp == NULL)
7103 return 0;
7104
7105 return rdfp->formatdb_ver;
7106 }
7107
7108 /*
7109 Translates a SeqIdPtr to an ordinal ID, used by the BLAST database.
7110 If the SeqIdPtr cannot be translated, a negative number is returned.
7111 All valid ordinal numbers are >= 0.
7112 */
7113
SeqId2OrdinalId(ReadDBFILEPtr rdfp,SeqIdPtr sip)7114 Int4 SeqId2OrdinalId(ReadDBFILEPtr rdfp, SeqIdPtr sip)
7115
7116 {
7117 DbtagPtr dbtagptr;
7118 Int4 ordinal_id;
7119
7120 if (rdfp == NULL || sip == NULL)
7121 return -2;
7122
7123 switch (sip->choice)
7124 {
7125 case SEQID_GI:
7126 ordinal_id = readdb_gi2seq(rdfp, sip->data.intvalue, NULL);
7127 break;
7128
7129 case SEQID_GENERAL:
7130 dbtagptr = (DbtagPtr) sip->data.ptrvalue;
7131 if (dbtagptr == NULL)
7132 return OM_MSG_RET_OK;
7133 if (StringCmp(dbtagptr->db, "BL_ORD_ID") == 0)
7134 {
7135 ordinal_id = dbtagptr->tag->id;
7136 break;
7137 }
7138 /* Fall through to default if not "BL_ORD_ID" */
7139 default:
7140 ordinal_id = readdb_seqid2fasta(rdfp, sip);
7141 break;
7142 }
7143
7144 return ordinal_id;
7145 }
7146 /*************************************************************************
7147
7148 Inits the ReadDBFILEPtr for the BioseqFetch functions.
7149
7150 **************************************************************************/
7151
7152 static Boolean
ReadDBInit(ReadDBFetchStructPtr rdfsp)7153 ReadDBInit(ReadDBFetchStructPtr rdfsp)
7154 {
7155
7156 rdfsp->rdfp = readdb_new_ex2(rdfsp->dbname, rdfsp->is_prot,
7157 READDB_NEW_INDEX | READDB_NEW_DO_TAXDB, NULL, NULL);
7158 taxonomyDbLoaded = FALSE; /* If object manager loads tax dbs, don't block
7159 application from loading it again */
7160
7161 if (rdfsp->rdfp != NULL)
7162 return TRUE;
7163 else
7164 return FALSE;
7165 }
7166
7167 /*
7168 Checks the chain of ReadDBFetchStructPtr's for one
7169 which belongs to the calling thread. If none is found,
7170 NULL isreturned; otherwise the ReadDBFetchStructPtr is
7171 returned.
7172 */
7173 static ReadDBFetchStructPtr
ReadDBFindFetchStruct(ReadDBFetchStructPtr rdfp)7174 ReadDBFindFetchStruct(ReadDBFetchStructPtr rdfp)
7175
7176 {
7177
7178 if (rdfp == NULL)
7179 return NULL;
7180
7181 while (rdfp)
7182 {
7183 if (NlmThreadCompare(rdfp->thread_id, NlmThreadSelf()) == TRUE)
7184 break;
7185 rdfp = rdfp->next;
7186 }
7187 return rdfp;
7188 }
7189
7190 /*
7191 Initializes the ReadDBFetchStructPtr and adds onto end of
7192 chain of ReadDBFetchStructPtr (head). The new ReadDBFetchStructPtr
7193 is returned.
7194 */
7195 static ReadDBFetchStructPtr
ReadDBFetchStructNew(ReadDBFetchStructPtr head,CharPtr dbname,Boolean is_na)7196 ReadDBFetchStructNew(ReadDBFetchStructPtr head, CharPtr dbname, Boolean is_na)
7197
7198 {
7199 ReadDBFetchStructPtr rdfsp, rdfsp_var;
7200
7201
7202 rdfsp = (ReadDBFetchStructPtr) MemNew(sizeof(ReadDBFetchStruct));
7203 rdfsp->dbname = StringSave(dbname);
7204 rdfsp->is_prot = (is_na == TRUE) ? FALSE : TRUE;
7205 rdfsp->thread_id = NlmThreadSelf();
7206
7207 if (head != NULL)
7208 {
7209 rdfsp_var = head;
7210 while (rdfsp_var->next)
7211 rdfsp_var = rdfsp_var->next;
7212 rdfsp_var->next = rdfsp;
7213 }
7214
7215 return rdfsp;
7216 }
7217
7218 /****************************************************************
7219 *
7220 * ReadDBFetchFreeFunc
7221 * Frees ReadDBFetchUserData.
7222 *
7223 ****************************************************************/
7224
ReadDBFetchFreeFunc(Pointer ptr)7225 static Pointer LIBCALLBACK ReadDBFetchFreeFunc (Pointer ptr)
7226 {
7227 ReadDBFetchUserDataPtr userdata;
7228
7229 userdata = (ReadDBFetchUserDataPtr) ptr;
7230 return MemFree(userdata);
7231 }
7232
7233
7234
7235 /**********************************************************************
7236
7237 Fetches the Bioseq, based on the ordinal number of the
7238 sequence in the database.
7239
7240 ************************************************************************/
7241
ReadDBBioseqFetchFunc(Pointer data)7242 static Int2 LIBCALLBACK ReadDBBioseqFetchFunc(Pointer data)
7243 {
7244 BioseqPtr bsp, core_bsp;
7245 Boolean status;
7246 Int4 ordinal_id;
7247 OMProcControlPtr ompcp;
7248 ObjMgrProcPtr ompp;
7249 OMUserDataPtr omdp;
7250 ReadDBFetchStructPtr rdfsp;
7251 ReadDBFILEPtr rdfp=NULL;
7252 ReadDBFetchUserDataPtr userdata;
7253 SeqIdPtr sip, best_id;
7254 SeqEntryPtr sep;
7255
7256 ordinal_id = -1;
7257
7258 ompcp = (OMProcControlPtr)data;
7259 ompp = ompcp->proc;
7260
7261 rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7262
7263 if (rdfsp == NULL)
7264 {
7265 return OM_MSG_RET_OK;
7266 }
7267
7268 if (rdfsp->ReadDBFetchState == READDBBF_DISABLE)
7269 {
7270 return OM_MSG_RET_OK;
7271 }
7272
7273 if (rdfsp->ReadDBFetchState == READDBBF_INIT)
7274 {
7275 status = ReadDBInit(rdfsp);
7276 if (status == FALSE)
7277 return OM_MSG_RET_OK;
7278 rdfsp->ReadDBFetchState = READDBBF_READY;
7279 }
7280
7281 if (ordinal_id < 0 || rdfp == NULL)
7282 {
7283 sip = (SeqIdPtr) (ompcp->input_data);
7284
7285 best_id = SeqIdFindBest(sip, SEQID_GI);
7286
7287 if (best_id == NULL)
7288 {
7289 core_bsp = BioseqFindCore(sip);
7290 if (core_bsp)
7291 best_id = SeqIdFindBest(core_bsp->id, SEQID_GI);
7292 }
7293
7294 if (best_id == NULL)
7295 return OM_MSG_RET_OK;
7296
7297 rdfp = rdfsp->rdfp;
7298 ordinal_id = SeqId2OrdinalId(rdfp, best_id);
7299 if (ordinal_id >= 0) {
7300 rdfp->preferred_gi = best_id->data.intvalue;
7301 }
7302 }
7303
7304 /* ordinal_id's start at zero. */
7305 if (ordinal_id < 0)
7306 return OM_MSG_RET_OK;
7307
7308 /* A BioseqPtr is returned by this function. */
7309 bsp = readdb_get_bioseq(rdfp, ordinal_id);
7310
7311 /* Reset the preferred_gi */
7312 rdfp->preferred_gi = 0;
7313
7314 /* We have to add information about genetic code to
7315 the Bioseq */
7316
7317 if(rdfsp->db_genetic_code > 1) {
7318 BioSourcePtr source;
7319 source = BioSourceNew();
7320 source->org = OrgRefNew();
7321 source->org->orgname = OrgNameNew();
7322 source->org->orgname->gcode = rdfsp->db_genetic_code;
7323 SeqDescrAddPointer(&(bsp->descr), Seq_descr_source, source);
7324 }
7325
7326 sep = SeqEntryNew();
7327 sep->choice = 1;
7328 sep->data.ptrvalue = bsp;
7329 SeqMgrSeqEntry(SM_BIOSEQ, (Pointer)bsp, sep);
7330 ompcp->output_data = (Pointer)bsp;
7331 ompcp->output_entityID = ObjMgrGetEntityIDForChoice(sep);
7332 omdp = ObjMgrAddUserData(ompcp->output_entityID, ompp->procid, OMPROC_FETCH, 0);
7333 userdata = (ReadDBFetchUserDataPtr) MemNew(sizeof(ReadDBFetchUserData));
7334 omdp->userdata.ptrvalue = userdata;
7335 userdata->ordinal_number = ordinal_id;
7336 userdata->db_id = ReadDBGetDbId(rdfsp->rdfp, rdfp);
7337 omdp->freefunc = ReadDBFetchFreeFunc;
7338
7339 return OM_MSG_RET_DONE;
7340 }
7341
ReadDBBioseqSetDbGeneticCode(Int4 db_genetic_code)7342 Boolean LIBCALL ReadDBBioseqSetDbGeneticCode(Int4 db_genetic_code)
7343 {
7344 ReadDBFetchStructPtr rdfsp;
7345 ObjMgrPtr omp;
7346 ObjMgrProcPtr ompp;
7347
7348 omp = ObjMgrGet();
7349 ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7350 if (ompp != NULL) { /* already initialized */
7351 rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7352 rdfsp->db_genetic_code = db_genetic_code;
7353 return FALSE;
7354 }
7355 return TRUE;
7356 }
7357
7358 /*********************************************************************
7359
7360 Enables the fetching. Initializes needed structures and calls
7361 ReadDBInit.
7362
7363 **********************************************************************/
7364 Boolean LIBCALL
ReadDBBioseqFetchEnable(CharPtr program,CharPtr dbname,Boolean is_na,Boolean now)7365 ReadDBBioseqFetchEnable(CharPtr program, CharPtr dbname, Boolean is_na, Boolean now)
7366
7367 {
7368 Boolean result;
7369 ReadDBFetchStructPtr rdfsp;
7370 ObjMgrPtr omp;
7371 ObjMgrProcPtr ompp;
7372 static TNlmMutex enable_lock = NULL;
7373 /* check if already enabled ***/
7374
7375 NlmMutexInit(&enable_lock);
7376 NlmMutexLock(enable_lock);
7377
7378 omp = ObjMgrGet();
7379 ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7380 if (ompp != NULL) { /* already initialized */
7381 rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7382
7383 if(rdfsp == NULL) { /* Another thread */
7384 rdfsp = ReadDBFetchStructNew((ReadDBFetchStructPtr)(ompp->procdata), dbname, is_na);
7385 } else {
7386 if (rdfsp->is_prot == is_na || StringCmp(rdfsp->dbname, dbname)) {
7387 rdfsp->is_prot = (is_na == TRUE) ? FALSE : TRUE;
7388 rdfsp->dbname = MemFree(rdfsp->dbname);
7389 rdfsp->dbname = StringSave(dbname);
7390 }
7391 }
7392 } else { /* New element is not registered with ObjMgr */
7393 rdfsp = ReadDBFetchStructNew(NULL, dbname, is_na);
7394 ObjMgrProcLoad(OMPROC_FETCH, "ReadDBBioseqFetch", "ReadDBBioseqFetch", OBJ_SEQID, 0,OBJ_BIOSEQ,0,
7395 (Pointer)rdfsp, ReadDBBioseqFetchFunc, PROC_PRIORITY_DEFAULT);
7396 rdfsp->ReadDBFetchState = READDBBF_INIT;
7397 }
7398
7399 rdfsp->ctr++; /* count number of enables */
7400
7401 NlmMutexUnlock(enable_lock);
7402
7403 if (rdfsp->ReadDBFetchState == READDBBF_READY) {
7404 return TRUE;
7405 }
7406
7407 if (now) {
7408 result = ReadDBInit(rdfsp);
7409 if (! result) {
7410 return result;
7411 }
7412 rdfsp->ReadDBFetchState = READDBBF_READY;
7413 } else {
7414 rdfsp->ReadDBFetchState = READDBBF_INIT;
7415 }
7416
7417 return TRUE;
7418 }
7419
7420 /*****************************************************************************
7421 *
7422 * ReadDBBioseqFetchDisable()
7423 *
7424 * Calls readdb_destruct if necessary to deallocate resources.
7425 *
7426 *****************************************************************************/
ReadDBBioseqFetchDisable(void)7427 void LIBCALL ReadDBBioseqFetchDisable(void)
7428 {
7429 ObjMgrPtr omp;
7430 ObjMgrProcPtr ompp;
7431 ReadDBFetchStructPtr rdfsp;
7432
7433 omp = ObjMgrGet();
7434 ompp = ObjMgrProcFind(omp, 0, "ReadDBBioseqFetch", OMPROC_FETCH);
7435 if (ompp == NULL) /* not initialized */
7436 return;
7437
7438 rdfsp = ReadDBFindFetchStruct((ReadDBFetchStructPtr)(ompp->procdata));
7439 if (! rdfsp->ctr) /* no enables active */
7440 return;
7441
7442 rdfsp->ctr--;
7443 if (rdfsp->ctr) /* connection still pending */
7444 return;
7445
7446 if (rdfsp->ReadDBFetchState == READDBBF_READY)
7447 {
7448 rdfsp->ReadDBFetchState = READDBBF_DISABLE; /* not active */
7449 rdfsp->rdfp = readdb_destruct(rdfsp->rdfp);
7450 }
7451
7452 return;
7453 }
7454
7455 /*
7456 Returns the ReadDBFILEPtr by the database ID.
7457 NULL is returned on error.
7458 */
7459
7460 ReadDBFILEPtr
ReadDBGetDb(ReadDBFILEPtr rdfp_list,Int2 db_id)7461 ReadDBGetDb (ReadDBFILEPtr rdfp_list, Int2 db_id)
7462
7463 {
7464 Int2 index=0;
7465
7466 while (rdfp_list)
7467 {
7468 if (index == db_id)
7469 {
7470 return rdfp_list;
7471 }
7472 rdfp_list = rdfp_list->next;
7473 index++;
7474 }
7475 return NULL;
7476 }
7477
7478 /*
7479 Returns the Database ID.
7480 -1 is returned on error.
7481 */
7482
7483 Int2
ReadDBGetDbId(ReadDBFILEPtr list,ReadDBFILEPtr target)7484 ReadDBGetDbId (ReadDBFILEPtr list, ReadDBFILEPtr target)
7485
7486 {
7487 Int2 index=0;
7488
7489 while (list)
7490 {
7491 if (readdb_compare(list, target) == TRUE)
7492 return index;
7493 list = list->next;
7494 index++;
7495 }
7496 return -1;
7497 }
7498
7499 /*
7500 Formatting functions for databases formatted by formatdb.
7501 */
7502 Boolean LIBCALL
PrintDbInformationBasicEx(Boolean is_aa,Int4 line_length,CharPtr definition,Int4 number_seqs,Int8 total_length,FILE * outfp,Boolean html,Boolean with_links)7503 PrintDbInformationBasicEx (Boolean is_aa, Int4 line_length,
7504 CharPtr definition, Int4 number_seqs,
7505 Int8 total_length, FILE *outfp, Boolean html,
7506 Boolean with_links)
7507 {
7508 if (html && with_links) {
7509 fprintf(outfp, "<b>Database:</b> %s", definition);
7510 asn2ff_set_output(outfp, NULL);
7511 ff_StartPrint(0, 0, line_length, NULL);
7512 } else {
7513 asn2ff_set_output(outfp, NULL);
7514
7515 ff_StartPrint(0, 0, line_length, NULL);
7516 if (html)
7517 ff_AddString("<b>Database:</b> ");
7518 else
7519 ff_AddString("Database: ");
7520 ff_AddString(definition);
7521 }
7522 NewContLine();
7523 TabToColumn(12);
7524 ff_AddString(Ltostr((long) number_seqs, 1));
7525 ff_AddString(" sequences; ");
7526 ff_AddString(Nlm_Int8tostr(total_length, 1));
7527 ff_AddString(" total letters");
7528 NewContLine();
7529 ff_EndPrint();
7530
7531 return TRUE;
7532 }
7533
7534 Boolean LIBCALL
PrintDbInformationBasic(CharPtr database,Boolean is_aa,Int4 line_length,CharPtr definition,Int4 number_seqs,Int8 total_length,FILE * outfp,Boolean html)7535 PrintDbInformationBasic (CharPtr database, Boolean is_aa, Int4 line_length,
7536 CharPtr definition, Int4 number_seqs, Int8
7537 total_length, FILE *outfp, Boolean html)
7538 {
7539 return PrintDbInformationBasicEx(is_aa, line_length, definition,
7540 number_seqs, total_length, outfp, html,
7541 FALSE);
7542 }
7543
7544 /*
7545 Print a summary of the database(s) used.
7546 */
7547
7548 Boolean LIBCALL
PrintDbInformationWithRID(CharPtr database,Boolean is_aa,Int4 line_length,FILE * outfp,Boolean html,CharPtr rid,Boolean query_is_aa)7549 PrintDbInformationWithRID(CharPtr database, Boolean is_aa, Int4 line_length,
7550 FILE *outfp, Boolean html, CharPtr rid, Boolean query_is_aa)
7551 {
7552 CharPtr definition, ptr, chptr;
7553 Int8 total_length;
7554 Int4 number_seqs, length, real_length, avail_length, shift;
7555 ReadDBFILEPtr rdfp, rdfp_var, rdfp_tmp;
7556 Boolean first_title;
7557 Char next_title[1024];
7558 Boolean with_links = FALSE;
7559 Int2 tmp_len;
7560
7561 if (database == NULL || outfp == NULL)
7562 return FALSE;
7563
7564 if (is_aa == TRUE)
7565 rdfp = readdb_new_ex2(database, READDB_DB_IS_PROT,
7566 READDB_NEW_DO_REPORT, NULL, NULL);
7567 else
7568 rdfp = readdb_new_ex2(database, READDB_DB_IS_NUC,
7569 READDB_NEW_DO_REPORT, NULL, NULL);
7570
7571 if (rdfp == FALSE)
7572 return FALSE;
7573
7574 length = 4096; /* Initial length, may be increased. */
7575 definition = MemNew(length*sizeof(Char));
7576 ptr = definition;
7577 rdfp_var = rdfp;
7578
7579 real_length = 0;
7580 avail_length = length;
7581 first_title = TRUE;
7582 while (rdfp_var) {
7583 chptr = readdb_get_title(rdfp_var);
7584
7585 if(chptr == NULL) {
7586 rdfp_var = rdfp_var->next;
7587 continue;
7588 }
7589
7590 if (rid && html && rdfp_var->aliasfilename && atoi(rdfp_var->aliasfilename) != 0) {
7591 if (query_is_aa && !StrNCmp(chptr, "Completed", 9)) {
7592 sprintf(next_title,
7593 "<a href=http://www.ncbi.nlm.nih.gov/sutils/genomeRID.cgi?"
7594 "taxid=%s&RID=%s>%s</a>; \n",
7595 rdfp_var->aliasfilename, rid, chptr);
7596 with_links = TRUE;
7597 } else {
7598 sprintf(next_title, "%s", chptr);
7599
7600 tmp_len = StrLen(next_title);
7601 /* are there more titles to concatenate? */
7602 for (rdfp_tmp = rdfp_var->next; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
7603 if (rdfp_tmp->title != NULL) {
7604 next_title[tmp_len++] = ';';
7605 break;
7606 }
7607 }
7608
7609 if (!first_title && rdfp_var->next != NULL) {
7610 /*next_title[tmp_len++] = ';';*/
7611 next_title[tmp_len++] = ' ';
7612 next_title[tmp_len++] = '\n';
7613 next_title[tmp_len++] = NULLB;
7614 } else {
7615 /*if (rdfp_var->next != NULL)
7616 next_title[tmp_len++] = ';';*/
7617 next_title[tmp_len++] = ' ';
7618 next_title[tmp_len++] = NULLB;
7619 first_title = FALSE;
7620 }
7621 }
7622 real_length += StrLen(next_title) + 4;
7623 /* We print these as keep-alive messages for this specific use. */
7624 fprintf(outfp, "%s", " ");
7625 fflush(outfp);
7626 } else {
7627 real_length += StrLen(chptr) + 3;
7628 sprintf(next_title, "%s", chptr);
7629
7630 tmp_len = StrLen(next_title);
7631
7632 /* are there more titles to concatenate? */
7633 for (rdfp_tmp = rdfp_var->next; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
7634 if (rdfp_tmp->title != NULL) {
7635 next_title[tmp_len++] = ';';
7636 break;
7637 }
7638 }
7639
7640 if (!first_title) {
7641 next_title[tmp_len++] = ' ';
7642 next_title[tmp_len++] = NULLB;
7643 } else {
7644 next_title[tmp_len++] = ' ';
7645 next_title[tmp_len++] = NULLB;
7646 first_title = FALSE;
7647 }
7648
7649 }
7650
7651 if (real_length > avail_length) {
7652 shift = ptr - definition;
7653 definition = Realloc(definition, 2*real_length);
7654 avail_length = 2*real_length;
7655 ptr = definition + shift;
7656 }
7657 StringCpy(ptr, next_title);
7658
7659 length = StringLen(ptr);
7660 ptr += length;
7661
7662 rdfp_var = rdfp_var->next;
7663 }
7664
7665 *ptr = NULLB;
7666 readdb_get_totals_ex(rdfp, &(total_length), &(number_seqs), TRUE);
7667
7668 rdfp = readdb_destruct(rdfp);
7669 if (rid && html)
7670 {
7671 fprintf(outfp, "%s", "\n");
7672 fflush(outfp);
7673 }
7674
7675 PrintDbInformationBasicEx (is_aa, line_length, definition,
7676 number_seqs, total_length, outfp,
7677 html, with_links);
7678
7679 definition = MemFree(definition);
7680
7681 return TRUE;
7682 }
7683
7684 Boolean LIBCALL
PrintDbInformation(CharPtr database,Boolean is_aa,Int4 line_length,FILE * outfp,Boolean html)7685 PrintDbInformation(CharPtr database, Boolean is_aa, Int4 line_length, FILE *outfp, Boolean html)
7686 {
7687 return PrintDbInformationWithRID(database, is_aa, line_length,
7688 outfp, html, NULL, FALSE);
7689 }
7690
/** Common Index Stuff **/

/* Parse DB configuration file */

/* Size of the line and field buffers used while reading the
   configuration file (see getLine, parseString, ParseDBConfigFile). */
#define MAX_LINE_LENGTH 1024

/* Kinds of token in a configuration file.
   NOTE(review): not referenced by the parsing helpers that follow --
   confirm whether it is still used elsewhere. */
typedef enum {
    lexIGNORE,
    lexINT,
    lexSTRING,
    lexBOOL,
    lexEOF
} LexTokens;
7704
/*
    Reads lines from fp into buf (at most MAX_LINE_LENGTH characters each)
    until one is found that is neither empty nor starts with '#' or a
    newline.

    Returns buf on success.  At end of file or on a read error, buf is set
    to the empty string and NULL is returned, instead of reading forever.
*/
static CharPtr getLine (FILE *fp, CharPtr buf)
{
    if (fp == NULL || buf == NULL)
        return NULL;

    do {
        if (FileGets(buf, MAX_LINE_LENGTH, fp) == NULL) {
            buf[0] = '\0';
            return NULL;
        }
    } while ((buf[0] == '#') || (buf[0] == '\0') || (buf[0] == '\n'));

    return buf;
}
7713
parseInt(CharPtr buf)7714 static Int4 parseInt(CharPtr buf)
7715 {
7716 Int4 retval;
7717 long my_long;
7718
7719 sscanf(buf, "%ld", &my_long);
7720 retval = my_long;
7721
7722 return retval;
7723 }
7724
/*
    Returns a newly allocated copy of the first whitespace-delimited token
    in buf; the caller frees it with MemFree.  The result is the empty
    string when buf is NULL or contains no token, and NULL when memory
    cannot be obtained.
*/
static CharPtr parseString(CharPtr buf)
{
    size_t capacity = MAX_LINE_LENGTH;
    CharPtr retval;

    /* A token can never be longer than its input, so make sure the buffer
       is at least that large; "%s" itself is unbounded. */
    if (buf != NULL && StringLen(buf) + 1 > capacity)
        capacity = StringLen(buf) + 1;

    retval = MemNew(sizeof(Char) * capacity);
    if (retval == NULL)
        return NULL;

    if (buf == NULL || sscanf(buf, "%s", retval) != 1)
        retval[0] = NULLB;

    return retval;
}
7732
parseBool(CharPtr buf)7733 static Boolean parseBool(CharPtr buf)
7734 {
7735 Boolean retval;
7736 CharPtr str = parseString(buf);
7737
7738 if ((!StrCmp(str, "true")) || (!StrCmp(str, "True")) || (!StrCmp(str, "TRUE")) ||
7739 (!StrCmp(str, "t")) || (!StrCmp(str, "T")) ||
7740 (!StrCmp(str, "1")) ||
7741 (!StrCmp(str, "y")) || (!StrCmp(str, "Y")))
7742 retval = TRUE;
7743 else
7744 retval = FALSE;
7745
7746 str = MemFree(str);
7747 return retval;
7748 }
7749
ParseDBConfigFile(DataBaseIDPtr * dbidsp,CharPtr path)7750 Int2 ParseDBConfigFile(DataBaseIDPtr *dbidsp, CharPtr path)
7751 {
7752 Int2 number_of_DBs = 0, i;
7753 FILE *fp;
7754 DataBaseIDPtr retval;
7755 Char buf[MAX_LINE_LENGTH], name[MAX_LINE_LENGTH];
7756 Char dbid[MAX_LINE_LENGTH], isprot[MAX_LINE_LENGTH];
7757 Char full_filename[PATH_MAX];
7758
7759 /* open config file */
7760 if (path && StrCmp(path, "")) {
7761 sprintf(full_filename, "%s%s%s", path, DIRDELIMSTR, DB_CONFIG_FN);
7762 } else {
7763 sprintf(full_filename, "%s", DB_CONFIG_FN);
7764 }
7765
7766 if (!(fp = FileOpen(full_filename, "r")))
7767 return 0;
7768
7769 getLine(fp, buf);
7770
7771 /* first line is number of databases */
7772 number_of_DBs = parseInt(buf);
7773
7774 /* allocate that much memory */
7775 retval = (DataBaseIDPtr) MemNew(sizeof(DataBaseID) * number_of_DBs);
7776
7777 /* each next line is contains name, id and type of a DB */
7778 for (i=0; i < number_of_DBs; i++) {
7779 getLine(fp, buf);
7780 sscanf(buf, "%s%s%s", name, dbid, isprot);
7781 (retval+i)->name = parseString(name);
7782 (retval+i)->id = parseInt(dbid);
7783 (retval+i)->isprot = parseBool(isprot);
7784 }
7785
7786 FILECLOSE(fp);
7787 *dbidsp = retval;
7788 return number_of_DBs;
7789 }
7790
/* ---------------------------------------------------------------------*/
/* --------- Here is the set of functions used by formatdb ------------ */
/* ---------------------------------------------------------------------*/

#define STRLENGTH 4096
#define INDEX_INIT_SIZE 1024
#define INDEX_ARRAY_CHUNKS 100000

/* LOOKUP_CHUNK is the number of table entries FASTALookupNew allocates
   initially. */
#define LOOKUP_CHUNK 5
#define LOOKUP_SIZE 12
#define LOOKUP_ID_SIZE 8

/* FORMATDB_SIZE is the byte size of the values written by
   FormatDbUint4Write. */
#define FORMATDB_SIZE 4
#define ID_MAX_SIZE 64

/* NOTE(review): presumably lookup status codes -- confirm against users. */
#define LOOKUP_NO_ERROR 0
#define ERR_GI_FAILED 1
#define ERR_SEQID_FAILED 2

#define NON_SEQID_PREFIX "gnl|BL_ORD_ID|"
#define CREATE_DEFLINE_INDEX 1

/* NOTE(review): the values are distinct bits (1, 2, 4), presumably meant
   to be OR-ed together -- confirm against users. */
#define SEQID_FIELD 1
#define ACCN_FIELD 2
#define DEFLINE_FIELD 4
7816 /* Size of variable that is manipulated, and swapped
7817 for big/little endian stuff. */
7818
7819 static Boolean
FormatDbUint4Write(Uint4 number,FILE * fp)7820 FormatDbUint4Write(Uint4 number, FILE *fp)
7821
7822 {
7823 Uint4 value;
7824
7825 /* If FORMATDB_SIZE changes, this must be changed. */
7826 value = Nlm_SwapUint4(number);
7827 if (FileWrite(&(value), FORMATDB_SIZE, 1, fp) != (Uint4) 1)
7828 return FALSE;
7829
7830 return TRUE;
7831 }
7832
7833
7834 static Boolean
FormatDbUint8Write(Uint8 value,FILE * fp)7835 FormatDbUint8Write(Uint8 value, FILE *fp)
7836 {
7837 Uint1Ptr bytes;
7838
7839 if((bytes = Uint8ToBytes(value)) == NULL)
7840 return FALSE;
7841
7842 if(FileWrite(bytes, 8, 1, fp) != (Uint4) 1) {
7843 MemFree(bytes);
7844 return FALSE;
7845 }
7846
7847 MemFree(bytes);
7848 return TRUE;
7849 }
7850
7851 static Int8
FormatDbUint8Read(NlmMFILEPtr mfp)7852 FormatDbUint8Read(NlmMFILEPtr mfp)
7853 {
7854 Int8 value;
7855 Uint1 bytes[8];
7856
7857 NlmReadMFILE((Uint1Ptr) bytes, 8, 1, mfp);
7858
7859 value = (Int8) BytesToUint8(bytes);
7860
7861 return value;
7862 }
7863
FASTALookupNew(void)7864 static FASTALookupPtr FASTALookupNew(void) {
7865 FASTALookupPtr lookup;
7866
7867 if((lookup = (FASTALookupPtr)MemNew(sizeof(FASTALookup))) == NULL)
7868 return NULL;
7869 if((lookup->table = (Int4Ptr)MemNew(LOOKUP_CHUNK*4)) == NULL)
7870 return NULL;
7871
7872 lookup->allocated = LOOKUP_CHUNK;
7873 lookup->used = 0;
7874 return lookup;
7875 }
/*
    Releases a FASTALookup and its table.  A NULL lookup is ignored.
*/
static void FASTALookupFree(FASTALookupPtr lookup)
{
    if (lookup == NULL)
        return;

    MemFree(lookup->table);
    MemFree(lookup);
}
7881
7882 /* ---------------------------------------------------------------------*/
7883 /* - Here is set of functions for creation of taxonomy info database -- */
7884 /* ---------------------------------------------------------------------*/
7885
7886
7887
7888 /*******************************************************************************
7889 * Initializing FormatDB structure (see formatdb.h),
7890 *******************************************************************************
7891 * Parameters:
7892 * dbname - name of the input file
7893 * isProtein - true, if file with protein seqs
7894 *
7895 * Returns pointer to allocated FormatDB structure (FormatDBPtr)
7896 *
7897 ******************************************************************************/
7898
FDBOptionsNew(CharPtr input,Boolean is_prot,CharPtr title,Boolean is_asn,Boolean is_asn_bin,Boolean is_seqentry,Boolean sparse_idx,Boolean test_non_unique,Boolean parse_deflines,CharPtr basename,CharPtr alias_file_name,Int8 bases_per_volume,Int4 seqs_per_volume,Int4 version,Boolean dump_info,EFDBCleanOpt clean_opt)7899 FDB_optionsPtr FDBOptionsNew(CharPtr input, Boolean is_prot, CharPtr title,
7900 Boolean is_asn, Boolean is_asn_bin, Boolean is_seqentry, Boolean
7901 sparse_idx, Boolean test_non_unique, Boolean parse_deflines,
7902 CharPtr basename, CharPtr alias_file_name, Int8 bases_per_volume,
7903 Int4 seqs_per_volume, Int4 version, Boolean dump_info, EFDBCleanOpt
7904 clean_opt)
7905 {
7906 FDB_optionsPtr options = NULL;
7907
7908 if ((input == NULL || input[0] == '\0') && alias_file_name != NULL) {
7909 ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: input file needed");
7910 return NULL;
7911 }
7912
7913 if (!SeqEntryLoad()) {
7914 ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: SeqEntryLoad failed");
7915 return NULL;
7916 }
7917 if (!fdlobjAsnLoad()) {
7918 ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: fdlobjAsnLoad failed");
7919 return NULL;
7920 }
7921 UseLocalAsnloadDataAndErrMsg();
7922
7923 if ((options = (FDB_optionsPtr)MemNew(sizeof(FDB_options))) == NULL) {
7924 ErrPostEx(SEV_ERROR, 0, 0, "FDBOptionsNew: Out of memory");
7925 return NULL;
7926 }
7927
7928 options->db_file = StringSave(input);
7929 if (!title)
7930 options->db_title = basename ?
7931 StringSave(basename) : StringSave(options->db_file);
7932 else
7933 options->db_title = StringSave(title);
7934 options->is_protein = is_prot;
7935 options->parse_mode = parse_deflines;
7936 if (!basename)
7937 options->base_name = StringSave(options->db_file);
7938 else
7939 options->base_name = StringSave(basename);
7940
7941 if (!alias_file_name)
7942 options->alias_file_name = StringSave(options->base_name);
7943 else
7944 options->alias_file_name = StringSave(alias_file_name);
7945
7946 /*
7947 * If specified, set bases_per_volume. If not, set it to
7948 * SEQFILE_SIZE_DFL if nucleotide, or SEQFILE_SIZE_DFL/4 if protein.
7949 */
7950
7951 options->bases_in_volume = (bases_per_volume <= 0) ? SEQFILE_SIZE_DFL : bases_per_volume;
7952
7953 if (is_prot)
7954 options->bases_in_volume /= 4;
7955
7956 options->sequences_in_volume = (seqs_per_volume < 0) ? 0 : seqs_per_volume;
7957
7958 if ((options->version = version) == 0)
7959 options->version = FORMATDB_VER; /* default version */
7960
7961 options->isASN = is_asn;
7962 options->asnbin = is_asn_bin;
7963 options->is_seqentry = is_seqentry;
7964 options->sparse_idx = sparse_idx;
7965 options->test_non_unique = test_non_unique;
7966 options->total_num_of_seqs = 0;
7967 if (clean_opt >= 0 && clean_opt < eCleanOptMax)
7968 options->clean_opt = clean_opt;
7969 else
7970 options->clean_opt = (EFDBCleanOpt) 0;
7971
7972 /* The following options are for NCBI use only */
7973 options->dump_info = dump_info;
7974 options->linkbit_listp = NULL;
7975 options->memb_tblp = NULL;
7976 options->memb_argp = NULL;
7977 options->tax_lookup = NULL;
7978
7979 return options;
7980 }
7981
7982 /* Recursively remove a blast database specified by base_name. dbtype must be
7983 * either 'p' or 'n' (lowercase) to denote a protein or nucleotide database
7984 * respectively. */
FDBCleanUpRecursively(CharPtr base_name,Char dbtype)7985 Boolean FDBCleanUpRecursively(CharPtr base_name, Char dbtype)
7986 {
7987 Char filenamebuf[FILENAME_MAX];
7988 ReadDBAliasPtr rdbap = NULL;
7989 Boolean done = FALSE;
7990 CharPtr p = NULL;
7991
7992 /* Handle alias files */
7993 sprintf(filenamebuf, "%s.%cal", base_name, dbtype);
7994 rdbap = readdb_read_alias_file(filenamebuf);
7995
7996 if (rdbap && (CheckForRecursion(filenamebuf, rdbap->dblist) == FALSE)) {
7997
7998 p = rdbap->dblist;
7999 while (!done) {
8000 done = readdb_parse_db_names(&p, filenamebuf);
8001 if (*filenamebuf == NULLB)
8002 break;
8003 FDBCleanUpRecursively(filenamebuf, dbtype);
8004 }
8005 sprintf(filenamebuf, "%s.%cal", base_name, dbtype);
8006 FileRemove(filenamebuf); /* alias file */
8007 rdbap = ReadDBAliasFree(rdbap);
8008 ErrLogPrintf("Removed %s\n",filenamebuf);
8009
8010 } else { /* Single-volume blast database */
8011
8012 sprintf(filenamebuf, "%s.%cin", base_name, dbtype);
8013 FileRemove(filenamebuf); /* index file */
8014 sprintf(filenamebuf, "%s.%chr", base_name, dbtype);
8015 FileRemove(filenamebuf); /* header file */
8016 sprintf(filenamebuf, "%s.%csq", base_name, dbtype);
8017 FileRemove(filenamebuf); /* sequence file */
8018 sprintf(filenamebuf, "%s.%csi", base_name, dbtype);
8019 FileRemove(filenamebuf); /* string isam index file */
8020 sprintf(filenamebuf, "%s.%csd", base_name, dbtype);
8021 FileRemove(filenamebuf); /* string isam data file */
8022 sprintf(filenamebuf, "%s.%cni", base_name, dbtype);
8023 FileRemove(filenamebuf); /* numeric isam index file */
8024 sprintf(filenamebuf, "%s.%cnd", base_name, dbtype);
8025 FileRemove(filenamebuf); /* numeric isam data file */
8026 if (dbtype == 'p') {
8027 sprintf(filenamebuf, "%s.ppi", base_name);
8028 FileRemove(filenamebuf); /* PIG isam index file */
8029 sprintf(filenamebuf, "%s.ppd", base_name);
8030 FileRemove(filenamebuf); /* PIG isam data file */
8031 }
8032 sprintf(filenamebuf, "%s.%cti", base_name, dbtype);
8033 FileRemove(filenamebuf); /* deprecated taxonomy index file */
8034 sprintf(filenamebuf, "%s.%ctd", base_name, dbtype);
8035 FileRemove(filenamebuf); /* deprecated taxonomy data file */
8036 sprintf(filenamebuf, "%s.%ctm", base_name, dbtype);
8037 FileRemove(filenamebuf); /* formatdb temporary file */
8038 sprintf(filenamebuf, "%s.%cdi", base_name, dbtype);
8039 FileRemove(filenamebuf); /* formatdb dump info file (NCBI only) */
8040 ErrLogPrintf("Removed single-volume database %s\n",base_name);
8041
8042 }
8043 return TRUE;
8044 }
8045
8046 /* Before creating any files, check if there are any blast database files
8047 * that might collide with the one about to be created.
8048 * Returns FALSE only if user does not want to proceed. */
FDBCleanUp(FDB_optionsPtr options)8049 Boolean FDBCleanUp(FDB_optionsPtr options)
8050 {
8051 Boolean alias_file_exists = FALSE, index_file_exists = FALSE;
8052 Char filenamebuf[FILENAME_MAX] = { NULLB };
8053 MsgAnswer ans;
8054 Char dbtype;
8055
8056 if (!options || options->clean_opt == eCleanNever)
8057 return TRUE;
8058
8059 dbtype = options->is_protein ? 'p' : 'n';
8060
8061 /* First look for an alias file */
8062 sprintf(filenamebuf, "%s.%cal", options->base_name, dbtype);
8063 if (FileLengthEx(filenamebuf) != -1)
8064 alias_file_exists = TRUE;
8065
8066 /* Now try an index file */
8067 memset((void*) &filenamebuf, 0, sizeof(filenamebuf));
8068 sprintf(filenamebuf, "%s.%cin", options->base_name, dbtype);
8069 if (FileLength(filenamebuf) != -1)
8070 index_file_exists = TRUE;
8071
8072 /* nothing to remove ? */
8073 if (!alias_file_exists && !index_file_exists)
8074 return TRUE;
8075
8076 switch (options->clean_opt) {
8077 case eCleanPrompt:
8078 #ifdef OS_UNIX
8079 if (!StringCmp(options->db_file, "stdin")) {
8080 ErrPostEx(SEV_ERROR, 0, 0, "Cannot prompt for answer if "
8081 "input to format is stdin");
8082 return FALSE;
8083 }
8084 #endif
8085 ans = Message(KEY_YNC, "Would you like to clean up %s.* files?",
8086 options->base_name);
8087 if (ans == ANS_NO || ans == ANS_CANCEL) {
8088 ErrLogPrintf("User cancelled formatting.\n");
8089 return FALSE;
8090 } /* else fall through and clean up ! */
8091 case eCleanAlways:
8092 default:
8093 FDBCleanUpRecursively(options->base_name, dbtype);
8094 break;
8095 }
8096
8097 return TRUE;
8098 }
8099
8100 /* Deletes all volumes of a BLAST databases which is "in progress" (i.e.: being
8101 * built). This is necessary for proper clean up in case of errors, specially
8102 * if the maximum number of volumes is reached. */
FDBCleanUpInProgress(const FDB_options * options)8103 void FDBCleanUpInProgress(const FDB_options* options)
8104 {
8105 int volume = 0;
8106 char base_name[FILENAME_MAX] = { '\0' };
8107
8108 ASSERT(options);
8109 ASSERT(options->volume > 1);
8110
8111 StringNCpy(base_name, options->base_name, StrLen(options->base_name) - 3);
8112
8113 for (volume = 0; volume < options->volume; volume++) {
8114 FDB_options opts_tmp;
8115 memcpy((void*)&opts_tmp, (void*)options, sizeof(*options));
8116 opts_tmp.base_name = (char*)MemNew(FILENAME_MAX);
8117 sprintf(opts_tmp.base_name, "%s.%02d", base_name, volume);
8118 opts_tmp.clean_opt = eCleanAlways;
8119 FDBCleanUp(&opts_tmp);
8120 free(opts_tmp.base_name);
8121 }
8122 }
8123
8124 /* Initialize the formatdb structure.
8125 * Taxonomy databases, link and membership tables should be initialized in the
8126 * options structure, by separate functions */
FormatDBInit(FDB_optionsPtr options)8127 FormatDBPtr FormatDBInit(FDB_optionsPtr options)
8128 {
8129
8130 FormatDBPtr fdbp;
8131 Char filenamebuf[FILENAME_MAX];
8132 Uint4 i = 0;
8133
8134 if(options == NULL)
8135 return NULL;
8136
8137 if(options->db_file == NULL)
8138 {
8139 ErrPostEx(SEV_ERROR, 0, 0, "No database name was specified");
8140 return NULL;
8141 }
8142
8143 fdbp = (FormatDBPtr) MemNew (sizeof(*fdbp));
8144
8145 fdbp->num_of_seqs = 0;
8146 fdbp->TotalLen=0, fdbp->MaxSeqLen=0;
8147
8148 fdbp->options = options;
8149
8150 /* The next 2 fields are set in FDBOptionsNew, but kept for older apps
8151 * that don't use that function */
8152 if (options->version == 0)
8153 fdbp->options->version = FORMATDB_VER;
8154
8155 /* If basename is NULL, use dbname. */
8156 if (options->base_name == NULL)
8157 options->base_name = StringSave(options->db_file);
8158
8159 fdbp->fd = NULL;
8160 fdbp->aip = NULL;
8161
8162 /* Clean up if necessary */
8163 if (!FDBCleanUp(options))
8164 return NULL;
8165
8166 /* open output BLAST files */
8167
8168 /* Defline file */
8169
8170 sprintf(filenamebuf, "%s.%chr",
8171 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8172
8173 if (options->version > FORMATDB_VER_TEXT) {
8174 fdbp->aip_def = AsnIoOpen(filenamebuf, "wb");
8175 } else {
8176 fdbp->fd_def = FileOpen(filenamebuf, "wb");
8177 }
8178
8179 /* Sequence file */
8180
8181 sprintf(filenamebuf, "%s.%csq",
8182 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8183 fdbp->fd_seq = FileOpen(filenamebuf, "wb");
8184
8185 if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1) /* Sequence file started from NULLB */
8186 return NULL;
8187
8188 /* Index file */
8189
8190 sprintf(filenamebuf, "%s.%cin",
8191 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8192 fdbp->fd_ind = FileOpen(filenamebuf, "wb");
8193
8194 /* Misc. info dump file */
8195
8196 if(options->dump_info) {
8197 sprintf(filenamebuf, "%s.%cdi",
8198 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8199 fdbp->fd_sdi = FileOpen(filenamebuf, "wb");
8200 }
8201
8202 /* String (accession) index temporary file */
8203
8204 fdbp->fd_stmp = NULL;
8205
8206 if(options->parse_mode) {
8207 sprintf(filenamebuf, "%s.%ctm",
8208 options->base_name, fdbp->options->is_protein ? 'p' : 'n');
8209 fdbp->fd_stmp = FileOpen(filenamebuf, "wb");
8210 }
8211 ErrLogPrintf("Version %s [%s]\n", BlastGetVersionNumber(), BlastGetReleaseDate());
8212 ErrLogPrintf("Started database file \"%s\"\n", options->db_file);
8213 /* Allocating space for offset tables */
8214 fdbp->OffsetAllocated = INDEX_INIT_SIZE; /* initial value */
8215 fdbp->DefOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8216 fdbp->SeqOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8217
8218 if (!fdbp->DefOffsetTable || !fdbp->SeqOffsetTable) {
8219 ErrLogPrintf("Not enough memory to initialize main formatdb structure. Formatting failed.\n");
8220 return NULL;
8221 }
8222
8223 if(!options->is_protein) {
8224 fdbp->AmbOffsetTable = (Int4Ptr)MemNew(fdbp->OffsetAllocated*sizeof(Uint4));
8225 if (!fdbp->AmbOffsetTable) {
8226 ErrLogPrintf("Not enough memory to initialize main formatdb structure. Formatting failed.\n");
8227 return NULL;
8228 }
8229 } else {
8230 fdbp->AmbOffsetTable = NULL;
8231 }
8232
8233
8234 /* Allocating space for lookup table */
8235
8236 if((fdbp->lookup = FASTALookupNew()) == NULL) {
8237 ErrLogPrintf("Error initializing Lookup structure. Formatting failed.\n");
8238 return NULL;
8239 }
8240
8241 /* Allocate the PIG table structure */
8242 if ( !(fdbp->ptable = FDBPigTableNew())) {
8243 ErrLogPrintf("Not enough memory to allocate PIG table structure.\n");
8244 return NULL;
8245 }
8246
8247 return fdbp;
8248 }
8249
FDBLoadMembershipsTable(void)8250 ValNodePtr FDBLoadMembershipsTable(void)
8251 {
8252 ValNodePtr retval = NULL;
8253 MembInfoPtr mip = NULL;
8254 Int2 nbits, bit;
8255 Char buffer[256], numstr[256];
8256
8257 /* Get the number of bits used according to the config file */
8258 nbits = GetAppParamInt2("formatdb","MembershipBitNumbers","TotalNum",0);
8259 if (nbits <= 0) {
8260 return NULL;
8261 }
8262
8263 /* For each bit, load the appropriate criteria function */
8264 for (bit = 1; bit <= nbits; bit++) {
8265 const char* fn_name = NULL;
8266
8267 memset((void*) &buffer, 0, sizeof(buffer));
8268 memset((void*) &numstr, 0, sizeof(numstr));
8269 Int8ToString((Int8)bit,numstr,sizeof(numstr));
8270 GetAppParam("formatdb","MembershipBitNumbers",numstr,"",buffer,
8271 sizeof(buffer)-1);
8272 if (!mip) {
8273 mip = (MembInfoPtr) MemNew(sizeof(MembInfo));
8274 mip->criteria = NULL;
8275 }
8276 mip->bit_number = bit;
8277
8278 if (!StringICmp("swissprot",buffer)) {
8279 mip->criteria = is_SWISSPROT;
8280 fn_name = "is_SWISSPROT";
8281 } else if (!StringICmp("pdb",buffer)) {
8282 mip->criteria = is_PDB;
8283 fn_name = "is_PDB";
8284 } else if (!StringICmp("refseq_genomic",buffer)) {
8285 mip->criteria = is_REFSEQ_GENOMIC;
8286 fn_name = "is_REFSEQ_GENOMIC";
8287 } else if (!StringICmp("refseq_rna",buffer)) {
8288 mip->criteria = is_REFSEQ_RNA;
8289 fn_name = "is_REFSEQ_RNA";
8290 } else if (!StringICmp("refseq_protein",buffer) ||
8291 !StringICmp("refseq_chromosome", buffer)) {
8292 /* refseq_chromosome added per BD-308 */
8293 mip->criteria = is_REFSEQ;
8294 fn_name = "is_REFSEQ";
8295 }
8296
8297 /* Add to the return value only if the criteria is set */
8298 if (mip->criteria != NULL) {
8299 ValNodeAddPointer(&retval,0,mip);
8300 mip = NULL;
8301 /*
8302 ErrLogPrintf("Membership bit %d: criteria for '%s' determined "
8303 "by function '%s'\n", bit, buffer, fn_name);
8304 */
8305 }
8306 }
8307 if (mip && mip->criteria == NULL)
8308 MemFree(mip);
8309
8310 return retval;
8311 }
8312
FDBLoadLinksTable(void)8313 ValNodePtr FDBLoadLinksTable(void)
8314 {
8315 ValNodePtr retval = NULL;
8316 Int4ListPtr gis = NULL;
8317 LinkInfoPtr lk_info = NULL;
8318 Int2 nbits, bit, nlists = 0;
8319 Char buffer[256], numstr[256], filename[FILENAME_MAX];
8320
8321 /* Get the number of bits used according to the config file */
8322 nbits = GetAppParamInt2("formatdb","LinkBitNumbers","TotalNum",0);
8323 if (nbits <= 0) {
8324 return NULL;
8325 }
8326
8327 /* For each bit and database, open the appropriate files and create the
8328 * gi lists */
8329 for (bit = 1; bit <= nbits; bit++) {
8330 memset((void*) numstr, 0, sizeof(numstr));
8331 memset((void*) buffer, 0, sizeof(buffer));
8332 memset((void*) filename, 0, sizeof(filename));
8333
8334 Int8ToString((Int8)bit,numstr,sizeof(numstr));
8335 GetAppParam("formatdb", "LinkBitNumbers", numstr, "", buffer,
8336 sizeof(buffer)-1);
8337 GetAppParam("formatdb", "LinkFiles", buffer, "", filename,
8338 sizeof(filename)-1);
8339 if (StrLen(filename) == 0 || FileLength(filename) == 0) {
8340 ErrPostEx(SEV_WARNING,0,0,"Ignoring '%s' listing because it is "
8341 "empty", buffer);
8342 continue;
8343 }
8344 if ((gis = Int4ListReadFromFile(filename)) == NULL) {
8345 ErrPostEx(SEV_ERROR,0,0,"Could not read %s", filename);
8346 continue;
8347 }
8348 HeapSort(gis->i, gis->count, sizeof(Int4), ID_Compare);
8349 lk_info = (LinkInfoPtr) MemNew(sizeof(LinkInfo));
8350 lk_info->bit_number = bit;
8351 lk_info->gi_list = gis;
8352 ValNodeAddPointer(&retval,0,lk_info);
8353 nlists++;
8354 ErrLogPrintf("Link bit %d: %ld gis from %s\n", bit, gis->count,
8355 filename);
8356 }
8357
8358 return retval;
8359 }
8360
8361 /* This function will build (or create if needed) a chain of ValNode's
8362 * containing integers, which act as a large bit array, and set the
8363 * indicated bit. */
FDBBlastDefLineSetBit(Int2 bit_no,ValNodePtr PNTR retval)8364 void FDBBlastDefLineSetBit(Int2 bit_no, ValNodePtr PNTR retval)
8365 {
8366 Int4 bit_offset = 0, bit_mask = 0, i;
8367 Int4 currValNode = 0;
8368 ValNodePtr tmp = NULL;
8369
8370 if (bit_no <= 0 || retval == NULL)
8371 return;
8372
8373 bit_offset = (bit_no-1) % MASK_WORD_SIZE;
8374 currValNode = (Int4) ((bit_no-1)/MASK_WORD_SIZE);
8375
8376 /* Allocate nodes if necessary */
8377 while (ValNodeLen(*retval) <= currValNode) {
8378 if (*retval == NULL) {
8379 (*retval) = ValNodeAddInt(NULL,0,0);
8380 } else {
8381 ValNodeAddInt(retval,0,0);
8382 }
8383 }
8384
8385 /* Traverse the linked list of ValNodePtrs and use the bit_mask
8386 * in the appropriate node */
8387 bit_mask = 0x1 << bit_offset;
8388
8389 tmp = *retval;
8390 for (i = 0; i < currValNode; i++)
8391 tmp = tmp->next;
8392
8393 tmp->data.intvalue |= bit_mask;
8394 }
8395
8396 static void
8397 s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,
8398 const FDBTaxidDeflineTablePtr taxid_tbl);
8399
FDBGetDefAsnFromBioseq(BioseqPtr bsp,const FDBTaxidDeflineTablePtr taxid_tbl)8400 BlastDefLinePtr FDBGetDefAsnFromBioseq(BioseqPtr bsp,
8401 const FDBTaxidDeflineTablePtr taxid_tbl)
8402 {
8403 BlastDefLinePtr bdp = NULL, bdp_last, bdp_head;
8404 CharPtr title, chptr, orig_title;
8405
8406 if(bsp == NULL)
8407 return NULL;
8408
8409 bdp = BlastDefLineNew();
8410 bdp_head = bdp;
8411
8412 bdp->seqid = SeqIdSetDup(bsp->id);
8413 title = BioseqGetTitle(bsp);
8414
8415 orig_title = title = StringSave(title);
8416
8417 chptr = NULL;
8418 if((chptr = StringChr(title, '\1')) != NULL) {
8419 *chptr = NULLB;
8420 chptr++;
8421 }
8422 bdp->title = StringSave(title);
8423 bdp_last = bdp;
8424
8425
8426 while(chptr != NULL) {
8427
8428 bdp = BlastDefLineNew();
8429
8430 title = chptr;
8431
8432 if((chptr = StringChr(title, ' ')) != NULL) {
8433 *chptr = NULLB;
8434 chptr++;
8435 }
8436 bdp->seqid = SeqIdParse(title);
8437 title = chptr;
8438
8439 if((chptr = StringChr(title, '\1')) != NULL) {
8440 *chptr = NULLB;
8441 chptr++;
8442 }
8443 if(title != NULL)
8444 bdp->title = StringSave(title);
8445 else
8446 bdp->title = StringSave("No definition found");
8447
8448 bdp_last->next = bdp;
8449 bdp_last = bdp;
8450 }
8451
8452 MemFree(orig_title);
8453 s_FDBUpdateTaxIdInBdpList(bdp_head, taxid_tbl);
8454 return bdp_head;
8455 }
8456
8457 Int4 LIBCALL
Int4ListBSearch(Int4ListPtr lp,Int4 key)8458 Int4ListBSearch PROTO((Int4ListPtr lp, Int4 key))
8459 {
8460 Int4 m, b, e;
8461
8462 if (!lp)
8463 return -1;
8464
8465 b = 0;
8466 e = lp->count-1;
8467
8468 while (b <= e) {
8469 m = (b + e) / 2;
8470 if (lp->i[m] == key)
8471 return m;
8472 else if (lp->i[m] < key)
8473 b = m + 1;
8474 else
8475 e = m - 1;
8476 }
8477 return -1;
8478 }
8479
FDBAddLinksInformation(BlastDefLinePtr bdp,ValNodePtr links_tblp)8480 Boolean FDBAddLinksInformation(BlastDefLinePtr bdp, ValNodePtr links_tblp)
8481 {
8482 ValNodePtr link_vnp = NULL, vnp_list = NULL;
8483 SeqIdPtr sip = NULL;
8484 Int4 gi = 0;
8485
8486 if (bdp == NULL || links_tblp == NULL)
8487 return FALSE;
8488
8489 /* Extract the gi from the bdp */
8490 if ((sip = SeqIdFindBest(bdp->seqid, SEQID_GI)) == NULL)
8491 return FALSE;
8492 gi = sip->data.intvalue;
8493
8494 for (vnp_list = links_tblp; vnp_list; vnp_list = vnp_list->next) {
8495
8496 LinkInfoPtr lk_info = (LinkInfoPtr)vnp_list->data.ptrvalue;
8497 if (Int4ListBSearch(lk_info->gi_list, gi) != -1)
8498 FDBBlastDefLineSetBit(lk_info->bit_number, &link_vnp);
8499 }
8500
8501 if (link_vnp)
8502 bdp->links = link_vnp;
8503
8504 return TRUE;
8505 }
8506
FDBAddMembershipInformation(BlastDefLinePtr bdp,ValNodePtr memb_tblp,VoidPtr criteria_arg)8507 Boolean FDBAddMembershipInformation(BlastDefLinePtr bdp, ValNodePtr memb_tblp,
8508 VoidPtr criteria_arg)
8509 {
8510 ValNodePtr memb_vnp = NULL;
8511 MembInfoPtr mip = NULL;
8512
8513 if (bdp == NULL || memb_tblp == NULL)
8514 return FALSE;
8515
8516 /* Set the appropriate bit if this sequence satisfies the criteria */
8517 while (memb_tblp) {
8518 mip = (MembInfoPtr) memb_tblp->data.ptrvalue;
8519 if (mip->criteria(criteria_arg))
8520 FDBBlastDefLineSetBit(mip->bit_number, &memb_vnp);
8521 memb_tblp = memb_tblp->next;
8522 }
8523
8524 if (memb_vnp)
8525 bdp->memberships = memb_vnp;
8526
8527 return TRUE;
8528 }
8529
/* Release a list of LinkInfo structures: the gi list of every entry is
 * freed with Int4ListFree(), then the list itself is handed to
 * ValNodeFreeData(). Always returns NULL. */
ValNodePtr FDBDestroyLinksTable(ValNodePtr list)
{
    ValNodePtr node = list;

    if (list == NULL)
        return NULL;

    while (node != NULL) {
        LinkInfoPtr info = (LinkInfoPtr) node->data.ptrvalue;

        info->gi_list = Int4ListFree(info->gi_list);
        node = node->next;
    }

    ValNodeFreeData(list);
    return NULL;
}
8546
/* Release a list of MembInfo structures, both the structures and the list
 * nodes that refer to them. Always returns NULL, so the result can be
 * assigned back to the caller's list pointer.
 *
 * ValNodeFreeData() is used for this, as it is in FDBDestroyLinksTable();
 * previously only the MembInfo structures were freed and the nodes
 * themselves were leaked.
 * NOTE(review): confirm that no caller releases the nodes separately after
 * calling this function. */
ValNodePtr FDBDestroyMembershipsTable(ValNodePtr tbl)
{
    if (tbl == NULL)
        return NULL;

    ValNodeFreeData(tbl);
    return NULL;
}
8558
8559
8560 #define REDUCED_E2INDEX_SET 1
8561 #ifdef REDUCED_E2INDEX_SET
8562 /*****************************************************************************
8563 *
8564 * SeqIdE2Index(anp)
8565 * atp is the current type (if identifier of a parent struct)
8566 * if atp == NULL, then assumes it stands alone (SeqId ::=)
8567 *
8568 *****************************************************************************/
SeqIdE2Index(SeqIdPtr anp,FILE * fd,Int4 seq_num,Boolean sparse)8569 static Boolean SeqIdE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
8570 Boolean sparse)
8571 {
8572 Boolean retval = FALSE;
8573 TextSeqIdPtr tsip = NULL;
8574 ObjectIdPtr oid;
8575 PDBSeqIdPtr psip;
8576 Uint1 tmptype;
8577 CharPtr tmp, ptr=NULL;
8578 Char buf[81];
8579 Int4 length, i;
8580 DbtagPtr dbt;
8581 Uint1 chain = 0;
8582 Int2 version = 0;
8583
8584 if (anp == NULL)
8585 return FALSE;
8586
8587 if (anp->choice == SEQID_GI)
8588 return TRUE; /* Do not index GI as string. */
8589
8590 switch (anp->choice) {
8591
8592 case SEQID_LOCAL: /* local */
8593 oid = (ObjectIdPtr)(anp->data.ptrvalue);
8594 ptr = oid->str;
8595 break;
8596 case SEQID_GIBBSQ: /* gibbseq */
8597 sprintf(buf, "%ld", (long)(anp->data.intvalue));
8598 ptr = buf;
8599 break;
8600 case SEQID_GIBBMT: /* gibbmt */
8601 break;
8602 case SEQID_GIIM: /* giimid */
8603 return TRUE; /* not indexed */
8604 case SEQID_EMBL: /* embl */
8605 case SEQID_DDBJ: /* ddbj */
8606 case SEQID_GENBANK: /* genbank */
8607 case SEQID_TPG: /* Third Party Annot/Seq Genbank */
8608 case SEQID_TPE: /* Third Party Annot/Seq EMBL */
8609 case SEQID_TPD: /* Third Party Annot/Seq DDBJ */
8610 case SEQID_OTHER: /* other */
8611 case SEQID_GPIPE: /* genome pipeline */
8612 tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8613 if ((tsip->version > 0) && (tsip->release == NULL))
8614 version = tsip->version;
8615 break;
8616 case SEQID_SWISSPROT: /* swissprot */
8617 tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8618 if (tsip->version > 0)
8619 version = tsip->version;
8620 break;
8621 case SEQID_PIR: /* pir */
8622 case SEQID_PRF: /* prf */
8623 tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
8624 break;
8625 case SEQID_PATENT: /* patent seq id */
8626 break;
8627 case SEQID_GENERAL: /* general */
8628 dbt = (DbtagPtr)(anp->data.ptrvalue);
8629 ptr = dbt->tag->str;
8630 break;
8631 case SEQID_GI: /* gi */
8632 break;
8633 case SEQID_PDB: /* pdb */
8634 psip = (PDBSeqIdPtr)(anp->data.ptrvalue);
8635 ptr = psip->mol;
8636 chain = psip->chain;
8637 break;
8638 }
8639
8640 if(tsip == NULL) {
8641 SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
8642
8643 length = StringLen(buf);
8644 for(i = 0; i < length; i++)
8645 buf[i] = TO_LOWER(buf[i]);
8646
8647 fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8648
8649 }
8650
8651 if (tsip == NULL && ptr != NULL) { /* write a single string for non TextSeqIDPtr cases. */
8652 StringMove(buf, ptr);
8653 length = StringLen(buf);
8654 for(i = 0; i < length; i++)
8655 buf[i] = TO_LOWER(buf[i]);
8656 fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8657
8658 chain = TO_LOWER(chain);
8659
8660 if (chain != 0) { /* PDB only. */
8661 fprintf(fd, "%s|%c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8662 (long) seq_num);
8663 fprintf(fd, "%s %c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
8664 (long) seq_num);
8665 }
8666 }
8667
8668 if (tsip != NULL) { /* separately index accession and locus */
8669 /* now index as separate strings */
8670 if (tsip->name != NULL) {
8671 StringMove(buf, tsip->name);
8672 length = StringLen(buf);
8673 for(i = 0; i < length; i++)
8674 buf[i] = TO_LOWER(buf[i]);
8675 fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8676 }
8677 if (tsip->accession != NULL) {
8678 StringMove(buf, tsip->accession);
8679 length = StringLen(buf);
8680 for(i = 0; i < length; i++)
8681 buf[i] = TO_LOWER(buf[i]);
8682 fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
8683 if (version)
8684 fprintf(fd, "%s.%d%c%ld\n", buf, version, ISAM_DATA_CHAR, (long) seq_num);
8685 }
8686 }
8687
8688 retval = TRUE;
8689 return retval;
8690 }
8691 #else
/*****************************************************************************
*
*   SeqIdE2Index(anp)
*       Alternative, fuller version of the string indexer. It is only
*       compiled when REDUCED_E2INDEX_SET is not defined.
*
*       Writes index lines for one SeqId to fd; each line is a lower-cased
*       key, ISAM_DATA_CHAR and seq_num. When sparse is set only the bare
*       string (local/gibbsq/general/pdb) and the bare accession are
*       written. For embl and ddbj ids the id is additionally indexed as if
*       it were a genbank id (non-sparse only).
*
*       The SeqId is modified temporarily (version, name, accession and
*       choice are cleared or replaced and then restored) so that
*       SeqIdWrite produces the different key forms.
*
*       Returns FALSE only if anp is NULL.
*
*****************************************************************************/
static Boolean SeqIdE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
                             Boolean sparse)
{
    Boolean retval = FALSE;
    TextSeqIdPtr tsip = NULL;
    ObjectIdPtr oid;
    PDBSeqIdPtr psip;
    Boolean do_gb = FALSE;
    Uint1 tmptype;
    CharPtr tmp, ptr=NULL;
    Char buf[81];
    Int4 length, i;
    DbtagPtr dbt;
    Uint1 chain = 0;
    Int2 version = 0;

    if (anp == NULL)
        return FALSE;

    switch (anp->choice) {

    case SEQID_LOCAL:      /* local */
        oid = (ObjectIdPtr)(anp->data.ptrvalue);
        ptr = oid->str;
        break;
    case SEQID_GIBBSQ:         /* gibbseq */
        sprintf(buf, "%ld", (long)(anp->data.intvalue));
        ptr = buf;
        break;
    case SEQID_GIBBMT:         /* gibbmt */
        break;
    case SEQID_GIIM:       /* giimid */
        return TRUE;       /* not indexed */
    case SEQID_EMBL:      /* embl */
    case SEQID_DDBJ:      /* ddbj */
        do_gb = TRUE;     /* also index embl, ddbj as genbank */
        /* falls through to the common text-id handling below */
    case SEQID_GENBANK:   /* genbank */
    case SEQID_TPG:       /* Third Party Annot/Seq Genbank */
    case SEQID_TPE:       /* Third Party Annot/Seq EMBL */
    case SEQID_TPD:       /* Third Party Annot/Seq DDBJ */
    case SEQID_OTHER:     /* other */
    case SEQID_GPIPE:     /* genome pipeline */
        tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
        if ((tsip->version > 0) && (tsip->release == NULL))
            version = tsip->version;
        break;
    case SEQID_SWISSPROT: /* swissprot */
        tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
        if (tsip->version > 0)
            version = tsip->version;
        break;
    case SEQID_PIR:       /* pir   */
    case SEQID_PRF:       /* prf   */
        tsip = (TextSeqIdPtr)(anp->data.ptrvalue);
        break;
    case SEQID_PATENT:    /* patent seq id */
        break;
    case SEQID_GENERAL:   /* general */
        dbt = (DbtagPtr)(anp->data.ptrvalue);
        ptr = dbt->tag->str;
        break;
    case SEQID_GI:        /* gi */
        break;
    case SEQID_PDB:       /* pdb   */
        psip = (PDBSeqIdPtr)(anp->data.ptrvalue);
        ptr = psip->mol;
        chain = psip->chain;
        break;
    }

    if(!sparse) {
        /* whole id in short FASTA form.
         * NOTE(review): for SEQID_GIBBSQ ptr points at buf, which is
         * overwritten here before ptr is used further down. */
        SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);

        length = StringLen(buf);
        for(i = 0; i < length; i++)
            buf[i] = TO_LOWER(buf[i]);

        fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);

        /* Index without version. */
        if (version) {
            /* version is cleared only while the id is printed */
            tsip->version = 0;
            SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);

            length = StringLen(buf);
            for(i = 0; i < length; i++)
                buf[i] = TO_LOWER(buf[i]);
            fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
            tsip->version = version;
        }
    } /* if(!sparse) */

    if (ptr != NULL) {  /* write a single string */
        /* NOTE(review): StringMove is given no length; this relies on ptr
         * fitting into buf - confirm */
        StringMove(buf, ptr);
        length = StringLen(buf);
        for(i = 0; i < length; i++)
            buf[i] = TO_LOWER(buf[i]);
        fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);

        chain = TO_LOWER(chain);

        if (chain != 0) { /* PDB only. */
            fprintf(fd, "%s|%c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
                    (long) seq_num);
            fprintf(fd, "%s %c%c%ld\n", buf, chain, ISAM_DATA_CHAR,
                    (long) seq_num);
        }
    }

    if (tsip != NULL) {   /* separately index accession and locus */
        if ((tsip->accession != NULL) && (tsip->name != NULL) && !sparse) {
            /* print the id with the accession hidden ... */
            tmp = tsip->accession;
            tsip->accession = NULL;
            SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
            length = StringLen(buf);
            for(i = 0; i < length; i++)
                buf[i] = TO_LOWER(buf[i]);
            fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
            tsip->accession = tmp;
            /* ... and then with the name hidden */
            tmp = tsip->name;
            tsip->name = NULL;
            SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
            length = StringLen(buf);
            for(i = 0; i < length; i++)
                buf[i] = TO_LOWER(buf[i]);
            fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
            tsip->name = tmp;
            if (version)
            { /* Index accession without version. */
                tsip->version = 0;
                tmp = tsip->name;
                tsip->name = NULL;
                SeqIdWrite(anp, buf, PRINTID_FASTA_SHORT, 80);
                length = StringLen(buf);
                for(i = 0; i < length; i++)
                    buf[i] = TO_LOWER(buf[i]);
                fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
                tsip->name = tmp;
                tsip->version = version;
            }
        }

        /* now index as separate strings */
        if (tsip->name != NULL && !sparse) {
            StringMove(buf, tsip->name);
            length = StringLen(buf);
            for(i = 0; i < length; i++)
                buf[i] = TO_LOWER(buf[i]);
            fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
        }
        if (tsip->accession != NULL) {
            StringMove(buf, tsip->accession);
            length = StringLen(buf);
            for(i = 0; i < length; i++)
                buf[i] = TO_LOWER(buf[i]);
            fprintf(fd, "%s%c%ld\n", buf, ISAM_DATA_CHAR, (long) seq_num);
            if (version && !sparse)
                fprintf(fd, "%s.%d%c%ld\n", buf, version, ISAM_DATA_CHAR, (long) seq_num);
        }

    }

    if (do_gb && !sparse) {   /* index embl and ddbj as genbank */
        /* The choice is switched to genbank for one recursive call and
         * restored afterwards; do_gb is not set on that pass, so the
         * recursion goes one level deep only. */
        tmptype = anp->choice;
        anp->choice = SEQID_GENBANK;
        SeqIdE2Index(anp, fd, seq_num, sparse);
        anp->choice = tmptype;
    }

    retval = TRUE;
    return retval;
}
8871 #endif
8872
8873 /*****************************************************************************
8874 *
8875 * SeqIdSetE2Index(anp, e2p, settype, elementtype)
8876 *
8877 *****************************************************************************/
SeqIdSetE2Index(SeqIdPtr anp,FILE * fd,Int4 seq_num,Boolean sparse)8878 static Boolean SeqIdSetE2Index (SeqIdPtr anp, FILE *fd, Int4 seq_num,
8879 Boolean sparse)
8880 {
8881 SeqIdPtr oldanp;
8882 Boolean retval = FALSE;
8883
8884 if (anp == NULL)
8885 return FALSE;
8886
8887 oldanp = anp;
8888
8889 while (anp != NULL) {
8890 if (!SeqIdE2Index(anp, fd, seq_num, sparse))
8891 goto erret;
8892 anp = anp->next;
8893 }
8894
8895 retval = TRUE;
8896 erret:
8897 return retval;
8898 }
/* Free a whole chain of SeqIds. For every element the payload is released
 * with the free function that matches its choice (integer-valued choices
 * have no payload to release), then the element itself is released with
 * MemFree(). Always returns NULL.
 * NOTE(review): judging by the name this is meant to avoid going through
 * the object manager - confirm. */
static SeqIdPtr SeqIdSetFree_NO_OBJ_MGR(SeqIdPtr sip)
{
    SeqIdPtr cur;
    SeqIdPtr following;

    for (cur = sip; cur != NULL; cur = following) {
        following = cur->next;  /* fetched before cur is released */

        switch (cur->choice) {
        case SEQID_LOCAL:      /* local */
            ObjectIdFree(cur->data.ptrvalue);
            break;

        case SEQID_GIIM:       /* giimid */
            GiimFree(cur->data.ptrvalue);
            break;

        /* everything that is stored as a TextSeqId */
        case SEQID_GENBANK:    /* genbank */
        case SEQID_EMBL:       /* embl */
        case SEQID_PIR:        /* pir   */
        case SEQID_SWISSPROT:  /* swissprot */
        case SEQID_OTHER:      /* other */
        case SEQID_DDBJ:
        case SEQID_TPG:        /* Third Party Annot/Seq Genbank */
        case SEQID_TPE:        /* Third Party Annot/Seq EMBL */
        case SEQID_TPD:        /* Third Party Annot/Seq DDBJ */
        case SEQID_GPIPE:
        case SEQID_PRF:
            TextSeqIdFree(cur->data.ptrvalue);
            break;

        case SEQID_PATENT:     /* patent seq id */
            PatentSeqIdFree(cur->data.ptrvalue);
            break;

        case SEQID_GENERAL:    /* general */
            DbtagFree(cur->data.ptrvalue);
            break;

        case SEQID_PDB:
            PDBSeqIdFree(cur->data.ptrvalue);
            break;

        /* nothing to release for these, nor for unlisted choices */
        case SEQID_GIBBSQ:     /* gibbseq */
        case SEQID_GIBBMT:     /* gibbmt */
        case SEQID_GI:         /* gi */
        default:
            break;
        }

        MemFree(cur);
    }

    return NULL;
}
8945
/* Record the sequence ids found in a definition line.
 *
 * defline may hold several deflines separated by READDB_DEF_SEPARATOR. For
 * each of them the leading token (up to the first white space, at most
 * ID_MAX_SIZE characters) is parsed with SeqIdParse(). If the parsed ids
 * include a gi, the pair (gi, num_of_seqs) is appended to lookup->table,
 * which grows by LOOKUP_CHUNK entries when needed. All parsed ids are then
 * written to fd_stmp through SeqIdSetE2Index().
 *
 * Nothing is done when defline is NULL or ParseSeqid is FALSE.
 *
 * Returns LOOKUP_NO_ERROR on success and ERR_SEQID_FAILED if a token cannot
 * be parsed, if the lookup table cannot be enlarged, or if indexing fails. */
static Int4 UpdateLookupInfo(CharPtr defline,
                             FASTALookupPtr lookup,
                             Int4 num_of_seqs,
                             FILE *fd_stmp,
                             Boolean ParseSeqid,
                             Boolean sparse
                             )
{
    CharPtr p, d;
    Int4 i, gi;
    Char TextId[ID_MAX_SIZE+1];
    SeqIdPtr sip, sip_tmp;

    if(defline == NULL)
        return LOOKUP_NO_ERROR;

    if(!ParseSeqid)
        return LOOKUP_NO_ERROR;

    p = defline;
    for(;;) {

        /* Remember where this defline starts: the search for the next
         * separator begins here. (It used to begin at the start of the
         * token plus the length of the PREVIOUS token, which could point
         * past the end of the string.) */
        d = p;

        /* isspace needs a value in unsigned char range */
        for(i=0; !isspace((unsigned char)*p) && *p != NULLB && i < ID_MAX_SIZE; p++,i++)
            TextId[i]=*p;

        TextId[i]=0;

        if((sip = SeqIdParse(TextId)) == NULL) {/* Bad SeqId string */
            ErrLogPrintf("Sequence id \"%s\" is not parseable. "
                         "Formating failed at %s\n", TextId, defline);
            return ERR_SEQID_FAILED;
        }

        /* Start from "no gi" for every defline so that a gi found in an
         * earlier defline is not recorded a second time */
        gi = 0;
        for(sip_tmp = sip; sip_tmp != NULL; sip_tmp = sip_tmp->next) {
            if(sip_tmp->choice == SEQID_GI) {
                gi = sip_tmp->data.intvalue;
                break;
            }
        }

        if(gi != 0) { /* GI found */

            if((lookup->used + 2) >= lookup->allocated) {
                /* keep the old table until the new one is known to exist */
                Int4 new_size = lookup->allocated + LOOKUP_CHUNK;
                Int4Ptr new_table = (Int4Ptr)Realloc(lookup->table,
                                                     new_size*(sizeof(Int4)));
                if (new_table == NULL) {
                    ErrLogPrintf("Not enough memory to extend lookup table. "
                                 "Formatting failed.\n");
                    sip = SeqIdSetFree_NO_OBJ_MGR(sip);
                    /* the only failure code used in this function */
                    return ERR_SEQID_FAILED;
                }
                lookup->table = new_table;
                lookup->allocated = new_size;
            }

            lookup->table[lookup->used] = gi;
            lookup->table[lookup->used+1] = num_of_seqs;
            lookup->used += 2;
        }

        if(!SeqIdSetE2Index (sip, fd_stmp, num_of_seqs, sparse)) {
            ErrLogPrintf("SeIdSetE2Index failed. Exiting..\n");
            sip = SeqIdSetFree_NO_OBJ_MGR(sip);
            /* NOTE(review): this used to return FALSE, a Boolean rather
             * than one of the status codes returned everywhere else in
             * this function - confirm callers test for LOOKUP_NO_ERROR. */
            return ERR_SEQID_FAILED;
        }

        sip = SeqIdSetFree_NO_OBJ_MGR(sip);

        /* move on to the defline after the next separator, if any */
        if((p = StringChr(d, READDB_DEF_SEPARATOR)) == NULL)
            break;
        p++;
    }
    return LOOKUP_NO_ERROR;
}
/* Build the string ISAM index for a database.
 *
 * The temporary file <FileName>.<t>tm (t is 'p' for protein, 'n'
 * otherwise) is sorted into <FileName>.<t>sd, the temporary file is
 * removed, and the ISAM index <FileName>.<t>si is created from the sorted
 * data. sparse_idx is passed on to ISAMMakeIndex() and test_non_unique to
 * ISAMSetCheckForNonUnique().
 *
 * Returns TRUE on success, FALSE if the sort object, the output file or
 * the ISAM object cannot be created, or if sorting or indexing fails. The
 * sort object and the output file are released on every path. */
static Boolean FormatdbCreateStringIndex(const CharPtr FileName,
                                         Boolean ProteinType,
                                         Int4 sparse_idx,
                                         Boolean test_non_unique)
{
    SORTObjectPtr sop;
    Char filenamebuf[FILENAME_MAX], DBName[FILENAME_MAX];
    FILE *fd_out;
    CharPtr files;
    ISAMErrorCode error;
    ISAMObjectPtr isamp;
    Int4 line_count = 0;

    /* object for unique sorting */

    if((sop = SORTObjectNew(NULL, '\0', 0,
                            FALSE, TRUE)) == NULL) {
        ErrPostEx(SEV_ERROR, 0, 0, "Failed to create SORT Object");
        return FALSE;
    }

    sprintf(filenamebuf, "%s.%ctm",
            FileName, ProteinType ? 'p' : 'n');

    sprintf(DBName, "%s.%csd",
            FileName, ProteinType ? 'p' : 'n');

    if((fd_out = FileOpen(DBName, "wb")) == NULL)
    {
        SORTObjectFree(sop);    /* do not leak the sort object */
        return FALSE;
    }
    files = filenamebuf;

    if (SORTFiles(&files, 1, fd_out, sop, &line_count) != SORTNoError)
    {
        ErrPostEx(SEV_ERROR, 0, 0, "SORTFiles failed, change TMPDIR to a partition with more free space or use -s option");
        SORTObjectFree(sop);
        FILECLOSE(fd_out);
        return FALSE;
    }
    SORTObjectFree(sop);

    FILECLOSE(fd_out);

    /* the unsorted input is no longer needed; reuse the buffer for the
     * name of the index file */
    FileRemove(filenamebuf);
    sprintf(filenamebuf, "%s.%csi",
            FileName, ProteinType ? 'p' : 'n');

    if((isamp = ISAMObjectNew(ISAMString, DBName, filenamebuf)) == NULL) {
        ErrPostEx(SEV_ERROR, 0, 0, "Creating of ISAM object failed");
        return FALSE;
    }

    ISAMSetDataSorted(isamp, line_count);

    ISAMSetCheckForNonUnique(isamp, test_non_unique);

    if((error = ISAMMakeIndex(isamp, 0, sparse_idx)) != ISAMNoError) {
        ErrPostEx(SEV_ERROR, 0, 0, "Creating of index failed with error code %ld\n", (long) error);
        ISAMObjectFree(isamp);
        return FALSE;
    }

    ISAMObjectFree(isamp);
    return TRUE;
}
9078
9079 /* This function should expect only single defline - multiple deflines
9080 usually have multiple tax_ids etc. If this is not TRUE writting
9081 ASN.1 with '\1' will result in failure. Code below should be removed
9082 when processing of multiple deflines is done */
FDLCreateAsnDF(FormatDBPtr fdbp,CharPtr seq_id,CharPtr title,Int4 taxid)9083 BlastDefLinePtr FDLCreateAsnDF(FormatDBPtr fdbp, CharPtr seq_id,
9084 CharPtr title, Int4 taxid)
9085 {
9086 CharPtr p, d = title, chptr;
9087 Int4 i;
9088 Char TextId[ID_MAX_SIZE+1];
9089 SeqIdPtr sip;
9090 BlastDefLinePtr bdp, bdp_head = NULL, bdp_last;
9091
9092 if(title == NULL && seq_id == NULL) {
9093 ErrPostEx(SEV_ERROR,0,0,"Cannot create a BlastDefLine",
9094 " structure without a seq_id and a title");
9095 return NULL;
9096 }
9097
9098 for(p = d = title; ;d = p) {
9099
9100 MemSet(TextId, 0, sizeof(TextId));
9101 chptr = NULL;
9102
9103 if(fdbp->options->parse_mode == TRUE) {
9104
9105 if(seq_id == NULL) {
9106 for(i=0; !isspace((int)*p) && i < ID_MAX_SIZE; p++,i++)
9107 TextId[i]=*p;
9108
9109 p++; /* Next character after space */
9110
9111 if((sip = SeqIdParse(TextId)) == NULL) {/* Bad SeqId string */
9112 ErrLogPrintf("Sequence id \"%s\" is not parseable. "
9113 "Formating failed at %s\n", TextId, title);
9114 return NULL;
9115 }
9116 } else {
9117 sip = SeqIdParse(seq_id);
9118 seq_id = NULL;
9119 }
9120 } else {
9121
9122 DbtagPtr dbtagptr;
9123
9124 sip = ValNodeNew(NULL);
9125 dbtagptr = DbtagNew();
9126 dbtagptr->tag = ObjectIdNew();
9127
9128 sip->choice = SEQID_GENERAL;
9129 sip->data.ptrvalue = dbtagptr;
9130 dbtagptr->tag->id = fdbp->num_of_seqs;
9131 dbtagptr->db = StringSave("BL_ORD_ID");
9132 }
9133
9134 if((chptr = StringChr(d, READDB_DEF_SEPARATOR)) != NULL)
9135 *chptr = NULLB;
9136
9137 bdp = BlastDefLineNew();
9138 bdp->seqid = SeqIdSetDup(sip);
9139 bdp->title = StringSave(p); /* Remaining line chunk */
9140 bdp->taxid = taxid;
9141
9142 if(bdp_head == NULL) {
9143 bdp_head = bdp;
9144 bdp_last = bdp;
9145 } else {
9146 bdp_last->next = bdp;
9147 bdp_last = bdp;
9148 }
9149
9150 sip = SeqIdSetFree_NO_OBJ_MGR(sip);
9151
9152 /* Looking for the next defline in the set */
9153
9154 if(chptr != NULL) {
9155 *chptr = READDB_DEF_SEPARATOR;
9156 p = chptr+1; /* Next after '\1' */
9157 } else {
9158 break;
9159 }
9160 }
9161
9162 return bdp_head;
9163 }
9164
FDBDumpDeflineAsn(FormatDBPtr fdbp,BlastDefLinePtr bdp_in)9165 Boolean FDBDumpDeflineAsn(FormatDBPtr fdbp, BlastDefLinePtr bdp_in)
9166 {
9167 Char buffer[128];
9168 BlastDefLinePtr bdp;
9169 #ifdef FDB_TAXONOMYDB
9170 SeqIdPtr sip;
9171 #endif
9172
9173 BlastDefLineSetAsnWrite(bdp_in, fdbp->aip_def, NULL);
9174 AsnIoFlush(fdbp->aip_def);
9175
9176 MemSet(buffer, NULLB, sizeof(buffer));
9177 for(bdp = bdp_in; bdp != NULL; bdp = bdp->next) {
9178
9179 /* ------------ Updating taxonomy information -------------- */
9180
9181 if(fdbp->options->tax_callback != NULL) {
9182
9183 #ifdef FDB_TAXONOMYDB
9184 if (bdp->taxid == 0) {
9185 if ((sip = SeqIdFindBest(bdp->seqid, SEQID_GI)))
9186 bdp->taxid = tax1_getTaxId4GI(sip->data.intvalue);
9187 }
9188 #endif
9189
9190 if(!fdbp->options->tax_callback(fdbp->options->tax_lookup,
9191 bdp->taxid)) {
9192 ErrPostEx(SEV_ERROR, 0,0,
9193 "tax_callback() failed for taxid %ld. "
9194 "Formating terminated abnormaly", bdp->taxid);
9195 return 1;
9196 }
9197 }
9198
9199 /* ------ Now adding new entried into lookup hash table ----- */
9200
9201 if(fdbp->options->parse_mode == TRUE) {
9202
9203 SeqIdWrite(bdp->seqid, buffer, PRINTID_FASTA_LONG, 128);
9204
9205 if((UpdateLookupInfo(buffer, fdbp->lookup, fdbp->num_of_seqs, fdbp->fd_stmp, fdbp->options->parse_mode, fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
9206 return FALSE;
9207 }
9208 }
9209 }
9210
9211 return TRUE;
9212 }
9213
FDBDumpDefline(FormatDBPtr fdbp,CharPtr title,CharPtr seq_id)9214 static Boolean FDBDumpDefline(FormatDBPtr fdbp, CharPtr title, CharPtr seq_id)
9215 {
9216 Char tmpbuff[1024];
9217 CharPtr defline;
9218 Int4 defline_len, id_length;
9219
9220 if(fdbp->options->parse_mode == FALSE) {
9221 sprintf(tmpbuff, "%s%ld ", NON_SEQID_PREFIX, (long) fdbp->num_of_seqs);
9222
9223 if (FileWrite(tmpbuff, StringLen(tmpbuff), 1, fdbp->fd_def) != (Uint4) 1)
9224 return 1;
9225 defline = title;
9226 } else {
9227 if (title != NULL)
9228 defline_len = StringLen(title);
9229 else
9230 defline_len = 0;
9231
9232 defline_len += 255; /* Sufficient for an ID. */
9233
9234 if ( sizeof(tmpbuff) > defline_len)
9235 defline = tmpbuff;
9236 else
9237 defline = MemNew((defline_len+1)*sizeof(Char));
9238
9239 /* IF the gi is zero and there is another ID, then do not print it. */
9240 if (StringNCmp(seq_id, "gi|0|", 5) == 0) {
9241 StringCpy(defline, seq_id+5);
9242 ErrPostEx(SEV_WARNING, 0, 0, "%s: zero gi stripped", seq_id);
9243 } else {
9244 StringCpy(defline, seq_id);
9245 }
9246
9247 id_length = StringLen(defline);
9248 StrCat(defline+id_length++," ");
9249 if(title) StringCat(defline+id_length, title);
9250 }
9251 ASSERT(StringLen(defline) < 0x7fffffffUL - 2000000000UL -20 /* for lcl|dddd... */ );
9252
9253 if (FileWrite(defline, StringLen(defline), 1, fdbp->fd_def) != (Uint4) 1) {
9254
9255 if (defline != title && defline != tmpbuff)
9256 MemFree(defline);
9257
9258 return 1;
9259 }
9260
9261 /* -------- Now adding new entried into lookup hash table */
9262
9263 if((UpdateLookupInfo(defline, fdbp->lookup, fdbp->num_of_seqs,
9264 fdbp->fd_stmp, fdbp->options->parse_mode,
9265 fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
9266
9267 if ( defline != title && defline != tmpbuff)
9268 MemFree(defline);
9269
9270 return FALSE;
9271 }
9272
9273 if (defline != title && defline != tmpbuff)
9274 MemFree(defline);
9275
9276 return TRUE;
9277 }
9278
9279 /* Creates a new volume of the blast database being created if the sequence
9280 * being added causes it to exceed the volume limitations (number of
9281 * letters/sequences) */
FDBCreateNewVolume(FormatDBPtr fdbp,const ByteStorePtr seq,Int4 seq_length,const Uint4Ptr ambiguities)9282 static Int4 FDBCreateNewVolume(FormatDBPtr fdbp,
9283 const ByteStorePtr seq,
9284 Int4 seq_length,
9285 const Uint4Ptr ambiguities)
9286 {
9287 FDB_optionsPtr options = fdbp->options;
9288 Int4 amb_size = 0; /* size of ambiguities for this sequence */
9289 Int8 seq_size = 0; /* length of sequence file with new sequence being added */
9290 Int4 hdr_size = 0; /* size of the header file without new sequence */
9291 Char extension_prefix = options->is_protein ? 'p' : 'n';
9292
9293 if (ambiguities) {
9294 amb_size = sizeof(*ambiguities) * ((*ambiguities)&0x7fffffffUL);
9295 }
9296 seq_size = (ftell(fdbp->fd_seq) + BSLen(seq) + 1 + amb_size);
9297 hdr_size = ftell(fdbp->aip_def ? fdbp->aip_def->fp : fdbp->fd_def);
9298
9299 if ( /* if bases_in_volume was specified, don't exceed that */
9300 (options->bases_in_volume &&
9301 (fdbp->TotalLen + seq_length > options->bases_in_volume)) ||
9302 /* if sequences_in_volume was specified, don't exceed that (will be
9303 * deprecated) */
9304 (options->sequences_in_volume &&
9305 (fdbp->num_of_seqs+1) > options->sequences_in_volume) ||
9306 /* if sequence file is about to grow larger than SEQFILE_SIZE_MAX */
9307 ( seq_size > SEQFILE_SIZE_MAX) ||
9308 /* if header file is about to grow too large (assuming header can not
9309 * exceed 2G - 2000000000b) */
9310 ( hdr_size > 2000000000UL)
9311 )
9312 {
9313 Char dbnamebuf[PATH_MAX];
9314 FormatDBPtr tmp_fdbp = NULL;
9315
9316 if (options->volume == 1) {
9317 sprintf(dbnamebuf, "%s.00", options->base_name);
9318 } else {
9319 sprintf(dbnamebuf, "%s", options->base_name);
9320 }
9321 ErrLogPrintf("Closing volume %s with %ld sequences, %s letters"
9322 "(.%csq file = %ld bytes; .%chr file = %ld bytes)\n",
9323 options->base_name, fdbp->num_of_seqs,
9324 Nlm_Int8tostr(fdbp->TotalLen, 1),
9325 extension_prefix, (long)seq_size,
9326 extension_prefix, (long)hdr_size);
9327 tmp_fdbp = (FormatDBPtr) MemNew(sizeof(FormatDB));
9328 MemCpy(tmp_fdbp, fdbp, sizeof(FormatDB));
9329
9330 if(FormatDBClose(tmp_fdbp))
9331 return 9;
9332 if (++options->volume >= kFDBMaxNumVolumes) {
9333 FDBCleanUpInProgress(options);
9334 ErrPostEx(SEV_FATAL, 1, 0,
9335 "BLAST database exceeded %d volumes, please adjust the -v "
9336 "option to formatdb (number of bases per volume)",
9337 kFDBMaxNumVolumes);
9338 return -1;
9339 }
9340
9341 /* When second volume is created, add suffix .00 to all
9342 first volume files */
9343 if (options->volume == 1)
9344 {
9345 Char oldnamebuf[FILENAME_MAX], newnamebuf[FILENAME_MAX];
9346 int len = StringLen(options->base_name) + 2;
9347 sprintf(oldnamebuf, "%s.%cin", options->base_name, extension_prefix);
9348 sprintf(newnamebuf, "%s.00.%cin", options->base_name,
9349 extension_prefix);
9350 if (FileLength(oldnamebuf) > 0)
9351 FileRename(oldnamebuf, newnamebuf);
9352 StringCpy(oldnamebuf + len, "hr");
9353 StringCpy(newnamebuf + len + 3, "hr");
9354 if (FileLength(oldnamebuf) > 0)
9355 FileRename(oldnamebuf, newnamebuf);
9356 StringCpy(oldnamebuf + len, "sq");
9357 StringCpy(newnamebuf + len + 3, "sq");
9358 if (FileLength(oldnamebuf) > 0)
9359 FileRename(oldnamebuf, newnamebuf);
9360 StringCpy(oldnamebuf + len, "nd");
9361 StringCpy(newnamebuf + len + 3, "nd");
9362 if (FileLength(oldnamebuf) > 0)
9363 FileRename(oldnamebuf, newnamebuf);
9364 StringCpy(oldnamebuf + len, "ni");
9365 StringCpy(newnamebuf + len + 3, "ni");
9366 if (FileLength(oldnamebuf) > 0)
9367 FileRename(oldnamebuf, newnamebuf);
9368 StringCpy(oldnamebuf + len, "sd");
9369 StringCpy(newnamebuf + len + 3, "sd");
9370 if (FileLength(oldnamebuf) > 0)
9371 FileRename(oldnamebuf, newnamebuf);
9372 StringCpy(oldnamebuf + len, "si");
9373 StringCpy(newnamebuf + len + 3, "si");
9374 if (FileLength(oldnamebuf) > 0)
9375 FileRename(oldnamebuf, newnamebuf);
9376 if (options->dump_info) {
9377 StringCpy(oldnamebuf + len, "di");
9378 StringCpy(newnamebuf + len + 3, "di");
9379 if (FileLength(oldnamebuf) > 0)
9380 FileRename(oldnamebuf, newnamebuf);
9381 }
9382 if (options->is_protein) {
9383 /* PIG ISAM files */
9384 StringCpy(oldnamebuf + len, "pd");
9385 StringCpy(newnamebuf + len + 3, "pd");
9386 if (FileLength(oldnamebuf) > 0)
9387 FileRename(oldnamebuf, newnamebuf);
9388 StringCpy(oldnamebuf + len, "pi");
9389 StringCpy(newnamebuf + len + 3, "pi");
9390 if (FileLength(oldnamebuf) > 0)
9391 FileRename(oldnamebuf, newnamebuf);
9392 }
9393
9394 MemFree(options->base_name);
9395 newnamebuf[len+1] = NULLB;
9396 options->base_name = StringSave(newnamebuf);
9397 }
9398
9399 {
9400 CharPtr ptr;
9401 ptr = options->base_name + StringLen(options->base_name) - 2;
9402 sprintf(ptr, "%02ld", (long) options->volume);
9403 }
9404
9405 if ((tmp_fdbp = FormatDBInit(options)) == NULL)
9406 return 2;
9407
9408 MemCpy(fdbp, tmp_fdbp, sizeof(FormatDB));
9409 MemFree(tmp_fdbp);
9410 }
9411
9412 return 0;
9413 }
9414
/* Asserts that the current position in fdbp->fd_seq plus BSLen(seq), one
 * extra byte and the masked first ambiguity word (when ambiguities are
 * given) stays below 0x7fffffff, then lets FDBFillIndexTables() record the
 * sequence.  Returns whatever FDBFillIndexTables() returns. */
static Int4 FDBExtend4Sequence(FormatDBPtr fdbp, 
                               const ByteStorePtr seq, 
                               Int4 seq_length, const Uint4Ptr ambiguities)
{
    assert(ftell(fdbp->fd_seq) + BSLen(seq) + 1 + 
           (ambiguities != NULL ? (*ambiguities) & 0x7fffffffUL : 0) <
           0x7fffffffUL);

    return FDBFillIndexTables(fdbp, seq_length);
}
9426
/* Accounts for a sequence of seq_length letters in the index data of fdbp.
 *
 * Updates TotalLen and MaxSeqLen, grows the offset tables by
 * INDEX_ARRAY_CHUNKS entries when slot num_of_seqs + 1 would not fit, and
 * stores the current positions of the definition and sequence files in
 * slot num_of_seqs of the respective tables.
 *
 * Returns 0 on success and 1 when a table cannot be grown. */
Int4 FDBFillIndexTables(FormatDBPtr fdbp, Int4 seq_length)
{
    Boolean out_of_memory;

    fdbp->TotalLen += seq_length;

    if (seq_length > fdbp->MaxSeqLen)
        fdbp->MaxSeqLen = seq_length;

    if ((fdbp->num_of_seqs + 1) >= fdbp->OffsetAllocated) {
        fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;

        fdbp->DefOffsetTable =
            (Int4Ptr) Realloc(fdbp->DefOffsetTable,
                              fdbp->OffsetAllocated * sizeof(Uint4));
        fdbp->SeqOffsetTable =
            (Int4Ptr) Realloc(fdbp->SeqOffsetTable,
                              fdbp->OffsetAllocated * sizeof(Uint4));
        out_of_memory = (fdbp->DefOffsetTable == NULL ||
                         fdbp->SeqOffsetTable == NULL);

        /* the ambiguity table exists for nucleotide databases only */
        if (!out_of_memory && !fdbp->options->is_protein) {
            fdbp->AmbOffsetTable =
                (Int4Ptr) Realloc(fdbp->AmbOffsetTable,
                                  fdbp->OffsetAllocated * sizeof(Uint4));
            out_of_memory = (fdbp->AmbOffsetTable == NULL);
        }

        if (out_of_memory) {
            ErrLogPrintf
                ("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
            return 1;
        }
    }

    if (fdbp->aip_def == NULL) {
        if (fdbp->fd_def)
            fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
    } else {
        /* Structured deflines */
        fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
    }

    if (fdbp->fd_seq)
        fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);

    return 0;
}
9471
9472 /********* BEGIN: Auxiliary functions to the SI_Record structure ************/
9473
9474 /** Allocates a single node in the SI_Record linked list structure */
SI_RecordNew(void)9475 SI_Record* SI_RecordNew(void)
9476 {
9477 return (SI_Record*) calloc(1, sizeof(SI_Record));
9478 }
9479
9480 /** Deallocates the linked list of SI_Record structures in srp
9481 * @return NULL
9482 */
SI_RecordFree(SI_Record * srp)9483 SI_Record* SI_RecordFree(SI_Record* srp)
9484 {
9485 if ( !srp ) {
9486 return NULL;
9487 }
9488
9489 while (srp) {
9490 SI_Record* tmp = srp->next;
9491 if (srp->title) {
9492 srp->title = MemFree(srp->title);
9493 }
9494 MemFree(srp);
9495 srp = tmp;
9496 }
9497 return NULL;
9498 }
9499
9500 /** Appends a new node to the srp linked list.
9501 * @return the newly allocated node
9502 */
SI_RecordAddNode(SI_Record * srp)9503 static SI_Record* SI_RecordAddNode(SI_Record* srp)
9504 {
9505 if ( !srp ) {
9506 return SI_RecordNew();
9507 } else {
9508 for (; srp->next; srp = srp->next) ;
9509 srp->next = SI_RecordNew();
9510 return srp->next;
9511 }
9512 }
9513
9514 /** Appends a new node to the srp linked list from data used in the
9515 * FORMATDB_VER format of the BLAST databases
9516 * @return pointer to the newly added node
9517 */
9518 static SI_Record*
SI_RecordAddFormatdb_ver(SI_Record * srp,int gi,int owner,const char * div,int date,Uint1 mol,const BlastDefLinePtr bdp)9519 SI_RecordAddFormatdb_ver(SI_Record* srp, int gi, int owner, const char* div,
9520 int date, Uint1 mol, const BlastDefLinePtr bdp)
9521 {
9522 ASSERT(bdp);
9523
9524 srp = SI_RecordAddNode(srp);
9525
9526 srp->gi = gi;
9527 srp->owner = owner;
9528 srp->ent = date;
9529 srp->taxid = bdp->taxid;
9530 srp->mol = mol;
9531
9532 if (div) {
9533 StringNCpy_0(srp->div, div, sizeof(srp->div));
9534 }
9535 if (bdp->seqid) {
9536 SeqIdWrite(bdp->seqid, srp->seqid, PRINTID_FASTA_LONG,
9537 sizeof(srp->seqid));
9538 }
9539 if (bdp->title) {
9540 srp->title = StringSave(bdp->title);
9541 }
9542 return srp;
9543 }
9544
9545 /** Appends a new node to the srp linked list from data used in the
9546 * FORMATDB_VER_TEXT format of the BLAST databases
9547 * @return pointer to the newly added node
9548 */
9549 static SI_Record*
SI_RecordAddFormatdb_ver_text(SI_Record * srp,Int4 gi,Int4 owner,Int4 taxid,char * div,Int4 date,Uint1 mol,char * seq_id,char * title)9550 SI_RecordAddFormatdb_ver_text(SI_Record* srp, Int4 gi, Int4 owner, Int4 taxid,
9551 char* div, Int4 date, Uint1 mol, char* seq_id,
9552 char* title)
9553 {
9554 srp = SI_RecordAddNode(srp);
9555
9556 srp->gi = gi;
9557 srp->owner = owner;
9558 srp->ent = date;
9559 srp->taxid = taxid;
9560 srp->mol = mol;
9561
9562 if (div)
9563 StringNCpy_0(srp->div, div, sizeof(srp->div));
9564 if (seq_id)
9565 StringNCpy_0(srp->seqid, seq_id, sizeof(srp->seqid));
9566 if (title)
9567 srp->title = StringSave(title);
9568 return srp;
9569 }
9570
9571 /********* END: Auxiliary functions to the SI_Record structure ************/
9572
s_GetPrintableSequenceId(const SeqIdPtr seqid,char * seqid_string,char buffer[],size_t buffer_sz)9573 static void s_GetPrintableSequenceId(const SeqIdPtr seqid,
9574 char* seqid_string,
9575 char buffer[],
9576 size_t buffer_sz)
9577 {
9578 if (seqid_string) {
9579 StringNCpy_0(buffer, seqid_string, buffer_sz-1);
9580 } else {
9581 SeqIdWrite(seqid, buffer, PRINTID_FASTA_LONG, buffer_sz-1);
9582 }
9583 }
9584
9585 /* If the bdp parameter is given, the defline, Seq-id, and taxonomy
9586 * information, is obtained from this parameter and thus the remainder
9587 * parameters are ignored. */
FDBAddSequence(FormatDBPtr fdbp,BlastDefLinePtr bdp,Uint1 * seq_data_type,ByteStorePtr * seq_data,Int4 SequenceLen,CharPtr seq_id,CharPtr title,Int4 gi,Int4 tax_id,CharPtr div,Int4 owner,Int4 date)9588 Int2 FDBAddSequence(FormatDBPtr fdbp, BlastDefLinePtr bdp,
9589 Uint1* seq_data_type, ByteStorePtr * seq_data,
9590 Int4 SequenceLen,
9591
9592 /* These 2 parameters are left for the backward
9593 compatibility. They are not used for ASN.1 structues
9594 deflines dump */
9595 CharPtr seq_id, CharPtr title,
9596 /* These parameters suppose, that this function adds
9597 sequence to the Blast database with single definition
9598 line. Generally speaking, this is not the common case
9599 and if this function is used to add sequence item with
9600 many definition lines these parameters must not be used
9601 at all. */
9602 Int4 gi, Int4 tax_id, CharPtr div, Int4 owner, Int4 date)
9603 {
9604 Uint4Ptr AmbCharPtr = NULL;
9605 ByteStorePtr new_data;
9606 Int2 status = 0;
9607
9608 ASSERT(seq_data);
9609 ASSERT(seq_data_type);
9610
9611 if (SequenceLen <= 0) {
9612 char tmpbuf[128] = { NULLB };
9613 s_GetPrintableSequenceId(bdp->seqid, seq_id, tmpbuf, sizeof(tmpbuf));
9614 ErrPostEx(SEV_WARNING, 0, 0,
9615 "Cannot add sequence number %ld (%s) because it has zero-length.\n",
9616 (fdbp->options->total_num_of_seqs + 1), tmpbuf);
9617 return 1;
9618 }
9619 if (fdbp->options->is_protein) {
9620 if (*seq_data_type != Seq_code_ncbistdaa) {
9621 new_data = BSConvertSeq(*seq_data, Seq_code_ncbistdaa,
9622 *seq_data_type, SequenceLen);
9623 *seq_data = new_data;
9624 *seq_data_type = Seq_code_ncbistdaa;
9625 }
9626 } else { /* if(!fdbp->options->is_protein) */
9627
9628 AmbCharPtr = NULL;
9629 if (*seq_data_type != Seq_code_ncbi2na
9630 && *seq_data_type != Seq_code_ncbi4na) {
9631 Uint1 new_code;
9632 new_data =
9633 BSPack(*seq_data, *seq_data_type, SequenceLen, &new_code);
9634 if (new_data != NULL) {
9635 *seq_data = new_data;
9636 *seq_data_type = new_code;
9637 }
9638 }
9639
9640 if (*seq_data_type == Seq_code_ncbi4na && seq_data != NULL) {
9641 /* ncbi4na require compression into ncbi2na */
9642
9643 if (fdbp->options->version > FORMATDB_VER_TEXT) {
9644 if ((new_data = BSCompressDNANew(*seq_data, SequenceLen,
9645 &AmbCharPtr)) == NULL) {
9646 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
9647 "Formating failed.\n");
9648 return 3;
9649 }
9650 } else {
9651 if ((new_data = BSCompressDNA(*seq_data, SequenceLen,
9652 &AmbCharPtr)) == NULL) {
9653 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
9654 "Formating failed.\n");
9655 return 3;
9656 }
9657 }
9658 *seq_data = new_data;
9659
9660 *seq_data_type = Seq_code_ncbi2na; /* just for information */
9661
9662 } else {
9663 Uint1 remainder;
9664 /* if sequence already in ncbi2na format we have to update last
9665 byte */
9666 BSSeek(*seq_data, SequenceLen / 4, SEEK_SET);
9667
9668 if ((remainder = (SequenceLen % 4)) == 0) {
9669 BSPutByte(*seq_data, NULLB);
9670 } else {
9671 Uint1 ch = remainder + (BSGetByte(*seq_data) & 0xfc);
9672 BSSeek(*seq_data, SequenceLen / 4, SEEK_SET);
9673 BSPutByte(*seq_data, ch);
9674 }
9675 }
9676 } /* if(!fdbp->options->is_protein) */
9677
9678 /* Prepare SI_Record structure for calling FDBAddSequence2 */
9679 {
9680 SI_Record* si = NULL;
9681 /* There is no information available here to distinguish DNA from RNA
9682 etc., so assign only AA or DNA molecule type. */
9683 Uint1 mol = (fdbp->options->is_protein ? Seq_mol_aa : Seq_mol_dna);
9684
9685 if (bdp != NULL) {
9686 Boolean first_iteration = TRUE;
9687 for (; bdp; bdp = bdp->next) {
9688 if (first_iteration) {
9689 si = SI_RecordAddFormatdb_ver(si, gi, owner, div, date, mol,
9690 bdp);
9691 first_iteration = FALSE;
9692 } else {
9693 SI_RecordAddFormatdb_ver(si, gi, owner, div, date, mol, bdp);
9694 }
9695 }
9696 } else {
9697 si = SI_RecordAddFormatdb_ver_text(si, gi, owner, tax_id, div,
9698 date, mol, seq_id, title);
9699 }
9700
9701 status = FDBAddSequence2(fdbp, si, *seq_data_type, seq_data,
9702 SequenceLen, AmbCharPtr, PIG_NONE, 0);
9703
9704 si = SI_RecordFree(si);
9705 }
9706
9707 return status;
9708 }
9709
/* Computes a 32-bit hash of the first sequence_length bytes of sequence.
 * Starting from 0, every byte updates the value as
 *     hash = hash * 1103515245 + byte + 12345
 * with the result truncated to Uint4.  A non-positive length yields 0. */
Uint4 readdb_sequence_hash(const char* sequence, int sequence_length)
{
    Uint4 acc = 0;
    int pos = 0;

    while (pos < sequence_length) {
        acc = acc * 1103515245 + (unsigned long) (sequence[pos]) + 12345;
        pos++;
    }
    return acc;
}
9720
9721 /* See comment in readdb.h */
FDBAddSequence2(FormatDBPtr fdbp,SI_RecordPtr srp,Uint1 seq_data_type,const ByteStorePtr * seq_data,Int4 SequenceLen,Uint4Ptr AmbCharPtr,Int4 pig_id,Uint4 hash)9722 Int2 FDBAddSequence2(FormatDBPtr fdbp, /* target blast db */
9723 SI_RecordPtr srp, /* linked list of sequence
9724 information for each gi */
9725 /* sequence data itself */
9726 Uint1 seq_data_type,
9727 const ByteStorePtr * seq_data,
9728 Int4 SequenceLen,
9729 Uint4Ptr AmbCharPtr,
9730
9731 Int4 pig_id, /* stable protein group identifier */
9732 Uint4 hash /* sequence hash - to allow reuse of hash
9733 calculated in ID */
9734 )
9735 {
9736 BlastDefLinePtr bdp_first = NULL;
9737 BlastDefLinePtr bdp_cur = NULL;
9738 SI_RecordPtr pc = NULL;
9739
9740 if (SequenceLen <= 0) {
9741 ErrLogPrintf("Sequence number %ld has zero-length!\n",
9742 (fdbp->options->total_num_of_seqs + 1));
9743 return 1;
9744 }
9745
9746 /* If too many bases in thise file, start a new volume */
9747 if (FDBCreateNewVolume(fdbp, *seq_data, SequenceLen, AmbCharPtr))
9748 return 1;
9749
9750 if (FDBExtend4Sequence(fdbp, *seq_data, SequenceLen, AmbCharPtr))
9751 return 1;
9752
9753 /* ---------- Dumping sequence data ---------- */
9754
9755 BSSeek(*seq_data, 0, SEEK_SET);
9756 for (;;) {
9757 Char tmpbuff[1025];
9758 int len = BSRead(*seq_data, tmpbuff, sizeof(tmpbuff) - 1);
9759 if (len <= 0)
9760 break;
9761 if (FileWrite(tmpbuff, len, 1, fdbp->fd_seq) != (Uint4) 1)
9762 return 1;
9763 if (hash == 0 && fdbp->options->dump_info) {
9764 hash = readdb_sequence_hash(tmpbuff, len);
9765 }
9766 }
9767
9768 if (fdbp->options->is_protein) {
9769 int i = 0;
9770 ASSERT(seq_data_type == Seq_code_ncbistdaa);
9771 if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
9772 return 1;
9773 } else {
9774 ASSERT(seq_data_type == Seq_code_ncbi2na);
9775 /* dump ambiguity characters. */
9776 fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq); /* Anyway... */
9777
9778 /* if AmbCharPtr is not NULL, then there was ambiguity. */
9779 if (AmbCharPtr != NULL) {
9780 Uint4 total, index;
9781
9782 /* The first Uint4 holds the total number of ambig. bp. */
9783 total = (*AmbCharPtr) + 1;
9784 total &= 0x7FFFFFFF;
9785 for (index = 0; index < total; index++) {
9786 if (!FormatDbUint4Write(AmbCharPtr[index], fdbp->fd_seq))
9787 return 1;
9788 }
9789 MemFree(AmbCharPtr);
9790 AmbCharPtr = NULL;
9791 }
9792 }
9793
9794 /* This information is written to the *.[pn]di file, and it is also
9795 needed to set the membership bits in the FORMATDB_VER version of the
9796 blast databases. */
9797
9798 if (fdbp->options->version == FORMATDB_VER_TEXT) {
9799 if (fdbp->options->dump_info) {
9800 /* ------- Dumping misc info file ----------- */
9801 fprintf(fdbp->fd_sdi, "%ld %ld %ld %ld %s %ld %ld %ld\n",
9802 (long) fdbp->num_of_seqs, (long) srp->gi,
9803 (long) srp->taxid, (long) srp->owner,
9804 srp->div ? srp->div : "N/A", (long) SequenceLen,
9805 (long) hash, (long) srp->ent);
9806 }
9807
9808 /* ------- Dumping definition line ---------- */
9809 if (!FDBDumpDefline(fdbp, srp->title, srp->seqid)) {
9810 ErrPostEx(SEV_ERROR, 0, 0,
9811 "FDBDumpDefline() failed. Formating terminated abnormaly");
9812 return 1;
9813 }
9814 return 1;
9815 }
9816
9817 assert(fdbp->options->version >= FORMATDB_VER);
9818
9819 for (pc = srp; pc; pc = pc->next) {
9820 DI_Record direc;
9821 MemSet((VoidPtr) & direc, 0, sizeof(direc));
9822 direc.oid = fdbp->num_of_seqs;
9823 direc.gi = pc->gi;
9824 direc.taxid = pc->taxid;
9825 direc.owner = pc->owner;
9826 direc.date = pc->ent;
9827 direc.len = SequenceLen;
9828 direc.hash = hash;
9829 direc.mol = pc->mol;
9830
9831 if (bdp_cur == NULL) {
9832 bdp_first = bdp_cur = FDLCreateAsnDF(fdbp, pc->seqid, pc->title,
9833 pc->taxid);
9834 } else {
9835 bdp_cur = bdp_cur->next =
9836 FDLCreateAsnDF(fdbp, pc->seqid, pc->title, pc->taxid);
9837 }
9838
9839 /* Add the PIG information */
9840 if (fdbp->options->is_protein && pig_id != PIG_NONE) {
9841 if (!bdp_cur->other_info) {
9842 ValNodeAddInt(&bdp_cur->other_info, 0, pig_id);
9843 }
9844 }
9845
9846 if (fdbp->options->dump_info) {
9847 /* ------ Dumping misc info file ----------- */
9848 CharPtr acc =
9849 FDFGetAccessionFromSeqIdChain((SeqIdPtr) bdp_cur->seqid);
9850 direc.acc = acc;
9851 fprintf(fdbp->fd_sdi, "%ld %ld %ld %ld %s %ld %ld %ld %s %u\n",
9852 (long) direc.oid, (long) direc.gi, (long) direc.taxid,
9853 (long) direc.owner, pc->div ? pc->div : "N/A",
9854 (long) direc.len, (long) direc.hash, (long) direc.date,
9855 (char *) (acc ? acc : "unknown"), (unsigned int) direc.mol);
9856 }
9857
9858 /* ------- Add the links and membership information -- */
9859 FDBAddLinksInformation(bdp_cur, fdbp->options->linkbit_listp);
9860 FDBAddMembershipInformation(bdp_cur, fdbp->options->memb_tblp,
9861 (VoidPtr) & direc);
9862 if (direc.acc)
9863 MemFree(direc.acc);
9864
9865 } /* end of SI record loop */
9866
9867 if (fdbp->options->is_protein)
9868 FDBAddPig(fdbp->ptable, pig_id, fdbp->num_of_seqs);
9869
9870
9871 /* ------- Dumping definition line ---------- */
9872 if (!FDBDumpDeflineAsn(fdbp, bdp_first)) {
9873 ErrPostEx(SEV_ERROR, 0, 0,
9874 "FDBDumpDeflineAsn() failed. Formating terminated abnormaly");
9875 return 1;
9876 }
9877
9878 BlastDefLineSetFree(bdp_first);
9879
9880 fdbp->num_of_seqs++; /* Finshed ... */
9881 fdbp->options->total_num_of_seqs++;
9882 /* ---------------------------------------------- */
9883
9884 return 0;
9885 }
9886
FDBAddBioseq(FormatDBPtr fdbp,BioseqPtr bsp,BlastDefLinePtr bdp)9887 Int2 FDBAddBioseq(FormatDBPtr fdbp, BioseqPtr bsp, BlastDefLinePtr bdp)
9888 {
9889 if (bsp == NULL || bsp->seq_data_type == Seq_code_gap) return 0;
9890
9891 if ( !bdp ) {
9892 ASSERT(fdbp->options->version == FORMATDB_VER_TEXT);
9893 return FDBAddSequence (fdbp, NULL, &bsp->seq_data_type,
9894 (ByteStorePtr PNTR) &bsp->seq_data,
9895 bsp->length, 0, BioseqGetTitle(bsp),
9896 0, 0, 0, 0, 0);
9897 } else {
9898 ASSERT(fdbp->options->version >= FORMATDB_VER);
9899 return FDBAddSequence (fdbp, bdp, &bsp->seq_data_type,
9900 (ByteStorePtr PNTR) &bsp->seq_data,
9901 bsp->length, NULL, NULL,
9902 0, 0, 0, 0, 0);
9903 }
9904
9905 }
9906
9907 /*******************************************************************************
9908 * Pass thru each bioseq into given SeqEntry and write corresponding information
9909 * into "def", "index", ...., files
9910 *******************************************************************************
9911 * Parameters:
9912 * fdbp - pointer to memory to be freed
9913 *
9914 * Returns NULL
9915 ******************************************************************************/
process_sep(SeqEntryPtr sep,FormatDBPtr fdbp)9916 Int2 process_sep (SeqEntryPtr sep, FormatDBPtr fdbp)
9917 {
9918
9919 Int4 SequenceLen;
9920 BioseqPtr bsp = NULL;
9921 CharPtr defline;
9922 Char tmpbuff[1024];
9923 Int4 buffer_size=0, defline_len=0;
9924 CharPtr buffer=NULL;
9925 Int4 len, id_length;
9926 Uint4Ptr AmbCharPtr = NULL;
9927 Uint1 ch, remainder;
9928 Uint4 i, total, index;
9929
9930 if (IS_Bioseq(sep))
9931 bsp = (BioseqPtr) sep->data.ptrvalue;
9932 else
9933 /* This is Bioseq-set. Exit */
9934 return 0;
9935
9936 if (bsp == NULL || bsp->seq_data_type == Seq_code_gap) return 0;
9937
9938 /* Make a convertion to stadard form */
9939
9940 if (fdbp->options->is_protein)
9941 BioseqRawConvert(bsp, Seq_code_ncbistdaa);
9942
9943 SequenceLen = bsp->length;
9944 fdbp->TotalLen += SequenceLen;
9945
9946 if (fdbp->MaxSeqLen < SequenceLen)
9947 fdbp->MaxSeqLen = SequenceLen;
9948
9949 if(fdbp->OffsetAllocated <= (fdbp->num_of_seqs+1)) {
9950 fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;
9951
9952 fdbp->DefOffsetTable = (Int4Ptr)Realloc(fdbp->DefOffsetTable,
9953 fdbp->OffsetAllocated*sizeof(Uint4));
9954 fdbp->SeqOffsetTable = (Int4Ptr)Realloc(fdbp->SeqOffsetTable,
9955 fdbp->OffsetAllocated*sizeof(Uint4));
9956
9957 if (!fdbp->DefOffsetTable || !fdbp->SeqOffsetTable) {
9958 ErrLogPrintf("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
9959 return 0;
9960 }
9961
9962 if(!fdbp->options->is_protein) {
9963 fdbp->AmbOffsetTable = (Int4Ptr)Realloc(fdbp->AmbOffsetTable,
9964 fdbp->OffsetAllocated*sizeof(Uint4));
9965 if (!fdbp->AmbOffsetTable) {
9966 ErrLogPrintf("Not enough memory to allocate main formatdb structure. Formatting failed.\n");
9967 return 0;
9968 }
9969 }
9970 }
9971
9972 if(fdbp->aip_def != NULL) /* Structured deflines */
9973 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
9974 else
9975 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
9976
9977 fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
9978
9979 /* ---------------------- */
9980
9981 if(fdbp->options->parse_mode == FALSE) {
9982 sprintf(tmpbuff, "%s%ld ", NON_SEQID_PREFIX, (long) fdbp->num_of_seqs);
9983 if (FileWrite(tmpbuff, StringLen(tmpbuff), 1, fdbp->fd_def) != (Uint4) 1)
9984 return 1;
9985 defline = (CharPtr)bsp->descr->data.ptrvalue;
9986 } else {
9987 if (bsp->descr)
9988 defline_len = StringLen(BioseqGetTitle(bsp));
9989 else
9990 defline_len = 0;
9991 defline_len += 255; /* Sufficient for an ID. */
9992 if (buffer_size < defline_len) {
9993 if (buffer)
9994 buffer = MemFree(buffer);
9995 buffer = MemNew((defline_len+1)*sizeof(Char));
9996 buffer_size = defline_len;
9997 }
9998 SeqIdWrite(bsp->id, buffer, PRINTID_FASTA_LONG, STRLENGTH);
9999 id_length = StringLen(buffer);
10000 buffer[id_length] = ' ';
10001 id_length++;
10002 StringCpy(&buffer[id_length], BioseqGetTitle(bsp));
10003 defline = buffer;
10004 }
10005 if (FileWrite(defline, StringLen(defline), 1, fdbp->fd_def) != (Uint4) 1)
10006 return 1;
10007
10008 /* -------- Now adding new entried into lookup hash table */
10009
10010 if((UpdateLookupInfo(defline, fdbp->lookup, fdbp->num_of_seqs,
10011 fdbp->fd_stmp, fdbp->options->parse_mode,
10012 fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
10013 return -1;
10014 }
10015
10016 defline = NULL;
10017 if (buffer)
10018 MemFree(buffer);
10019
10020 if(!fdbp->options->is_protein) {
10021 AmbCharPtr = NULL;
10022 if (bsp->seq_data_type == Seq_code_ncbi4na && bsp->seq_data != NULL){
10023
10024 /* ncbi4na require compression into ncbi2na */
10025
10026 if (fdbp->options->version > FORMATDB_VER_TEXT)
10027 {
10028 if((bsp->seq_data = (SeqDataPtr) BSCompressDNANew((ByteStorePtr) bsp->seq_data, bsp->length,
10029 &(AmbCharPtr))) == NULL) {
10030 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10031 "Formating failed.\n");
10032 return -1;
10033 }
10034 }
10035 else
10036 {
10037 if((bsp->seq_data = (SeqDataPtr) BSCompressDNA((ByteStorePtr) bsp->seq_data, bsp->length,
10038 &(AmbCharPtr))) == NULL) {
10039 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10040 "Formating failed.\n");
10041 return -1;
10042 }
10043 }
10044
10045 bsp->seq_data_type = Seq_code_ncbi2na; /* just for information */
10046 } else {
10047 /* if sequence already in ncbi2na format we have to update last byte */
10048
10049 if((remainder = (bsp->length%4)) == 0) {
10050 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4+1, SEEK_SET);
10051 BSPutByte((ByteStorePtr) bsp->seq_data, NULLB);
10052 } else {
10053 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10054 ch = remainder + BSGetByte((ByteStorePtr) bsp->seq_data);
10055 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10056 BSPutByte((ByteStorePtr) bsp->seq_data, ch);
10057 }
10058 }
10059 }
10060 /* Now dumping sequence */
10061
10062 BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
10063
10064 while((len = BSRead((ByteStorePtr) bsp->seq_data, tmpbuff, sizeof(tmpbuff))) != 0) {
10065 if (FileWrite(tmpbuff, len, 1, fdbp->fd_seq) != (Uint4) 1)
10066 return 1;
10067 }
10068
10069
10070 if(fdbp->options->is_protein) {
10071 i=0;
10072 if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
10073 return 1;
10074 } else {
10075 /* dump ambiguity characters. */
10076 fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq); /* Anyway... */
10077
10078 /* if AmbCharPtr is not NULL, then there was ambiguity. */
10079 if(AmbCharPtr != NULL) { /* The first Uint4 holds the total number of ambig. bp. */
10080 total = (*AmbCharPtr)+1;
10081 for (index=0; index<total; index++) {
10082 if (!FormatDbUint4Write(AmbCharPtr[index], fdbp->fd_seq))
10083 return 1;
10084 }
10085 MemFree(AmbCharPtr);
10086 AmbCharPtr = NULL;
10087 }
10088 }
10089
10090 fdbp->num_of_seqs++; /* Finshed ... */
10091
10092 return 0;
10093 }
10094
10095 /* ------------------------------------------------------------------
10096 This is handler for HeapSort function
10097 ------------------------------------------------------------------*/
ID_Compare(VoidPtr i,VoidPtr j)10098 static int LIBCALLBACK ID_Compare(VoidPtr i, VoidPtr j)
10099 {
10100 if (*(Int4Ptr)i > *(Int4Ptr)j)
10101 return (1);
10102 if (*(Int4Ptr)i < *(Int4Ptr)j)
10103 return (-1);
10104 return (0);
10105 }
10106
10107 /*******************************************************************************
10108 * Finish stage - out offset tables, etc, into files. Is to be called before
10109 * FormatDBClose()
10110 *******************************************************************************
10111 * Parameters:
10112 *
10113 *
10114 * Returns void
10115 ******************************************************************************/
10116 #define DATETIME_LENGTH 64
10117
FDBFinish(FormatDBPtr fdbp)10118 static Int2 FDBFinish (FormatDBPtr fdbp)
10119 {
10120 Char DBName[FILENAME_MAX];
10121 Int4 title_len;
10122 Char dateTime[DATETIME_LENGTH];
10123 ISAMObjectPtr object;
10124 ISAMErrorCode error;
10125 Uint4 i;
10126 Char filenamebuf[FILENAME_MAX];
10127 Int2 tmp, extra_bytes = 0;
10128
10129 if(fdbp->aip_def != NULL) /* Structured deflines */
10130 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
10131 else
10132 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
10133
10134 if(!fdbp->options->is_protein) {
10135 fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10136 fdbp->SeqOffsetTable[fdbp->num_of_seqs] =
10137 fdbp->AmbOffsetTable[fdbp->num_of_seqs];
10138 } else {
10139 fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10140 }
10141
10142 /* Parsing finished - now dumping index file */
10143
10144 if(fdbp->options->parse_mode)
10145 FILECLOSE(fdbp->fd_stmp);
10146
10147 /* Information */
10148
10149 if(fdbp->options->version == 0) /* Not Set */
10150 fdbp->options->version = FORMATDB_VER;
10151
10152 if (!FormatDbUint4Write(fdbp->options->version, fdbp->fd_ind))
10153 return 1;
10154 if (!FormatDbUint4Write(fdbp->options->is_protein, fdbp->fd_ind))
10155 return 1;
10156
10157 if(fdbp->options->db_title != NULL)
10158 title_len = StringLen(fdbp->options->db_title);
10159 else
10160 title_len = 0;
10161
10162 if (!FormatDbUint4Write(title_len, fdbp->fd_ind))
10163 return 1;
10164
10165 if (title_len != 0)
10166 if (FileWrite(fdbp->options->db_title, title_len, 1, fdbp->fd_ind) != (Uint4) 1)
10167 return 1;
10168
10169 MemSet(dateTime, 0, DATETIME_LENGTH);
10170 Nlm_DayTimeStr(dateTime, TRUE, TRUE);
10171
10172 /* write db_title and date-time stamp eigth bytes aligned */
10173 tmp = title_len + StringLen(dateTime);
10174 if (tmp%8) {
10175 extra_bytes = 8 - tmp%8;
10176 }
10177 if (!FormatDbUint4Write(StringLen(dateTime) + extra_bytes, fdbp->fd_ind))
10178 return 1;
10179 if (FileWrite(dateTime, StringLen(dateTime) + extra_bytes, 1, fdbp->fd_ind) != 1)
10180 return 1;
10181
10182 if (!FormatDbUint4Write(fdbp->num_of_seqs, fdbp->fd_ind))
10183 return 1;
10184
10185 if (fdbp->options->version == FORMATDB_VER_TEXT) {
10186 if (!FormatDbUint4Write(fdbp->TotalLen, fdbp->fd_ind))
10187 return 1;
10188 } else {
10189 if (!FormatDbUint8Write(fdbp->TotalLen, fdbp->fd_ind))
10190 return 1;
10191 }
10192
10193 if (!FormatDbUint4Write(fdbp->MaxSeqLen, fdbp->fd_ind))
10194 return 1;
10195
10196 /* Offset tables */
10197
10198 for(i=0; i <= fdbp->num_of_seqs; i++) {
10199 if (!FormatDbUint4Write(fdbp->DefOffsetTable[i], fdbp->fd_ind))
10200 return 1;
10201 }
10202
10203 for(i=0; i <= fdbp->num_of_seqs; i++) {
10204 if (!FormatDbUint4Write(fdbp->SeqOffsetTable[i], fdbp->fd_ind))
10205 return 1;
10206 }
10207 if(!fdbp->options->is_protein) {
10208 for(i=0; i <= fdbp->num_of_seqs; i++) {
10209 if (!FormatDbUint4Write(fdbp->AmbOffsetTable[i], fdbp->fd_ind))
10210 return 1;
10211 }
10212 }
10213
10214 if(fdbp->num_of_seqs==0){
10215
10216 Char db_type = fdbp->options->is_protein ? 'p' : 'n';
10217 ErrLogPrintf("FDBFinish: Empty %s database...\n",
10218 fdbp->options->is_protein?"protein":"nucleotide");
10219
10220 /* Close open files and remove them */
10221 FILECLOSE(fdbp->fd_seq);
10222 FILECLOSE(fdbp->fd_def);
10223 ASNIOCLOSE(fdbp->aip_def);
10224 FILECLOSE(fdbp->fd_stmp);
10225 FILECLOSE(fdbp->fd_sdi);
10226 sprintf(filenamebuf, "%s.%chr", fdbp->options->base_name, db_type);
10227 FileRemove(filenamebuf);
10228 sprintf(filenamebuf, "%s.%ctm", fdbp->options->base_name, db_type);
10229 FileRemove(filenamebuf);
10230 sprintf(filenamebuf, "%s.%csq", fdbp->options->base_name, db_type);
10231 FileRemove(filenamebuf);
10232 sprintf(filenamebuf, "%s.%cdi", fdbp->options->base_name, db_type);
10233 FileRemove(filenamebuf);
10234 FILECLOSE(fdbp->fd_ind); /* the only file standing */
10235 return 0;
10236 }
10237
10238 /* Numeric lookup table sort & dump */
10239
10240 if(fdbp->options->parse_mode && fdbp->lookup->used > 0) {
10241
10242 FILE *fd_lookup;
10243 sprintf(DBName, "%s.%cnd", fdbp->options->base_name,
10244 fdbp->options->is_protein ? 'p' : 'n');
10245
10246 fd_lookup = FileOpen(DBName, "wb");
10247
10248 HeapSort(fdbp->lookup->table, fdbp->lookup->used/2,
10249 sizeof(Uint4)*2, ID_Compare);
10250
10251 for(i=0; i < fdbp->lookup->used; i++) {
10252 if (!FormatDbUint4Write(fdbp->lookup->table[i], fd_lookup))
10253 return 1;
10254 }
10255
10256 FILECLOSE(fd_lookup);
10257
10258 /* Now creating numeric ISAM index */
10259
10260 sprintf(filenamebuf, "%s.%cni",
10261 fdbp->options->base_name, fdbp->options->is_protein ? 'p' : 'n');
10262
10263 if((object = ISAMObjectNew(ISAMNumeric,
10264 DBName, filenamebuf)) == NULL) {
10265 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create ISAM object.\n");
10266 return 1;
10267 }
10268
10269 if((error = ISAMMakeIndex(object, 0, 0)) != ISAMNoError) {
10270 if (error == ISAMNoOrder) {
10271 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create index."
10272 " Possibly a gi included more than once in the database.\n", (long) error);
10273 } else {
10274 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create index: ISAMErrorCode %ld.\n", (long) error);
10275 }
10276 return 1;
10277 }
10278 ISAMObjectFree(object);
10279 }
10280
10281 /* String file sorting */
10282
10283 if(fdbp->options->parse_mode) {
10284 if (!FormatdbCreateStringIndex(fdbp->options->base_name,
10285 fdbp->options->is_protein,
10286 fdbp->options->sparse_idx,
10287 fdbp->options->test_non_unique))
10288 return 1;
10289 }
10290 #ifdef FDB_TAXONOMYDB
10291 /* Creating taxonomy names lookup database */
10292 if(fdbp->options->tax_lookup != NULL) {
10293 FILE *tifp, *tdfp;
10294 RDBTaxLookupPtr tax_lookup;
10295 Int4 fd_position;
10296
10297 if (fdbp->options->tax_lookup->taxids_in_db != 0) {
10298
10299 tax_lookup = fdbp->options->tax_lookup;
10300
10301 sprintf(filenamebuf, "%s.%cti", fdbp->options->base_name,
10302 fdbp->options->is_protein ? 'p' : 'n');
10303 tifp = FileOpen(filenamebuf, "wb");
10304
10305 sprintf(filenamebuf, "%s.%ctd", fdbp->options->base_name,
10306 fdbp->options->is_protein ? 'p' : 'n');
10307 tdfp = FileOpen(filenamebuf, "wb");
10308
10309 FormatDbUint4Write(TAX_DB_MAGIC_NUMBER, tifp);
10310 FormatDbUint4Write(tax_lookup->taxids_in_db, tifp);
10311
10312 for(i = 0; i < 4; i++) { /* Here are 4 reserved numbers */
10313 FormatDbUint4Write(0, tifp);
10314 }
10315
10316 for(i = 0; i < tax_lookup->all_taxid_count; i++) {
10317 if(tax_lookup->tax_array[i] != NULL) {
10318 FormatDbUint4Write(tax_lookup->tax_array[i]->tax_id, tifp);
10319 fd_position = ftell(tdfp);
10320 FormatDbUint4Write(fd_position, tifp);
10321 fprintf(tdfp,"%s\t%s\t%s\t%s",
10322 tax_lookup->tax_array[i]->sci_name,
10323 tax_lookup->tax_array[i]->common_name,
10324 tax_lookup->tax_array[i]->blast_name,
10325 tax_lookup->tax_array[i]->s_king);
10326 }
10327 }
10328
10329 /* We need to write one more element to have offset of the last
10330 taxonomy id entry */
10331
10332 FormatDbUint4Write(0, tifp);
10333 fd_position = ftell(tdfp);
10334 FormatDbUint4Write(fd_position, tifp);
10335
10336 FILECLOSE(tifp);
10337 FILECLOSE(tdfp);
10338 } else {
10339 ErrLogPrintf("No taxonomy entries found, no taxonomy database "
10340 "will be created\n");
10341 }
10342 /* Free the taxonomy database built so far, but don't close the
10343 * connection to the taxonomy server, that should be done by the
10344 * client application by calling RDTaxLookupClose() */
10345 fdbp->options->tax_lookup = RDTaxLookupReset(fdbp->options->tax_lookup);
10346 } /* if(tax_lookup != NULL) */
10347 #endif
10348
10349
10350 /* PIG table sort and dump */
10351 if (fdbp->options->is_protein && fdbp->ptable && fdbp->ptable->count > 0) {
10352 FILE *fp;
10353
10354 sprintf(DBName, "%s.ppd", fdbp->options->base_name);
10355
10356 if ( !(fp = FileOpen(DBName, "wb")))
10357 return 1;
10358
10359 HeapSort(fdbp->ptable->pop, fdbp->ptable->count/2,
10360 sizeof(Uint4)*2, ID_Compare);
10361
10362 for (i = 0; i < fdbp->ptable->count; i++) {
10363 if (!FormatDbUint4Write(fdbp->ptable->pop[i], fp))
10364 return 1;
10365 }
10366
10367 FILECLOSE(fp);
10368
10369 /* Create ISAM index for PIG/ordinal id mapping */
10370 sprintf(filenamebuf, "%s.ppi", fdbp->options->base_name);
10371
10372 if ( !(object = ISAMObjectNew(ISAMNumeric, DBName, filenamebuf))) {
10373 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM object.\n");
10374 return 1;
10375 }
10376
10377 if ( (error = ISAMMakeIndex(object, 0, 0)) != ISAMNoError) {
10378 if (error == ISAMNoOrder) {
10379 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM index."
10380 " Possibly a PIG included more than once in the "
10381 "database.\n", (long) error);
10382 } else {
10383 ErrPostEx(SEV_ERROR, 0, 0, "Failed to create PIG ISAM index: "
10384 "ISAMErrorCode %ld.\n", (long) error);
10385 }
10386 return 1;
10387 }
10388 ISAMObjectFree(object);
10389 }
10390
10391 ErrLogPrintf("Formatted %ld sequences in volume %ld\n", fdbp->num_of_seqs,
10392 fdbp->options->volume);
10393
10394 return 0;
10395 } /* end FDBFinish() */
10396
10397
/* Releases an FDB_options structure together with the strings it owns
 * (db_title, db_file, base_name, alias_file_name, gi_file, gi_file_bin).
 *
 * Parameters:
 *      options - structure to release; NULL is accepted
 *
 * Always returns NULL, so callers can write
 *      options = FDBOptionsFree(options);
 * without being left holding a pointer to freed memory.
 */
FDB_optionsPtr FDBOptionsFree(FDB_optionsPtr options)
{
    if (!options)
        return NULL;

    MemFree(options->db_title);
    MemFree(options->db_file);
    MemFree(options->base_name);
    MemFree(options->alias_file_name);
    MemFree(options->gi_file);
    MemFree(options->gi_file_bin);
    MemFree(options);

    /* The structure is gone: never hand the stale address back */
    return NULL;
}
10413 /*******************************************************************************
10414 * Free memory allocated for given variable of FormatDB
10415 *******************************************************************************
10416 * Parameters:
10417 * fdbp - pointer to memory to be freed
10418 *
10419 * Returns NULL
10420 ******************************************************************************/
10421
FormatDBClose(FormatDBPtr fdbp)10422 Int2 FormatDBClose(FormatDBPtr fdbp)
10423 {
10424
10425 /* Now dumping all data to disk */
10426
10427 if(FDBFinish (fdbp))
10428 return 1;
10429
10430 /* ... and MemFree all stuff */
10431
10432 MemFree(fdbp->DefOffsetTable);
10433 MemFree(fdbp->SeqOffsetTable);
10434
10435 if(!fdbp->options->is_protein) {
10436 MemFree(fdbp->AmbOffsetTable);
10437 }
10438
10439 FASTALookupFree(fdbp->lookup);
10440 FDBPigTableFree(fdbp->ptable);
10441
10442 FILECLOSE(fdbp->fd);
10443
10444 ASNIOCLOSE(fdbp->aip_def);
10445 FILECLOSE(fdbp->fd_def);
10446 FILECLOSE(fdbp->fd_ind);
10447 FILECLOSE(fdbp->fd_seq);
10448 FILECLOSE(fdbp->fd_sdi);
10449
10450 ASNIOCLOSE(fdbp->aip);
10451
10452 /* Do not Clear options structure */
10453
10454 MemFree (fdbp);
10455
10456 return 0;
10457 }
SeqEntrysToBLAST(SeqEntryPtr sep,FormatDBPtr fdbp,Boolean is_na,Uint1 group_segs)10458 NLM_EXTERN Boolean SeqEntrysToBLAST (SeqEntryPtr sep, FormatDBPtr fdbp,
10459 Boolean is_na, Uint1 group_segs)
10460 {
10461 FastaDat tfa;
10462 MyFsa mfa;
10463 Char buf[255];
10464
10465 if ((sep == NULL) || (fdbp == NULL))
10466 return FALSE;
10467
10468 MemSet ((Pointer) (&mfa), 0, sizeof (MyFsa));
10469 mfa.buf = buf;
10470 mfa.buflen = 254;
10471 mfa.seqlen = 70;
10472 mfa.mydata = (Pointer)fdbp;
10473 mfa.myfunc = BLASTFileFunc;
10474 mfa.bad_asn1 = FALSE;
10475 mfa.order = 0;
10476 mfa.accession = NULL;
10477 mfa.organism = NULL;
10478 mfa.do_virtual = FALSE;
10479 mfa.tech = 0;
10480 mfa.no_sequence = FALSE;
10481 mfa.formatdb = TRUE;
10482
10483 if (is_na)
10484 /* in case of "formatdb" we wont use this parameter */
10485 mfa.code = Seq_code_ncbi2na;
10486 else
10487 mfa.code = Seq_code_ncbistdaa;
10488
10489 tfa.mfp = &mfa;
10490 tfa.is_na = is_na;
10491 if (group_segs == 3) { /* do 2 things */
10492 mfa.do_virtual = TRUE;
10493 group_segs = 1;
10494 }
10495
10496 tfa.group_segs = group_segs;
10497 tfa.last_indent = -1;
10498 tfa.parts = -1;
10499 tfa.seg = -1;
10500 tfa.got_one = FALSE;
10501 SeqEntryExplore(sep, (Pointer)&tfa, SeqEntryFasta);
10502
10503 return tfa.got_one;
10504 }
10505
10506 /*****************************************************************************
10507 *
10508 * FastaFileFunc(key, buf, data)
10509 * standard "write to file" callback
10510 *
10511 *****************************************************************************/
BLASTFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)10512 Boolean BLASTFileFunc (BioseqPtr bsp, Int2 key, CharPtr buf, Uint4 buflen,
10513 Pointer data)
10514 {
10515 FormatDBPtr fdbp = (FormatDBPtr) data;
10516 Int4 SequenceLen;
10517 Uint4 i, total, index;
10518
10519 switch (key) {
10520 case FASTA_ID:
10521
10522 SequenceLen = bsp->length;
10523 fdbp->TotalLen += SequenceLen;
10524
10525 if (fdbp->MaxSeqLen < SequenceLen)
10526 fdbp->MaxSeqLen = SequenceLen;
10527
10528 if(fdbp->OffsetAllocated <= fdbp->num_of_seqs) {
10529 fdbp->OffsetAllocated += INDEX_ARRAY_CHUNKS;
10530
10531 fdbp->DefOffsetTable = (Int4Ptr)Realloc(fdbp->DefOffsetTable,
10532 fdbp->OffsetAllocated*sizeof(Uint4));
10533 fdbp->SeqOffsetTable = (Int4Ptr)Realloc(fdbp->SeqOffsetTable,
10534 fdbp->OffsetAllocated*sizeof(Uint4));
10535 if(!fdbp->options->is_protein) {
10536 fdbp->AmbOffsetTable = (Int4Ptr)Realloc(fdbp->AmbOffsetTable,
10537 fdbp->OffsetAllocated*sizeof(Uint4));
10538 }
10539 }
10540
10541 if(fdbp->aip_def != NULL) /* Structured deflines */
10542 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->aip_def->fp);
10543 else
10544 fdbp->DefOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_def);
10545
10546 fdbp->SeqOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq);
10547
10548 if (FileWrite(buf, buflen, 1, fdbp->fd_def) != (Uint4) 1)
10549 return FALSE;
10550 if (FileWrite(" ", 1, 1, fdbp->fd_def) != (Uint4) 1)
10551 return FALSE;
10552
10553 /* Now adding new entried into lookup hash table */
10554
10555 if((UpdateLookupInfo(buf, fdbp->lookup, fdbp->num_of_seqs,
10556 fdbp->fd_stmp, fdbp->options->parse_mode,
10557 fdbp->options->sparse_idx)) != LOOKUP_NO_ERROR) {
10558 return FALSE;
10559 }
10560
10561 break;
10562 case FASTA_DEFLINE:
10563 if (FileWrite(buf, buflen, 1, fdbp->fd_def) != (Uint4) 1)
10564 return FALSE;
10565 break;
10566 case FASTA_SEQLINE:
10567 if (FileWrite(buf, buflen, 1, fdbp->fd_seq) != (Uint4) 1)
10568 return FALSE;
10569 break;
10570 case FASTA_EOS: /* end of sequence */
10571 if(fdbp->options->is_protein) {
10572 i=0;
10573 if (FileWrite(&i, 1, 1, fdbp->fd_seq) != (Uint4) 1)
10574 return FALSE;
10575 } else {
10576 /* dump ambiguity characters. */
10577 fdbp->AmbOffsetTable[fdbp->num_of_seqs] = ftell(fdbp->fd_seq); /* Anyway... */
10578
10579 /* if AmbCharPtr is not NULL, then there was ambiguity. */
10580 if(fdbp->AmbCharPtr != NULL) {
10581 /* The first Uint4 holds the total number of ambig. bp. */
10582 total = (*(fdbp->AmbCharPtr))+1;
10583 for (index=0; index<total; index++) {
10584 if (!FormatDbUint4Write(fdbp->AmbCharPtr[index], fdbp->fd_seq))
10585 return FALSE;
10586 }
10587 MemFree(fdbp->AmbCharPtr);
10588 fdbp->AmbCharPtr = NULL;
10589 }
10590 }
10591 fdbp->num_of_seqs++;
10592 break;
10593 case FASTA_FORMATDB_AMB: {
10594 Int4 len;
10595 Char tmpbuff[1024];
10596 /* In case of "formatdb" nucleotides have to be compressed */
10597
10598 fdbp->AmbCharPtr = NULL;
10599
10600 if (bsp->seq_data_type == Seq_code_ncbi4na && bsp->seq_data != NULL){
10601
10602 /* ncbi4na require compression into ncbi2na */
10603
10604 if (fdbp->options->version > FORMATDB_VER_TEXT)
10605 {
10606 if((bsp->seq_data = (SeqDataPtr) BSCompressDNANew((ByteStorePtr) bsp->seq_data, bsp->length,
10607 &(fdbp->AmbCharPtr))) == NULL) {
10608 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10609 "Formating failed.\n");
10610 return FALSE;
10611 }
10612 }
10613 else
10614 {
10615 if((bsp->seq_data = (SeqDataPtr) BSCompressDNA((ByteStorePtr) bsp->seq_data, bsp->length,
10616 &(fdbp->AmbCharPtr))) == NULL) {
10617 ErrLogPrintf("Error converting ncbi4na to ncbi2na. "
10618 "Formating failed.\n");
10619 return FALSE;
10620 }
10621 }
10622 bsp->seq_data_type = Seq_code_ncbi2na; /* just for information */
10623 } else {
10624 /* if sequence already in ncbi2na format we have to update last byte */
10625 Uint1 ch, remainder;
10626
10627 if((remainder = (bsp->length%4)) == 0) {
10628 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4+1, SEEK_SET);
10629 BSPutByte((ByteStorePtr) bsp->seq_data, NULLB);
10630 } else {
10631 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10632 ch = remainder + BSGetByte((ByteStorePtr) bsp->seq_data);
10633 BSSeek((ByteStorePtr) bsp->seq_data, bsp->length/4, SEEK_SET);
10634 BSPutByte((ByteStorePtr) bsp->seq_data, ch);
10635 }
10636 }
10637 /* Now dumping sequence */
10638
10639 BSSeek((ByteStorePtr) bsp->seq_data, 0, SEEK_SET);
10640 while((len = BSRead((ByteStorePtr) bsp->seq_data, tmpbuff, sizeof(tmpbuff))) != 0) {
10641 BLASTFileFunc(bsp, FASTA_SEQLINE, tmpbuff, len, data);
10642 }
10643
10644 BLASTFileFunc(bsp, FASTA_EOS, NULL, 0, data);
10645 }
10646
10647 break;
10648 default:
10649 break;
10650 }
10651 return TRUE;
10652 }
10653
/* ----------------- Processing ASN.1 with formatdb ----------------- */

/* Working state shared between FDBAddSeqEntry(), FDBSeqEntry_callback() and
   FDB_FastaFileFunc() while the sequences of one SeqEntry are added to a
   formatdb database. Created by FDB_SEDataInfoNew() and released by
   FDB_SEDataInfoFree(). */
typedef struct _FDB_SEDataInfo {
    CharPtr seqid;              /* NUL-terminated copy of the text received
                                   with FASTA_ID */
    ByteStorePtr bsp;           /* collects the data received with
                                   FASTA_SEQLINE; emptied after FASTA_EOS */
    Int4 length;                /* number of bytes written to bsp for the
                                   sequence being collected */
    Uint1 seq_data_type;        /* set from tfp->mfp->code before a sequence
                                   is collected; passed by address to
                                   FDBAddSequence() */
    CharPtr defline;            /* NUL-terminated copy of the text received
                                   with FASTA_DEFLINE */
    FastaDat PNTR tfp;          /* FASTA state handed to SeqEntryFasta() */
    FormatDBPtr fdbp;           /* database the sequences are added to */
} FDB_SEDataInfo, PNTR FDB_SEDataInfoPtr;
10665
FDB_FastaFileFunc(BioseqPtr bsp,Int2 key,CharPtr buf,Uint4 buflen,Pointer data)10666 static Boolean FDB_FastaFileFunc(BioseqPtr bsp, Int2 key, CharPtr buf,
10667 Uint4 buflen, Pointer data)
10668 {
10669 FDB_SEDataInfoPtr fsedip;
10670
10671 if((fsedip = data) == NULL)
10672 return TRUE;
10673
10674 switch (key) {
10675 case FASTA_DEFLINE:
10676 MemCpy(fsedip->defline, buf, buflen);
10677 fsedip->defline[buflen] = NULLB;
10678 break;
10679 case FASTA_SEQLINE:
10680 BSWrite(fsedip->bsp, buf, buflen);
10681 fsedip->length += buflen;
10682 break;
10683 case FASTA_ID:
10684 MemCpy(fsedip->seqid, buf, buflen);
10685 fsedip->seqid[buflen] = NULLB;
10686 break;
10687 case FASTA_EOS: /* end of sequence */
10688 /* Here we should add new entry to FD database and reset
10689 all spaces */
10690
10691 FDBAddSequence(fsedip->fdbp, NULL, &fsedip->seq_data_type,
10692 &fsedip->bsp, fsedip->length,
10693 fsedip->seqid, fsedip->defline,
10694 0, 0, 0, 0, 0);
10695
10696 BSSeek(fsedip->bsp, 0, SEEK_SET);
10697 BSDelete(fsedip->bsp, BSLen(fsedip->bsp));
10698 fsedip->length = 0;
10699
10700 break;
10701 }
10702
10703 return TRUE;
10704 }
10705
FDB_SEDataInfoNew(void)10706 FDB_SEDataInfoPtr FDB_SEDataInfoNew(void)
10707 {
10708 FDB_SEDataInfoPtr fsedip;
10709 MyFsa PNTR mfp;
10710
10711 fsedip = MemNew(sizeof(FDB_SEDataInfo));
10712
10713 fsedip->tfp = MemNew(sizeof (FastaDat));
10714 mfp = MemNew(sizeof (MyFsa));
10715 fsedip->tfp->mfp = mfp;
10716
10717 mfp->buf = MemNew(255);
10718 mfp->buflen = 254;
10719 mfp->seqlen = 254;
10720 mfp->myfunc = FDB_FastaFileFunc;
10721 mfp->bad_asn1 = FALSE;
10722 mfp->order = 0;
10723 mfp->accession = NULL;
10724 mfp->organism = NULL;
10725 mfp->do_virtual = TRUE;
10726 mfp->tech = 0;
10727 mfp->no_sequence = FALSE;
10728 mfp->formatdb = FALSE;
10729 mfp->mydata = fsedip; /* ... */
10730
10731 fsedip->tfp->group_segs = 1; /*** to trigger delta's and maps ***/
10732 fsedip->tfp->last_indent = -1;
10733 fsedip->tfp->parts = -1;
10734 fsedip->tfp->seg = -1;
10735 fsedip->tfp->got_one = FALSE;
10736
10737
10738 if(fsedip->seqid == NULL)
10739 fsedip->seqid = MemNew(fsedip->tfp->mfp->buflen+1);
10740
10741 fsedip->bsp = BSNew(2048);
10742
10743 if(fsedip->defline == NULL){
10744 fsedip->defline = MemNew(fsedip->tfp->mfp->buflen+1);
10745 }
10746 return fsedip;
10747 }
10748
/* Releases an FDB_SEDataInfo together with everything it owns: the MyFsa
 * line buffer, the MyFsa and FastaDat structures, the byte store and the
 * defline/seqid buffers.
 *
 * Parameters:
 *      fsedip - structure to release; NULL is accepted and ignored, as is a
 *               structure whose tfp or tfp->mfp is still NULL
 */
void FDB_SEDataInfoFree(FDB_SEDataInfoPtr fsedip)
{
    if (fsedip == NULL)
        return;

    if (fsedip->tfp != NULL) {
        if (fsedip->tfp->mfp != NULL) {
            MemFree(fsedip->tfp->mfp->buf);
            MemFree(fsedip->tfp->mfp);
        }
        MemFree(fsedip->tfp);
    }
    BSFree(fsedip->bsp);
    MemFree(fsedip->defline);
    MemFree(fsedip->seqid);

    MemFree(fsedip);
}
10760
/* SeqEntryExplore() callback used by FDBAddSeqEntry(). "data" is the
 * FDB_SEDataInfo describing the target database.
 *
 *  - entries that are not Bioseqs are simply forwarded to SeqEntryFasta
 *  - Bioseqs of the wrong molecule kind, and segmented or virtual Bioseqs,
 *    are forwarded with no_sequence set, i.e. they are not indexed
 *  - raw and const Bioseqs: SeqEntryFasta collects seqid and defline only,
 *    then the Bioseq's own data is passed to FDBAddSequence()
 *  - everything else (for example delta sequences): SeqEntryFasta is run
 *    with sequence output enabled, so the data is collected and added
 *    through the FASTA callback
 */
static void FDBSeqEntry_callback (SeqEntryPtr sep, Pointer data,
                                  Int4 index, Int2 indent)
{
    FDB_SEDataInfoPtr info = (FDB_SEDataInfoPtr) data;
    BioseqPtr bioseq = NULL;
    Boolean nucleotide;

    if (info == NULL)
        return;

    if (IS_Bioseq(sep)) {
        bioseq = sep->data.ptrvalue;
        nucleotide = ISA_na(bioseq->mol);

        if (info->fdbp->options->is_protein != !nucleotide ||
            bioseq->repr == Seq_repr_seg ||
            bioseq->repr == Seq_repr_virtual) {
            /* Only one kind of sequence is formatted, and segmented or
               virtual sequences are not indexed */
            info->tfp->mfp->no_sequence = TRUE;
        } else {
            info->tfp->last_indent = -1;

            if (bioseq->repr == Seq_repr_raw ||
                bioseq->repr == Seq_repr_const) {
                /* This pass collects defline and seqid only */
                info->tfp->mfp->no_sequence = TRUE;
                SeqEntryFasta(sep, info->tfp, index, indent);

                FDBAddSequence(info->fdbp, NULL, &bioseq->seq_data_type,
                               (ByteStorePtr PNTR) &bioseq->seq_data,
                               bioseq->length,
                               info->seqid, info->defline, 0, 0, 0, 0, 0);
                return;
            }

            /* This will work for example for delta seqs */
            info->seq_data_type = info->tfp->mfp->code;
            info->length = 0;
            info->tfp->mfp->no_sequence = FALSE;
            BSSeek(info->bsp, 0, SEEK_SET);
        }
    }

    SeqEntryFasta(sep, info->tfp, index, indent);
}
10823
FDBAddSeqEntry(FormatDBPtr fdbp,SeqEntryPtr sep)10824 Boolean FDBAddSeqEntry(FormatDBPtr fdbp, SeqEntryPtr sep)
10825 {
10826 FDB_SEDataInfoPtr fsedip;
10827
10828 fsedip = FDB_SEDataInfoNew();
10829 fsedip->fdbp = fdbp;
10830
10831 fsedip->tfp->is_na = !fsedip->fdbp->options->is_protein;
10832
10833 if (fsedip->tfp->is_na){
10834 fsedip->tfp->mfp->code = Seq_code_iupacna;
10835 } else {
10836 fsedip->tfp->mfp->code = Seq_code_ncbistdaa;
10837 }
10838
10839 SeqEntryExplore(sep, fsedip, FDBSeqEntry_callback);
10840
10841 FDB_SEDataInfoFree(fsedip);
10842
10843 return TRUE;
10844 }
10845
10846
10847 /* ---------------------------------------------------------------------*/
/* ------------- End of functions that are used in formatdb ----------- */
10849 /* ---------------------------------------------------------------------*/
10850
10851
10852 /* ---------------------------------------------------------------------*/
10853 /* ------- Functions used to initialize and access blast taxonomy DB -- */
10854 /* ---------------------------------------------------------------------*/
10855
10856
RDBTaxInfoInit()10857 RDBTaxInfoPtr RDBTaxInfoInit()
10858 {
10859 RDBTaxInfoPtr tip;
10860 Char buffer [1024], *filebuf = NULL;
10861 Uint4 value;
10862 Int4 i;
10863
10864 tip = MemNew(sizeof(RDBTaxInfo));
10865
10866 /* We do not suppose, that this database exists, but if it is
10867 exists we will intitialize it properly. So first message is just
10868 INFO, that database does not exists, but then - message will be
10869 ERROR if database is invalid */
10870
10871 sprintf(buffer, "%s.bti", BLAST_TAXDB_FILENAME);
10872 filebuf = FindBlastDBFile(buffer);
10873 if((tip->taxfp = NlmOpenMFILE(filebuf)) == NULL) {
10874 ErrPostEx(SEV_INFO, 0, 0, "RDBTaxInfoInit: Unable to open %s", filebuf);
10875 MemFree(filebuf);
10876 MemFree(tip);
10877 return NULL;
10878 }
10879
10880 filebuf[StringLen(filebuf)-1] = 'd';
10881 if((tip->name_fd = NlmOpenMFILE(filebuf)) == NULL) {
10882 ErrPostEx(SEV_ERROR, 0,0, "RDBTaxInfoInit: Unable to open %s", filebuf);
10883 NlmCloseMFILE(tip->taxfp);
10884 MemFree(filebuf);
10885 MemFree(tip);
10886 return NULL;
10887 }
10888 filebuf = MemFree(filebuf);
10889
10890 /* Last check-up of the database validity */
10891 NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10892 if (Nlm_SwapUint4(value) != TAX_DB_MAGIC_NUMBER) {
10893 ErrPostEx(SEV_ERROR, 0, 0, "RDBTaxInfoInit: Invalid database",
10894 buffer);
10895 NlmCloseMFILE(tip->taxfp);
10896 NlmCloseMFILE(tip->name_fd);
10897 MemFree(tip);
10898 return NULL;
10899 }
10900
10901 NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10902 tip->all_taxid_count = Nlm_SwapUint4(value);
10903
10904 for(i = 0; i < 4; i++) {
10905 NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10906 tip->reserved[i] = Nlm_SwapUint4(value);
10907 }
10908
10909 /* Load the taxid/file offsets from the remaining of the index file */
10910 if (tip->taxfp->mfile_true) {
10911 tip->taxdata = (RDBTaxIdPtr) tip->taxfp->mmp;
10912 tip->taxdata_alloc = FALSE;
10913 } else {
10914
10915 tip->taxdata = (RDBTaxIdPtr) MemNew(sizeof(RDBTaxId) *
10916 tip->all_taxid_count);
10917 if ((tip->taxdata) == NULL) {
10918 ErrPostEx(SEV_ERROR, 0, 0, "RDBTaxInfoInit: Not enough memory to "
10919 "load index table");
10920 NlmCloseMFILE(tip->taxfp);
10921 NlmCloseMFILE(tip->name_fd);
10922 MemFree(tip);
10923 return NULL;
10924 }
10925
10926 for (i = 0; i < tip->all_taxid_count; i++) {
10927 NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10928 tip->taxdata[i].taxid = Nlm_SwapUint4(value);
10929 NlmReadMFILE((Uint1Ptr) &value, 4, 1, tip->taxfp);
10930 tip->taxdata[i].offset = Nlm_SwapUint4(value);
10931 }
10932
10933 tip->taxfp = NlmCloseMFILE(tip->taxfp);
10934 tip->taxdata_alloc = TRUE;
10935 }
10936
10937 /* Only this thread will clean up this structure */
10938 tip->taxinfo_alloc = TRUE;
10939
10940 return tip;
10941 }
10942
10943 /* Free memory, unmap files etc. related to the taxonomy database */
RDBTaxInfoClose(RDBTaxInfoPtr tip)10944 void RDBTaxInfoClose(RDBTaxInfoPtr tip)
10945 {
10946 if(tip == NULL)
10947 return;
10948
10949 if (tip->taxinfo_alloc) {
10950 if (tip->taxdata_alloc)
10951 MemFree(tip->taxdata);
10952 else
10953 NlmCloseMFILE(tip->taxfp);
10954
10955 NlmCloseMFILE(tip->name_fd);
10956 MemFree(tip);
10957 } else {
10958 tip->taxfp = NlmCloseMFILE(tip->taxfp);
10959 tip->name_fd = NlmCloseMFILE(tip->name_fd);
10960 tip = MemFree(tip);
10961 }
10962
10963 return;
10964 }
10965
10966 /* Main function to get taxonomy names for given tax_id from
10967 blast taxonomy database. Returns NULL if tax_id is not in the database */
RDBGetTaxNames(RDBTaxInfoPtr tip,Int4 tax_id)10968 RDBTaxNamesPtr RDBGetTaxNames(RDBTaxInfoPtr tip, Int4 tax_id)
10969 {
10970 RDBTaxNamesPtr tnames;
10971 Int4 low_taxid, high_taxid;
10972 RDBTaxIdPtr taxdata;
10973 Int4 low_index, high_index, new_index, old_index, curr_taxid;
10974
10975 if(tip == NULL)
10976 return NULL;
10977
10978 taxdata = tip->taxdata;
10979
10980 low_index = 0;
10981 high_index = tip->all_taxid_count-1;
10982
10983 low_taxid = Nlm_SwapUint4(taxdata[low_index].taxid);
10984 high_taxid = Nlm_SwapUint4(taxdata[high_index].taxid);
10985
10986 if(tax_id < low_taxid || tax_id > high_taxid)
10987 return NULL;
10988
10989 new_index = (low_index+high_index)/2;
10990 old_index = new_index;
10991
10992 while(TRUE) {
10993
10994 curr_taxid = Nlm_SwapUint4(taxdata[new_index].taxid);
10995
10996 if (tax_id < curr_taxid) {
10997 high_index = new_index;
10998 } else if (tax_id > curr_taxid){
10999 low_index = new_index;
11000 } else { /* Got it ! */
11001 break;
11002 }
11003
11004 new_index = (low_index+high_index)/2;
11005 if (new_index == old_index) {
11006 if (tax_id > curr_taxid) {
11007 new_index++;
11008 }
11009 break;
11010 }
11011 old_index = new_index;
11012 }
11013
11014 if(tax_id == Nlm_SwapUint4(taxdata[new_index].taxid)) {
11015 Char buffer[1024];
11016 CharPtr chptr = NULL, start_ptr = NULL;
11017
11018 tnames = MemNew(sizeof(RDBTaxNames));
11019 tnames->tax_id = tax_id;
11020
11021 NlmSeekInMFILE(tip->name_fd, Nlm_SwapUint4(taxdata[new_index].offset),
11022 SEEK_SET);
11023
11024 NlmReadMFILE((Uint1Ptr)buffer,
11025 Nlm_SwapUint4(taxdata[new_index+1].offset) -
11026 Nlm_SwapUint4(taxdata[new_index].offset)+1, 1, tip->name_fd);
11027
11028 start_ptr = buffer;
11029
11030 /* Scientific name */
11031
11032 if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11033 RDBTaxNamesFree(tnames);
11034 return NULL;
11035 }
11036
11037 *chptr = NULLB;
11038 chptr++;
11039 tnames->sci_name = StringSave(start_ptr);
11040 start_ptr = chptr;
11041
11042 /* Common name */
11043
11044 if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11045 RDBTaxNamesFree(tnames);
11046 return NULL;
11047 }
11048
11049 *chptr = NULLB;
11050 chptr++;
11051 tnames->common_name = StringSave(start_ptr);
11052 start_ptr = chptr;
11053
11054 /* Blast name */
11055
11056 if((chptr = StringChr(start_ptr, '\t')) == NULL) {
11057 RDBTaxNamesFree(tnames);
11058 return NULL;
11059 }
11060
11061 *chptr = NULLB;
11062 chptr++;
11063 tnames->blast_name = StringSave(start_ptr);
11064 start_ptr = chptr;
11065
11066 /* Super - kingdom */
11067
11068 tnames->s_king[0] = *start_ptr;
11069
11070 /* fscanf(tip->name_fd, "%s\t%s\t%s\t%s",
11071 name1, name2, name3, tnames->s_king);
11072 tnames->sci_name = StringSave(name1);
11073 tnames->common_name = StringSave(name2);
11074 tnames->blast_name = StringSave(name3); */
11075
11076 return tnames;
11077 }
11078
11079 return NULL;
11080 }
11081
readdb_get_taxnames(ReadDBFILEPtr rdfp,Int4 tax_id)11082 RDBTaxNamesPtr LIBCALL readdb_get_taxnames(ReadDBFILEPtr rdfp, Int4 tax_id)
11083 {
11084 RDBTaxInfoPtr tip;
11085 RDBTaxNamesPtr tnames = NULL;
11086
11087 if((tip = rdfp->taxinfo) != NULL) {
11088 tnames = RDBGetTaxNames(tip, tax_id);
11089 }
11090
11091 return tnames;
11092 }
11093
11094 /************************************************************************/
11095 /* The CommonIndex stuff */
11096 /************************************************************************/
11097
11098 /* The function initializes CommonIndexPtr with give filename */
11099
CommonIndexInit(CharPtr indexfilename)11100 CommonIndexHeadPtr CommonIndexInit(CharPtr indexfilename)
11101 {
11102
11103 Nlm_MemMapPtr mmpindx;
11104 CommonIndexHeadPtr cihp = (CommonIndexHeadPtr) MemNew(sizeof(CommonIndexHead));
11105 CharPtr charptr = NULL;
11106
11107 if (!(mmpindx = Nlm_MemMapInit(indexfilename))) {
11108 ErrPostEx(SEV_ERROR, 0, 0, "Could not open Common Index file. Probably wrong path specified\n");
11109 CommonIndexDestruct(cihp); /* unable to find or parse config file. */
11110 return NULL;
11111 }
11112
11113 cihp->maxgi = FileLength(indexfilename) / sizeof(CommonIndex);
11114 cihp->memmap = mmpindx;
11115 cihp->ci = (CommonIndexPtr) mmpindx->mmp_begin;
11116
11117 /* read list of databases from the configuration file */
11118
11119 charptr = Nlm_FilePathFind(indexfilename);
11120 if (!(cihp->num_of_DBs = ParseDBConfigFile(&(cihp->dbids), charptr))) {
11121 if (charptr)
11122 MemFree(charptr);
11123 CommonIndexDestruct(cihp); /* unable to find or parse config file. */
11124 return NULL;
11125 }
11126 if (charptr)
11127 MemFree(charptr);
11128
11129 if (!(cihp->ci)) {
11130 return NULL;
11131 } else
11132 return cihp;
11133 }
11134
/* Release a CommonIndexHead: unmap the index file, free every database
   name and the database table, then free the head itself.
   A NULL head is ignored. */
void CommonIndexDestruct(CommonIndexHeadPtr cihp)
{
    Int2 i;

    /* Must be checked before num_of_DBs is read below. */
    if (cihp == NULL)
        return;

    if (cihp->memmap)
        Nlm_MemMapFini(cihp->memmap);

    if (cihp->dbids) {
        for (i = 0; i < cihp->num_of_DBs; i++) {
            if ((cihp->dbids + i)->name)
                MemFree((cihp->dbids + i)->name);
        }
        MemFree(cihp->dbids);
    }

    MemFree(cihp);
}
11151 /* returns shift of bit for specified DB name */
11152
DBShift(Int2 num_of_DBs,DataBaseIDPtr dbids,CharPtr dbname,Boolean is_prot)11153 Int2 DBShift(Int2 num_of_DBs, DataBaseIDPtr dbids, CharPtr dbname, Boolean is_prot)
11154 {
11155 Int2 i;
11156
11157 if (!dbname) {
11158 ErrPostEx(SEV_ERROR, 0, 0, "Specified database name is NULL\n");
11159 return 0;
11160 }
11161
11162 for(i=0; i < num_of_DBs; i++) {
11163 if(!StrCmp(dbname, (dbids+i)->name) && ((dbids+i)->isprot == is_prot)) {
11164 return (dbids+i)->id;
11165 }
11166 }
11167
11168 return 0;
11169 }
11170
11171 /* returns name of the database by given bit shift */
11172
DBName(Int2 num_of_DBs,DataBaseIDPtr dbids,Int2 shift)11173 CharPtr DBName(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift)
11174 {
11175 Int2 i;
11176
11177 if (!shift) {
11178 ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift is zero\n");
11179 return NULL;
11180 }
11181
11182 for(i=0; i < num_of_DBs; i++) {
11183 if((dbids+i)->id == shift) {
11184 return (dbids+i)->name;
11185 }
11186 }
11187 ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift %d is not known\n", shift);
11188 return NULL;
11189 }
11190
11191 /* say if the database contains proteins */
11192
DBisProt(Int2 num_of_DBs,DataBaseIDPtr dbids,Int2 shift)11193 Boolean DBisProt(Int2 num_of_DBs, DataBaseIDPtr dbids, Int2 shift)
11194 {
11195 Int2 i;
11196
11197 if (!shift) {
11198 ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift is zero\n");
11199 return FALSE;
11200 }
11201
11202 for(i=0; i < num_of_DBs; i++) {
11203 if((dbids+i)->id == shift) {
11204 return (dbids+i)->isprot;
11205 }
11206 }
11207 ErrPostEx(SEV_ERROR, 0, 0, "Specified bit shift %d is not known\n", shift);
11208 return FALSE;
11209 }
11210
/* Free every node of a CommonIndexResult list.  NULL is ignored.
   The list is walked iteratively so that stack use does not grow with
   the length of the list. */
void CommonIndexResultDestruct(CommonIndexResultPtr cir)
{
    CommonIndexResultPtr next;

    while (cir != NULL) {
        next = cir->next;
        MemFree(cir);
        cir = next;
    }
}
11220
11221 /* returns OID by given GI */
GI2OID(CommonIndexHeadPtr cih,Int4 gi,Int4 dbmask,Int4 alias_dbmask,Int2Ptr dbid,Int2Ptr alias_dbid,ReadDBFILEPtr rdfp)11222 Int4 GI2OID(CommonIndexHeadPtr cih, Int4 gi, Int4 dbmask, Int4 alias_dbmask,
11223 Int2Ptr dbid, Int2Ptr alias_dbid, ReadDBFILEPtr rdfp)
11224 {
11225 CommonIndexResultPtr cir, cir_start;
11226 Int4 retval=-1;
11227 Uint4 dbmask_tmp;
11228
11229 /* gi is not in the database (or even in the common index).
11230 The most probable reason for this is that the gi was released
11231 after the database was built. Return -1 to indicate that it
11232 is not in the database. */
11233 if (gi < 0 || gi >= cih->maxgi) {
11234 return -1;
11235 }
11236
11237 cir_start = GIs2OIDs(cih, &gi, 1, dbmask | alias_dbmask, rdfp);
11238
11239
11240 /* Get oid in a real database */
11241 cir = cir_start;
11242 while (cir && (retval==-1)) {
11243 if (dbmask & (0x1<<cir->dbid)) {
11244 *dbid = cir->dbid;
11245 retval = cir->oid;
11246 }
11247 cir = cir->next;
11248 }
11249
11250
11251 /* now set dbid to correct alias database, if alias database mask specified */
11252 dbmask_tmp = SwapUint4(cih->ci[gi].dbmask);
11253 if (dbmask_tmp - dbmask > 0 && alias_dbid)
11254 *alias_dbid = bit_engine_firstbit(dbmask_tmp & alias_dbmask);
11255
11256 CommonIndexResultDestruct(cir_start);
11257
11258 return retval;
11259 }
11260
11261 /*
11262 gets list of GI's and returns all OID for each database from the mask
11263 the GI belongs to. dbmask == 0 means all databases.
11264 The list of OID is constructed as list of the CommonIndexResult items
11265 (see readdb.h for definition)
11266 noids - number of found oid on return
11267 */
11268
GIs2OIDs(CommonIndexHeadPtr cih,Int4Ptr gis,Int4 number_of_gis,Int4 dbmask,ReadDBFILEPtr startrdfp)11269 CommonIndexResultPtr GIs2OIDs(CommonIndexHeadPtr cih, Int4Ptr gis,
11270 Int4 number_of_gis, Int4 dbmask, ReadDBFILEPtr startrdfp)
11271 {
11272 Int4 i, gi, numDB, mask;
11273 Int2 firstpos, curfirstpos;
11274 CommonIndexPtr cigi;
11275 CommonIndexResultPtr cir = NULL, cirfirst = NULL;
11276 Boolean first = TRUE;
11277 ISAMObjectPtr nisam_opt = NULL;
11278 ISAMErrorCode error;
11279 Uint4 value;
11280 ReadDBFILEPtr rdfp;
11281
11282 /* for each given GI we need to check if this gi is in list */
11283
11284 for(i=0; i < number_of_gis; i++) {
11285 gi = gis[i];
11286 if (gi < 0)
11287 continue;
11288
11289 cigi = cih->ci + gi;
11290
11291 /* mask says what DBs the GI belongs to */
11292 mask = SwapUint4(cigi->dbmask);
11293
11294 if (dbmask && !(dbmask & mask)) {
11295 /* skip the gi if it is not in dbmask databases */
11296 continue;
11297 }
11298
11299 numDB = bit_engine_numofbits(mask);
11300
11301 if (numDB) {
11302 /* Okay, there is at least one database which contains such GI */
11303
11304 /* Check if this is the "often" database for the GI */
11305 firstpos = bit_engine_firstbit(mask);
11306
11307 /* dbmask == 0 means that we search for ALL DBs */
11308 if (!dbmask || (dbmask & (0x1 << firstpos))) {
11309 if (first) {
11310 /* create first if needed */
11311 cirfirst = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11312 first = FALSE;
11313 cir = cirfirst;
11314 } else {
11315 cir->next = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11316 cir = cir->next;
11317 }
11318 cir->gi = gi;
11319
11320 /* we know that for the first database the often field is used */
11321 cir->oid = SwapUint4(cigi->oftenOID);
11322
11323 cir->dbid = firstpos;
11324 cir->next = NULL;
11325 }
11326 curfirstpos = firstpos;
11327
11328 /* do for the rest of databases */
11329 while (--numDB) {
11330 /* shift mask to get next database bit shift */
11331 mask >>= (curfirstpos + 1);
11332 curfirstpos = bit_engine_firstbit(mask);
11333 /* update absolute bit shift */
11334 firstpos += curfirstpos + 1;
11335
11336 if (!dbmask || (dbmask & (0x1 << firstpos))) {
11337
11338 /* find OID using ISAM old index */
11339
11340 rdfp = startrdfp;
11341 while (rdfp) {
11342 if (rdfp->filebit == firstpos) {
11343 nisam_opt = rdfp->nisam_opt;
11344 break;
11345 }
11346 rdfp = rdfp->next;
11347 }
11348
11349 if (!nisam_opt) {
11350 /* that means that the database specified by 'firstpos' is mask */
11351 /* skip the database */
11352 continue;
11353 }
11354 if (first) {
11355 cirfirst = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11356 first = FALSE;
11357 cir = cirfirst;
11358 } else {
11359 cir->next = (CommonIndexResultPtr) MemNew(sizeof(CommonIndexResult));
11360 cir = cir->next;
11361 }
11362
11363 cir->gi = gi;
11364
11365 /* Initialize and perform the ISAM search */
11366 if((error = NISAMSearch(nisam_opt, gi, &value, NULL)) < 0) {
11367 ErrPostEx(SEV_ERROR, 0, 0, "Failed to initialize ISAM search");
11368 return NULL;
11369 }
11370
11371 if(error == ISAMNotFound) {
11372 ErrPostEx(SEV_ERROR, 0, 0, "Internal error inside GIs2OIDs(), we expected to find this GI into the database\n");
11373 }
11374
11375 cir->oid = (Int4) value;
11376
11377 cir->dbid = firstpos;
11378 cir->next = NULL;
11379 }
11380 }
11381 }
11382 }
11383 /* return first item of the list */
11384 return cirfirst;
11385 }
11386
11387
FindDBbyGI(CommonIndexHeadPtr cih,Int4 gi,Uint1 * is_prot)11388 CharPtr FindDBbyGI(CommonIndexHeadPtr cih, Int4 gi, Uint1 *is_prot)
11389 {
11390 Int4 numDB, mask;
11391 Int2 firstpos;
11392 CommonIndexPtr cigi;
11393
11394 if (gi > cih->maxgi)
11395 return NULL;
11396
11397 cigi = cih->ci + gi;
11398 mask = SwapUint4(cigi->dbmask);
11399
11400 numDB = bit_engine_numofbits(mask);
11401
11402 if (numDB) {
11403 firstpos = bit_engine_firstbit(mask);
11404 *is_prot = DBisProt(cih->num_of_DBs, cih->dbids, firstpos);
11405 return DBName(cih->num_of_DBs, cih->dbids, firstpos);
11406 } else {
11407 return NULL;
11408 }
11409
11410 }
11411
/* Return the zero-based position of the lowest-order bit that is set in
   word, or -1 if no bit is set.  The scan is done on an unsigned copy so
   that no signed value is ever shifted into the sign bit. */
Int2 bit_engine_firstbit (Int4 word)
{
    Uint4 bits = (Uint4) word;
    Int2  pos;

    for (pos = 0; bits != 0; pos++, bits >>= 1) {
        if (bits & 0x1)
            return pos;
    }
    return -1;
}
11425
/* Return the number of bits that are set in word.  The count is taken on
   an unsigned copy so that no signed value is shifted into or past the
   sign bit. */
Int2 bit_engine_numofbits(Int4 word)
{
    Uint4 bits = (Uint4) word;
    Int2  count = 0;

    while (bits != 0) {
        if (bits & 0x1)
            count++;
        bits >>= 1;
    }
    return count;
}
11444 /* returns:
11445 1. list of dbid shifts
11446 2. number of dbs
11447 */
11448
bit_engine_arr(Int4 word)11449 Int2Ptr bit_engine_arr(Int4 word)
11450 {
11451 Int2 i;
11452 Int4 tmpbit = 0x1;
11453 Int2Ptr retval;
11454 Int2 count = 0;
11455
11456 retval = (Int2Ptr) MemNew(sizeof(Int2)*8*sizeof(Int4));
11457
11458 if (!word) {
11459 retval[0] = 0;
11460 return retval;
11461 }
11462
11463 for (i=0; i < 8*sizeof(Int4); i++, tmpbit <<= 1) {
11464 if (word & tmpbit) {
11465 retval[count+1] = i;
11466 count++;
11467 }
11468 }
11469 retval[0] = count;
11470
11471 return retval;
11472 }
11473
11474 /************************************************************************/
11475 /* END The CommonIndex stuff */
11476 /************************************************************************/
11477
11478 /************************************************************************/
11479 /* The functions used with ID1 dump stuff */
11480 /************************************************************************/
11481
11482 /* This function iterates through the array of function poiters, invoking each
11483 * one and returning the logical AND of each of these function's return
11484 * values. */
DB_Subset(GMSubsetDataPtr gmsdp,DI_Record direc)11485 Boolean DB_Subset (GMSubsetDataPtr gmsdp, DI_Record direc)
11486 {
11487 Boolean retval;
11488 Int4 i;
11489
11490 retval = (*gmsdp->criteria[0])((void *)&direc);
11491 for (i = 1; i < gmsdp->count; i++) {
11492 retval = (retval && (*gmsdp->criteria[i])((void *)&direc));
11493 }
11494
11495 return retval;
11496 }
11497
is_EST_HUMAN(VoidPtr direc)11498 Boolean is_EST_HUMAN (VoidPtr direc)
11499 {
11500 return (((DI_RecordPtr)direc)->taxid == 9606);
11501 }
is_EST_MOUSE(VoidPtr direc)11502 Boolean is_EST_MOUSE (VoidPtr direc)
11503 {
11504 return (((DI_RecordPtr)direc)->taxid == 10090 ||
11505 ((DI_RecordPtr)direc)->taxid == 10091 ||
11506 ((DI_RecordPtr)direc)->taxid == 10092 ||
11507 ((DI_RecordPtr)direc)->taxid == 35531 ||
11508 ((DI_RecordPtr)direc)->taxid == 80274 ||
11509 ((DI_RecordPtr)direc)->taxid == 57486);
11510 }
is_EST_OTHERS(VoidPtr direc)11511 Boolean is_EST_OTHERS (VoidPtr direc)
11512 {
11513 return (!is_EST_HUMAN(direc) && !is_EST_MOUSE(direc));
11514 }
11515
is_SWISSPROT(VoidPtr direc)11516 Boolean is_SWISSPROT (VoidPtr direc)
11517 {
11518 return (((DI_RecordPtr)direc)->owner == 6);
11519 }
11520
is_MONTH(VoidPtr direc)11521 Boolean is_MONTH (VoidPtr direc)
11522 {
11523 return (((DI_RecordPtr)direc)->gi_threshold != -1 &&
11524 ((DI_RecordPtr)direc)->gi > ((DI_RecordPtr)direc)->gi_threshold);
11525 }
11526
is_PDB(VoidPtr direc)11527 Boolean is_PDB (VoidPtr direc)
11528 {
11529 return (((DI_RecordPtr)direc)->owner == 10);
11530 }
11531
11532 /* Criteria for determining whether a sequence is refseq:
11533 First 2 characters of the accession are letters, 3rd character is an '_',
11534 and it must be at least kMinAccessionLength characters long.
11535 Updated per suggestion from Misha Kimelman (via email)
11536 */
is_REFSEQ(VoidPtr direc)11537 Boolean is_REFSEQ(VoidPtr direc)
11538 {
11539 const int kMinAccessionLength = 9;
11540 const char* accession = ((DI_RecordPtr)direc)->acc;
11541
11542 if ((StringLen(accession) >= kMinAccessionLength) &&
11543 IS_ALPHA(accession[0]) &&
11544 IS_ALPHA(accession[1]) &&
11545 (accession[2] == '_')) {
11546 return TRUE;
11547 } else {
11548 return FALSE;
11549 }
11550 }
11551
is_REFSEQ_GENOMIC(VoidPtr ptr)11552 Boolean is_REFSEQ_GENOMIC(VoidPtr ptr)
11553 {
11554 return (is_REFSEQ(ptr) && !is_REFSEQ_RNA(ptr));
11555 }
11556
11557 /* Criteria for determining whether a sequence belongs in the refseq_rna
11558 database. This is a subset of the sequences identified by is_REFSEQ with the
11559 additional constraint that the molecule type must be RNA
11560 */
is_REFSEQ_RNA(VoidPtr ptr)11561 Boolean is_REFSEQ_RNA(VoidPtr ptr)
11562 {
11563 DI_RecordPtr direc = (DI_RecordPtr)ptr;
11564 if (!is_REFSEQ(direc)) {
11565 return FALSE;
11566 }
11567
11568 return (direc->mol == Seq_mol_rna);
11569 }
11570
is_CONTIG(VoidPtr direc)11571 Boolean is_CONTIG(VoidPtr direc)
11572 {
11573 return (((DI_RecordPtr)direc)->owner == 28);
11574 }
11575
FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list)11576 CharPtr FDFGetAccessionFromSeqIdChain(SeqIdPtr seqid_list)
11577 {
11578 SeqIdPtr sip;
11579 TextSeqIdPtr tsip;
11580 CharPtr acc;
11581
11582 if(seqid_list == NULL)
11583 return NULL;
11584
11585 for(acc = NULL, sip = seqid_list; sip != NULL; sip = sip->next)
11586 {
11587 if(sip->choice != SEQID_GENBANK && sip->choice != SEQID_EMBL &&
11588 sip->choice != SEQID_DDBJ && sip->choice != SEQID_OTHER)
11589 continue;
11590
11591 tsip = (TextSeqIdPtr) sip->data.ptrvalue;
11592 if(tsip == NULL || tsip->accession == NULL ||
11593 tsip->accession[0] == '\0')
11594 continue;
11595
11596 acc = StringSave(tsip->accession);
11597 break;
11598 }
11599 return(acc);
11600 }
11601
11602
11603 /* Function scans .pdi or .ndi file and do callback() for this gi and oid */
11604
ScanDIFile(CharPtr difilename,GMSubsetDataPtr gmsubsetdp,Boolean (* callback)(DI_RecordPtr direc,VoidPtr data),VoidPtr data,FILE * out,Int4 gi_threshold)11605 Boolean ScanDIFile(CharPtr difilename, GMSubsetDataPtr gmsubsetdp,
11606 Boolean (*callback)(DI_RecordPtr direc, VoidPtr data), VoidPtr data,
11607 FILE *out, Int4 gi_threshold)
11608 {
11609 static const int kNumFieldsDiFile = 10;
11610 FILE *fdi;
11611 DI_Record direc;
11612 Char skipstr1[128], accession[128];
11613 long skipdate = 0;
11614 int readstat, total=0, progress_count=0, mol_type=0;
11615 int prev_oid = -1; /* helps to keep track of sequences which
11616 have been merged in a non-redundant
11617 database (i.e.: protein dbs) */
11618 #ifdef SHOW_PROGRESS
11619 Int4 progress_chunk = 100;
11620 #endif
11621
11622
11623 /* open index file */
11624 fdi = FileOpen(difilename, "r");
11625
11626 if (!fdi) {
11627 fprintf(out, "\nERROR: cannot open '%s'", difilename);
11628 return FALSE;
11629 }
11630
11631 /* set gi threshold for month subset */
11632 direc.gi_threshold = gi_threshold;
11633 MemSet(accession, NULLB, 128);
11634
11635 /* each line in index file looks like: */
11636 /* 2933800 5769963 9606 8 EST 427 -2021038615 38990825 M61958 2 */
11637 while ((readstat = fscanf(fdi, "%ld %ld %ld %ld %s %ld %ld %ld %s %u",
11638 (long *) &direc.oid, (long *) &direc.gi,
11639 (long *) &direc.taxid, (long *) &direc.owner,
11640 skipstr1, (long *) &direc.len,
11641 (long *) &direc.hash, (long *) &skipdate,
11642 accession, (unsigned int *) &mol_type)) ==
11643 kNumFieldsDiFile) {
11644 direc.acc = StringSave(accession);
11645 direc.mol = (Uint1)mol_type;
11646 direc.gi_threshold = gi_threshold;
11647 /*direc.oid += *curr_oid;*/
11648 if (DB_Subset(gmsubsetdp, direc)) {
11649 /* In the case of non-redundant databases, identical sequences will
11650 * be merged into the same sequence and have the same oid. Entries
11651 * with the same oid should only be counted once. */
11652 if (prev_oid != direc.oid) {
11653 callback(&direc, data);
11654 prev_oid = direc.oid;
11655 }
11656 progress_count++;
11657 }
11658 direc.acc = MemFree(direc.acc);
11659 MemSet((void*) accession, NULLB, sizeof(accession));
11660 MemSet((void*) &direc, NULLB, sizeof(direc));
11661 #ifdef SHOW_PROGRESS
11662 if (!(total % progress_chunk)) {
11663 if (progress_count < progress_chunk/3) {
11664 printf (".");
11665 } else if (progress_count > 2*progress_chunk/3) {
11666 printf ("X");
11667 } else {
11668 printf ("x");
11669 }
11670 progress_count = 0;
11671 fflush(out);
11672 }
11673 #endif
11674
11675 total++;
11676 }
11677
11678 if (readstat != EOF) {
11679 fprintf(out, "\nError occurred while parsing line %d, %s "
11680 "(read %d fields instead of the expected %d)", total+1,
11681 difilename, readstat, kNumFieldsDiFile);
11682 return FALSE;
11683 }
11684 FILECLOSE(fdi);
11685
11686 return TRUE;
11687 }
11688 /************************************************************************/
11689 /* END The functions used with ID1 dump stuff */
11690 /************************************************************************/
11691
11692 /************************************************************************/
11693 /* Fastacmd API */
11694 /************************************************************************/
11695
GetAccList(CharPtr file,Int4Ptr TotalItems)11696 FCMDAccListPtr LIBCALL GetAccList(CharPtr file, Int4Ptr TotalItems)
11697 {
11698 Char TmpBuff[128];
11699 Int4 i, j, k;
11700 Int4 FileLen = 0;
11701 FCMDAccListPtr AccList = NULL;
11702 FCMDAccListPtr AccListTmp, AccListLast;
11703 Int4 NumNotValid = 0;
11704 Int4 gi = 0;
11705
11706 if(file == NULL || file[0] == NULLB) {
11707 *TotalItems = 0;
11708 return NULL;
11709 }
11710
11711 FileLen = StringLen(file);
11712
11713 for(i = 0; i < FileLen; i++) {
11714
11715 if(isspace((int)file[i]) || file[i] == ',') /* Rolling spaces */
11716 continue;
11717
11718 /* This is defence from badly formatted requests */
11719
11720 if(NumNotValid > 10) {
11721 ErrPostEx(SEV_ERROR, 0, 0, "**** ERROR: Too many invalid Gis/Accessions, "
11722 "parsing aborted\n");
11723 *TotalItems = 0;
11724 return NULL;
11725 }
11726
11727 /* Rolling spaces */
11728
11729 j= 0;
11730 while (j < 128 && i < FileLen) {
11731 TmpBuff[j] = TO_LOWER(file[i]);
11732 j++; i++;
11733 if(isspace((int)file[i]) ||
11734 file[i] == ',' || /* Comma is valid delimiter */
11735 file[i] == '\n')
11736 break;
11737 }
11738 TmpBuff[j] = NULLB;
11739
11740 /* Is gi/accession too long ??? */
11741
11742 if(j == 128) {
11743 ErrPostEx(SEV_WARNING, 0, 0, "Gi/Accession \"%s\" is too long\r\n",
11744 TmpBuff);
11745 NumNotValid++;
11746
11747 while(!isspace((int)file[i]) ||
11748 file[i] == ',' ||
11749 file[i] == NULLB) /* Rolling until spaces */
11750 i++;
11751 continue; /* Next may be valid ... who knows...?? */
11752 }
11753
11754 /* Now validating accession/gi */
11755
11756 for(k =0; k < j; k++) {
11757 if(!IS_DIGIT(TmpBuff[k])) {
11758 break;
11759 }
11760 }
11761
11762 gi = 0;
11763 if(k == j)
11764 gi = atol(TmpBuff);
11765
11766 if (gi == 0) {
11767 if (StringChr(TmpBuff, '|') != NULL) {
11768 SeqIdPtr sip = SeqIdParse(TmpBuff);
11769 SeqIdPtr sip_var = sip;
11770 SeqIdPtr best_acc = SeqIdFindBestAccession(sip);
11771 if (best_acc)
11772 {
11773 switch (best_acc->choice)
11774 { /* Only TextSeqIdPtrs */
11775 case SEQID_GENBANK:
11776 case SEQID_EMBL:
11777 case SEQID_DDBJ:
11778 case SEQID_PIR:
11779 case SEQID_SWISSPROT:
11780 case SEQID_PRF:
11781 case SEQID_OTHER:
11782 case SEQID_TPG:
11783 case SEQID_TPE:
11784 case SEQID_TPD:
11785 case SEQID_GPIPE:
11786 SeqIdWrite(best_acc, TmpBuff, PRINTID_TEXTID_ACC_VER, 128);
11787 break;
11788 default:
11789 break;
11790 }
11791 }
11792 while (sip_var)
11793 {
11794 if (sip_var->choice == SEQID_GI)
11795 {
11796 gi = sip_var->data.intvalue;
11797 break;
11798 }
11799 sip_var = sip_var->next;
11800 }
11801 }
11802 }
11803
11804 /* If this is valid Accession check and tranfer it to gi */
11805
11806 /* It we come here - we got valid text ID */
11807
11808 if(AccList == NULL) { /* first element */
11809 AccList = (FCMDAccListPtr) MemNew(sizeof(FCMDAccList));
11810 AccListTmp = AccList;
11811 AccListTmp->acc = StringSave(TmpBuff);
11812 AccListTmp->gi = gi;
11813 AccListTmp->next = NULL;
11814 AccListLast=AccListTmp;
11815 *TotalItems = *TotalItems +1;
11816 } else {
11817 AccListTmp = (FCMDAccListPtr) MemNew(sizeof(FCMDAccList));
11818 AccListLast->next = AccListTmp;
11819 AccListTmp->acc = StringSave(TmpBuff);
11820 AccListTmp->gi = gi;
11821 AccListTmp->next = NULL;
11822 AccListLast = AccListTmp;
11823 *TotalItems = *TotalItems +1;
11824 }
11825 }
11826 if(NumNotValid) {
11827 ErrPostEx(SEV_ERROR, 0, 0, "**** %d invalid Gi%s/Accession%s present in fastacmd "
11828 "request\r\n",
11829 NumNotValid,
11830 NumNotValid == 1 ? "" : "s",
11831 NumNotValid == 1 ? "" : "s"
11832 );
11833 }
11834 return AccList;
11835 }
11836
FCMDAccListFree(FCMDAccListPtr falp)11837 void LIBCALL FCMDAccListFree(FCMDAccListPtr falp)
11838 {
11839 FCMDAccListPtr falp_tmp, falp_next;
11840
11841 if(falp == NULL)
11842 return;
11843
11844 for(falp_tmp = falp; falp_tmp != NULL; falp_tmp=falp_next) {
11845 falp_next = falp_tmp->next;
11846 MemFree(falp_tmp->acc);
11847 MemFree(falp_tmp);
11848 }
11849 }
11850
Fastacmd_PrintTaxonomyInfo(ReadDBFILEPtr rdfp,Int4 oid,FILE * fp,Int4 linelen)11851 static Boolean Fastacmd_PrintTaxonomyInfo(ReadDBFILEPtr rdfp, Int4 oid,
11852 FILE *fp, Int4 linelen)
11853 {
11854 RDBTaxNamesPtr tnames = NULL;
11855 BlastDefLinePtr bdp = NULL, bdp_tmp;
11856 Char buf[128];
11857
11858 if (rdfp == NULL || fp == NULL)
11859 return FALSE;
11860
11861 if ((bdp = FDReadDeflineAsn(rdfp, oid)) == NULL)
11862 return FALSE;
11863
11864 asn2ff_set_output(fp, NULL);
11865 ff_StartPrint(0, 0, linelen, NULL);
11866
11867 /* Print the taxonomy report for each sequence associated with this oid */
11868 for (bdp_tmp = bdp; bdp_tmp; bdp_tmp = bdp_tmp->next) {
11869
11870 /* skip irrelevant sequences if a gi target was specified */
11871 SeqIdPtr gi = SeqIdFindBest(bdp_tmp->seqid, SEQID_GI);
11872 if ( gi && (rdfp->gi_target != 0) && (gi->data.intvalue != rdfp->gi_target) )
11873 continue;
11874
11875 MemSet(buf, 0, sizeof(buf));
11876 SeqIdWrite(bdp_tmp->seqid, buf, PRINTID_FASTA_LONG, sizeof(buf)-1);
11877
11878 if (bdp_tmp->taxid == 0) {
11879 ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information not encoded for "
11880 "Seq-id '%s'", buf);
11881 continue;
11882 }
11883
11884 if ((tnames = RDBGetTaxNames(rdfp->taxinfo, bdp_tmp->taxid)) == NULL) {
11885 ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not available "
11886 "for Seq-id '%s'.\nIf you have not done so already, "
11887 "please update your copy from: %s\n", buf, TAXDB_ON_FTP);
11888 continue;
11889 }
11890
11891 ff_AddString("NCBI sequence id: ");
11892 ff_AddString(buf); NewContLine();
11893
11894 ff_AddString("NCBI taxonomy id: ");
11895 ff_AddInteger("%ld", bdp_tmp->taxid);
11896 NewContLine();
11897
11898 ff_AddString("Common name: ");
11899 ff_AddString(tnames->common_name); NewContLine();
11900 ff_AddString("Scientific name: ");
11901 ff_AddString(tnames->sci_name); NewContLine();
11902 if (bdp_tmp->next)
11903 NewContLine();
11904 RDBTaxNamesFree(tnames);
11905 }
11906
11907 ff_EndPrint();
11908 BlastDefLineSetFree(bdp);
11909
11910 return TRUE;
11911 }
11912
11913 /* Prints the output for the -I option in fastacmd */
11914 static Boolean
Fastacmd_PrintDbFullInformation(ReadDBFILEPtr rdfp,CharPtr databases,Int4 linelen,FILE * out)11915 Fastacmd_PrintDbFullInformation(ReadDBFILEPtr rdfp, CharPtr databases,
11916 Int4 linelen, FILE *out)
11917 {
11918 Boolean is_prot;
11919 CharPtr base_filename;
11920 Char buf[256];
11921 Int4 path_len;
11922
11923 is_prot = (rdfp->parameters & READDB_IS_PROT) ? TRUE : FALSE;
11924 PrintDbInformationWithRID(databases, is_prot, linelen, out, FALSE, NULL, FALSE);
11925
11926 asn2ff_set_output(out, NULL);
11927 ff_StartPrint(0, 0, linelen, NULL);
11928
11929 ff_AddString("File name");
11930 if (rdfp->next)
11931 ff_AddString("s:");
11932 else
11933 ff_AddString(":");
11934 NewContLine();
11935
11936 for (; rdfp; rdfp = rdfp->next) {
11937
11938 /** Print file name **/
11939 if (rdfp->aliasfilename && rdfp->oidlist) { /* subset database */
11940 base_filename = StringRChr(rdfp->full_filename,'/');
11941 MemSet(buf, 0, sizeof(buf));
11942 path_len = StringLen(rdfp->full_filename) -
11943 StringLen(base_filename);
11944 if (path_len > 0 && path_len < sizeof(buf)
11945 && base_filename != NULL) {
11946 StringNCpy(buf, rdfp->full_filename, path_len+1);
11947 StringNCat(buf, rdfp->aliasfilename, sizeof(buf)-1-path_len-1);
11948 } else {
11949 StringNCpy(buf, rdfp->aliasfilename, sizeof(buf)-1);
11950 }
11951
11952 ff_AddString(buf);
11953 base_filename = NULL;
11954 } else /* real database */
11955 ff_AddString(rdfp->full_filename);
11956 NewContLine();
11957
11958 /** Print date **/
11959 TabToColumn(4);
11960 ff_AddString("Date: "); ff_AddString(rdfp->date);
11961
11962 /** Print database version **/
11963 ff_AddString(" Version: ");
11964 ff_AddString(Ltostr((long)rdfp->formatdb_ver, 1));
11965
11966 /** Print length of longest sequence **/
11967 ff_AddString(" Longest sequence: ");
11968 ff_AddString(Ltostr((unsigned long)rdfp->maxlen, 1));
11969 if (readdb_is_prot(rdfp))
11970 ff_AddString(" res");
11971 else
11972 ff_AddString(" bp");
11973 NewContLine();
11974 TabToColumn(0);
11975 }
11976
11977 ff_EndPrint();
11978 return TRUE;
11979 }
11980
/* Parse a "start<delimiter>stop" location string into locations[0] (start)
   and locations[1] (stop).  Delimiters are space, comma and semicolon.

   The first token is converted with atol to give the start; whatever
   follows that token is converted with atol to give the stop.  A missing
   value, a NULL or empty str, and a negative value (a warning is posted
   for it) all yield 0.  str itself is not modified. */
void Fastacmd_ParseLocations(const char* str, Int4 locations[2])
{
    const char* delimiters = " ,;";
    char* copy = NULL;      /* private, writable copy of str; freed below */
    char* rest = NULL;      /* tokenizer state: text after the first token */
    char* token = NULL;

    locations[0] = locations[1] = 0;

    if ( !str ) {
        return;
    }

    copy = StringSave((char*) str);
    if ( !copy ) {
        return;
    }

    /* The tokenizer state lives in its own variable so that the pointer
       to the copy survives and the copy can be freed. */
    token = StringTokMT(copy, (char*) delimiters, &rest);
    if (token != NULL) {
        locations[0] = atol(token);
    }
    if (locations[0] < 0) {
        ErrPostEx(SEV_WARNING, 0, 0,
                  "Starting location is negative, setting to 0");
        locations[0] = 0;
    }

    if (rest != NULL) {
        locations[1] = atol(rest);
    }
    if (locations[1] < 0) {
        ErrPostEx(SEV_WARNING, 0, 0,
                  "Ending location is negative, setting to 0");
        locations[1] = 0;
    }

    MemFree(copy);
}
12014
Fastacmd_ParseSeqLoc(CharPtr str,Uint1 strand,BioseqPtr bsp)12015 static SeqLocPtr Fastacmd_ParseSeqLoc(CharPtr str, Uint1 strand, BioseqPtr bsp)
12016 {
12017 Int4 locations[2];
12018
12019 if (str == NULL) {
12020 return NULL;
12021 }
12022
12023 Fastacmd_ParseLocations(str, locations);
12024 ASSERT(locations[0] >= 0);
12025 ASSERT(locations[1] >= 0);
12026
12027 /* Sanity check */
12028 if (locations[1] > bsp->length) {
12029 ErrPostEx(SEV_ERROR, 0, 0, "From location cannot be greater "
12030 "than %ld. Ignoring sequence location.\n",
12031 bsp->length);
12032 locations[0] = 0; locations[1] = bsp->length - 1;
12033 }
12034
12035 /* Convert locations to zero-offsets... */
12036 if (locations[1] == 0) {
12037 locations[1] = bsp->length - 1;
12038 } else {
12039 locations[1]--;
12040 }
12041
12042 if (locations[0] > 0) {
12043 locations[0]--;
12044 }
12045
12046 if (ISA_aa(bsp->mol)) /* for proteins, the strand is irrelevant */
12047 strand = Seq_strand_unknown;
12048
12049 ASSERT(locations[0] >= 0);
12050 ASSERT(locations[1] >= 0);
12051 return SeqLocIntNew(locations[0], locations[1],
12052 strand, SeqIdFindBest(bsp->id, SEQID_GI));
12053
12054 }
12055
Fastacmd_Search(CharPtr searchstr,CharPtr database,CharPtr batchfile,Boolean dupl,Int4 linelen,FILE * out)12056 Int2 Fastacmd_Search (CharPtr searchstr, CharPtr database,
12057 CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out)
12058 {
12059 return Fastacmd_Search_ex(searchstr, database, READDB_DB_UNKNOWN,
12060 batchfile, dupl, linelen, out, FALSE, FALSE, eNoDump, NULL,
12061 Seq_strand_unknown, FALSE, FALSE, PIG_NONE);
12062 }
12063
Fastacmd_Search_ex(CharPtr searchstr,CharPtr database,Uint1 is_prot,CharPtr batchfile,Boolean dupl,Int4 linelen,FILE * out,Boolean use_target,Boolean use_ctrlAs,EBlastDbDumpType dump_db,CharPtr seqlocstr,Uint1 strand,Boolean taxonomy_info_only,Boolean dbinfo_only,Int4 pig)12064 Int2 Fastacmd_Search_ex (CharPtr searchstr, CharPtr database, Uint1 is_prot,
12065 CharPtr batchfile, Boolean dupl, Int4 linelen, FILE *out,
12066 Boolean use_target, Boolean use_ctrlAs, EBlastDbDumpType dump_db,
12067 CharPtr seqlocstr, Uint1 strand,
12068 Boolean taxonomy_info_only, Boolean dbinfo_only, Int4 pig)
12069 {
12070 BioseqPtr bsp;
12071 ReadDBFILEPtr rdfp = NULL, rdfp_tmp;
12072 Int4 i, fid, TotalItems=0, count = 0;
12073 FCMDAccListPtr falp=NULL, falp_tmp;
12074 CharPtr buffer = NULL, dbname = database;
12075 FILE *fd;
12076 Int4Ptr ids = NULL;
12077 Int4 guess_gi = -1;
12078 SeqLocPtr slp = NULL;
12079 Uint1 init_state = 0;
12080 Int2 retval = FASTACMD_SUCCESS;
12081
12082 if (searchstr)
12083 guess_gi = atol(searchstr);
12084
12085 if (dbname == NULL)
12086 dbname = FASTACMD_DEFAULT_DB;
12087
12088 ASSERT(dump_db >= eNoDump || dump_db < eDumpTypeMax);
12089
12090 if (taxonomy_info_only)
12091 init_state = READDB_NEW_DO_TAXDB;
12092 else if (dbinfo_only)
12093 init_state = READDB_NEW_DO_REPORT;
12094 else
12095 init_state = READDB_NEW_INDEX;
12096
12097 if (!(rdfp = readdb_new_ex2(dbname, is_prot, init_state, NULL, NULL))) {
12098 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Cannot initialize readdb for "
12099 "%s database\n", dbname);
12100 return FASTACMD_DB_NOT_FOUND;
12101 }
12102
12103 /* Validation of rdfp */
12104 {
12105 Int4 rv = readdb_validate(rdfp);
12106 ASSERT(rv != READDB_INVALID_NULL_ARG);
12107 if (rv == READDB_INVALID_MIXED_DBS) {
12108 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Cannot initialize mismatched "
12109 "protein/nucleotide databases '%s'\n", dbname);
12110 return FASTACMD_ERROR;
12111 }
12112 }
12113
12114 if (dbinfo_only) {
12115 Fastacmd_PrintDbFullInformation(rdfp, dbname, linelen, out);
12116 readdb_destruct(rdfp);
12117 return retval;
12118 }
12119
12120 if (pig != PIG_NONE) {
12121 if ( (fid = readdb_pig2oid(rdfp, pig, NULL)) == -1) {
12122 ErrPostEx(SEV_ERROR, 0, 0, "PIG %ld not found", (long) pig);
12123 return FASTACMD_FAILED_SEARCH;
12124 }
12125 bsp = readdb_get_bioseq_ex(rdfp, fid, TRUE, use_ctrlAs);
12126 slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12127 BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12128 bsp = BioseqFree(bsp);
12129 slp = SeqLocFree(slp);
12130 return retval;
12131 }
12132
12133 /* Taxonomy information is encoded only in the new database format */
12134 if (taxonomy_info_only) {
12135 for (rdfp_tmp = rdfp; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
12136 if (rdfp_tmp->formatdb_ver < FORMATDB_VER) {
12137 ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not supported "
12138 "in your version of\nthe blast databases (version %d). "
12139 "Please update your databases and download\nthe taxonomy "
12140 "blast database files (%s)\n", FORMATDB_VER_TEXT,
12141 TAXDB_ON_FTP);
12142 readdb_destruct(rdfp);
12143 return FASTACMD_NO_TAXDB;
12144 }
12145 }
12146 if (rdfp->taxinfo == NULL) {
12147 ErrPostEx(SEV_ERROR, 0, 0, "Taxonomy information is not "
12148 "available. Please download it from\n"
12149 "%s\n", TAXDB_ON_FTP);
12150 readdb_destruct(rdfp);
12151 return FASTACMD_NO_TAXDB;
12152 }
12153 }
12154
12155 if (dump_db == eNoDump) {
12156 if(searchstr != NULL) {
12157 if((falp = GetAccList(searchstr, &TotalItems)) == NULL) {
12158 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: No valid Gis/Accessions "
12159 "found. Exiting...\n");
12160 return FASTACMD_FAILED_SEARCH;
12161 }
12162 } else if(batchfile != NULL){
12163 if((fd = FileOpen(batchfile, "r")) == NULL) {
12164 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: Could not open %s",
12165 batchfile);
12166 return FASTACMD_ERROR;
12167 }
12168
12169 buffer = WWWReadFileInMemory(fd, 0, TRUE);
12170
12171 if((falp = GetAccList(buffer, &TotalItems)) == NULL) {
12172 ErrPostEx(SEV_ERROR, 0, 0, "ERROR: No valid Gis/Accessions "
12173 "found. Exiting...\n");
12174 return FASTACMD_FAILED_SEARCH;
12175 }
12176 }
12177 }
12178
12179 for (falp_tmp = falp; falp_tmp != NULL; falp_tmp = falp_tmp->next) {
12180
12181 if(falp_tmp->gi != 0) {
12182 fid = readdb_gi2seq(rdfp, falp_tmp->gi, NULL);
12183 } else {
12184 if(!dupl) {
12185 fid = readdb_acc2fasta(rdfp, falp_tmp->acc);
12186 } else {
12187 count = 0;
12188 fid = readdb_acc2fastaEx(rdfp, falp_tmp->acc, &ids, &count);
12189 }
12190 }
12191
12192 if (fid < 0 && fid != -1) {
12193 ErrPostEx(SEV_ERROR, 0, 0, "Accesion search failed for \"%s\" "
12194 "with error code %d\n", falp_tmp->acc, fid);
12195 return FASTACMD_FAILED_SEARCH;
12196 } else if (fid == -1) {
12197 ErrPostEx(SEV_ERROR, 0, 0, "Entry \"%s\" not found\n",
12198 falp_tmp->acc);
12199 retval = FASTACMD_FAILED_SEARCH;
12200 } else if (ids == NULL) { /* gi or SeqId */
12201 if (use_target) {
12202 ReadDBFILEPtr rdfp_tmp;
12203 for (rdfp_tmp = rdfp; rdfp_tmp; rdfp_tmp = rdfp_tmp->next)
12204 rdfp_tmp->gi_target = falp_tmp->gi;
12205 }
12206 if (taxonomy_info_only) {
12207 if (!Fastacmd_PrintTaxonomyInfo(rdfp, fid, out, linelen))
12208 retval = FASTACMD_FAILED_SEARCH;
12209 } else {
12210 bsp = readdb_get_bioseq_ex(rdfp, fid, TRUE, use_ctrlAs);
12211 slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12212 BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12213 bsp = BioseqFree(bsp);
12214 slp = SeqLocFree(slp);
12215 }
12216 } else {
12217 for(i = 0; i < count; i++) {
12218 if (taxonomy_info_only) {
12219 if (!Fastacmd_PrintTaxonomyInfo(rdfp, ids[i], out,
12220 linelen))
12221 retval = FASTACMD_FAILED_SEARCH;
12222 } else {
12223 bsp = readdb_get_bioseq_ex(rdfp, ids[i], TRUE,
12224 use_ctrlAs);
12225 slp = Fastacmd_ParseSeqLoc(seqlocstr, strand, bsp);
12226 BioseqRawToFastaExtraEx(bsp, out, linelen, slp);
12227 bsp = BioseqFree(bsp);
12228 slp = SeqLocFree(slp);
12229 }
12230 }
12231 ids = MemFree(ids);
12232 }
12233 }
12234
12235 /* sanity check */
12236 if (dump_db) {
12237 DumpBlastDB(rdfp, out, linelen, use_ctrlAs, dump_db);
12238 }
12239
12240 readdb_destruct(rdfp);
12241 MemFree(buffer);
12242 FCMDAccListFree(falp);
12243 return retval;
12244 }
12245
s_HasGiList(const ReadDBFILEPtr rdfp_list)12246 static Boolean s_HasGiList(const ReadDBFILEPtr rdfp_list)
12247 {
12248 ReadDBFILEPtr rdfp = (ReadDBFILEPtr) rdfp_list;
12249 if ( !rdfp_list ) {
12250 return FALSE;
12251 }
12252
12253 for (; rdfp; rdfp = rdfp->next) {
12254 if (rdfp->gilist || rdfp->gifile) {
12255 return TRUE;
12256 }
12257 }
12258 return FALSE;
12259 }
12260
12261 /* #define SHOW_PROGRESS */
12262
DumpBlastDB(const ReadDBFILEPtr rdfp,FILE * fp,Int4 linelen,Boolean ctrlA,EBlastDbDumpType dump_type)12263 Int2 DumpBlastDB(const ReadDBFILEPtr rdfp, FILE *fp, Int4 linelen,
12264 Boolean ctrlA, EBlastDbDumpType dump_type)
12265 {
12266 register Uint4 maskidx, bit_shift, dump;
12267 register Int4 i;
12268 Int4 total = 0, dumped = 0, nseqs = 0;
12269 OIDListPtr oidlist = NULL;
12270 Int8 tot_len = 0;
12271 ReadDBFILEPtr rdfp_tmp = rdfp;
12272 #ifdef SHOW_PROGRESS
12273 Int2 progress_chunk = 100;
12274 #endif
12275
12276 /* Obtain the total length of this database */
12277 if (!(readdb_get_totals(rdfp,&tot_len,&total))) {
12278 ErrPostEx(SEV_ERROR,0,0,"Could not retrieve database length");
12279 return -1;
12280 }
12281 readdb_get_totals_ex(rdfp,&tot_len,&nseqs,TRUE); /* for testing only */
12282
12283 /* readdb_new returns a sorted list of ReadDBFILEPtr's (real db's
12284 * followed by subset (mask) db's, and we do not support that yet */
12285 if (!rdfp->oidlist) {
12286 for (; rdfp_tmp; rdfp_tmp = rdfp_tmp->next) {
12287 if (rdfp_tmp->oidlist) {
12288 ErrPostEx(SEV_ERROR, 0, 0,
12289 "Feature not available - Cannot dump subset databases and "
12290 "real databases at the same time.");
12291 return -1;
12292 }
12293 }
12294 }
12295 if (s_HasGiList(rdfp)) {
12296 ErrPostEx(SEV_ERROR, 0, 0,
12297 "Feature not available - Cannot dump databases with gi files");
12298 return -1;
12299 }
12300 rdfp_tmp = rdfp;
12301
12302 if (!rdfp->oidlist) {
12303
12304 for (i = 0; i < total; i++) {
12305 #ifdef SHOW_PROGRESS
12306 if (!(i%progress_chunk)) {
12307 fprintf(stderr,"\b\b\b\b%3d%%",(int)((100*i)/total));
12308 }
12309 #endif
12310
12311 if (DumpOneSequence(rdfp, fp, linelen, ctrlA, dump_type, i))
12312 dumped++;
12313 }
12314 } else {
12315
12316 oidlist = rdfp_tmp->oidlist;
12317
12318 for (i = 0; i < total; i++) {
12319 #ifdef SHOW_PROGRESS
12320 if (!(i%progress_chunk)) {
12321 fprintf(stderr,"\b\b\b\b%3d%%",(int)((100*i)/total));
12322 }
12323 #endif
12324 /* Retrieve the correct oidlist, as each rdfp has its own */
12325 if (i > rdfp_tmp->stop) {
12326 rdfp_tmp = rdfp_tmp->next;
12327 if (rdfp_tmp) {
12328 oidlist = rdfp_tmp->oidlist;
12329 } else {
12330 ErrPostEx(SEV_FATAL, 1,0,
12331 "BlastDBToFasta: Oid %d is not in this mask");
12332 return -1;
12333 }
12334
12335 /* Make sure we have an oidlist! */
12336 if (!oidlist) {
12337 ErrPostEx(SEV_FATAL, 1,0,
12338 "This mask database does not have an oidlist!\n"
12339 "There is probably a wrong ordering problem in "
12340 "the ReadDBFILEPtrs");
12341 return -1;
12342 }
12343 }
12344
12345 /* Adjust the index i to this rdfp_tmp */
12346 maskidx = (i - rdfp_tmp->start)/MASK_WORD_SIZE;
12347 bit_shift = MASK_WORD_SIZE-1 - (i - rdfp_tmp->start) % MASK_WORD_SIZE;
12348
12349 /* Make sure we are not addressing an index that's larger
12350 * than the our oidlist */
12351 if ((i - rdfp_tmp->start) > oidlist->total)
12352 continue;
12353
12354 /* Mask this index! */
12355 dump = SwapUint4(oidlist->list[maskidx]) & (0x1 << bit_shift);
12356
12357 if ( !dump ) {
12358 continue;
12359 }
12360
12361 if (DumpOneSequence(rdfp, fp, linelen, ctrlA, dump_type, i))
12362 dumped++;
12363 }
12364 }
12365
12366 #ifdef SHOW_PROGRESS
12367 fprintf(stderr,"\n");
12368 fprintf(stderr,"Dumped %ld sequences (should be %d)\n", dumped,nseqs);
12369 Beep();
12370 #endif
12371
12372 return 0;
12373 }
12374
DumpOneSequence(const ReadDBFILEPtr rdfp,FILE * fp,Int4 linelen,Boolean ctrlA,EBlastDbDumpType dump_type,Int4 i)12375 Int2 DumpOneSequence(const ReadDBFILEPtr rdfp, FILE *fp, Int4 linelen,
12376 Boolean ctrlA, EBlastDbDumpType dump_type, Int4 i)
12377 {
12378 Int2 retval=0;
12379
12380 switch (dump_type) {
12381 case eFasta:
12382 {
12383 BioseqPtr bsp = NULL;
12384
12385 if ((bsp = readdb_get_bioseq_ex(rdfp,i, TRUE, ctrlA)) != NULL) {
12386 if (BioseqRawToFastaExtra(bsp, fp, linelen))
12387 retval = 1;
12388 else
12389 ErrPostEx(SEV_ERROR,0,0, "Could not convert Bioseq to FASTA");
12390 }
12391 BioseqFree(bsp);
12392 }
12393 break;
12394
12395 case eGi:
12396 case eAccession:
12397 {
12398 Uint4 h = 0; /* header marker for readdb_get_header_ex */
12399 SeqIdPtr sip = NULL;
12400 while (readdb_get_header(rdfp, i, &h, &sip, NULL)) {
12401 if (dump_type == eGi) {
12402 SeqIdPtr gi = SeqIdFindBest(sip, SEQID_GI);
12403 if (gi) {
12404 fprintf(fp, "%d\n", gi->data.intvalue);
12405 retval=1;
12406 }
12407 }
12408 if (dump_type == eAccession) {
12409 SeqIdPtr accn = SeqIdFindBestAccession(sip);
12410
12411 if (accn) {
12412 Int4 gi=0;
12413 CharPtr id=NULL;
12414 Boolean numeric_id = GetAccessionVersionFromSeqId(accn, &gi, &id, TRUE);
12415 if (id)
12416 {
12417 fprintf(fp, "%s\n", id);
12418 retval=1;
12419 }
12420 else
12421 ErrPostEx(SEV_WARNING, 0, 0, "No accession found for oid %d", i);
12422
12423 id = MemFree(id);
12424 }
12425 }
12426 sip = SeqIdFree(sip);
12427 }
12428 }
12429 break;
12430
12431 default:
12432 abort(); /* should never happen */
12433 }
12434
12435 return retval;
12436 }
12437
12438 /************************************************************************/
12439 /* END Fastacmd API */
12440 /************************************************************************/
12441
12442
12443 /*************************************************************************
This function reads in a list of gi's from a text file
and makes a binary gilist file.
12446
12447 The binary gilist format has the following construction:
12448
12449 1.) 1st 4 bytes: a 'magic' number: UINT4_MAX
12450 2.) 2nd 4 bytes: total number of gi's in the file (call this value 'number').
12451 3.) 'number' set of 4 bytes, allowing 4 bytes for each gi.
12452
12453 The function GetGisFromFile first checks what the first 4 bytes
12454 of a file are, if they are the 'magic' number, then it proceeds
12455 to read values assuming a binary format. If they are not the
12456 'magic' number, then a text format is assumed.
12457
12458 *************************************************************************/
12459
12460 static int LIBCALLBACK
compare_gis(VoidPtr v1,VoidPtr v2)12461 compare_gis(VoidPtr v1, VoidPtr v2)
12462 {
12463 Uint4 gi1 = *(Uint4Ptr) v1;
12464 Uint4 gi2 = *(Uint4Ptr) v2;
12465
12466 return ((gi1<gi2) ? -1 : ((gi1>gi2) ? 1 : 0));
12467 }
12468
12469
12470 #define GIFILE_LINE_LEN 1024
12471 Int4 LIBCALL
readdb_MakeGiFileBinary(CharPtr input_file,CharPtr output_file)12472 readdb_MakeGiFileBinary (CharPtr input_file, CharPtr output_file)
12473 {
12474 FILE *infp=NULL, *outfp=NULL;
12475 Int4 index = 0, value, chunk_size = 24, gilist_size;
12476 Int2 status;
12477 Char line[GIFILE_LINE_LEN];
12478 long tmplong;
12479 Uint4Ptr gi_list;
12480
12481 if (!(infp = FileOpen(input_file, "r"))) {
12482 ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", input_file);
12483 return -1;
12484 }
12485
12486 if (!(outfp = FileOpen(output_file, "wb"))) {
12487 ErrPostEx(SEV_ERROR, 0, 0, "Unable to open file %s", output_file);
12488 return -1;
12489 }
12490
12491 gi_list = MemNew(chunk_size * sizeof(Uint4));
12492
12493 while (FileGets(line, GIFILE_LINE_LEN, infp))
12494 {
12495 /* do correct casting */
12496 status = sscanf(line, "%ld", &tmplong);
12497 value = tmplong;
12498
12499 /* skip non-valid lines */
12500 if (status > 0 && value > 0) {
12501 /* do we have enough space in gi_list ? */
12502 if (chunk_size < index + 1) {
12503 chunk_size *= 2;
12504 gi_list = Realloc(gi_list, chunk_size * sizeof(Uint4));
12505 }
12506
12507 gi_list[index++] = value;
12508 }
12509 }
12510
12511 FormatDbUint4Write(READDB_MAGIC_NUMBER, outfp);
12512 FormatDbUint4Write(index, outfp);
12513
12514 gilist_size = index;
12515 HeapSort(gi_list, gilist_size, sizeof(Uint4), compare_gis);
12516
12517 for (index=0; index<gilist_size; index++)
12518 {
12519 FormatDbUint4Write(gi_list[index], outfp);
12520 }
12521
12522 gi_list = MemFree(gi_list);
12523
12524 FILECLOSE(infp);
12525 FILECLOSE(outfp);
12526
12527 return gilist_size;
12528 }
12529
/* Reads FASTA entries from options->db_file and adds each of them to a
 * BLAST database created with FormatDBInit(options).
 *
 * When Bases_In_Volume >= 1, the running sum of sequence lengths is
 * tracked and, once it exceeds Bases_In_Volume, the current database is
 * closed and a new one is started; for Bases_In_Volume > 1 the new one is
 * named "<options->base_name>.NN" (options->base_name is replaced).
 *
 * Return codes: 0 success, 2 FormatDBInit failed, 3 input file could not
 * be opened, 4 an entry was not a Bioseq, 9 FormatDBClose failed.
 * The input stream and the entry being processed are released on every
 * error return taken after the input file was opened. */
Int4 FastaToBlastDB(FDB_optionsPtr options, Int4 Bases_In_Volume)
{
    FILE *fd;
    FormatDBPtr fdbp;
    SeqEntryPtr sep;
    BioseqPtr bsp;
    Char filenamebuf[FILENAME_MAX];
    Int4 count=0, volume=0;
    BlastDefLinePtr bdp = NULL;

    if ((fdbp = FormatDBInit(options)) == NULL)
        return 2;
    if((fd = FileOpen(options->db_file, "r")) == NULL)
        return 3;

    /* Get sequences */
    while ((sep = FastaToSeqEntryEx(fd, (Boolean)!options->is_protein,
                                    NULL, options->parse_mode)) != NULL) {

        if(!IS_Bioseq(sep)) { /* Not Bioseq - failure */
            ErrLogPrintf("Error in readind Bioseq Formating failed.\n");
            SeqEntryFree(sep);
            FILECLOSE(fd);
            return 4;
        }

        bsp = (BioseqPtr) sep->data.ptrvalue;

        if(Bases_In_Volume >= 1) {
            if(count > Bases_In_Volume) {
                /* starting new volume ? */
                count = 0;
                if(FormatDBClose(fdbp)) {
                    SeqEntryFree(sep);
                    FILECLOSE(fd);
                    return 9;
                }

                if(Bases_In_Volume > 1) {
                    /* NOTE(review): the suffix is appended to the current
                     * base_name, which was itself replaced for the previous
                     * volume, and the previous string is not freed here -
                     * confirm this naming/ownership is intended. */
                    sprintf(filenamebuf, "%s.%02ld",
                            options->base_name, (long) volume);
                    options->base_name = StringSave(filenamebuf);
                    volume++;
                }

                if ((fdbp = FormatDBInit(options)) == NULL) {
                    SeqEntryFree(sep);
                    FILECLOSE(fd);
                    return 2;
                }
            }
            count += bsp->length;
        }
        bdp = FDBGetDefAsnFromBioseq(bsp, NULL);
        FDBAddBioseq(fdbp, bsp, bdp);
        bdp = BlastDefLineFree(bdp);

        SeqEntryFree(sep);
    }
    FILECLOSE(fd);

    if(FormatDBClose(fdbp))
        return 9;

    return 0;
}
12588
/* Writes the alias file "<basename>.pal" (protein) or "<basename>.nal"
 * (nucleotide).
 *
 * Contents, in order:
 *   - a header comment with the creation time;
 *   - TITLE: 'title', or 'basename' when title is NULL;
 *   - DBLIST: 'parent' when volumes == 0 and parent != NULL, otherwise
 *     "<basename>.NN " for NN = 0 .. volumes-1;
 *   - GILIST / OIDLIST lines (commented out when the argument is NULL);
 *   - when first_oid > 0: FIRST_OID, LAST_OID, NSEQ (last - first + 1) and,
 *     if total_length > 0, LENGTH;
 *   - otherwise, when gilist is set or number_seqs > 0: NSEQ and, if gilist
 *     is set or total_length > 0, LENGTH.
 *
 * Returns TRUE on success; FALSE when basename is NULL, memory for the
 * file name cannot be allocated, or the file cannot be opened. */
Boolean FD_CreateAliasFileEx(CharPtr title, CharPtr basename,
                             Int4 volumes, Boolean is_protein,
                             CharPtr parent,
                             Int4 first_oid, Int4 last_oid,
                             Int8 total_length, Int4 number_seqs,
                             CharPtr oidlist, CharPtr gilist)
{
    CharPtr filename;
    time_t tnow;
    Int4 i;
    FILE *fd;

    /* The file name is derived from basename, so it is mandatory. */
    if (basename == NULL)
        return FALSE;

    /* ".pal"/".nal" adds 4 characters, plus the terminating NUL; sizing
     * the buffer from the input avoids overflowing a fixed-size array. */
    filename = (CharPtr) MemNew(StringLen(basename) + 5);
    if (filename == NULL)
        return FALSE;
    sprintf(filename, "%s.%cal", basename, is_protein? 'p' : 'n');

    fd = FileOpen(filename, "wb");
    filename = MemFree(filename);
    if (fd == NULL)
        return FALSE;

    /* ctime() output already ends with a newline */
    tnow = time(NULL);
    fprintf(fd, "#\n# Alias file created %s#\n#\n", ctime(&tnow));

    if(title != NULL)
        fprintf(fd, "TITLE %s\n#\n", title);
    else
        fprintf(fd, "TITLE %s\n#\n", basename);

    /* Now printing volume databases, or the parent database */
    fprintf(fd, "DBLIST ");

    if (volumes == 0 && parent != NULL)
        fprintf(fd, "%s", parent);
    else {
        for(i = 0; i < volumes; i++) {
            fprintf(fd, "%s.%02ld ", basename, (long) i);
        }
    }
    fprintf(fd, "\n#\n");

    if (gilist)
        fprintf(fd, "GILIST %s\n#\n", gilist);
    else
        fprintf(fd, "#GILIST\n#\n");

    if (oidlist)
        fprintf(fd, "OIDLIST %s\n#\n", oidlist);
    else
        fprintf(fd, "#OIDLIST\n#\n");

    /* NOTE(review): if Nlm_Int8tostr() returns allocated storage, the
     * strings printed below are never freed - confirm against its
     * documentation. */
    if (first_oid > 0) {
        fprintf(fd, "FIRST_OID %ld\n#\n", (long) first_oid);
        fprintf(fd, "LAST_OID %ld\n#\n", (long) last_oid);
        fprintf(fd, "NSEQ %ld\n", (long) (last_oid - first_oid + 1));
        if (total_length > 0)
            fprintf(fd, "LENGTH %s\n", Nlm_Int8tostr(total_length, 0));
    }
    else if (gilist || number_seqs > 0)
    {
        /* When there is a gi list, print NSEQ and LENGTH even when they
           are 0. */
        fprintf(fd, "NSEQ %ld\n", (long) number_seqs);
        if (gilist || total_length > 0)
            fprintf(fd, "LENGTH %s\n", Nlm_Int8tostr(total_length, 0));
    }
    FILECLOSE(fd);

    return TRUE;
}
12657
12658 /* Returns the string that must be used in a multi-volume, multi-oidlist
12659 * (or multi-alias file) database. This string should be used as the DBLIST
12660 * field in the wrapper alias file for the multi-volume subset (or mask).
12661 * Caller is responsible to deallocate the return value */
FD_ConstructMultivolumeDBList(CharPtr basename,Int4 nvols)12662 CharPtr FD_ConstructMultivolumeDBList(CharPtr basename, Int4 nvols)
12663 {
12664 CharPtr retval = NULL;
12665 Int4 i, len = 0;
12666 Char numstr[10];
12667
12668 if (!basename || basename[0] == NULLB || nvols <= 0)
12669 return NULL;
12670
12671 /* Allocate memory for return value */
12672 len = ((StringLen(basename) + 1) * nvols) + nvols + 1;
12673 len += (4*nvols); /* for the '.NN' extension */
12674 if ((retval = (CharPtr)MemNew(sizeof(Char)*len)) == NULL) {
12675 ErrPostEx(SEV_ERROR, 0, 0,
12676 "FD_ConstructMultivolumeDBList: out of memory");
12677 return NULL;
12678 }
12679
12680 for (i = 0; i < nvols; i++) {
12681
12682 /* convert nvols to a string */
12683 MemSet(numstr, 0, sizeof(numstr));
12684 if (i < 100) {
12685 sprintf(numstr, ".%02ld ", (long) i);
12686 } else {
12687 sprintf(numstr, ".%03ld ", (long) i);
12688 }
12689 retval = StringCat(retval, basename);
12690 retval = StringCat(retval, numstr);
12691 }
12692 retval[StringLen(retval)] = NULLB;
12693
12694 return retval;
12695 }
12696
/* Shorthand for FD_CreateAliasFileEx() with no parent database, zero oid
 * range, zero total length, zero sequence count and neither an oidlist nor
 * a gilist. Returns the result of that call. */
Boolean FD_CreateAliasFile(CharPtr title, CharPtr basename,
                           Int4 volumes, Boolean is_protein)
{
    Boolean created;

    created = FD_CreateAliasFileEx(title, basename, volumes, is_protein,
                                   NULL,      /* parent */
                                   0, 0,      /* first_oid, last_oid */
                                   0, 0,      /* total_length, number_seqs */
                                   NULL,      /* oidlist */
                                   NULL);     /* gilist */
    return created;
}
12703
FD_MakeAliasFile(FDB_optionsPtr options)12704 Boolean FD_MakeAliasFile(FDB_optionsPtr options)
12705 {
12706 if (options == NULL)
12707 return FALSE;
12708
12709 if (options->volume > 0)
12710 return FD_CreateAliasFileEx(options->db_title, options->alias_file_name, options->volume+1,
12711 options->is_protein, NULL, 0, 0, 0, 0, NULL, NULL);
12712 else
12713 return FALSE;
12714 }
12715
12716
12717 #if defined(OS_UNIX_SOL) || defined(OS_UNIX_LINUX) || defined(__GLIBC__)
12718 #ifdef HAVE_MADVISE
12719
12720 /* IMPORTANT INFO:
12721 *
12722 * If we need to preload file(s) using madvise(), we do it now.
12723 * There are several file chunks that could be preloaded,
12724 * and several considerations to be taken into account.
12725 *
12726 * The file chunks are:
12727 * 1) the portion of the index file containing pointers to needed sections of
12728 * the header file.
12729 * 2) the portion of the index file containing pointers to needed sections of
12730 * the sequence file.
12731 * 3) the portion of the index file containing pointers to needed sections of
12732 * the ambchar.
12733 * 4) the needed portion of the header file.
12734 * 5) the needed portion of the sequence file.
12735 *
 * The preloading considerations are
12737 * 1) whether the file chunk has been memory mapped.
12738 * 2) the size of the individual chunks, and
12739 * 3) the combined size of all chunks.
12740 *
12741 * If the size of an individual chunk is smaller than MADVISE_MIN_SIZE pages,
12742 * there is no obvious benefit to applying madvise, and it could be avoided.
12743 *
12744 * Also, if the total size of all the chunks to be preloaded exceeds
12745 * certain share of RAM cache size, some chunk portions may not be preloaded,
12746 * -- and even then some preloaded chunks won't stay in memory, -- but this is
12747 * the best we can do. We should, however, minimize probability that preloaded
12748 * pages will be pushed out by some other process, so we'll assume that
12749 * given that the same database is likely to be processed again and again on
12750 * the same server, the available portion of RAM is some value greater than 50%,
12751 * and is defined by MADVISE_RAM_SHARE.
12752 *
12753 *
12754 */
/* Chunks are only madvise()d when their length in pages exceeds this. */
#define MADVISE_MIN_SIZE 16
/* Percentage of the physical page count that readdb_preload_data() allows
 * the preloaded header + sequence chunks to occupy. */
#define MADVISE_RAM_SHARE 90

/** Argument bundle handed to readdb_do_madvise(); it is heap-allocated by
 *  readdb_madvise() and freed by readdb_do_madvise(). */
typedef struct {
    void * mp;              /* start address passed to Nlm_MemMapAdvise() */
    size_t len;             /* length passed to Nlm_MemMapAdvise() */
    EMemMapAdvise advice;   /* advice passed to Nlm_MemMapAdvise() */

}
MadviseParam_t;
12766
12767 /** */
12768 static void*
readdb_do_madvise(void * param)12769 readdb_do_madvise(void *param)
12770 {
12771 #ifdef READDB_DEBUG
12772 fprintf(stderr, "madvise(%p, %u, %d)\n", ((MadviseParam_t *)param)->mp,
12773 ((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice);
12774 #else
12775 ErrPostEx(SEV_INFO, 0, 0, "madvise(0x%x, %u, %d)", ((MadviseParam_t *)param)->mp,
12776 ((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice);
12777
12778 #endif
12779
12780 if( !Nlm_MemMapAdvise(((MadviseParam_t *)param)->mp,
12781 ((MadviseParam_t *)param)->len, ((MadviseParam_t *)param)->advice) ) {
12782
12783 #ifdef READDB_DEBUG
12784 fprintf(stderr, "Nlm_MemMapAdvise(%p, %u, %d) failed: %s\n",
12785 ((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12786 ((MadviseParam_t *)param)->advice, strerror(errno));
12787 #else
12788 ErrPostEx(SEV_WARNING, 0, 0, "Nlm_MemMapAdvise(0x%x, %u, %d) failed: %s",
12789 ((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12790 ((MadviseParam_t *)param)->advice, strerror(errno));
12791 #endif
12792 }
12793
12794 #ifdef READDB_DEBUG
12795 fprintf(stderr, "\t\t\t\tdone madvise(%p, %u, %d)\n",
12796 ((MadviseParam_t *)param)->mp, ((MadviseParam_t *)param)->len,
12797 ((MadviseParam_t *)param)->advice);
12798 #endif
12799
12800 Nlm_MemFree(param);
12801 return NULL;
12802 }
12803
12804 /** */
12805 static void
readdb_madvise(void * mp,size_t len,EMemMapAdvise advice,Boolean sync,EThreadPriority pri)12806 readdb_madvise (void * mp, size_t len,
12807 EMemMapAdvise advice, Boolean sync, EThreadPriority pri)
12808 {
12809 MadviseParam_t *param = (MadviseParam_t *)Nlm_MemNew(sizeof(MadviseParam_t));
12810 if( param ) {
12811 param->mp = mp;
12812 param->len = len;
12813 param->advice = advice;
12814 if( sync ) {
12815 readdb_do_madvise(param);
12816 }
12817 else {
12818 NlmThreadCreateEx(readdb_do_madvise, (void *)param,
12819 THREAD_RUN | THREAD_DETACHED, pri, NULL, NULL);
12820 }
12821 }
12822 }
12823
12824 /** */
12825 static void
readdb_preload_index(ReadDBFILEPtr rdfp,Int4 first_db_seq,Int4 final_db_seq,EMemMapAdvise advice,Boolean sync)12826 readdb_preload_index (ReadDBFILEPtr rdfp, Int4 first_db_seq,
12827 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
12828 {
12829 Uint4 idxHdrOffset = 0;
12830 Uint4 idxSeqOffset = 0;
12831 Uint4 idxAmbOffset = 0;
12832
12833 Uint4 idxLength = 0;
12834 Uint4 firstPage = 0;
12835
12836 uintptr_t baseOffset = (uintptr_t)rdfp->indexfp->mmp_begin;
12837
12838 /* get page size */
12839 long pagesz = sysconf(_SC_PAGESIZE);
12840
12841 /* sanity check */
12842 if( !rdfp || pagesz < 0 ) {
12843 return;
12844 }
12845
12846 /* insure that we are within the ordinal id range */
12847 if( first_db_seq < rdfp->start ) {
12848 first_db_seq = rdfp->start;
12849 }
12850 if( final_db_seq >= rdfp->stop ) {
12851 final_db_seq = rdfp->stop - 1;
12852 }
12853
12854 /* verify that the index file is memory mapped */
12855 if( rdfp->indexfp && rdfp->indexfp->mfile_true ) {
12856
12857 /* portion of the index file containing pointers to header file. */
12858 firstPage = (first_db_seq * 4) / pagesz;
12859 idxHdrOffset = firstPage * pagesz;
12860 idxLength = (final_db_seq - first_db_seq) * 4;
12861 idxLength += (pagesz - idxLength % pagesz);
12862
12863 /* madvise segments if they are big enough */
12864 if( idxLength / pagesz > MADVISE_MIN_SIZE ) {
12865
12866 /* portion of the index file containing pointers to sequence file. */
12867 firstPage = ((rdfp->num_seqs + 1 + first_db_seq) * 4) / pagesz;
12868 idxSeqOffset = firstPage * pagesz;
12869
12870 /* portion of the index file containing pointers to ambchars in seq file. */
12871 firstPage = ((2 * rdfp->num_seqs + 2 + first_db_seq) * 4) / pagesz;
12872 idxAmbOffset = firstPage * pagesz;
12873 }
12874 }
12875
12876 /* ensure that madvise() is called on page boundary */
12877 if( baseOffset % pagesz ) {
12878 uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
12879 baseOffset += adjustVal;
12880 idxHdrOffset -= ((adjustVal && idxHdrOffset > pagesz) ? pagesz : 0);
12881 idxSeqOffset -= ((adjustVal && idxSeqOffset > pagesz) ? pagesz : 0);
12882 idxAmbOffset -= ((adjustVal && idxAmbOffset > pagesz) ? pagesz : 0);
12883 }
12884
12885 #ifdef READDB_DEBUG
12886 fprintf(stderr, "MMP Offset: %p\n", rdfp->indexfp->mmp_begin);
12887 fprintf(stderr, "MMP Adjust Offset: %p\n", baseOffset);
12888 fprintf(stderr, "Index Head Offset: %ld\n", idxHdrOffset);
12889 fprintf(stderr, "Index File Length: %ld\n", idxLength);
12890 #endif
12891
12892 /* finally, preload chunks that are big enough to make it
12893 * worth while */
12894 if( rdfp->indexfp && idxLength / pagesz > MADVISE_MIN_SIZE ) {
12895 readdb_madvise((char *)(baseOffset + idxHdrOffset), idxLength,
12896 advice, sync, eTP_Default);
12897 readdb_madvise((char *)(baseOffset + idxSeqOffset), idxLength,
12898 advice, sync, eTP_Default);
12899 readdb_madvise((char *)(baseOffset + idxAmbOffset), idxLength,
12900 advice, sync, eTP_Default);
12901 }
12902 }
12903
12904 /** */
12905 static void
readdb_preload_data(ReadDBFILEPtr rdfp,Int4 first_db_seq,Int4 final_db_seq,EMemMapAdvise advice,Boolean sync)12906 readdb_preload_data (ReadDBFILEPtr rdfp, Int4 first_db_seq,
12907 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
12908 {
12909 Uint4 hdrOffset = 0;
12910 Uint4 hdrLength = 0;
12911 Uint4 seqOffset = 0;
12912 Uint4 seqLength = 0;
12913
12914 Uint4 firstPage = 0;
12915
12916 long allowPages = 0;
12917 long needPages = 0;
12918
12919 /* get page size */
12920 long pagesz = sysconf(_SC_PAGESIZE);
12921 long totalPages = sysconf(_SC_PHYS_PAGES);
12922
12923 /* sanity check */
12924 if( !rdfp || pagesz < 0 || totalPages < 0 ) {
12925 return;
12926 }
12927
12928 /* insure that we are within the ordinal id range */
12929 if( first_db_seq < rdfp->start ) {
12930 first_db_seq = rdfp->start;
12931 }
12932 if( final_db_seq >= rdfp->stop ) {
12933 final_db_seq = rdfp->stop - 1;
12934 }
12935
12936 /** verify that the header file is memory mapped */
12937 if( rdfp->headerfp && rdfp->headerfp->mfile_true ) {
12938 long firstOff = Nlm_SwapUint4(rdfp->header_index[first_db_seq]);
12939 long lastOff = Nlm_SwapUint4(rdfp->header_index[final_db_seq]);
12940
12941 firstPage = firstOff / pagesz;
12942 hdrOffset = firstPage * pagesz;
12943 hdrLength = lastOff - firstOff;
12944 hdrLength += (pagesz - hdrLength % pagesz);
12945 }
12946
12947 #ifdef READDB_DEBUG
12948 if( !rdfp->sequencefp ) {
12949 fprintf(stderr, "rdfp->sequencefp == NULL\n");
12950 }
12951 #endif
12952
12953 /** verify that the sequence file is memory mapped */
12954 if( rdfp->sequencefp && rdfp->sequencefp->mfile_true ) {
12955 long firstOff = Nlm_SwapUint4(rdfp->sequence_index[first_db_seq]);
12956 long lastOff = Nlm_SwapUint4(rdfp->sequence_index[final_db_seq]);
12957
12958 firstPage = firstOff / pagesz;
12959 seqOffset = firstPage * pagesz;
12960 seqLength = lastOff - firstOff;
12961 seqLength += (pagesz - seqLength % pagesz);
12962 }
12963
12964 /** before preloading pages, trim sizes so that the total
12965 * is under MADVISE_RAM_SHARE */
12966 allowPages = totalPages / 100 * MADVISE_RAM_SHARE;
12967 needPages = (hdrLength + seqLength) / pagesz;
12968
12969 if( needPages > allowPages ) {
12970 int pctTrim = (needPages - allowPages) * 100 / needPages;
12971
12972 /* trim proportionately all chunks */
12973 hdrLength -= hdrLength * pctTrim / 100;
12974 hdrLength += (pagesz - hdrLength % pagesz);
12975
12976 seqLength -= seqLength * pctTrim / 100;
12977 seqLength += (pagesz - seqLength % pagesz);
12978 }
12979
12980 #ifdef READDB_DEBUG
12981 fprintf(stderr, "Header File: %ld\n", hdrLength);
12982 fprintf(stderr, "Sequence File: %ld\n", seqLength);
12983 #endif
12984
12985 /* finally, preload chunks that are big enough to make it
12986 * worth while */
12987 if( rdfp->headerfp && hdrLength / pagesz > MADVISE_MIN_SIZE ) {
12988 uintptr_t baseOffset = (uintptr_t)rdfp->headerfp->mmp_begin;
12989 if( baseOffset % pagesz ) {
12990 uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
12991 baseOffset += adjustVal;
12992 hdrOffset -= ((adjustVal && hdrOffset > pagesz) ? pagesz : 0);
12993 }
12994 readdb_madvise((char *)(baseOffset + hdrOffset), hdrLength,
12995 advice, sync, eTP_Default);
12996 }
12997
12998 if( rdfp->sequencefp && seqLength / pagesz > MADVISE_MIN_SIZE ) {
12999 uintptr_t baseOffset = (uintptr_t)rdfp->sequencefp->mmp_begin;
13000 if( baseOffset % pagesz ) {
13001 uintptr_t adjustVal = pagesz - (baseOffset % pagesz);
13002 baseOffset += adjustVal;
13003 seqOffset -= ((adjustVal && hdrOffset > pagesz) ? pagesz : 0);
13004 }
13005 readdb_madvise((char *)(baseOffset + seqOffset), seqLength,
13006 advice, sync, eTP_Default);
13007 }
13008 }
13009
13010 /** simpler and more efficient approach than the above */
13011 static void
readdb_preload_file(NlmMFILEPtr mFilePtr,Int4 nPages,EMemMapAdvise advice,Boolean sync,EThreadPriority pri)13012 readdb_preload_file (NlmMFILEPtr mFilePtr, Int4 nPages,
13013 EMemMapAdvise advice, Boolean sync, EThreadPriority pri)
13014 {
13015 long pagesz;
13016 size_t len;
13017
13018 /* general sanity check */
13019 if( !mFilePtr || !mFilePtr->mfile_true || !mFilePtr->mmp
13020 || !mFilePtr->mmp_madvise_end || !mFilePtr->mmp_end ) {
13021 return;
13022 }
13023
13024 /* check whether this portion was loaded before */
13025 if( mFilePtr->mmp < mFilePtr->mmp_madvise_end ||
13026 mFilePtr->mmp_end <= mFilePtr->mmp_madvise_end ) {
13027 return;
13028 }
13029
13030 pagesz = sysconf(_SC_PAGESIZE);
13031 len = madvisePreloadBlock * pagesz;
13032 if( len > mFilePtr->mmp_end - mFilePtr->mmp_madvise_end ) {
13033 len = mFilePtr->mmp_end - mFilePtr->mmp_madvise_end;
13034 }
13035 readdb_madvise(mFilePtr->mmp_madvise_end, len, advice, sync, pri);
13036 mFilePtr->mmp_madvise_end += len;
13037 }
13038
13039 /** */
13040 void LIBCALL
readdb_preload(ReadDBFILEPtr rdfp,Int4 first_db_seq,Int4 final_db_seq,EMemMapAdvise advice,Boolean sync)13041 readdb_preload (ReadDBFILEPtr rdfp, Int4 first_db_seq,
13042 Int4 final_db_seq, EMemMapAdvise advice, Boolean sync)
13043 {
13044 /* do not preload index */
13045 /* readdb_preload_index(rdfp, first_db_seq, final_db_seq, advice, sync); */
13046 readdb_preload_data(rdfp, first_db_seq, final_db_seq, advice, sync);
13047 }
13048
13049 /** */
13050 void LIBCALL
readdb_madvise_enable(Boolean enable)13051 readdb_madvise_enable (Boolean enable)
13052 {
13053 useMadvise = enable;
13054 }
13055
13056 /** */
13057 void LIBCALL
readdb_madvise_type(EMemMapAdvise advice)13058 readdb_madvise_type (EMemMapAdvise advice)
13059 {
13060 mmapAdvice = advice;
13061 }
13062
13063 /** */
13064 void LIBCALL
readdb_madvise_sync_mode(Boolean mode)13065 readdb_madvise_sync_mode (Boolean mode)
13066 {
13067 madviseSyncMode = mode;
13068 }
13069
13070 /** */
13071 void LIBCALL
readdb_madvise_block(Int4 nSeqs)13072 readdb_madvise_block (Int4 nSeqs)
13073 {
13074 madvisePreloadBlock = nSeqs;
13075 }
13076
13077 #endif /* HAVE_MADVISE */
13078 #endif /* SOL || LINUX */
13079
13080 /*** PIG (Protein Identifier Group) interface ***/
13081
13082 FDBPigTablePtr LIBCALL
FDBPigTableNew()13083 FDBPigTableNew()
13084 {
13085 FDBPigTablePtr fptp = NULL;
13086
13087 if ( !(fptp = (FDBPigTablePtr) MemNew(sizeof(FDBPigTable))))
13088 return NULL;
13089
13090 fptp->count = 0;
13091 fptp->allocated = INDEX_INIT_SIZE*2;
13092
13093 if ( !(fptp->pop = (Int4Ptr) MemNew(sizeof(Int4)*fptp->allocated)))
13094 return FDBPigTableFree(fptp);
13095
13096 return fptp;
13097 }
13098
13099 FDBPigTablePtr LIBCALL
FDBPigTableFree(FDBPigTablePtr fptp)13100 FDBPigTableFree(FDBPigTablePtr fptp)
13101 {
13102 if (!fptp)
13103 return NULL;
13104
13105 fptp->pop = MemFree(fptp->pop);
13106 return MemFree(fptp);
13107 }
13108
13109 Boolean LIBCALL
FDBAddPig(FDBPigTablePtr fptp,Int4 pig,Int4 oid)13110 FDBAddPig(FDBPigTablePtr fptp, Int4 pig, Int4 oid)
13111 {
13112 if (!fptp || pig == PIG_NONE || oid < 0)
13113 return FALSE;
13114
13115 /* Reallocate if necessary */
13116 if (fptp->count + 2 >= fptp->allocated) {
13117 fptp->allocated += (INDEX_ARRAY_CHUNKS*2);
13118 fptp->pop = (Int4Ptr) Realloc(fptp->pop, sizeof(Int4)*fptp->allocated);
13119
13120 if (!fptp->pop) {
13121 FDBPigTableFree(fptp);
13122 return FALSE;
13123 }
13124 }
13125
13126 fptp->pop[fptp->count++] = pig;
13127 fptp->pop[fptp->count++] = oid;
13128
13129 return TRUE;
13130 }
13131
13132 Int4 LIBCALL
readdb_get_pig(ReadDBFILEPtr rdfp,Int4 oid)13133 readdb_get_pig(ReadDBFILEPtr rdfp, Int4 oid)
13134 {
13135 BlastDefLineSetPtr bdp_set = NULL;
13136 BlastDefLinePtr bdp = NULL;
13137 Int4 pig = PIG_NONE;
13138
13139 if (rdfp->formatdb_ver < FORMATDB_VER)
13140 return pig;
13141
13142 if (!(bdp_set = FDReadDeflineAsn(rdfp, oid)))
13143 return pig;
13144
13145
13146 for (bdp = bdp_set; bdp; bdp = bdp->next) {
13147 if (bdp->other_info &&
13148 ( (pig = bdp->other_info->data.intvalue) != PIG_NONE)) {
13149 bdp_set = (BlastDefLinePtr) BlastDefLineSetFree(bdp_set);
13150 return pig;
13151 }
13152 }
13153 bdp_set = (BlastDefLinePtr) BlastDefLineSetFree(bdp_set);
13154 return pig;
13155 }
13156
13157 Int4 LIBCALL
readdb_pig2oid(ReadDBFILEPtr rdfp,Int4 pig,Int4Ptr start)13158 readdb_pig2oid(ReadDBFILEPtr rdfp, Int4 pig, Int4Ptr start)
13159 {
13160 Int4 retval = -1;
13161 ISAMErrorCode error;
13162 Uint4 oid = 0;
13163
13164 for ( ; rdfp; rdfp = rdfp->next) {
13165
13166 if (!rdfp->isam_pig)
13167 continue;
13168
13169 if ( (error = NISAMSearch(rdfp->isam_pig, pig, &oid, NULL)) < 0) {
13170 ErrPostEx(SEV_WARNING, 0, 0, "Failed to initialize PIG search"
13171 "on %s\nISAM Error code is %d\n", rdfp->filename, error);
13172 continue;
13173 } else if (error != ISAMNotFound) {
13174 if (start)
13175 *start = rdfp->start;
13176 retval = (Int4)oid + rdfp->start;
13177 break;
13178 }
13179 }
13180
13181 return retval;
13182 }
13183
s_IsTextFile(const char * filename)13184 static Boolean s_IsTextFile(const char* filename)
13185 {
13186 FILE* fp = NULL;
13187 Boolean retval = TRUE;
13188 Int4 i = 0;
13189
13190 if ( !(fp = FileOpen(filename, "r"))) {
13191 return FALSE;
13192 }
13193
13194 for (i = 0; i < 10 && !feof(fp); i++) {
13195 int c = getc(fp);
13196 if ( ! (isprint(c) || isspace(c)) ) {
13197 retval = FALSE;
13198 break;
13199 }
13200 }
13201 FileClose(fp);
13202 return retval;
13203 }
13204
13205 /*** TaxidDeflineTable interface ***/
13206
13207 const Int4 kTaxidDeflineSearch_NotFound = -1;
13208 static const Int4 kNoGi = -1;
13209 static const Char* kNoSeqid = NULL;
13210
/** Identifies which kind of key an FDBTaxidDeflineTable is indexed by. */
typedef enum EFDBTaxidDeflineDataType {
    eTaxidDefline_Gi    = 1,    /**< entries are keyed by gi */
    eTaxidDefline_Seqid = 2     /**< entries are keyed by Seq-id string */
} EFDBTaxidDeflineDataType;
13215
13216 typedef struct FDBTaxidDeflineData_Gi {
13217 Int4 gi;
13218 Int4 taxid;
13219 } FDBTaxidDeflineData_Gi;
13220
13221 typedef struct FDBTaxidDeflineData_Seqid {
13222 Char seqid[ID_MAX_SIZE+1];
13223 Int4 taxid;
13224 } FDBTaxidDeflineData_Seqid;
13225
13226 /** Gi/taxid structure used to read the file specified in the formatdb
13227 * configuration file to set the taxonomy ids for the listed gis.
13228 */
13229 struct FDBTaxidDeflineTable {
13230 EFDBTaxidDeflineDataType type; /* type of the table below */
13231 void* data; /* either an array of
13232 FDBTaxidDeflineTable_Gi or
13233 FDBTaxidDeflineTable_Seqid */
13234 Int4 count, allocated; /* keep track of table size */
13235 };
13236
13237 static size_t
s_FDBTaxidDeflineTable_GetDataTypeSize(FDBTaxidDeflineTablePtr taxid_tbl)13238 s_FDBTaxidDeflineTable_GetDataTypeSize(FDBTaxidDeflineTablePtr taxid_tbl)
13239 {
13240 size_t retval = 0;
13241
13242 ASSERT(taxid_tbl);
13243
13244 switch (taxid_tbl->type) {
13245 case eTaxidDefline_Gi:
13246 retval = sizeof(FDBTaxidDeflineData_Gi);
13247 break;
13248
13249 case eTaxidDefline_Seqid:
13250 retval = sizeof(FDBTaxidDeflineData_Seqid);
13251 break;
13252
13253 default:
13254 abort();
13255 }
13256
13257 return retval;
13258 }
13259
13260 /** Encapsulate addition of entries to FDBTaxidDeflineTable structure */
13261 static Boolean
s_FDBTaxidDeflineTableAddEntry(FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi,const char * seqid,Int4 taxid)13262 s_FDBTaxidDeflineTableAddEntry(FDBTaxidDeflineTablePtr taxid_tbl,
13263 Int4 gi, const char* seqid, Int4 taxid)
13264 {
13265 ASSERT(taxid_tbl);
13266
13267 if (taxid < 0) {
13268 ErrPostEx(SEV_ERROR, 0, 0, "Cannot add negative taxonomy id");
13269 return FALSE;
13270 }
13271
13272 /* Reallocate if necessary */
13273 if (taxid_tbl->count + 1 >= taxid_tbl->allocated) {
13274 size_t data_type_size =
13275 s_FDBTaxidDeflineTable_GetDataTypeSize(taxid_tbl);
13276 taxid_tbl->allocated += (INDEX_ARRAY_CHUNKS);
13277 taxid_tbl->data = Realloc(taxid_tbl->data,
13278 data_type_size*taxid_tbl->allocated);
13279 if ( !taxid_tbl->data ) {
13280 FDBTaxidDeflineTableFree(taxid_tbl);
13281 return FALSE;
13282 }
13283 }
13284
13285 switch (taxid_tbl->type) {
13286 case eTaxidDefline_Gi:
13287 {
13288 FDBTaxidDeflineData_Gi* gi_taxid_pairs =
13289 (FDBTaxidDeflineData_Gi*) taxid_tbl->data;
13290 ASSERT(seqid == NULL);
13291 gi_taxid_pairs[taxid_tbl->count].gi = gi;
13292 gi_taxid_pairs[taxid_tbl->count++].taxid = taxid;
13293 }
13294 break;
13295 case eTaxidDefline_Seqid:
13296 {
13297 FDBTaxidDeflineData_Seqid* seqid_taxid_pairs =
13298 (FDBTaxidDeflineData_Seqid*) taxid_tbl->data;
13299 ASSERT(seqid != NULL);
13300 StringNCpy_0(seqid_taxid_pairs[taxid_tbl->count].seqid, seqid,
13301 ID_MAX_SIZE);
13302 seqid_taxid_pairs[taxid_tbl->count++].taxid = taxid;
13303 }
13304 break;
13305 default:
13306 abort();
13307 }
13308
13309 return TRUE;
13310 }
13311
13312 /** HeapSort comparison function to sort a TaxidDeflineTable structure
13313 * (sorts by gi)
13314 */
s_TaxidDeflineDataGi_Compare(VoidPtr i,VoidPtr j)13315 static int LIBCALLBACK s_TaxidDeflineDataGi_Compare(VoidPtr i, VoidPtr j)
13316 {
13317 Int4 gi1 = ((FDBTaxidDeflineData_Gi*)i)->gi;
13318 Int4 gi2 = ((FDBTaxidDeflineData_Gi*)j)->gi;
13319
13320 return BLAST_CMP(gi1, gi2);
13321 }
13322
13323 /** HeapSort comparison function to sort a TaxidDeflineTable structure
13324 * (sorts by seqid strings)
13325 */
s_TaxidDeflineDataSeqid_Compare(VoidPtr i,VoidPtr j)13326 static int LIBCALLBACK s_TaxidDeflineDataSeqid_Compare(VoidPtr i, VoidPtr j)
13327 {
13328 const Char* seqid1 = ((FDBTaxidDeflineData_Seqid*)i)->seqid;
13329 const Char* seqid2 = ((FDBTaxidDeflineData_Seqid*)j)->seqid;
13330
13331 return StringCmp(seqid1, seqid2);
13332 }
13333
13334 static FDBTaxidDeflineTablePtr
s_FDBTaxidDeflineTableNew_Gi(const Char * filename)13335 s_FDBTaxidDeflineTableNew_Gi(const Char* filename)
13336 {
13337 FDBTaxidDeflineTablePtr retval = NULL;
13338 FILE* fp = NULL;
13339 size_t data_type_size = 0;
13340
13341 if ( !filename )
13342 return NULL;
13343
13344 if ( !(fp = FileOpen(filename, "r")))
13345 return NULL;
13346
13347 retval = (FDBTaxidDeflineTablePtr) MemNew(sizeof(FDBTaxidDeflineTable));
13348 if ( !retval ) {
13349 FileClose(fp);
13350 return NULL;
13351 }
13352
13353 retval->count = 0;
13354 retval->allocated = INDEX_INIT_SIZE;
13355 retval->type = eTaxidDefline_Gi;
13356 data_type_size = s_FDBTaxidDeflineTable_GetDataTypeSize(retval);
13357
13358 if ( !(retval->data = MemNew(data_type_size*retval->allocated))) {
13359 FileClose(fp);
13360 return FDBTaxidDeflineTableFree(retval);
13361 }
13362
13363 /* Each line in the input file has the following format:
13364 gi taxid
13365 gi taxid
13366 ...
13367 */
13368 {
13369 Int4 gi = -1, taxid = -1;
13370 Int4 nread = 0; /* number of elements assigned by fscanf */
13371 Boolean success = FALSE;
13372 while ( (nread = fscanf(fp, "%d %d", &gi, &taxid)) != EOF) {
13373 if (nread != 2) {
13374 break;
13375 }
13376 success = s_FDBTaxidDeflineTableAddEntry(retval, gi,
13377 kNoSeqid, taxid);
13378 if ( !success ) {
13379 break;
13380 }
13381 }
13382 if ( !feof(fp) || ferror(fp) || nread != EOF ) {
13383 ErrPostEx(SEV_INFO, 0, 0, "Failed to read "
13384 "gi/taxonomy id pairs from %s", filename);
13385 FileClose(fp);
13386 return FDBTaxidDeflineTableFree(retval);
13387 }
13388 FileClose(fp);
13389 }
13390
13391 if (retval->count == 0) {
13392 return FDBTaxidDeflineTableFree(retval);
13393 }
13394
13395 /* Sort the list by gis */
13396 HeapSort(retval->data, retval->count, data_type_size,
13397 s_TaxidDeflineDataGi_Compare);
13398
13399 ErrLogPrintf("Read %d gi/taxonomy id pairs from %s\n",
13400 retval->count, filename);
13401
13402 return retval;
13403 }
13404
13405 static FDBTaxidDeflineTablePtr
s_FDBTaxidDeflineTableNew_Seqid(const Char * filename)13406 s_FDBTaxidDeflineTableNew_Seqid(const Char* filename)
13407 {
13408 FDBTaxidDeflineTablePtr retval = NULL;
13409 FILE* fp = NULL;
13410 size_t data_type_size = 0;
13411
13412 if ( !filename )
13413 return NULL;
13414
13415 if ( !(fp = FileOpen(filename, "r")))
13416 return NULL;
13417
13418 retval = (FDBTaxidDeflineTablePtr) MemNew(sizeof(FDBTaxidDeflineTable));
13419 if ( !retval ) {
13420 FileClose(fp);
13421 return NULL;
13422 }
13423
13424 retval->count = 0;
13425 retval->allocated = INDEX_INIT_SIZE;
13426 retval->type = eTaxidDefline_Seqid;
13427 data_type_size = s_FDBTaxidDeflineTable_GetDataTypeSize(retval);
13428
13429 if ( !(retval->data = MemNew(data_type_size*retval->allocated))) {
13430 FileClose(fp);
13431 return FDBTaxidDeflineTableFree(retval);
13432 }
13433
13434 /* Each line in the input file has the following format:
13435 seqid taxid
13436 seqid taxid
13437 ...
13438
13439 N.B.: seqid is a string with ID_MAX_SIZE characters, which does NOT
13440 include a leading '>' character
13441 */
13442 {
13443 Int4 nread = 0; /* number of elements assigned by fscanf */
13444 Char format[ID_MAX_SIZE] = { '\0' }; /* format string for fscanf */
13445 Int4 taxid = -1;
13446 Char seqid_buf[ID_MAX_SIZE+1] = { '\0' };
13447 Boolean success = FALSE;
13448
13449 StringCat(format, "%");
13450 StringCat(format, Nlm_Int8tostr((Int8)ID_MAX_SIZE, 1));
13451 StringCat(format, "s %d");
13452
13453 while ( (nread = fscanf(fp, format, &seqid_buf, &taxid)) != EOF) {
13454 if (nread != 2) {
13455 break;
13456 }
13457 success = s_FDBTaxidDeflineTableAddEntry(retval, kNoGi,
13458 seqid_buf, taxid);
13459 if ( !success ) {
13460 break;
13461 }
13462 }
13463 if ( !feof(fp) || ferror(fp) || nread != EOF ) {
13464 ErrPostEx(SEV_INFO, 0, 0, "Failed to read "
13465 "Seq-id/taxonomy id pairs from %s", filename);
13466 FileClose(fp);
13467 return FDBTaxidDeflineTableFree(retval);
13468 }
13469 FileClose(fp);
13470 }
13471
13472 if (retval->count == 0) {
13473 return FDBTaxidDeflineTableFree(retval);
13474 }
13475
13476 /* Sort the list by seqids */
13477 HeapSort(retval->data, retval->count, data_type_size,
13478 s_TaxidDeflineDataSeqid_Compare);
13479
13480 ErrLogPrintf("Read %d Seq-id/taxonomy id pairs from %s\n",
13481 retval->count, filename);
13482
13483 return retval;
13484 }
13485
13486 FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableNew(const Char * filename)13487 FDBTaxidDeflineTableNew PROTO((const Char* filename))
13488 {
13489 FDBTaxidDeflineTablePtr retval = NULL;
13490
13491 /* Try reading a list of gi/taxid pairs */
13492 retval = s_FDBTaxidDeflineTableNew_Gi(filename);
13493 if ( !retval ) {
13494 /* Try reading a list of seqid/taxid pairs */
13495 retval = s_FDBTaxidDeflineTableNew_Seqid(filename);
13496 }
13497 return retval;
13498 }
13499
13500 FDBTaxidDeflineTablePtr LIBCALL
FDBTaxidDeflineTableFree(FDBTaxidDeflineTablePtr taxid_tbl)13501 FDBTaxidDeflineTableFree PROTO((FDBTaxidDeflineTablePtr taxid_tbl))
13502 {
13503 if ( !taxid_tbl ) {
13504 return NULL;
13505 }
13506
13507 /* Use a switch statement in case there's ever a need for a more elaborate
13508 * TaxidDefline_* data type */
13509 switch (taxid_tbl->type) {
13510 case eTaxidDefline_Gi:
13511 taxid_tbl->data = MemFree(taxid_tbl->data);
13512 break;
13513
13514 case eTaxidDefline_Seqid:
13515 taxid_tbl->data = MemFree(taxid_tbl->data);
13516 break;
13517
13518 default:
13519 abort();
13520 }
13521
13522 return MemFree(taxid_tbl);
13523 }
13524
13525 static Int4
s_FDBTaxidDeflineTableSearch_Gi(const FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi)13526 s_FDBTaxidDeflineTableSearch_Gi(const FDBTaxidDeflineTablePtr taxid_tbl,
13527 Int4 gi)
13528 {
13529 FDBTaxidDeflineData_Gi* gi_taxid_pairs =
13530 (FDBTaxidDeflineData_Gi*) taxid_tbl->data;
13531 Int4 retval = kTaxidDeflineSearch_NotFound;
13532
13533 /* perform binary search */
13534 {
13535 Int4 m, b, e;
13536 b = 0;
13537 e = taxid_tbl->count;
13538 while (b <= e) {
13539 m = (b + e) / 2;
13540 if (gi_taxid_pairs[m].gi > gi) {
13541 e = m - 1;
13542 } else if (gi_taxid_pairs[m].gi < gi) {
13543 b = m + 1;
13544 } else {
13545 retval = gi_taxid_pairs[m].taxid;
13546 break;
13547 }
13548 }
13549 }
13550
13551 return retval;
13552 }
13553
13554 static Int4
s_FDBTaxidDeflineTableSearch_Seqid(const FDBTaxidDeflineTablePtr taxid_tbl,const Char * seqid)13555 s_FDBTaxidDeflineTableSearch_Seqid(const FDBTaxidDeflineTablePtr taxid_tbl,
13556 const Char* seqid)
13557 {
13558 FDBTaxidDeflineData_Seqid* seqid_taxid_pairs =
13559 (FDBTaxidDeflineData_Seqid*) taxid_tbl->data;
13560 Int4 retval = kTaxidDeflineSearch_NotFound;
13561
13562 if ( !seqid ) {
13563 return retval;
13564 }
13565
13566 /* perform binary search */
13567 {
13568 Int4 m, b, e, rv;
13569 b = 0;
13570 e = taxid_tbl->count;
13571 while (b <= e) {
13572 m = (b + e) / 2;
13573 rv = StringCmp(seqid_taxid_pairs[m].seqid, seqid);
13574 if (rv > 0) {
13575 e = m - 1;
13576 } else if (rv < 0) {
13577 b = m + 1;
13578 } else {
13579 retval = seqid_taxid_pairs[m].taxid;
13580 break;
13581 }
13582 }
13583 }
13584
13585 return retval;
13586 }
13587
13588 static Int4
s_FDBTaxidDeflineTableSearch(const FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi,const Char * seqid)13589 s_FDBTaxidDeflineTableSearch(const FDBTaxidDeflineTablePtr taxid_tbl,
13590 Int4 gi, const Char* seqid)
13591 {
13592 Int4 retval = kTaxidDeflineSearch_NotFound;
13593
13594 if ( !taxid_tbl ) {
13595 return retval;
13596 }
13597
13598 switch (taxid_tbl->type) {
13599 case eTaxidDefline_Gi:
13600 retval = s_FDBTaxidDeflineTableSearch_Gi(taxid_tbl, gi);
13601 break;
13602
13603 case eTaxidDefline_Seqid:
13604 retval = s_FDBTaxidDeflineTableSearch_Seqid(taxid_tbl, seqid);
13605 break;
13606
13607 default:
13608 abort();
13609 }
13610
13611 return retval;
13612 }
13613
13614 Int4 LIBCALL
FDBTaxidDeflineTableSearchGi(const FDBTaxidDeflineTablePtr taxid_tbl,Int4 gi)13615 FDBTaxidDeflineTableSearchGi PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
13616 Int4 gi))
13617 {
13618 return s_FDBTaxidDeflineTableSearch(taxid_tbl, gi, kNoSeqid);
13619 }
13620
13621 Int4 LIBCALL
FDBTaxidDeflineTableSearchSeqid(const FDBTaxidDeflineTablePtr taxid_tbl,const Char * seqid)13622 FDBTaxidDeflineTableSearchSeqid PROTO((const FDBTaxidDeflineTablePtr taxid_tbl,
13623 const Char* seqid))
13624 {
13625 return s_FDBTaxidDeflineTableSearch(taxid_tbl, kNoGi, seqid);
13626 }
13627
13628 static void
s_FDBUpdateTaxIdInSingleBdp(BlastDefLinePtr bdp,const FDBTaxidDeflineTablePtr taxid_tbl)13629 s_FDBUpdateTaxIdInSingleBdp(BlastDefLinePtr bdp,
13630 const FDBTaxidDeflineTablePtr taxid_tbl)
13631 {
13632 Int4 taxid = kTaxidDeflineSearch_NotFound;
13633
13634 if ( !taxid_tbl ) {
13635 return;
13636 }
13637
13638 /* Retrieve the tax id */
13639 switch (taxid_tbl->type) {
13640 case eTaxidDefline_Gi:
13641 {
13642 SeqIdPtr sip = NULL;
13643 if( (sip = SeqIdFindBest(bdp->seqid, SEQID_GI)) != NULL) {
13644 Int4 gi = sip->data.intvalue;
13645 #ifdef TAX_CS_LOOKUP
13646 taxid = tax1_getTaxId4GI(gi);
13647 #else
13648 taxid = FDBTaxidDeflineTableSearchGi(taxid_tbl, gi);
13649 #endif
13650 }
13651 }
13652 break;
13653
13654 case eTaxidDefline_Seqid:
13655 {
13656 Char buf[ID_MAX_SIZE+1] = { '\0' };
13657 SeqIdWrite(bdp->seqid, buf, PRINTID_FASTA_LONG, sizeof(buf));
13658 taxid = FDBTaxidDeflineTableSearchSeqid(taxid_tbl, buf);
13659 }
13660 break;
13661
13662 default:
13663 abort();
13664 }
13665
13666 /* Assign the tax id */
13667 if (taxid != kTaxidDeflineSearch_NotFound) {
13668 bdp->taxid = taxid;
13669 }
13670 }
13671
13672 static void
s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,const FDBTaxidDeflineTablePtr taxid_tbl)13673 s_FDBUpdateTaxIdInBdpList(BlastDefLinePtr bdp,
13674 const FDBTaxidDeflineTablePtr taxid_tbl)
13675 {
13676 for (; bdp; bdp = bdp->next) {
13677 s_FDBUpdateTaxIdInSingleBdp(bdp, taxid_tbl);
13678 }
13679 }
13680
13681