1 /* cdentrez.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * RCS $Id: cdentrez.c,v 6.2 1999/03/11 23:20:07 kans Exp $
27 *
28 * Author: Ostell, Kans
29 *
30 * Version Creation Date: 10/15/91
31 *
32 * File Description:
33 * entrez index access library for Entrez CDROM
34 *
35 * Modifications:
36 * --------------------------------------------------------------------------
37 * Date Name Description of modification
38 * ------- ---------- -----------------------------------------------------
39 * 07-07-94 Schuler Added CdEntrezGetInfo function
40 * 07-12-94 Schuler Added #ifdef _NEW_CdEntrez_/_OLD_CdEntrez
41 * 07-13-94 Schuler Moved CdTermFree here from cdromlib.c
42 * 09-22-94 Schuler CdEntrezFini: set _nouveau to FALSE
43 * 11-22-94 Schuler Cleaned up some integer size problems
44 *
45 * 05-19-95 Schuler Added rcs Log directive for automatic insertion of
46 * modification comments.
47 *
48 * Revision $Log: cdentrez.c,v $
49 * Revision Revision 6.2 1999/03/11 23:20:07 kans
50 * Revision sprintf cast
51 * Revision
52 * Revision Revision 6.1 1998/08/24 18:42:15 kans
53 * Revision fixed -v -fd warnings
54 * Revision
55 * Revision Revision 6.0 1997/08/25 18:12:52 madden
56 * Revision Revision changed to 6.0
57 * Revision
58 * Revision Revision 5.5 1997/06/26 21:55:31 vakatov
59 * Revision [PC] DLL'd "ncbicdr.lib", "ncbiacc.lib", "ncbinacc.lib" and "ncbicacc.lib"
60 * Revision
61 * Revision Revision 5.4 1997/03/07 17:16:10 epstein
62 * Revision always choose the highest GI in EntrezFindSeqId
63 * Revision
64 * Revision 5.3 1997/01/14 21:26:07 epstein
65 * plug memory leak when performing Entrez set-difference operations
66 *
67 * Revision 5.2 1996/11/22 18:02:19 epstein
68 * change algorithm for looking up PDB accessions
69 *
70 * Revision 5.1 1996/08/14 19:56:41 epstein
71 * add APIs for fetching pieces of biostruc annots (mostly written by Chris Hogue)
72 *
73 * Revision 5.0 1996/05/28 13:55:34 ostell
74 * Set to revision 5.0
75 *
76 * Revision 4.18 1996/05/14 21:01:12 epstein
77 * use SQID index to and docsum to convert back-and-forth between GIs and SeqIds, per Jim Ostell
78 *
79 * Revision 4.17 1996/04/01 21:02:31 epstein
80 * remove dead CdEntrezBiostrucAnnotSetGet() code
81 *
82 * Revision 4.16 1996/04/01 20:59:38 epstein
83 * Schuler/Epstein changes for cleaner CdEntrezBiostrucAnnotSetGet retrieval
84 *
85 * Revision 4.15 1996/03/29 18:52:12 epstein
86 * add support for structure alignments (includes kludge for now)
87 *
88 * Revision 4.14 1995/10/23 21:39:56 epstein
89 * another tweak for PC 16-bit addressing
90 *
91 * Revision 4.13 1995/10/23 14:17:52 epstein
92 * fix 16/32-bit portability problems
93 *
94 * Revision 4.12 1995/10/03 14:12:40 epstein
95 * repair term-checking logic to avoid erroneous caching
96 *
97 * Revision 4.11 1995/10/02 15:25:56 epstein
98 * correct range-checking logic due to semantics of StringXCmp()
99 *
100 * Revision 4.10 1995/10/02 12:51:23 epstein
101 * fix endpoints for range scanning
102 *
103 * Revision 4.9 1995/10/02 12:49:44 epstein
104 * add memory-based performance enhancements
105 *
106 * Revision 4.8 1995/10/02 02:35:33 epstein
107 * add range-checking
108 *
109 * Revision 4.7 1995/09/19 13:27:51 epstein
110 * add truncation limit
111 *
112 * Revision 4.6 1995/08/30 20:54:53 epstein
113 * search TYP_CH database if retcode is -1
114 *
115 * Revision 4.5 1995/08/28 23:20:47 kans
116 * includes new mmdbapi headers
117 *
118 * Revision 4.4 1995/08/28 17:44:01 epstein
119 * add code so that when retcode is -1, we perform less validation on the retrieve Seq-entry
120 *
121 * Revision 4.3 1995/08/24 20:44:10 epstein
122 * add more stuff for genomes
123 *
124 * Revision 4.2 1995/08/18 17:41:17 epstein
125 * fix (?) parsing of PDB accession per Brandon's observation
126 *
127 * Revision 4.1 1995/08/11 20:26:18 epstein
128 * add max-models support for biostrucs
129 *
130 * Revision 4.0 1995/07/26 13:50:32 ostell
131 * force revision to 4.0
132 *
133 * Revision 2.62 1995/07/19 22:07:00 kans
134 * added (probably superfluous) casts to some sprintf calls
135 *
136 * Revision 2.61 1995/06/29 15:57:03 epstein
137 * added Complexity argument when fetching structures
138 *
139 * Revision 2.60 95/06/27 11:54:35 kans
140 * replaced _OLD_CDEntrez_ with _OLD_CdEntrez_
141 *
142 * Revision 2.59 1995/06/23 16:02:43 kans
143 * support for accmmdbs.c stub to resolve symbols without MMDB link
144 *
145 * Revision 2.58 1995/06/23 13:22:25 kans
146 * Biostruc_CD_supported symbol needed for local MMDB access
147 *
148 * Revision 2.57 1995/05/16 14:36:20 schuler
149 * Automatic comment insertion enabled
150 *
151 *
152 * ==========================================================================
153 */
154
/* RCS revision stamp of this source file. */
#define REVISION_STR  "$Revision: 6.2 $"

/*
 * Module and file names published through THIS_MODULE / THIS_FILE.
 * They are defined ahead of the toolkit headers; presumably the toolkit's
 * error-posting macros pick them up (confirm against the error headers).
 */
static char *_this_module = "CdEntrez";
static char *_this_file   = __FILE__;

#define THIS_MODULE  _this_module
#define THIS_FILE    _this_file
161
162 #include <accentr.h>
163 #include <cdentrez.h>
164 #include <sequtil.h>
165 #include <objall.h>
166
167 typedef struct posting {
168 ByteStorePtr uids;
169 DocUidPtr buffer;
170 Int4 bufsize;
171 Int4 index;
172 Int4 count;
173 } Posting, PNTR PostingPtr;
174
#define SCAN_MAX  200

/*
 * Default memory budget.  The 1023 is deliberate (not a typo for 1024):
 * it keeps the value from overflowing 16-bit addressing on PCs.
 */
#define DEF_CDENTREZ_MEMUSAGE       (64 * 1023L)

/*
 * Limits derived from the run-time budget held in cdMemUsage (set by
 * CdEntrezInit).  These expand at the point of use, so they always see
 * the current value of cdMemUsage.
 */
#define MAX_CDENTREZ_UID_LIST_SIZE  (cdMemUsage)
#define CDENTREZ_TERM_MAX           (cdMemUsage / 4)
#define MAX_CDENTREZ_BYTESTORE      (cdMemUsage / 4)
#define MAX_CDENTREZ_SMALL_LIST     (cdMemUsage / 2)
184
185 typedef struct scanData {
186 Int4 specialCount;
187 Int4 totalCount;
188 Int4 offset;
189 ByteStorePtr specialPtr;
190 ByteStorePtr remainderPtr;
191 } ScanData, PNTR ScanPtr;
192
193 static Int2 db;
194 static Int2 fld;
195
196 static DocUidPtr uidPtr;
197
198 static Int2 searchTermLen;
199 static Int4 cdMemUsage = 32768;
200
201 static ByteStorePtr specialPost;
202 static ByteStorePtr remainPost;
203
204 static Char selection [256];
205 static Char wildcard [256];
206 static Char topOfRange [256];
207 static Boolean rangeScanning = FALSE;
208
209 static ScanPtr scanPtr;
210 static Int4 scanCount;
211 static Boolean scanOk;
212 static CdTermProc userScanProc;
213
214 static CdTermPtr eset;
215
216 static ValNodePtr cachedExpr = NULL;
217 static ByteStorePtr cachedBsp = NULL;
218
219 static void NEAR NextNode PROTO((void));
220 static ByteStorePtr NEAR Factor PROTO((void));
221 static ByteStorePtr NEAR Term PROTO((void));
222 static ByteStorePtr NEAR Diff PROTO((void));
223 static ByteStorePtr NEAR Expression PROTO((void));
224 static CdTermPtr NEAR FindTermNode PROTO((CharPtr term, DocType type, DocField field, CharPtr highRange));
225 static ValNodePtr CdTLExprFree PROTO((ValNodePtr elst));
226
227 static PostingPtr NEAR NewPost PROTO((ByteStorePtr lst, Int4 defsize));
228 static PostingPtr NEAR FreePost PROTO((PostingPtr pst));
229 static Int4 NEAR PostLength PROTO((PostingPtr pst));
230 static void NEAR RewindPost PROTO((PostingPtr pst));
231 static DocUid NEAR ReadItem PROTO((PostingPtr pst));
232 static void NEAR WriteItem PROTO((PostingPtr pst, DocUid value));
233 static void NEAR FlushItems PROTO((PostingPtr pst));
234 static void NEAR SavePostingList PROTO((FILE *f, ByteStorePtr bsp));
235
236 static Boolean NEAR CdEntrezMergeTerm PROTO((DocType type, DocField field, CharPtr term, Int4Ptr spcl, Int4Ptr totl, CdTermProc userProc));
237 static void NEAR SingleSpaces PROTO((CharPtr str));
238 static void NEAR TermTruncate PROTO((CharPtr str));
239 static void NEAR QuickSortSmall PROTO((DocUidPtr uids, Int4 l, Int4 r));
240 static Int4 NEAR CompressSmall PROTO((DocUidPtr uids, Int4 count));
241 static Int4 NEAR UniqueSmall PROTO((DocUidPtr uids, Int4 count));
242 static ByteStorePtr NEAR MergeSmallLists PROTO((ByteStorePtr bsp, ByteStorePtr small));
243 static Boolean NEAR MergeSeveralLists PROTO((Int4 i, Int4 count));
244 static Boolean NEAR MergeSeveralOrderedLists PROTO((Int4 i, Int4 count));
245 static Boolean NEAR MergeUnorderedLists PROTO((Int4 i, Int4 count));
246 static Boolean NEAR ProcessScanResults PROTO((void));
247 static Boolean WildCardProc PROTO((CdTermPtr trmp));
248 static Boolean ScanOnlyProc PROTO((CdTermPtr trmp));
249 static Boolean ScanAndFreeProc PROTO((CdTermPtr trmp));
250
251 /**** Moved from cdentrez.h ********************/
252
253 static CdTermPtr NEAR CdEntrezCreateTerm PROTO((CharPtr term, DocType type, DocField field, ByteStorePtr special, ByteStorePtr remainder, CharPtr highRange));
254 static ByteStorePtr NEAR LoadPostingList PROTO((FILE *f, Int4 special, Int4 total));
255 static ByteStorePtr NEAR FreePostingList PROTO((ByteStorePtr lst));
256 static ByteStorePtr NEAR MergePostingLists PROTO((ByteStorePtr lst1, ByteStorePtr lst2));
257 static ByteStorePtr NEAR IntersectPostingLists PROTO((ByteStorePtr lst1, ByteStorePtr lst2));
258 static ByteStorePtr NEAR DifferencePostingLists PROTO((ByteStorePtr lst1, ByteStorePtr lst2));
259
260 static ValNodePtr currNode;
261 static Uint1 currChoice;
262
263 /************************* moved from old cdml.c ****************************/
264 static AsnTypePtr MEDLINE_ENTRY = NULL;
265 static AsnTypePtr MEDLINE_ENTRY_cit = NULL;
266 static AsnTypePtr MEDLINE_ENTRY_abstract = NULL;
267 static AsnTypePtr TITLE_E_trans = NULL;
268 static AsnTypePtr AUTH_LIST_names_ml_E = NULL;
269 static AsnTypePtr AUTH_LIST_names_str_E = NULL;
270 static AsnTypePtr DATE_STD_year = NULL;
271 static AsnTypePtr DATE_str = NULL;
272 static AsnTypePtr TITLE_E_name = NULL;
273 static AsnTypePtr MEDLINE_ENTRY_mesh = NULL;
274 static AsnTypePtr MEDLINE_ENTRY_substance = NULL;
275 static AsnTypePtr MEDLINE_ENTRY_xref = NULL;
276 static AsnTypePtr MEDLINE_ENTRY_idnum = NULL;
277 static AsnTypePtr MEDLINE_ENTRY_gene = NULL;
278
279 static DocSumPtr NEAR MedSumAsnRead PROTO((AsnIoPtr aip, DocUid uid));
280 static void NEAR StripAuthor PROTO((CharPtr author));
281 static void NEAR FindAsnType PROTO((AsnTypePtr PNTR atp, AsnModulePtr amp, CharPtr str));
282
283 static DocSumPtr NEAR CdEntMlSumGet PROTO((Int4 uid));
284
285 /************************* moved from old cdseq.c ****************************/
286 static AsnTypePtr SEQ_ENTRY = NULL;
287 static AsnTypePtr SEQ_ENTRY_seq = NULL;
288 static AsnTypePtr SEQ_ENTRY_set = NULL;
289 static AsnTypePtr TEXTSEQ_ID_name = NULL;
290 static AsnTypePtr TEXTSEQ_ID_accession = NULL;
291 static AsnTypePtr SEQ_DESCR_E_title = NULL;
292 static AsnTypePtr GIIMPORT_ID_id = NULL;
293 static AsnTypePtr BIOSEQ_inst = NULL;
294 static AsnTypePtr SEQ_INST_mol = NULL;
295 static AsnTypePtr SEQ_INST_repr = NULL;
296 static AsnTypePtr SEQ_ID_gibbsq = NULL;
297 static AsnTypePtr SEQ_ID_gibbmt = NULL;
298 static AsnTypePtr SEQ_ID_genbank = NULL;
299 static AsnTypePtr SEQ_ID_gi = NULL;
300 static AsnTypePtr SEQ_ID_embl = NULL;
301 static AsnTypePtr SEQ_ID_ddbj = NULL;
302 static AsnTypePtr SEQ_ID_pir = NULL;
303 static AsnTypePtr SEQ_ID_swissprot = NULL;
304 static AsnTypePtr PDB_BLOCK_compound_E = NULL;
305 static AsnTypePtr PDB_SEQ_ID_MOL = NULL;
306 static AsnTypePtr BIOSEQ_id = NULL;
307 static AsnTypePtr BIOSEQ_id_E = NULL;
308 static AsnTypePtr CIT_PAT_title = NULL;
309
310 static DocSumPtr NEAR CdEntSeqSumGet PROTO((Int4 uid, DocType type));
311
312 extern int _nouveau;
313
314 /*****************************************************************************
315 *
316 * CdEntrezInit ()
317 * Creates linked list of CdTerm nodes, creates temporary file for
318 * postings lists, saves file name in first node. When creating new
319 * nodes, posting file is appended to temporary file, node offset then
320 * points to temporary file location of posting information.
321 *
322 *****************************************************************************/
323
CdEntrezInit(Boolean no_warnings)324 NLM_EXTERN Boolean CdEntrezInit (Boolean no_warnings)
325
326 {
327 FILE *fp;
328 Char str [PATH_MAX];
329 Boolean inited = FALSE;
330 CharPtr prop;
331
332 #ifdef Biostruc_supported
333 objmmdb1AsnLoad ();
334 objmmdb2AsnLoad ();
335 objmmdb3AsnLoad ();
336 #endif
337 #ifdef _NEW_CdEntrez_
338 _nouveau = GetAppParamBoolean("ncbi","CdEntrez","NewStyle",TRUE);
339 if (_nouveau)
340 {
341 if (cd3_CdInit())
342 inited = TRUE;
343 else
344 ErrLogPrintf("cd3_CdInit() failed\n");
345 }
346 #endif
347 /* In the dual OLD/NEW case, go on to try CdInit if cd3_CdInit failed */
348 #ifdef _OLD_CdEntrez_
349 if (!inited)
350 {
351 if (CdInit())
352 {
353 inited = TRUE;
354 _nouveau = FALSE;
355 }
356 }
357 #endif
358 if (!inited)
359 return FALSE;
360
361 eset = MemNew (sizeof (CdTerm));
362 if (eset == NULL)
363 return FALSE;
364 eset->type = 255; /* set to not used */
365 TmpNam (str);
366 eset->term = StringSave (str);
367 #ifdef WIN_MAC
368 FileCreate (str, "????", "NCBI");
369 #endif
370 fp = FileOpen (str, "wb");
371 if (fp == NULL) {
372 ErrPostEx (SEV_ERROR, ERR_CD_FILEOPEN, 0, "Unable to open temporary file %s", str);
373 return FALSE;
374 }
375 FileClose (fp);
376 if ((prop = (CharPtr) GetAppProperty("CdEntrezMemUsage")) != NULL)
377 {
378 long tmplong;
379
380 sscanf(prop, "%ld", &tmplong);
381
382 cdMemUsage = tmplong;
383 } else {
384 cdMemUsage = DEF_CDENTREZ_MEMUSAGE;
385 }
386 cdMemUsage = MIN(cdMemUsage, MAXALLOC);
387 return TRUE;
388 }
389
390 /*****************************************************************************
391 *
392 * CdEntrezFini ()
393 * Frees linked list of CdTerm nodes and removes temporary posting file.
394 *
395 *****************************************************************************/
396
CdEntrezFini(void)397 NLM_EXTERN void CdEntrezFini (void)
398
399 {
400 CdTermPtr nxt;
401 Char temp [PATH_MAX];
402
403 if (eset != NULL) {
404 if (eset->term != NULL) {
405 StringCpy (temp, eset->term);
406 FileRemove (temp);
407 }
408 while (eset != NULL) {
409 nxt = eset->next;
410 CdTermFree (eset);
411 eset = nxt;
412 }
413 }
414 eset = NULL;
415
416 cachedExpr = CdTLExprFree(cachedExpr);
417 cachedBsp = BSFree(cachedBsp);
418
419 #ifdef _NEW_CdEntrez_
420 if (_nouveau)
421 cd3_CdFini();
422 #endif
423 #ifdef _OLD_CdEntrez_
424 if (!_nouveau)
425 CdFini();
426 #endif
427 _nouveau = FALSE;
428 }
429
430
431 /*****************************************************************************
432 *
433 * CdEntrezGetInfo ()
434 *
435 *****************************************************************************/
436
CdEntrezGetInfo(void)437 NLM_EXTERN EntrezInfo* CdEntrezGetInfo (void)
438 {
439 EntrezInfo *info = NULL;
440
441 #ifdef _NEW_CdEntrez_
442 if (_nouveau)
443 info = cd3_CdGetInfo();
444 #endif
445
446 #ifdef _OLD_CdEntrez_
447 if (!_nouveau)
448 info = CdGetInfo();
449 #endif
450
451 return info;
452 }
453
454 /*****************************************************************************
455 *
456 * CdEntrezDetailedInfo ()
457 *
458 *****************************************************************************/
459
CdEntrezDetailedInfo(void)460 NLM_EXTERN char* CdEntrezDetailedInfo (void)
461 {
462 char *info = NULL;
463
464 #ifdef _NEW_CdEntrez_
465 if (_nouveau)
466 info = cd3_CdDetailedInfo();
467 #endif
468
469 #ifdef _OLD_CdEntrez_
470 if (!_nouveau)
471 info = CdDetailedInfo();
472 #endif
473
474 return info;
475 }
476
477
478 /*****************************************************************************
479 *
480 * CdEntGetMaxLinks()
481 * returns max links in link set allowed by system
482 *
483 *****************************************************************************/
CdEntGetMaxLinks(void)484 NLM_EXTERN Int4 CdEntGetMaxLinks (void)
485
486 {
487 return (Int4)(INT_MAX / sizeof(DocUid));
488 }
489
490 /*****************************************************************************
491 *
492 * CdEntrezCreateNamedUidList(term, type, field, num, uids)
493 * Creates a term node in the entrez set structure if one does not
494 * yet exist, and loads the posting file from the uid parameter.
495 *
496 *****************************************************************************/
CdEntrezCreateNamedUidList(CharPtr term,DocType type,DocField field,Int4 num,DocUidPtr uids)497 NLM_EXTERN void CdEntrezCreateNamedUidList (CharPtr term, DocType type, DocField field, Int4 num, DocUidPtr uids)
498
499 {
500 Int4 count;
501 ByteStorePtr post;
502 Char str [256];
503
504 if (term != NULL && uids != NULL && num > 0 && num <= 16383) {
505 StringNCpy (str, term, sizeof (str) - 1);
506 post = BSNew (0);
507 if (post != NULL) {
508 count = (Int4) num;
509 QuickSortSmall (uids, 0, (Int4) (count - 1));
510 count = CompressSmall (uids, count);
511 count = UniqueSmall (uids, count);
512 BSWrite (post, uids, (Int4) (count * sizeof (DocUid)));
513 CdEntrezCreateTerm (str, type, field, NULL, post, NULL);
514 BSFree (post);
515 }
516 }
517 }
518
519 /*****************************************************************************
520 *
521 * CdEntrezCreateNamedUidListX(term, type, field, post)
522 * Creates a term node in the entrez set structure if one does not
523 * yet exist, and loads the posting file from the uid parameter.
524 *
525 *****************************************************************************/
CdEntrezCreateNamedUidListX(CharPtr term,DocType type,DocField field,ByteStorePtr bsp)526 NLM_EXTERN void CdEntrezCreateNamedUidListX (CharPtr term, DocType type, DocField field, ByteStorePtr bsp)
527
528 {
529 Int4 actual;
530 Int4 count;
531 ByteStorePtr post;
532 ByteStorePtr small;
533 Char str [256];
534 DocUidPtr uids;
535
536 if (term != NULL && bsp != NULL) {
537 StringNCpy (str, term, sizeof (str) - 1);
538 post = BSNew (0);
539 if (post != NULL) {
540 uids = MemNew (4096 * sizeof (DocUid));
541 BSSeek (bsp, 0L, 0);
542 actual = BSRead (bsp, uids, (Int4) (4096 * sizeof (DocUid)));
543 while (actual > 0) {
544 count = (Int4) actual;
545 QuickSortSmall (uids, 0, (Int4) (count - 1));
546 count = CompressSmall (uids, count);
547 count = UniqueSmall (uids, count);
548 if (count > 0) {
549 small = BSNew (0L);
550 if (small != NULL) {
551 BSWrite (small, uids, count * sizeof (DocUid));
552 post = MergePostingLists (post, small);
553 }
554 }
555 actual = BSRead (bsp, uids, (Int4) (4096 * sizeof (DocUid)));
556 }
557 CdEntrezCreateTerm (str, type, field, NULL, post, NULL);
558 MemFree (uids);
559 BSFree (post);
560 }
561 }
562 }
563
564 /*****************************************************************************
565 *
566 * CdEntTLNew (type)
567 * Creates linked list of asn nodes for constructing boolean query on
568 * terms. First node points to the EntrezSetNew-created structure that
569 * maps terms to posting lists. Remaining nodes contain symbols for AND,
570 * OR, LEFT PARENTHESIS, RIGHT PARENTHESIS, or a SPECIAL or TOTAL term
571 * specification. The term specification nodes point to a CdTerm node
572 * within the entrez set structure.
573 *
574 *****************************************************************************/
575
CdEntTLNew(DocType type)576 NLM_EXTERN ValNodePtr CdEntTLNew (DocType type)
577
578 {
579 ValNodePtr anp;
580
581 anp = NULL;
582 if (eset != NULL) {
583 anp = ValNodeNew (NULL);
584 if (anp != NULL) {
585 anp->choice = NULLSYM;
586 anp->data.ptrvalue = (Pointer) eset;
587 eset->type = type;
588 }
589 }
590 return anp;
591 }
592
593 /*****************************************************************************
594 *
595 * CdEntTLAddTerm (elst, term, type, field, special, highRange)
596 * Adds a term node to a boolean algebraic term query.
597 *
598 *****************************************************************************/
599
CdEntTLAddTerm(ValNodePtr elst,CharPtr term,DocType type,DocField field,Boolean special,CharPtr highRange)600 NLM_EXTERN ValNodePtr CdEntTLAddTerm (ValNodePtr elst, CharPtr term, DocType type, DocField field, Boolean special, CharPtr highRange)
601
602 {
603 ValNodePtr anp;
604 CdTermPtr trmp;
605
606 anp = NULL;
607 if (eset != NULL && elst != NULL) {
608 if (type != eset->type) /* mixed databases */
609 return NULL;
610 anp = ValNodeNew (elst);
611 if (anp != NULL) {
612 if (special) {
613 anp->choice = SPECIALTERM;
614 } else {
615 anp->choice = TOTALTERM;
616 }
617 trmp = FindTermNode (term, type, field, highRange);
618 anp->data.ptrvalue = (Pointer) trmp;
619 }
620 }
621 return anp;
622 }
623
624 /*****************************************************************************
625 *
626 * CdEntTLFree (elst)
627 * Frees a boolean algebraic term query list.
628 *
629 *****************************************************************************/
630
CdEntTLFree(ValNodePtr elst)631 NLM_EXTERN ValNodePtr CdEntTLFree (ValNodePtr elst)
632
633 {
634 if (elst != NULL) {
635 ValNodeFree (elst);
636 eset->type = 255; /* set to nothing */
637 }
638 return NULL;
639 }
640
641 /*****************************************************************************
642 *
643 * CdTLExprFree(elst)
644 *
645 * Free the CdEntrez-style expression, including all of its subordinate terms
646 ****************************************************************************/
CdTLExprFree(ValNodePtr elst)647 static ValNodePtr CdTLExprFree(ValNodePtr elst)
648 {
649 ValNodePtr np;
650 CdTermPtr tp;
651
652 for (np = elst; np != NULL; np = np->next) {
653 switch (np->choice) {
654 case SPECIALTERM:
655 case TOTALTERM:
656 if ((tp = np->data.ptrvalue) != NULL) {
657 MemFree (tp->term);
658 MemFree (tp->highRange);
659 MemFree (tp);
660 }
661 break;
662 default:
663 break;
664 }
665 }
666
667 ValNodeFree(elst);
668
669 return NULL;
670 }
671
672
673 /*****************************************************************************
674 *
675 * CdDupExpr(elst)
676 *
677 * Duplicate the input CdEntrez-style expression
678 ****************************************************************************/
CdDupExpr(ValNodePtr elst)679 static ValNodePtr CdDupExpr(ValNodePtr elst)
680 {
681 ValNodePtr dup = NULL;
682 ValNodePtr trailing = NULL;
683 ValNodePtr np;
684 CdTermPtr tp1, tp2;
685
686 for (; elst != NULL; elst = elst->next) {
687 np = ValNodeNew(NULL);
688 if (dup == NULL)
689 dup = np;
690 if (trailing != NULL)
691 trailing->next = np;
692 trailing = np;
693 np->choice = elst->choice;
694 switch (elst->choice) {
695 case SPECIALTERM:
696 case TOTALTERM:
697 tp2 = elst->data.ptrvalue;
698 if (tp2 != NULL)
699 {
700 tp1 = MemNew(sizeof(*tp1));
701 np->data.ptrvalue = tp1;
702 tp1->type = tp2->type;
703 tp1->field = tp2->field;
704 tp1->term = StringSave(tp2->term);
705 tp1->highRange = StringSave(tp2->highRange);
706 }
707 break;
708 default:
709 break;
710 }
711 }
712
713 return dup;
714 }
715
716 static Boolean
EqualTerms(CharPtr x,CharPtr y)717 EqualTerms (CharPtr x, CharPtr y)
718 {
719 if (x == NULL && y == NULL)
720 return TRUE;
721 if (x == NULL || y == NULL)
722 return FALSE;
723 return (StringICmp(x,y) == 0);
724 }
725
726
727 /*****************************************************************************
728 *
729 * CdEntTLExprEqual (elst1, elst2)
730 *
731 * Determine whether two CdEntrez-style boolean expressions are equal
732 ****************************************************************************/
733
734 static Boolean
CdTLExprEqual(ValNodePtr elst1,ValNodePtr elst2)735 CdTLExprEqual (ValNodePtr elst1, ValNodePtr elst2)
736 {
737 Boolean equal = TRUE;
738 CdTermPtr c1, c2;
739
740 for (; elst1 != NULL && elst2 != NULL && equal; elst1 = elst1->next,
741 elst2 = elst2->next) {
742 if (elst1->choice == elst2->choice) {
743 switch (elst1->choice) {
744 case SPECIALTERM:
745 case TOTALTERM:
746 c1 = elst1->data.ptrvalue;
747 c2 = elst2->data.ptrvalue;
748 equal = c1 != NULL && c2 != NULL && c1->type == c2->type &&
749 c1->field == c2->field && EqualTerms(c1->term, c2->term) &&
750 EqualTerms(c1->highRange, c2->highRange);
751 break;
752 default:
753 break;
754 }
755 } else {
756 equal = FALSE;
757 }
758 }
759
760 return elst1 == NULL && elst2 == NULL && equal;
761 }
762
763
764 /*****************************************************************************
765 *
766 * CdEntTLEvalCount (elst)
767 * Evaluates a boolean algebraic term query list, returning the
768 * count of resulting UIDs.
769 *
770 *****************************************************************************/
771
CdEntTLEvalCount(ValNodePtr elst)772 NLM_EXTERN Int4 CdEntTLEvalCount (ValNodePtr elst)
773 {
774 ByteStorePtr bsp;
775 Int4 len;
776
777 len = 0;
778 bsp = CdEntTLEvalX(elst);
779 if (bsp != NULL) {
780 len = BSLen(bsp) / sizeof(DocUid);
781 BSFree (bsp);
782 }
783 return len;
784 }
785
786
787 /*****************************************************************************
788 *
789 * CdEntTLEvalX (elst)
790 * Evaluates a boolean algebraic term query list, returning a pointer to
791 * a ByteStore containing the resultant unique identifiers. The number
792 * of UIDs is calculated as BSLen (bsp) / sizeof (DocUid).
793 *
794 *****************************************************************************/
795
CdEntTLEvalX(ValNodePtr elst)796 NLM_EXTERN ByteStorePtr CdEntTLEvalX (ValNodePtr elst)
797
798 {
799 ByteStorePtr bsp;
800
801 bsp = NULL;
802 if (eset != NULL && elst != NULL) {
803 if (cachedExpr != NULL && CdTLExprEqual(elst, cachedExpr)) {
804 BSSeek(cachedBsp, 0L, SEEK_SET);
805 bsp = BSDup (cachedBsp);
806 } else {
807 cachedExpr = CdTLExprFree(cachedExpr);
808 cachedExpr = CdDupExpr(elst);
809 cachedBsp = BSFree(cachedBsp);
810
811 currNode = elst;
812 currChoice = NULLSYM;
813 NextNode ();
814 if (eset->term != NULL && currNode != NULL) {
815 bsp = Expression ();
816 BSSeek(bsp, 0L, SEEK_SET);
817 cachedBsp = BSDup(bsp);
818 }
819
820 }
821 }
822 return bsp;
823 }
824
825 /*****************************************************************************
826 *
827 * CdEntTLEval (elst)
828 * Evaluates a boolean algebraic term query list, returning a pointer to
829 * a LinkSet containing the resultant unique identifiers.
830 *
831 *****************************************************************************/
832
CdEntTLEval(ValNodePtr elst)833 NLM_EXTERN LinkSetPtr CdEntTLEval (ValNodePtr elst)
834
835 {
836 ByteStorePtr bsp;
837 LinkSetPtr lsp = NULL;
838 Int4 numlinks;
839
840 bsp = CdEntTLEvalX (elst);
841 if (bsp != NULL)
842 {
843 numlinks = BSLen(bsp) / sizeof(DocUid);
844 lsp = LinkSetNew();
845 lsp->num = numlinks;
846 if (numlinks <= CdEntGetMaxLinks())
847 {
848 lsp->uids = MemNew((size_t)(numlinks * sizeof(DocUid)));
849 BSSeek (bsp, 0L, 0);
850 BSRead(bsp, lsp->uids, (numlinks * sizeof(DocUid)));
851 }
852 BSFree(bsp);
853 }
854 return lsp;
855 }
856
857 /*****************************************************************************
858 *
859 * DocSumPtr CdDocSum(type, uid)
860 *
861 *****************************************************************************/
CdDocSum(DocType type,DocUid uid)862 NLM_EXTERN DocSumPtr CdDocSum (DocType type, DocUid uid)
863
864 {
865 DocSum *sum = NULL;
866
867 #ifdef _NEW_CdEntrez_
868 if (_nouveau)
869 {
870 sum = CdGetDocSum(type,uid);
871 }
872 #endif
873
874 #ifdef _OLD_CdEntrez_
875 if (!_nouveau)
876 {
877 if (type == TYP_ML)
878 sum = CdEntMlSumGet(uid);
879 else
880 sum = CdEntSeqSumGet(uid, type);
881 }
882 #endif
883
884 return sum;
885 }
886
887
#ifdef _NEW_CdEntrez_
/*
 * CdDocSumListGet (result, numuid, type, uids)
 *   Fetches a document summary (CdGetDocSum) for each of the numuid UIDs
 *   and stores the non-NULL ones contiguously at the front of result.
 *   Returns how many summaries were stored.  When the last fetch fails,
 *   the slot just past the stored summaries has been set to NULL.
 */
NLM_EXTERN int CdDocSumListGet PROTO((DocSum **result, int numuid, DocType type, const DocUid *uids))
{
  int  idx;
  int  stored = 0;

  ASSERT(result != NULL);
  ASSERT(uids != NULL);

  for (idx = 0; idx < numuid; idx++)
  {
    /* write into the next free slot; keep it only if the fetch succeeded */
    result [stored] = CdGetDocSum (type, uids [idx]);
    if (result [stored] != NULL)
      stored++;
  }
  return stored;
}
#endif
909
910
911
912 /*****************************************************************************
913 *
914 * CdLinkUidList(type, link_to_type, numuid, uids)
915 * returns count of input uids processed
916 * returns -1 on error
917 * if neighbors (type == link_to_type)
918 * sums weights for same uids
919 * if (more than EntrezUserMaxLinks() uids, frees uids and weights,
920 * but leaves num set)
921 *
922 *****************************************************************************/
CdLinkUidList(LinkSetPtr PNTR result,DocType type,DocType link_to_type,Int2 numuid,Int4Ptr uids,Boolean mark_missing)923 NLM_EXTERN Int2 CdLinkUidList (LinkSetPtr PNTR result, DocType type, DocType link_to_type, Int2 numuid, Int4Ptr uids, Boolean mark_missing)
924 {
925 Int4 max_links = CdEntGetMaxLinks();
926 Int4 count;
927
928 #ifdef _NEW_CdEntrez_
929 if (_nouveau)
930 count = cd3_CdLinkUidGet(result,type,link_to_type,numuid,uids,mark_missing,max_links);
931 #endif
932
933 #ifdef _OLD_CdEntrez_
934 if (!_nouveau)
935 count = CdLinkUidGet(result,type,link_to_type,numuid,uids,mark_missing,max_links);
936 #endif
937
938 return count;
939 }
940
941 /*****************************************************************************
942 *
943 * CdUidLinks()
944 * retrieves links to other uids
945 *
946 *****************************************************************************/
CdUidLinks(DocType type,DocUid uid,DocType link_to_type)947 NLM_EXTERN LinkSetPtr CdUidLinks (DocType type, DocUid uid, DocType link_to_type)
948 {
949 LinkSetPtr lsp = NULL;
950 DocUid u = uid;
951
952 #ifdef _NEW_CdEntrez_
953 if (_nouveau)
954 cd3_CdLinkUidGet(&lsp,type,link_to_type,1,&u,FALSE,CdEntGetMaxLinks());
955 #endif
956 #ifdef _OLD_CdEntrez_
957 if (!_nouveau)
958 CdLinkUidGet(&lsp,type,link_to_type,1,&u,FALSE,CdEntGetMaxLinks());
959 #endif
960
961 return lsp;
962 }
963
964 static Boolean TermListPageScanProc PROTO((CdTermPtr trmptr));
965 static Boolean TermListTermScanProc PROTO((CdTermPtr trmptr));
966 static TermListProc trmproc;
967 static Int4 trmcount;
968 static Int4 trmmax;
969 static Boolean trmfound;
970 static Char trmfirst [80];
971 static Int4 the_first_page;
972
973 /*****************************************************************************
974 *
975 * CdTermListByPage (type, field, page, numpage, proc)
976 * Gets terms starting at page, for numpage, by calling proc
977 * returns number of complete pages read
978 *
979 *****************************************************************************/
CdTermListByPage(DocType type,DocField field,Int2 page,Int2 numpage,TermListProc proc)980 NLM_EXTERN Int2 CdTermListByPage (DocType type, DocField field, Int2 page, Int2 numpage, TermListProc proc)
981
982 {
983 trmproc = proc;
984 if (trmproc != NULL) {
985 #ifdef _NEW_CdEntrez_
986 if (_nouveau)
987 return cd3_CdTermScan(type, field, page, numpage, TermListPageScanProc);
988 #endif
989 #ifdef _OLD_CdEntrez_
990 if (!_nouveau)
991 return CdTermScan(type, field, page, numpage, TermListPageScanProc);
992 #endif
993 } else {
994 return 0;
995 }
996
997 return 0;
998 }
999
1000 /*****************************************************************************
1001 *
1002 * CdTermListByTerm (type, field, term, numterms, proc, first_page)
1003 * Gets Terms starting with at term
1004 * returns number of complete pages read
1005 * sets first_page to first page read
1006 *
1007 *****************************************************************************/
CdTermListByTerm(DocType type,DocField field,CharPtr term,Int2 numterms,TermListProc proc,Int2Ptr first_page)1008 NLM_EXTERN Int2 CdTermListByTerm (DocType type, DocField field, CharPtr term, Int2 numterms, TermListProc proc, Int2Ptr first_page)
1009
1010 {
1011 Int4 first;
1012 Int4 rsult;
1013
1014 rsult = 0;
1015 #ifdef _NEW_CdEntrez_
1016 if (_nouveau)
1017 first = cd3_CdTrmLookup(type, field, term);
1018 #endif
1019 #ifdef _OLD_CdEntrez_
1020 if (!_nouveau)
1021 first = CdTrmLookup(type, field, term);
1022 #endif
1023 the_first_page = first;
1024 trmproc = proc;
1025 trmcount = 0;
1026 if (numterms > 0) {
1027 trmmax = numterms;
1028 } else {
1029 trmmax = INT2_MAX;
1030 }
1031 trmfound = FALSE;
1032 StringNCpy (trmfirst, term, sizeof (trmfirst) - 1);
1033 if (trmproc != NULL) {
1034 #ifdef _NEW_CdEntrez_
1035 if (_nouveau)
1036 rsult = cd3_CdTermScan(type,field,first,0,TermListTermScanProc);
1037 #endif
1038 #ifdef _OLD_CdEntrez_
1039 if (!_nouveau)
1040 rsult = CdTermScan(type,field,first,0,TermListTermScanProc);
1041 #endif
1042 }
1043 if (first_page != NULL) {
1044 *first_page = the_first_page;
1045 }
1046 return rsult;
1047 }
1048
1049 /*****************************************************************************
1050 *
1051 * TermListPageScanProc(trmptr)
1052 * Callback for CdTermListByPage
1053 *
1054 *****************************************************************************/
TermListPageScanProc(CdTermPtr trmptr)1055 static Boolean TermListPageScanProc(CdTermPtr trmptr)
1056 {
1057 Boolean ret = trmproc(trmptr->term,
1058 trmptr->special_count, trmptr->total_count);
1059 MemFree(trmptr);
1060 return ret;
1061 }
1062
1063 /*****************************************************************************
1064 *
1065 * TermListTermScanProc(trmptr)
1066 * Callback for CdTermListByTerm
1067 *
1068 *****************************************************************************/
TermListTermScanProc(CdTermPtr trmptr)1069 static Boolean TermListTermScanProc(CdTermPtr trmptr)
1070 {
1071 Boolean ret = TRUE;
1072 if (! trmfound) {
1073 if (MeshStringICmp (trmptr->term, trmfirst) >= 0) {
1074 trmfound = TRUE;
1075 the_first_page = trmptr->page;
1076 }
1077 }
1078 if (trmfound) {
1079 ret = trmproc(trmptr->term, trmptr->special_count, trmptr->total_count);
1080 trmcount++;
1081 } else {
1082 MemFree (trmptr->term);
1083 }
1084 MemFree(trmptr);
1085 return (ret && trmcount < trmmax);
1086 }
1087
1088 /*****************************************************************************
1089 *
1090 * CdEntrezFindTerm(type, field, term, spec, total)
1091 * returns count of special and total for a term
1092 * if term ends with "...", does a truncated merge of the term
1093 * if term contains '*' or '?', does a wild card merge
1094 *
1095 *****************************************************************************/
CdEntrezFindTerm(DocType type,DocField field,CharPtr term,Int4Ptr spcl,Int4Ptr totl)1096 NLM_EXTERN Boolean CdEntrezFindTerm (DocType type, DocField field, CharPtr term, Int4Ptr spcl, Int4Ptr totl)
1097
1098 {
1099 CharPtr tmp;
1100 CdTermPtr ctp;
1101
1102 tmp = term;
1103 while (*tmp != '\0')
1104 tmp++;
1105 tmp -= 3;
1106 if ((*tmp == '.') && (*(tmp+1) == '.') && (*(tmp+2) == '.')) {
1107 return CdEntrezMergeTerm (type, field, term, spcl, totl, NULL);
1108 } else if (StringChr (term, '*') != NULL || StringChr (term, '?') != NULL) {
1109 return CdEntrezMergeTerm (type, field, term, spcl, totl, WildCardProc);
1110 } else {
1111 #ifdef _NEW_CdEntrez_
1112 if (_nouveau)
1113 ctp = cd3_CdTrmFind(type,field,term);
1114 #endif
1115 #ifdef _OLD_CdEntrez_
1116 if (!_nouveau)
1117 ctp = CdTrmFind(type,field,term);
1118 #endif
1119 if (ctp == NULL)
1120 return FALSE;
1121 *spcl = ctp->special_count;
1122 *totl = ctp->total_count;
1123 CdTermFree(ctp);
1124 return TRUE;
1125 }
1126 }
1127
1128
1129 /*****************************************************************************
1130 *
1131 * CdTermFree(trmp)
1132 * frees a CdTerm structure
1133 *
1134 *****************************************************************************/
1135
CdTermFree(CdTermPtr trmp)1136 NLM_EXTERN CdTermPtr CdTermFree (CdTermPtr trmp)
1137
1138 {
1139 if (trmp == NULL)
1140 return NULL;
1141 if (trmp->term != NULL)
1142 MemFree (trmp->term);
1143 if (trmp->highRange != NULL)
1144 MemFree (trmp->highRange);
1145 return (CdTermPtr) MemFree(trmp);
1146 }
1147
1148
1149
1150
1151 /*****************************************************************************
1152 *
1153 * Below are static functions local to this module
1154 * ===============================================
1155 *
1156 *****************************************************************************/
1157
1158 /*****************************************************************************
1159 *
1160 * Functions to manipulate Boolean lists
1161 *
1162 *****************************************************************************/
1163
1164 /*****************************************************************************
1165 *
1166 * NextNode ()
1167 * Advances to the next node in a term query list.
1168 *
1169 *****************************************************************************/
1170
NextNode(void)1171 static void NEAR NextNode (void)
1172
1173 {
1174 if (currNode != NULL) {
1175 currNode = currNode->next;
1176 if (currNode != NULL) {
1177 currChoice = currNode->choice;
1178 } else {
1179 currChoice = NULLSYM;
1180 }
1181 } else {
1182 currChoice = NULLSYM;
1183 }
1184 }
1185
1186 /*****************************************************************************
1187 *
1188 * Factor ()
1189 * Processes individual term nodes or parenthetical expressions in a
1190 * term query list.
1191 *
1192 *****************************************************************************/
1193
Factor(void)1194 static ByteStorePtr NEAR Factor (void)
1195
1196 {
1197 ByteStorePtr bsp;
1198 FILE *fp;
1199 CdTermPtr trmp;
1200
1201 bsp = NULL;
1202 if (currChoice == LPAREN) {
1203 NextNode ();
1204 bsp = Expression ();
1205 if (currChoice != RPAREN) {
1206 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "Expected right parenthesis");
1207 } else {
1208 NextNode ();
1209 }
1210 } else if (currChoice == SPECIALTERM || currChoice == TOTALTERM) {
1211 if (currNode != NULL) {
1212 trmp = currNode->data.ptrvalue;
1213 if (trmp != NULL) {
1214 fp = FileOpen (eset->term, "rb");
1215 if (fp != NULL) {
1216 fseek (fp, trmp->offset, SEEK_SET);
1217 if (currChoice == SPECIALTERM) {
1218 bsp = LoadPostingList (fp, trmp->special_count, trmp->special_count);
1219 } else if (currChoice == TOTALTERM) {
1220 bsp = LoadPostingList (fp, trmp->special_count, trmp->total_count);
1221 }
1222 FileClose (fp);
1223 }
1224 }
1225 }
1226 NextNode ();
1227 } else {
1228 NextNode ();
1229 }
1230 return bsp;
1231 }
1232
1233 /*****************************************************************************
1234 *
1235 * Term ()
1236 * Processes strings of ANDed term nodes in a term query list.
1237 *
1238 *****************************************************************************/
1239
Term(void)1240 static ByteStorePtr NEAR Term (void)
1241
1242 {
1243 ByteStorePtr bsp;
1244 ByteStorePtr fct;
1245
1246 bsp = Factor ();
1247 while (currChoice == ANDSYMBL) {
1248 NextNode ();
1249 fct = Factor ();
1250 bsp = IntersectPostingLists (bsp, fct);
1251 }
1252 return bsp;
1253 }
1254
1255 /*****************************************************************************
1256 *
1257 * Diff ()
1258 * Processes strings of ORed term nodes in a term query list.
1259 *
1260 *****************************************************************************/
1261
Diff(void)1262 static ByteStorePtr NEAR Diff (void)
1263
1264 {
1265 ByteStorePtr bsp;
1266 ByteStorePtr trm;
1267
1268 bsp = Term ();
1269 while (currChoice == ORSYMBL) {
1270 NextNode ();
1271 trm = Term ();
1272 bsp = MergePostingLists (bsp, trm);
1273 }
1274 return bsp;
1275 }
1276
1277
1278 /*****************************************************************************
1279 *
1280 * Expression ()
1281 * Processes strings of BUTNOTed term nodes in a term query list.
1282 *
1283 *****************************************************************************/
1284
Expression(void)1285 static ByteStorePtr NEAR Expression (void)
1286
1287 {
1288 ByteStorePtr bsp;
1289 ByteStorePtr trm;
1290
1291 bsp = Diff ();
1292 while (currChoice == BUTNOTSYMBL) {
1293 NextNode ();
1294 trm = Diff ();
1295 bsp = DifferencePostingLists (bsp, trm);
1296 }
1297 return bsp;
1298 }
1299
1300
1301 /*****************************************************************************
1302 *
1303 * Low level functions to manipulate postings lists.
1304 *
1305 *****************************************************************************/
1306
NewPost(ByteStorePtr lst,Int4 defsize)1307 static PostingPtr NEAR NewPost (ByteStorePtr lst, Int4 defsize)
1308
1309 {
1310 PostingPtr pst;
1311
1312 pst = NULL;
1313 if (lst != NULL) {
1314 pst = MemNew (sizeof (Posting));
1315 if (pst != NULL) {
1316 pst->uids = lst;
1317 pst->buffer = NULL;
1318 if (defsize == 0) {
1319 pst->bufsize = (Int4) MIN (16384L, BSLen (lst));
1320 } else {
1321 pst->bufsize = (Int4) MIN (16384L, defsize);
1322 }
1323 pst->count = 0;
1324 pst->index = 0;
1325 }
1326 }
1327 return pst;
1328 }
1329
FreePost(PostingPtr pst)1330 static PostingPtr NEAR FreePost (PostingPtr pst)
1331
1332 {
1333 if (pst != NULL) {
1334 if (pst->uids != NULL) {
1335 BSFree (pst->uids);
1336 }
1337 if (pst->buffer != NULL) {
1338 MemFree (pst->buffer);
1339 }
1340 MemFree (pst);
1341 }
1342 return NULL;
1343 }
1344
PostLength(PostingPtr pst)1345 static Int4 NEAR PostLength (PostingPtr pst)
1346
1347 {
1348 Int4 k;
1349
1350 k = 0;
1351 if (pst != NULL) {
1352 k = (Int4) (BSLen (pst->uids) / (Int4) sizeof (DocUid));
1353 }
1354 return k;
1355 }
1356
RewindPost(PostingPtr pst)1357 static void NEAR RewindPost (PostingPtr pst)
1358
1359 {
1360 if (pst != NULL) {
1361 if (pst->uids != NULL) {
1362 BSSeek (pst->uids, 0L, 0);
1363 }
1364 pst->count = 0;
1365 pst->index = 0;
1366 }
1367 }
1368
ReadItem(PostingPtr pst)1369 static DocUid NEAR ReadItem (PostingPtr pst)
1370
1371 {
1372 DocUid rsult;
1373
1374 rsult = INT4_MAX;
1375 if (pst != NULL && pst->uids != NULL) {
1376 if (pst->buffer == NULL) {
1377 pst->buffer = MemNew ((size_t) pst->bufsize);
1378 pst->count = 0;
1379 pst->index = 0;
1380 }
1381 if (pst->count <= 0) {
1382 pst->count = (Int4) BSRead (pst->uids, pst->buffer, pst->bufsize);
1383 pst->index = 0;
1384 }
1385 if (pst->count > 0) {
1386 rsult = pst->buffer [pst->index];
1387 (pst->index)++;
1388 (pst->count) -= sizeof (DocUid);
1389 }
1390 }
1391 return rsult;
1392 }
1393
WriteItem(PostingPtr pst,DocUid value)1394 static void NEAR WriteItem (PostingPtr pst, DocUid value)
1395
1396 {
1397 if (pst != NULL && pst->uids != NULL) {
1398 if (pst->buffer == NULL) {
1399 pst->buffer = MemNew ((size_t) pst->bufsize);
1400 pst->count = 0;
1401 pst->index = 0;
1402 }
1403 pst->buffer [pst->index] = value;
1404 (pst->index)++;
1405 (pst->count) += sizeof (DocUid);
1406 if (pst->count >= pst->bufsize) {
1407 BSWrite (pst->uids, pst->buffer, pst->count);
1408 pst->count = 0;
1409 pst->index = 0;
1410 }
1411 }
1412 }
1413
FlushItems(PostingPtr pst)1414 static void NEAR FlushItems (PostingPtr pst)
1415
1416 {
1417 if (pst != NULL && pst->uids != NULL && pst->buffer != NULL) {
1418 BSWrite (pst->uids, pst->buffer, pst->count);
1419 if (pst->buffer != NULL) {
1420 pst->buffer = MemFree (pst->buffer);
1421 }
1422 pst->count = 0;
1423 pst->index = 0;
1424 }
1425 }
1426
MergePostingLists(ByteStorePtr lst1,ByteStorePtr lst2)1427 static ByteStorePtr NEAR MergePostingLists (ByteStorePtr lst1, ByteStorePtr lst2)
1428
1429 {
1430 PostingPtr buf1;
1431 PostingPtr buf2;
1432 PostingPtr buf3;
1433 Int4 k;
1434 Int4 k1;
1435 Int4 k2;
1436 DocUid pstar;
1437 DocUid qstar;
1438 ByteStorePtr rsult;
1439
1440 ProgMon ("MergePostingLists");
1441 rsult = NULL;
1442 if (lst1 != NULL && lst2 != NULL) {
1443 buf1 = NewPost (lst1, 0);
1444 buf2 = NewPost (lst2, 0);
1445 k1 = PostLength (buf1);
1446 k2 = PostLength (buf2);
1447 k = k1 + k2;
1448 rsult = BSNew (k * sizeof (DocUid));
1449 buf3 = NewPost (rsult, k * (Int4) sizeof (DocUid));
1450 if (rsult != NULL && buf1 != NULL && buf2 != NULL && buf3 != NULL) {
1451 RewindPost (buf1);
1452 RewindPost (buf2);
1453 pstar = ReadItem (buf1);
1454 qstar = ReadItem (buf2);
1455 while (k > 0) {
1456 if (pstar < qstar) {
1457 WriteItem (buf3, pstar);
1458 k--;
1459 pstar = ReadItem (buf1);
1460 } else if (qstar < pstar) {
1461 WriteItem (buf3, qstar);
1462 k--;
1463 qstar = ReadItem (buf2);
1464 } else {
1465 WriteItem (buf3, pstar);
1466 k -= 2;
1467 pstar = ReadItem (buf1);
1468 qstar = ReadItem (buf2);
1469 }
1470 }
1471 FlushItems (buf3);
1472 } else {
1473 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "List is too large to merge");
1474 }
1475 if (buf1 != NULL) {
1476 FreePost (buf1);
1477 }
1478 if (buf2 != NULL) {
1479 FreePost (buf2);
1480 }
1481 if (buf3 != NULL) {
1482 buf3->uids = NULL;
1483 FreePost (buf3);
1484 }
1485 } else if (lst1 != NULL) {
1486 rsult = lst1;
1487 } else if (lst2 != NULL) {
1488 rsult = lst2;
1489 }
1490 return rsult;
1491 }
1492
IntersectPostingLists(ByteStorePtr lst1,ByteStorePtr lst2)1493 static ByteStorePtr NEAR IntersectPostingLists (ByteStorePtr lst1, ByteStorePtr lst2)
1494
1495 {
1496 PostingPtr buf1;
1497 PostingPtr buf2;
1498 PostingPtr buf3;
1499 Int4 k;
1500 Int4 k1;
1501 Int4 k2;
1502 DocUid pstar;
1503 DocUid qstar;
1504 ByteStorePtr rsult;
1505
1506 ProgMon ("UnionPostingLists");
1507 rsult = NULL;
1508 if (lst1 != NULL && lst2 != NULL) {
1509 buf1 = NewPost (lst1, 0);
1510 buf2 = NewPost (lst2, 0);
1511 k1 = PostLength (buf1);
1512 k2 = PostLength (buf2);
1513 k = MIN (k1, k2);
1514 rsult = BSNew (k * sizeof (DocUid));
1515 buf3 = NewPost (rsult, k * (Int4) sizeof (DocUid));
1516 if (rsult != NULL && buf1 != NULL && buf2 != NULL && buf3 != NULL) {
1517 RewindPost (buf1);
1518 RewindPost (buf2);
1519 pstar = ReadItem (buf1);
1520 qstar = ReadItem (buf2);
1521 while (k1 > 0 && k2 > 0) {
1522 if (pstar < qstar) {
1523 k1--;
1524 pstar = ReadItem (buf1);
1525 } else if (qstar < pstar) {
1526 k2--;
1527 qstar = ReadItem (buf2);
1528 } else {
1529 WriteItem (buf3, pstar);
1530 k1--;
1531 k2--;
1532 pstar = ReadItem (buf1);
1533 qstar = ReadItem (buf2);
1534 }
1535 }
1536 FlushItems (buf3);
1537 } else {
1538 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "List is too large to intersect");
1539 }
1540 if (buf1 != NULL) {
1541 FreePost (buf1);
1542 }
1543 if (buf2 != NULL) {
1544 FreePost (buf2);
1545 }
1546 if (buf3 != NULL) {
1547 buf3->uids = NULL;
1548 FreePost (buf3);
1549 }
1550 }
1551 return rsult;
1552 }
1553
DifferencePostingLists(ByteStorePtr lst1,ByteStorePtr lst2)1554 static ByteStorePtr NEAR DifferencePostingLists (ByteStorePtr lst1, ByteStorePtr lst2)
1555
1556 {
1557 PostingPtr buf1;
1558 PostingPtr buf2;
1559 PostingPtr buf3;
1560 Int4 k;
1561 Int4 k1;
1562 Int4 k2;
1563 DocUid pstar;
1564 DocUid qstar;
1565 ByteStorePtr rsult;
1566
1567 ProgMon ("DiffPostingLists");
1568 rsult = NULL;
1569 if (lst1 != NULL && lst2 != NULL) {
1570 buf1 = NewPost (lst1, 0);
1571 buf2 = NewPost (lst2, 0);
1572 k1 = PostLength (buf1);
1573 k2 = PostLength (buf2);
1574 k = k1 + k2;
1575 rsult = BSNew (k * sizeof (DocUid));
1576 buf3 = NewPost (rsult, k * (Int4) sizeof (DocUid));
1577 if (rsult != NULL && buf1 != NULL && buf2 != NULL && buf3 != NULL) {
1578 RewindPost (buf1);
1579 RewindPost (buf2);
1580 pstar = ReadItem (buf1);
1581 qstar = ReadItem (buf2);
1582 while (k > 0) {
1583 if (pstar < qstar) {
1584 WriteItem (buf3, pstar);
1585 k--;
1586 pstar = ReadItem (buf1);
1587 } else if (qstar < pstar) {
1588 k--;
1589 qstar = ReadItem (buf2);
1590 } else {
1591 k -= 2;
1592 pstar = ReadItem (buf1);
1593 qstar = ReadItem (buf2);
1594 }
1595 }
1596 FlushItems (buf3);
1597 } else {
1598 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "List is too large to difference");
1599 }
1600 if (buf1 != NULL) {
1601 FreePost (buf1);
1602 }
1603 if (buf2 != NULL) {
1604 FreePost (buf2);
1605 }
1606 if (buf3 != NULL) {
1607 buf3->uids = NULL;
1608 FreePost (buf3);
1609 }
1610 } else if (lst1 != NULL) {
1611 rsult = lst1;
1612 }
1613 return rsult;
1614 }
1615
FreePostingList(ByteStorePtr lst)1616 static ByteStorePtr NEAR FreePostingList (ByteStorePtr lst)
1617
1618 {
1619 if (lst != NULL) {
1620 BSFree (lst);
1621 }
1622 return NULL;
1623 }
1624
LoadPostingList(FILE * f,Int4 special,Int4 total)1625 static ByteStorePtr NEAR LoadPostingList (FILE *f, Int4 special, Int4 total)
1626
1627 {
1628 VoidPtr bufr;
1629 Int4 cnt;
1630 Int4 cntr;
1631 Int4 k1;
1632 Int4 k2;
1633 ByteStorePtr lst1;
1634 ByteStorePtr lst2;
1635 ByteStorePtr rsult;
1636
1637 rsult = NULL;
1638 if (f != NULL && special >= 0 && total >= 0) {
1639 bufr = MemNew (MAX_CDENTREZ_BYTESTORE * sizeof (DocUid));
1640 if (bufr != NULL) {
1641 k1 = special;
1642 k2 = total - special;
1643 lst1 = BSNew (k1 * sizeof (DocUid));
1644 if (lst1 != NULL) {
1645 cntr = k1;
1646 cnt = MIN (k1, (long) MAX_CDENTREZ_BYTESTORE);
1647 while (cnt > 0) {
1648 FileRead (bufr, sizeof (DocUid), (size_t) cnt, f);
1649 BSWrite (lst1, bufr, cnt * sizeof (DocUid));
1650 cntr -= cnt;
1651 cnt = MIN (cntr, (long) MAX_CDENTREZ_BYTESTORE);
1652 }
1653 } else {
1654 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "List is too large to load");
1655 }
1656 lst2 = BSNew (k2 * sizeof (DocUid));
1657 if (lst2 != NULL) {
1658 cntr = k2;
1659 cnt = MIN (k2, (long) MAX_CDENTREZ_BYTESTORE);
1660 while (cnt > 0) {
1661 FileRead (bufr, sizeof (DocUid), (size_t) cnt, f);
1662 BSWrite (lst2, bufr, cnt * sizeof (DocUid));
1663 cntr -= cnt;
1664 cnt = MIN (cntr, (long) MAX_CDENTREZ_BYTESTORE);
1665 }
1666 } else {
1667 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "List is too large to load");
1668 }
1669 rsult = MergePostingLists (lst1, lst2);
1670 }
1671 MemFree (bufr);
1672 }
1673 return rsult;
1674 }
1675
/*****************************************************************************
*
*   CdEntrezCreateTerm (term, type, field, special, remainder, highRange)
*       Creates a CdTerm node in the entrez set structure if one does not yet
*       exist, and loads the posting file from two ByteStorePtr posting lists.
*       (Defined below, after its SavePostingList helper.)
*
*****************************************************************************/
1683
SavePostingList(FILE * f,ByteStorePtr bsp)1684 static void NEAR SavePostingList (FILE *f, ByteStorePtr bsp)
1685
1686 {
1687 VoidPtr bufr;
1688 Int4 cnt;
1689 Int4 cntr;
1690
1691 if (f != NULL && bsp != NULL) {
1692 bufr = MemNew (MAX_CDENTREZ_BYTESTORE * sizeof (DocUid));
1693 if (bufr != NULL) {
1694 cntr = (BSLen (bsp) / (Int4) sizeof (DocUid));
1695 cnt = MIN (cntr, (long) MAX_CDENTREZ_BYTESTORE);
1696 BSSeek (bsp, 0L, 0);
1697 while (cnt > 0) {
1698 BSRead (bsp, bufr, cnt * sizeof (DocUid));
1699 FileWrite (bufr, sizeof (DocUid), (size_t) cnt, f);
1700 cntr -= cnt;
1701 cnt = MIN (cntr, (long) MAX_CDENTREZ_BYTESTORE);
1702 }
1703 }
1704 MemFree (bufr);
1705 }
1706 }
1707
CdEntrezCreateTerm(CharPtr term,DocType type,DocField field,ByteStorePtr special,ByteStorePtr remainder,CharPtr highRange)1708 static CdTermPtr NEAR CdEntrezCreateTerm (CharPtr term, DocType type, DocField field, ByteStorePtr special, ByteStorePtr remainder, CharPtr highRange)
1709
1710 {
1711 FILE *fp;
1712 Boolean goOn;
1713 CdTermPtr last;
1714 Int4 remainderCount;
1715 Int4 specialCount;
1716 CdTermPtr trmp;
1717
1718 trmp = NULL;
1719 if (eset != NULL && term != NULL) {
1720 trmp = eset->next;
1721 last = eset;
1722 goOn = TRUE;
1723 while (trmp != NULL && goOn) {
1724 if (trmp->type == type && trmp->field == field &&
1725 EqualTerms (trmp->term, term) &&
1726 EqualTerms (trmp->highRange, highRange)) {
1727 goOn = FALSE;
1728 } else {
1729 last = trmp;
1730 trmp = trmp->next;
1731 }
1732 }
1733 if (goOn) {
1734 trmp = MemNew (sizeof (CdTerm));
1735 if (trmp != NULL) {
1736 specialCount = 0;
1737 remainderCount = 0;
1738 if (special != NULL) {
1739 specialCount = (BSLen (special) / (Int4) sizeof (DocUid));
1740 }
1741 if (remainder != NULL) {
1742 remainderCount = (BSLen (remainder) / (Int4) sizeof (DocUid));
1743 }
1744 trmp->type = type;
1745 trmp->field = field;
1746 trmp->term = StringSave (term);
1747 trmp->special_count = specialCount;
1748 trmp->total_count = specialCount + remainderCount;
1749 trmp->highRange = StringSave(highRange);
1750 trmp->next = NULL;
1751 last->next = trmp;
1752 fp = FileOpen (eset->term, "ab");
1753 if (fp != NULL) {
1754 fseek (fp, 0, SEEK_END);
1755 trmp->offset = ftell (fp);
1756 SavePostingList (fp, special);
1757 SavePostingList (fp, remainder);
1758 FileClose (fp);
1759 } else {
1760 trmp->offset = 0;
1761 }
1762 }
1763 }
1764 }
1765 return trmp;
1766 }
1767
1768 /*****************************************************************************
1769 *
1770 * FindTermNode (term, type, field, highRange)
1771 * Returns a pointer to a CdTerm node in the entrez set structure,
1772 * creating the node and loading the posting file, if necessary. The
1773 * value of the offset field becomes the offset into the temporary file.
1774 *
1775 *****************************************************************************/
1776
FindTermNode(CharPtr term,DocType type,DocField field,CharPtr highRange)1777 static CdTermPtr NEAR FindTermNode (CharPtr term, DocType type, DocField field, CharPtr highRange)
1778
1779 {
1780 FILE *fp;
1781 Boolean goOn;
1782 CdTermPtr last;
1783 Int4 offset;
1784 Int4 remain;
1785 Int4 special;
1786 CharPtr tmp;
1787 Int4 total;
1788 CdTermPtr trmp;
1789
1790 trmp = NULL;
1791 if (eset != NULL && term != NULL) {
1792 trmp = eset->next;
1793 last = eset;
1794 goOn = TRUE;
1795 while (trmp != NULL && goOn) {
1796 if (trmp->type == type && trmp->field == field &&
1797 EqualTerms (trmp->term, term) &&
1798 EqualTerms (trmp->highRange, highRange)) {
1799 goOn = FALSE;
1800 } else {
1801 last = trmp;
1802 trmp = trmp->next;
1803 }
1804 }
1805 if (goOn) {
1806 tmp = term;
1807 while (*tmp != '\0')
1808 tmp++;
1809 tmp -= 3;
1810 rangeScanning = FALSE;
1811 if (highRange != NULL) {
1812 rangeScanning = TRUE;
1813 StrNCpy(topOfRange, highRange, sizeof(topOfRange));
1814 CdEntrezMergeTerm (type, field, term, NULL, NULL, NULL);
1815 } else {
1816 if ((*tmp == '.') && (*(tmp+1) == '.') && (*(tmp+2) == '.')) {
1817 CdEntrezMergeTerm (type, field, term, NULL, NULL, NULL);
1818 } else if (StringChr (term, '*') != NULL || StringChr (term, '?') != NULL) {
1819 CdEntrezMergeTerm (type, field, term, NULL, NULL, WildCardProc);
1820 }
1821 }
1822 trmp = eset->next;
1823 last = eset;
1824 goOn = TRUE;
1825 while (trmp != NULL && goOn) {
1826 if (trmp->type == type && trmp->field == field &&
1827 EqualTerms (trmp->term, term) &&
1828 EqualTerms (trmp->highRange, highRange)) {
1829 goOn = FALSE;
1830 } else {
1831 last = trmp;
1832 trmp = trmp->next;
1833 }
1834 }
1835 }
1836 if (goOn) {
1837 #ifdef _NEW_CdEntrez_
1838 if (_nouveau)
1839 trmp = cd3_CdTrmFind(type,field,term);
1840 #endif
1841 #ifdef _OLD_CdEntrez_
1842 if (!_nouveau)
1843 trmp = CdTrmFind(type,field,term);
1844 #endif
1845 if (trmp != NULL) {
1846 if (field != FLD_ORGN) {
1847 last->next = trmp;
1848 fp = FileOpen (eset->term, "rb");
1849 if (fp != NULL) {
1850 fseek (fp, 0, SEEK_END);
1851 offset = ftell (fp);
1852 FileClose (fp);
1853 } else {
1854 offset = 0;
1855 }
1856 #ifdef _NEW_CdEntrez_
1857 if (_nouveau)
1858 cd3_CdTrmUidsFil (type, field, trmp->offset, trmp->total_count, eset->term, TRUE);
1859 #endif
1860 #ifdef _OLD_CdEntrez_
1861 if (!_nouveau)
1862 CdTrmUidsFil (type, field, trmp->offset, trmp->total_count, eset->term, TRUE);
1863 #endif
1864 trmp->offset = offset;
1865 } else {
1866 db = type;
1867 fld = field;
1868 uidPtr = MemNew ((size_t) MAX_CDENTREZ_UID_LIST_SIZE);
1869 if (uidPtr != NULL) {
1870 scanPtr = MemNew (SCAN_MAX * sizeof (ScanData));
1871 if (scanPtr != NULL) {
1872 scanOk = TRUE;
1873 scanCount = 0;
1874 specialPost = NULL;
1875 remainPost = NULL;
1876 ScanOnlyProc (trmp);
1877 if (scanCount > 0) {
1878 ProcessScanResults ();
1879 }
1880 if (specialPost != NULL && remainPost != NULL) {
1881 remainPost = DifferencePostingLists (remainPost, specialPost);
1882 }
1883 if (specialPost == NULL) {
1884 specialPost = BSNew (0);
1885 }
1886 if (remainPost == NULL) {
1887 remainPost = BSNew (0);
1888 }
1889 special = BSLen (specialPost) / sizeof (DocUid);
1890 remain = BSLen (remainPost) / sizeof (DocUid);
1891 total = special + remain;
1892 scanPtr = MemFree (scanPtr);
1893 }
1894 uidPtr = MemFree (uidPtr);
1895 if (scanOk && total > 0) {
1896 trmp = CdTermFree (trmp);
1897 trmp = CdEntrezCreateTerm (term, db, fld, specialPost, remainPost, highRange);
1898 }
1899 specialPost = BSFree (specialPost);
1900 remainPost = BSFree (remainPost);
1901 }
1902 }
1903 }
1904 }
1905 }
1906 return trmp;
1907 }
1908
1909 /*****************************************************************************
1910 *
1911 * CdEntrezPreloadMerge (term, type, field, spcl, totl)
1912 * Creates a CdTerm node in the entrez set structure if one does not yet
1913 * exist, and loads the posting file by merging multiple postings files.
1914 *
1915 *****************************************************************************/
1916
SingleSpaces(CharPtr str)1917 static void NEAR SingleSpaces (CharPtr str)
1918
1919 {
1920 Char ch;
1921 Int2 i;
1922 Int2 j;
1923 Int2 k;
1924
1925 i = 0;
1926 j = 0;
1927 k = 0;
1928 ch = str [i];
1929 while (ch != '\0') {
1930 if (ch == ' ') {
1931 if (k == 0) {
1932 str [j] = ch;
1933 j++;
1934 }
1935 k++;
1936 i++;
1937 } else {
1938 k = 0;
1939 str [j] = ch;
1940 i++;
1941 j++;
1942 }
1943 ch = str [i];
1944 }
1945 str [j] = '\0';
1946 }
1947
TermTruncate(CharPtr str)1948 static void NEAR TermTruncate (CharPtr str)
1949
1950 {
1951 if (str != NULL && str [0] != '\0') {
1952 SingleSpaces (str);
1953 if (searchTermLen < (Int2) StringLen (str)) {
1954 str [searchTermLen] = '\0';
1955 }
1956 }
1957 }
1958
HeapCompare(VoidPtr ptr1,VoidPtr ptr2)1959 static int LIBCALLBACK HeapCompare (VoidPtr ptr1, VoidPtr ptr2)
1960
1961 {
1962 DocUidPtr uid1;
1963 DocUidPtr uid2;
1964
1965 if (ptr1 != NULL && ptr2 != NULL) {
1966 uid1 = (DocUidPtr) ptr1;
1967 uid2 = (DocUidPtr) ptr2;
1968 if (*uid1 > *uid2) {
1969 return 1;
1970 } else if (*uid1 < *uid2) {
1971 return -1;
1972 } else {
1973 return 0;
1974 }
1975 } else {
1976 return 0;
1977 }
1978 }
1979
QuickSortSmall(DocUidPtr uids,Int4 l,Int4 r)1980 static void NEAR QuickSortSmall (DocUidPtr uids, Int4 l, Int4 r)
1981
1982 {
1983 HeapSort (uids + l, (size_t) (r - l + 1), sizeof (DocUid), HeapCompare);
1984 }
1985
1986 /*
1987 static Boolean NEAR AlreadyInOrder (DocUidPtr uids, Int4 l, Int4 r)
1988
1989 {
1990 DocUid last;
1991 Boolean rsult;
1992
1993 rsult = TRUE;
1994 if (l < r) {
1995 last = 0;
1996 while (l <= r) {
1997 if (uids [l] < last) {
1998 rsult = FALSE;
1999 }
2000 last = uids [l];
2001 l++;
2002 }
2003 }
2004 return rsult;
2005 }
2006
2007 static void NEAR QuickSortSmall (DocUidPtr uids, Int4 l, Int4 r)
2008
2009 {
2010 DocUid a;
2011 DocUid b;
2012 DocUid c;
2013 Int4 i;
2014 Int4 j;
2015 DocUid temp;
2016 DocUid x;
2017
2018 if (AlreadyInOrder (uids, l, r)) {
2019 return;
2020 }
2021 i = l;
2022 j = r;
2023 a = uids [l];
2024 b = uids [(l + r) / 2];
2025 c = uids [r];
2026 if (a > b) {
2027 if (c > a) {
2028 x = a;
2029 } else if (c < b) {
2030 x = b;
2031 } else {
2032 x = c;
2033 }
2034 } else {
2035 if (c < a) {
2036 x = a;
2037 } else if (c > b) {
2038 x = b;
2039 } else {
2040 x = c;
2041 }
2042 }
2043 do {
2044 while (uids [i] < x) {
2045 i++;
2046 }
2047 while (x < uids [j]) {
2048 j--;
2049 }
2050 if (i <= j) {
2051 temp = uids [i];
2052 uids [i] = uids [j];
2053 uids [j] = temp;
2054 i++;
2055 j--;
2056 }
2057 } while (i <= j);
2058 if (i - l < r - j) {
2059 if (l < j) {
2060 QuickSortSmall (uids, l, j);
2061 }
2062 if (i < r) {
2063 QuickSortSmall (uids, i, r);
2064 }
2065 } else {
2066 if (i < r) {
2067 QuickSortSmall (uids, i, r);
2068 }
2069 if (l < j) {
2070 QuickSortSmall (uids, l, j);
2071 }
2072 }
2073 }
2074 */
2075
CompressSmall(DocUidPtr uids,Int4 count)2076 static Int4 NEAR CompressSmall (DocUidPtr uids, Int4 count)
2077
2078 {
2079 Int4 i;
2080 Int4 j;
2081
2082 i = 0;
2083 j = 0;
2084 while (i < count) {
2085 if (uids [i] > 0) {
2086 uids [j] = uids [i];
2087 i++;
2088 j++;
2089 } else {
2090 i++;
2091 }
2092 }
2093 i = j;
2094 while (j < count) {
2095 uids [j] = 0;
2096 j++;
2097 }
2098 return i;
2099 }
2100
UniqueSmall(DocUidPtr uids,Int4 count)2101 static Int4 NEAR UniqueSmall (DocUidPtr uids, Int4 count)
2102
2103 {
2104 Int4 i;
2105 Int4 j;
2106 DocUid last;
2107
2108 i = 0;
2109 if (count <= 1) {
2110 i = count;
2111 } else {
2112 i = 0;
2113 j = 0;
2114 last = 0;
2115 while (i < count) {
2116 if (uids [i] != last) {
2117 uids [j] = uids [i];
2118 last = uids [i];
2119 i++;
2120 j++;
2121 } else {
2122 i++;
2123 }
2124 }
2125 i = j;
2126 while (j < count) {
2127 uids [j] = 0;
2128 j++;
2129 }
2130 }
2131 return i;
2132 }
2133
MergeSmallLists(ByteStorePtr bsp,ByteStorePtr small)2134 static ByteStorePtr NEAR MergeSmallLists (ByteStorePtr bsp, ByteStorePtr small)
2135
2136 {
2137 Int4 count;
2138 Int4 len;
2139 DocUidPtr uids;
2140
2141 if (small != NULL) {
2142 len = BSLen (small) / (Int4) sizeof (DocUid);
2143 if (len <= (long) MAX_CDENTREZ_SMALL_LIST && len > 0) {
2144 count = (Int4) len;
2145 uids = MemNew ((size_t) count * sizeof (DocUid));
2146 if (uids != NULL) {
2147 BSMerge (small, (VoidPtr) uids);
2148 small = BSFree (small);
2149 QuickSortSmall (uids, 0, (Int4) (count - 1));
2150 count = CompressSmall (uids, count);
2151 count = UniqueSmall (uids, count);
2152 if (count > 0) {
2153 small = BSNew (0L);
2154 BSWrite (small, uids, count * sizeof (DocUid));
2155 }
2156 uids = MemFree (uids);
2157 if (small != NULL) {
2158 bsp = MergePostingLists (bsp, small);
2159 }
2160 } else {
2161 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "MergeSmallLists memory failure");
2162 }
2163 } else if (len > (long) MAX_CDENTREZ_SMALL_LIST) {
2164 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "MergeSmallLists > %d", MAX_CDENTREZ_SMALL_LIST);
2165 }
2166 }
2167 return bsp;
2168 }
2169
MergeUnorderedLists(Int4 i,Int4 count)2170 static Boolean NEAR MergeUnorderedLists (Int4 i, Int4 count)
2171
2172 {
2173 BytePtr bptr;
2174 Int4 finish;
2175 Boolean goOn;
2176 Int4 j;
2177 Int4 len;
2178 Int4 max;
2179 DocUidPtr mptr;
2180 Int4 number;
2181 Int4 offset;
2182 ByteStorePtr remainLarge;
2183 ByteStorePtr remainSmall;
2184 Int4 smallCount;
2185 Int4 start;
2186 Int4 total;
2187
2188 goOn = TRUE;
2189 j = i + count - 1;
2190 max = scanPtr [j].offset + scanPtr [j].totalCount *
2191 (Int4) sizeof (DocUid) - scanPtr [i].offset;
2192 if (max <= MAX_CDENTREZ_UID_LIST_SIZE) {
2193 offset = scanPtr [i].offset;
2194 len = (Int4) (max / (Int4) sizeof (DocUid));
2195 #ifdef _NEW_CdEntrez_
2196 if (_nouveau)
2197 cd3_CdTrmUidsMem (db, fld, offset, (Int4) len, uidPtr);
2198 #endif
2199 #ifdef _OLD_CdEntrez_
2200 if (!_nouveau)
2201 CdTrmUidsMem (db, fld, offset, (Int4) len, uidPtr);
2202 #endif
2203 remainSmall = NULL;
2204 smallCount = 0;
2205 for (j = i; j < i + count; j++) {
2206 scanPtr [j].offset -= offset;
2207 total = scanPtr [j].totalCount;
2208 bptr = ((BytePtr) uidPtr) + scanPtr [j].offset;
2209 mptr = (DocUidPtr) bptr;
2210 if (smallCount + total > MAX_CDENTREZ_SMALL_LIST) {
2211 if (remainSmall != NULL) {
2212 remainPost = MergeSmallLists (remainPost, remainSmall);
2213 remainSmall = NULL;
2214 }
2215 smallCount = 0;
2216 }
2217 if (total > 100) {
2218 start = 0;
2219 number = 0;
2220 while (start < total) {
2221 finish = start + 1;
2222 while (finish < total && mptr [finish - 1] < mptr [finish]) {
2223 finish++;
2224 }
2225 number = finish - start;
2226 if (number > 100) {
2227 remainLarge = BSNew (number * sizeof (DocUid));
2228 BSWrite (remainLarge, (mptr + start), number * sizeof (DocUid));
2229 remainPost = MergePostingLists (remainPost, remainLarge);
2230 } else {
2231 smallCount += number;
2232 if (number > 0) {
2233 if (remainSmall == NULL) {
2234 remainSmall = BSNew (0L);
2235 }
2236 BSWrite (remainSmall, (mptr + start), number * sizeof (DocUid));
2237 }
2238 if (smallCount > MAX_CDENTREZ_SMALL_LIST) {
2239 if (remainSmall != NULL) {
2240 remainPost = MergeSmallLists (remainPost, remainSmall);
2241 remainSmall = NULL;
2242 }
2243 smallCount = 0;
2244 }
2245 }
2246 start = finish;
2247 }
2248 } else {
2249 smallCount += total;
2250 if (total > 0) {
2251 if (remainSmall == NULL) {
2252 remainSmall = BSNew (0L);
2253 }
2254 BSWrite (remainSmall, mptr, total * sizeof (DocUid));
2255 }
2256 }
2257 }
2258 if (remainSmall != NULL) {
2259 remainPost = MergeSmallLists (remainPost, remainSmall);
2260 remainSmall = NULL;
2261 }
2262 } else {
2263 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "Cannot merge > 32 K element");
2264 scanOk = FALSE;
2265 goOn = FALSE;
2266 }
2267 return goOn;
2268 }
2269
MergeSeveralOrderedLists(Int4 i,Int4 count)2270 static Boolean NEAR MergeSeveralOrderedLists (Int4 i, Int4 count)
2271
2272 {
2273 BytePtr bptr;
2274 Boolean goOn;
2275 Int4 j;
2276 Int4 len;
2277 Int4 max;
2278 DocUidPtr mptr;
2279 Int4 offset;
2280 Int4 remainder;
2281 ByteStorePtr remainLarge;
2282 ByteStorePtr remainSmall;
2283 Int4 smallCount;
2284 Int4 special;
2285 ByteStorePtr specialLarge;
2286 ByteStorePtr specialSmall;
2287 Int4 total;
2288
2289 goOn = TRUE;
2290 j = i + count - 1;
2291 max = scanPtr [j].offset + scanPtr [j].totalCount *
2292 (Int4) sizeof (DocUid) - scanPtr [i].offset;
2293 if (max <= MAX_CDENTREZ_UID_LIST_SIZE) {
2294 offset = scanPtr [i].offset;
2295 len = (Int4) (max / (Int4) sizeof (DocUid));
2296 #ifdef _NEW_CdEntrez_
2297 if (_nouveau)
2298 cd3_CdTrmUidsMem (db, fld, offset, (Int4) len, uidPtr);
2299 #endif
2300 #ifdef _OLD_CdEntrez_
2301 if (!_nouveau)
2302 CdTrmUidsMem (db, fld, offset, (Int4) len, uidPtr);
2303 #endif
2304 specialSmall = NULL;
2305 remainSmall = NULL;
2306 smallCount = 0;
2307 for (j = i; j < i + count; j++) {
2308 scanPtr [j].offset -= offset;
2309 special = scanPtr [j].specialCount;
2310 total = scanPtr [j].totalCount;
2311 remainder = total - special;
2312 bptr = ((BytePtr) uidPtr) + scanPtr [j].offset;
2313 mptr = (DocUidPtr) bptr;
2314 if (smallCount + total > MAX_CDENTREZ_SMALL_LIST) {
2315 if (specialSmall != NULL) {
2316 specialPost = MergeSmallLists (specialPost, specialSmall);
2317 specialSmall = NULL;
2318 }
2319 if (remainSmall != NULL) {
2320 remainPost = MergeSmallLists (remainPost, remainSmall);
2321 remainSmall = NULL;
2322 }
2323 smallCount = 0;
2324 }
2325 if (total > 100) {
2326 specialLarge = BSNew (special * sizeof (DocUid));
2327 BSWrite (specialLarge, mptr, special * sizeof (DocUid));
2328 specialPost = MergePostingLists (specialPost, specialLarge);
2329 remainLarge = BSNew (remainder * sizeof (DocUid));
2330 BSWrite (remainLarge, (mptr + special),
2331 remainder * sizeof (DocUid));
2332 remainPost = MergePostingLists (remainPost, remainLarge);
2333 } else {
2334 smallCount += total;
2335 if (special > 0) {
2336 if (specialSmall == NULL) {
2337 specialSmall = BSNew (0L);
2338 }
2339 BSWrite (specialSmall, mptr, special * sizeof (DocUid));
2340 }
2341 if (remainder > 0) {
2342 if (remainSmall == NULL) {
2343 remainSmall = BSNew (0L);
2344 }
2345 BSWrite (remainSmall, (mptr + special), remainder * sizeof (DocUid));
2346 }
2347 }
2348 }
2349 if (specialSmall != NULL) {
2350 specialPost = MergeSmallLists (specialPost, specialSmall);
2351 specialSmall = NULL;
2352 }
2353 if (remainSmall != NULL) {
2354 remainPost = MergeSmallLists (remainPost, remainSmall);
2355 remainSmall = NULL;
2356 }
2357 } else {
2358 ErrPostEx (SEV_ERROR, ERR_CD_BOOL, 0, "Cannot merge > %ld element", (long) MAX_CDENTREZ_UID_LIST_SIZE);
2359 scanOk = FALSE;
2360 goOn = FALSE;
2361 }
2362 return goOn;
2363 }
2364
MergeSeveralLists(Int4 i,Int4 count)2365 static Boolean NEAR MergeSeveralLists (Int4 i, Int4 count)
2366
2367 {
2368 if (fld != FLD_ORGN) {
2369 return MergeSeveralOrderedLists (i, count);
2370 } else {
2371 return MergeUnorderedLists (i, count);
2372 }
2373 }
2374
ProcessScanResults(void)2375 static Boolean NEAR ProcessScanResults (void)
2376
2377 {
2378 Boolean goOn;
2379 Int4 i;
2380 Int4 j;
2381 Int4 max;
2382
2383 ProgMon ("ProcessScanResults");
2384 goOn = TRUE;
2385 i = 0;
2386 j = 0;
2387 max = 0;
2388 while (j < scanCount) {
2389 if (scanPtr [j].offset < scanPtr [i].offset) {
2390 goOn = MergeSeveralLists (i, (Int4) (j - i));
2391 max = 0;
2392 i = j;
2393 } else {
2394 max = scanPtr [j].offset + scanPtr [j].totalCount *
2395 (Int4) sizeof (DocUid) - scanPtr [i].offset;
2396 if (max >= MAX_CDENTREZ_UID_LIST_SIZE) {
2397 if (j == i) {
2398 goOn = MergeSeveralLists (i, 1);
2399 j++;
2400 i = j;
2401 max = 0;
2402 } else {
2403 goOn = MergeSeveralLists (i, (Int4) (j - i));
2404 i = j;
2405 max = 0;
2406 }
2407 } else {
2408 j++;
2409 }
2410 }
2411 }
2412 if (max > 0) {
2413 goOn = MergeSeveralLists (i, (Int4) (j - i));
2414 }
2415 scanCount = 0;
2416 return goOn;
2417 }
2418
ScanOnlyProc(CdTermPtr trmp)2419 static Boolean ScanOnlyProc (CdTermPtr trmp)
2420
2421 {
2422 Int4 count;
2423 Boolean goOn;
2424
2425 goOn = TRUE;
2426 if (scanCount >= SCAN_MAX) {
2427 goOn = ProcessScanResults ();
2428 }
2429 if (scanCount < SCAN_MAX) {
2430 if (trmp->total_count >= CDENTREZ_TERM_MAX) {
2431 while (trmp->special_count > 0) {
2432 if (scanCount >= SCAN_MAX) {
2433 goOn = ProcessScanResults ();
2434 }
2435 count = MIN (trmp->special_count, (long) CDENTREZ_TERM_MAX);
2436 scanPtr [scanCount].specialCount = count;
2437 scanPtr [scanCount].totalCount = count;
2438 scanPtr [scanCount].offset = trmp->offset;
2439 scanPtr [scanCount].specialPtr = NULL;
2440 scanPtr [scanCount].remainderPtr = NULL;
2441 scanCount++;
2442 trmp->special_count -= count;
2443 trmp->total_count -= count;
2444 trmp->offset += count * sizeof (DocUid);
2445 }
2446 while (trmp->total_count > 0) {
2447 if (scanCount >= SCAN_MAX) {
2448 goOn = ProcessScanResults ();
2449 }
2450 count = MIN (trmp->total_count, (long) CDENTREZ_TERM_MAX);
2451 scanPtr [scanCount].specialCount = 0;
2452 scanPtr [scanCount].totalCount = count;
2453 scanPtr [scanCount].offset = trmp->offset;
2454 scanPtr [scanCount].specialPtr = NULL;
2455 scanPtr [scanCount].remainderPtr = NULL;
2456 scanCount++;
2457 trmp->total_count -= count;
2458 trmp->offset += count * sizeof (DocUid);
2459 }
2460 } else {
2461 if (scanCount >= SCAN_MAX) {
2462 goOn = ProcessScanResults ();
2463 }
2464 scanPtr [scanCount].specialCount = trmp->special_count;
2465 scanPtr [scanCount].totalCount = trmp->total_count;
2466 scanPtr [scanCount].offset = trmp->offset;
2467 scanPtr [scanCount].specialPtr = NULL;
2468 scanPtr [scanCount].remainderPtr = NULL;
2469 scanCount++;
2470 }
2471 }
2472 return goOn;
2473 }
2474
WildCardProc(CdTermPtr trmp)2475 static Boolean WildCardProc (CdTermPtr trmp)
2476
2477 {
2478 Int4 diff;
2479 Boolean goOn;
2480 CharPtr src;
2481 CharPtr tgt;
2482
2483 goOn = FALSE;
2484 src = selection;
2485 tgt = trmp->term;
2486 diff = 0;
2487 while (*src != '\0' && *tgt != '\0' && diff == 0) {
2488 if (*src != '?') {
2489 diff = TO_UPPER (*src) - TO_UPPER (*tgt);
2490 }
2491 if (diff == 0) {
2492 src++;
2493 tgt++;
2494 }
2495 }
2496 if (diff != 0) {
2497 if (*src == '*') {
2498 goOn = TRUE;
2499 }
2500 } else if (*src == '*') {
2501 goOn = TRUE;
2502 } else if (*src == '\0' && *tgt == '\0') {
2503 goOn = TRUE;
2504 } else {
2505 goOn = FALSE;
2506 }
2507 return goOn;
2508 }
2509
ScanAndFreeProc(CdTermPtr trmp)2510 static Boolean ScanAndFreeProc (CdTermPtr trmp)
2511
2512 {
2513 Int4 compare;
2514 Boolean goOn;
2515 Char str [256];
2516
2517 goOn = TRUE;
2518 if (trmp != NULL && trmp->term != NULL) {
2519 if (rangeScanning) {
2520 compare = MeshStringICmp (trmp->term, selection);
2521 if (compare >= 0) {
2522 if (topOfRange[0] == '\0')
2523 compare = -1;
2524 else
2525 compare = MeshStringICmp (trmp->term, topOfRange);
2526 if (compare > 0)
2527 goOn = FALSE;
2528 else
2529 goOn = ScanOnlyProc (trmp);
2530 }
2531 } else {
2532 StringNCpy (str, trmp->term, sizeof (str));
2533 TermTruncate (str);
2534 if (userScanProc != NULL) {
2535 compare = MeshStringICmp (str, wildcard);
2536 } else {
2537 compare = MeshStringICmp (str, selection);
2538 }
2539 if (compare > 0) {
2540 str [searchTermLen] = '\0';
2541 if (userScanProc != NULL) {
2542 compare = MeshStringICmp (str, wildcard);
2543 } else {
2544 compare = MeshStringICmp (str, selection);
2545 }
2546 if (compare > 0) {
2547 goOn = FALSE;
2548 }
2549 } else if (compare == 0) {
2550 if (userScanProc != NULL) {
2551 if (userScanProc (trmp)) {
2552 goOn = ScanOnlyProc (trmp);
2553 }
2554 } else {
2555 goOn = ScanOnlyProc (trmp);
2556 }
2557 }
2558 }
2559 }
2560 trmp = CdTermFree (trmp);
2561 return goOn;
2562 }
2563
CdEntrezMergeTerm(DocType type,DocField field,CharPtr term,Int4Ptr spcl,Int4Ptr totl,CdTermProc userProc)2564 static Boolean NEAR CdEntrezMergeTerm (DocType type, DocField field, CharPtr term,
2565 Int4Ptr spcl, Int4Ptr totl, CdTermProc userProc)
2566
2567 {
2568 Char ch;
2569 Int4 remain;
2570 Int4 special;
2571 Char str [256];
2572 Int4 total;
2573 Int4 termpage;
2574 CharPtr tmp;
2575 Int4 limit = 0;
2576 CharPtr prop;
2577 Boolean retval = FALSE;
2578
2579 if (spcl != NULL) {
2580 *spcl = 0;
2581 }
2582 if (totl != NULL) {
2583 *totl = 0;
2584 }
2585 db = type;
2586 fld = field;
2587 userScanProc = userProc;
2588 StringNCpy (str, term, sizeof (str));
2589 tmp = str;
2590 while (*tmp != '\0') {
2591 tmp++;
2592 }
2593 tmp -= 3;
2594 if ((*tmp == '.') && (*(tmp+1) == '.') && (*(tmp+2) == '.')) {
2595 *tmp = '\0';
2596 }
2597 SingleSpaces (str);
2598 if (userProc != NULL) {
2599 searchTermLen = 0;
2600 ch = str [searchTermLen];
2601 while (ch != '\0' && ch != '*' && ch != '?') {
2602 searchTermLen++;
2603 ch = str [searchTermLen];
2604 }
2605 } else {
2606 searchTermLen = (Int4) StringLen (str);
2607 }
2608 if ((prop = (CharPtr) GetAppProperty("CdEntrezTruncLimit")) != NULL)
2609 {
2610 limit = atoi(prop);
2611 }
2612 if (searchTermLen > limit || str [0] == '?' || str [0] == '*' ||
2613 rangeScanning) {
2614 scanOk = TRUE;
2615 uidPtr = MemNew ((size_t) MAX_CDENTREZ_UID_LIST_SIZE);
2616 if (uidPtr != NULL) {
2617 scanPtr = MemNew (SCAN_MAX * sizeof (ScanData));
2618 if (scanPtr != NULL) {
2619 scanCount = 0;
2620 specialPost = NULL;
2621 remainPost = NULL;
2622 StringNCpy (selection, str, sizeof (selection));
2623 StringNCpy (wildcard, str, sizeof (wildcard));
2624 wildcard [searchTermLen] = '\0';
2625 #ifdef _NEW_CdEntrez_
2626 if (_nouveau)
2627 termpage = cd3_CdTrmLookup (db, fld, wildcard);
2628 #endif
2629 #ifdef _OLD_CdEntrez_
2630 if (!_nouveau)
2631 termpage = CdTrmLookup (db, fld, wildcard);
2632 #endif
2633 if (fld == FLD_MESH) {
2634 ch = str [0];
2635 str [0] = TO_UPPER (ch);
2636 }
2637 if (termpage >= 0) {
2638 #ifdef _NEW_CdEntrez_
2639 if (_nouveau)
2640 cd3_CdTermScan (db, fld, termpage, (Int4)0, ScanAndFreeProc);
2641 #endif
2642 #ifdef _OLD_CdEntrez_
2643 if (!_nouveau)
2644 CdTermScan (db, fld, termpage, (Int4)0, ScanAndFreeProc);
2645 #endif
2646 }
2647 if (scanCount > 0) {
2648 ProcessScanResults ();
2649 }
2650 if (specialPost != NULL && remainPost != NULL) {
2651 remainPost = DifferencePostingLists (remainPost, specialPost);
2652 }
2653 if (specialPost == NULL) {
2654 specialPost = BSNew (0);
2655 }
2656 if (remainPost == NULL) {
2657 remainPost = BSNew (0);
2658 }
2659 special = BSLen (specialPost) / sizeof (DocUid);
2660 remain = BSLen (remainPost) / sizeof (DocUid);
2661 total = special + remain;
2662 scanPtr = MemFree (scanPtr);
2663 }
2664 uidPtr = MemFree (uidPtr);
2665 if (scanOk && total > 0) {
2666 retval = TRUE;
2667 if (userProc == NULL && !rangeScanning) {
2668 StringCat (str, "...");
2669 }
2670 CdEntrezCreateTerm (str, db, fld, specialPost, remainPost, rangeScanning ? topOfRange : NULL);
2671 if (spcl != NULL) {
2672 *spcl = special;
2673 }
2674 if (totl != NULL) {
2675 *totl = total;
2676 }
2677 }
2678 specialPost = BSFree (specialPost);
2679 remainPost = BSFree (remainPost);
2680 }
2681 }
2682 return retval;
2683 }
2684
2685 /*****************************************************************************
2686 *
2687 * CdEntMedlineEntryListGet (result, numuid, uids, mark_missing)
2688 * returns a count of entries read
2689 * if (mark_missing) ids which could not be located are made negative
2690 *
2691 *****************************************************************************/
CdEntMedlineEntryListGet(MedlineEntryPtr PNTR result,Int2 numuid,Int4Ptr uids,Boolean mark_missing)2692 NLM_EXTERN Int2 CdEntMedlineEntryListGet (MedlineEntryPtr PNTR result, Int2 numuid, Int4Ptr uids, Boolean mark_missing)
2693
2694 {
2695 MedlineEntryPtr mep;
2696 Int2 count = 0, ctr;
2697 AsnIoPtr aip;
2698 DocType db = TYP_ML;
2699
2700 if (! MedlineAsnLoad())
2701 return 0;
2702
2703 for (ctr = 0; ctr < numuid; ctr++)
2704 {
2705 mep = NULL;
2706
2707 #ifdef _NEW_CdEntrez_
2708 if (_nouveau)
2709 aip = cd3_CdDocAsnOpen(db, uids[ctr]);
2710 #endif
2711 #ifdef _OLD_CdEntrez_
2712 if (!_nouveau)
2713 aip = CdDocAsnOpen(db, uids[ctr]);
2714 #endif
2715 if (aip != NULL)
2716 {
2717 mep = MedlineEntryAsnRead(aip, NULL);
2718 #ifdef _NEW_CdEntrez_
2719 if (_nouveau)
2720 cd3_CdDocAsnClose(aip);
2721 #endif
2722 #ifdef _OLD_CdEntrez_
2723 if (!_nouveau)
2724 CdDocAsnClose(aip);
2725 #endif
2726 }
2727 if (mep == NULL) /* didn't get it */
2728 {
2729 if (mark_missing)
2730 uids[ctr] *= -1;
2731 }
2732 else
2733 {
2734 count++;
2735 result[ctr] = mep;
2736 }
2737 }
2738
2739 return count;
2740 }
2741
2742 /*****************************************************************************
2743 *
2744 * CdEntSeqEntryListGet (result, numuid, uids, retcode, mark_missing)
2745 * returns a count of entries read
2746 * if (mark_missing) ids which could not be located are made negative
2747 * retcode is defined in objsset.h
2748 *
2749 *****************************************************************************/
2750 static AsnIo* CdSeqAsnOpen (DocType *type, DocUid uid, Boolean isGenome);
2751
CdEntSeqEntryListGet(SeqEntryPtr PNTR result,Int2 numuid,Int4Ptr uids,Int2 retcode,Boolean mark_missing)2752 NLM_EXTERN Int2 CdEntSeqEntryListGet (SeqEntryPtr PNTR result, Int2 numuid, Int4Ptr uids, Int2 retcode, Boolean mark_missing)
2753 {
2754 SeqEntryPtr sep;
2755 Int2 count = 0, ctr;
2756 AsnIoPtr aip;
2757 DocType db = TYP_SEQ;
2758 ValNode an;
2759
2760 if (! SeqSetAsnLoad())
2761 return 0;
2762
2763 an.data.intvalue = 0;
2764 an.choice = SEQID_GI;
2765
2766 for (ctr = 0; ctr < numuid; ctr++)
2767 {
2768 sep = NULL;
2769 aip = CdSeqAsnOpen(&db, uids[ctr], retcode == -1);
2770 if (aip != NULL)
2771 {
2772 an.data.intvalue = uids[ctr];
2773 if (retcode == -1)
2774 sep = SeqEntryAsnRead(aip, NULL);
2775 else
2776 sep = SeqEntryAsnGet(aip, NULL, &an, retcode);
2777 #ifdef _NEW_CdEntrez_
2778 if (_nouveau)
2779 cd3_CdDocAsnClose(aip);
2780 #endif
2781 #ifdef _OLD_CdEntrez_
2782 if (!_nouveau)
2783 CdDocAsnClose(aip);
2784 #endif
2785 }
2786 if (sep == NULL) /* didn't get it */
2787 {
2788 if (mark_missing)
2789 uids[ctr] *= -1;
2790 }
2791 else
2792 {
2793 count++;
2794 result[ctr] = sep;
2795 }
2796 }
2797
2798 return count;
2799 }
2800
2801
CdSeqAsnOpen(DocType * type,DocUid uid,Boolean isGenome)2802 static AsnIo* CdSeqAsnOpen (DocType *type, DocUid uid, Boolean isGenome)
2803 {
2804 AsnIo *aio = NULL;
2805
2806 #ifdef _NEW_CdEntrez_
2807 if (_nouveau)
2808 {
2809 if (isGenome) {
2810 if ((aio = cd3_CdDocAsnOpen(TYP_CH,uid)) != NULL)
2811 *type = TYP_CH;
2812 } else {
2813 if (*type != TYP_SEQ)
2814 {
2815 aio = cd3_CdDocAsnOpen(*type,uid);
2816 }
2817 else
2818 {
2819 if ((aio = cd3_CdDocAsnOpen(TYP_AA,uid)) != NULL)
2820 *type = TYP_AA;
2821 else if ((aio = cd3_CdDocAsnOpen(TYP_NT,uid)) != NULL)
2822 *type = TYP_NT;
2823 }
2824 }
2825 }
2826 #endif
2827
2828 #ifdef _OLD_CdEntrez_
2829 if (!_nouveau)
2830 aio = CdDocAsnOpen(*type,uid);
2831 #endif
2832
2833 return aio;
2834 }
2835
2836 /*****************************************************************************
2837 *
2838 * CdEntMlSumListGet (result, numuid, uids)
2839 * returns a count of entries read
2840 * head of linked list is in result
2841 *
2842 *****************************************************************************/
2843
CdEntMlSumListGet(DocSumPtr PNTR result,Int2 numuid,Int4Ptr uids)2844 NLM_EXTERN Int2 CdEntMlSumListGet (DocSumPtr PNTR result, Int2 numuid, Int4Ptr uids) /* Gi numbers */
2845 {
2846 Int2 count = 0;
2847
2848 #ifdef _NEW_CdEntrez_
2849 if (_nouveau)
2850 count = CdDocSumListGet(result,numuid,TYP_ML,uids);
2851 #endif
2852
2853 #ifdef _OLD_CdEntrez_
2854 if (!_nouveau)
2855 {
2856 Int2 ctr;
2857 DocType db = TYP_ML;
2858 AsnIoPtr aip;
2859
2860 for (ctr = 0; ctr < numuid; ctr++)
2861 {
2862 result[ctr] = NULL;
2863 aip = CdDocAsnOpen (db, uids[ctr]);
2864 if (aip != NULL)
2865 {
2866 result[ctr] = MedSumAsnRead(aip, uids[ctr]);
2867 CdDocAsnClose(aip);
2868 if (result[ctr] != NULL)
2869 count++;
2870 }
2871 }
2872 }
2873 #endif
2874
2875 return count;
2876 }
2877
2878
2879 /*****************************************************************************
2880 *
2881 * CdEntMlSumGet(uid)
2882 * get one MlSummary
2883 *
2884 *****************************************************************************/
2885 #ifdef _OLD_CdEntrez_
2886
CdEntMlSumGet(Int4 uid)2887 static DocSumPtr NEAR CdEntMlSumGet (Int4 uid)
2888 {
2889 DocSumPtr dsp = NULL;
2890
2891 CdEntMlSumListGet(&dsp, 1, &uid);
2892 return dsp;
2893 }
2894
2895 #endif
2896
2897 /*****************************************************************************
2898 *
2899 * void StripAuthor(author)
2900 *
2901 *****************************************************************************/
StripAuthor(CharPtr author)2902 static void NEAR StripAuthor (CharPtr author)
2903
2904 {
2905 CharPtr p1, p2;
2906
2907 p1 = author;
2908 while ((p1 = StringChr (p1, ' ')) != NULL) {
2909 for (p2 = p1 + 1; *p2 != '\0'; p2++) {
2910 if (*p2 == ' ') break;
2911 if (IS_ALPHA (*p2) && IS_LOWER (*p2)) break;
2912 }
2913 if (*p2 == '\0' || *p2 == ' ') {
2914 *p1 = '\0';
2915 return;
2916 }
2917 p1++;
2918 }
2919 }
2920
2921 /*****************************************************************************
2922 *
2923 * MedSumAsnRead(aip, uid)
2924 *
2925 *****************************************************************************/
FindAsnType(AsnTypePtr PNTR atp,AsnModulePtr amp,CharPtr str)2926 static void NEAR FindAsnType (AsnTypePtr PNTR atp, AsnModulePtr amp, CharPtr str)
2927
2928 {
2929 if (atp != NULL && (*atp) == NULL) {
2930 *atp = AsnTypeFind (amp, str);
2931 }
2932 }
2933
2934
MedSumAsnRead(AsnIoPtr aip,DocUid uid)2935 static DocSumPtr NEAR MedSumAsnRead (AsnIoPtr aip, DocUid uid)
2936
2937 {
2938 DataVal av;
2939 AsnModulePtr amp;
2940 AsnTypePtr atp;
2941 Boolean citFound;
2942 DocSumPtr dsp;
2943 Boolean goOn;
2944 Int2 i;
2945 CharPtr ptr;
2946 Char caption [50];
2947 Char author [40];
2948 Char year [10];
2949
2950 if ((aip == NULL) || (! AllObjLoad ()))
2951 return NULL;
2952
2953 amp = AsnAllModPtr ();
2954
2955 FindAsnType (&MEDLINE_ENTRY, amp, "Medline-entry");
2956 FindAsnType (&MEDLINE_ENTRY_cit, amp, "Medline-entry.cit");
2957 FindAsnType (&MEDLINE_ENTRY_abstract, amp, "Medline-entry.abstract");
2958 FindAsnType (&TITLE_E_trans, amp, "Title.E.trans");
2959 FindAsnType (&AUTH_LIST_names_ml_E, amp, "Auth-list.names.ml.E");
2960 FindAsnType (&AUTH_LIST_names_str_E, amp, "Auth-list.names.str.E");
2961 FindAsnType (&DATE_STD_year, amp, "Date-std.year");
2962 FindAsnType (&DATE_str, amp, "Date.str");
2963 FindAsnType (&TITLE_E_name, amp, "Title.E.name");
2964 FindAsnType (&MEDLINE_ENTRY_mesh, amp, "Medline-entry.mesh");
2965 FindAsnType (&MEDLINE_ENTRY_substance, amp, "Medline-entry.substance");
2966 FindAsnType (&MEDLINE_ENTRY_xref, amp, "Medline-entry.xref");
2967 FindAsnType (&MEDLINE_ENTRY_idnum, amp, "Medline-entry.idnum");
2968 FindAsnType (&MEDLINE_ENTRY_gene, amp, "Medline-entry.gene");
2969
2970 atp = AsnReadId (aip, amp, MEDLINE_ENTRY);
2971 AsnReadVal (aip, atp, &av);
2972
2973 dsp = MemNew (sizeof (DocSum));
2974 if (dsp != NULL) {
2975 dsp->no_abstract = TRUE;
2976 dsp->translated_title = FALSE;
2977 dsp->no_authors = TRUE;
2978 author [0] = '\0';
2979 year [0] = '\0';
2980 citFound = FALSE;
2981 goOn = TRUE;
2982 while (goOn) {
2983 atp = AsnReadId (aip, amp, atp);
2984 if (atp == MEDLINE_ENTRY) {
2985 AsnReadVal (aip, atp, NULL);
2986 goOn = FALSE;
2987 } else if (atp == MEDLINE_ENTRY_cit) {
2988 AsnReadVal (aip, atp, NULL);
2989 citFound = TRUE;
2990 } else if (atp == MEDLINE_ENTRY_abstract) {
2991 AsnReadVal (aip, atp, NULL);
2992 dsp->no_abstract = FALSE;
2993 goOn = FALSE;
2994 } else if (atp == TITLE_E_trans) {
2995 AsnReadVal (aip, atp, &av);
2996 dsp->translated_title = TRUE;
2997 if (dsp->title != NULL) {
2998 dsp->title = MemFree (dsp->title);
2999 }
3000 dsp->title = MemNew ((size_t) StringLen ((CharPtr) av.ptrvalue) + 3);
3001 ptr = dsp->title;
3002 *ptr = '[';
3003 ptr++;
3004 ptr = StringMove (ptr, (CharPtr) av.ptrvalue);
3005 *ptr = ']';
3006 ptr++;
3007 *ptr = '\0';
3008 AsnKillValue (atp, &av);
3009 } else if (atp == AUTH_LIST_names_ml_E) {
3010 AsnReadVal (aip, atp, &av);
3011 dsp->no_authors = FALSE;
3012 if (author [0] == '\0') {
3013 StringNCpy (author, (CharPtr) av.ptrvalue, sizeof (author));
3014 }
3015 AsnKillValue (atp, &av);
3016 } else if (atp == AUTH_LIST_names_str_E) {
3017 AsnReadVal (aip, atp, &av);
3018 dsp->no_authors = FALSE;
3019 if (author [0] == '\0') {
3020 StringNCpy (author, (CharPtr) av.ptrvalue, sizeof (author));
3021 }
3022 AsnKillValue (atp, &av);
3023 } else if (atp == DATE_STD_year) {
3024 AsnReadVal (aip, atp, &av);
3025 if (citFound) {
3026 sprintf (year, "%ld", (long) av.intvalue);
3027 }
3028 } else if (atp == DATE_str) {
3029 AsnReadVal (aip, atp, &av);
3030 if (citFound) {
3031 i = 0;
3032 ptr = av.ptrvalue;
3033 while (ptr [i] != '\0' && ptr [i] != ' ' && i < sizeof (year) - 1) {
3034 year [i] = ptr [i];
3035 i++;
3036 }
3037 year [i] = '\0';
3038 }
3039 AsnKillValue (atp, &av);
3040 } else if (atp == TITLE_E_name) {
3041 AsnReadVal (aip, atp, &av);
3042 if (dsp->title == NULL) {
3043 dsp->title = StringSave ((CharPtr) av.ptrvalue);
3044 }
3045 AsnKillValue (atp, &av);
3046 } else if (atp == MEDLINE_ENTRY_mesh || atp == MEDLINE_ENTRY_substance ||
3047 atp == MEDLINE_ENTRY_xref || atp == MEDLINE_ENTRY_idnum ||
3048 atp == MEDLINE_ENTRY_gene) {
3049 AsnReadVal (aip, atp, NULL);
3050 goOn = FALSE;
3051 } else {
3052 AsnReadVal (aip, atp, NULL);
3053 }
3054 }
3055 if (dsp->no_authors) {
3056 sprintf (caption, "[%ld], %s", (long) uid, year);
3057 } else if (author [0] != '\0') {
3058 StripAuthor (author);
3059 author [12] = '.';
3060 author [12] = '\0';
3061 sprintf (caption, "%s, %s", author, year);
3062 } else {
3063 sprintf (caption, "[%ld], %s", (long) uid, year);
3064 }
3065 dsp->caption = StringSave (caption);
3066 dsp->uid = uid;
3067 }
3068 AsnIoReset (aip);
3069 return dsp;
3070 }
3071
3072
3073 /*****************************************************************************
3074 *
3075 * CdSeqIdForGI(Int4 gi)
3076 *
3077 *****************************************************************************/
CdSeqIdForGI(Int4 gi)3078 NLM_EXTERN SeqIdPtr CdSeqIdForGI (Int4 gi)
3079 {
3080 #ifdef _NEW_CdEntrez_
3081 DocSum* dsp;
3082 SeqIdPtr sip = NULL, tmp, next;
3083
3084 dsp = cd3_CdGetDocSum (TYP_NT, gi); /* nucleic acid? */
3085 if (dsp == NULL)
3086 dsp = cd3_CdGetDocSum (TYP_AA, gi); /* protein? */
3087 if (dsp != NULL)
3088 {
3089 tmp = SeqIdParse(dsp->extra);
3090 DocSumFree(dsp);
3091
3092 while (tmp != NULL)
3093 {
3094 next = tmp->next;
3095 tmp->next = NULL;
3096 if (tmp->choice == SEQID_GI)
3097 SeqIdFree(tmp);
3098 else
3099 sip = tmp;
3100 tmp = next;
3101 }
3102 }
3103 return sip;
3104
3105 #else
3106
3107 SeqIdPtr sip = NULL, ids, curr, best;
3108 AsnIoPtr aip;
3109 AsnModulePtr amp;
3110 AsnTypePtr atp;
3111 Boolean gotit;
3112 DocType db = TYP_SEQ;
3113 GiimPtr gip;
3114
3115 static Uint1 pick_order[20] = {
3116 83, /* 0 = not set */
3117 65, /* 1 = local Object-id */
3118 65, /* 2 = gibbsq */
3119 65, /* 3 = gibbmt */
3120 70, /* 4 = giim Giimport-id */
3121 60, /* 5 = genbank */
3122 60, /* 6 = embl */
3123 60, /* 7 = pir */
3124 60, /* 8 = swissprot */
3125 65, /* 9 = patent */
3126 65, /* 10 = other TextSeqId */
3127 65, /* 11 = general Dbtag */
3128 90, /* 12 = gi */
3129 60, /* 13 = ddbj */
3130 60, /* 14 = prf */
3131 60, /* 15 = pdb */
3132 0, /* extras for new ids */
3133 0,
3134 0,
3135 0
3136 };
3137
3138 if (! AllObjLoad()) return sip;
3139 amp = AsnAllModPtr();
3140 FindAsnType (&SEQ_ENTRY, amp, "Seq-entry");
3141 FindAsnType (&BIOSEQ_id, amp, "Bioseq.id");
3142 FindAsnType (&BIOSEQ_id_E, amp, "Bioseq.id.E");
3143
3144 aip = CdSeqAsnOpen (&db, gi, FALSE);
3145 if (aip == NULL) return sip;
3146
3147 atp = SEQ_ENTRY;
3148 while ((atp = AsnReadId(aip, amp, atp)) != NULL)
3149 {
3150 if (atp == BIOSEQ_id)
3151 {
3152 gotit = FALSE;
3153 ids = SeqIdSetAsnRead(aip, atp, BIOSEQ_id_E);
3154 for (curr = ids; curr != NULL; curr = curr->next)
3155 {
3156 if (curr->choice == SEQID_GIIM)
3157 {
3158 gip = (GiimPtr)(curr->data.ptrvalue);
3159 if (gip->id == gi)
3160 {
3161 gotit = TRUE;
3162 break;
3163 }
3164 }
3165 else if (curr->choice == SEQID_GI)
3166 {
3167 if (curr->data.intvalue == gi)
3168 {
3169 gotit = TRUE;
3170 break;
3171 }
3172 }
3173 }
3174 if (gotit)
3175 {
3176 best = SeqIdSelect(ids, pick_order, 20);
3177 sip = ValNodeExtract(&ids, (Int2)(best->choice));
3178 }
3179 SeqIdSetFree(ids);
3180 if (gotit)
3181 break;
3182
3183 }
3184 else
3185 AsnReadVal(aip, atp, NULL);
3186 if (! AsnGetLevel(aip)) /* finished reading a Seq-entry */
3187 break; /* failed */
3188 }
3189
3190 #ifdef _NEW_CdEntrez_
3191 if (_nouveau)
3192 cd3_CdDocAsnClose(aip);
3193 #endif
3194 #ifdef _OLD_CdEntrez_
3195 if (!_nouveau)
3196 CdDocAsnClose(aip);
3197 #endif
3198
3199 return sip;
3200 #endif
3201 }
3202
3203
3204
3205 /*****************************************************************************
3206 *
3207 * CdEntSeqSumListGet (result, numuid, db, uids)
3208 * returns a count of entries read
3209 * head of linked list is in result
3210 *
3211 *****************************************************************************/
3212
CdEntSeqSumListGet(DocSumPtr PNTR result,Int2 numuid,DocType db,Int4Ptr uids)3213 NLM_EXTERN Int2 CdEntSeqSumListGet (DocSumPtr PNTR result, Int2 numuid, DocType db, Int4Ptr uids) /* Gi numbers */
3214 {
3215 Int2 count = 0;
3216
3217 #ifdef _NEW_CdEntrez_
3218 if (_nouveau)
3219 {
3220 ASSERT(db != TYP_SEQ);
3221 count = CdDocSumListGet(result,numuid,db,uids);
3222 }
3223 #endif
3224
3225 #ifdef _OLD_CdEntrez_
3226 if (!_nouveau)
3227 {
3228 Int2 ctr;
3229 AsnIoPtr aip;
3230
3231 for (ctr = 0; ctr < numuid; ctr++)
3232 {
3233 result[ctr] = NULL;
3234 aip = CdDocAsnOpen (db, uids[ctr]);
3235 if (aip != NULL)
3236 {
3237 result[ctr] = CdSeqSumAsnRead(aip, uids[ctr]);
3238 CdDocAsnClose(aip);
3239 if (result[ctr] != NULL)
3240 count++;
3241 }
3242 }
3243 }
3244 #endif
3245
3246 return count;
3247 }
3248
3249 /*****************************************************************************
3250 *
3251 * CdEntSeqSumGet(uid, type)
3252 * get one SeqSummary
3253 *
3254 *****************************************************************************/
3255 #ifdef _OLD_CdEntrez_
3256
CdEntSeqSumGet(Int4 uid,DocType type)3257 static DocSumPtr NEAR CdEntSeqSumGet (Int4 uid, DocType type)
3258 {
3259 DocSumPtr dsp = NULL;
3260
3261 CdEntSeqSumListGet(&dsp, 1, type, &uid);
3262 return dsp;
3263 }
3264
3265 #endif
3266
CdSeqSumAsnRead(AsnIoPtr aip,DocUid uid)3267 NLM_EXTERN DocSumPtr CdSeqSumAsnRead (AsnIoPtr aip, DocUid uid)
3268
3269 {
3270 DataVal av;
3271 AsnModulePtr amp;
3272 AsnTypePtr atp;
3273 DocSumPtr dsp;
3274 Boolean goOn;
3275 Char caption [50];
3276 Char author [40];
3277 Char year [10];
3278 Char locus [40];
3279 Char cds [10];
3280 CharPtr chptr;
3281 Int2 proteins;
3282 CharPtr recentTitle;
3283 Boolean backbone;
3284 Boolean genBank;
3285 Boolean embl;
3286 Boolean ddbj;
3287 Boolean pir;
3288 Boolean swissprot;
3289 Boolean isaNA;
3290 Boolean isaAA;
3291 Boolean isaSEG;
3292 Boolean in_id;
3293 Int2 level;
3294
3295 if ((aip == NULL) || (! AllObjLoad ()))
3296 return NULL;
3297
3298 amp = AsnAllModPtr ();
3299
3300 FindAsnType (&SEQ_ENTRY, amp, "Seq-entry");
3301 FindAsnType (&SEQ_ENTRY_seq, amp, "Seq-entry.seq");
3302 FindAsnType (&SEQ_ENTRY_set, amp, "Seq-entry.set");
3303 FindAsnType (&TEXTSEQ_ID_name, amp, "Textseq-id.name");
3304 FindAsnType (&TEXTSEQ_ID_accession, amp, "Textseq-id.accession");
3305 FindAsnType (&AUTH_LIST_names_str_E, amp, "Auth-list.names.str.E");
3306 FindAsnType (&DATE_STD_year, amp, "Date-std.year");
3307 FindAsnType (&DATE_str, amp, "Date.str");
3308 FindAsnType (&SEQ_DESCR_E_title, amp, "Seq-descr.E.title");
3309 FindAsnType (&GIIMPORT_ID_id, amp, "Giimport-id.id");
3310 FindAsnType (&BIOSEQ_inst, amp, "Bioseq.inst");
3311 FindAsnType (&SEQ_INST_mol, amp, "Seq-inst.mol");
3312 FindAsnType (&SEQ_INST_repr, amp, "Seq-inst.repr");
3313 FindAsnType (&SEQ_ID_gibbsq, amp, "Seq-id.gibbsq");
3314 FindAsnType (&SEQ_ID_gibbmt, amp, "Seq-id.gibbmt");
3315 FindAsnType (&SEQ_ID_genbank, amp, "Seq-id.genbank");
3316 FindAsnType (&SEQ_ID_gi, amp, "Seq-id.gi");
3317 FindAsnType (&SEQ_ID_embl, amp, "Seq-id.embl");
3318 FindAsnType (&SEQ_ID_ddbj, amp, "Seq-id.ddbj");
3319 FindAsnType (&SEQ_ID_pir, amp, "Seq-id.pir");
3320 FindAsnType (&SEQ_ID_swissprot, amp, "Seq-id.swissprot");
3321 FindAsnType (&PDB_BLOCK_compound_E, amp, "PDB-block.compound.E");
3322 FindAsnType (&PDB_SEQ_ID_MOL, amp, "PDB-seq-id.mol");
3323 FindAsnType (&BIOSEQ_id, amp, "Bioseq.id");
3324 FindAsnType (&CIT_PAT_title, amp, "Cit-pat.title");
3325
3326 atp = AsnReadId (aip, amp, SEQ_ENTRY);
3327 AsnReadVal (aip, atp, &av);
3328
3329 atp = AsnReadId (aip, amp, atp);
3330 AsnReadVal (aip, atp, &av);
3331
3332 dsp = MemNew (sizeof (DocSum));
3333 if (dsp != NULL) {
3334 dsp->no_abstract = TRUE;
3335 dsp->translated_title = FALSE;
3336 dsp->no_authors = TRUE;
3337 author [0] = '\0';
3338 year [0] = '\0';
3339 locus [0] = '\0';
3340 cds [0] = '\0';
3341 proteins = 1;
3342 recentTitle = NULL;
3343 backbone = FALSE;
3344 genBank = FALSE;
3345 embl = FALSE;
3346 ddbj = FALSE;
3347 pir = FALSE;
3348 swissprot = FALSE;
3349 isaNA = FALSE;
3350 isaAA = FALSE;
3351 isaSEG = FALSE;
3352 in_id = FALSE;
3353 goOn = TRUE;
3354 level = AsnGetLevel (aip);
3355 while (goOn) {
3356 atp = AsnReadId (aip, amp, atp);
3357 if (atp == SEQ_ENTRY_seq || atp == SEQ_ENTRY_set) {
3358 AsnReadVal (aip, atp, NULL);
3359 if (AsnGetLevel (aip) <= level) {
3360 goOn = FALSE;
3361 }
3362 } else if (atp == BIOSEQ_id) {
3363 AsnReadVal (aip, atp, &av);
3364 if (in_id) {
3365 in_id = FALSE;
3366 } else {
3367 in_id = TRUE;
3368 }
3369 } else if (in_id && ((atp == TEXTSEQ_ID_name) ||
3370 (atp == PDB_SEQ_ID_MOL))) {
3371 AsnReadVal (aip, atp, &av);
3372 if (locus [0] == '\0') {
3373 StringNCpy (locus, (CharPtr) av.ptrvalue, sizeof (locus));
3374 }
3375 AsnKillValue (atp, &av);
3376 } else if (in_id && (atp == TEXTSEQ_ID_accession)) {
3377 AsnReadVal (aip, atp, &av);
3378 if (locus [0] == '\0') {
3379 StringNCpy (locus, (CharPtr) av.ptrvalue, sizeof (locus));
3380 }
3381 AsnKillValue (atp, &av);
3382 } else if (atp == AUTH_LIST_names_str_E) {
3383 AsnReadVal (aip, atp, &av);
3384 if (author [0] == '\0') {
3385 StringNCpy (author, (CharPtr) av.ptrvalue, sizeof (author));
3386 }
3387 AsnKillValue (atp, &av);
3388 } else if (atp == DATE_STD_year) {
3389 AsnReadVal (aip, atp, &av);
3390 sprintf (year, "%ld", (long) av.intvalue);
3391 } else if (atp == DATE_str) {
3392 AsnReadVal (aip, atp, &av);
3393 StringNCpy (year, (CharPtr) av.ptrvalue, sizeof (year));
3394 AsnKillValue (atp, &av);
3395 } else if ((atp == SEQ_DESCR_E_title) ||
3396 (atp == PDB_BLOCK_compound_E) || (atp == CIT_PAT_title)) {
3397 AsnReadVal (aip, atp, &av);
3398 if (*((CharPtr)av.ptrvalue) != '\0')
3399 {
3400 if (recentTitle != NULL) {
3401 recentTitle = MemFree (recentTitle);
3402 }
3403
3404 if (dsp->uid == uid && dsp->title == NULL &&
3405 atp != CIT_PAT_title) {
3406 dsp->title = (CharPtr)av.ptrvalue;
3407 }
3408 else
3409 recentTitle = (CharPtr)av.ptrvalue;
3410 }
3411 else
3412 AsnKillValue (atp, &av);
3413 } else if (atp == GIIMPORT_ID_id || atp == SEQ_ID_gi) {
3414 AsnReadVal (aip, atp, &av);
3415 if (av.intvalue == uid) {
3416 dsp->uid = uid;
3417 }
3418 } else if (atp == SEQ_INST_mol) {
3419 AsnReadVal (aip, atp, &av);
3420 if ((! isaNA) && (! isaAA) && dsp->uid == uid) {
3421 isaNA = (Boolean) ISA_na (av.intvalue);
3422 isaAA = (Boolean) ISA_aa (av.intvalue);
3423 if (isaAA && cds [0] == '\0') {
3424 sprintf (cds, " cds%d", (int) proteins);
3425 }
3426 }
3427 if (ISA_aa (av.intvalue)) {
3428 proteins++;
3429 }
3430 } else if (atp == SEQ_INST_repr) {
3431 AsnReadVal (aip, atp, &av);
3432 if (av.intvalue == Seq_repr_seg) {
3433 isaSEG = TRUE;
3434 }
3435 } else if (atp == BIOSEQ_inst) {
3436 AsnReadVal (aip, atp, NULL);
3437 if (dsp->uid == uid && dsp->title == NULL) {
3438 dsp->title = recentTitle;
3439 recentTitle = NULL;
3440 }
3441 } else if (atp == SEQ_ID_gibbsq || atp == SEQ_ID_gibbmt) {
3442 AsnReadVal (aip, atp, NULL);
3443 backbone = TRUE;
3444 } else if (atp == SEQ_ID_genbank) {
3445 AsnReadVal (aip, atp, NULL);
3446 if (in_id)
3447 genBank = TRUE;
3448 } else if (atp == SEQ_ID_embl) {
3449 AsnReadVal (aip, atp, NULL);
3450 if (in_id)
3451 embl = TRUE;
3452 } else if (atp == SEQ_ID_ddbj) {
3453 AsnReadVal (aip, atp, NULL);
3454 if (in_id)
3455 ddbj = TRUE;
3456 } else if (atp == SEQ_ID_pir) {
3457 AsnReadVal (aip, atp, NULL);
3458 if (in_id)
3459 pir = TRUE;
3460 } else if (atp == SEQ_ID_swissprot) {
3461 AsnReadVal (aip, atp, NULL);
3462 if (in_id)
3463 swissprot = TRUE;
3464 } else {
3465 AsnReadVal (aip, atp, NULL);
3466 }
3467 if (dsp->title != NULL && dsp->uid == uid) {
3468 if (backbone) {
3469 if (author [0] != '\0' && year [0] != '\0') {
3470 goOn = FALSE;
3471 }
3472 } else if (genBank || embl || ddbj) {
3473 if (locus [0] != '\0') {
3474 if (isaAA && cds [0] != '\0') {
3475 goOn = FALSE;
3476 } else if (isaNA) {
3477 goOn = FALSE;
3478 }
3479 }
3480 } else if (pir) {
3481 if (locus [0] != '\0') {
3482 goOn = FALSE;
3483 }
3484 } else if (swissprot) {
3485 if (locus [0] != '\0') {
3486 goOn = FALSE;
3487 }
3488 } else if (embl) {
3489 }
3490 }
3491 }
3492 if (backbone) {
3493 chptr = StringChr (author, ',');
3494 if (chptr != NULL) {
3495 *chptr = '\0';
3496 }
3497 chptr = StringChr (year, ' ');
3498 if (chptr != NULL) {
3499 *chptr = '\0';
3500 }
3501 author [12] = '.';
3502 author [12] = '\0';
3503 sprintf (caption, "%s, %s", author, year);
3504 dsp->caption = StringSave (caption);
3505 } else if (genBank || embl || ddbj) {
3506 if (isaAA) {
3507 sprintf (caption, "%s%s", locus, cds);
3508 } else if (isaSEG) {
3509 sprintf (caption, "%s segs", locus);
3510 } else {
3511 sprintf (caption, "%s", locus);
3512 }
3513 dsp->caption = StringSave (caption);
3514 } else {
3515 sprintf (caption, "%s", locus);
3516 dsp->caption = StringSave (caption);
3517 }
3518 dsp->uid = uid;
3519 if (recentTitle != NULL) {
3520 recentTitle = MemFree (recentTitle);
3521 }
3522 }
3523 AsnIoReset (aip);
3524 return dsp;
3525 }
3526
3527 /*****************************************************************************
3528 *
3529 * CdEntrezFindSeqId(sip)
3530 * given a Seq-id, get the uid.
3531 * returns 0 on failure
3532 *
3533 *****************************************************************************/
CdEntrezFindSeqId(SeqIdPtr sip)3534 NLM_EXTERN Int4 CdEntrezFindSeqId (SeqIdPtr sip)
3535 {
3536 Int4 uid = 0;
3537 DocType db = -1;
3538 TextSeqIdPtr tsip;
3539 PDBSeqIdPtr psip;
3540 PatentSeqIdPtr patsip;
3541 CharPtr locus = NULL;
3542 Char localbuf[40];
3543 ValNodePtr lst;
3544 LinkSetPtr lsp;
3545 Boolean check_both, done;
3546 EntrezInfoPtr eip;
3547 Int4 index;
3548
3549 if ((eip = CdEntrezGetInfo()) != NULL && eip->field_count > FLD_SQID &&
3550 eip->types[TYP_NT].fields[FLD_SQID].num_terms > 0)
3551 {
3552 done = FALSE;
3553 check_both = TRUE;
3554 db = TYP_NT;
3555 SeqIdWrite(sip, localbuf, PRINTID_FASTA_LONG, sizeof(localbuf));
3556 while (! done) /* might need to check 2 types */
3557 {
3558 lst = CdEntTLNew(db);
3559 if (lst == NULL) return uid;
3560 CdEntTLAddTerm(lst, localbuf, db, FLD_SQID, TRUE, NULL);
3561 lsp = CdEntTLEval(lst);
3562 CdEntTLFree(lst);
3563 if (lsp != NULL)
3564 {
3565 for (index = 0; index < lsp->num; index++)
3566 { /* choose the highest one */
3567 if (lsp->uids[index] > uid)
3568 uid = lsp->uids[index];
3569 }
3570 LinkSetFree(lsp);
3571 }
3572 if ((! check_both) || (uid > 0))
3573 done = TRUE;
3574 else
3575 {
3576 if (db == TYP_AA)
3577 db = TYP_NT;
3578 else
3579 db = TYP_AA;
3580 check_both = FALSE;
3581 }
3582 }
3583 }
3584
3585 if (uid > 0)
3586 {
3587 return uid;
3588 }
3589
3590 check_both = FALSE;
3591 switch (sip->choice)
3592 {
3593 case SEQID_NOT_SET: /* not set */
3594 case SEQID_LOCAL: /* local */
3595 break;
3596 case SEQID_GIBBSQ: /* gibbsq */
3597 case SEQID_GIBBMT: /* gibbmt */
3598 sprintf(localbuf, "B%ld", (long)(sip->data.intvalue));
3599 locus = (CharPtr)localbuf;
3600 db = TYP_AA; /* guess it's a protein */
3601 check_both = TRUE;
3602 break; /* not on cdrom */
3603 case SEQID_GIIM: /* giim */
3604 uid = ((GiimPtr)sip->data.ptrvalue)->id;
3605 break;
3606 case SEQID_GI:
3607 uid = sip->data.intvalue;
3608 break;
3609 case SEQID_GENBANK: /* genbank */
3610 case SEQID_EMBL: /* embl */
3611 case SEQID_DDBJ:
3612 db = TYP_NT; /* guess it's a nucleic acid */
3613 check_both = TRUE;
3614 case SEQID_PIR: /* pir */
3615 case SEQID_SWISSPROT:
3616 case SEQID_PRF:
3617 if (db < 0)
3618 db = TYP_AA;
3619 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
3620 if (tsip->accession != NULL)
3621 locus = tsip->accession;
3622 else
3623 locus = tsip->name;
3624 break;
3625 case SEQID_PDB:
3626 psip = (PDBSeqIdPtr)(sip->data.ptrvalue);
3627 if (psip->chain == '\0' || psip->chain == ' ')
3628 StrCpy (localbuf, psip->mol);
3629 else
3630 sprintf(localbuf, "%s-%c", psip->mol, (Char)psip->chain);
3631 locus = localbuf;
3632 db = TYP_AA; /* guess protein */
3633 check_both = TRUE;
3634 break;
3635 case SEQID_PATENT:
3636 patsip = (PatentSeqIdPtr)(sip->data.ptrvalue);
3637 sprintf(localbuf, "%s%s %d", patsip->cit->country, patsip->cit->number,
3638 (int)patsip->seqid);
3639 locus = localbuf;
3640 db = TYP_AA; /* guess protein */
3641 check_both = TRUE;
3642 break;
3643 default:
3644 break;
3645 }
3646
3647 if ((! uid) && (locus != NULL)) /* got a term to find */
3648 {
3649 done = FALSE;
3650 while (! done) /* might need to check 2 types */
3651 {
3652 lst = CdEntTLNew(db);
3653 if (lst == NULL) return uid;
3654 CdEntTLAddTerm(lst, locus, db, FLD_ACCN, TRUE, NULL);
3655 lsp = CdEntTLEval(lst);
3656 CdEntTLFree(lst);
3657 if (lsp != NULL)
3658 {
3659 for (index = 0; index < lsp->num; index++)
3660 { /* choose the highest one */
3661 if (lsp->uids[index] > uid)
3662 uid = lsp->uids[index];
3663 }
3664 LinkSetFree(lsp);
3665 }
3666 if ((! check_both) || (uid > 0))
3667 done = TRUE;
3668 else
3669 {
3670 if (db == TYP_AA)
3671 db = TYP_NT;
3672 else
3673 db = TYP_AA;
3674 check_both = FALSE;
3675 }
3676 }
3677 }
3678
3679 return uid;
3680 }
3681
3682 #ifdef Biostruc_supported
CdEntrezBiostrucGet(DocUid uid,Int4 mdlLvl,Int4 maxModels)3683 NLM_EXTERN BiostrucPtr CdEntrezBiostrucGet (DocUid uid, Int4 mdlLvl, Int4 maxModels)
3684 {
3685 Biostruc *struc = NULL;
3686 AsnIo *stream = NULL;
3687
3688 if (! BiostrucAvail ()) return NULL;
3689 stream = cd3_CdDocAsnOpen(TYP_ST,uid);
3690 if (stream != NULL)
3691 {
3692 struc = BiostrucAsnGet(stream,NULL, mdlLvl, maxModels);
3693 cd3_CdDocAsnClose(stream);
3694 }
3695 return struc;
3696 }
3697
3698
3699 #ifdef OS_UNIX
3700
CdEntrezBiostrucAnnotSetGet(DocUid uid)3701 NLM_EXTERN BiostrucAnnotSetPtr CdEntrezBiostrucAnnotSetGet (DocUid uid)
3702 {
3703 BiostrucAnnotSetPtr retval = NULL;
3704 AsnIoPtr aip;
3705 FILE *pipe;
3706 char command[PATH_MAX+5];
3707 char fname[PATH_MAX];
3708
3709 if (CdMountEntrezVolume(1,fname,PATH_MAX-32))
3710 {
3711 sprintf(strchr(fname,0), "/vast/%ld.bas.Z", (long) uid);
3712 if (FileLength(fname) <= 0)
3713 {
3714 return NULL;
3715 }
3716 sprintf(command,"zcat %s", fname);
3717 if ((pipe=popen(command,"r")) ==NULL)
3718 {
3719 ErrPostEx(SEV_ERROR,0,0,"Unable to open pipe [%s]",command);
3720 return NULL;
3721 }
3722 aip = AsnIoNew(ASNIO_TEXT_IN, pipe, NULL, NULL, NULL);
3723 if (aip != NULL)
3724 {
3725 retval = BiostrucAnnotSetAsnRead(aip, NULL);
3726 }
3727 AsnIoFree(aip,FALSE);
3728 pclose(pipe);
3729 }
3730 return retval;
3731 }
3732
3733 #else
3734
CdEntrezBiostrucAnnotSetGet(DocUid uid)3735 NLM_EXTERN BiostrucAnnotSetPtr CdEntrezBiostrucAnnotSetGet (DocUid uid)
3736 {
3737 return NULL;
3738 }
3739
3740 #endif
3741
3742
CdEntrezBiostrucAnnotSetGetByFid(DocUid mmdbid,Int4 feature_id,Int4 feature_set_id)3743 NLM_EXTERN BiostrucAnnotSetPtr LIBCALL CdEntrezBiostrucAnnotSetGetByFid (DocUid mmdbid, Int4 feature_id, Int4 feature_set_id)
3744 {
3745 BiostrucAnnotSetPtr basp = CdEntrezBiostrucAnnotSetGet (mmdbid);
3746 BiostrucAnnotSetPtr basp2 = NULL;
3747 BiostrucFeatureSetPtr pbsfs = NULL;
3748 BiostrucFeaturePtr pbsf = NULL;
3749
3750 if (basp == NULL)
3751 return NULL;
3752
3753 pbsfs = basp->features;
3754 while (pbsfs)
3755 {
3756 if (pbsfs->id == feature_set_id)
3757 {
3758 pbsf = pbsfs->features;
3759 while(pbsf)
3760 {
3761 if (pbsf->id == feature_id)
3762 { /* found it */
3763 basp2 = BiostrucAnnotSetNew();
3764 basp2->id = basp->id;
3765 basp2->descr = basp->descr;
3766 basp->descr = NULL; /* unlink the descr from basp object */
3767 basp2->features = BiostrucFeatureSetNew();
3768 basp2->features->id = pbsfs->id;
3769 basp2->features->descr = pbsfs->descr;
3770 pbsfs->descr = NULL; /* unlink the feature-set descr from basp object */
3771 basp2->features->features = BiostrucFeatureNew();
3772 basp2->features->features->id = pbsf->id;
3773 basp2->features->features->name = StringSave(pbsf->name);
3774 basp2->features->features->type = pbsf->type;
3775 basp2->features->features->Property_property = pbsf->Property_property;
3776 pbsf->Property_property = NULL; /* unlink the property from basp object */
3777 basp2->features->features->Location_location = pbsf->Location_location;
3778 pbsf->Location_location = NULL; /* unlink the location from basp object */
3779 BiostrucAnnotSetFree(basp);
3780 return basp2;
3781 }
3782 pbsf = pbsf->next;
3783 }
3784 }
3785 pbsfs = pbsfs->next;
3786 }
3787
3788 BiostrucAnnotSetFree(basp);
3789 return basp2;
3790 }
3791
3792
CdEntrezBiostrucFeatIds(DocUid mmdbid,Int2 feature_type,Int4 feature_set_id)3793 NLM_EXTERN LinkSetPtr LIBCALL CdEntrezBiostrucFeatIds(DocUid mmdbid, Int2 feature_type, Int4 feature_set_id)
3794 {
3795 BiostrucAnnotSetPtr basp = CdEntrezBiostrucAnnotSetGet (mmdbid);
3796 LinkSetPtr retval = NULL;
3797 Int4Ptr ids = NULL;
3798 Int4Ptr scores = NULL;
3799 Int4 count = 0;
3800 BiostrucFeatureSetPtr pbsfs = NULL;
3801 BiostrucFeaturePtr pbsf = NULL;
3802 ChemGraphAlignmentPtr pcga = NULL;
3803
3804
3805 if (basp == NULL)
3806 return NULL;
3807
3808 /* count the number of features of type feature_type */
3809 pbsfs = basp->features;
3810 while (pbsfs)
3811 {
3812 if (pbsfs->id == feature_set_id)
3813 {
3814 pbsf = pbsfs->features;
3815 while(pbsf)
3816 {
3817 if (pbsf->type == feature_type)
3818 {
3819 count++;
3820 }
3821 pbsf = pbsf->next;
3822 }
3823 }
3824 pbsfs = pbsfs->next;
3825 }
3826
3827 /* allocate vectors for ids, scores iff alignment data */
3828
3829 ids = (Int4Ptr) MemNew(sizeof(Int4) * count);
3830 if (feature_type == 200) /* NCBI alignments */
3831 scores = (Int4Ptr) MemNew(sizeof(Int4) * count);
3832
3833 count = 0;
3834 /* collect the feature-id's and scores */
3835 pbsfs = basp->features;
3836 while (pbsfs)
3837 {
3838 if (pbsfs->id == feature_set_id)
3839 {
3840 pbsf = pbsfs->features;
3841 while(pbsf)
3842 {
3843 if (pbsf->type == feature_type)
3844 {
3845
3846 ids[count] = pbsf->id;
3847 if (feature_type == 200) /* alignment type id */
3848 {
3849 pcga = (ChemGraphAlignmentPtr) pbsf->Location_location->data.ptrvalue;
3850 scores[count] = pcga->aligndata->vast_mlogp; /* an Int4 already */
3851 }
3852 count++;
3853 }
3854 pbsf = pbsf->next;
3855 } /* while feature */
3856 retval = LinkSetNew();
3857 retval->num = count;
3858 retval->uids = ids;
3859 retval->weights = scores;
3860 MemFree(basp);
3861 return retval;
3862 } /* if feature_set_id */
3863 pbsfs = pbsfs->next;
3864 } /* while feature_set */
3865 MemFree(basp);
3866 return NULL;
3867 }
3868 #endif /* Biostruc_supported */
3869