1 /* casn.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * RCS $Id: casn.c,v 6.1 2001/04/27 18:00:30 juran Exp $
27 *
28 * Author: Greg Schuler
29 *
30 * Version Creation Date: 9/23/92
31 *
32 * File Description: functions to decompress a compressed ASN.1 (CASN) file.
33 *
34 * Modifications:
35 * --------------------------------------------------------------------------
36 * Date Name Description of modification
37 * ------- ---------- -----------------------------------------------------
38 * 04-21-93 Schuler CASN_ReadBuff declared as LIBCALLBACK
39 * 06-28-93 Schuler New function: CASN_Seek().
40 * 06-17-94 Schuler Modified to support new file format that is to debut
41 * in Entrez release 13.0
42 * 07-20-94 Schuler Fixed bug in CASN_Open (incorrect doc_type)
43 * 09-07-94 Schuler Changed implementation of rd_string (one byte length)
44 * 10-05-94 Schuler Added CASN_NextBiostruc
45 *
46 * 05-19-95 Schuler Added rcs Log directive for automatic insertion of
47 * modification comments.
48 *
49 * Revision $Log: casn.c,v $
50 * Revision Revision 6.1 2001/04/27 18:00:30 juran
51 * Revision Warnings.
52 * Revision
53 * Revision Revision 6.0 1997/08/25 18:12:41 madden
54 * Revision Revision changed to 6.0
55 * Revision
56 * Revision Revision 5.2 1997/06/26 21:55:21 vakatov
57 * Revision [PC] DLL'd "ncbicdr.lib", "ncbiacc.lib", "ncbinacc.lib" and "ncbicacc.lib"
58 * Revision
59 * Revision Revision 5.1 1997/05/29 18:17:11 savchuk
60 * Revision CASN_NextSeqEntry() function is now seeking to the end of compressed ASN
61 * Revision
62 * Revision 5.0 1996/05/28 13:55:34 ostell
63 * Set to revision 5.0
64 *
65 * Revision 4.0 1995/07/26 13:50:32 ostell
66 * force revision to 4.0
67 *
68 * Revision 2.11 1995/06/23 16:02:43 kans
69 * support for accmmdbs.c stub to resolve symbols without MMDB link
70 *
71 * Revision 2.10 1995/06/23 13:22:25 kans
72 * Biostruc_CD_supported symbol needed for local MMDB access
73 *
74 * Revision 2.9 1995/05/16 14:36:20 schuler
75 * Automatic comment insertion enabled
76 *
77 *
78 * ==========================================================================
79 */
80
/* Revision string for this file; the $Revision$ keyword is expanded by RCS. */
#define REVISION_STR "$Revision: 6.1 $"
82
83
84 #include <asn.h>
85 #include <casn.h>
86
87 struct casn_ioblock
88 {
89 short rel_major;
90 short rel_minor;
91 int magic;
92 int format;
93 int compr;
94 long bytes;
95 CASN_Type doc_type;
96 long doc_count;
97 long uid_min;
98 long uid_max;
99 int huff_count;
100 short *huff_left;
101 short *huff_right;
102 unsigned byte;
103 unsigned mask;
104 FILE *fd;
105 AsnIo *aio;
106 AsnModule *amp;
107 AsnType *atp;
108 };
109
110
111 #define CURRENT_FILEFORMAT 2
112 #define MAGIC_FILEFORMAT 4541
113
114 #define MAGIC_IOBLOCK 3958
115 #define Handle_IsValid(x) ((x) && ((x)->magic == MAGIC_IOBLOCK))
116
117 enum CASN_Compr { CASN_ComprNone, CASN_ComprHuff };
118
119 static char * _asn_type[] = { "", "Medline-entry", "Seq-entry" };
120 static char * file_emsg = "Unrecognized compressed file format [%s]\n";
121
122 static int compr_none_read (CASN_Handle handle, char *buff, int count);
123 static int compr_huff_read (CASN_Handle handle, char *buff, int count);
124
125
126 static Int2 LIBCALLBACK CASN_ReadBuff(Pointer param, CharPtr buffer, Uint2 count);
127
128 static char * rd_string (FILE *fd);
129 static unsigned long rd_integer (FILE *fd, int bytes);
130 #define RD_SHORT(f) (short)rd_integer(fd,2)
131 #define RD_USHORT(f) (unsigned short)rd_integer(fd,2)
132 #define RD_INT(f) (int)rd_integer(fd,2)
133 #define RD_UINT(f) (unsigned int)rd_integer(fd,2)
134 #define RD_LONG(f) (long)rd_integer(fd,4)
135 #define RD_ULONG(f) rd_integer(fd,4)
136
137
138
139 /* --------------- High-Level Functions --------------- */
140
CASN_Open(char * fname)141 NLM_EXTERN CASN_Handle LIBCALL CASN_Open (char *fname)
142 {
143 int i, j;
144 CASN_Handle handle;
145 FILE *fd =NULL;
146 int doc_type;
147 long l1, l2, l3;
148 int huff_count;
149 short rel_major =0, rel_minor =0;
150
151 if (!(fd = FileOpen(fname,"rb")))
152 {
153 ErrPostEx(SEV_ERROR,CASN_ErrFileOpen,0,"Unable to open file %s\n",fname);
154 return NULL;
155 }
156
157 /* check to see that the file is recognizable */
158 i = RD_SHORT(fd);
159 j = RD_SHORT(fd);
160 if (i != MAGIC_FILEFORMAT || j > CURRENT_FILEFORMAT)
161 {
162 FileClose(fd);
163 ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,fname);
164 return NULL;
165 }
166 if (j == CURRENT_FILEFORMAT)
167 {
168 rel_major = RD_SHORT(fd);
169 rel_minor = RD_SHORT(fd);
170 }
171
172 l1 = RD_LONG(fd);
173 l2 = RD_LONG(fd);
174 l3 = RD_LONG(fd);
175
176 doc_type = RD_SHORT(fd);
177 if (j<CURRENT_FILEFORMAT)
178 doc_type = -doc_type;
179 huff_count = RD_SHORT(fd);
180
181 if (!(handle = CASN_New((CASN_Type)doc_type,huff_count)))
182 {
183 FileClose(fd);
184 return NULL;
185 }
186 handle->format = j;
187 handle->huff_count = huff_count;
188 for (i=0; i<huff_count; ++i)
189 handle->huff_left[i] = RD_SHORT(fd);
190 for (i=0; i<huff_count; ++i)
191 handle->huff_right[i] = RD_SHORT(fd);
192
193 if (!(handle->aio = AsnIoNew(ASNIO_BIN_IN,fd,handle,CASN_ReadBuff,NULL)))
194 {
195 FileClose(fd);
196 CASN_Free(handle);
197 return NULL;
198 }
199 handle->aio->fname = StrSave(fname);
200 handle->fd = fd;
201 handle->amp = AsnAllModPtr();
202 handle->doc_count = l1;
203 handle->uid_min = l2;
204 handle->uid_max = l3;
205 handle->rel_major = rel_major;
206 handle->rel_minor = rel_minor;
207
208 if (handle->format == CURRENT_FILEFORMAT)
209 {
210 char *asntype;
211 /* skip over some things */
212 MemFree((void*)rd_string(fd));
213 rd_integer(fd,2);
214 MemFree((void*)rd_string(fd));
215 MemFree((void*)rd_string(fd));
216 asntype = rd_string(fd);
217 handle->atp = AsnTypeFind(handle->amp,asntype);
218 MemFree((void*)asntype);
219 }
220 else
221 {
222 handle->atp = AsnTypeFind(handle->amp,_asn_type[-doc_type]);
223 }
224 return handle;
225 }
226
CASN_Close(CASN_Handle handle)227 NLM_EXTERN void LIBCALL CASN_Close (CASN_Handle handle)
228 {
229 ASSERT(Handle_IsValid(handle));
230 AsnIoClose(handle->aio);
231 CASN_Free(handle);
232 }
233
CASN_GetAsnIoPtr(CASN_Handle handle)234 NLM_EXTERN AsnIo* LIBCALL CASN_GetAsnIoPtr (CASN_Handle handle)
235 {
236 ASSERT(Handle_IsValid(handle));
237 return handle->aio;
238 }
239
CASN_DocType(CASN_Handle handle)240 NLM_EXTERN CASN_Type LIBCALL CASN_DocType (CASN_Handle handle)
241 {
242 ASSERT(Handle_IsValid(handle));
243 return handle->doc_type;
244 }
245
CASN_DocCount(CASN_Handle handle)246 NLM_EXTERN long LIBCALL CASN_DocCount (CASN_Handle handle)
247 {
248 ASSERT(Handle_IsValid(handle));
249 return handle->doc_count;
250 }
251
CASN_NextMedlineEntry(CASN_Handle handle)252 NLM_EXTERN MedlineEntry* LIBCALL CASN_NextMedlineEntry (CASN_Handle handle)
253 {
254 AsnTypePtr atp;
255
256 ASSERT(Handle_IsValid(handle));
257 atp = AsnReadId(handle->aio,handle->amp,handle->atp);
258 return atp ? MedlineEntryAsnRead(handle->aio,atp) : NULL;
259 }
260
261
CASN_NextSeqEntry(CASN_Handle handle)262 NLM_EXTERN SeqEntry* LIBCALL CASN_NextSeqEntry (CASN_Handle handle)
263 {
264 AsnTypePtr atp;
265
266 ASSERT(Handle_IsValid(handle));
267 if ((atp = AsnReadId(handle->aio, handle->amp, handle->atp))) {
268 SeqEntryPtr sep = SeqEntryAsnRead(handle->aio, atp);
269 while(handle->compr != -1) {
270 char buf[4];
271 compr_huff_read(handle, buf, 1);
272 }
273 return sep;
274 }
275 return NULL;
276 }
277
278
279 #ifdef Biostruc_supported
CASN_NextBiostruc(CASN_Handle handle)280 NLM_EXTERN Biostruc* LIBCALL CASN_NextBiostruc (CASN_Handle handle)
281 {
282 AsnTypePtr atp;
283
284 if (! BiostrucAvail ()) return NULL;
285 ASSERT(Handle_IsValid(handle));
286 atp = AsnReadId(handle->aio,handle->amp,handle->atp);
287 return atp ? BiostrucAsnRead(handle->aio,atp) : NULL;
288 }
289 #endif
290
CASN_Seek(CASN_Handle handle,long offset,int origin)291 NLM_EXTERN int LIBCALL CASN_Seek (CASN_Handle handle, long offset, int origin)
292 {
293 ASSERT(Handle_IsValid(handle));
294 handle->compr = -1; /* to reset the Huffman state */
295 AsnIoReset(handle->aio); /* to reset the ASN state */
296 return fseek(handle->fd,offset,origin);
297 }
298
299
300 /* --------------- Low-Level Functions --------------- */
301
CASN_New(CASN_Type doc_type,int huff_count)302 NLM_EXTERN CASN_Handle LIBCALL CASN_New (CASN_Type doc_type, int huff_count)
303 {
304 CASN_Handle handle;
305 short *left;
306 short *right;
307
308 if (!(handle = (CASN_Handle) MemNew(sizeof(struct casn_ioblock))))
309 return NULL;
310 if (!(left = (short*) MemNew(huff_count*sizeof(short))))
311 return NULL;
312 if (!(right = (short*) MemNew(huff_count*sizeof(short))))
313 return NULL;
314
315 handle->magic = MAGIC_IOBLOCK;
316 handle->doc_type = doc_type;
317 handle->compr = -1;
318 handle->huff_left = left;
319 handle->huff_right = right;
320 return handle;
321 }
322
323
CASN_Free(CASN_Handle handle)324 NLM_EXTERN void LIBCALL CASN_Free (CASN_Handle handle)
325 {
326 ASSERT(Handle_IsValid(handle));
327 MemFree(handle->huff_left);
328 MemFree(handle->huff_right);
329 MemFree(handle);
330 }
331
332
CASN_ReadBuff(Pointer param,CharPtr buff,Uint2 count)333 static Int2 LIBCALLBACK CASN_ReadBuff(Pointer param, CharPtr buff, Uint2 count)
334 {
335 CASN_Handle handle = (CASN_Handle) param;
336 Int2 retval = 0;
337
338 ASSERT(Handle_IsValid(handle));
339
340 while (! retval) /* has to allow for 0 bytes from compressed read */
341 {
342 if (handle->compr < 0)
343 {
344 Int2 c;
345
346 /* read the "decompression protocol identifier" */
347 if ((c = fgetc(handle->fd)) == EOF)
348 return 0;
349
350 if (c == CASN_ComprNone)
351 {
352 handle->bytes = rd_integer(handle->fd,3);
353 }
354 else if (c == CASN_ComprHuff)
355 {
356 if (handle->format ==2)
357 rd_integer(handle->fd,3); /* justskip over it for now */
358 handle->byte = 0;
359 handle->mask = 0;
360 }
361 else
362 {
363 ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,"ReadBuff");
364 return 0;
365 }
366 handle->compr = c;
367 }
368
369 switch(handle->compr)
370 {
371 case CASN_ComprNone:
372 return compr_none_read(handle,buff,count);
373
374 case CASN_ComprHuff:
375 if ((retval = compr_huff_read(handle,buff,count)) !=0)
376 return retval;
377 break;
378
379 default:
380 ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,"ReadBuff");
381 return 0;
382 }
383 }
384
385 return 0;
386 }
387
388
/*
 * compr_none_read
 *
 * Copies data of an uncompressed record into 'buff': at most 'count'
 * bytes and never more than the record still holds (handle->bytes).
 * The remaining-byte counter is reduced by what was actually read, and
 * once it reaches zero the handle is switched back to "no protocol
 * selected" so the next read starts a new record.
 *
 * Returns the number of bytes read from the file.
 */
static int compr_none_read (CASN_Handle handle, char *buff, int count)
{
    size_t wanted;
    size_t got;

    ASSERT(Handle_IsValid(handle));

    wanted = (size_t) MIN(handle->bytes,(Int4)count);
    got = FileRead(buff,1,wanted,handle->fd);
    handle->bytes -= got;

    if (handle->bytes > 0)
        return (int)got;

    /* reset for stream read of next entry */
    handle->compr = -1;
    return (int)got;
}
404
405
/*
 * compr_huff_read
 *
 * Decodes up to 'count' bytes of the current Huffman-compressed record
 * into 'buff' and returns the number of bytes stored.
 *
 * Decoding walks the tree held in huff_left / huff_right starting at
 * node 0: a 1 bit follows huff_left, a 0 bit follows huff_right, bits
 * being taken from each input byte most significant first.  A negative
 * table value ends the walk and encodes the symbol (value + 257).  The
 * symbol equal to huff_count marks the end of the record; it resets
 * handle->compr to -1 and stops decoding.  Bit position and current
 * byte are saved in the handle so the next call continues mid-byte.
 *
 * Premature EOF is treated as end of record.  A node index that falls
 * outside the tables (possible only with a corrupt file) is now reported
 * and also treated as end of record instead of indexing past the tables.
 */
static int compr_huff_read (CASN_Handle handle, char *buff, int count)
{
    unsigned mask, byte;
    FILE *fd;
    char *p = buff;
    int i, cnt = 0;
    int c;
    int k;

    ASSERT(Handle_IsValid(handle));

    fd = handle->fd;
    mask = handle->mask;
    byte = handle->byte;

    while (cnt < count)
    {
        for (i=0; i>=0; )
        {
            if (i >= handle->huff_count)
            {
                /* table entry points outside the table: corrupt input */
                ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,
                        "Corrupt Huffman table in compressed ASN.1");
                i = handle->huff_count - 257;   /* decodes as end-of-record */
                break;
            }

            if (mask == 0)
            {
                if ((c = fgetc(fd)) == EOF)
                {
                    /* should never reach this point */
                    ErrPostEx(SEV_INFO,0,0,
                            "Unexpected EOF reading Huffman-compressed ASN.1");
                    i = handle->huff_count - 257;   /* decodes as end-of-record */
                    break;
                }
                else
                {
                    byte = (Uint2) c;
                    mask = 0x80;
                }
            }

            if (byte & mask)
                i = handle->huff_left[i];
            else
                i = handle->huff_right[i];

            mask >>= 1;
        }

        if ((k = i + 257) == handle->huff_count)
        {
            handle->compr = -1; /* reset for next record */
            break;
        }

        *p++ = (char) k;
        cnt++;
    }

    handle->mask = mask;
    handle->byte = byte;
    return cnt;
}
464
/*
 * rd_integer
 *
 * Reads up to 'bytes' bytes from 'fd' and assembles them into an
 * unsigned value, most significant byte first.  If end of file is hit
 * early, the value built from the bytes read so far is returned.  A
 * non-positive 'bytes' reads nothing and yields 0.
 */
static unsigned long rd_integer (FILE *fd, int bytes)
{
    unsigned long acc = 0;
    int remaining;

    for (remaining = bytes; remaining > 0; --remaining)
    {
        int ch = fgetc(fd);

        if (ch == EOF)
            break;
        acc = (acc << 8) | (unsigned long)ch;
    }
    return acc;
}
478
rd_string(FILE * fd)479 static char * rd_string (FILE *fd)
480 {
481 size_t len = (size_t) fgetc(fd);
482 if (len > 0)
483 {
484 char *str = MemGet(len+1,MGET_ERRPOST);
485 if (fread((void*)str,1,len,fd) != len)
486 {
487 ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,"File format error");
488 MemFree((void*)str);
489 return NULL;
490 }
491 *(str+len) = '\0';
492 return str;
493 }
494 return NULL;
495 }
496
497