1 /* casn.c
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * RCS $Id: casn.c,v 6.1 2001/04/27 18:00:30 juran Exp $
27  *
28  * Author:  Greg Schuler
29  *
30  * Version Creation Date: 9/23/92
31  *
32  * File Description:  functions to decompress a compressed ASN.1 (CASN) file.
33  *
34  * Modifications:
35  * --------------------------------------------------------------------------
36  * Date     Name        Description of modification
37  * -------  ----------  -----------------------------------------------------
38  * 04-21-93 Schuler     CASN_ReadBuff declared as LIBCALLBACK
39  * 06-28-93 Schuler     New function:  CASN_Seek().
40  * 06-17-94 Schuler     Modified to support new file format that is to debut
41  *                      in Entrez release 13.0
42  * 07-20-94 Schuler     Fixed bug in CASN_Open (incorrect doc_type)
43  * 09-07-94 Schuler     Changed implementation of rd_string (one byte length)
44  * 10-05-94 Schuler     Added CASN_NextBiostruc
45  *
46  * 05-19-95 Schuler     Added rcs Log directive for automatic insertion of
47  *                      modification comments.
48  *
49  * Revision $Log: casn.c,v $
50  * Revision Revision 6.1  2001/04/27 18:00:30  juran
51  * Revision Warnings.
52  * Revision
53  * Revision Revision 6.0  1997/08/25 18:12:41  madden
54  * Revision Revision changed to 6.0
55  * Revision
56  * Revision Revision 5.2  1997/06/26 21:55:21  vakatov
57  * Revision [PC] DLL'd "ncbicdr.lib", "ncbiacc.lib", "ncbinacc.lib" and "ncbicacc.lib"
58  * Revision
59  * Revision Revision 5.1  1997/05/29 18:17:11  savchuk
60  * Revision CASN_NextSeqEntry() function is now seeking to the end of compressed ASN
61  * Revision
62  * Revision 5.0  1996/05/28  13:55:34  ostell
63  * Set to revision 5.0
64  *
65  * Revision 4.0  1995/07/26  13:50:32  ostell
66  * force revision to 4.0
67  *
68  * Revision 2.11  1995/06/23  16:02:43  kans
69  * support for accmmdbs.c stub to resolve symbols without MMDB link
70  *
71  * Revision 2.10  1995/06/23  13:22:25  kans
72  * Biostruc_CD_supported symbol needed for local MMDB access
73  *
74  * Revision 2.9  1995/05/16  14:36:20  schuler
75  * Automatic comment insertion enabled
76  *
77  *
78  * ==========================================================================
79  */
80 
81 #define REVISION_STR "$Revision: 6.1 $"
82 
83 
84 #include <asn.h>
85 #include <casn.h>
86 
87 struct casn_ioblock
88 {
89 	short      rel_major;
90 	short      rel_minor;
91 	int        magic;
92 	int        format;
93 	int        compr;
94 	long	   bytes;
95 	CASN_Type  doc_type;
96 	long       doc_count;
97 	long       uid_min;
98 	long       uid_max;
99 	int        huff_count;
100 	short     *huff_left;
101 	short     *huff_right;
102 	unsigned   byte;
103 	unsigned   mask;
104 	FILE      *fd;
105 	AsnIo     *aio;
106 	AsnModule *amp;
107 	AsnType   *atp;
108 };
109 
110 
111 #define CURRENT_FILEFORMAT	2
112 #define MAGIC_FILEFORMAT	4541
113 
114 #define MAGIC_IOBLOCK		3958
115 #define Handle_IsValid(x)  ((x) && ((x)->magic == MAGIC_IOBLOCK))
116 
117 enum CASN_Compr { CASN_ComprNone, CASN_ComprHuff };
118 
119 static char * _asn_type[] = { "", "Medline-entry", "Seq-entry" };
120 static char * file_emsg = "Unrecognized compressed file format [%s]\n";
121 
122 static int compr_none_read (CASN_Handle handle, char *buff, int count);
123 static int compr_huff_read (CASN_Handle handle, char *buff, int count);
124 
125 
126 static Int2 LIBCALLBACK CASN_ReadBuff(Pointer param, CharPtr buffer, Uint2 count);
127 
/*
 * Header-reading helpers.
 *   rd_string   reads a string stored as a one-byte length followed by data
 *   rd_integer  reads `bytes` bytes, most significant byte first
 */
static char * rd_string (FILE *fd);
static unsigned long rd_integer (FILE *fd, int bytes);

/*
 * Typed wrappers over rd_integer.  Each macro reads from the stream given
 * as its argument, so the caller's variable does not have to be named `fd`.
 * Note that the INT/UINT variants read 2 bytes, like the SHORT variants.
 * Expansions are fully parenthesized so they compose safely in expressions.
 */
#define RD_SHORT(f)   ((short)rd_integer((f),2))
#define RD_USHORT(f)  ((unsigned short)rd_integer((f),2))
#define RD_INT(f)     ((int)rd_integer((f),2))
#define RD_UINT(f)    ((unsigned int)rd_integer((f),2))
#define RD_LONG(f)    ((long)rd_integer((f),4))
#define RD_ULONG(f)   (rd_integer((f),4))
136 
137 
138 
139 /* --------------- High-Level Functions --------------- */
140 
CASN_Open(char * fname)141 NLM_EXTERN CASN_Handle LIBCALL CASN_Open (char *fname)
142 {
143 	int	i, j;
144 	CASN_Handle handle;
145 	FILE *fd =NULL;
146 	int	doc_type;
147 	long	l1, l2, l3;
148 	int     huff_count;
149 	short rel_major =0, rel_minor =0;
150 
151 	if (!(fd = FileOpen(fname,"rb")))
152 	{
153 		ErrPostEx(SEV_ERROR,CASN_ErrFileOpen,0,"Unable to open file %s\n",fname);
154 		return NULL;
155 	}
156 
157 	/* check to see that the file is recognizable */
158 	i = RD_SHORT(fd);
159 	j = RD_SHORT(fd);
160 	if (i != MAGIC_FILEFORMAT  ||  j > CURRENT_FILEFORMAT)
161 	{
162 		FileClose(fd);
163 		ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,fname);
164 		return NULL;
165 	}
166 	if (j == CURRENT_FILEFORMAT)
167 	{
168 		rel_major = RD_SHORT(fd);
169 		rel_minor = RD_SHORT(fd);
170 	}
171 
172 	l1 = RD_LONG(fd);
173 	l2 = RD_LONG(fd);
174 	l3 = RD_LONG(fd);
175 
176 	doc_type = RD_SHORT(fd);
177 	if (j<CURRENT_FILEFORMAT)
178 		doc_type = -doc_type;
179 	huff_count = RD_SHORT(fd);
180 
181 	if (!(handle = CASN_New((CASN_Type)doc_type,huff_count)))
182 	{
183 		FileClose(fd);
184 		return NULL;
185 	}
186 	handle->format = j;
187 	handle->huff_count = huff_count;
188 	for (i=0; i<huff_count; ++i)
189 		handle->huff_left[i] = RD_SHORT(fd);
190 	for (i=0; i<huff_count; ++i)
191 		handle->huff_right[i] = RD_SHORT(fd);
192 
193 	if (!(handle->aio = AsnIoNew(ASNIO_BIN_IN,fd,handle,CASN_ReadBuff,NULL)))
194 	{
195 		FileClose(fd);
196 		CASN_Free(handle);
197 		return NULL;
198 	}
199 	handle->aio->fname = StrSave(fname);
200 	handle->fd = fd;
201 	handle->amp = AsnAllModPtr();
202 	handle->doc_count = l1;
203 	handle->uid_min = l2;
204 	handle->uid_max = l3;
205 	handle->rel_major = rel_major;
206 	handle->rel_minor = rel_minor;
207 
208 	if (handle->format == CURRENT_FILEFORMAT)
209 	{
210 		char *asntype;
211 		/* skip over some things */
212 		MemFree((void*)rd_string(fd));
213 		rd_integer(fd,2);
214 		MemFree((void*)rd_string(fd));
215 		MemFree((void*)rd_string(fd));
216 		asntype = rd_string(fd);
217 		handle->atp = AsnTypeFind(handle->amp,asntype);
218 		MemFree((void*)asntype);
219 	}
220 	else
221 	{
222 		handle->atp = AsnTypeFind(handle->amp,_asn_type[-doc_type]);
223 	}
224 	return handle;
225 }
226 
CASN_Close(CASN_Handle handle)227 NLM_EXTERN void LIBCALL CASN_Close (CASN_Handle handle)
228 {
229 	ASSERT(Handle_IsValid(handle));
230 	AsnIoClose(handle->aio);
231 	CASN_Free(handle);
232 }
233 
CASN_GetAsnIoPtr(CASN_Handle handle)234 NLM_EXTERN AsnIo* LIBCALL CASN_GetAsnIoPtr (CASN_Handle handle)
235 {
236 	ASSERT(Handle_IsValid(handle));
237 	return handle->aio;
238 }
239 
CASN_DocType(CASN_Handle handle)240 NLM_EXTERN CASN_Type LIBCALL CASN_DocType (CASN_Handle handle)
241 {
242 	ASSERT(Handle_IsValid(handle));
243 	return handle->doc_type;
244 }
245 
CASN_DocCount(CASN_Handle handle)246 NLM_EXTERN long LIBCALL CASN_DocCount (CASN_Handle handle)
247 {
248 	ASSERT(Handle_IsValid(handle));
249 	return handle->doc_count;
250 }
251 
CASN_NextMedlineEntry(CASN_Handle handle)252 NLM_EXTERN MedlineEntry* LIBCALL CASN_NextMedlineEntry (CASN_Handle handle)
253 {
254 	AsnTypePtr atp;
255 
256 	ASSERT(Handle_IsValid(handle));
257 	atp = AsnReadId(handle->aio,handle->amp,handle->atp);
258 	return atp ? MedlineEntryAsnRead(handle->aio,atp) : NULL;
259 }
260 
261 
CASN_NextSeqEntry(CASN_Handle handle)262 NLM_EXTERN SeqEntry* LIBCALL CASN_NextSeqEntry (CASN_Handle handle)
263 {
264 	AsnTypePtr atp;
265 
266 	ASSERT(Handle_IsValid(handle));
267 	if ((atp = AsnReadId(handle->aio, handle->amp, handle->atp))) {
268 	  SeqEntryPtr sep = SeqEntryAsnRead(handle->aio, atp);
269 	  while(handle->compr != -1) {
270 	    char buf[4];
271 	    compr_huff_read(handle, buf, 1);
272 	  }
273 	  return sep;
274 	}
275 	return NULL;
276 }
277 
278 
279 #ifdef Biostruc_supported
CASN_NextBiostruc(CASN_Handle handle)280 NLM_EXTERN Biostruc* LIBCALL CASN_NextBiostruc (CASN_Handle handle)
281 {
282 	AsnTypePtr atp;
283 
284 	if (! BiostrucAvail ()) return NULL;
285 	ASSERT(Handle_IsValid(handle));
286 	atp = AsnReadId(handle->aio,handle->amp,handle->atp);
287 	return atp ? BiostrucAsnRead(handle->aio,atp) : NULL;
288 }
289 #endif
290 
CASN_Seek(CASN_Handle handle,long offset,int origin)291 NLM_EXTERN int LIBCALL CASN_Seek (CASN_Handle handle, long offset, int origin)
292 {
293 	ASSERT(Handle_IsValid(handle));
294 	handle->compr = -1;          /* to reset the Huffman state */
295 	AsnIoReset(handle->aio);     /* to reset the ASN state */
296 	return fseek(handle->fd,offset,origin);
297 }
298 
299 
300 /* --------------- Low-Level Functions --------------- */
301 
CASN_New(CASN_Type doc_type,int huff_count)302 NLM_EXTERN CASN_Handle  LIBCALL CASN_New (CASN_Type doc_type, int huff_count)
303 {
304 	CASN_Handle handle;
305 	short *left;
306 	short *right;
307 
308 	if (!(handle = (CASN_Handle) MemNew(sizeof(struct casn_ioblock))))
309 		return NULL;
310 	if (!(left = (short*) MemNew(huff_count*sizeof(short))))
311 		return NULL;
312 	if (!(right = (short*) MemNew(huff_count*sizeof(short))))
313 		return NULL;
314 
315 	handle->magic = MAGIC_IOBLOCK;
316 	handle->doc_type = doc_type;
317 	handle->compr = -1;
318 	handle->huff_left = left;
319 	handle->huff_right = right;
320 	return handle;
321 }
322 
323 
CASN_Free(CASN_Handle handle)324 NLM_EXTERN void LIBCALL CASN_Free (CASN_Handle handle)
325 {
326 	ASSERT(Handle_IsValid(handle));
327 	MemFree(handle->huff_left);
328 	MemFree(handle->huff_right);
329 	MemFree(handle);
330 }
331 
332 
CASN_ReadBuff(Pointer param,CharPtr buff,Uint2 count)333 static Int2 LIBCALLBACK CASN_ReadBuff(Pointer param, CharPtr buff, Uint2 count)
334 {
335 	CASN_Handle handle = (CASN_Handle) param;
336 	Int2 retval = 0;
337 
338 	ASSERT(Handle_IsValid(handle));
339 
340 	while (! retval)   /* has to allow for 0 bytes from compressed read */
341 	{
342 		if (handle->compr < 0)
343 		{
344 			Int2 c;
345 
346 			/* read the "decompression protocol identifier" */
347 			if ((c = fgetc(handle->fd)) == EOF)
348 				return 0;
349 
350 			if (c == CASN_ComprNone)
351 			{
352 				handle->bytes = rd_integer(handle->fd,3);
353 			}
354 			else if (c == CASN_ComprHuff)
355 			{
356 				if (handle->format ==2)
357 					rd_integer(handle->fd,3);   /* justskip over it for now */
358 				handle->byte = 0;
359 				handle->mask = 0;
360 			}
361 			else
362 			{
363 				ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,"ReadBuff");
364 				return 0;
365 			}
366 			handle->compr = c;
367 		}
368 
369 		switch(handle->compr)
370 		{
371 			case CASN_ComprNone:
372 				return compr_none_read(handle,buff,count);
373 
374 			case CASN_ComprHuff:
375 				if ((retval = compr_huff_read(handle,buff,count)) !=0)
376 					return retval;
377 				break;
378 
379 			default:
380 				ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,"ReadBuff");
381 				return 0;
382 		}
383 	}
384 
385 	return 0;
386 }
387 
388 
/*
 * compr_none_read
 *   Copies bytes of an uncompressed record straight from the file.
 *
 *   Reads at most `count` bytes and never more than the record still has
 *   left (handle->bytes).  Once the remaining length reaches zero the
 *   handle is switched back to "no record in progress".
 *
 *   returns  number of bytes actually read
 */
static int compr_none_read (CASN_Handle handle, char *buff, int count)
{
	size_t want;
	size_t got;

	ASSERT(Handle_IsValid(handle));

	want = (size_t) MIN(handle->bytes,(Int4)count);
	got = FileRead(buff,1,want,handle->fd);

	handle->bytes -= got;
	if (handle->bytes <= 0)
		handle->compr = -1;   /* reset for stream read of next entry */

	return (int)got;
}
404 
405 
/*
 * compr_huff_read
 *   Decodes up to `count` bytes of a Huffman-compressed record into buff.
 *
 *   The tree is walked from node 0, one input bit per step: a set bit
 *   follows huff_left[], a clear bit follows huff_right[].  A negative
 *   link ends the walk, and link + 257 is the decoded symbol.  A symbol
 *   equal to huff_count marks the end of the record: decoding stops and
 *   the handle is switched back to "no record in progress".
 *
 *   The bit-reader state (current byte and mask) is kept in the handle, so
 *   a record can be decoded across several calls.
 *
 *   A non-negative link that does not fit the tables (corrupt file, or a
 *   handle created with empty tables) is reported and treated like the end
 *   of the record instead of being used as an index.
 *
 *   returns  number of bytes stored in buff; may be 0 when the end of the
 *            record is reached before any byte is decoded
 */
static int compr_huff_read (CASN_Handle handle, char *buff, int count)
{
	unsigned mask, byte;
	FILE *fd;
	char *p = buff;
	int i, cnt = 0;
	int c;
	int k;

	ASSERT(Handle_IsValid(handle));

	fd = handle->fd;
	mask = handle->mask;
	byte = handle->byte;

	while (cnt < count)
	{
		for (i=0; i>=0; )
		{
			if (i >= handle->huff_count)
			{
				/* link points outside the tables: refuse to follow it */
				ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,file_emsg,
					"Huffman table");
				i = handle->huff_count - 257;   /* decodes as end of record */
				break;
			}

			if (mask == 0)
			{
				/* current byte used up: fetch the next one */
				if ((c = fgetc(fd)) == EOF)
				{
					/* should never reach this point */
					ErrPostEx(SEV_INFO,0,0,
						"Unexpected EOF reading Huffman-compressed ASN.1");
					i = handle->huff_count - 257;   /* decodes as end of record */
					break;
				}
				else
				{
					byte = (unsigned) c;
					mask = 0x80;
				}
			}

			if (byte & mask)
				i = handle->huff_left[i];
			else
				i = handle->huff_right[i];

			mask >>= 1;
		}

		if ((k = i + 257) == handle->huff_count)
		{
			handle->compr = -1; /* reset for next record */
			break;
		}

		*p++ = (char) k;
		cnt++;
	}

	/* keep the bit position for the next call */
	handle->mask = mask;
	handle->byte = byte;
	return cnt;
}
464 
/*
 * rd_integer
 *   Reads `bytes` bytes from fd and combines them into one unsigned value,
 *   most significant byte first.
 *
 *   Reading stops early at end of file; the value built from the bytes
 *   obtained so far is returned (0 when none could be read).
 */
static unsigned long rd_integer (FILE *fd, int bytes)
{
	unsigned long acc = 0;
	int remaining;

	for (remaining = bytes; remaining > 0; --remaining)
	{
		int ch = fgetc(fd);

		if (ch == EOF)
			break;
		acc = (acc << 8) | (unsigned long)ch;
	}
	return acc;
}
478 
rd_string(FILE * fd)479 static char * rd_string (FILE *fd)
480 {
481 	size_t len = (size_t) fgetc(fd);
482 	if (len > 0)
483 	{
484 		char *str = MemGet(len+1,MGET_ERRPOST);
485 		if (fread((void*)str,1,len,fd) != len)
486 		{
487 			ErrPostEx(SEV_ERROR,CASN_ErrFileFormat,0,"File format error");
488 			MemFree((void*)str);
489 			return NULL;
490 		}
491 		*(str+len) = '\0';
492 		return str;
493 	}
494 	return NULL;
495 }
496 
497