1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #ifndef _tools_cg_load_file_h_
27 #define _tools_cg_load_file_h_
28
29 #include <kfs/file.h>
30 #include <kfs/directory.h>
31
32 #include <klib/container.h>
33 #include <klib/log.h>
34 #include <klib/rc.h>
35
36 #include "defs.h"
37 #include "writer-seq.h"
38 #include "writer-algn.h"
39 #include "writer-evidence-intervals.h"
40 #include "writer-evidence-dnbs.h"
41
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <strtol.h>
47
48
/* some useful utils */
/* str_chr
 *  strchr counterpart for a fixed-size buffer (not asciiZ!)
 *
 *  "str" [ IN ] and "len" [ IN ] - buffer to scan and its size in bytes;
 *  exactly "len" bytes are examined, a NUL byte does not stop the scan
 *
 *  "sep" [ IN ] - byte to look for
 *
 *  returns a pointer to the first byte equal to "sep",
 *  or NULL when none of the "len" bytes matches
 */
static __inline__ const char* str_chr(const char* str, const size_t len, char sep)
{
    size_t i;

    for( i = 0; i < len; i++ ) {
        if( str[i] == sep ) {
            return &str[i];
        }
    }
    return NULL;
}
62
63 static __inline__
str2buf(const char * str,const size_t len,char * buf,const size_t buf_sz)64 rc_t str2buf(const char* str, const size_t len, char* buf, const size_t buf_sz)
65 {
66 if( buf_sz <= len ) {
67 rc_t rc = RC(rcRuntime, rcString, rcCopying, rcBuffer, rcInsufficient);
68 if (rc != 0) {
69 PLOGERR(klogErr, (klogErr, rc, "'$(str)': $(sz) <= $(len)",
70 "str=%.*s,sz=%lu,len=%lu", len, str, buf_sz, len));
71 }
72 return rc;
73 }
74 memmove(buf, str, len);
75 buf[len] = '\0';
76 return 0;
77 }
78
79 static __inline__
str2unsigned(const char * str,const size_t len,uint64_t max,uint64_t * value)80 rc_t str2unsigned(const char* str, const size_t len, uint64_t max, uint64_t* value)
81 {
82 char* end;
83 int64_t q;
84
85 if( len == 0 ) {
86 return RC(rcRuntime, rcString, rcConverting, rcData, rcTooShort);
87 }
88 q = strtou64(str, &end, 10);
89 if( end - str != len ) {
90 return RC(rcRuntime, rcString, rcConverting, rcData, rcInvalid);
91 }
92 if( q < 0 || ( uint64_t ) q > max ) {
93 return RC(rcRuntime, rcString, rcConverting, rcData, rcOutofrange);
94 }
95 *value = q;
96 return 0;
97 }
98 static __inline__
str2u64(const char * str,const size_t len,uint64_t * value)99 rc_t str2u64(const char* str, const size_t len, uint64_t* value)
100 {
101 rc_t rc;
102 uint64_t q;
103
104 if( (rc = str2unsigned(str, len, -1, &q)) == 0 ) {
105 *value = q;
106 }
107 return rc;
108 }
109
110 static __inline__
str2signed(const char * str,const size_t len,int64_t min,int64_t max,int64_t * value)111 rc_t str2signed(const char* str, const size_t len, int64_t min, int64_t max, int64_t* value)
112 {
113 char* end;
114 int64_t q;
115
116 if( len == 0 ) {
117 return RC(rcRuntime, rcString, rcConverting, rcData, rcTooShort);
118 }
119 q = strtoi64(str, &end, 10);
120 if( end - str != len ) {
121 return RC(rcRuntime, rcString, rcConverting, rcData, rcInvalid);
122 }
123 if( q < min || q > max ) {
124 return RC(rcRuntime, rcString, rcConverting, rcData, rcOutofrange);
125 }
126 *value = q;
127 return 0;
128 }
129 static __inline__
str2i64(const char * str,const size_t len,int64_t * value)130 rc_t str2i64(const char* str, const size_t len, int64_t* value)
131 {
132 rc_t rc;
133 int64_t q;
134
135 #if _ARCH_BITS == 32
136 if( (rc = str2signed(str, len, -0x7FFFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL, &q)) == 0 ) {
137 *value = q;
138 }
139 #else
140 if( (rc = str2signed(str, len, -0x7FFFFFFFFFFFFFFFL, 0x7FFFFFFFFFFFFFFFL, &q)) == 0 ) {
141 *value = q;
142 }
143 #endif
144 return rc;
145 }
146 static __inline__
str2i32(const char * str,const size_t len,int32_t * value)147 rc_t str2i32(const char* str, const size_t len, int32_t* value)
148 {
149 rc_t rc;
150 int64_t q;
151
152 if( (rc = str2signed(str, len, -0x7FFFFFFF - 1, 0x7FFFFFFF, &q)) == 0 ) {
153 *value = ( int32_t ) q;
154 }
155 return rc;
156 }
157 static __inline__
str2u32(const char * str,const size_t len,uint32_t * value)158 rc_t str2u32(const char* str, const size_t len, uint32_t* value)
159 {
160 rc_t rc;
161 uint64_t q;
162
163 if( (rc = str2unsigned(str, len, 0xFFFFFFFF, &q)) == 0 ) {
164 *value = ( uint32_t ) q;
165 }
166 return rc;
167 }
168 static __inline__
str2i16(const char * str,const size_t len,int16_t * value)169 rc_t str2i16(const char* str, const size_t len, int16_t* value)
170 {
171 rc_t rc;
172 int64_t q;
173
174 if( (rc = str2signed(str, len, -0x7FFF - 1, 0x7FFF, &q)) == 0 ) {
175 *value = ( int16_t ) q;
176 }
177 return rc;
178 }
179 static __inline__
str2u16(const char * str,const size_t len,uint16_t * value)180 rc_t str2u16(const char* str, const size_t len, uint16_t* value)
181 {
182 rc_t rc;
183 uint64_t q;
184
185 if( (rc = str2unsigned(str, len, 0xFFFF, &q)) == 0 ) {
186 *value = ( uint16_t ) q;
187 }
188 return rc;
189 }
/* CG_LINE_START
 *  opens a "do { ... } while(false)" scope that has to be closed with
 *  CG_LINE_END, declares "buf", "res" and "len" inside it and reads the
 *  next line of "file" into "buf"/"len" using CGLoaderFile_Readline.
 *
 *  a variable named "rc" must be visible where the macro is used: it
 *  receives the Readline result, and a non-zero result leaves the scope
 *  through "break".
 *
 *  "res" is set one byte before "buf" so that the "++res" executed by the
 *  first CG_LINE_NEXT_FIELD / CG_LINE_LAST_FIELD lands on the first byte
 *  of the line.
 *
 *  NOTE(review): Readline is documented to return a NULL buffer on EOF;
 *  presumably callers test for EOF before using this macro - confirm.
 */
#define CG_LINE_START(file, buf, len, res) \
    do { \
        const char* buf, *res; \
        size_t len; \
        if( (rc = CGLoaderFile_Readline(file, (const void**)&buf, &len)) != 0 ) { \
            break; \
        } \
        res = buf - 1;
198
/* CG_LINE_NEXT_FIELD
 *  advances to the next tab-terminated field of the current line.
 *
 *  leaves the enclosing scope through "break" when "rc" is already
 *  non-zero. otherwise steps "res" past the previous separator, reduces
 *  "len" by the bytes consumed, makes "buf" point at the field start and
 *  sets "res" to the next '\t'. on return the field occupies [buf, res).
 *
 *  when no '\t' is left in the remaining bytes, "rc" is set to an
 *  rcData/rcCorrupt rc and the scope is left through "break".
 *
 *  "rc" is taken from the enclosing scope; "break" targets the innermost
 *  loop or switch, so the macro must not be nested inside another one
 *  between CG_LINE_START and CG_LINE_END.
 */
#define CG_LINE_NEXT_FIELD(buf, len, res) \
    if( rc != 0 ) { \
        break; \
    } else { \
        len -= ++res - buf; \
        buf = res; \
        if( (res = str_chr(buf, len, '\t')) == NULL ) { \
            rc = RC(rcRuntime, rcFile, rcReading, rcData, rcCorrupt); \
            break; \
        } \
    }
210
/* CG_LINE_LAST_FIELD
 *  advances to the final field of the current line, which runs to the
 *  end of the line instead of being terminated by a tab.
 *
 *  leaves the enclosing scope through "break" when "rc" is already
 *  non-zero. otherwise steps "res" past the previous separator, reduces
 *  "len" by the bytes consumed, makes "buf" point at the field start and
 *  sets "res" to the end of the line. on return the field occupies
 *  [buf, res).
 *
 *  when a '\t' is still present in the remaining bytes, i.e. the line
 *  has more fields than expected, "rc" is set to an rcData/rcCorrupt rc
 *  and the scope is left through "break".
 *
 *  "rc" is taken from the enclosing scope.
 */
#define CG_LINE_LAST_FIELD(buf, len, res) \
    if( rc != 0 ) { \
        break; \
    } else { \
        len -= ++res - buf; \
        buf = res; \
        res = buf + len; \
        if( str_chr(buf, len, '\t') != NULL ) { \
            rc = RC(rcRuntime, rcFile, rcReading, rcData, rcCorrupt); \
            break; \
        } \
    }
223
/* CG_LINE_END
 *  closes the "do { ... } while(false)" scope opened by CG_LINE_START;
 *  the variables declared by CG_LINE_START go out of scope here.
 */
#define CG_LINE_END() \
    } while(false)
226
/* CGFILETYPE_IMPL
 *  type name used for the object pointer in every CGFileType_vt entry.
 *  defaults to CGFileType unless it was defined before this point.
 */
#ifndef CGFILETYPE_IMPL
#define CGFILETYPE_IMPL CGFileType
#endif

/* CGFileType
 *  forward declaration; the struct itself is defined further down
 */
typedef struct CGFileType CGFileType;
232
/* CGFileType_vt
 *  table of function pointers attached to a CGFileType through its "vt"
 *  member. every entry takes the object as a const CGFILETYPE_IMPL
 *  pointer and, except for "destroy", returns an rc_t.
 *
 *  NOTE(review): the grouping remarks below are derived from member
 *  names and signatures only - confirm against the implementations.
 */
typedef struct CGFileType_vt_struct {
    /* receives a buffer and its length */
    rc_t ( CC *header ) (const CGFILETYPE_IMPL* self, const char* buf, const size_t len);

    /* entries filling a caller-supplied data structure or row id */
    rc_t ( CC *reads ) (const CGFILETYPE_IMPL* cself, TReadsData* data);
    rc_t ( CC *get_start_row ) (const CGFILETYPE_IMPL* cself, int64_t* rowid);
    rc_t ( CC *mappings ) (const CGFILETYPE_IMPL* cself, TMappingsData* data);
    rc_t ( CC *evidence_intervals )(const CGFILETYPE_IMPL* cself, TEvidenceIntervalsData* data);
    rc_t ( CC *evidence_dnbs )(const CGFILETYPE_IMPL* cself, const char* interval_id, TEvidenceDnbsData* data);
    rc_t ( CC *tag_lfr )(const CGFILETYPE_IMPL* cself, TReadsData* data);

    /* entries handing back a pointer to a const field value */
    rc_t ( CC *assembly_id) (const CGFILETYPE_IMPL* self, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id);
    rc_t ( CC *slide) (const CGFILETYPE_IMPL* self, const CGFIELD_SLIDE_TYPE** slide);
    rc_t ( CC *lane) (const CGFILETYPE_IMPL* self, const CGFIELD_LANE_TYPE** lane);
    rc_t ( CC *batch_file_number) (const CGFILETYPE_IMPL* self, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number);
    rc_t ( CC *sample) (const CGFILETYPE_IMPL* self, const CGFIELD_SAMPLE_TYPE** sample);
    rc_t ( CC *chromosome) (const CGFILETYPE_IMPL* self, const CGFIELD_CHROMOSOME_TYPE** chromosome);

    /* the only entry without a return code; "records" is an output
       NOTE(review): presumably a record count - confirm */
    void ( CC *destroy ) (const CGFILETYPE_IMPL* self, uint64_t* records);
} CGFileType_vt;
252
/* CGFileType
 *  common part of a file type object: a format version, the file and
 *  library type enums and the function table used to operate on it.
 */
struct CGFileType {
    uint32_t format_version;
    CG_EFileType type;
    CG_ELibraryType libraryType;
    /* function table for this file type */
    const CGFileType_vt* vt;
};
259
/* CGLoaderFile
 *  pairs an underlying KLoaderFile with the CGFileType object attached
 *  to it.
 */
typedef struct CGLoaderFile
{
    /* same name as the flag passed to CGLoaderFile_Make
       NOTE(review): presumably stores that flag - confirm */
    bool read_ahead;
    const struct KLoaderFile *file;
    const CGFileType* cg_file;
} CGLoaderFile;
266
/* CGFileTypeFactory
 *  one factory entry: a name, the file type enum it belongs to and the
 *  function that creates the matching CGFileType for a loader file.
 *  an array of these is passed to CGLoaderFileMakeCGFileType.
 */
typedef struct CGFileTypeFactory {
    const char* name;
    CG_EFileType type;
    /* returns the new object through "self" */
    rc_t ( CC *make ) (const CGFileType** self, const CGLoaderFile* file);
} CGFileTypeFactory;
272
/* Make
 *  creates a loader file object and returns it through "cself"
 *  NOTE(review): presumably opens "filename" relative to "dir" and
 *  verifies it against "md5_digest" - confirm against the implementation
 */
rc_t CGLoaderFile_Make(const CGLoaderFile **cself, const KDirectory* dir, const char* filename,
                       const uint8_t* md5_digest, bool read_ahead);

/* Release
 *  NOTE(review): the meaning of "ignored" is not visible here - confirm
 */
rc_t CGLoaderFile_Release(const CGLoaderFile* cself, bool ignored);

/* returns true if eof is reached and buffer is empty */
rc_t CGLoaderFile_IsEof(const CGLoaderFile* cself, bool* eof);

/* closes the underlying file */
rc_t CGLoaderFile_Close(const CGLoaderFile* cself);

/* returns current 1-based line number in file */
rc_t CGLoaderFile_Line(const CGLoaderFile* cself, uint64_t* line);

/* returns the file name through "name" */
rc_t CGLoaderFile_Filename(const CGLoaderFile *cself, const char** name);

/* logging helper taking a log level, an rc, a message and a printf-like
   format with variable arguments */
rc_t CGLoaderFile_LOG(const CGLoaderFile* cself, KLogLevel lvl, rc_t rc, const char *msg, const char *fmt, ...);

/* returns the file type enum through "type" */
rc_t CGLoaderFile_GetType(const CGLoaderFile* cself, CG_EFileType* type);

/* Get*
 *  data accessors; the names and parameters mirror the entries of
 *  CGFileType_vt
 *  NOTE(review): presumably each one forwards to the matching vt entry
 *  of the attached CGFileType - confirm
 */
rc_t CGLoaderFile_GetRead(const CGLoaderFile* cself, TReadsData* data);
rc_t CGLoaderFile_GetStartRow(const CGLoaderFile* cself, int64_t* rowid);

rc_t CGLoaderFile_GetTagLfr(const CGLoaderFile* cself, TReadsData* data);

rc_t CGLoaderFile_GetMapping(const CGLoaderFile* cself, TMappingsData* data);

rc_t CGLoaderFile_GetEvidenceIntervals(const CGLoaderFile* cself, TEvidenceIntervalsData* data);

rc_t CGLoaderFile_GetEvidenceDnbs(const CGLoaderFile* cself, const char* interval_id, TEvidenceDnbsData* data);

/* field accessors; each returns a pointer to a const field value */
rc_t CGLoaderFile_GetAssemblyId(const CGLoaderFile* cself, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id);
rc_t CGLoaderFile_GetSlide(const CGLoaderFile* cself, const CGFIELD_SLIDE_TYPE** slide);
rc_t CGLoaderFile_GetLane(const CGLoaderFile* cself, const CGFIELD_LANE_TYPE** lane);
rc_t CGLoaderFile_GetBatchFileNumber(const CGLoaderFile* cself, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number);
rc_t CGLoaderFile_GetSample(const CGLoaderFile* cself, const CGFIELD_SAMPLE_TYPE** sample);
rc_t CGLoaderFile_GetChromosome(const CGLoaderFile* cself, const CGFIELD_CHROMOSOME_TYPE** chromosome);
310
311
/* Readline
 *  makes the next line of a file available in a buffer.
 *  eligible EOL symbols are: \n (unix), \r (older mac), \r\n (win)
 *  EOL symbol(s) are never included in the buffer length.
 *  the line is \0 terminated.
 *  a missing EOL at EOF is not an error.
 *  fails if the internal buffer is insufficient.
 *  buffer is NULL on EOF
 *  rc state of (rcString rcTooLong) means the line was too long;
 *  you may copy the line and call readline again for the tail of the line
 *
 *  "buffer" [ OUT ] and "length" [ OUT ] - returned line and its length
 */
rc_t CGLoaderFile_Readline(const CGLoaderFile* cself, const void** buffer, size_t* length);
326
/* CreateCGFile
 *  takes a non-const loader file plus a format version and a type name
 *  NOTE(review): presumably creates the CGFileType stored in
 *  "self->cg_file" - confirm against the implementation
 */
rc_t CGLoaderFile_CreateCGFile(CGLoaderFile* self,
                               uint32_t FORMAT_VERSION, const char* TYPE);

/* MakeCGFileType
 *  "factory" [ IN ] and "factories" [ IN ] - array of factory entries and
 *  the number of entries in it
 *
 *  "ftype" [ OUT ] - receives the created file type object
 *
 *  NOTE(review): presumably picks the entry whose name matches "type"
 *  and calls its "make" function - confirm against the implementation
 */
rc_t CGLoaderFileMakeCGFileType(const CGLoaderFile* self, const char* type,
                                const CGFileTypeFactory* factory, size_t factories,
                                const CGFileType** ftype);
333
334 #endif /* _tools_cg_load_file_h_ */
335