1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #ifndef _tools_cg_load_file_h_
27 #define _tools_cg_load_file_h_
28 
#include <kfs/file.h>
#include <kfs/directory.h>

#include <klib/container.h>
#include <klib/log.h>
#include <klib/rc.h>

#include "defs.h"
#include "writer-seq.h"
#include "writer-algn.h"
#include "writer-evidence-intervals.h"
#include "writer-evidence-dnbs.h"

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strtol.h>
47 
48 
/* some useful utilities */
/* strchr, but for a fixed-size buffer (not NUL-terminated!) */
/* str_chr
 *  strchr() counterpart for a length-delimited buffer; the buffer does not
 *  have to be NUL-terminated and is never read past "len" bytes.
 *
 *  "str" [ IN ] and "len" [ IN ] - buffer to scan and its size in bytes
 *
 *  "sep" [ IN ] - character to look for
 *
 *  returns a pointer to the first occurrence of "sep" within the buffer,
 *  or NULL when the buffer does not contain it (including "len" == 0)
 */
static __inline__ const char* str_chr(const char* str, const size_t len, char sep)
{
    size_t i;

    for( i = 0; i < len; ++i ) {
        if( str[i] == sep ) {
            return &str[i];
        }
    }
    return NULL;
}
62 
63 static __inline__
str2buf(const char * str,const size_t len,char * buf,const size_t buf_sz)64 rc_t str2buf(const char* str, const size_t len, char* buf, const size_t buf_sz)
65 {
66     if( buf_sz <= len ) {
67         rc_t rc = RC(rcRuntime, rcString, rcCopying, rcBuffer, rcInsufficient);
68         if (rc != 0) {
69             PLOGERR(klogErr, (klogErr, rc,  "'$(str)': $(sz) <= $(len)",
70                 "str=%.*s,sz=%lu,len=%lu", len, str, buf_sz, len));
71         }
72         return rc;
73     }
74     memmove(buf, str, len);
75     buf[len] = '\0';
76     return 0;
77 }
78 
79 static __inline__
str2unsigned(const char * str,const size_t len,uint64_t max,uint64_t * value)80 rc_t str2unsigned(const char* str, const size_t len, uint64_t max, uint64_t* value)
81 {
82     char* end;
83     int64_t q;
84 
85     if( len == 0 ) {
86         return RC(rcRuntime, rcString, rcConverting, rcData, rcTooShort);
87     }
88     q = strtou64(str, &end, 10);
89     if( end - str != len ) {
90         return RC(rcRuntime, rcString, rcConverting, rcData, rcInvalid);
91     }
92     if( q < 0 || ( uint64_t ) q > max ) {
93         return RC(rcRuntime, rcString, rcConverting, rcData, rcOutofrange);
94     }
95     *value = q;
96     return 0;
97 }
98 static __inline__
str2u64(const char * str,const size_t len,uint64_t * value)99 rc_t str2u64(const char* str, const size_t len, uint64_t* value)
100 {
101     rc_t rc;
102     uint64_t q;
103 
104     if( (rc = str2unsigned(str, len, -1, &q)) == 0 ) {
105         *value = q;
106     }
107     return rc;
108 }
109 
110 static __inline__
str2signed(const char * str,const size_t len,int64_t min,int64_t max,int64_t * value)111 rc_t str2signed(const char* str, const size_t len, int64_t min, int64_t max, int64_t* value)
112 {
113     char* end;
114     int64_t q;
115 
116     if( len == 0 ) {
117         return RC(rcRuntime, rcString, rcConverting, rcData, rcTooShort);
118     }
119     q = strtoi64(str, &end, 10);
120     if( end - str != len ) {
121         return RC(rcRuntime, rcString, rcConverting, rcData, rcInvalid);
122     }
123     if( q < min || q > max ) {
124         return RC(rcRuntime, rcString, rcConverting, rcData, rcOutofrange);
125     }
126     *value = q;
127     return 0;
128 }
129 static __inline__
str2i64(const char * str,const size_t len,int64_t * value)130 rc_t str2i64(const char* str, const size_t len, int64_t* value)
131 {
132     rc_t rc;
133     int64_t q;
134 
135 #if _ARCH_BITS == 32
136     if( (rc = str2signed(str, len, -0x7FFFFFFFFFFFFFFFLL, 0x7FFFFFFFFFFFFFFFLL, &q)) == 0 ) {
137         *value = q;
138     }
139 #else
140     if( (rc = str2signed(str, len, -0x7FFFFFFFFFFFFFFFL, 0x7FFFFFFFFFFFFFFFL, &q)) == 0 ) {
141         *value = q;
142     }
143 #endif
144     return rc;
145 }
146 static __inline__
str2i32(const char * str,const size_t len,int32_t * value)147 rc_t str2i32(const char* str, const size_t len, int32_t* value)
148 {
149     rc_t rc;
150     int64_t q;
151 
152     if( (rc = str2signed(str, len, -0x7FFFFFFF - 1, 0x7FFFFFFF, &q)) == 0 ) {
153         *value = ( int32_t ) q;
154     }
155     return rc;
156 }
157 static __inline__
str2u32(const char * str,const size_t len,uint32_t * value)158 rc_t str2u32(const char* str, const size_t len, uint32_t* value)
159 {
160     rc_t rc;
161     uint64_t q;
162 
163     if( (rc = str2unsigned(str, len, 0xFFFFFFFF, &q)) == 0 ) {
164         *value = ( uint32_t ) q;
165     }
166     return rc;
167 }
168 static __inline__
str2i16(const char * str,const size_t len,int16_t * value)169 rc_t str2i16(const char* str, const size_t len, int16_t* value)
170 {
171     rc_t rc;
172     int64_t q;
173 
174     if( (rc = str2signed(str, len, -0x7FFF - 1, 0x7FFF, &q)) == 0 ) {
175         *value = ( int16_t ) q;
176     }
177     return rc;
178 }
179 static __inline__
str2u16(const char * str,const size_t len,uint16_t * value)180 rc_t str2u16(const char* str, const size_t len, uint16_t* value)
181 {
182     rc_t rc;
183     uint64_t q;
184 
185     if( (rc = str2unsigned(str, len, 0xFFFF, &q)) == 0 ) {
186         *value = ( uint16_t ) q;
187     }
188     return rc;
189 }
/* CG_LINE_START
 *  opens a "do {" block that has to be closed with CG_LINE_END() and
 *  reads one line from "file" through CGLoaderFile_Readline.
 *
 *  three locals are declared inside the block, named by the macro arguments:
 *   "buf" - receives the line returned by Readline
 *   "len" - receives the length returned by Readline
 *   "res" - field cursor, set to one byte BEFORE "buf" so that the "++res"
 *           performed by the first CG_LINE_NEXT_FIELD / CG_LINE_LAST_FIELD
 *           lands on the first byte of the line
 *
 *  a variable named "rc" must exist in the enclosing scope: it receives the
 *  Readline result, and a non-zero value leaves the block through "break".
 */
#define CG_LINE_START(file, buf, len, res) \
    do { \
        const char* buf, *res; \
        size_t len; \
        if( (rc = CGLoaderFile_Readline(file, (const void**)&buf, &len)) != 0 ) { \
            break; \
        } \
        res = buf - 1;
198 
/* CG_LINE_NEXT_FIELD
 *  advances to the next tab-terminated field; for use between
 *  CG_LINE_START and CG_LINE_END with the same "buf", "len", "res" names.
 *
 *  leaves the block through "break" when "rc" is already non-zero.
 *  otherwise "res" is stepped past its previous position, "len" is reduced
 *  by the bytes skipped, "buf" is moved to the new field start and "res"
 *  is set to the next '\t' found within the remaining "len" bytes.
 *  afterwards the field occupies [ "buf", "res" ).
 *
 *  when no '\t' remains, "rc" is set to rcData / rcCorrupt and the block
 *  is left through "break".
 */
#define CG_LINE_NEXT_FIELD(buf, len, res) \
    if( rc != 0 ) { \
        break; \
    } else { \
        len -= ++res - buf; \
        buf = res; \
        if( (res = str_chr(buf, len, '\t')) == NULL ) { \
            rc = RC(rcRuntime, rcFile, rcReading, rcData, rcCorrupt); \
            break; \
        } \
    }
210 
/* CG_LINE_LAST_FIELD
 *  advances to the final field of the line; for use between
 *  CG_LINE_START and CG_LINE_END with the same "buf", "len", "res" names.
 *
 *  leaves the block through "break" when "rc" is already non-zero.
 *  otherwise "res" is stepped past its previous position, "len" is reduced
 *  by the bytes skipped, "buf" is moved to the new field start and "res"
 *  is set to "buf" + "len", so the field occupies [ "buf", "res" ).
 *
 *  a '\t' anywhere in that remainder means there are more fields than
 *  expected: "rc" is set to rcData / rcCorrupt and the block is left
 *  through "break".
 */
#define CG_LINE_LAST_FIELD(buf, len, res) \
    if( rc != 0 ) { \
        break; \
    } else { \
        len -= ++res - buf; \
        buf = res; \
        res = buf + len; \
        if( str_chr(buf, len, '\t') != NULL ) { \
            rc = RC(rcRuntime, rcFile, rcReading, rcData, rcCorrupt); \
            break; \
        } \
    }
223 
/* CG_LINE_END
 *  closes the "do {" block opened by CG_LINE_START; the loop condition is
 *  false, so the block body runs once.
 */
#define CG_LINE_END() \
    } while(false)
226 
/* CGFILETYPE_IMPL
 *  type of the object pointer passed as first argument to the
 *  CGFileType_vt callbacks. it can be defined before this header is
 *  included; when it is not, the generic CGFileType is used.
 */
#ifndef CGFILETYPE_IMPL
#define CGFILETYPE_IMPL CGFileType
#endif

/* forward declaration, needed by the callback table that precedes the
   structure definition */
typedef struct CGFileType CGFileType;
232 
/* CGFileType_vt
 *  table of callbacks attached to a CGFileType through its "vt" member.
 *  every callback takes the object itself, typed as CGFILETYPE_IMPL, as
 *  its first argument.
 *
 *  NOTE(review): the callbacks are implemented outside this header; the
 *  group comments below are derived from member names and signatures only
 *  and should be confirmed against the implementations.
 */
typedef struct CGFileType_vt_struct {
    /* takes a text buffer and its length */
    rc_t ( CC *header ) (const CGFILETYPE_IMPL* self, const char* buf, const size_t len);

    /* record callbacks - names match the CGLoaderFile_Get* record functions
       declared in this header; each takes a caller-provided data structure */
    rc_t ( CC *reads ) (const CGFILETYPE_IMPL* cself, TReadsData* data);
    rc_t ( CC *get_start_row ) (const CGFILETYPE_IMPL* cself, int64_t* rowid);
    rc_t ( CC *mappings ) (const CGFILETYPE_IMPL* cself, TMappingsData* data);
    rc_t ( CC *evidence_intervals )(const CGFILETYPE_IMPL* cself, TEvidenceIntervalsData* data);
    rc_t ( CC *evidence_dnbs )(const CGFILETYPE_IMPL* cself, const char* interval_id, TEvidenceDnbsData* data);
    rc_t ( CC *tag_lfr )(const CGFILETYPE_IMPL* cself, TReadsData* data);

    /* attribute callbacks - each hands back a pointer to a const value
       through its second argument */
    rc_t ( CC *assembly_id) (const CGFILETYPE_IMPL* self, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id);
    rc_t ( CC *slide) (const CGFILETYPE_IMPL* self, const CGFIELD_SLIDE_TYPE** slide);
    rc_t ( CC *lane) (const CGFILETYPE_IMPL* self, const CGFIELD_LANE_TYPE** lane);
    rc_t ( CC *batch_file_number) (const CGFILETYPE_IMPL* self, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number);
    rc_t ( CC *sample) (const CGFILETYPE_IMPL* self, const CGFIELD_SAMPLE_TYPE** sample);
    rc_t ( CC *chromosome) (const CGFILETYPE_IMPL* self, const CGFIELD_CHROMOSOME_TYPE** chromosome);

    /* the only callback without an rc; takes a pointer to a record counter
       (presumably an output - TODO confirm) */
    void ( CC *destroy ) (const CGFILETYPE_IMPL* self, uint64_t* records);
} CGFileType_vt;
252 
/* CGFileType
 *  common part of a file-type object: format version, file and library
 *  type tags, and the callback table used to operate on the object.
 */
struct CGFileType {
    uint32_t format_version;
    CG_EFileType type;
    CG_ELibraryType libraryType;
    /* callbacks for this object; see CGFileType_vt */
    const CGFileType_vt* vt;
};
259 
/* CGLoaderFile
 *  pairs an underlying KLoaderFile with the CGFileType object attached
 *  to it.
 */
typedef struct CGLoaderFile
{
    /* same name as the "read_ahead" argument of CGLoaderFile_Make
       (presumably a copy of it - TODO confirm) */
    bool read_ahead;
    const struct KLoaderFile *file;
    const CGFileType* cg_file;
} CGLoaderFile;
266 
/* CGFileTypeFactory
 *  describes one constructible file type: a name, the corresponding
 *  CG_EFileType tag and the function that creates the CGFileType object
 *  for a given CGLoaderFile.
 *  CGLoaderFileMakeCGFileType takes a pointer to these together with a
 *  count (presumably an array that is searched by name - TODO confirm).
 */
typedef struct CGFileTypeFactory {
    const char* name;
    CG_EFileType type;
    rc_t ( CC *make ) (const CGFileType** self, const CGLoaderFile* file);
} CGFileTypeFactory;
272 
/* creates a CGLoaderFile for "filename" within "dir" and returns it
 * through "cself"; also takes an MD5 digest and a read-ahead flag */
rc_t CGLoaderFile_Make(const CGLoaderFile **cself, const KDirectory* dir, const char* filename,
                       const uint8_t* md5_digest, bool read_ahead);

/* releases an object obtained from CGLoaderFile_Make
 * NOTE(review): the meaning of "ignored" is not visible here - confirm */
rc_t CGLoaderFile_Release(const CGLoaderFile* cself, bool ignored);

/* returns true if eof is reached and buffer is empty */
rc_t CGLoaderFile_IsEof(const CGLoaderFile* cself, bool* eof);

/* closes the underlying file */
rc_t CGLoaderFile_Close(const CGLoaderFile* cself);

/* returns current 1-based line number in file */
rc_t CGLoaderFile_Line(const CGLoaderFile* cself, uint64_t* line);

/* returns the file name through "name" */
rc_t CGLoaderFile_Filename(const CGLoaderFile *cself, const char** name);

/* logging helper taking a log level, an rc, a message and a
 * printf-style parameter list */
rc_t CGLoaderFile_LOG(const CGLoaderFile* cself, KLogLevel lvl, rc_t rc, const char *msg, const char *fmt, ...);

/* returns the file type tag through "type" */
rc_t CGLoaderFile_GetType(const CGLoaderFile* cself, CG_EFileType* type);
292 
/* record access: each function takes a caller-provided data structure.
 * the names mirror the record callbacks of CGFileType_vt (presumably the
 * calls are forwarded to them - TODO confirm in the implementation) */
rc_t CGLoaderFile_GetRead(const CGLoaderFile* cself, TReadsData* data);
rc_t CGLoaderFile_GetStartRow(const CGLoaderFile* cself, int64_t* rowid);

rc_t CGLoaderFile_GetTagLfr(const CGLoaderFile* cself, TReadsData* data);

rc_t CGLoaderFile_GetMapping(const CGLoaderFile* cself, TMappingsData* data);

rc_t CGLoaderFile_GetEvidenceIntervals(const CGLoaderFile* cself, TEvidenceIntervalsData* data);

rc_t CGLoaderFile_GetEvidenceDnbs(const CGLoaderFile* cself, const char* interval_id, TEvidenceDnbsData* data);
303 
/* attribute access: each function hands back a pointer to a const value
 * through its second argument; the names mirror the attribute callbacks
 * of CGFileType_vt */
rc_t CGLoaderFile_GetAssemblyId(const CGLoaderFile* cself, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id);
rc_t CGLoaderFile_GetSlide(const CGLoaderFile* cself, const CGFIELD_SLIDE_TYPE** slide);
rc_t CGLoaderFile_GetLane(const CGLoaderFile* cself, const CGFIELD_LANE_TYPE** lane);
rc_t CGLoaderFile_GetBatchFileNumber(const CGLoaderFile* cself, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number);
rc_t CGLoaderFile_GetSample(const CGLoaderFile* cself, const CGFIELD_SAMPLE_TYPE** sample);
rc_t CGLoaderFile_GetChromosome(const CGLoaderFile* cself, const CGFIELD_CHROMOSOME_TYPE** chromosome);
310 
311 
/* Readline
 *  makes the next line of a file available in a buffer.
 *  eligible EOL symbols are: \n (unix), \r (older mac), \r\n (win)
 *  EOL symbol(s) are never included in the buffer length.
 *  the line is \0 terminated.
 *  a missing EOL at EOF is not an error.
 *  fails if the internal buffer is insufficient.
 *  buffer is NULL on EOF
 *  rc state of (rcString rcTooLong) means the line was too long;
 *              you may copy the line and call readline again for its tail
 *
 *  "buffer" [ OUT ] and "length" [ OUT ] - returned line and its length
 */
rc_t CGLoaderFile_Readline(const CGLoaderFile* cself, const void** buffer, size_t* length);
326 
/* takes a non-const CGLoaderFile together with a format version and a
 * type name
 * NOTE(review): presumably creates the CGFileType object and stores it
 * in "self" - confirm in the implementation */
rc_t CGLoaderFile_CreateCGFile(CGLoaderFile* self,
    uint32_t FORMAT_VERSION, const char* TYPE);

/* takes a type name and "factories" entries starting at "factory";
 * returns a CGFileType object through "ftype" */
rc_t CGLoaderFileMakeCGFileType(const CGLoaderFile* self, const char* type,
    const CGFileTypeFactory* factory, size_t factories,
    const CGFileType** ftype);
333 
334 #endif /* _tools_cg_load_file_h_ */
335