1 /*==============================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25 
26 #include "debug.h"
27 #include "factory-cmn.h"
28 #include "factory-reads.h"
29 
30 typedef struct CGReads15 CGReads15;
31 #define CGFILETYPE_IMPL CGReads15
32 #include "file.h"
33 
34 #include <klib/printf.h>
35 #include <klib/rc.h>
36 
37 #include <os-native.h>
38 #include <sysalloc.h>
39 
40 #include <stdlib.h>
41 #include <string.h>
42 
43 struct CGReads15 {
44     CGFileType dad;
45     const CGLoaderFile* file;
46     int64_t start_rowid;
47     char spot_group[512];
48     uint64_t records;
49     /* headers */
50     CGFIELD15_ASSEMBLY_ID assembly_id;
51     CGFIELD15_BATCH_FILE_NUMBER batch_file_number;
52     CGFIELD15_BATCH_OFFSET batch_offset;
53     CGFIELD15_FIELD_SIZE field_size;
54     CGFIELD15_GENERATED_AT generated_at;
55     CGFIELD15_GENERATED_BY generated_by;
56     CGFIELD15_LANE lane;
57     CGFIELD15_LIBRARY library;
58     CGFIELD15_SAMPLE sample;
59     CGFIELD15_SLIDE slide;
60     CGFIELD15_SOFTWARE_VERSION software_version;
61 };
62 
63 static
CGReads15_Header(const CGReads15 * cself,const char * buf,const size_t len)64 rc_t CC CGReads15_Header(const CGReads15* cself, const char* buf, const size_t len)
65 {
66     rc_t rc = 0;
67     size_t slen;
68     CGReads15* self = (CGReads15*)cself;
69 
70     if( strncmp("ASSEMBLY_ID\t", buf, slen = 12) == 0 ) {
71         rc = str2buf(&buf[slen], len - slen, self->assembly_id, sizeof(self->assembly_id));
72     } else if( strncmp("BATCH_FILE_NUMBER\t", buf, slen = 18) == 0 ) {
73         rc = str2u32(&buf[slen], len - slen, &self->batch_file_number);
74         if( self->batch_file_number < 1 ) {
75             rc = RC(rcRuntime, rcFile, rcConstructing, rcItem, rcOutofrange);
76         }
77     } else if( strncmp("BATCH_OFFSET\t", buf, slen = 13) == 0 ) {
78         rc = str2u64(&buf[slen], len - slen, &self->batch_offset);
79     } else if( strncmp("FIELD_SIZE\t", buf, slen = 11) == 0 ) {
80         rc = str2u32(&buf[slen], len - slen, &self->field_size);
81     } else if( strncmp("GENERATED_AT\t", buf, slen = 13) == 0 ) {
82         rc = str2buf(&buf[slen], len - slen, self->generated_at, sizeof(self->generated_at));
83     } else if( strncmp("GENERATED_BY\t", buf, slen = 13) == 0 ) {
84         rc = str2buf(&buf[slen], len - slen, self->generated_by, sizeof(self->generated_by));
85     } else if( strncmp("LANE\t", buf, slen = 5) == 0 ) {
86         rc = str2buf(&buf[slen], len - slen, self->lane, sizeof(self->lane));
87     } else if( strncmp("LIBRARY\t", buf, slen = 8) == 0 ) {
88         rc = str2buf(&buf[slen], len - slen, self->library, sizeof(self->library));
89     } else if( strncmp("SAMPLE\t", buf, slen = 7) == 0 ) {
90         rc = str2buf(&buf[slen], len - slen, self->sample, sizeof(self->sample));
91     } else if( strncmp("SLIDE\t", buf, slen = 6) == 0 ) {
92         rc = str2buf(&buf[slen], len - slen, self->slide, sizeof(self->slide));
93     } else if( strncmp("SOFTWARE_VERSION\t", buf, slen = 17) == 0 ) {
94         rc = str2buf(&buf[slen], len - slen, self->software_version, sizeof(self->software_version));
95     } else {
96         rc = RC(rcRuntime, rcFile, rcConstructing, rcName, rcUnrecognized);
97     }
98     return rc;
99 }
100 
CGReads25_Header(const CGReads15 * cself,const char * buf,const size_t len)101 static rc_t CC CGReads25_Header(const CGReads15* cself,
102     const char* buf, const size_t len)
103 {
104     rc_t rc = 0;
105     size_t slen = 0;
106     CGReads15* self = (CGReads15*)cself;
107 
108     /* from SRA-2617 files */
109     if      (strncmp("APPROVAL\t", buf, slen = 9) == 0) {
110     }
111     else if (strncmp("TITLE\t", buf, slen = 6) == 0) {
112     }
113     else if (strncmp("ADDRESS\t", buf, slen = 8) == 0) {
114     }
115 
116     /* From Table 1: Header Metadata Present in all Data Files */
117     else if (strncmp("CUSTOMER_SAMPLE_ID\t", buf, slen = 19) == 0) {
118     }
119     else if (strncmp("SAMPLE_SOURCE\t", buf, slen = 14) == 0) {
120     }
121     else if (strncmp("REPORTED_GENDER\t", buf, slen = 16) == 0) {
122     }
123     else if (strncmp("CALLED_GENDER\t", buf, slen = 14) == 0) {
124     }
125     else if (strncmp("TUMOR_STATUS\t", buf, slen = 13) == 0) {
126     }
127     else if (strncmp("LIBRARY_TYPE\t", buf, slen = 13) == 0) {
128     }
129     else if (strncmp("LIBRARY_SOURCE\t", buf, slen = 13) == 0) {
130     }
131 
132     else if (strncmp("ASSEMBLY_ID\t", buf, slen = 12) == 0) {
133         rc = str2buf(&buf[slen], len - slen,
134             self->assembly_id, sizeof(self->assembly_id));
135     }
136     else if (strncmp("BATCH_FILE_NUMBER\t", buf, slen = 18) == 0) {
137         rc = str2u32(&buf[slen], len - slen, &self->batch_file_number);
138         if (self->batch_file_number < 1) {
139             rc = RC(rcRuntime, rcFile, rcConstructing, rcItem, rcOutofrange);
140         }
141     }
142     else if (strncmp("BATCH_OFFSET\t", buf, slen = 13) == 0) {
143         rc = str2u64(&buf[slen], len - slen, &self->batch_offset);
144     }
145     else if (strncmp("FIELD_SIZE\t", buf, slen = 11) == 0) {
146         rc = str2u32(&buf[slen], len - slen, &self->field_size);
147     }
148     else if (strncmp("GENERATED_AT\t", buf, slen = 13) == 0) {
149         rc = str2buf(&buf[slen], len - slen,
150             self->generated_at, sizeof(self->generated_at));
151     }
152     else if (strncmp("GENERATED_BY\t", buf, slen = 13) == 0) {
153         rc = str2buf(&buf[slen], len - slen,
154             self->generated_by, sizeof(self->generated_by));
155     }
156     else if (strncmp("LANE\t", buf, slen = 5) == 0) {
157         rc = str2buf(&buf[slen], len - slen,
158             self->lane, sizeof(self->lane));
159     }
160     else if (strncmp("LIBRARY\t", buf, slen = 8) == 0) {
161         rc = str2buf(&buf[slen], len - slen,
162             self->library, sizeof(self->library));
163     }
164     else if (strncmp("SAMPLE\t", buf, slen = 7) == 0) {
165         rc = str2buf(&buf[slen], len - slen,
166             self->sample, sizeof(self->sample));
167     }
168     else if (strncmp("SLIDE\t", buf, slen = 6) == 0) {
169         rc = str2buf(&buf[slen], len - slen, self->slide, sizeof(self->slide));
170     }
171     else if (strncmp("SOFTWARE_VERSION\t", buf, slen = 17) == 0) {
172         rc = str2buf(&buf[slen], len - slen,
173             self->software_version, sizeof(self->software_version));
174     }
175     else {
176         rc = RC(rcRuntime, rcFile, rcConstructing, rcName, rcUnrecognized);
177     }
178 
179     return rc;
180 }
181 
182 static
CGReads15_GetAssemblyId(const CGReads15 * cself,const CGFIELD_ASSEMBLY_ID_TYPE ** assembly_id)183 rc_t CGReads15_GetAssemblyId(const CGReads15* cself, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id)
184 {
185     if( cself->assembly_id[0] == '\0' ) {
186         return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
187     }
188     *assembly_id = cself->assembly_id;
189     return 0;
190 }
191 
192 static
CGReads15_GetSlide(const CGReads15 * cself,const CGFIELD_SLIDE_TYPE ** slide)193 rc_t CGReads15_GetSlide(const CGReads15* cself, const CGFIELD_SLIDE_TYPE** slide)
194 {
195     if( cself->slide[0] == '\0' ) {
196         return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
197     }
198     *slide = cself->slide;
199     return 0;
200 }
201 
202 static
CGReads15_GetLane(const CGReads15 * cself,const CGFIELD_LANE_TYPE ** lane)203 rc_t CGReads15_GetLane(const CGReads15* cself, const CGFIELD_LANE_TYPE** lane)
204 {
205     if( cself->lane[0] == '\0' ) {
206         return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
207     }
208     *lane = cself->lane;
209     return 0;
210 }
211 
212 static
CGReads15_GetBatchFileNumber(const CGReads15 * cself,const CGFIELD_BATCH_FILE_NUMBER_TYPE ** batch_file_number)213 rc_t CGReads15_GetBatchFileNumber(const CGReads15* cself, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number)
214 {
215     *batch_file_number = &cself->batch_file_number;
216     return 0;
217 }
218 
219 
CGReads15_Read(const CGReads15 * cself,TReadsData * data)220 static rc_t CC CGReads15_Read(const CGReads15* cself, TReadsData* data) {
221     rc_t rc = 0;
222 
223     if (cself->start_rowid == 0) {
224         ((CGReads15*)cself)->start_rowid = data->rowid;
225     }
226     CG_LINE_START(cself->file, b, len, p);
227     if (b == NULL || len == 0) {
228         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcDone);
229         break;
230     }
231     /*DEBUG_MSG(10, ("reads: '%.*s'\n", len, b));*/
232     CG_LINE_NEXT_FIELD(b, len, p);
233     if ((rc = str2u16(b, p - b, &data->flags)) != 0) {
234     }
235     else if (data->flags > 10) {
236         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcOutofrange);
237     }
238     else if ((data->flags & 0x03) == 3 || (data->flags & 0x07) == 7) {
239         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
240     }
241     CG_LINE_NEXT_FIELD(b, len, p);
242     data->seq.sequence.elements = p - b;
243     if (data->seq.sequence.elements != CG_READS15_SPOT_LEN) {
244         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
245     }
246     else {
247         rc = str2buf(b, data->seq.sequence.elements,
248             data->read, sizeof(data->read));
249         /* clear cache, set in algnment writer */
250         data->reverse[0] = '\0';
251         data->reverse[CG_READS15_SPOT_LEN / 2] = '\0';
252     }
253     CG_LINE_LAST_FIELD(b, len, p);
254     data->seq.quality.elements = p - b;
255     if (data->seq.quality.elements != CG_READS15_SPOT_LEN) {
256         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
257     }
258     else {
259         rc = str2buf(b, data->seq.quality.elements,
260             data->qual, sizeof(data->qual));
261     }
262     data->seq.spot_len = CG_READS15_SPOT_LEN;
263 
264     data->reads_format = 0x01050000;
265 
266     if (cself->records == 0) {
267         size_t w;
268 
269 #if 0
270     rc = string_printf(((CGReads15*)cself)->spot_group,
271         sizeof(cself->spot_group), &w, "%s:%s:%s:%04u", cself->assembly_id,
272         cself->slide, cself->lane, cself->batch_file_number);
273 #else
274         rc = string_printf(((CGReads15*)cself)->spot_group,
275             sizeof(cself->spot_group), &w, "%s-%s", cself->slide, cself->lane);
276 #endif
277         data->seq.spot_group.buffer = cself->spot_group;
278         data->seq.spot_group.elements = w;
279     }
280     ((CGReads15*)cself)->records++;
281     DEBUG_MSG(10,
282         ("reads:  %u\t'%s'\t'%s'\n", data->flags, data->read, data->qual));
283     CG_LINE_END();
284 
285     return rc;
286 }
287 
CGReads25_Read(const CGReads15 * cself,TReadsData * data)288 static rc_t CC CGReads25_Read(const CGReads15* cself, TReadsData* data) {
289     rc_t rc = 0;
290 
291     if (cself->start_rowid == 0) {
292         ((CGReads15*)cself)->start_rowid = data->rowid;
293     }
294     CG_LINE_START(cself->file, b, len, p);
295     if (b == NULL || len == 0) {
296         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcDone);
297         break;
298     }
299     /*DEBUG_MSG(10, ("reads: '%.*s'\n", len, b));*/
300     CG_LINE_NEXT_FIELD(b, len, p);
301     if ((rc = str2u16(b, p - b, &data->flags)) != 0) {
302     }
303     else if (data->flags > 10) {
304         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcOutofrange);
305     }
306     else if ((data->flags & 0x03) == 3 || (data->flags & 0x07) == 7) {
307         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
308     }
309     CG_LINE_NEXT_FIELD(b, len, p);
310     data->seq.sequence.elements = p - b;
311     if (data->seq.sequence.elements != CG_READS25_SPOT_LEN) {
312         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
313     }
314     else {
315         rc = str2buf(b, data->seq.sequence.elements,
316             data->read, sizeof(data->read));
317         /* clear cache, set in algnment writer */
318         data->reverse[0] = '\0';
319         data->reverse[CG_READS25_SPOT_LEN / 2] = '\0';
320     }
321     CG_LINE_LAST_FIELD(b, len, p);
322     data->seq.quality.elements = p - b;
323     if (data->seq.quality.elements != CG_READS25_SPOT_LEN) {
324         rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
325     }
326     else {
327         rc = str2buf(b, data->seq.quality.elements,
328             data->qual, sizeof(data->qual));
329     }
330     data->seq.spot_len = CG_READS25_SPOT_LEN;
331 
332     data->reads_format = 0x02050000;
333 
334     if (cself->records == 0) {
335         size_t w;
336 
337         rc = string_printf(((CGReads15*)cself)->spot_group,
338             sizeof(cself->spot_group), &w, "%s-%s", cself->slide, cself->lane);
339         data->seq.spot_group.buffer = cself->spot_group;
340         data->seq.spot_group.elements = w;
341     }
342     ((CGReads15*)cself)->records++;
343     DEBUG_MSG(10,
344         ("reads:  %u\t'%s'\t'%s'\n", data->flags, data->read, data->qual));
345     CG_LINE_END();
346 
347     return rc;
348 }
349 
350 
351 static
CGReads15_GetStartRow(const CGReads15 * cself,int64_t * rowid)352 rc_t CGReads15_GetStartRow(const CGReads15* cself, int64_t* rowid)
353 {
354     *rowid = cself->start_rowid;
355     return 0;
356 }
357 
358 static
CGReads15_Release(const CGReads15 * cself,uint64_t * records)359 void CC CGReads15_Release(const CGReads15* cself, uint64_t* records)
360 {
361     if( cself != NULL ) {
362         CGReads15* self = (CGReads15*)cself;
363         if( records != NULL ) {
364             *records = cself->records;
365         }
366         free(self);
367     }
368 }
369 
370 static const CGFileType_vt CGReads15_vt =
371 {
372     CGReads15_Header,
373     CGReads15_Read,
374     CGReads15_GetStartRow,
375     NULL,
376     NULL,
377     NULL,
378     NULL, /* tag_lfr */
379     CGReads15_GetAssemblyId,
380     CGReads15_GetSlide,
381     CGReads15_GetLane,
382     CGReads15_GetBatchFileNumber,
383     NULL,
384     NULL,
385     CGReads15_Release
386 };
387 
388 static const CGFileType_vt CGReads25_vt = {
389     CGReads25_Header,
390     CGReads25_Read,
391     CGReads15_GetStartRow,
392     NULL,
393     NULL,
394     NULL,
395     NULL, /* tag_lfr */
396     CGReads15_GetAssemblyId,
397     CGReads15_GetSlide,
398     CGReads15_GetLane,
399     CGReads15_GetBatchFileNumber,
400     NULL,
401     NULL,
402     CGReads15_Release
403 };
404 
CGReads_Make(const CGFileType ** cself,const CGLoaderFile * file,const CGFileType_vt * vt)405 static rc_t CC CGReads_Make(const CGFileType** cself,
406     const CGLoaderFile* file, const CGFileType_vt *vt)
407 {
408     rc_t rc = 0;
409     CGReads15* obj = NULL;
410 
411     if( cself == NULL || file == NULL ) {
412         rc = RC(rcRuntime, rcFile, rcConstructing, rcParam, rcNull);
413     }
414     if( rc == 0 ) {
415         *cself = NULL;
416         if( (obj = calloc(1, sizeof(*obj))) == NULL ) {
417             rc = RC(rcRuntime, rcFile, rcConstructing, rcMemory, rcExhausted);
418         } else {
419             obj->file = file;
420             obj->dad.type = cg_eFileType_READS;
421             obj->dad.vt = vt;
422         }
423     }
424     if( rc == 0 ) {
425         *cself = &obj->dad;
426     } else {
427         CGReads15_Release(obj, NULL);
428     }
429     return rc;
430 }
431 
CGReads15_Make(const CGFileType ** cself,const CGLoaderFile * file)432 rc_t CC CGReads15_Make(const CGFileType** cself, const CGLoaderFile* file)
433 {
434     return CGReads_Make(cself, file, &CGReads15_vt);
435 }
436 
CGReads13_Make(const CGFileType ** self,const CGLoaderFile * file)437 rc_t CC CGReads13_Make(const CGFileType** self, const CGLoaderFile* file)
438 {   return CGReads15_Make(self, file); }
439 
CGReads20_Make(const CGFileType ** self,const CGLoaderFile * file)440 rc_t CC CGReads20_Make(const CGFileType** self, const CGLoaderFile* file)
441 {   return CGReads15_Make(self, file); }
442 
CGReads22_Make(const CGFileType ** self,const CGLoaderFile * file)443 rc_t CC CGReads22_Make(const CGFileType** self, const CGLoaderFile* file)
444 {   return CGReads15_Make(self, file); }
445 
CGReads25_Make(const CGFileType ** self,const CGLoaderFile * file)446 rc_t CC CGReads25_Make(const CGFileType **self, const CGLoaderFile *file) {
447     return CGReads_Make(self, file, &CGReads25_vt);
448 }
449