1 /*==============================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 */
25
26 #include "debug.h"
27 #include "factory-cmn.h"
28 #include "factory-reads.h"
29
30 typedef struct CGReads15 CGReads15;
31 #define CGFILETYPE_IMPL CGReads15
32 #include "file.h"
33
34 #include <klib/printf.h>
35 #include <klib/rc.h>
36
37 #include <os-native.h>
38 #include <sysalloc.h>
39
40 #include <stdlib.h>
41 #include <string.h>
42
43 struct CGReads15 {
44 CGFileType dad;
45 const CGLoaderFile* file;
46 int64_t start_rowid;
47 char spot_group[512];
48 uint64_t records;
49 /* headers */
50 CGFIELD15_ASSEMBLY_ID assembly_id;
51 CGFIELD15_BATCH_FILE_NUMBER batch_file_number;
52 CGFIELD15_BATCH_OFFSET batch_offset;
53 CGFIELD15_FIELD_SIZE field_size;
54 CGFIELD15_GENERATED_AT generated_at;
55 CGFIELD15_GENERATED_BY generated_by;
56 CGFIELD15_LANE lane;
57 CGFIELD15_LIBRARY library;
58 CGFIELD15_SAMPLE sample;
59 CGFIELD15_SLIDE slide;
60 CGFIELD15_SOFTWARE_VERSION software_version;
61 };
62
63 static
CGReads15_Header(const CGReads15 * cself,const char * buf,const size_t len)64 rc_t CC CGReads15_Header(const CGReads15* cself, const char* buf, const size_t len)
65 {
66 rc_t rc = 0;
67 size_t slen;
68 CGReads15* self = (CGReads15*)cself;
69
70 if( strncmp("ASSEMBLY_ID\t", buf, slen = 12) == 0 ) {
71 rc = str2buf(&buf[slen], len - slen, self->assembly_id, sizeof(self->assembly_id));
72 } else if( strncmp("BATCH_FILE_NUMBER\t", buf, slen = 18) == 0 ) {
73 rc = str2u32(&buf[slen], len - slen, &self->batch_file_number);
74 if( self->batch_file_number < 1 ) {
75 rc = RC(rcRuntime, rcFile, rcConstructing, rcItem, rcOutofrange);
76 }
77 } else if( strncmp("BATCH_OFFSET\t", buf, slen = 13) == 0 ) {
78 rc = str2u64(&buf[slen], len - slen, &self->batch_offset);
79 } else if( strncmp("FIELD_SIZE\t", buf, slen = 11) == 0 ) {
80 rc = str2u32(&buf[slen], len - slen, &self->field_size);
81 } else if( strncmp("GENERATED_AT\t", buf, slen = 13) == 0 ) {
82 rc = str2buf(&buf[slen], len - slen, self->generated_at, sizeof(self->generated_at));
83 } else if( strncmp("GENERATED_BY\t", buf, slen = 13) == 0 ) {
84 rc = str2buf(&buf[slen], len - slen, self->generated_by, sizeof(self->generated_by));
85 } else if( strncmp("LANE\t", buf, slen = 5) == 0 ) {
86 rc = str2buf(&buf[slen], len - slen, self->lane, sizeof(self->lane));
87 } else if( strncmp("LIBRARY\t", buf, slen = 8) == 0 ) {
88 rc = str2buf(&buf[slen], len - slen, self->library, sizeof(self->library));
89 } else if( strncmp("SAMPLE\t", buf, slen = 7) == 0 ) {
90 rc = str2buf(&buf[slen], len - slen, self->sample, sizeof(self->sample));
91 } else if( strncmp("SLIDE\t", buf, slen = 6) == 0 ) {
92 rc = str2buf(&buf[slen], len - slen, self->slide, sizeof(self->slide));
93 } else if( strncmp("SOFTWARE_VERSION\t", buf, slen = 17) == 0 ) {
94 rc = str2buf(&buf[slen], len - slen, self->software_version, sizeof(self->software_version));
95 } else {
96 rc = RC(rcRuntime, rcFile, rcConstructing, rcName, rcUnrecognized);
97 }
98 return rc;
99 }
100
CGReads25_Header(const CGReads15 * cself,const char * buf,const size_t len)101 static rc_t CC CGReads25_Header(const CGReads15* cself,
102 const char* buf, const size_t len)
103 {
104 rc_t rc = 0;
105 size_t slen = 0;
106 CGReads15* self = (CGReads15*)cself;
107
108 /* from SRA-2617 files */
109 if (strncmp("APPROVAL\t", buf, slen = 9) == 0) {
110 }
111 else if (strncmp("TITLE\t", buf, slen = 6) == 0) {
112 }
113 else if (strncmp("ADDRESS\t", buf, slen = 8) == 0) {
114 }
115
116 /* From Table 1: Header Metadata Present in all Data Files */
117 else if (strncmp("CUSTOMER_SAMPLE_ID\t", buf, slen = 19) == 0) {
118 }
119 else if (strncmp("SAMPLE_SOURCE\t", buf, slen = 14) == 0) {
120 }
121 else if (strncmp("REPORTED_GENDER\t", buf, slen = 16) == 0) {
122 }
123 else if (strncmp("CALLED_GENDER\t", buf, slen = 14) == 0) {
124 }
125 else if (strncmp("TUMOR_STATUS\t", buf, slen = 13) == 0) {
126 }
127 else if (strncmp("LIBRARY_TYPE\t", buf, slen = 13) == 0) {
128 }
129 else if (strncmp("LIBRARY_SOURCE\t", buf, slen = 13) == 0) {
130 }
131
132 else if (strncmp("ASSEMBLY_ID\t", buf, slen = 12) == 0) {
133 rc = str2buf(&buf[slen], len - slen,
134 self->assembly_id, sizeof(self->assembly_id));
135 }
136 else if (strncmp("BATCH_FILE_NUMBER\t", buf, slen = 18) == 0) {
137 rc = str2u32(&buf[slen], len - slen, &self->batch_file_number);
138 if (self->batch_file_number < 1) {
139 rc = RC(rcRuntime, rcFile, rcConstructing, rcItem, rcOutofrange);
140 }
141 }
142 else if (strncmp("BATCH_OFFSET\t", buf, slen = 13) == 0) {
143 rc = str2u64(&buf[slen], len - slen, &self->batch_offset);
144 }
145 else if (strncmp("FIELD_SIZE\t", buf, slen = 11) == 0) {
146 rc = str2u32(&buf[slen], len - slen, &self->field_size);
147 }
148 else if (strncmp("GENERATED_AT\t", buf, slen = 13) == 0) {
149 rc = str2buf(&buf[slen], len - slen,
150 self->generated_at, sizeof(self->generated_at));
151 }
152 else if (strncmp("GENERATED_BY\t", buf, slen = 13) == 0) {
153 rc = str2buf(&buf[slen], len - slen,
154 self->generated_by, sizeof(self->generated_by));
155 }
156 else if (strncmp("LANE\t", buf, slen = 5) == 0) {
157 rc = str2buf(&buf[slen], len - slen,
158 self->lane, sizeof(self->lane));
159 }
160 else if (strncmp("LIBRARY\t", buf, slen = 8) == 0) {
161 rc = str2buf(&buf[slen], len - slen,
162 self->library, sizeof(self->library));
163 }
164 else if (strncmp("SAMPLE\t", buf, slen = 7) == 0) {
165 rc = str2buf(&buf[slen], len - slen,
166 self->sample, sizeof(self->sample));
167 }
168 else if (strncmp("SLIDE\t", buf, slen = 6) == 0) {
169 rc = str2buf(&buf[slen], len - slen, self->slide, sizeof(self->slide));
170 }
171 else if (strncmp("SOFTWARE_VERSION\t", buf, slen = 17) == 0) {
172 rc = str2buf(&buf[slen], len - slen,
173 self->software_version, sizeof(self->software_version));
174 }
175 else {
176 rc = RC(rcRuntime, rcFile, rcConstructing, rcName, rcUnrecognized);
177 }
178
179 return rc;
180 }
181
182 static
CGReads15_GetAssemblyId(const CGReads15 * cself,const CGFIELD_ASSEMBLY_ID_TYPE ** assembly_id)183 rc_t CGReads15_GetAssemblyId(const CGReads15* cself, const CGFIELD_ASSEMBLY_ID_TYPE** assembly_id)
184 {
185 if( cself->assembly_id[0] == '\0' ) {
186 return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
187 }
188 *assembly_id = cself->assembly_id;
189 return 0;
190 }
191
192 static
CGReads15_GetSlide(const CGReads15 * cself,const CGFIELD_SLIDE_TYPE ** slide)193 rc_t CGReads15_GetSlide(const CGReads15* cself, const CGFIELD_SLIDE_TYPE** slide)
194 {
195 if( cself->slide[0] == '\0' ) {
196 return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
197 }
198 *slide = cself->slide;
199 return 0;
200 }
201
202 static
CGReads15_GetLane(const CGReads15 * cself,const CGFIELD_LANE_TYPE ** lane)203 rc_t CGReads15_GetLane(const CGReads15* cself, const CGFIELD_LANE_TYPE** lane)
204 {
205 if( cself->lane[0] == '\0' ) {
206 return RC(rcRuntime, rcFile, rcReading, rcFormat, rcInvalid);
207 }
208 *lane = cself->lane;
209 return 0;
210 }
211
212 static
CGReads15_GetBatchFileNumber(const CGReads15 * cself,const CGFIELD_BATCH_FILE_NUMBER_TYPE ** batch_file_number)213 rc_t CGReads15_GetBatchFileNumber(const CGReads15* cself, const CGFIELD_BATCH_FILE_NUMBER_TYPE** batch_file_number)
214 {
215 *batch_file_number = &cself->batch_file_number;
216 return 0;
217 }
218
219
CGReads15_Read(const CGReads15 * cself,TReadsData * data)220 static rc_t CC CGReads15_Read(const CGReads15* cself, TReadsData* data) {
221 rc_t rc = 0;
222
223 if (cself->start_rowid == 0) {
224 ((CGReads15*)cself)->start_rowid = data->rowid;
225 }
226 CG_LINE_START(cself->file, b, len, p);
227 if (b == NULL || len == 0) {
228 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcDone);
229 break;
230 }
231 /*DEBUG_MSG(10, ("reads: '%.*s'\n", len, b));*/
232 CG_LINE_NEXT_FIELD(b, len, p);
233 if ((rc = str2u16(b, p - b, &data->flags)) != 0) {
234 }
235 else if (data->flags > 10) {
236 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcOutofrange);
237 }
238 else if ((data->flags & 0x03) == 3 || (data->flags & 0x07) == 7) {
239 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
240 }
241 CG_LINE_NEXT_FIELD(b, len, p);
242 data->seq.sequence.elements = p - b;
243 if (data->seq.sequence.elements != CG_READS15_SPOT_LEN) {
244 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
245 }
246 else {
247 rc = str2buf(b, data->seq.sequence.elements,
248 data->read, sizeof(data->read));
249 /* clear cache, set in algnment writer */
250 data->reverse[0] = '\0';
251 data->reverse[CG_READS15_SPOT_LEN / 2] = '\0';
252 }
253 CG_LINE_LAST_FIELD(b, len, p);
254 data->seq.quality.elements = p - b;
255 if (data->seq.quality.elements != CG_READS15_SPOT_LEN) {
256 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
257 }
258 else {
259 rc = str2buf(b, data->seq.quality.elements,
260 data->qual, sizeof(data->qual));
261 }
262 data->seq.spot_len = CG_READS15_SPOT_LEN;
263
264 data->reads_format = 0x01050000;
265
266 if (cself->records == 0) {
267 size_t w;
268
269 #if 0
270 rc = string_printf(((CGReads15*)cself)->spot_group,
271 sizeof(cself->spot_group), &w, "%s:%s:%s:%04u", cself->assembly_id,
272 cself->slide, cself->lane, cself->batch_file_number);
273 #else
274 rc = string_printf(((CGReads15*)cself)->spot_group,
275 sizeof(cself->spot_group), &w, "%s-%s", cself->slide, cself->lane);
276 #endif
277 data->seq.spot_group.buffer = cself->spot_group;
278 data->seq.spot_group.elements = w;
279 }
280 ((CGReads15*)cself)->records++;
281 DEBUG_MSG(10,
282 ("reads: %u\t'%s'\t'%s'\n", data->flags, data->read, data->qual));
283 CG_LINE_END();
284
285 return rc;
286 }
287
CGReads25_Read(const CGReads15 * cself,TReadsData * data)288 static rc_t CC CGReads25_Read(const CGReads15* cself, TReadsData* data) {
289 rc_t rc = 0;
290
291 if (cself->start_rowid == 0) {
292 ((CGReads15*)cself)->start_rowid = data->rowid;
293 }
294 CG_LINE_START(cself->file, b, len, p);
295 if (b == NULL || len == 0) {
296 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcDone);
297 break;
298 }
299 /*DEBUG_MSG(10, ("reads: '%.*s'\n", len, b));*/
300 CG_LINE_NEXT_FIELD(b, len, p);
301 if ((rc = str2u16(b, p - b, &data->flags)) != 0) {
302 }
303 else if (data->flags > 10) {
304 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcOutofrange);
305 }
306 else if ((data->flags & 0x03) == 3 || (data->flags & 0x07) == 7) {
307 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
308 }
309 CG_LINE_NEXT_FIELD(b, len, p);
310 data->seq.sequence.elements = p - b;
311 if (data->seq.sequence.elements != CG_READS25_SPOT_LEN) {
312 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
313 }
314 else {
315 rc = str2buf(b, data->seq.sequence.elements,
316 data->read, sizeof(data->read));
317 /* clear cache, set in algnment writer */
318 data->reverse[0] = '\0';
319 data->reverse[CG_READS25_SPOT_LEN / 2] = '\0';
320 }
321 CG_LINE_LAST_FIELD(b, len, p);
322 data->seq.quality.elements = p - b;
323 if (data->seq.quality.elements != CG_READS25_SPOT_LEN) {
324 rc = RC(rcRuntime, rcFile, rcReading, rcData, rcInvalid);
325 }
326 else {
327 rc = str2buf(b, data->seq.quality.elements,
328 data->qual, sizeof(data->qual));
329 }
330 data->seq.spot_len = CG_READS25_SPOT_LEN;
331
332 data->reads_format = 0x02050000;
333
334 if (cself->records == 0) {
335 size_t w;
336
337 rc = string_printf(((CGReads15*)cself)->spot_group,
338 sizeof(cself->spot_group), &w, "%s-%s", cself->slide, cself->lane);
339 data->seq.spot_group.buffer = cself->spot_group;
340 data->seq.spot_group.elements = w;
341 }
342 ((CGReads15*)cself)->records++;
343 DEBUG_MSG(10,
344 ("reads: %u\t'%s'\t'%s'\n", data->flags, data->read, data->qual));
345 CG_LINE_END();
346
347 return rc;
348 }
349
350
351 static
CGReads15_GetStartRow(const CGReads15 * cself,int64_t * rowid)352 rc_t CGReads15_GetStartRow(const CGReads15* cself, int64_t* rowid)
353 {
354 *rowid = cself->start_rowid;
355 return 0;
356 }
357
358 static
CGReads15_Release(const CGReads15 * cself,uint64_t * records)359 void CC CGReads15_Release(const CGReads15* cself, uint64_t* records)
360 {
361 if( cself != NULL ) {
362 CGReads15* self = (CGReads15*)cself;
363 if( records != NULL ) {
364 *records = cself->records;
365 }
366 free(self);
367 }
368 }
369
370 static const CGFileType_vt CGReads15_vt =
371 {
372 CGReads15_Header,
373 CGReads15_Read,
374 CGReads15_GetStartRow,
375 NULL,
376 NULL,
377 NULL,
378 NULL, /* tag_lfr */
379 CGReads15_GetAssemblyId,
380 CGReads15_GetSlide,
381 CGReads15_GetLane,
382 CGReads15_GetBatchFileNumber,
383 NULL,
384 NULL,
385 CGReads15_Release
386 };
387
388 static const CGFileType_vt CGReads25_vt = {
389 CGReads25_Header,
390 CGReads25_Read,
391 CGReads15_GetStartRow,
392 NULL,
393 NULL,
394 NULL,
395 NULL, /* tag_lfr */
396 CGReads15_GetAssemblyId,
397 CGReads15_GetSlide,
398 CGReads15_GetLane,
399 CGReads15_GetBatchFileNumber,
400 NULL,
401 NULL,
402 CGReads15_Release
403 };
404
CGReads_Make(const CGFileType ** cself,const CGLoaderFile * file,const CGFileType_vt * vt)405 static rc_t CC CGReads_Make(const CGFileType** cself,
406 const CGLoaderFile* file, const CGFileType_vt *vt)
407 {
408 rc_t rc = 0;
409 CGReads15* obj = NULL;
410
411 if( cself == NULL || file == NULL ) {
412 rc = RC(rcRuntime, rcFile, rcConstructing, rcParam, rcNull);
413 }
414 if( rc == 0 ) {
415 *cself = NULL;
416 if( (obj = calloc(1, sizeof(*obj))) == NULL ) {
417 rc = RC(rcRuntime, rcFile, rcConstructing, rcMemory, rcExhausted);
418 } else {
419 obj->file = file;
420 obj->dad.type = cg_eFileType_READS;
421 obj->dad.vt = vt;
422 }
423 }
424 if( rc == 0 ) {
425 *cself = &obj->dad;
426 } else {
427 CGReads15_Release(obj, NULL);
428 }
429 return rc;
430 }
431
CGReads15_Make(const CGFileType ** cself,const CGLoaderFile * file)432 rc_t CC CGReads15_Make(const CGFileType** cself, const CGLoaderFile* file)
433 {
434 return CGReads_Make(cself, file, &CGReads15_vt);
435 }
436
CGReads13_Make(const CGFileType ** self,const CGLoaderFile * file)437 rc_t CC CGReads13_Make(const CGFileType** self, const CGLoaderFile* file)
438 { return CGReads15_Make(self, file); }
439
CGReads20_Make(const CGFileType ** self,const CGLoaderFile * file)440 rc_t CC CGReads20_Make(const CGFileType** self, const CGLoaderFile* file)
441 { return CGReads15_Make(self, file); }
442
CGReads22_Make(const CGFileType ** self,const CGLoaderFile * file)443 rc_t CC CGReads22_Make(const CGFileType** self, const CGLoaderFile* file)
444 { return CGReads15_Make(self, file); }
445
CGReads25_Make(const CGFileType ** self,const CGLoaderFile * file)446 rc_t CC CGReads25_Make(const CGFileType **self, const CGLoaderFile *file) {
447 return CGReads_Make(self, file, &CGReads25_vt);
448 }
449