1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <klib/rc.h>
28 #include <klib/log.h>
29
30 #include <vdb/database.h>
31
32 #include <kdb/manager.h>
33
34 #include <insdc/sra.h>
35 #include <insdc/insdc.h>
36
37 #include <sysalloc.h>
38
39 #include <stdlib.h>
40 #include <string.h>
41 #include <stdio.h>
42 #include <assert.h>
43 #include <ctype.h>
44
45 // Note: this is essentially a copy of ncbi-vdb/interfaces/loader/sequnce-writer.h,
46 // with "bool dropReadnames" added to the parameters of SequenceWriteRecord (VDB-4149)
47 #include "sequence-writer.h"
48 //#include <loader/sequence-writer.h>
49
50 #include <align/writer-sequence.h>
51
52 #include <loader/common-reader.h>
53
54 /* MARK: SequenceWriter Object */
55
SequenceWriterInit(SequenceWriter * self,VDatabase * db)56 SequenceWriter *SequenceWriterInit(SequenceWriter *self, VDatabase *db) {
57 memset(self, 0, sizeof(*self));
58 self->db = db;
59 VDatabaseAddRef(db);
60 return self;
61 }
62
SequenceWriteRecord(SequenceWriter * self,SequenceRecord const * rec,bool color,bool isDup,INSDC_SRA_platform_id platform,bool keepMismatchQual,bool no_real_output,bool hasTI,char const * QualQuantizer,bool dropReadnames)63 rc_t SequenceWriteRecord(SequenceWriter *self,
64 SequenceRecord const *rec,
65 bool color,
66 bool isDup,
67 INSDC_SRA_platform_id platform,
68 bool keepMismatchQual,
69 bool no_real_output,
70 bool hasTI,
71 char const *QualQuantizer,
72 bool dropReadnames
73 )
74 {
75 rc_t rc = 0;
76 uint8_t nreads = rec->numreads;
77 unsigned i;
78 unsigned seqLen;
79 int64_t dummyRowId;
80
81 uint8_t readInfo[4096];
82 void *h_readInfo = NULL;
83
84 INSDC_coord_zero *readStart = (void *)readInfo;
85 INSDC_coord_len *readLen;
86 uint8_t *alcnt;
87 INSDC_SRA_xread_type *readType;
88 INSDC_SRA_read_filter *readFilter;
89 bool *mask = NULL;
90 size_t const elemSize = sizeof(alcnt[0]) + sizeof(readType[0])
91 + sizeof(readStart[0]) + sizeof(readLen[0])
92 + sizeof(readFilter[0]);
93
94 TableWriterSeqData data;
95
96 for (i = seqLen = 0; i != nreads; ++i) {
97 seqLen += rec->readLen[i];
98 }
99
100 if (nreads * elemSize + keepMismatchQual * seqLen * sizeof(mask[0]) > sizeof(readInfo))
101 {
102 h_readInfo = malloc(nreads * elemSize + keepMismatchQual * seqLen * sizeof(mask[0]));
103 if (h_readInfo == NULL)
104 return RC(rcAlign, rcTable, rcWriting, rcMemory, rcExhausted);
105 readStart = h_readInfo;
106 }
107 readLen = (INSDC_coord_len *)&readStart[nreads];
108 alcnt = (uint8_t *)&readLen[nreads];
109 readType = (INSDC_SRA_xread_type *)&alcnt[nreads];
110 readFilter = (INSDC_SRA_read_filter *)&readType[nreads];
111
112 if (keepMismatchQual) {
113 mask = (bool *)&readFilter[nreads];
114
115 for (i = 0; i != seqLen; ++i) {
116 mask[i] = (rec->qual[i] & 0x80) != 0;
117 }
118 }
119
120 for (i = 0; i != nreads; ++i) {
121 alcnt[i] = rec->aligned[i] ? 1 : 0;
122 readLen[i] = rec->readLen[i];
123 readStart[i] = rec->readStart[i];
124 readType[i] = readLen[i] ? SRA_READ_TYPE_BIOLOGICAL : SRA_READ_TYPE_TECHNICAL;
125 switch ( rec->orientation[i] )
126 {
127 case ReadOrientationForward:
128 readType[i] |= SRA_READ_TYPE_FORWARD;
129 break;
130 case ReadOrientationReverse:
131 readType[i] |= SRA_READ_TYPE_REVERSE;
132 break;
133 case ReadOrientationUnknown:
134 default:
135 break;
136 }
137 readFilter[i] = isDup ? SRA_READ_FILTER_CRITERIA
138 : rec->is_bad[i] ? SRA_READ_FILTER_REJECT : SRA_READ_FILTER_PASS;
139 }
140
141 memset(&data, 0, sizeof(data));
142
143 data.sequence.buffer = rec->seq;
144 data.sequence.elements = seqLen;
145
146 data.quality.buffer = rec->qual;
147 data.quality.elements = seqLen;
148
149 if (keepMismatchQual) {
150 data.no_quantize_mask.buffer = mask;
151 data.no_quantize_mask.elements = seqLen;
152 }
153
154 data.alignment_count.buffer = alcnt;
155 data.alignment_count.elements = nreads;
156
157 data.nreads = nreads;
158
159 data.read_type.buffer = readType;
160 data.read_type.elements = nreads;
161
162 data.read_start.buffer = readStart;
163 data.read_start.elements = nreads;
164
165 data.read_len.buffer = readLen;
166 data.read_len.elements = nreads;
167
168 data.tmp_key_id = rec->keyId;
169
170 data.spot_group.buffer = rec->spotGroup;
171 data.spot_group.elements = rec->spotGroupLen;
172
173 data.cskey.buffer = rec->cskey;
174 data.cskey.elements = nreads;
175
176 data.read_filter.buffer = readFilter;
177 data.read_filter.elements = nreads;
178
179 data.platform.buffer = &platform;
180 data.platform.elements = 1;
181
182 data.ti.buffer = rec->ti;
183 data.ti.elements = nreads;
184
185 data.spot_name.buffer = rec->spotName;
186 data.spot_name.elements = rec->spotNameLen;
187
188 if (!no_real_output) {
189 if (self->tbl == NULL) {
190 int csoption = (color ? ewseq_co_ColorSpace : 0);
191 int readNameOption = (dropReadnames ? 0 : ewseq_co_SpotName);
192
193 if(hasTI) csoption |= ewseq_co_TI;
194
195 rc = TableWriterSeq_Make(&self->tbl, self->db,
196 csoption | ewseq_co_NoLabelData | ewseq_co_SpotGroup | readNameOption, QualQuantizer);
197 }
198 if (rc == 0) {
199 rc = TableWriterSeq_Write(self->tbl, &data, &dummyRowId);
200 }
201 }
202
203 if (h_readInfo)
204 free(h_readInfo);
205
206 return rc;
207 }
208
SequenceDoneWriting(SequenceWriter * self)209 rc_t SequenceDoneWriting(SequenceWriter *self)
210 {
211 return TableWriterSeq_TmpKeyStart(self->tbl);
212 }
213
SequenceReadKey(const SequenceWriter * cself,int64_t row,uint64_t * keyId)214 rc_t SequenceReadKey(const SequenceWriter *cself, int64_t row, uint64_t *keyId)
215 {
216 return TableWriterSeq_TmpKey(cself->tbl, row, keyId);
217 }
218
SequenceUpdateAlignData(SequenceWriter * self,int64_t rowId,unsigned nreads,int64_t const primeId[],uint8_t const algnCnt[])219 rc_t SequenceUpdateAlignData(SequenceWriter *self, int64_t rowId, unsigned nreads,
220 int64_t const primeId[/* nreads */],
221 uint8_t const algnCnt[/* nreads */])
222 {
223 TableWriterData data[2];
224
225 data[0].buffer = primeId; data[0].elements = nreads;
226 data[1].buffer = algnCnt; data[1].elements = nreads;
227
228 return TableWriterSeq_WriteAlignmentData(self->tbl, rowId, &data[0], &data[1]);
229 }
230
SequenceWhack(SequenceWriter * self,bool commit)231 void SequenceWhack(SequenceWriter *self, bool commit) {
232 uint64_t dummyRows;
233 /* rc_t rc; */
234
235 VDatabaseRelease(self->db);
236
237 if (self->tbl == NULL)
238 return;
239
240 /* rc = */ TableWriterSeq_Whack(self->tbl, commit, &dummyRows);
241 }
242
243 /* MARK: SequenceRecord Object */
244 static
SequenceRecordResize(SequenceRecord * self,KDataBuffer * storage,unsigned numreads,unsigned seqLen)245 rc_t SequenceRecordResize(SequenceRecord *self,
246 KDataBuffer *storage,
247 unsigned numreads,
248 unsigned seqLen)
249 {
250 size_t sz;
251 rc_t rc;
252
253 sz = seqLen * (sizeof(self->seq[0]) + sizeof(self->qual[0])) +
254 numreads * (sizeof(self->ti) +
255 sizeof(self->readStart[0]) +
256 sizeof(self->readLen[0]) +
257 sizeof(self->aligned[0]) +
258 sizeof(self->orientation[0]) +
259 sizeof(self->alignmentCount[0]) +
260 sizeof(self->cskey[0])
261 );
262 storage->elem_bits = 8;
263 rc = KDataBufferResize(storage, sz);
264 if (rc)
265 return rc;
266 self->numreads = numreads;
267
268 self->ti = (uint64_t *)storage->base;
269 self->readStart = (uint32_t *)&self->ti[numreads];
270 self->readLen = (uint32_t *)&self->readStart[numreads];
271 self->aligned = (bool *)&self->readLen[numreads];
272 self->orientation = (uint8_t *)&self->aligned[numreads];
273 self->is_bad = (uint8_t *)&self->orientation[numreads];
274 self->alignmentCount = (uint8_t *)&self->is_bad[numreads];
275 self->cskey = (char *)&self->alignmentCount[numreads];
276 self->seq = (char *)&self->cskey[numreads];
277 self->qual = (uint8_t *)&self->seq[seqLen];
278
279 self->spotGroup = NULL;
280 self->spotGroupLen = 0;
281 self->spotName = NULL;
282 self->spotNameLen = 0;
283
284 return 0;
285 }
286
SequenceRecordInit(SequenceRecord * self,unsigned numreads,unsigned readLen[])287 rc_t SequenceRecordInit(SequenceRecord *self, unsigned numreads, unsigned readLen[])
288 {
289 unsigned i;
290 unsigned seqlen = 0;
291 rc_t rc;
292
293 for (i = 0; i != numreads; ++i) {
294 seqlen += readLen[i];
295 }
296 rc = SequenceRecordResize(self, &self->storage, numreads, seqlen);
297 if (rc)
298 return rc;
299 memset(self->storage.base, 0, KDataBufferBytes(&self->storage));
300
301 for (seqlen = 0, i = 0; i != numreads; ++i) {
302 self->readLen[i] = readLen[i];
303 self->readStart[i] = seqlen;
304 seqlen += readLen[i];
305 }
306 self->numreads = numreads;
307 memset(self->cskey, 'T', numreads);
308 return 0;
309 }
310
SequenceRecordAppend(SequenceRecord * self,const SequenceRecord * other)311 rc_t SequenceRecordAppend(SequenceRecord *self,
312 const SequenceRecord *other
313 )
314 {
315 /* save the locations of the original data */
316 unsigned const seq = (uint8_t const *)self->seq - (uint8_t const *)self->storage.base;
317 unsigned const qual = (uint8_t const *)self->qual - (uint8_t const *)self->storage.base;
318 unsigned const cskey = (uint8_t const *)self->cskey - (uint8_t const *)self->storage.base;
319 unsigned const alignmentCount = (uint8_t const *)self->alignmentCount - (uint8_t const *)self->storage.base;
320 unsigned const is_bad = (uint8_t const *)self->is_bad - (uint8_t const *)self->storage.base;
321 unsigned const orientation = (uint8_t const *)self->orientation - (uint8_t const *)self->storage.base;
322 unsigned const aligned = (uint8_t const *)self->aligned - (uint8_t const *)self->storage.base;
323 unsigned const ti = (uint8_t const *)self->ti - (uint8_t const *)self->storage.base;
324 unsigned const readLen = (uint8_t const *)self->readLen - (uint8_t const *)self->storage.base;
325 /* unsigned const readStart = (uint8_t const *)self->readStart - (uint8_t const *)self->storage.base;*/
326
327 rc_t rc;
328 unsigned seqlen;
329 unsigned otherSeqlen;
330 unsigned i;
331 unsigned numreads = self->numreads;
332
333 for (seqlen = 0, i = 0; i != numreads; ++i) {
334 seqlen += self->readLen[i];
335 }
336 for (otherSeqlen = 0, i = 0; i != other->numreads; ++i) {
337 otherSeqlen += other->readLen[i];
338 }
339
340 rc = SequenceRecordResize(self, &self->storage, self->numreads + other->numreads, seqlen + otherSeqlen);
341 if (rc)
342 return rc;
343 /* this needs to be reverse order from assignment in Resize function
344 * these regions can overlap
345 */
346 memmove(self->qual, &((uint8_t const *)self->storage.base)[qual], seqlen);
347 memmove(self->seq, &((uint8_t const *)self->storage.base)[seq], seqlen);
348 memmove(self->cskey, &((uint8_t const *)self->storage.base)[cskey], numreads * sizeof(self->cskey[0]));
349 memmove(self->alignmentCount, &((uint8_t const *)self->storage.base)[alignmentCount], numreads * sizeof(self->alignmentCount[0]));
350 memmove(self->is_bad, &((uint8_t const *)self->storage.base)[is_bad], numreads * sizeof(self->is_bad[0]));
351 memmove(self->orientation, &((uint8_t const *)self->storage.base)[orientation], numreads * sizeof(self->orientation[0]));
352 memmove(self->aligned, &((uint8_t const *)self->storage.base)[aligned], numreads * sizeof(self->aligned[0]));
353 memmove(self->readLen, &((uint8_t const *)self->storage.base)[readLen], numreads * sizeof(self->readLen[0]));
354 memmove(self->ti, &((uint8_t const *)self->storage.base)[ti], numreads * sizeof(self->ti[0]));
355
356 memmove(&self->ti[numreads], other->ti, other->numreads * sizeof(self->ti[0]));
357 memmove(&self->readLen[numreads], other->readLen, other->numreads * sizeof(self->readLen[0]));
358 memmove(&self->aligned[numreads], other->aligned, other->numreads * sizeof(self->aligned[0]));
359 memmove(&self->orientation[numreads], other->orientation, other->numreads * sizeof(self->orientation[0]));
360 memmove(&self->is_bad[numreads], other->is_bad, other->numreads * sizeof(self->is_bad[0]));
361 memmove(&self->alignmentCount[numreads], other->alignmentCount, other->numreads * sizeof(self->alignmentCount[0]));
362 memmove(&self->cskey[numreads], other->cskey, other->numreads * sizeof(self->cskey[0]));
363 memmove(&self->seq[seqlen], other->seq, otherSeqlen);
364 memmove(&self->qual[seqlen], other->qual, otherSeqlen);
365
366 for (i = 0, seqlen = 0; i != self->numreads; ++i) {
367 self->readStart[i] = seqlen;
368 seqlen += self->readLen[i];
369 }
370
371 return 0;
372 }
373