1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include <klib/rc.h>
28 #include <klib/log.h>
29 
30 #include <vdb/database.h>
31 
32 #include <kdb/manager.h>
33 
34 #include <insdc/sra.h>
35 #include <insdc/insdc.h>
36 
37 #include <sysalloc.h>
38 
39 #include <stdlib.h>
40 #include <string.h>
41 #include <stdio.h>
42 #include <assert.h>
43 #include <ctype.h>
44 
45 // Note: this is essentially a copy of ncbi-vdb/interfaces/loader/sequnce-writer.h,
46 // with "bool dropReadnames" added to the parameters of SequenceWriteRecord (VDB-4149)
47 #include "sequence-writer.h"
48 //#include <loader/sequence-writer.h>
49 
50 #include <align/writer-sequence.h>
51 
52 #include <loader/common-reader.h>
53 
54 /* MARK: SequenceWriter Object */
55 
SequenceWriterInit(SequenceWriter * self,VDatabase * db)56 SequenceWriter *SequenceWriterInit(SequenceWriter *self, VDatabase *db) {
57     memset(self, 0, sizeof(*self));
58     self->db = db;
59     VDatabaseAddRef(db);
60     return self;
61 }
62 
SequenceWriteRecord(SequenceWriter * self,SequenceRecord const * rec,bool color,bool isDup,INSDC_SRA_platform_id platform,bool keepMismatchQual,bool no_real_output,bool hasTI,char const * QualQuantizer,bool dropReadnames)63 rc_t SequenceWriteRecord(SequenceWriter *self,
64                          SequenceRecord const *rec,
65                          bool color,
66                          bool isDup,
67                          INSDC_SRA_platform_id platform,
68                          bool keepMismatchQual,
69                          bool no_real_output,
70                          bool hasTI,
71                          char const *QualQuantizer,
72                          bool dropReadnames
73                          )
74 {
75     rc_t rc = 0;
76     uint8_t nreads = rec->numreads;
77     unsigned i;
78     unsigned seqLen;
79     int64_t dummyRowId;
80 
81     uint8_t readInfo[4096];
82     void *h_readInfo = NULL;
83 
84     INSDC_coord_zero *readStart = (void *)readInfo;
85     INSDC_coord_len *readLen;
86     uint8_t *alcnt;
87     INSDC_SRA_xread_type *readType;
88     INSDC_SRA_read_filter *readFilter;
89     bool *mask = NULL;
90     size_t const elemSize = sizeof(alcnt[0]) + sizeof(readType[0])
91                           + sizeof(readStart[0]) + sizeof(readLen[0])
92                           + sizeof(readFilter[0]);
93 
94     TableWriterSeqData data;
95 
96     for (i = seqLen = 0; i != nreads; ++i) {
97         seqLen += rec->readLen[i];
98     }
99 
100     if (nreads * elemSize + keepMismatchQual * seqLen * sizeof(mask[0]) > sizeof(readInfo))
101     {
102         h_readInfo = malloc(nreads * elemSize + keepMismatchQual * seqLen * sizeof(mask[0]));
103         if (h_readInfo == NULL)
104             return RC(rcAlign, rcTable, rcWriting, rcMemory, rcExhausted);
105         readStart = h_readInfo;
106     }
107     readLen = (INSDC_coord_len *)&readStart[nreads];
108     alcnt = (uint8_t *)&readLen[nreads];
109     readType = (INSDC_SRA_xread_type *)&alcnt[nreads];
110     readFilter = (INSDC_SRA_read_filter *)&readType[nreads];
111 
112     if (keepMismatchQual) {
113         mask = (bool *)&readFilter[nreads];
114 
115         for (i = 0; i != seqLen; ++i) {
116             mask[i] = (rec->qual[i] & 0x80) != 0;
117         }
118     }
119 
120     for (i = 0; i != nreads; ++i) {
121         alcnt[i] = rec->aligned[i] ? 1 : 0;
122         readLen[i] = rec->readLen[i];
123         readStart[i] = rec->readStart[i];
124         readType[i] = readLen[i] ? SRA_READ_TYPE_BIOLOGICAL : SRA_READ_TYPE_TECHNICAL;
125         switch ( rec->orientation[i] )
126         {
127             case ReadOrientationForward:
128                 readType[i] |= SRA_READ_TYPE_FORWARD;
129                 break;
130             case ReadOrientationReverse:
131                 readType[i] |= SRA_READ_TYPE_REVERSE;
132                 break;
133             case ReadOrientationUnknown:
134             default:
135                 break;
136         }
137         readFilter[i] = isDup ? SRA_READ_FILTER_CRITERIA
138                       : rec->is_bad[i] ? SRA_READ_FILTER_REJECT : SRA_READ_FILTER_PASS;
139     }
140 
141     memset(&data, 0, sizeof(data));
142 
143     data.sequence.buffer = rec->seq;
144     data.sequence.elements = seqLen;
145 
146     data.quality.buffer = rec->qual;
147     data.quality.elements = seqLen;
148 
149     if (keepMismatchQual) {
150         data.no_quantize_mask.buffer = mask;
151         data.no_quantize_mask.elements = seqLen;
152     }
153 
154     data.alignment_count.buffer = alcnt;
155     data.alignment_count.elements = nreads;
156 
157     data.nreads = nreads;
158 
159     data.read_type.buffer = readType;
160     data.read_type.elements = nreads;
161 
162     data.read_start.buffer = readStart;
163     data.read_start.elements = nreads;
164 
165     data.read_len.buffer = readLen;
166     data.read_len.elements = nreads;
167 
168     data.tmp_key_id = rec->keyId;
169 
170     data.spot_group.buffer = rec->spotGroup;
171     data.spot_group.elements = rec->spotGroupLen;
172 
173     data.cskey.buffer = rec->cskey;
174     data.cskey.elements = nreads;
175 
176     data.read_filter.buffer = readFilter;
177     data.read_filter.elements = nreads;
178 
179     data.platform.buffer = &platform;
180     data.platform.elements = 1;
181 
182     data.ti.buffer = rec->ti;
183     data.ti.elements = nreads;
184 
185     data.spot_name.buffer = rec->spotName;
186     data.spot_name.elements = rec->spotNameLen;
187 
188     if (!no_real_output) {
189         if (self->tbl == NULL) {
190             int csoption = (color ? ewseq_co_ColorSpace : 0);
191             int readNameOption = (dropReadnames ? 0 : ewseq_co_SpotName);
192 
193             if(hasTI) csoption |= ewseq_co_TI;
194 
195             rc = TableWriterSeq_Make(&self->tbl, self->db,
196                                      csoption | ewseq_co_NoLabelData | ewseq_co_SpotGroup | readNameOption, QualQuantizer);
197         }
198         if (rc == 0) {
199             rc = TableWriterSeq_Write(self->tbl, &data, &dummyRowId);
200         }
201     }
202 
203     if (h_readInfo)
204         free(h_readInfo);
205 
206     return rc;
207 }
208 
SequenceDoneWriting(SequenceWriter * self)209 rc_t SequenceDoneWriting(SequenceWriter *self)
210 {
211     return TableWriterSeq_TmpKeyStart(self->tbl);
212 }
213 
SequenceReadKey(const SequenceWriter * cself,int64_t row,uint64_t * keyId)214 rc_t SequenceReadKey(const SequenceWriter *cself, int64_t row, uint64_t *keyId)
215 {
216     return TableWriterSeq_TmpKey(cself->tbl, row, keyId);
217 }
218 
SequenceUpdateAlignData(SequenceWriter * self,int64_t rowId,unsigned nreads,int64_t const primeId[],uint8_t const algnCnt[])219 rc_t SequenceUpdateAlignData(SequenceWriter *self, int64_t rowId, unsigned nreads,
220                              int64_t const primeId[/* nreads */],
221                              uint8_t const algnCnt[/* nreads */])
222 {
223     TableWriterData data[2];
224 
225     data[0].buffer = primeId; data[0].elements = nreads;
226     data[1].buffer = algnCnt; data[1].elements = nreads;
227 
228     return TableWriterSeq_WriteAlignmentData(self->tbl, rowId, &data[0], &data[1]);
229 }
230 
SequenceWhack(SequenceWriter * self,bool commit)231 void SequenceWhack(SequenceWriter *self, bool commit) {
232     uint64_t dummyRows;
233     /* rc_t rc; */
234 
235     VDatabaseRelease(self->db);
236 
237     if (self->tbl == NULL)
238         return;
239 
240     /* rc = */ TableWriterSeq_Whack(self->tbl, commit, &dummyRows);
241 }
242 
243 /* MARK: SequenceRecord Object */
244 static
SequenceRecordResize(SequenceRecord * self,KDataBuffer * storage,unsigned numreads,unsigned seqLen)245 rc_t SequenceRecordResize(SequenceRecord *self,
246                           KDataBuffer *storage,
247                           unsigned numreads,
248                           unsigned seqLen)
249 {
250     size_t sz;
251     rc_t rc;
252 
253     sz = seqLen * (sizeof(self->seq[0]) + sizeof(self->qual[0])) +
254          numreads * (sizeof(self->ti) +
255                      sizeof(self->readStart[0]) +
256                      sizeof(self->readLen[0]) +
257                      sizeof(self->aligned[0]) +
258                      sizeof(self->orientation[0]) +
259                      sizeof(self->alignmentCount[0]) +
260                      sizeof(self->cskey[0])
261                      );
262     storage->elem_bits = 8;
263     rc = KDataBufferResize(storage, sz);
264     if (rc)
265         return rc;
266     self->numreads = numreads;
267 
268     self->ti = (uint64_t *)storage->base;
269     self->readStart = (uint32_t *)&self->ti[numreads];
270     self->readLen = (uint32_t *)&self->readStart[numreads];
271     self->aligned = (bool *)&self->readLen[numreads];
272     self->orientation = (uint8_t *)&self->aligned[numreads];
273     self->is_bad = (uint8_t *)&self->orientation[numreads];
274     self->alignmentCount = (uint8_t *)&self->is_bad[numreads];
275     self->cskey = (char *)&self->alignmentCount[numreads];
276     self->seq = (char *)&self->cskey[numreads];
277     self->qual = (uint8_t *)&self->seq[seqLen];
278 
279     self->spotGroup = NULL;
280     self->spotGroupLen = 0;
281     self->spotName = NULL;
282     self->spotNameLen = 0;
283 
284     return 0;
285 }
286 
SequenceRecordInit(SequenceRecord * self,unsigned numreads,unsigned readLen[])287 rc_t SequenceRecordInit(SequenceRecord *self, unsigned numreads, unsigned readLen[])
288 {
289     unsigned i;
290     unsigned seqlen = 0;
291     rc_t rc;
292 
293     for (i = 0; i != numreads; ++i) {
294         seqlen += readLen[i];
295     }
296     rc = SequenceRecordResize(self, &self->storage, numreads, seqlen);
297     if (rc)
298         return rc;
299     memset(self->storage.base, 0, KDataBufferBytes(&self->storage));
300 
301     for (seqlen = 0, i = 0; i != numreads; ++i) {
302         self->readLen[i] = readLen[i];
303         self->readStart[i] = seqlen;
304         seqlen += readLen[i];
305     }
306     self->numreads = numreads;
307     memset(self->cskey, 'T', numreads);
308     return 0;
309 }
310 
SequenceRecordAppend(SequenceRecord * self,const SequenceRecord * other)311 rc_t SequenceRecordAppend(SequenceRecord *self,
312                           const SequenceRecord *other
313                           )
314 {
315     /* save the locations of the original data */
316     unsigned const seq = (uint8_t const *)self->seq - (uint8_t const *)self->storage.base;
317     unsigned const qual = (uint8_t const *)self->qual - (uint8_t const *)self->storage.base;
318     unsigned const cskey = (uint8_t const *)self->cskey - (uint8_t const *)self->storage.base;
319     unsigned const alignmentCount = (uint8_t const *)self->alignmentCount - (uint8_t const *)self->storage.base;
320     unsigned const is_bad = (uint8_t const *)self->is_bad - (uint8_t const *)self->storage.base;
321     unsigned const orientation = (uint8_t const *)self->orientation - (uint8_t const *)self->storage.base;
322     unsigned const aligned = (uint8_t const *)self->aligned - (uint8_t const *)self->storage.base;
323     unsigned const ti = (uint8_t const *)self->ti - (uint8_t const *)self->storage.base;
324     unsigned const readLen = (uint8_t const *)self->readLen - (uint8_t const *)self->storage.base;
325 /*    unsigned const readStart = (uint8_t const *)self->readStart - (uint8_t const *)self->storage.base;*/
326 
327     rc_t rc;
328     unsigned seqlen;
329     unsigned otherSeqlen;
330     unsigned i;
331     unsigned numreads = self->numreads;
332 
333     for (seqlen = 0, i = 0; i != numreads; ++i) {
334         seqlen += self->readLen[i];
335     }
336     for (otherSeqlen = 0, i = 0; i != other->numreads; ++i) {
337         otherSeqlen += other->readLen[i];
338     }
339 
340     rc = SequenceRecordResize(self, &self->storage, self->numreads + other->numreads, seqlen + otherSeqlen);
341     if (rc)
342         return rc;
343     /* this needs to be reverse order from assignment in Resize function
344      * these regions can overlap
345      */
346     memmove(self->qual,             &((uint8_t const *)self->storage.base)[qual],              seqlen);
347     memmove(self->seq,              &((uint8_t const *)self->storage.base)[seq],               seqlen);
348     memmove(self->cskey,            &((uint8_t const *)self->storage.base)[cskey],             numreads * sizeof(self->cskey[0]));
349     memmove(self->alignmentCount,   &((uint8_t const *)self->storage.base)[alignmentCount],    numreads * sizeof(self->alignmentCount[0]));
350     memmove(self->is_bad,           &((uint8_t const *)self->storage.base)[is_bad],            numreads * sizeof(self->is_bad[0]));
351     memmove(self->orientation,      &((uint8_t const *)self->storage.base)[orientation],       numreads * sizeof(self->orientation[0]));
352     memmove(self->aligned,          &((uint8_t const *)self->storage.base)[aligned],           numreads * sizeof(self->aligned[0]));
353     memmove(self->readLen,          &((uint8_t const *)self->storage.base)[readLen],           numreads * sizeof(self->readLen[0]));
354     memmove(self->ti,               &((uint8_t const *)self->storage.base)[ti],                numreads * sizeof(self->ti[0]));
355 
356     memmove(&self->ti[numreads],             other->ti,              other->numreads * sizeof(self->ti[0]));
357     memmove(&self->readLen[numreads],        other->readLen,         other->numreads * sizeof(self->readLen[0]));
358     memmove(&self->aligned[numreads],        other->aligned,         other->numreads * sizeof(self->aligned[0]));
359     memmove(&self->orientation[numreads],    other->orientation,     other->numreads * sizeof(self->orientation[0]));
360     memmove(&self->is_bad[numreads],         other->is_bad,          other->numreads * sizeof(self->is_bad[0]));
361     memmove(&self->alignmentCount[numreads], other->alignmentCount,  other->numreads * sizeof(self->alignmentCount[0]));
362     memmove(&self->cskey[numreads],          other->cskey,           other->numreads * sizeof(self->cskey[0]));
363     memmove(&self->seq[seqlen],              other->seq,             otherSeqlen);
364     memmove(&self->qual[seqlen],             other->qual,            otherSeqlen);
365 
366     for (i = 0, seqlen = 0; i != self->numreads; ++i) {
367         self->readStart[i] = seqlen;
368         seqlen += self->readLen[i];
369     }
370 
371     return 0;
372 }
373