1 /* @source ajassemwrite *******************************************************
2 **
3 ** AJAX assembly writing functions
4 **
5 ** @author Copyright (C) 2010 Peter Rice
6 ** @version $Revision: 1.50 $
7 ** @modified Oct 25 2010 pmr First AJAX version
8 ** @modified $Date: 2012/07/05 12:04:37 $ by $Author: rice $
9 ** @@
10 **
11 ** This library is free software; you can redistribute it and/or
12 ** modify it under the terms of the GNU Lesser General Public
13 ** License as published by the Free Software Foundation; either
14 ** version 2.1 of the License, or (at your option) any later version.
15 **
16 ** This library is distributed in the hope that it will be useful,
17 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 ** Lesser General Public License for more details.
20 **
21 ** You should have received a copy of the GNU Lesser General Public
22 ** License along with this library; if not, write to the Free Software
23 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
24 ** MA  02110-1301,  USA.
25 **
26 ******************************************************************************/
27 
28 
29 #include "ajlib.h"
30 
31 #include "ajassem.h"
32 #include "ajassemwrite.h"
33 #include "ajseqbam.h"
34 #include "ajfileio.h"
35 #include "ajseq.h"
36 #include "ajutil.h"
37 #include "ajnam.h"
38 #include "ajbamindex.h"
39 
40 #include <ctype.h>
41 
42 #ifdef WIN32
43 #define strdup _strdup
44 #endif
45 
46 static AjPStr assemoutSamLinetxt = NULL;
47 
48 static AjBool assemoutWriteList(AjPFile outf, const AjPAssem assem);
49 
50 static AjBool assemoutWriteBam(AjPFile outf, const AjPAssem assem);
51 static AjBool assemoutWriteMaf(AjPFile outf, const AjPAssem assem);
52 static AjBool assemoutWriteSam(AjPFile outf, const AjPAssem assem);
53 static AjBool assemoutWriteNextList(AjPOutfile outfile, const AjPAssem assem);
54 static AjBool assemoutWriteNextBam(AjPOutfile outfile, const AjPAssem assem);
55 static AjBool assemoutWriteNextMaf(AjPOutfile outfile, const AjPAssem assem);
56 static AjBool assemoutWriteNextSam(AjPOutfile outfile, const AjPAssem assem);
57 static AjBool assemoutWriteSamAlignment(AjPFile outf, const AjPAssemRead r,
58 					AjPAssemContig const* contigs,
59 					ajint ncontigs);
60 static AjBool assemoutWriteBamAlignment(AjPSeqBamBgzf gzfile,
61 					const AjPAssemRead r,
62 					AjPSeqBam bam);
63 
64 static AjPStr assemoutMakeCigar(const char* contig, const char* readseq);
65 
66 static const AjPStr assemSAMGetReadgroupHeaderlines(const AjPAssem assem);
67 static void assemoutBamIndex(AjPOutfile outf);
68 
69 
70 
71 
72 /* @datastatic AssemPOutFormat ************************************************
73 **
74 ** Assembly output formats data structure
75 **
76 ** @alias AssemSoutFormat
77 ** @alias AssemOOutFormat
78 **
79 ** @attr Name [const char*] Format name
80 ** @attr Desc [const char*] Format description
81 ** @attr Write [AjBool function] Output function, returns ajTrue on success
82 ** @attr WriteNext [AjBool function] Partial output function
83 ** @attr Cleanup [void function] Function to write remaining lines on closing
84 ** @@
85 ******************************************************************************/
86 
87 typedef struct AssemSOutFormat
88 {
89     const char *Name;
90     const char *Desc;
91     AjBool (*Write) (AjPFile outf, const AjPAssem assem);
92     AjBool (*WriteNext) (AjPOutfile outfile, const AjPAssem assem);
93     void (*Cleanup) (AjPOutfile outf);
94 } AssemOOutFormat;
95 
96 #define AssemPOutFormat AssemOOutFormat*
97 
98 
99 static AssemOOutFormat assemoutFormatDef[] =
100 {
101 /* "Name",        "Description" */
102 /*      WriteFunction       WriteNextFunction  */
103 
104   {"bam",         "BAM assembly format",
105         &assemoutWriteBam,  &assemoutWriteNextBam,  &assemoutBamIndex},
106   {"list",        "Identifier only",
107         &assemoutWriteList, &assemoutWriteNextList, NULL},
108   {"maf",        "MIRA assembly format",
109         &assemoutWriteMaf,  &assemoutWriteNextMaf,  NULL},
110   {"sam",        "SAM assembly format",
111         &assemoutWriteSam,  &assemoutWriteNextSam,  NULL},
112 
113   {NULL, NULL, NULL, NULL, NULL}
114 };
115 
116 
117 
118 
119 /* @filesection ajassem *******************************************************
120 **
121 ** @nam1rule aj Function belongs to the AJAX library.
122 **
123 ******************************************************************************/
124 
125 
126 
127 
128 
129 /* @datasection [AjPFile] Assembly data output ********************************
130 **
131 ** Function is for manipulating assembly data objects
132 **
133 ** @nam2rule Assemout Assembly data output
134 **
135 ******************************************************************************/
136 
137 
138 
139 
140 /* @section assembly data outputs *********************************************
141 **
142 ** These functions write the assembly data provided by the first argument
143 **
144 ** @fdata [AjPFile]
145 **
146 ** @nam3rule Write Write assembly data
147 ** @nam4rule Format Use a named format
148 ** @nam4rule Next  Write next block of data
149 **
150 ** @argrule Write outf [AjPOutfile] Output file
151 ** @argrule Write assem [const AjPAssem] Assembly data
152 ** @argrule Format fmt [const AjPStr] Format name
153 **
154 ** @valrule * [AjBool] true on success
155 **
156 ** @fcategory output
157 **
158 ******************************************************************************/
159 
160 
161 
162 
163 /* @func ajAssemoutWrite ******************************************************
164 **
165 ** Write assembly data in a named format
166 **
167 ** @param [u] outf [AjPOutfile] Output file
168 ** @param [r] assem [const AjPAssem] Assembly data object
169 **
170 ** @return [AjBool] True on success
171 **
172 **
173 ** @release 6.5.0
174 ******************************************************************************/
175 
ajAssemoutWrite(AjPOutfile outf,const AjPAssem assem)176 AjBool ajAssemoutWrite(AjPOutfile outf, const AjPAssem assem)
177 {
178     ajuint i = ajOutfileGetFormatindex(outf);
179     AjPFile outfile = ajOutfileGetFile(outf);
180 
181     ajDebug("ajAssemoutWrite %d\n",i);
182     return (*assemoutFormatDef[i].Write)(outfile, assem);
183 }
184 
185 
186 
187 
188 /* @func ajAssemoutWriteNext **************************************************
189 **
190 ** Write latest chunk of assembly data in a named format
191 **
192 ** @param [u] outf [AjPOutfile] Output file
193 ** @param [r] assem [const AjPAssem] Assembly data object
194 **
195 ** @return [AjBool] True on success
196 **
197 **
198 ** @release 6.5.0
199 ******************************************************************************/
200 
ajAssemoutWriteNext(AjPOutfile outf,const AjPAssem assem)201 AjBool ajAssemoutWriteNext(AjPOutfile outf, const AjPAssem assem)
202 {
203     ajuint i = ajOutfileGetFormatindex(outf);
204 
205     ajDebug("ajAssemoutWriteNext %d\n",i);
206     if(!outf->Cleanup && assemoutFormatDef[i].Cleanup)
207         outf->Cleanup = assemoutFormatDef[i].Cleanup;
208 
209     return (*assemoutFormatDef[i].WriteNext)(outf, assem);
210 }
211 
212 
213 
214 
215 /* @funcstatic assemoutWriteBam ***********************************************
216 **
217 ** Writes an assembly in binary alignment/map (BAM) format.
218 **
219 ** The sort order is "unsorted". Samtools can re-sort the file.
220 **
221 ** @param [u] outf [AjPFile] Output file object.
222 ** @param [r] assem [const AjPAssem] Assembly object
223 ** @return [AjBool] True on success
224 **
225 ** @release 6.5.0
226 ** @@
227 ******************************************************************************/
228 
assemoutWriteBam(AjPFile outf,const AjPAssem assem)229 static AjBool assemoutWriteBam(AjPFile outf, const AjPAssem assem)
230 {
231     AjPSeqBamHeader header = NULL;
232     AjPAssemContig c = NULL;
233     AjPSeqBam bam;
234     AjPAssemRead   r = NULL;
235     AjPAssemContig* contigs = NULL;
236     AjPAssemTag    t = NULL;
237     AjIList j = NULL;
238     AjPSeqBamBgzf gzfile = NULL;
239     AjPStr headertext=NULL;
240     const AjPStr rgheadertext=NULL;
241     AjBool ret = ajTrue;
242     ajint i=0;
243     ajulong ncontigs=0UL;
244 
245 
246     if(ajListGetLength(assem->ContigsOrder))
247 	ncontigs = ajListToarray(assem->ContigsOrder, (void***)&contigs);
248     else
249 	ncontigs = ajTableToarrayValues(assem->Contigs, (void***)&contigs);
250 
251     AJNEW0(bam);
252     bam->m_data=10;
253     AJCNEW0(bam->data, bam->m_data);
254 
255     gzfile = ajSeqBamBgzfNew(ajFileGetFileptr(outf), "w");
256 
257     ajFmtPrintS(&headertext, "@HD\tVN:1.3\tSO:%s\n",
258 		ajAssemGetSortorderC(assem));
259     header = ajSeqBamHeaderNewN((ajuint) ncontigs);
260 
261     while (contigs[i])   /* contigs */
262     {
263 	c = contigs[i];
264 
265 	if(ajStrMatchC(c->Name, "*"))
266 	{
267 	    i++;
268 	    continue;
269 	}
270 
271 	header->target_name[i] = strdup(ajStrGetPtr(c->Name));
272 	header->target_len[i++] = c->Length;
273 
274 	ajFmtPrintAppS(&headertext, "@SQ\tSN:%S\tLN:%d",
275 		c->Name, c->Length);
276 
277 	if(c->URI)
278 	    ajFmtPrintAppS(&headertext, "\tUR:%S", c->URI);
279 
280 	if(c->MD5)
281 	    ajFmtPrintAppS(&headertext, "\tM5:%S", c->MD5);
282 
283 	if(c->Species)
284 	    ajFmtPrintAppS(&headertext, "\tSP:%S", c->Species);
285 
286 	ajFmtPrintAppS(&headertext, "\n");
287 
288 
289 	j = ajListIterNewread(c->Tags);
290 	while (!ajListIterDone(j))
291 	{
292 	    t = ajListIterGet(j);
293 	    ajFmtPrintAppS(&headertext,
294 		    "@CO\t%S %u %u %S\n", t->Name, t->x1, t->y1,
295 		    t->Comment);
296 	}
297 	ajListIterDel(&j);
298     }
299 
300     rgheadertext = assemSAMGetReadgroupHeaderlines(assem);
301     if(rgheadertext)
302 	ajStrAppendS(&headertext, rgheadertext);
303 
304     ajSeqBamHeaderSetTextC(header, ajStrGetPtr(headertext));
305     ajSeqBamHeaderWrite(gzfile, header);
306 
307     j = ajListIterNewread(assem->Reads);
308 
309     while (!ajListIterDone(j))  /* reads */
310     {
311 	r = ajListIterGet(j);
312 	assemoutWriteBamAlignment(gzfile, r, bam);
313     }
314 
315     ajListIterDel(&j);
316 
317     ajSeqBamBgzfClose(gzfile);
318     ajSeqBamHeaderDel(&header);
319     ajStrDel(&headertext);
320 
321     AJFREE(contigs);
322     AJFREE(bam->data);
323     AJFREE(bam);
324 
325     ajBamIndexBuild(ajFileGetNameC(outf));
326 
327     return ret;
328 }
329 
330 
331 
332 
333 /* @funcstatic assemoutBamIndex ***********************************************
334 **
335 ** Closes and idexes a BAM output file
336 **
337 ** @param [u] outf [AjPOutfile] Output file object.
338 ** @return [void]
339 **
340 ** @release 6.5.0
341 ** @@
342 ******************************************************************************/
343 
assemoutBamIndex(AjPOutfile outf)344 static void assemoutBamIndex(AjPOutfile outf)
345 {
346     char *fname = ajCharNewC(ajFileGetNameC(ajOutfileGetFile(outf)));
347     AjPSeqBamBgzf gzfile = outf->OutData;
348 
349     ajSeqBamBgzfClose(gzfile);
350     ajFileClose(&outf->File);
351 
352     ajBamIndexBuild(fname);
353 
354     AJFREE(fname);
355 
356     return;
357 }
358 
359 
360 
361 
362 /* @funcstatic assemoutWriteNextBam *******************************************
363 **
364 ** Writes an assembly in binary alignment/map (BAM) format.
365 **
366 ** The sort order is "unsorted". Samtools can re-sort the file.
367 **
368 ** @param [u] outfile [AjPOutfile] Output file object.
369 ** @param [r] assem [const AjPAssem] Assembly object
370 ** @return [AjBool] True on success
371 **
372 ** @release 6.5.0
373 ** @@
374 ******************************************************************************/
375 
assemoutWriteNextBam(AjPOutfile outfile,const AjPAssem assem)376 static AjBool assemoutWriteNextBam(AjPOutfile outfile, const AjPAssem assem)
377 {
378     AjPFile outf = ajOutfileGetFile(outfile);
379     AjPSeqBamHeader header = NULL;
380     AjPAssemContig c = NULL;
381     AjPSeqBam bam;
382     AjPAssemRead   r = NULL;
383     AjPAssemContig* contigs = NULL;
384     AjPAssemTag    t = NULL;
385     AjIList j = NULL;
386     AjPSeqBamBgzf gzfile = NULL;
387     AjPStr headertext=NULL;
388     const AjPStr rgheadertext=NULL;
389     AjBool ret = ajTrue;
390     ajint i=0;
391     ajulong ncontigs=0UL;
392 
393     if(!outf) return ajFalse;
394     if(!assem) return ajFalse;
395 
396     if(!assem->Hasdata)
397     {
398         if(ajListGetLength(assem->ContigsOrder))
399             ncontigs = ajListToarray(assem->ContigsOrder, (void***)&contigs);
400         else
401             ncontigs = ajTableToarrayValues(assem->Contigs, (void***)&contigs);
402 
403         ajFmtPrintS(&headertext, "@HD\tVN:1.3\tSO:%s\n",
404                     ajAssemGetSortorderC(assem));
405         header = ajSeqBamHeaderNewN((ajuint) ncontigs);
406 
407         gzfile = ajSeqBamBgzfNew(ajFileGetFileptr(outf), "w");
408         outfile->OutData = gzfile;
409 
410         while (contigs[i])   /* contigs */
411         {
412             c = contigs[i];
413 
414             if(ajStrMatchC(c->Name, "*"))
415             {
416                 i++;
417                 continue;
418             }
419 
420             header->target_name[i] = strdup(ajStrGetPtr(c->Name));
421             header->target_len[i++] = c->Length;
422 
423             ajFmtPrintAppS(&headertext, "@SQ\tSN:%S\tLN:%d",
424                            c->Name, c->Length);
425 
426             if(c->URI)
427                 ajFmtPrintAppS(&headertext, "\tUR:%S", c->URI);
428 
429             if(c->MD5)
430                 ajFmtPrintAppS(&headertext, "\tM5:%S", c->MD5);
431 
432             if(c->Species)
433                 ajFmtPrintAppS(&headertext, "\tSP:%S", c->Species);
434 
435             ajFmtPrintAppS(&headertext, "\n");
436 
437 
438             j = ajListIterNewread(c->Tags);
439             while (!ajListIterDone(j))
440             {
441                 t = ajListIterGet(j);
442                 ajFmtPrintAppS(&headertext,
443                                "@CO\t%S %u %u %S\n", t->Name, t->x1, t->y1,
444                                t->Comment);
445             }
446             ajListIterDel(&j);
447         }
448 
449         rgheadertext = assemSAMGetReadgroupHeaderlines(assem);
450         if(rgheadertext)
451             ajStrAppendS(&headertext, rgheadertext);
452 
453         ajSeqBamHeaderSetTextC(header, ajStrGetPtr(headertext));
454         ajSeqBamHeaderWrite(gzfile, header);
455 
456         ajSeqBamHeaderDel(&header);
457         ajStrDel(&headertext);
458 
459         AJFREE(contigs);
460 
461         if(!assem->BamHeader)
462             return ajTrue;
463     }
464 
465     /* data */
466 
467     gzfile = outfile->OutData;
468 
469     AJNEW0(bam);
470     bam->m_data=10;
471     AJCNEW0(bam->data, bam->m_data);
472 
473     j = ajListIterNewread(assem->Reads);
474 
475     while (!ajListIterDone(j))  /* reads */
476     {
477 	r = ajListIterGet(j);
478 	assemoutWriteBamAlignment(gzfile, r, bam);
479     }
480 
481     ajListIterDel(&j);
482 
483     AJFREE(bam->data);
484     AJFREE(bam);
485 
486     /* ajSeqBamBgzfClose(gzfile);*/
487 
488     return ret;
489 }
490 
491 
492 
493 
494 /* @funcstatic assemoutWriteBamAlignment **************************************
495 **
496 ** Write individual alignment records of assembly data in BAM format
497 **
498 ** @param [u] gzfile [AjPSeqBamBgzf] Output file
499 ** @param [r] r [const AjPAssemRead] Read object with alignment information
500 ** @param [u] bam [AjPSeqBam] samtools data structure to store
501 **                            alignment data before it is written in BAM format
502 **
503 ** @return [AjBool] True on success
504 **
505 ** @release 6.5.0
506 **
507 ******************************************************************************/
508 
assemoutWriteBamAlignment(AjPSeqBamBgzf gzfile,const AjPAssemRead r,AjPSeqBam bam)509 static AjBool assemoutWriteBamAlignment(AjPSeqBamBgzf gzfile,
510 					const AjPAssemRead r,
511 					AjPSeqBam bam)
512 {
513     AjPSeqBamCore c;
514     AjPAssemTag tag;
515     unsigned char *dpos;
516     const char *s;
517     ajuint ilen;
518     ajuint slen;
519     ajuint i;
520     AjIList l = NULL;
521 
522     /* optional fields */
523     ajint tagvalsize = 0;
524     const unsigned char* tagval = 0;
525     ajint intval =0;
526 
527     /* processing cigar strings*/
528     char *t;
529     int op;
530     long x;
531 
532 
533     unsigned char bam_nt16_table[256] =
534     {
535      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
536      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
537      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
538      1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
539      15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
540      15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
541      15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
542      15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
543      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
544      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
545      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
546      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
547      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
548      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
549      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
550      15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
551     };
552 
553     /* bam_write1 for each alignment */
554 
555     c = &bam->core;
556 
557     ilen = ajStrGetLen(r->Seq);
558 
559     c->tid = (int) r->Reference;
560 
561     if(r->Flag & BAM_FREVERSE)
562 	c->pos = r->y1-1;
563     else
564 	c->pos = r->x1-1; /* BAM format is zero based;
565                              -1 is translated to 0, meaning unmapped */
566     c->bin = 0;
567     c->qual = r->MapQ;
568     c->l_qname = 1 + ajStrGetLen(r->Name);
569     c->flag = r->Flag;
570     c->n_cigar = 0;
571     c->l_qseq = ilen;
572     c->mtid = (int) r->Rnext;
573     c->mpos = (int) r->Pnext-1;
574     c->isize = r->Tlen;
575 
576 
577     /* get cigar string length */
578     s = ajStrGetPtr(r->Cigar);
579     if (strcmp(s,"*")) /* '*' means unavailable  */
580     {
581 	for (; *s; ++s)
582 	{
583 	  if ((isalpha((int)*s)) || (*s=='='))
584 		++c->n_cigar;
585 	    else
586 	      if (!isdigit((int)*s))
587 		    ajWarn("invalid CIGAR character: %c\n", *s);
588 	}
589     }
590 
591 
592     bam->data_len = c->n_cigar*4 + c->l_qname +
593         (ilen + 1)/2 + ilen;
594 
595     /* allocation for optional tags are made as they are appended */
596 
597     if(bam->data_len > bam->m_data)
598     {
599         AJCRESIZE0(bam->data,bam->m_data, bam->data_len);
600         bam->m_data = bam->data_len;
601     }
602 
603     dpos = bam->data;
604 
605     /* copy query name to bam->data */
606     memcpy(dpos, ajStrGetPtr(r->Name), c->l_qname);
607     dpos += c->l_qname;
608 
609     /* copy cigar string to bam->data */
610     s = ajStrGetPtr(r->Cigar);
611     for (i = 0; i != c->n_cigar; ++i)
612     {
613 	x = strtol(s, &t, 10);
614 	op = toupper((int)*t);
615 	if (op == 'M') op = BAM_CMATCH;
616 	else if (op == 'I') op = BAM_CINS;
617 	else if (op == 'D') op = BAM_CDEL;
618 	else if (op == 'N') op = BAM_CREF_SKIP;
619 	else if (op == 'S') op = BAM_CSOFT_CLIP;
620 	else if (op == 'H') op = BAM_CHARD_CLIP;
621 	else if (op == 'P') op = BAM_CPAD;
622 	else if (op == '=') op = BAM_CEQUAL;
623 	else if (op == 'X') op = BAM_CDIFF;
624 	else ajWarn("invalid CIGAR operation: %c",op);
625 	s = t + 1;
626 	((ajuint*)dpos)[i] = x << BAM_CIGAR_SHIFT | op;
627     }
628 
629     if (*s && c->n_cigar)
630 	ajWarn("unmatched CIGAR operation: %c", *s);
631 
632     c->bin = ajSeqBamReg2bin(c->pos, ajSeqBamCalend(c, MAJSEQBAMCIGAR(bam)));
633 
634     dpos += c->n_cigar*4;
635 
636 
637     /* copy sequence string to bam->data */
638     s = ajStrGetPtr(r->Seq);
639     slen = (ilen+1)/2;
640     for (i = 0; i < slen; ++i)
641         dpos[i] = 0;
642     for (i = 0; i < ilen; ++i)
643         dpos[i/2] |= bam_nt16_table[(ajuint)s[i]] << 4*(1-i%2);
644     dpos += slen;
645 
646     /* copy quality values to bam->data */
647     if(r->SeqQ && !ajStrMatchC(r->SeqQ, "*"))
648     {
649 	s = ajStrGetPtr(r->SeqQ);
650 
651 	for(i=0;i<ilen;i++)
652 	    dpos[i]= s[i]-33;
653 
654     }
655     else
656 	for(i=0;i<ilen;i++)
657 	    dpos[i]= 0xff;
658 
659 
660 
661     l = ajListIterNewread(r->Tags);
662     bam->l_aux=0;
663     while (!ajListIterDone(l))
664     {
665 
666 	tag = ajListIterGet(l);
667 
668 	/* TODO: array type 'B' and other types */
669 
670 	if(tag->type == 'i' || tag->type == 'I')
671 	{
672 	    tagvalsize = 4;
673 	    ajStrToInt(tag->Comment, &intval);
674 	    tagval = (unsigned char*)&intval;
675 	}
676 	else if(tag->type =='s' || tag->type =='S')
677 	{
678 	    tagvalsize = 2;
679 	    ajStrToInt(tag->Comment, &intval);
680 	    tagval = (unsigned char*)&intval;
681 	}
682 	else if(tag->type =='c' || tag->type =='C')
683 	{
684 	    tagvalsize = 1;
685 	    ajStrToInt(tag->Comment, &intval);
686 	    tagval = (unsigned char*)&intval;
687 	}
688 	else if(tag->type =='A')
689 	{
690 	    tagvalsize = 1;
691 	    tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
692 	}
693 	else if(tag->type =='Z')
694 	{
695 	    tagvalsize = ajStrGetLen(tag->Comment)+1;
696 	    tagval = (const unsigned char*)ajStrGetPtr(tag->Comment);
697 	}
698 	else
699 	{
700 	    ajWarn("tag type '%c' not yet supported",tag->type);
701 	    continue;
702 	}
703 	ajSeqBamAuxAppend(bam,
704 		ajStrGetPtr(tag->Name),
705 		tag->type,
706 		tagvalsize,
707 		tagval);
708 
709 
710     }
711     ajListIterDel(&l);
712 
713 
714 
715     ajSeqBamWrite(gzfile, bam);
716 
717     return ajTrue;
718 }
719 
720 
721 
722 
723 /* @funcstatic assemoutWriteMaf ***********************************************
724 **
725 ** Write assembly data in Mira Assembly Format
726 **
727 ** @param [u] outf [AjPFile] Output file
728 ** @param [r] assem [const AjPAssem] Assembly data object
729 **
730 ** @return [AjBool] True on success
731 **
732 **
733 ** @release 6.5.0
734 ******************************************************************************/
735 
assemoutWriteMaf(AjPFile outf,const AjPAssem assem)736 static AjBool assemoutWriteMaf(AjPFile outf, const AjPAssem assem)
737 {
738     AjPAssemContig c = NULL;
739     AjPAssemRead   r = NULL;
740     AjPAssemTag    t = NULL;
741     ajint i = 0;
742     ajint nreads = 0;
743     AjIList j = NULL;
744     AjIList k = NULL;
745     AjIList reads = NULL;
746     AjPAssemContig* contigs = NULL;
747 
748     if(!outf) return ajFalse;
749     if(!assem) return ajFalse;
750 
751     ajTableToarrayValues(assem->Contigs, (void***)&contigs);
752     reads = ajListIterNewread(assem->Reads);
753 
754     for (;contigs[i];i++)
755     {
756 	c = contigs[i];
757 	ajFmtPrintF(outf, "CO\t%S\n", c->Name);
758 	ajFmtPrintF(outf, "NR\t%d\n", c->Nreads);
759 	ajFmtPrintF(outf, "LC\t%d\n", c->Length);
760 
761 	j = ajListIterNewread(c->Tags);
762 	while (!ajListIterDone(j))
763 	{
764 	    t = ajListIterGet(j);
765 	    ajFmtPrintF(outf, "CT\t%S %u %u %S\n", t->Name, t->x1, t->y1,
766 		    t->Comment);
767 	}
768 	ajListIterDel(&j);
769 
770 	ajFmtPrintF(outf, "CS\t%S\n", c->Consensus);
771 	ajFmtPrintF(outf, "CQ\t%S\n", c->ConsensusQ);
772 
773 	ajFmtPrintF(outf, "\\\\\n");
774 
775 	while (nreads++ < c->Nreads && !ajListIterDone(reads))
776 	{
777 	    r = ajListIterGet(reads);
778 
779 	    if(r->Reference !=i)
780 		ajErr("different reference/contig number(%d) than expected(%d)"
781 			"\nreads were expected to be sorted by contigs");
782 
783 	    ajFmtPrintF(outf, "RD\t%S\n", r->Name);
784 	    ajFmtPrintF(outf, "RS\t%S\n", r->Seq);
785 	    ajFmtPrintF(outf, "RQ\t%S\n", r->SeqQ);
786 	    ajFmtPrintF(outf, "TN\t%S\n", r->Template);
787 
788 	    if(r->Direction)
789 		ajFmtPrintF(outf, "DI\t%c\n", r->Direction);
790 
791 	    if(r->TemplateSizeMin)
792 		ajFmtPrintF(outf, "TF\t%d\n", r->TemplateSizeMin);
793 
794 	    if(r->TemplateSizeMax)
795 		ajFmtPrintF(outf, "TT\t%d\n", r->TemplateSizeMax);
796 
797 	    if(r->File)
798 		ajFmtPrintF(outf, "SF\t%S\n", r->File);
799 
800 	    if(r->VectorLeft)
801 		ajFmtPrintF(outf, "SL\t%d\n", r->VectorLeft);
802 
803 	    if(r->VectorRight)
804 		ajFmtPrintF(outf, "SR\t%d\n", r->VectorRight);
805 
806 	    if(r->QualLeft)
807 		ajFmtPrintF(outf, "QL\t%d\n", r->QualLeft);
808 
809 	    if(r->QualRight)
810 		ajFmtPrintF(outf, "QR\t%d\n", r->QualRight);
811 
812 	    if(r->ClipLeft)
813 		ajFmtPrintF(outf, "SL\t%d\n", r->ClipLeft);
814 
815 	    if(r->ClipRight)
816 		ajFmtPrintF(outf, "SR\t%d\n", r->ClipRight);
817 
818 	    k = ajListIterNewread(r->Tags);
819 
820 	    while (!ajListIterDone(k))
821 	    {
822 		t = ajListIterGet(k);
823 
824 		ajFmtPrintF(outf, "RT\t%S %u %u\n", t->Name,
825 			t->x1, t->y1);
826 	    }
827 
828             ajListIterDel(&k);
829 
830 	    ajFmtPrintF(outf, "ST\t%S\n", r->Technology);
831 	    ajFmtPrintF(outf, "ER\n");
832 	    ajFmtPrintF(outf, "AT\t%u %u %u %u\n", r->x1, r->y1, r->x2, r->y2);
833 	}
834 
835 	ajListIterDel(&j);
836 	ajFmtPrintF(outf, "//\n");
837 	ajFmtPrintF(outf, "EC\n");
838     }
839 
840     AJFREE(contigs);
841     ajListIterDel(&reads);
842 
843     return ajTrue;
844 }
845 
846 
847 
848 
849 /* @funcstatic assemoutWriteNextMaf *******************************************
850 **
851 ** Write latest chunk of assembly data in Mira Assembly Format
852 **
853 ** @param [u] outfile [AjPOutfile] Output file
854 ** @param [r] assem [const AjPAssem] Assembly data object
855 **
856 ** @return [AjBool] True on success
857 **
858 **
859 ** @release 6.5.0
860 ******************************************************************************/
861 
assemoutWriteNextMaf(AjPOutfile outfile,const AjPAssem assem)862 static AjBool assemoutWriteNextMaf(AjPOutfile outfile, const AjPAssem assem)
863 {
864     AjPFile outf = ajOutfileGetFile(outfile);
865     AjPAssemContig c = NULL;
866     AjPAssemRead   r = NULL;
867     AjPAssemTag    t = NULL;
868     ajint i = 0;
869     ajint nreads = 0;
870     AjIList j = NULL;
871     AjIList k = NULL;
872     AjIList reads = NULL;
873     AjPAssemContig* contigs = NULL;
874 
875     if(!outf) return ajFalse;
876     if(!assem) return ajFalse;
877 
878     if(!assem->Hasdata)
879     {
880         ajTableToarrayValues(assem->Contigs, (void***)&contigs);
881         for (;contigs[i];i++)
882         {
883             c = contigs[i];
884             ajFmtPrintF(outf, "CO\t%S\n", c->Name);
885             ajFmtPrintF(outf, "NR\t%d\n", c->Nreads);
886             ajFmtPrintF(outf, "LC\t%d\n", c->Length);
887 
888             j = ajListIterNewread(c->Tags);
889             while (!ajListIterDone(j))
890             {
891                 t = ajListIterGet(j);
892                 ajFmtPrintF(outf, "CT\t%S %u %u %S\n", t->Name, t->x1, t->y1,
893                             t->Comment);
894             }
895             ajListIterDel(&j);
896 
897             ajFmtPrintF(outf, "CS\t%S\n", c->Consensus);
898             ajFmtPrintF(outf, "CQ\t%S\n", c->ConsensusQ);
899 
900             ajFmtPrintF(outf, "\\\\\n");
901         }
902         AJFREE(contigs);
903 
904         return ajTrue;
905     }
906 
907     /* data */
908 
909     reads = ajListIterNewread(assem->Reads);
910 
911     while (!ajListIterDone(reads))
912     {
913         nreads++;
914         r = ajListIterGet(reads);
915 
916         if(r->Reference !=i)
917             ajErr("different reference/contig number(%d) than expected(%d)"
918                   "\nreads were expected to be sorted by contigs");
919 
920         ajFmtPrintF(outf, "RD\t%S\n", r->Name);
921         ajFmtPrintF(outf, "RS\t%S\n", r->Seq);
922         ajFmtPrintF(outf, "RQ\t%S\n", r->SeqQ);
923         ajFmtPrintF(outf, "TN\t%S\n", r->Template);
924 
925         if(r->Direction)
926             ajFmtPrintF(outf, "DI\t%c\n", r->Direction);
927 
928         if(r->TemplateSizeMin)
929             ajFmtPrintF(outf, "TF\t%d\n", r->TemplateSizeMin);
930 
931         if(r->TemplateSizeMax)
932             ajFmtPrintF(outf, "TT\t%d\n", r->TemplateSizeMax);
933 
934         if(r->File)
935             ajFmtPrintF(outf, "SF\t%S\n", r->File);
936 
937         if(r->VectorLeft)
938             ajFmtPrintF(outf, "SL\t%d\n", r->VectorLeft);
939 
940         if(r->VectorRight)
941             ajFmtPrintF(outf, "SR\t%d\n", r->VectorRight);
942 
943         if(r->QualLeft)
944             ajFmtPrintF(outf, "QL\t%d\n", r->QualLeft);
945 
946         if(r->QualRight)
947             ajFmtPrintF(outf, "QR\t%d\n", r->QualRight);
948 
949         if(r->ClipLeft)
950             ajFmtPrintF(outf, "SL\t%d\n", r->ClipLeft);
951 
952         if(r->ClipRight)
953             ajFmtPrintF(outf, "SR\t%d\n", r->ClipRight);
954 
955         k = ajListIterNewread(r->Tags);
956 
957         while (!ajListIterDone(k))
958         {
959             t = ajListIterGet(k);
960 
961             ajFmtPrintF(outf, "RT\t%S %u %u\n", t->Name,
962 			t->x1, t->y1);
963         }
964 
965         ajListIterDel(&k);
966 
967         ajFmtPrintF(outf, "ST\t%S\n", r->Technology);
968         ajFmtPrintF(outf, "ER\n");
969         ajFmtPrintF(outf, "AT\t%u %u %u %u\n", r->x1, r->y1, r->x2, r->y2);
970     }
971 
972     ajListIterDel(&j);
973     ajListIterDel(&reads);
974 
975     if(!nreads)
976     {
977         ajFmtPrintF(outf, "//\n");
978         ajFmtPrintF(outf, "EC\n");
979     }
980 
981 
982     return ajTrue;
983 }
984 
985 
986 
987 
988 /* @funcstatic assemoutMakeCigar **********************************************
989 **
990 ** Given a reference sequence region and a read sequence returns CIGAR string
991 ** representing the alignment.
992 **
993 ** Based on make_cigar function in Peter Cock's maf2sam project
994 **
995 ** We have a similar function in ajalign.c, called alignCigar(),
996 ** we should aim using that function rather than having two similar functions
997 **
998 ** @param [r] contig  [const char*] Reference sequence region
999 ** @param [r] readseq [const char*] Read sequence
1000 **
1001 ** @return [AjPStr] CIGAR string
1002 **
1003 **
1004 ** @release 6.5.0
1005 ******************************************************************************/
1006 
assemoutMakeCigar(const char * contig,const char * readseq)1007 static AjPStr assemoutMakeCigar(const char* contig, const char* readseq)
1008 {
1009     AjPStr cigar = NULL;
1010     ajint count =0;
1011 
1012     char mode = '\0';
1013     char c;
1014     char r;
1015 
1016     c = contig[0];
1017     r = readseq[0];
1018 
1019     cigar = ajStrNewC("");
1020 
1021     for(;c!=0; c=*++contig, r=*++readseq)
1022     {
1023 
1024 	if(c=='*' && r=='*')
1025 	    continue;
1026 	else if(c!='*' && r!='*')
1027 	{
1028 	    /* alignment match/mismatch */
1029 	    if (mode!='M')
1030 	    {
1031 		if(count)
1032 		    ajFmtPrintAppS(&cigar, "%d%c", count, mode);
1033 		mode = 'M';
1034 		count = 1;
1035 	    }
1036 	    else
1037 		count++;
1038 	}
1039 	else if(c=='*')
1040 	{
1041 	    if (mode!='I')
1042 	    {
1043 		if(count)
1044 		    ajFmtPrintAppS(&cigar, "%d%c", count, mode);
1045 		mode = 'I';
1046 		count = 1;
1047 	    }
1048 	    else
1049 		count++;
1050 	}
1051 	else if(r=='*')
1052 	{
1053 	    if (mode!='D')
1054 	    {
1055 		if(count)
1056 		    ajFmtPrintAppS(&cigar, "%d%c", count, mode);
1057 		mode = 'D';
1058 		count = 1;
1059 	    }
1060 	    else
1061 		count++;
1062 	}
1063 	else
1064 	    ajErr("something wrong!!!");
1065     }
1066 
1067     if(count)
1068 	ajFmtPrintAppS(&cigar, "%d%c", count, mode);
1069 
1070     return cigar;
1071 }
1072 
1073 
1074 
1075 
1076 /* @funcstatic assemoutWriteSam ***********************************************
1077 **
1078 ** Write assembly data in SAM format
1079 **
1080 ** @param [u] outf [AjPFile] Output file
1081 ** @param [r] assem [const AjPAssem] Assembly data object
1082 **
1083 ** @return [AjBool] True on success
1084 **
1085 **
1086 ** @release 6.5.0
1087 ******************************************************************************/
1088 
assemoutWriteSam(AjPFile outf,const AjPAssem assem)1089 static AjBool assemoutWriteSam(AjPFile outf, const AjPAssem assem)
1090 {
1091     AjPAssemContig c = NULL;
1092     AjPAssemRead   r = NULL;
1093     AjPAssemTag    t = NULL;
1094     AjPAssemContig* contigs = NULL;
1095     AjIList j = NULL;
1096     AjPStr argstr = NULL;
1097     const AjPStr headertext = NULL;
1098     ajint n = 0;
1099     ajint i = 0;
1100     AjBool ret = ajTrue;
1101 
1102     if(!outf || !assem)
1103 	return ajFalse;
1104 
1105     ajDebug("assemoutWriteSam: # of contigs = %d\n", n);
1106 
1107     ajFmtPrintF(outf, "@HD\tVN:1.3\tSO:%s\n", ajAssemGetSortorderC(assem));
1108 
1109     /* Program record */
1110     argstr = ajStrNewS(ajUtilGetCmdline());
1111     ajStrExchangeKK(&argstr, '\n', ' ');
1112     ajFmtPrintF(outf, "@PG\tID:%S\tVN:%S\tCL:%S\n",
1113 	    ajUtilGetProgram(), ajNamValueVersion(), argstr);
1114     ajStrDel(&argstr);
1115 
1116 
1117     if(ajListGetLength(assem->ContigsOrder))
1118 	ajListToarray(assem->ContigsOrder, (void***)&contigs);
1119     else
1120 	ajTableToarrayValues(assem->Contigs, (void***)&contigs);
1121 
1122     while (contigs[i])   /* contigs */
1123     {
1124 	c = contigs[i++];
1125 
1126 	if(!ajStrMatchC(c->Name, "*"))
1127 	{
1128 	    ajFmtPrintF(outf, "@SQ\tSN:%S\tLN:%d", c->Name, c->Length);
1129 
1130 	    if(c->URI)
1131 		ajFmtPrintF(outf, "\tUR:%S", c->URI);
1132 
1133 	    if(c->MD5)
1134 		ajFmtPrintF(outf, "\tM5:%S", c->MD5);
1135 
1136 	    if(c->Species)
1137 		ajFmtPrintF(outf, "\tSP:%S", c->Species);
1138 
1139 	    ajFmtPrintF(outf, "\n");
1140 
1141 	    j = ajListIterNewread(c->Tags);
1142 	    while (!ajListIterDone(j))
1143 	    {
1144 		t = ajListIterGet(j);
1145 		ajFmtPrintF(outf, "@CO\t%S %u %u %S\n", t->Name, t->x1, t->y1,
1146 			t->Comment);
1147 	    }
1148 	    ajListIterDel(&j);
1149 	}
1150     }
1151 
1152     headertext = assemSAMGetReadgroupHeaderlines(assem);
1153     if(headertext)
1154 	ajFmtPrintF(outf,"%S", headertext);
1155 
1156     j = ajListIterNewread(assem->Reads);
1157 
1158     while (!ajListIterDone(j))  /* reads */
1159     {
1160 	r = ajListIterGet(j);
1161 	assemoutWriteSamAlignment(outf, r, contigs, i);
1162     }
1163 
1164     ajListIterDel(&j);
1165     AJFREE(contigs);
1166 
1167     return ret;
1168 }
1169 
1170 
1171 
1172 
1173 /* @funcstatic assemoutWriteNextSam *******************************************
1174 **
1175 ** Write latest chunk of assembly data in SAM format
1176 **
1177 ** @param [u] outfile [AjPOutfile] Output file
1178 ** @param [r] assem [const AjPAssem] Assembly data object
1179 **
1180 ** @return [AjBool] True on success
1181 **
1182 **
1183 ** @release 6.5.0
1184 ******************************************************************************/
1185 
assemoutWriteNextSam(AjPOutfile outfile,const AjPAssem assem)1186 static AjBool assemoutWriteNextSam(AjPOutfile outfile, const AjPAssem assem)
1187 {
1188     AjPFile outf = ajOutfileGetFile(outfile);
1189     AjPAssemContig c = NULL;
1190     AjPAssemRead   r = NULL;
1191     AjPAssemTag    t = NULL;
1192     AjPAssemContig* contigs = NULL;
1193     AjIList j = NULL;
1194     AjPStr argstr = NULL;
1195     const AjPStr headertext = NULL;
1196     ajint n = 0;
1197     ajulong i = 0UL;
1198     AjBool ret = ajTrue;
1199 
1200     if(!outf || !assem)
1201 	return ajFalse;
1202 
1203     ajDebug("assemoutWriteSam: # of contigs = %d\n", n);
1204 
1205     if(!assem->Hasdata)
1206     {
1207         ajFmtPrintF(outf, "@HD\tVN:1.3\tSO:%s\n", ajAssemGetSortorderC(assem));
1208 
1209         /* Program record */
1210         argstr = ajStrNewS(ajUtilGetCmdline());
1211         ajStrExchangeKK(&argstr, '\n', ' ');
1212         ajFmtPrintF(outf, "@PG\tID:%S\tVN:%S\tCL:%S\n",
1213                     ajUtilGetProgram(), ajNamValueVersion(), argstr);
1214         ajStrDel(&argstr);
1215 
1216 
1217         if(ajListGetLength(assem->ContigsOrder))
1218             ajListToarray(assem->ContigsOrder, (void***)&contigs);
1219         else
1220             ajTableToarrayValues(assem->Contigs, (void***)&contigs);
1221 
1222         while (contigs[i])   /* contigs */
1223         {
1224             c = contigs[i++];
1225 
1226             if(!ajStrMatchC(c->Name, "*"))
1227             {
1228                 ajFmtPrintF(outf, "@SQ\tSN:%S\tLN:%d", c->Name, c->Length);
1229 
1230                 if(c->URI)
1231                     ajFmtPrintF(outf, "\tUR:%S", c->URI);
1232 
1233                 if(c->MD5)
1234                     ajFmtPrintF(outf, "\tM5:%S", c->MD5);
1235 
1236                 if(c->Species)
1237                     ajFmtPrintF(outf, "\tSP:%S", c->Species);
1238 
1239                 ajFmtPrintF(outf, "\n");
1240 
1241                 j = ajListIterNewread(c->Tags);
1242                 while (!ajListIterDone(j))
1243                 {
1244                     t = ajListIterGet(j);
1245                     ajFmtPrintF(outf, "@CO\t%S %u %u %S\n",
1246                                 t->Name, t->x1, t->y1,
1247                                 t->Comment);
1248                 }
1249                 ajListIterDel(&j);
1250             }
1251         }
1252 
1253         headertext = assemSAMGetReadgroupHeaderlines(assem);
1254         if(headertext)
1255             ajFmtPrintF(outf,"%S", headertext);
1256 
1257         AJFREE(contigs);
1258 
1259         if(!assem->BamHeader)
1260             return ajTrue;
1261     }
1262 
1263 
1264     /* data */
1265 
1266     j = ajListIterNewread(assem->Reads);
1267     if(ajListGetLength(assem->ContigsOrder))
1268         i = ajListToarray(assem->ContigsOrder, (void***)&contigs);
1269     else
1270         i = ajTableToarrayValues(assem->Contigs, (void***)&contigs);
1271 
1272     while (!ajListIterDone(j))  /* reads */
1273     {
1274 	r = ajListIterGet(j);
1275 	assemoutWriteSamAlignment(outf, r, contigs, (ajuint) i);
1276     }
1277 
1278     ajListIterDel(&j);
1279     AJFREE(contigs);
1280 
1281     return ret;
1282 }
1283 
1284 
1285 
1286 
1287 /* @funcstatic assemSAMGetReadgroupHeaderlines ********************************
1288 **
1289 ** Returns read-group header lines as text for the given assembly
1290 **
1291 ** @param [r] assem [const AjPAssem] assembly object
1292 ** @return [const AjPStr] read-group header lines
1293 **
1294 **
1295 ** @release 6.5.0
1296 ******************************************************************************/
1297 
assemSAMGetReadgroupHeaderlines(const AjPAssem assem)1298 static const AjPStr assemSAMGetReadgroupHeaderlines(const AjPAssem assem)
1299 {
1300     AjPStr* rgids = NULL;
1301     const AjPAssemReadgroup rg = NULL;
1302     ajint i =0;
1303 
1304     ajTableToarrayKeys(assem->Readgroups, (void***)&rgids);
1305 
1306     ajStrAssignC(&assemoutSamLinetxt, "");
1307 
1308     while (rgids[i])  /* read groups */
1309     {
1310 	rg = ajTableFetchS(assem->Readgroups, rgids[i++]);
1311 	ajFmtPrintAppS(&assemoutSamLinetxt, "@RG\tID:%S", rg->ID);
1312 
1313 	if(rg->CN)
1314 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tCN:%S", rg->CN);
1315 
1316 	if(rg->Desc)
1317 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tDS:%S", rg->Desc);
1318 
1319 	if(rg->Date)
1320 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tDT:%S", rg->Date);
1321 
1322 	if(rg->FlowOrder)
1323 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tFO:%S", rg->FlowOrder);
1324 
1325 	if(rg->KeySeq)
1326 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tKS:%S", rg->KeySeq);
1327 
1328 	if(rg->Library)
1329 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tLB:%S", rg->Library);
1330 
1331 	if(rg->Programs)
1332 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tPG:%S", rg->Programs);
1333 
1334 	if(rg->Isize)
1335 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tPI:%d", rg->Isize);
1336 
1337 	if(rg->Platform)
1338 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tPL:%s",
1339 	                   ajAssemreadgroupGetPlatformname(rg));
1340 
1341 	if(rg->Unit)
1342 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tPU:%S", rg->Unit);
1343 
1344 	if(rg->Sample)
1345 	    ajFmtPrintAppS(&assemoutSamLinetxt, "\tSM:%S", rg->Sample);
1346 
1347 	ajStrAppendC(&assemoutSamLinetxt, "\n");
1348 
1349     }
1350 
1351     AJFREE(rgids);
1352 
1353     return assemoutSamLinetxt;
1354 }
1355 
1356 
1357 
1358 
1359 /* @funcstatic assemoutWriteSamAlignment **************************************
1360 **
1361 ** Write individual alignment records of assembly data in SAM format
1362 **
1363 ** @param [u] outf [AjPFile] Output file
1364 ** @param [r] r [const AjPAssemRead] Read object with alignment information
1365 ** @param [r] contigs [AjPAssemContig const*] Contigs/refseqs the read
1366 **                                             aligns to
1367 ** @param [r] ncontigs [ajint] Number of contigs
1368 **
1369 ** @return [AjBool] True on success
1370 **
1371 **
1372 ** @release 6.5.0
1373 ******************************************************************************/
1374 
assemoutWriteSamAlignment(AjPFile outf,const AjPAssemRead r,AjPAssemContig const * contigs,ajint ncontigs)1375 static AjBool assemoutWriteSamAlignment(AjPFile outf, const AjPAssemRead r,
1376 					AjPAssemContig const * contigs,
1377 					ajint ncontigs)
1378 {
1379     AjPAssemTag    t = NULL;
1380     AjIList l = NULL;
1381     AjPStr qualstr = NULL;
1382     AjPStr tmp  = NULL;
1383     ajint  POS  = 0;
1384     AjPStr CIGAR = NULL;
1385     const char* RNEXT = NULL;
1386     AjPStr SEQ  = NULL;
1387     AjPStr QUAL = NULL;
1388     AjPStr SEQunpadded  = NULL;
1389     AjPStr QUALunpadded = NULL;
1390     AjPStr consensus = NULL;
1391     AjBool rc= ajFalse;
1392     AjBool ret = ajTrue;
1393     const char* refseq = NULL;
1394     const AjPAssemContig contig = NULL;
1395 
1396     ajuint k = 0;
1397 
1398     if(r->Reference>=ncontigs)
1399 	ajDie("assemoutWriteSamAlignment: reference sequence number"
1400 		" '%d' is larger than or equal to known number of reference"
1401 		" sequences '%d'. Problem while processing read '%S'.",
1402 		r->Reference,
1403 		ncontigs,
1404 		r->Name);
1405 
1406     contig = (r->Reference==-1 ? NULL : contigs[r->Reference]);
1407 
1408     ajStrAssignRef(&SEQ, r->Seq);
1409     consensus = contig==NULL? NULL : contig->Consensus;
1410 
1411     if (r->Rnext==-1)
1412 	RNEXT= "*";
1413     else if(r->Rnext==r->Reference)
1414 	RNEXT = "=";
1415     else
1416 	RNEXT = ajStrGetPtr(contigs[r->Rnext]->Name);
1417 
1418     if (r->Flag & BAM_FREVERSE)
1419     {
1420 	rc = ajTrue;
1421 	qualstr = ajStrNewS(r->SeqQ);
1422 
1423 	if(!r->Reversed)
1424 	{
1425 	    ajStrReverse(&qualstr);
1426 	    ajSeqstrReverse(&SEQ);
1427 	}
1428 
1429 	QUAL = qualstr;
1430 	POS = r->y1;
1431 	ajStrAssignSubS(&tmp, SEQ,
1432 		ajStrGetLen(r->Seq) - r->y2,
1433 		ajStrGetLen(r->Seq) - r->x2
1434 	);
1435 
1436     }
1437     else
1438     {
1439 	rc= ajFalse;
1440 	POS = r->x1;
1441 	QUAL = r->SeqQ;
1442 	ajStrAssignSubS(&tmp, SEQ,
1443 		r->x2-1,
1444 		r->y2-1
1445 	);
1446     }
1447 
1448     if(r->Cigar==NULL && consensus)
1449     {
1450 	refseq = ajStrGetPtr(consensus) + (rc ? r->y1-1 : r->x1-1);
1451 
1452 	CIGAR = assemoutMakeCigar(refseq, ajStrGetPtr(tmp));
1453 
1454 	SEQunpadded = ajStrNewRes(ajStrGetLen(SEQ));
1455 	QUALunpadded = ajStrNewRes(ajStrGetLen(SEQ));
1456 
1457 	for(k=0; k< ajStrGetLen(SEQ); k++)
1458 	{
1459 	    if (ajStrGetCharPos(SEQ, k) == '*')
1460 		continue;
1461 
1462 	    ajStrAppendK(&SEQunpadded, ajStrGetCharPos(SEQ, k));
1463 	    ajStrAppendK(&QUALunpadded, ajStrGetCharPos(QUAL, k));
1464 	}
1465 
1466 	ajDebug("cigar: %S\n", CIGAR);
1467 
1468 	ajStrAssignS(&tmp, CIGAR);
1469 
1470 	if(rc)
1471 	{
1472 	    if(r->y2 < (ajint)ajStrGetLen(SEQ))
1473 		ajFmtPrintS(&CIGAR, "%dS%S",
1474 		            ajStrGetLen(SEQ) - r->y2, tmp);
1475 	    if(r->x2 > 1)
1476 		ajFmtPrintAppS(&CIGAR, "%dS", r->x2 - 1);
1477 	}
1478 	else
1479 	{
1480 	    if(r->x2 > 1)
1481 		ajFmtPrintS(&CIGAR, "%dS%S", r->x2 - 1, tmp);
1482 	    if(r->y2 < (ajint)ajStrGetLen(SEQ))
1483 		ajFmtPrintAppS(&CIGAR, "%dS",
1484 		               ajStrGetLen(SEQ) - r->y2);
1485 	}
1486 	ajStrDel(&tmp);
1487     }
1488     else if(r->Cigar==NULL)
1489     {
1490 	ajErr("both CIGAR string and consensus sequence not available");
1491 	ret = ajFalse;
1492 	ajStrAssignK(&CIGAR, '*');
1493     }
1494     else if(!ajStrGetLen(r->Cigar))
1495 	ajStrAssignK(&CIGAR, '*');
1496     else if(ajStrGetLen(r->Cigar))
1497     {
1498 	if(!ajStrGetLen(SEQ))
1499 	    ajStrAssignK(&SEQ, '*');
1500 
1501 	if(!ajStrGetLen(QUAL))
1502 	    ajStrAssignK(&QUAL, '*');
1503     }
1504 
1505     ajStrDel(&tmp);
1506 
1507     ajFmtPrintF(outf, "%S\t%d\t%s\t%d\t%d\t%S\t%s\t%Ld\t%d\t%S\t%S",
1508 	    r->Name,
1509 	    r->Flag,
1510 	    (contig==NULL ? "*" : ajStrGetPtr(contig->Name)),
1511 	    POS,
1512 	    r->MapQ,
1513 	    (CIGAR ? CIGAR : r->Cigar),
1514 	    RNEXT,
1515 	    r->Pnext,
1516 	    r->Tlen,
1517 	    (r->Cigar ? SEQ  : SEQunpadded),
1518 	    (r->Cigar ? QUAL : QUALunpadded));
1519 
1520     l = ajListIterNewread(r->Tags);
1521     while (!ajListIterDone(l))
1522     {
1523 	t = ajListIterGet(l);
1524 
1525 	/* TODO: array type, 'B' */
1526 
1527 	/* In SAM, all single integer types are mapped to int32_t [SAM spec] */
1528 	ajFmtPrintF(outf, "\t%S:%c:",
1529 		t->Name,
1530 		(t->type == 'c' || t->type == 'C' ||
1531 		 t->type == 's' || t->type == 'S'
1532 				|| t->type == 'I') ? 'i' : t->type
1533 	);
1534 
1535 	if(t->x1 || t->y1)
1536 	    ajFmtPrintF(outf, " %u %u", t->x1, t->y1);
1537 
1538 	if(t->Comment && ajStrGetLen(t->Comment)>0)
1539 	    ajFmtPrintF(outf, "%S", t->Comment);
1540 
1541     }
1542     ajListIterDel(&l);
1543 
1544     ajFmtPrintF(outf, "\n");
1545 
1546     if(qualstr)
1547 	ajStrDel(&qualstr);
1548 
1549     ajStrDel(&SEQ);
1550     ajStrDel(&CIGAR);
1551     ajStrDel(&SEQunpadded);
1552     ajStrDel(&QUALunpadded);
1553 
1554     return ret;
1555 }
1556 
1557 
1558 
1559 
1560 /* @funcstatic assemoutWriteList **********************************************
1561 **
1562 ** Write an assembly as the simple id
1563 **
1564 ** @param [u] outf [AjPFile] Output file
1565 ** @param [r] assem [const AjPAssem] Assembly object
1566 **
1567 ** @return [AjBool] True on success
1568 **
1569 **
1570 ** @release 6.4.0
1571 ******************************************************************************/
1572 
assemoutWriteList(AjPFile outf,const AjPAssem assem)1573 static AjBool assemoutWriteList(AjPFile outf, const AjPAssem assem)
1574 {
1575     if(!outf) return ajFalse;
1576     if(!assem) return ajFalse;
1577 
1578     if(ajStrGetLen(assem->Db))
1579         ajFmtPrintF(outf, "%S:%S\n", assem->Db, assem->Id);
1580     else
1581         ajFmtPrintF(outf, "%S\n", assem->Id);
1582 
1583     return ajTrue;
1584 }
1585 
1586 
1587 
1588 
1589 /* @funcstatic assemoutWriteNextList ******************************************
1590 **
1591 ** Write an assembly as the simple id
1592 **
1593 ** @param [u] outfile [AjPOutfile] Output file
1594 ** @param [r] assem [const AjPAssem] Assembly object
1595 **
1596 ** @return [AjBool] True on success
1597 **
1598 **
1599 ** @release 6.4.0
1600 ******************************************************************************/
1601 
assemoutWriteNextList(AjPOutfile outfile,const AjPAssem assem)1602 static AjBool assemoutWriteNextList(AjPOutfile outfile, const AjPAssem assem)
1603 {
1604     AjPFile outf = ajOutfileGetFile(outfile);
1605 
1606     if(!outf) return ajFalse;
1607     if(!assem) return ajFalse;
1608 
1609     if(!assem->Hasdata)
1610         return ajTrue;
1611 
1612     if(ajStrGetLen(assem->Db))
1613         ajFmtPrintF(outf, "%S:%S\n", assem->Db, assem->Id);
1614     else
1615         ajFmtPrintF(outf, "%S\n", assem->Id);
1616 
1617     return ajTrue;
1618 }
1619 
1620 
1621 
1622 
1623 /* @datasection [none] Miscellaneous functions ********************************
1624 **
1625 ** Functions to initialise and clean up internals
1626 **
1627 ** @nam2rule Assemout Assembly output internals
1628 **
1629 ******************************************************************************/
1630 
1631 
1632 
1633 
1634 /* @section Printing **********************************************************
1635 **
1636 ** Printing details of the internals to a file
1637 **
1638 ** @fdata [none]
1639 **
1640 ** @nam2rule Assemoutprint
1641 **
1642 ** @fcategory output
1643 **
1644 ******************************************************************************/
1645 
1646 
1647 
1648 
1649 /* @section Print *************************************************************
1650 **
1651 ** Printing to a file
1652 **
1653 ** @fdata [none]
1654 **
1655 ** @nam3rule Book Print as docbook table
1656 ** @nam3rule Html Print as html table
1657 ** @nam3rule Wiki Print as wiki table
1658 ** @nam3rule Text Print as text
1659 **
1660 ** @argrule * outf [AjPFile] output file
1661 ** @argrule Text full [AjBool] Print all details
1662 **
1663 ** @valrule * [void]
1664 **
1665 ** @fcategory cast
1666 **
1667 ******************************************************************************/
1668 
1669 
1670 
1671 
1672 /* @func ajAssemoutprintBook **************************************************
1673 **
1674 ** Reports the assembly format internals as Docbook text
1675 **
1676 ** @param [u] outf [AjPFile] Output file
1677 ** @return [void]
1678 **
1679 ** @release 6.4.0
1680 ** @@
1681 ******************************************************************************/
1682 
ajAssemoutprintBook(AjPFile outf)1683 void ajAssemoutprintBook(AjPFile outf)
1684 {
1685     ajint i = 0;
1686 
1687     ajFmtPrintF(outf, "<para>The supported assembly output "
1688                 "formats are summarised "
1689                 "in the table below. The columns are as follows: "
1690                 "<emphasis>Output format</emphasis> (format name), "
1691                  "<emphasis>Description</emphasis> (short description of "
1692                 "the format).</para> \n\n");
1693 
1694     ajFmtPrintF(outf, "<table frame=\"box\" rules=\"cols\">\n");
1695     ajFmtPrintF(outf, "  <caption>Assembly output formats</caption>\n");
1696     ajFmtPrintF(outf, "  <thead>\n");
1697     ajFmtPrintF(outf, "    <tr align=\"center\">\n");
1698     ajFmtPrintF(outf, "      <th>Output Format</th>\n");
1699     ajFmtPrintF(outf, "      <th>Description</th>\n");
1700     ajFmtPrintF(outf, "    </tr>\n");
1701     ajFmtPrintF(outf, "  </thead>\n");
1702     ajFmtPrintF(outf, "  <tbody>\n");
1703 
1704     for(i=0; assemoutFormatDef[i].Name; i++)
1705     {
1706         ajFmtPrintF(outf, "    <tr>\n");
1707         ajFmtPrintF(outf, "      <td>%s</td>\n",
1708                     assemoutFormatDef[i].Name);
1709         ajFmtPrintF(outf, "      <td>%s</td>\n",
1710                     assemoutFormatDef[i].Desc);
1711     }
1712 
1713     ajFmtPrintF(outf, "  </tbody>\n");
1714     ajFmtPrintF(outf, "</table>\n");
1715 
1716     return;
1717 }
1718 
1719 
1720 
1721 
1722 /* @func ajAssemoutprintHtml **************************************************
1723 **
1724 ** Reports the internal data structures
1725 **
1726 ** @param [u] outf [AjPFile] Output file
1727 ** @return [void]
1728 **
1729 ** @release 6.4.0
1730 ** @@
1731 ******************************************************************************/
1732 
ajAssemoutprintHtml(AjPFile outf)1733 void ajAssemoutprintHtml(AjPFile outf)
1734 {
1735     ajint i = 0;
1736 
1737     ajFmtPrintF(outf, "<table border=3>");
1738     ajFmtPrintF(outf, "<tr><th>Assembly Format</th>\n");
1739     ajFmtPrintF(outf, "<th>Description</th></tr>\n");
1740     ajFmtPrintF(outf, "\n");
1741     ajFmtPrintF(outf, "# Assembly output formats\n");
1742     ajFmtPrintF(outf, "# Name    Format name (or alias)\n");
1743     ajFmtPrintF(outf, "# Desc    Format description\n");
1744     ajFmtPrintF(outf, "# Name         Description\n");
1745     ajFmtPrintF(outf, "\n");
1746     ajFmtPrintF(outf, "AssemFormat {\n");
1747 
1748     for(i=0; assemoutFormatDef[i].Name; i++)
1749     {
1750         ajFmtPrintF(outf, "<tr><td>\n%-12s\n"
1751                         "<td>\"%s\"</td></tr>\n",
1752 			assemoutFormatDef[i].Name,
1753 			assemoutFormatDef[i].Desc);
1754     }
1755 
1756     ajFmtPrintF(outf, "}\n\n");
1757 
1758     return;
1759 }
1760 
1761 
1762 
1763 
1764 /* @func ajAssemoutprintText **************************************************
1765 **
1766 ** Reports the internal data structures
1767 **
1768 ** @param [u] outf [AjPFile] Output file
1769 ** @param [r] full [AjBool] Full report (usually ajFalse)
1770 ** @return [void]
1771 **
1772 ** @release 6.4.0
1773 ** @@
1774 ******************************************************************************/
1775 
ajAssemoutprintText(AjPFile outf,AjBool full)1776 void ajAssemoutprintText(AjPFile outf, AjBool full)
1777 {
1778     ajint i = 0;
1779 
1780     ajFmtPrintF(outf, "\n");
1781     ajFmtPrintF(outf, "# Assembly output formats\n");
1782     ajFmtPrintF(outf, "# Name    Format name (or alias)\n");
1783     ajFmtPrintF(outf, "# Desc    Format description\n");
1784     ajFmtPrintF(outf, "# Name         Description\n");
1785     ajFmtPrintF(outf, "\n");
1786     ajFmtPrintF(outf, "AssemFormat {\n");
1787 
1788     for(i=0; assemoutFormatDef[i].Name; i++)
1789     {
1790 	if(full)
1791 	    ajFmtPrintF(outf, "  %-12s \"%s\"\n",
1792 			assemoutFormatDef[i].Name,
1793 			assemoutFormatDef[i].Desc);
1794     }
1795 
1796     ajFmtPrintF(outf, "}\n\n");
1797 
1798     return;
1799 }
1800 
1801 
1802 
1803 
1804 /* @func ajAssemoutprintWiki **************************************************
1805 **
1806 ** Reports the asembly output format internals as wikitext
1807 **
1808 ** @param [u] outf [AjPFile] Output file
1809 ** @return [void]
1810 **
1811 ** @release 6.4.0
1812 ** @@
1813 ******************************************************************************/
1814 
ajAssemoutprintWiki(AjPFile outf)1815 void ajAssemoutprintWiki(AjPFile outf)
1816 {
1817     ajint i = 0;
1818     ajint j = 0;
1819     AjPStr namestr = NULL;
1820 
1821     ajFmtPrintF(outf, "{| class=\"wikitable sortable\" border=\"2\"\n");
1822     ajFmtPrintF(outf, "|-\n");
1823     ajFmtPrintF(outf, "!Format!!"
1824                 "class=\"unsortable\"|Description\n");
1825 
1826     for(i=0; assemoutFormatDef[i].Name; i++)
1827     {
1828         ajFmtPrintF(outf, "|-\n");
1829         ajStrAssignC(&namestr, assemoutFormatDef[i].Name);
1830 
1831         for(j=i+1; assemoutFormatDef[j].Name; j++)
1832         {
1833             if(assemoutFormatDef[j].Write == assemoutFormatDef[i].Write)
1834             {
1835                 ajFmtPrintAppS(&namestr, "<br>%s",
1836                                assemoutFormatDef[j].Name);
1837                 ajWarn("Assembly output format '%s' same as '%s' "
1838                        "but not alias",
1839                        assemoutFormatDef[j].Name,
1840                        assemoutFormatDef[i].Name);
1841             }
1842         }
1843         ajFmtPrintF(outf, "|%S||%s\n",
1844                     namestr,
1845                     assemoutFormatDef[i].Desc);
1846     }
1847 
1848     ajFmtPrintF(outf, "|}\n");
1849 
1850     ajStrDel(&namestr);
1851 
1852     return;
1853 }
1854 
1855 
1856 
1857 
1858 /* @datasection [none] Output formats *****************************************
1859 **
1860 ** Output formats internals
1861 **
1862 ** @nam2rule Assemoutformat Assembly output format specific
1863 **
1864 ******************************************************************************/
1865 
1866 
1867 
1868 
1869 /* @section cast **************************************************************
1870 **
1871 ** Values for output formats
1872 **
1873 ** @fdata [none]
1874 **
1875 ** @nam3rule Find Return index to named format
1876 ** @nam3rule Test Test format value
1877 **
1878 ** @argrule * format [const AjPStr] Format name
1879 ** @argrule Find iformat [ajint*] Index matching format name
1880 **
1881 ** @valrule * [AjBool] True if found
1882 **
1883 ** @fcategory cast
1884 **
1885 ******************************************************************************/
1886 
1887 
1888 
1889 
1890 /* @func ajAssemoutformatFind *************************************************
1891 **
1892 ** Looks for the specified format(s) in the internal definitions and
1893 ** returns the index.
1894 **
1895 ** Sets iformat as the recognised format, and returns ajTrue.
1896 **
1897 ** @param [r] format [const AjPStr] Format required.
1898 ** @param [w] iformat [ajint*] Index
1899 ** @return [AjBool] ajTrue on success.
1900 **
1901 ** @release 6.4.0
1902 ** @@
1903 ******************************************************************************/
1904 
ajAssemoutformatFind(const AjPStr format,ajint * iformat)1905 AjBool ajAssemoutformatFind(const AjPStr format, ajint* iformat)
1906 {
1907     AjPStr tmpformat = NULL;
1908     ajuint i = 0;
1909 
1910     /* ajDebug("ajAssemoutformatFind '%S'\n", format); */
1911     if(!ajStrGetLen(format))
1912 	return ajFalse;
1913 
1914     ajStrAssignS(&tmpformat, format);
1915     ajStrFmtLower(&tmpformat);
1916 
1917     for(i=0; assemoutFormatDef[i].Name; i++)
1918     {
1919 	/* ajDebug("test %d '%s' \n", i, assemoutFormatDef[i].Name); */
1920 	if(ajStrMatchCaseC(tmpformat, assemoutFormatDef[i].Name))
1921 	{
1922 	    *iformat = i;
1923 	    ajStrDel(&tmpformat);
1924 	    /* ajDebug("found '%s' at %d\n", assemoutFormatDef[i].Name, i); */
1925 	    return ajTrue;
1926 	}
1927     }
1928 
1929     ajErr("Unknown output format '%S'", format);
1930 
1931     ajStrDel(&tmpformat);
1932 
1933     *iformat = 0;
1934 
1935     return ajFalse;
1936 }
1937 
1938 
1939 
1940 
1941 /* @func ajAssemoutformatTest *************************************************
1942 **
1943 ** Tests whether a named assembly output format is known
1944 **
1945 ** @param [r] format [const AjPStr] Format
1946 ** @return [AjBool] ajTrue if formats was accepted
1947 **
1948 ** @release 6.4.0
1949 ** @@
1950 ******************************************************************************/
1951 
ajAssemoutformatTest(const AjPStr format)1952 AjBool ajAssemoutformatTest(const AjPStr format)
1953 {
1954     ajint i;
1955 
1956     for (i=0; assemoutFormatDef[i].Name; i++)
1957 	if(ajStrMatchCaseC(format, assemoutFormatDef[i].Name))
1958 	    return ajTrue;
1959 
1960     return ajFalse;
1961 }
1962 
1963 
1964 
1965 
1966 /* @datasection [none] Miscellaneous functions ********************************
1967 **
1968 ** Functions to initialise and clean up internals
1969 **
1970 ** @nam2rule Assemout Assembly output internals
1971 **
1972 ******************************************************************************/
1973 
1974 
1975 
1976 
1977 /* @section Miscellaneous *****************************************************
1978 **
1979 ** Functions to initialise and clean up internals
1980 **
1981 ** @fdata [none]
1982 **
1983 ** @nam3rule Exit Clean up and exit
1984 **
1985 ** @valrule * [void]
1986 **
1987 ** @fcategory misc
1988 **
1989 ******************************************************************************/
1990 
1991 
1992 
1993 
1994 /* @func ajAssemoutExit *******************************************************
1995 **
1996 ** Cleans up assembly output internal memory
1997 **
1998 ** @return [void]
1999 **
2000 ** @release 6.4.0
2001 ** @@
2002 ******************************************************************************/
2003 
ajAssemoutExit(void)2004 void ajAssemoutExit(void)
2005 {
2006     ajStrDel(&assemoutSamLinetxt);
2007 
2008     return;
2009 }
2010