1 /* @include ajassemdata *******************************************************
2 **
3 ** AJAX assembly datatypes
4 **
5 ** These functions control all aspects of AJAX assembly
6 ** parsing and include simple utilities.
7 **
8 ** @author Copyright (C) 2010 Peter Rice
9 ** @version $Revision: 1.29 $
10 ** @modified Oct 5 pmr First version
11 ** @modified $Date: 2012/07/02 16:44:55 $ by $Author: rice $
12 ** @@
13 **
14 ** This library is free software; you can redistribute it and/or
15 ** modify it under the terms of the GNU Lesser General Public
16 ** License as published by the Free Software Foundation; either
17 ** version 2.1 of the License, or (at your option) any later version.
18 **
19 ** This library is distributed in the hope that it will be useful,
20 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22 ** Lesser General Public License for more details.
23 **
24 ** You should have received a copy of the GNU Lesser General Public
25 ** License along with this library; if not, write to the Free Software
26 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
27 ** MA  02110-1301,  USA.
28 **
29 ******************************************************************************/
30 
31 #ifndef AJASSEMDATA_H
32 #define AJASSEMDATA_H
33 
34 /* ========================================================================= */
35 /* ============================= include files ============================= */
36 /* ========================================================================= */
37 
38 #include "ajdefine.h"
39 #include "ajstr.h"
40 #include "ajtable.h"
41 #include "ajtime.h"
42 #include "ajtextdata.h"
43 #include "ajseqbam.h"
44 #include "ajbamindex.h"
45 
46 AJ_BEGIN_DECLS
47 
48 
49 
50 
51 /* ========================================================================= */
52 /* =============================== constants =============================== */
53 /* ========================================================================= */
54 
55 
56 
57 
/* @enum AjEAssemSortOrder ****************************************************
**
** Assembly reads sort order Type enumeration.
**
** Enumerators are numbered explicitly from zero, so a zero-initialised
** field holds ajEAssemSortOrderUnknown.
**
** @value ajEAssemSortOrderUnknown Unknown - default in SAM/BAM formats
** @value ajEAssemSortOrderUnsorted Unsorted - unsorted
** @value ajEAssemSortOrderQueryname Queryname - sorted by query/read names
** @value ajEAssemSortOrderCoordinate Coordinate - major sort key is
**        the reference/contig name, order defined by the order of @SQ lines
**        in header in SAM/BAM assemblies.
** @@
******************************************************************************/

typedef enum AjOAssemSortOrder
{
    ajEAssemSortOrderUnknown    = 0,
    ajEAssemSortOrderUnsorted   = 1,
    ajEAssemSortOrderQueryname  = 2,
    ajEAssemSortOrderCoordinate = 3
} AjEAssemSortOrder;
78 
79 
80 
81 
/* @enum AjEAssemPlatform *****************************************************
**
** Platforms/technologies to produce reads, as enumerated in SAM specv1.4.
**
** Enumerators are numbered explicitly from zero, so a zero-initialised
** field holds ajEAssemPlatformUnknown.
**
** @value ajEAssemPlatformUnknown Unknown
** @value ajEAssemPlatformCapillary Capillary
** @value ajEAssemPlatformLS454 LS 454
** @value ajEAssemPlatformIllumina Illumina
** @value ajEAssemPlatformSolid Solid
** @value ajEAssemPlatformHelicos Helicos
** @value ajEAssemPlatformIontorrent IonTorrent
** @value ajEAssemPlatformPacbio Pacific Biosciences
** @@
******************************************************************************/

typedef enum AjOAssemPlatform
{
    ajEAssemPlatformUnknown    = 0,
    ajEAssemPlatformCapillary  = 1,
    ajEAssemPlatformLS454      = 2,
    ajEAssemPlatformIllumina   = 3,
    ajEAssemPlatformSolid      = 4,
    ajEAssemPlatformHelicos    = 5,
    ajEAssemPlatformIontorrent = 6,
    ajEAssemPlatformPacbio     = 7
} AjEAssemPlatform;
108 
109 
110 
111 
112 /* ========================================================================= */
113 /* ============================== public data ============================== */
114 /* ========================================================================= */
115 
116 
117 
118 
119 /* @data AjPAssemRead *********************************************************
120 **
121 ** Store individual alignments of reads in assemblies.
122 **
123 ** In some assemblies some reads can align to more than one contig
124 ** or reference sequence, so current name of the data type is not perfect.
125 **
126 ** For storing read information as part of an assembly object.
127 ** Current version of this data type is MIRA/SAM oriented,
128 ** it needs to be improved to be a proper common data type,
129 ** some of the current fields are used by MIRA MAF format only
130 ** while few others are used by the SAM format only.
131 **
132 ** @alias AjSAssemRead
133 ** @alias AjOAssemRead
134 **
135 **
136 ** In it's simplest form, a DNA template is sequenced only once.
137 ** In paired-end sequencing, a DNA template is sequenced once in forward
138 ** and once in reverse direction (Sanger, 454, Solexa).
139 ** In Sanger sequencing, several forward and/or reverse reads
140 ** can be sequenced from a DNA template.
141 ** In PacBio sequencing, a DNA template can be sequenced
142 ** in several "strobes", leading to multiple reads on a DNA template.
143 ** (ref:MAF format specification on mira web site)
144 **
145 **
146 ** @attr Name       [AjPStr]  Name
147 ** @attr Seq        [AjPStr]  Sequence
148 ** @attr SeqQ       [AjPStr]  Sequence quality string
149 ** @attr Template   [AjPStr]  Name of the DNA template a sequence comes from
150 ** @attr File       [AjPStr]  Name of the sequencing file
151 **                            which contains raw data for this read
152 ** @attr Technology [AjPStr]  Sequencing technology
153 ** @attr Cigar      [AjPStr]  CIGAR string
154 ** @attr Tags       [AjPList] List of AjPAssemTag objects
155 ** @attr AlignmentBlocks [AjPList] Alignment blocks
156 ** @attr Rnext      [ajlong]  Reference number of the mate/next fragment
157 ** @attr Reference  [ajlong]  Reference sequence
158 ** @attr Pnext      [ajlong]  Position of the mate/next fragment
159 ** @attr Tlen       [ajint]   Observed template length
160 ** @attr Flag       [ajint]   Flag
161 ** @attr MapQ       [ajint]   Map quality
162 ** @attr TemplateSizeMin [ajint] Minimum template size
163 ** @attr TemplateSizeMax [ajint] Maximum template size
164 ** @attr ClipLeft     [ajint] Clip left
165 ** @attr ClipRight    [ajint] Clip right
166 ** @attr VectorLeft   [ajint] Clip left due to sequencing vector
167 ** @attr VectorRight  [ajint] Clip right due to sequencing vector
168 ** @attr QualLeft     [ajint] Clip left due to quality
169 ** @attr QualRight    [ajint] Clip right due to quality
170 ** @attr x1      [ajint] interval of the contig (1-based as in SAM and MAF)
171 ** @attr y1      [ajint] end of contig interval
172 ** @attr x2      [ajint] interval of the read
173 ** @attr y2      [ajint] end of read interval
174 ** @attr Reversed  [AjBool] true: has been reverse-complemented
175 ** @attr Direction [char] Direction of the read with respect to the template
176 ** @attr Padding [char[7]] Padding to alignment boundary
177 ** @@
178 ******************************************************************************/
179 
180 typedef struct AjSAssemRead
181 {
182     AjPStr  Name;
183     AjPStr  Seq;
184     AjPStr  SeqQ;
185     AjPStr  Template;
186     AjPStr  File;
187     AjPStr  Technology;
188     AjPStr  Cigar;
189     AjPList Tags;
190     AjPList AlignmentBlocks;
191     ajlong  Rnext;
192     ajlong  Reference;
193     ajlong  Pnext;
194     ajint  Tlen;
195     ajint  Flag;
196     ajint  MapQ;
197     ajint  TemplateSizeMin;
198     ajint  TemplateSizeMax;
199     ajint  ClipLeft;
200     ajint  ClipRight;
201     ajint  VectorLeft;
202     ajint  VectorRight;
203     ajint  QualLeft;
204     ajint  QualRight;
205     ajint  x1;
206     ajint  y1;
207     ajint  x2;
208     ajint  y2;
209     AjBool Reversed;
210     char   Direction;
211     char   Padding[7];
212 } AjOAssemRead;
213 
214 #define AjPAssemRead AjOAssemRead*
215 
216 
217 
218 
219 /* @data AjPAssemContig *******************************************************
220 **
221 ** Ajax AssemContig object to store contigs in assemblies.
222 **
223 ** In mapping assemblies consensus sequence refers to the reference sequence.
224 **
225 ** @attr Name       [AjPStr]  Name
226 ** @attr Consensus  [AjPStr]  Consensus/reference sequence
227 ** @attr ConsensusQ [AjPStr]  Quality string for the consensus sequence
228 ** @attr Tags       [AjPList] Tags for the consensus sequence
229 ** @attr Length     [ajint]   Length of the consensus sequence
230 ** @attr Nreads     [ajint]   Number of reads
231 ** @attr AssemblyID [AjPStr]  Assembly ID
232 ** @attr MD5        [AjPStr]  MD5 checksum of the consensus sequence
233 **                            in the uppercase, with gaps and spaces removed
234 ** @attr Species    [AjPStr]  Species
235 ** @attr URI        [AjPStr]  URI of the consensus sequences
236 **
237 ** @@
238 ******************************************************************************/
239 
240 typedef struct AjSContig
241 {
242     AjPStr  Name;
243     AjPStr  Consensus;
244     AjPStr  ConsensusQ;
245     AjPList Tags;
246     ajint   Length;
247     ajint   Nreads;
248     AjPStr  AssemblyID;
249     AjPStr  MD5;
250     AjPStr  Species;
251     AjPStr  URI;
252 } AjOContig;
253 
254 #define AjPAssemContig AjOContig*
255 
256 
257 
258 
259 /* @data AjPAssemin ***********************************************************
260 **
261 ** Ajax Assembly Input object.
262 **
263 ** Holds the input specification and information needed to read
264 ** the assembly and possible further entries
265 **
266 ** @alias AjSAssemin
267 ** @alias AjOAssemin
268 **
269 ** @attr Input    [AjPTextin] General text input object
270 ** @attr BamIdx   [AjPBamIndex] BAM index
271 ** @attr BamInput [AjBool] BAM file input
272 ** @attr cbegin   [ajint]     Contig start position
273 ** @attr cend     [ajint]     Contig end position
274 ** @attr Loading  [AjBool] True if data is now loading
275 ** @@
276 ******************************************************************************/
277 
278 typedef struct AjSAssemin
279 {
280     AjPTextin Input;
281     AjPBamIndex BamIdx;
282     AjBool BamInput;
283     ajint cbegin;
284     ajint cend;
285     AjBool Loading;
286 } AjOAssemin;
287 
288 #define AjPAssemin AjOAssemin*
289 
290 
291 
292 
293 /* @data AjPAssem *************************************************************
294 **
295 ** Ajax Assembly object.
296 **
297 ** Holds the assembly itself, plus associated information.
298 **
299 ** @alias AjSAssem
300 ** @alias AjOAssem
301 **
302 ** @attr Id           [AjPStr]   Id of term
303 ** @attr Db           [AjPStr]   Database name from input
304 ** @attr Setdb        [AjPStr]   Database name from command line
305 ** @attr Full         [AjPStr]   Full name
306 ** @attr Qry          [AjPStr]   Query for re-reading
307 ** @attr Formatstr    [AjPStr]   Input format name
308 ** @attr Filename     [AjPStr]   Original filename
309 ** @attr Textptr      [AjPStr]   Full text
310 ** @attr BamHeader    [AjPSeqBamHeader] BAM header
311 ** @attr Contigs      [AjPTable] Contigs table, storing contigs by name
312 ** @attr ContigsIgnored [AjPTable]  Contigs named as '*' because of
313 ** 				    missing header '@SQ' entries
314 ** @attr ContigsOrder [AjPList] Order of contigs in the assembly
315 ** @attr ContigArray  [AjPAssemContig*] Contigs array for fast access
316 ** @attr Reads        [AjPList]  List of reads in the assembly
317 ** @attr Readgroups   [AjPTable] Table of read-groups in the assembly
318 ** @attr rec          [AjPAssemRead] Current/last read/alignment record read
319 ** @attr Fpos         [ajlong]   File position
320 ** @attr Format       [AjEnum]   Input format enum
321 ** @attr Count        [ajuint]   Number of lines read (contigs in case of BAM)
322 ** @attr Hasdata      [AjBool]  True when data has been loaded
323 ** @attr SO           [AjEAssemSortOrder] Reads sort order in the assembly
324 **
325 ** @@
326 ******************************************************************************/
327 
328 typedef struct AjSAssem
329 {
330     AjPStr  Id;
331     AjPStr  Db;
332     AjPStr  Setdb;
333     AjPStr  Full;
334     AjPStr  Qry;
335     AjPStr  Formatstr;
336     AjPStr  Filename;
337     AjPStr  Textptr;
338     AjPSeqBamHeader BamHeader;
339     AjPTable Contigs;
340     AjPTable ContigsIgnored;
341     AjPList ContigsOrder;
342     AjPAssemContig* ContigArray;
343     AjPList Reads;
344     AjPTable Readgroups;
345     AjPAssemRead rec;
346     ajlong  Fpos;
347     AjEnum  Format;
348     ajuint  Count;
349     AjBool Hasdata;
350     AjEAssemSortOrder SO;
351 } AjOAssem;
352 
353 #define AjPAssem AjOAssem*
354 
355 
356 
357 
358 /* @data AjPAssemload *********************************************************
359 **
360 ** Ajax assembly loader object.
361 **
362 ** Inherits an AjPAssem but allows more assembly data to be read from the
363 ** same input by also inheriting the AjPVarin input object.
364 **
365 ** @alias AjSAssemload
366 ** @alias AjOAssemload
367 **
368 ** @attr Assem [AjPAssem] Current variation
369 ** @attr Assemin [AjPAssemin] Assembly input for reading next
370 ** @attr Count [ajuint] Count of terms so far
371 ** @attr Loading [AjBool] True if data is now loading
372 ** @attr Returned [AjBool] if true: Assembly object has been returned to a new
373 **                         owner and is not to be deleted by the destructor
374 ** @attr Padding [ajuint] Padding to alignment boundary
375 ** @@
376 ******************************************************************************/
377 
378 typedef struct AjSAssemload
379 {
380     AjPAssem Assem;
381     AjPAssemin Assemin;
382     ajuint Count;
383     AjBool Loading;
384     AjBool Returned;
385     ajuint Padding;
386 } AjOAssemload;
387 
388 #define AjPAssemload AjOAssemload*
389 
390 
391 
392 
393 /* @data AjPAssemAccess *******************************************************
394 **
395 ** Ajax assembly access database reading object.
396 **
397 ** Holds information needed to read an assembly entry from a database.
398 ** Access methods are defined for each known database type.
399 **
400 ** Assembly entries are read from the database using the defined
401 ** database access function, which is usually a static function
402 ** within ajassemdb.c
403 **
404 ** This should be a static data object but is needed for the definition
405 ** of AjPAssemin.
406 **
407 ** @alias AjSAssemAccess
408 ** @alias AjOAssemAccess
409 **
410 ** @attr Name [const char*] Access method name used in emboss.default
411 ** @attr Access [AjBool function] Access function
412 ** @attr AccessFree [AjBool function] Access cleanup function
413 ** @attr Qlink [const char*] Supported query link operators
414 ** @attr Desc [const char*] Description
415 ** @attr Alias [AjBool] Alias for another name
416 ** @attr Entry [AjBool] Supports retrieval of single entries
417 ** @attr Query [AjBool] Supports retrieval of selected entries
418 ** @attr All [AjBool] Supports retrieval of all entries
419 ** @attr Chunked [AjBool] Supports retrieval of entries in chunks
420 ** @attr Padding [AjBool] Padding to alignment boundary
421 ** @@
422 ******************************************************************************/
423 
424 typedef struct AjSAssemAccess
425 {
426     const char *Name;
427     AjBool (*Access) (AjPAssemin assemin);
428     AjBool (*AccessFree) (void* qry);
429     const char* Qlink;
430     const char* Desc;
431     AjBool Alias;
432     AjBool Entry;
433     AjBool Query;
434     AjBool All;
435     AjBool Chunked;
436     AjBool Padding;
437 } AjOAssemAccess;
438 
439 #define AjPAssemAccess AjOAssemAccess*
440 
441 
442 
443 
444 /* @data AjPAssemTag **********************************************************
445 **
446 ** Ajax AssemTag object to store MIRA tags for the read sequences as well as
447 ** the contig consensus sequences.
448 **
449 ** SAM format also allows a set of predefined tags
450 ** as well as it reserves lowercase tags for end users.
451 **
452 ** @attr Name    [AjPStr] Name
453 ** @attr Comment [AjPStr] Tag comment in case of MAF,
454 **			  or tag value in case of SAM
455 ** @attr x1      [ajuint] X value
456 ** @attr y1      [ajuint] Y value
457 ** @attr type    [char]   Value type: AcCsSiIfZHB
458 ** @attr Padding [char[7]] Padding to alignment boundary
459 ** @@
460 ******************************************************************************/
461 
462 typedef struct AjSAssemTag
463 {
464     AjPStr  Name;
465     AjPStr  Comment;
466     ajuint  x1;
467     ajuint  y1;
468     char    type;
469     char    Padding[7];
470 } AjOAssemTag;
471 
472 #define AjPAssemTag AjOAssemTag*
473 
474 
475 
476 
/* @data AjPAssemReadalignmentblock *******************************************
**
** Read alignment block.
**
** @alias AjSAssemReadalignmentblock
** @alias AjOAssemReadalignmentblock
**
** @attr readStart      [int] Read start
** @attr referenceStart [int] Reference start
** @attr length         [int] Alignment length
** @@
******************************************************************************/

typedef struct AjSAssemReadalignmentblock
{
    int readStart;
    int referenceStart;
    int length;
} AjOAssemReadalignmentblock;

/* Pointer alias is a macro, not a typedef: "const
** AjPAssemReadalignmentblock b" expands to
** "const AjOAssemReadalignmentblock* b", i.e. a pointer to a read-only
** object. */
#define AjPAssemReadalignmentblock AjOAssemReadalignmentblock*
495 
496 
497 
498 
499 /* @data AjPAssemReadgroup ****************************************************
500 **
501 ** read-group object to store information about read groups as described
502 ** in SAM spec.
503 **
504 ** @attr ID        [AjPStr] Read group identifier
505 ** @attr CN        [AjPStr] Name of the sequencing center producing the read
506 ** @attr Desc      [AjPStr] Description
507 ** @attr Date      [AjPStr] Date the run was produced
508 ** @attr FlowOrder [AjPStr] The array of nucleotide bases that correspond to
509 **                          the nucleotides used for each flow of each record
510 ** @attr KeySeq    [AjPStr] The array of nucleotide bases that correspond to
511 **                          the key sequence of each read
512 ** @attr Library   [AjPStr] Library
513 ** @attr Programs  [AjPStr] Programs used for processing the read group
514 ** @attr Unit      [AjPStr] Platform unit (e.g. flowcell-barcode for Illumina)
515 ** @attr Sample    [AjPStr] Sample name, or pool name when a pool was sequenced
516 ** @attr Isize     [ajint]  Predicted median insert size
517 **
518 **
519 ** @attr Platform [AjEAssemPlatform] Instrument platform
520 ** @@
521 ******************************************************************************/
522 
523 typedef struct AjSAssemReadgroup
524 {
525     AjPStr  ID;
526     AjPStr  CN;
527     AjPStr  Desc;
528     AjPStr  Date;
529     AjPStr  FlowOrder;
530     AjPStr  KeySeq;
531     AjPStr  Library;
532     AjPStr  Programs;
533     AjPStr  Unit;
534     AjPStr  Sample;
535     ajint   Isize;
536     AjEAssemPlatform Platform;
537 } AjOAssemReadgroup;
538 
539 #define AjPAssemReadgroup AjOAssemReadgroup*
540 
541 
542 
543 
544 /* ========================================================================= */
545 /* =========================== public functions ============================ */
546 /* ========================================================================= */
547 
548 
549 
550 
551 /*
552 ** Prototype definitions
553 */
554 
555 /*
556 ** End of prototype definitions
557 */
558 
559 
560 
561 
562 AJ_END_DECLS
563 
564 #endif /* !AJASSEMDATA_H */
565