1 /* @include ajassemdata ******************************************************* 2 ** 3 ** AJAX assembly datatypes 4 ** 5 ** These functions control all aspects of AJAX assembly 6 ** parsing and include simple utilities. 7 ** 8 ** @author Copyright (C) 2010 Peter Rice 9 ** @version $Revision: 1.29 $ 10 ** @modified Oct 5 pmr First version 11 ** @modified $Date: 2012/07/02 16:44:55 $ by $Author: rice $ 12 ** @@ 13 ** 14 ** This library is free software; you can redistribute it and/or 15 ** modify it under the terms of the GNU Lesser General Public 16 ** License as published by the Free Software Foundation; either 17 ** version 2.1 of the License, or (at your option) any later version. 18 ** 19 ** This library is distributed in the hope that it will be useful, 20 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 21 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 22 ** Lesser General Public License for more details. 23 ** 24 ** You should have received a copy of the GNU Lesser General Public 25 ** License along with this library; if not, write to the Free Software 26 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 27 ** MA 02110-1301, USA. 28 ** 29 ******************************************************************************/ 30 31 #ifndef AJASSEMDATA_H 32 #define AJASSEMDATA_H 33 34 /* ========================================================================= */ 35 /* ============================= include files ============================= */ 36 /* ========================================================================= */ 37 38 #include "ajdefine.h" 39 #include "ajstr.h" 40 #include "ajtable.h" 41 #include "ajtime.h" 42 #include "ajtextdata.h" 43 #include "ajseqbam.h" 44 #include "ajbamindex.h" 45 46 AJ_BEGIN_DECLS 47 48 49 50 51 /* ========================================================================= */ 52 /* =============================== constants =============================== */ 53 /* ========================================================================= */ 54 55 56 57 58 /* @enum AjEAssemSortOrder **************************************************** 59 ** 60 ** Assembly reads sort order Type enumeration. 61 ** 62 ** @value ajEAssemSortOrderUnknown Unknown - default in SAM/BAM formats 63 ** @value ajEAssemSortOrderUnsorted Unsorted - unsorted 64 ** @value ajEAssemSortOrderQueryname Queryname - sorted by query/read names 65 ** @value ajEAssemSortOrderCoordinate Coordinate - major sort key is the 66 ** the reference/contig name, order defined by the order of @SQ lines 67 ** in header in SAM/BAM assemblies. 68 ** @@ 69 ******************************************************************************/ 70 71 typedef enum AjOAssemSortOrder 72 { 73 ajEAssemSortOrderUnknown, 74 ajEAssemSortOrderUnsorted, 75 ajEAssemSortOrderQueryname, 76 ajEAssemSortOrderCoordinate 77 } AjEAssemSortOrder; 78 79 80 81 82 /* @enum AjEAssemPlatform ***************************************************** 83 ** 84 ** Platforms/technologies to produce reads, as enumerated in SAM specv1.4. 85 ** 86 ** @value ajEAssemPlatformUnknown Unknown 87 ** @value ajEAssemPlatformCapillary Capillary 88 ** @value ajEAssemPlatformLS454 LS 454 89 ** @value ajEAssemPlatformIllumina Illumina 90 ** @value ajEAssemPlatformSolid Solid 91 ** @value ajEAssemPlatformHelicos Helicos 92 ** @value ajEAssemPlatformIontorrent IonTorrent 93 ** @value ajEAssemPlatformPacbio Pacific Biosciences 94 ** @@ 95 ******************************************************************************/ 96 97 typedef enum AjOAssemPlatform 98 { 99 ajEAssemPlatformUnknown, 100 ajEAssemPlatformCapillary, 101 ajEAssemPlatformLS454, 102 ajEAssemPlatformIllumina, 103 ajEAssemPlatformSolid, 104 ajEAssemPlatformHelicos, 105 ajEAssemPlatformIontorrent, 106 ajEAssemPlatformPacbio 107 } AjEAssemPlatform; 108 109 110 111 112 /* ========================================================================= */ 113 /* ============================== public data ============================== */ 114 /* ========================================================================= */ 115 116 117 118 119 /* @data AjPAssemRead ********************************************************* 120 ** 121 ** Store individual alignments of reads in assemblies. 122 ** 123 ** In some assemblies some reads can align to more than one contig 124 ** or reference sequence, so current name of the data type is not perfect. 125 ** 126 ** For storing read information as part of an assembly object. 127 ** Current version of this data type is MIRA/SAM oriented, 128 ** it needs to be improved to be a proper common data type, 129 ** some of the current fields are used by MIRA MAF format only 130 ** while few others are used by the SAM format only. 131 ** 132 ** @alias AjSAssemRead 133 ** @alias AjOAssemRead 134 ** 135 ** 136 ** In it's simplest form, a DNA template is sequenced only once. 137 ** In paired-end sequencing, a DNA template is sequenced once in forward 138 ** and once in reverse direction (Sanger, 454, Solexa). 139 ** In Sanger sequencing, several forward and/or reverse reads 140 ** can be sequenced from a DNA template. 141 ** In PacBio sequencing, a DNA template can be sequenced 142 ** in several "strobes", leading to multiple reads on a DNA template. 143 ** (ref:MAF format specification on mira web site) 144 ** 145 ** 146 ** @attr Name [AjPStr] Name 147 ** @attr Seq [AjPStr] Sequence 148 ** @attr SeqQ [AjPStr] Sequence quality string 149 ** @attr Template [AjPStr] Name of the DNA template a sequence comes from 150 ** @attr File [AjPStr] Name of the sequencing file 151 ** which contains raw data for this read 152 ** @attr Technology [AjPStr] Sequencing technology 153 ** @attr Cigar [AjPStr] CIGAR string 154 ** @attr Tags [AjPList] List of AjPAssemTag objects 155 ** @attr AlignmentBlocks [AjPList] Alignment blocks 156 ** @attr Rnext [ajlong] Reference number of the mate/next fragment 157 ** @attr Reference [ajlong] Reference sequence 158 ** @attr Pnext [ajlong] Position of the mate/next fragment 159 ** @attr Tlen [ajint] Observed template length 160 ** @attr Flag [ajint] Flag 161 ** @attr MapQ [ajint] Map quality 162 ** @attr TemplateSizeMin [ajint] Minimum template size 163 ** @attr TemplateSizeMax [ajint] Maximum template size 164 ** @attr ClipLeft [ajint] Clip left 165 ** @attr ClipRight [ajint] Clip right 166 ** @attr VectorLeft [ajint] Clip left due to sequencing vector 167 ** @attr VectorRight [ajint] Clip right due to sequencing vector 168 ** @attr QualLeft [ajint] Clip left due to quality 169 ** @attr QualRight [ajint] Clip right due to quality 170 ** @attr x1 [ajint] interval of the contig (1-based as in SAM and MAF) 171 ** @attr y1 [ajint] end of contig interval 172 ** @attr x2 [ajint] interval of the read 173 ** @attr y2 [ajint] end of read interval 174 ** @attr Reversed [AjBool] true: has been reverse-complemented 175 ** @attr Direction [char] Direction of the read with respect to the template 176 ** @attr Padding [char[7]] Padding to alignment boundary 177 ** @@ 178 ******************************************************************************/ 179 180 typedef struct AjSAssemRead 181 { 182 AjPStr Name; 183 AjPStr Seq; 184 AjPStr SeqQ; 185 AjPStr Template; 186 AjPStr File; 187 AjPStr Technology; 188 AjPStr Cigar; 189 AjPList Tags; 190 AjPList AlignmentBlocks; 191 ajlong Rnext; 192 ajlong Reference; 193 ajlong Pnext; 194 ajint Tlen; 195 ajint Flag; 196 ajint MapQ; 197 ajint TemplateSizeMin; 198 ajint TemplateSizeMax; 199 ajint ClipLeft; 200 ajint ClipRight; 201 ajint VectorLeft; 202 ajint VectorRight; 203 ajint QualLeft; 204 ajint QualRight; 205 ajint x1; 206 ajint y1; 207 ajint x2; 208 ajint y2; 209 AjBool Reversed; 210 char Direction; 211 char Padding[7]; 212 } AjOAssemRead; 213 214 #define AjPAssemRead AjOAssemRead* 215 216 217 218 219 /* @data AjPAssemContig ******************************************************* 220 ** 221 ** Ajax AssemContig object to store contigs in assemblies. 222 ** 223 ** In mapping assemblies consensus sequence refers to the reference sequence. 224 ** 225 ** @attr Name [AjPStr] Name 226 ** @attr Consensus [AjPStr] Consensus/reference sequence 227 ** @attr ConsensusQ [AjPStr] Quality string for the consensus sequence 228 ** @attr Tags [AjPList] Tags for the consensus sequence 229 ** @attr Length [ajint] Length of the consensus sequence 230 ** @attr Nreads [ajint] Number of reads 231 ** @attr AssemblyID [AjPStr] Assembly ID 232 ** @attr MD5 [AjPStr] MD5 checksum of the consensus sequence 233 ** in the uppercase, with gaps and spaces removed 234 ** @attr Species [AjPStr] Species 235 ** @attr URI [AjPStr] URI of the consensus sequences 236 ** 237 ** @@ 238 ******************************************************************************/ 239 240 typedef struct AjSContig 241 { 242 AjPStr Name; 243 AjPStr Consensus; 244 AjPStr ConsensusQ; 245 AjPList Tags; 246 ajint Length; 247 ajint Nreads; 248 AjPStr AssemblyID; 249 AjPStr MD5; 250 AjPStr Species; 251 AjPStr URI; 252 } AjOContig; 253 254 #define AjPAssemContig AjOContig* 255 256 257 258 259 /* @data AjPAssemin *********************************************************** 260 ** 261 ** Ajax Assembly Input object. 262 ** 263 ** Holds the input specification and information needed to read 264 ** the assembly and possible further entries 265 ** 266 ** @alias AjSAssemin 267 ** @alias AjOAssemin 268 ** 269 ** @attr Input [AjPTextin] General text input object 270 ** @attr BamIdx [AjPBamIndex] BAM index 271 ** @attr BamInput [AjBool] BAM file input 272 ** @attr cbegin [ajint] Contig start position 273 ** @attr cend [ajint] Contig end position 274 ** @attr Loading [AjBool] True if data is now loading 275 ** @@ 276 ******************************************************************************/ 277 278 typedef struct AjSAssemin 279 { 280 AjPTextin Input; 281 AjPBamIndex BamIdx; 282 AjBool BamInput; 283 ajint cbegin; 284 ajint cend; 285 AjBool Loading; 286 } AjOAssemin; 287 288 #define AjPAssemin AjOAssemin* 289 290 291 292 293 /* @data AjPAssem ************************************************************* 294 ** 295 ** Ajax Assembly object. 296 ** 297 ** Holds the assembly itself, plus associated information. 298 ** 299 ** @alias AjSAssem 300 ** @alias AjOAssem 301 ** 302 ** @attr Id [AjPStr] Id of term 303 ** @attr Db [AjPStr] Database name from input 304 ** @attr Setdb [AjPStr] Database name from command line 305 ** @attr Full [AjPStr] Full name 306 ** @attr Qry [AjPStr] Query for re-reading 307 ** @attr Formatstr [AjPStr] Input format name 308 ** @attr Filename [AjPStr] Original filename 309 ** @attr Textptr [AjPStr] Full text 310 ** @attr BamHeader [AjPSeqBamHeader] BAM header 311 ** @attr Contigs [AjPTable] Contigs table, storing contigs by name 312 ** @attr ContigsIgnored [AjPTable] Contigs named as '*' because of 313 ** missing header '@SQ' entries 314 ** @attr ContigsOrder [AjPList] Order of contigs in the assembly 315 ** @attr ContigArray [AjPAssemContig*] Contigs array for fast access 316 ** @attr Reads [AjPList] List of reads in the assembly 317 ** @attr Readgroups [AjPTable] Table of read-groups in the assembly 318 ** @attr rec [AjPAssemRead] Current/last read/alignment record read 319 ** @attr Fpos [ajlong] File position 320 ** @attr Format [AjEnum] Input format enum 321 ** @attr Count [ajuint] Number of lines read (contigs in case of BAM) 322 ** @attr Hasdata [AjBool] True when data has been loaded 323 ** @attr SO [AjEAssemSortOrder] Reads sort order in the assembly 324 ** 325 ** @@ 326 ******************************************************************************/ 327 328 typedef struct AjSAssem 329 { 330 AjPStr Id; 331 AjPStr Db; 332 AjPStr Setdb; 333 AjPStr Full; 334 AjPStr Qry; 335 AjPStr Formatstr; 336 AjPStr Filename; 337 AjPStr Textptr; 338 AjPSeqBamHeader BamHeader; 339 AjPTable Contigs; 340 AjPTable ContigsIgnored; 341 AjPList ContigsOrder; 342 AjPAssemContig* ContigArray; 343 AjPList Reads; 344 AjPTable Readgroups; 345 AjPAssemRead rec; 346 ajlong Fpos; 347 AjEnum Format; 348 ajuint Count; 349 AjBool Hasdata; 350 AjEAssemSortOrder SO; 351 } AjOAssem; 352 353 #define AjPAssem AjOAssem* 354 355 356 357 358 /* @data AjPAssemload ********************************************************* 359 ** 360 ** Ajax assembly loader object. 361 ** 362 ** Inherits an AjPAssem but allows more assembly data to be read from the 363 ** same input by also inheriting the AjPVarin input object. 364 ** 365 ** @alias AjSAssemload 366 ** @alias AjOAssemload 367 ** 368 ** @attr Assem [AjPAssem] Current variation 369 ** @attr Assemin [AjPAssemin] Assembly input for reading next 370 ** @attr Count [ajuint] Count of terms so far 371 ** @attr Loading [AjBool] True if data is now loading 372 ** @attr Returned [AjBool] if true: Assembly object has been returned to a new 373 ** owner and is not to be deleted by the destructor 374 ** @attr Padding [ajuint] Padding to alignment boundary 375 ** @@ 376 ******************************************************************************/ 377 378 typedef struct AjSAssemload 379 { 380 AjPAssem Assem; 381 AjPAssemin Assemin; 382 ajuint Count; 383 AjBool Loading; 384 AjBool Returned; 385 ajuint Padding; 386 } AjOAssemload; 387 388 #define AjPAssemload AjOAssemload* 389 390 391 392 393 /* @data AjPAssemAccess ******************************************************* 394 ** 395 ** Ajax assembly access database reading object. 396 ** 397 ** Holds information needed to read an assembly entry from a database. 398 ** Access methods are defined for each known database type. 399 ** 400 ** Assembly entries are read from the database using the defined 401 ** database access function, which is usually a static function 402 ** within ajassemdb.c 403 ** 404 ** This should be a static data object but is needed for the definition 405 ** of AjPAssemin. 406 ** 407 ** @alias AjSAssemAccess 408 ** @alias AjOAssemAccess 409 ** 410 ** @attr Name [const char*] Access method name used in emboss.default 411 ** @attr Access [AjBool function] Access function 412 ** @attr AccessFree [AjBool function] Access cleanup function 413 ** @attr Qlink [const char*] Supported query link operators 414 ** @attr Desc [const char*] Description 415 ** @attr Alias [AjBool] Alias for another name 416 ** @attr Entry [AjBool] Supports retrieval of single entries 417 ** @attr Query [AjBool] Supports retrieval of selected entries 418 ** @attr All [AjBool] Supports retrieval of all entries 419 ** @attr Chunked [AjBool] Supports retrieval of entries in chunks 420 ** @attr Padding [AjBool] Padding to alignment boundary 421 ** @@ 422 ******************************************************************************/ 423 424 typedef struct AjSAssemAccess 425 { 426 const char *Name; 427 AjBool (*Access) (AjPAssemin assemin); 428 AjBool (*AccessFree) (void* qry); 429 const char* Qlink; 430 const char* Desc; 431 AjBool Alias; 432 AjBool Entry; 433 AjBool Query; 434 AjBool All; 435 AjBool Chunked; 436 AjBool Padding; 437 } AjOAssemAccess; 438 439 #define AjPAssemAccess AjOAssemAccess* 440 441 442 443 444 /* @data AjPAssemTag ********************************************************** 445 ** 446 ** Ajax AssemTag object to store MIRA tags for the read sequences as well as 447 ** the contig consensus sequences. 448 ** 449 ** SAM format also allows a set of predefined tags 450 ** as well as it reserves lowercase tags for end users. 451 ** 452 ** @attr Name [AjPStr] Name 453 ** @attr Comment [AjPStr] Tag comment in case of MAF, 454 ** or tag value in case of SAM 455 ** @attr x1 [ajuint] X value 456 ** @attr y1 [ajuint] Y value 457 ** @attr type [char] Value type: AcCsSiIfZHB 458 ** @attr Padding [char[7]] Padding to alignment boundary 459 ** @@ 460 ******************************************************************************/ 461 462 typedef struct AjSAssemTag 463 { 464 AjPStr Name; 465 AjPStr Comment; 466 ajuint x1; 467 ajuint y1; 468 char type; 469 char Padding[7]; 470 } AjOAssemTag; 471 472 #define AjPAssemTag AjOAssemTag* 473 474 475 476 477 /* @data AjPAssemReadalignmentblock ******************************************* 478 ** 479 ** read alignment block 480 ** 481 ** @attr readStart [int] Read start 482 ** @attr referenceStart [int] Reference start 483 ** @attr length [int] alignment length 484 ** @@ 485 ******************************************************************************/ 486 487 typedef struct AjSAssemReadalignmentblock 488 { 489 int readStart; 490 int referenceStart; 491 int length; 492 } AjOAssemReadalignmentblock; 493 494 #define AjPAssemReadalignmentblock AjOAssemReadalignmentblock* 495 496 497 498 499 /* @data AjPAssemReadgroup **************************************************** 500 ** 501 ** read-group object to store information about read groups as described 502 ** in SAM spec. 503 ** 504 ** @attr ID [AjPStr] Read group identifier 505 ** @attr CN [AjPStr] Name of the sequencing center producing the read 506 ** @attr Desc [AjPStr] Description 507 ** @attr Date [AjPStr] Date the run was produced 508 ** @attr FlowOrder [AjPStr] The array of nucleotide bases that correspond to 509 ** the nucleotides used for each flow of each record 510 ** @attr KeySeq [AjPStr] The array of nucleotide bases that correspond to 511 ** the key sequence of each read 512 ** @attr Library [AjPStr] Library 513 ** @attr Programs [AjPStr] Programs used for processing the read group 514 ** @attr Unit [AjPStr] Platform unit (e.g. flowcell-barcode for Illumina) 515 ** @attr Sample [AjPStr] Sample name, or pool name when a pool was sequenced 516 ** @attr Isize [ajint] Predicted median insert size 517 ** 518 ** 519 ** @attr Platform [AjEAssemPlatform] Instrument platform 520 ** @@ 521 ******************************************************************************/ 522 523 typedef struct AjSAssemReadgroup 524 { 525 AjPStr ID; 526 AjPStr CN; 527 AjPStr Desc; 528 AjPStr Date; 529 AjPStr FlowOrder; 530 AjPStr KeySeq; 531 AjPStr Library; 532 AjPStr Programs; 533 AjPStr Unit; 534 AjPStr Sample; 535 ajint Isize; 536 AjEAssemPlatform Platform; 537 } AjOAssemReadgroup; 538 539 #define AjPAssemReadgroup AjOAssemReadgroup* 540 541 542 543 544 /* ========================================================================= */ 545 /* =========================== public functions ============================ */ 546 /* ========================================================================= */ 547 548 549 550 551 /* 552 ** Prototype definitions 553 */ 554 555 /* 556 ** End of prototype definitions 557 */ 558 559 560 561 562 AJ_END_DECLS 563 564 #endif /* !AJASSEMDATA_H */ 565