1 /* @include embsig ************************************************************ 2 ** 3 ** Data structures and algorithms for use with sparse sequence signatures. 4 ** Hit, Hitlist, Sigpos, Sigdat and Signature objects. 5 ** 6 ** @author Copyright (c) 2004 Jon Ison (jison@hgmp.mrc.ac.uk) 7 ** @modified $Date: 2012/04/12 20:39:51 $ by $Author: mks $ 8 ** @@ 9 ** 10 ** This library is free software; you can redistribute it and/or 11 ** modify it under the terms of the GNU Lesser General Public 12 ** License as published by the Free Software Foundation; either 13 ** version 2.1 of the License, or (at your option) any later version. 14 ** 15 ** This library is distributed in the hope that it will be useful, 16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of 17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 ** Lesser General Public License for more details. 19 ** 20 ** You should have received a copy of the GNU Lesser General Public 21 ** License along with this library; if not, write to the Free Software 22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 23 ** 24 *****************************************************************************/ 25 26 #ifndef EMBSIG_H 27 #define EMBSIG_H 28 29 30 31 /* ========================================================================= */ 32 /* ============================= include files ============================= */ 33 /* ========================================================================= */ 34 35 #include "ajdefine.h" 36 #include "ajarr.h" 37 #include "ajfile.h" 38 #include "ajlist.h" 39 #include "ajmatrices.h" 40 41 AJ_BEGIN_DECLS 42 43 44 45 46 /* ========================================================================= */ 47 /* =============================== constants =============================== */ 48 /* ========================================================================= */ 49 50 51 52 53 /* @enum EmbESignatureType **************************************************** 54 ** 55 ** NUCLEUS Signature Type enumeration 56 ** 57 ** @value embESignatureTypeNULL NULL 58 ** @value embESignatureTypeCATH CATH for domain signatures 59 ** @value embESignatureTypeSCOP SCOP for domain signatures 60 ** @value embESignatureTypeLIGAND Ligand for ligand signatures 61 ** @@ 62 ******************************************************************************/ 63 64 typedef enum EmbOSignatureType 65 { 66 embESignatureTypeNULL, 67 embESignatureTypeCATH, 68 embESignatureTypeSCOP, 69 embESignatureTypeLIGAND 70 } EmbESignatureType; 71 72 73 74 75 /* @enum EmbESignatureTypesig ************************************************* 76 ** 77 ** NUCLEUS Signature Type enumeration 78 ** 79 ** @value embESignatureTypesigNULL NULL 80 ** @value embESignatureTypesig1D 1D 81 ** @value embESignatureTypesig3D 3D 82 ** @@ 83 ******************************************************************************/ 84 85 typedef enum EmbOSignatureTypesig 86 { 87 embESignatureTypesigNULL, 88 embESignatureTypesig1D, 89 embESignatureTypesig3D 90 } EmbESignatureTypesig; 91 92 93 94 95 /* ========================================================================= */ 96 /* ============================== public data ============================== */ 97 /* ========================================================================= */ 98 99 100 101 102 /* @data EmbPSigpos ********************************************************** 103 ** 104 ** Nucleus Sigpos object. 105 ** 106 ** Holds data for compiled signature position 107 ** 108 ** EmbPSigpos is implemented as a pointer to a C data structure. 109 ** 110 ** @alias EmbSSigpos 111 ** @alias EmbOSigpos 112 ** 113 ** 114 ** 115 116 ** @attr gsiz [ajuint*] Gap sizes 117 ** @attr gpen [float*] Gap penalties 118 ** @attr subs [float*] Residue match values 119 ** @attr ngaps [ajuint] No. of gaps 120 ** @attr Padding [char[4]] Padding to alignment boundary 121 ** 122 ** @new embSigposNew Default Sigdat object constructor 123 ** @delete embSigposDel Default Sigdat object destructor 124 ** @@ 125 ****************************************************************************/ 126 127 typedef struct EmbSSigpos 128 { 129 ajuint *gsiz; 130 float *gpen; 131 float *subs; 132 ajuint ngaps; 133 char Padding[4]; 134 } EmbOSigpos; 135 #define EmbPSigpos EmbOSigpos* 136 137 138 139 140 141 /* @data EmbPSigdat ********************************************************** 142 ** 143 ** Nucleus Sigdat object. 144 ** 145 ** Holds empirical data for an (uncompiled) signature position. 146 ** Important: Functions which manipulate this structure rely on the data in 147 ** the gap arrays (gsiz and grfq) being filled in order of increasing gap 148 ** size. 149 ** 150 ** EmbPSigdat is implemented as a pointer to a C data structure. 151 ** 152 ** @alias EmbSSigdat 153 ** @alias EmbOSigdat 154 ** 155 ** 156 ** 157 ** @attr rids [AjPChar] Residue id's 158 ** @attr rfrq [AjPUint] Residue frequencies 159 ** 160 ** @attr nres [ajuint] No. diff. types of residue 161 ** @attr nenv [ajuint] No. diff. types of environment 162 ** @attr eids [AjPStr*] Environment id's 163 ** @attr efrq [AjPUint] Environment frequencies 164 ** 165 ** @attr gsiz [AjPUint] Gap sizes 166 ** @attr gfrq [AjPUint] Frequencies of gaps of each size 167 ** @attr ngap [ajuint] No. diff. sizes of empirical gap 168 ** @attr wsiz [ajuint] Window size for this gap 169 ** 170 ** @new embSigdatNew Default Sigdat object constructor 171 ** @delete embSigdatDel Default Sigdat object destructor 172 ** @@ 173 ****************************************************************************/ 174 175 typedef struct EmbSSigdat 176 { 177 178 AjPChar rids; 179 AjPUint rfrq; 180 ajuint nres; 181 ajuint nenv; 182 AjPStr *eids; 183 AjPUint efrq; 184 185 186 AjPUint gsiz; 187 AjPUint gfrq; 188 189 ajuint ngap; 190 ajuint wsiz; 191 } EmbOSigdat; 192 #define EmbPSigdat EmbOSigdat* 193 194 195 196 197 198 /* @data EmbPSignature ******************************************************* 199 ** 200 ** Nucleus Signature object. 201 ** 202 ** EmbPSignature is implemented as a pointer to a C data structure. 203 ** 204 ** @alias EmbSSignature 205 ** @alias EmbOSignature 206 ** 207 ** 208 ** 209 ** @attr Type [EmbESignatureType] NUCLEUS Signature Type enumeration 210 ** @attr Typesig [EmbESignatureTypesig] NUCLEUS Signature Typesig enumeration 211 ** for sequence or structure-based signatures respectively. 212 ** @attr Class [AjPStr] SCOP classification. 213 ** @attr Architecture [AjPStr] CATH classification. 214 ** @attr Topology [AjPStr] CATH classification. 215 ** @attr Fold [AjPStr] SCOP classification. 216 ** @attr Superfamily [AjPStr] SCOP classification. 217 ** @attr Family [AjPStr] SCOP classification. 218 ** @attr Sunid_Family [ajuint] SCOP sunid for family. 219 ** @attr npos [ajuint] No. of signature positions. 220 ** @attr pos [EmbPSigpos*] Array of derived data for puropses of 221 ** alignment. 222 ** @attr dat [EmbPSigdat*] Array of empirical data. 223 ** 224 ** @attr Id [AjPStr] Protein id code. 225 ** @attr Domid [AjPStr] Domain id code. 226 ** @attr Ligid [AjPStr] Ligand id code. 227 ** @attr Desc [AjPStr] Description of ligand (ajLIGAND only) 228 ** @attr ns [ajuint] No. of sites (ajLIGAND only) 229 ** @attr sn [ajuint] Site number (ajLIGAND only) 230 ** @attr np [ajuint] No. of patches (ajLIGAND only) 231 ** @attr pn [ajuint] Patch number (ajLIGAND only) 232 ** @attr minpatch [ajuint] Max. patch size (residues) (ajLIGAND only) 233 ** @attr maxgap [ajuint] Min. gap distance (residues) (ajLIGAND only) 234 ** @new embSignatureNew Default Signature constructor 235 ** @delete embSignatureDel Default Signature destructor 236 ** @output embSignatureWrite Write signature to file. 237 ** @input embSignatureReadNew Construct a Signature object from reading a 238 ** file in embl-like format (see documentation for the DOMAINATRIX 239 ** "sigscan" application). 240 ** @output embSignatureWrite Write a Signature object to a file in embl-like 241 ** format (see documentation for the DOMAINATRIX "sigscan" 242 ** application). 243 ** @input embSignatureHitsRead Construct a Hitlist object from reading a 244 ** signature hits file (see documentation for the DOMAINATRIX 245 ** "sigscan" application). 246 ** @output embSignatureHitsWrite Writes a list of Hit objects to a 247 ** signature hits file (see documentation for the DOMAINATRIX 248 ** "sigscan" application). 249 ** @modify embSignatureCompile Compiles a Signature object. The signature 250 ** must first have been allocated by using the embSignatureNew 251 ** function. 252 ** @use embSignatureAlignSeq Performs an alignment of a signature to a 253 ** protein sequence. The signature must have first been compiled by 254 ** calling embSignatureCompile. Write a Hit object with the result. 255 ** @use embSignatureAlignSeqall Performs an alignment of a signature to 256 ** protein sequences. The signature must have first been compiled by 257 ** calling embSignatureCompile. Write a list of Hit objects with 258 ** the result. 259 ** @@ 260 ****************************************************************************/ 261 262 typedef struct EmbSSignature 263 { 264 EmbESignatureType Type; 265 EmbESignatureTypesig Typesig; 266 AjPStr Class; 267 AjPStr Architecture; 268 AjPStr Topology; 269 AjPStr Fold; 270 AjPStr Superfamily; 271 AjPStr Family; 272 ajuint Sunid_Family; 273 ajuint npos; 274 EmbPSigpos *pos; 275 EmbPSigdat *dat; 276 277 AjPStr Id; 278 AjPStr Domid; 279 AjPStr Ligid; 280 AjPStr Desc; 281 ajuint ns; 282 ajuint sn; 283 ajuint np; 284 ajuint pn; 285 ajuint minpatch; 286 ajuint maxgap; 287 } EmbOSignature; 288 #define EmbPSignature EmbOSignature* 289 290 291 292 293 294 295 /* @data EmbPHit ************************************************************* 296 ** 297 ** Nucleus hit object. 298 ** 299 ** Holds data associated with a protein / domain sequence that is generated 300 ** and or manipulated by the EMBOSS applications seqsearch, seqsort, and 301 ** sigscan. 302 ** 303 ** EmbPHit is implemented as a pointer to a C data structure. 304 ** 305 ** @alias EmbSHit 306 ** @alias EmbOHit 307 ** 308 ** 309 ** 310 ** @attr Seq [AjPStr] Sequence as string. 311 ** @attr Start [ajuint] Start of sequence or signature alignment relative 312 ** to full length swissprot sequence, this is an 313 ** index so starts at 0. 314 ** @attr End [ajuint] End of sequence or signature alignment relative 315 ** to full length swissprot sequence, this is an 316 ** index so starts at 0. 317 ** @attr Acc [AjPStr] Accession number of sequence entry. 318 ** @attr Spr [AjPStr] Swissprot code of sequence entry. 319 ** @attr Dom [AjPStr] SCOP or CATH database identifier code of entry. 320 ** @attr Rank [ajuint] Rank order of hit 321 ** @attr Score [float] Score of hit 322 ** @attr Eval [float] E-value of hit 323 ** @attr Pval [float] p-value of hit 324 ** 325 ** @attr Typeobj [AjPStr] Primary (objective) classification of hit. 326 ** @attr Typesbj [AjPStr] Secondary (subjective) classification of hit 327 ** @attr Model [AjPStr] String for model type if used, one of 328 ** PSIBLAST, HMMER, SAM, SPARSE, HENIKOFF or GRIBSKOV 329 ** 330 ** @attr Alg [AjPStr] Alignment, e.g. of a signature to the sequence 331 ** @attr Group [AjPStr] Grouping of hit, e.g. 'REDUNDANT' or 332 ** 'NON_REDUNDANT' 333 ** @attr Target [AjBool] Used for garbage collection. 334 ** @attr Target2 [AjBool] Also used for garbage collection. 335 ** @attr Sig [EmbPSignature] Pointer to signature object for which hit 336 ** @attr Priority [AjBool] Also used for garbage collection. 337 ** @attr Padding [char[4]] Padding to alignment boundary 338 ** was generated. Used as a pointer only - memory is never freed or allocated 339 ** to it. 340 ** 341 ** 342 ** 343 ** @new embHitNew Default Hit constructor 344 ** @new embHitReadFasta Construct Hit object from reading the next entry 345 ** from a file in extended FASTA format (see documentation for the 346 ** DOMAINATRIX "seqsearch" application). 347 ** @delete embHitDel Default Hit destructor 348 ** @assign embHitMerge Create new Hit from merging two Hit objects 349 ** @use embMatchScore Sort Hit objects by Score element. 350 ** @use embMatchinvScore Sort (inverted order) Hit objects by Score 351 ** element. 352 ** @use embMatchLigid Sort Hit objects by Ligid element in Sig element. 353 ** @use embMatch Sort Hit objects by Ligid element in Sig element. 354 355 ** @use embHitsOverlap Checks for overlap between two Hit objects. 356 ** 357 ** @@ 358 ****************************************************************************/ 359 360 typedef struct EmbSHit 361 { 362 AjPStr Seq; 363 ajuint Start; 364 ajuint End; 365 AjPStr Acc; 366 AjPStr Spr; 367 AjPStr Dom; 368 ajuint Rank; 369 float Score; 370 float Eval; 371 float Pval; 372 373 AjPStr Typeobj; 374 AjPStr Typesbj; 375 AjPStr Model; 376 AjPStr Alg; 377 AjPStr Group; 378 AjBool Target; 379 AjBool Target2; 380 381 EmbPSignature Sig; 382 AjBool Priority; 383 char Padding[4]; 384 } EmbOHit; 385 #define EmbPHit EmbOHit* 386 387 388 389 390 391 392 /* @data EmbPHitlist ********************************************************* 393 ** 394 ** Nucleus hitlist object. 395 ** 396 ** Holds an array of hit structures and associated SCOP classification 397 ** records. 398 ** 399 ** EmbPHitlist is implemented as a pointer to a C data structure. 400 ** 401 ** @alias EmbSHitlist 402 ** @alias EmbOHitlist 403 ** 404 ** 405 ** 406 ** @attr Class [AjPStr] SCOP classification. 407 ** @attr Architecture [AjPStr] CATH classification. 408 ** @attr Topology [AjPStr] CATH classification. 409 ** @attr Fold [AjPStr] SCOP classification. 410 ** @attr Superfamily [AjPStr] SCOP classification. 411 ** @attr Family [AjPStr] SCOP classification. 412 ** @attr Model [AjPStr] SCOP classification. 413 ** @attr Sunid_Family [ajuint] SCOP sunid for family. 414 ** @attr Priority [AjBool] True if the Hitlist is high priority. 415 416 ** @attr hits [EmbPHit*] Array of hits. 417 ** @attr Type [EmbESignatureType] NUCLEUS Signature Type enumeration 418 ** @attr N [ajuint] No. of hits. 419 ** 420 ** @new embHitlistNew Default Hitlist constructor 421 ** @delete embHitlistDel Default Hitlist destructor 422 ** @use embHitlistMatchFold Sort Hitlist objects by Fold element 423 ** @input embHitlistRead Construct Hitlist object from reading the next entry 424 ** from a file in embl-like format (see documentation for the 425 ** DOMAINATRIX "seqsearch" application). 426 ** @new embHitlistReadFasta Construct Hitlist object from reading 427 ** the next entry 428 ** from a file in extended FASTA format (see documentation for the 429 ** DOMAINATRIX "seqsearch" application). 430 ** @input embHitlistReadNode Construct Hitlist object from reading a specific 431 ** entry from a file in embl-like format (see documentation for the 432 ** DOMAINATRIX "seqsearch" application). 433 ** @new embHitlistReadNodeFasta Construct Hitlist object from reading 434 ** a specific entry from a file in extended FASTA format 435 ** (see documentation for the DOMAINATRIX "seqsearch" application). 436 ** @output embHitlistWrite Write Hitlist to file in embl-like format (see 437 ** documentation for the DOMAINATRIX "seqsearch" application). 438 ** @output embHitlistWriteSubset Write a subset of a Hitlist to file in 439 ** embl-like format (see documentation for the DOMAINATRIX "seqsearch" 440 ** application). 441 ** @output embHitlistWriteFasta Write Hitlist to file in extended FASTA format 442 ** (see documentation for the DOMAINATRIX "seqsearch" application). 443 ** @output embHitlistWriteSubsetFasta Write a subset of a Hitlist to file in 444 ** extended FASTA format (see documentation for the DOMAINATRIX 445 ** "seqsearch" application). 446 ** @output embHitlistWriteHitFasta Write a single Hit from a Hitlist to file 447 ** in extended FASTA format (see documentation for the DOMAINATRIX 448 ** "seqsearch" application). 449 ** @use embHitlistClassify Classifies a list of signature-sequence hits 450 ** (held in a Hitlist object) according to list of target sequences 451 ** (a list of Hitlist objects). 452 ** @@ 453 ****************************************************************************/ 454 455 typedef struct EmbSHitlist 456 { 457 AjPStr Class; 458 AjPStr Architecture; 459 AjPStr Topology; 460 AjPStr Fold; 461 AjPStr Superfamily; 462 AjPStr Family; 463 AjPStr Model; 464 ajuint Sunid_Family; 465 AjBool Priority; 466 EmbPHit *hits; 467 EmbESignatureType Type; 468 ajuint N; 469 } EmbOHitlist; 470 #define EmbPHitlist EmbOHitlist* 471 472 473 474 475 476 /* ========================================================================= */ 477 /* =========================== public functions ============================ */ 478 /* ========================================================================= */ 479 480 481 482 /* 483 ** Prototype definitions 484 */ 485 486 487 /* ======================================================================= */ 488 /* =========================== Sigdat object ============================= */ 489 /* ======================================================================= */ 490 EmbPSigdat embSigdatNew(ajuint nres, ajuint ngap); 491 void embSigdatDel(EmbPSigdat *pthis); 492 493 494 495 496 /* ======================================================================= */ 497 /* =========================== Sigpos object ============================= */ 498 /* ======================================================================= */ 499 EmbPSigpos embSigposNew(ajuint ngap); 500 void embSigposDel(EmbPSigpos *thys); 501 502 503 504 505 /* ======================================================================= */ 506 /* ========================== Signature object =========================== */ 507 /* ======================================================================= */ 508 EmbPSignature embSignatureNew(ajuint n); 509 void embSignatureDel(EmbPSignature *ptr); 510 EmbPSignature embSignatureReadNew(AjPFile inf); 511 AjBool embSignatureWrite(AjPFile outf, const EmbPSignature obj); 512 AjBool embSignatureCompile(EmbPSignature *S, float gapo, float gape, 513 const AjPMatrixf matrix); 514 AjBool embSignatureAlignSeq(const EmbPSignature S, const AjPSeq seq, 515 EmbPHit *hit, 516 ajuint nterm); 517 AjBool embSignatureAlignSeqall(const EmbPSignature sig, AjPSeqall db, 518 ajuint n, EmbPHitlist *hitlist, 519 ajuint nterm); 520 AjBool embSignatureHitsWrite(AjPFile outf, const EmbPSignature sig, 521 const EmbPHitlist hitlist, ajuint n); 522 EmbPHitlist embSignatureHitsRead(AjPFile inf); 523 524 525 526 527 528 /* ======================================================================= */ 529 /* ============================= Hit object ============================== */ 530 /* ======================================================================= */ 531 EmbPHit embHitNew(void); 532 533 EmbPHit embHitReadFasta(AjPFile inf); 534 535 void embHitDel(EmbPHit *ptr); 536 537 EmbPHit embHitMerge(const EmbPHit hit1, 538 const EmbPHit hit2); 539 540 AjBool embHitsOverlap(const EmbPHit hit1, 541 const EmbPHit hit2, 542 ajuint n); 543 544 ajint embMatchScore(const void *hit1, 545 const void *hit2); 546 547 ajint embMatchinvScore(const void *hit1, 548 const void *hit2); 549 550 ajint embMatchLigid(const void *hit1, 551 const void *hit2); 552 553 ajint embMatchSN(const void *hit1, 554 const void *hit2); 555 556 557 /* ======================================================================= */ 558 /* =========================== Hitlist object ============================ */ 559 /* ======================================================================= */ 560 561 EmbPHitlist embHitlistNew(ajuint n); 562 563 void embHitlistDel(EmbPHitlist *ptr); 564 565 EmbPHitlist embHitlistRead(AjPFile inf); 566 567 EmbPHitlist embHitlistReadFasta(AjPFile inf); 568 569 AjBool embHitlistWrite(AjPFile outf, 570 const EmbPHitlist obj); 571 572 AjBool embHitlistWriteSubset(AjPFile outf, 573 const EmbPHitlist obj, 574 const AjPUint ok); 575 576 AjBool embHitlistWriteFasta(AjPFile outf, 577 const EmbPHitlist obj); 578 579 AjBool embHitlistWriteSubsetFasta(AjPFile outf, 580 const EmbPHitlist obj, 581 const AjPUint ok); 582 583 AjBool embHitlistWriteHitFasta(AjPFile outf, 584 ajuint n, 585 const EmbPHitlist obj); 586 587 AjPList embHitlistReadNode(AjPFile inf, 588 const AjPStr fam, 589 const AjPStr sfam, 590 const AjPStr fold, 591 const AjPStr klass); 592 593 AjPList embHitlistReadNodeFasta(AjPFile inf, 594 const AjPStr fam, 595 const AjPStr sfam, 596 const AjPStr fold, 597 const AjPStr klass); 598 599 AjBool embHitlistClassify(EmbPHitlist hits, 600 const AjPList targets, 601 ajuint thresh); 602 603 ajint embHitlistMatchFold(const void *hit1, 604 const void *hit2); 605 606 607 void embSigExit(void); 608 609 /* 610 ** End of prototype definitions 611 */ 612 613 AJ_END_DECLS 614 615 #endif /* !EMBSIG_H */ 616 617 618 619 620 621 622 623 624 625 626 627 628 629