1 /* @include embsig ************************************************************
2 **
3 ** Data structures and algorithms for use with sparse sequence signatures.
4 ** Hit, Hitlist, Sigpos, Sigdat and Signature objects.
5 **
6 ** @author Copyright (c) 2004 Jon Ison (jison@hgmp.mrc.ac.uk)
7 ** @modified $Date: 2012/04/12 20:39:51 $ by $Author: mks $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 *****************************************************************************/
25 
26 #ifndef EMBSIG_H
27 #define EMBSIG_H
28 
29 
30 
31 /* ========================================================================= */
32 /* ============================= include files ============================= */
33 /* ========================================================================= */
34 
35 #include "ajdefine.h"
36 #include "ajarr.h"
37 #include "ajfile.h"
38 #include "ajlist.h"
39 #include "ajmatrices.h"
40 
41 AJ_BEGIN_DECLS
42 
43 
44 
45 
46 /* ========================================================================= */
47 /* =============================== constants =============================== */
48 /* ========================================================================= */
49 
50 
51 
52 
53 /* @enum EmbESignatureType ****************************************************
54 **
55 ** NUCLEUS Signature Type enumeration (the kind of signature or hit list)
56 **
57 ** @value embESignatureTypeNULL NULL, unset (first enumerator, value 0)
58 ** @value embESignatureTypeCATH CATH for domain signatures
59 ** @value embESignatureTypeSCOP SCOP for domain signatures
60 ** @value embESignatureTypeLIGAND Ligand for ligand signatures
61 ** @@
62 ******************************************************************************/
63 
64 typedef enum EmbOSignatureType
65 {
66     embESignatureTypeNULL,
67     embESignatureTypeCATH,
68     embESignatureTypeSCOP,
69     embESignatureTypeLIGAND
70 } EmbESignatureType;
71 
72 
73 
74 
75 /* @enum EmbESignatureTypesig *************************************************
76 **
77 ** NUCLEUS Signature Typesig enumeration (1D or 3D signature)
78 **
79 ** @value embESignatureTypesigNULL NULL, unset (first enumerator, value 0)
80 ** @value embESignatureTypesig1D 1D (presumably sequence-based; confirm)
81 ** @value embESignatureTypesig3D 3D (presumably structure-based; confirm)
82 ** @@
83 ******************************************************************************/
84 
85 typedef enum EmbOSignatureTypesig
86 {
87     embESignatureTypesigNULL,
88     embESignatureTypesig1D,
89     embESignatureTypesig3D
90 } EmbESignatureTypesig;
91 
92 
93 
94 
95 /* ========================================================================= */
96 /* ============================== public data ============================== */
97 /* ========================================================================= */
98 
99 
100 
101 
102 /* @data EmbPSigpos **********************************************************
103 **
104 ** Nucleus Sigpos object.
105 **
106 ** Holds data for a compiled signature position.
107 **
108 ** EmbPSigpos is implemented as a pointer to a C data structure.
109 ** (EmbPSigpos is a macro, not a typedef, so "const EmbPSigpos x" declares a
110 ** pointer to a const object and "EmbPSigpos a, b;" makes only a a pointer.)
111 **
112 ** @alias EmbSSigpos
113 ** @alias EmbOSigpos
114 **
115 **
116 ** @attr  gsiz    [ajuint*] Gap sizes
117 ** @attr  gpen    [float*]  Gap penalties
118 ** @attr  subs    [float*]  Residue match values
119 ** @attr  ngaps   [ajuint]  No. of gaps
120 ** @attr  Padding [char[4]] Padding to alignment boundary
121 **
122 ** @new embSigposNew Default Sigpos object constructor
123 ** @delete embSigposDel Default Sigpos object destructor
124 ** @@
125 ****************************************************************************/
126 
127 typedef struct EmbSSigpos
128 {
129     ajuint   *gsiz;
130     float   *gpen;
131     float   *subs;
132     ajuint  ngaps;
133     char    Padding[4];
134 } EmbOSigpos;
135 #define EmbPSigpos EmbOSigpos*
136 
137 
138 
139 
140 
141 /* @data EmbPSigdat **********************************************************
142 **
143 ** Nucleus Sigdat object.
144 **
145 ** Holds empirical data for an (uncompiled) signature position.
146 ** Important: Functions which manipulate this structure rely on the data in
147 ** the gap arrays (gsiz and gfrq) being filled in order of increasing gap
148 ** size.
149 **
150 ** EmbPSigdat is implemented as a pointer to a C data structure.
151 ** (EmbPSigdat is a macro, not a typedef, so "const EmbPSigdat x" declares a
152 ** pointer to a const object and "EmbPSigdat a, b;" makes only a a pointer.)
153 **
154 ** @alias EmbSSigdat
155 ** @alias EmbOSigdat
156 **
157 ** @attr  rids [AjPChar]   Residue id's
158 ** @attr  rfrq [AjPUint]   Residue frequencies
159 **
160 ** @attr  nres [ajuint]    No. diff. types of residue
161 ** @attr  nenv [ajuint]    No. diff. types of environment
162 ** @attr  eids [AjPStr*]   Environment id's
163 ** @attr  efrq [AjPUint]   Environment frequencies
164 **
165 ** @attr  gsiz [AjPUint]   Gap sizes
166 ** @attr  gfrq [AjPUint]   Frequencies of gaps of each size
167 ** @attr  ngap [ajuint]    No. diff. sizes of empirical gap
168 ** @attr  wsiz [ajuint]    Window size for this gap
169 **
170 ** @new embSigdatNew Default Sigdat object constructor
171 ** @delete embSigdatDel Default Sigdat object destructor
172 ** @@
173 ****************************************************************************/
174 
175 typedef struct EmbSSigdat
176 {
177 
178     AjPChar      rids;
179     AjPUint      rfrq;
180     ajuint       nres;
181     ajuint       nenv;
182     AjPStr      *eids;
183     AjPUint      efrq;
184 
185 
186     AjPUint      gsiz;
187     AjPUint      gfrq;
188 
189     ajuint       ngap;
190     ajuint       wsiz;
191 } EmbOSigdat;
192 #define EmbPSigdat EmbOSigdat*
193 
194 
195 
196 
197 
198 /* @data EmbPSignature *******************************************************
199 **
200 ** Nucleus Signature object.
201 **
202 ** EmbPSignature is implemented as a pointer to a C data structure.
203 ** (EmbPSignature is a macro, not a typedef, so "const EmbPSignature x" is a
204 ** pointer to a const object and "EmbPSignature a, b;" makes only a a pointer.)
205 **
206 ** @alias EmbSSignature
207 ** @alias EmbOSignature
208 **
209 ** @attr  Type [EmbESignatureType] NUCLEUS Signature Type enumeration
210 ** @attr  Typesig [EmbESignatureTypesig] Typesig enumeration: 1D or 3D,
211 ** for sequence or structure-based signatures respectively.
212 ** @attr  Class        [AjPStr]      SCOP classification.
213 ** @attr  Architecture [AjPStr]      CATH classification.
214 ** @attr  Topology     [AjPStr]      CATH classification.
215 ** @attr  Fold         [AjPStr]      SCOP classification.
216 ** @attr  Superfamily  [AjPStr]      SCOP classification.
217 ** @attr  Family       [AjPStr]      SCOP classification.
218 ** @attr  Sunid_Family [ajuint]       SCOP sunid for family.
219 ** @attr  npos         [ajuint]       No. of signature positions.
220 ** @attr  pos          [EmbPSigpos*]  Array of derived data for purposes of
221 **                                   alignment.
222 ** @attr  dat          [EmbPSigdat*]  Array of empirical data.
223 ** @attr  Id    [AjPStr]   Protein id code.
224 ** @attr  Domid [AjPStr]   Domain id code.
225 ** @attr  Ligid [AjPStr]   Ligand id code.
226 ** @attr  Desc  [AjPStr]   Description of ligand (ajLIGAND only)
227 ** @attr  ns    [ajuint]    No. of sites (ajLIGAND only)
228 ** @attr  sn    [ajuint]    Site number (ajLIGAND only)
229 ** @attr  np    [ajuint]    No. of patches (ajLIGAND only)
230 ** @attr  pn    [ajuint]    Patch number (ajLIGAND only)
231 ** @attr  minpatch  [ajuint]   Min. patch size (residues) (ajLIGAND only)
232 ** @attr  maxgap   [ajuint]    Max. gap distance (residues) (ajLIGAND only)
233 **         NOTE(review): these two were documented as Max. / Min. - confirm.
234 ** @new    embSignatureNew Default Signature constructor
235 ** @delete embSignatureDel Default Signature destructor
236 ** @output embSignatureWrite Write signature to file.
237 ** @input  embSignatureReadNew Construct a Signature object from reading a
238 **         file in embl-like format (see documentation for the DOMAINATRIX
239 **         "sigscan" application).
240 ** @output embSignatureWrite Write a Signature object to a file in embl-like
241 **         format (see documentation for the DOMAINATRIX "sigscan"
242 **         application).
243 ** @input  embSignatureHitsRead Construct a Hitlist object from reading a
244 **         signature hits file (see documentation for the DOMAINATRIX
245 **         "sigscan" application).
246 ** @output embSignatureHitsWrite Writes a list of Hit objects to a
247 **         signature hits file (see documentation for the DOMAINATRIX
248 **         "sigscan" application).
249 ** @modify embSignatureCompile Compiles a Signature object.  The signature
250 **         must first have been allocated by using the embSignatureNew
251 **         function.
252 ** @use    embSignatureAlignSeq Performs an alignment of a signature to a
253 **         protein sequence. The signature must have first been compiled by
254 **         calling embSignatureCompile.  Write a Hit object with the result.
255 ** @use    embSignatureAlignSeqall Performs an alignment of a signature to
256 **         protein sequences. The signature must have first been compiled by
257 **         calling embSignatureCompile.  Write a list of Hit objects with
258 **         the result.
259 ** @@
260 ****************************************************************************/
261 
262 typedef struct EmbSSignature
263 {
264     EmbESignatureType Type;
265     EmbESignatureTypesig Typesig;
266     AjPStr      Class;
267     AjPStr      Architecture;
268     AjPStr      Topology;
269     AjPStr      Fold;
270     AjPStr      Superfamily;
271     AjPStr      Family;
272     ajuint      Sunid_Family;
273     ajuint      npos;
274     EmbPSigpos *pos;
275     EmbPSigdat *dat;
276 
277     AjPStr    Id;
278     AjPStr    Domid;
279     AjPStr    Ligid;
280     AjPStr    Desc;
281     ajuint    ns;
282     ajuint    sn;
283     ajuint    np;
284     ajuint    pn;
285     ajuint    minpatch;
286     ajuint    maxgap;
287 } EmbOSignature;
288 #define EmbPSignature EmbOSignature*
289 
290 
291 
292 
293 
294 
295 /* @data EmbPHit *************************************************************
296 **
297 ** Nucleus hit object.
298 **
299 ** Holds data associated with a protein / domain sequence that is generated
300 ** and / or manipulated by the EMBOSS applications seqsearch, seqsort, and
301 ** sigscan.
302 **
303 ** EmbPHit is implemented as a pointer to a C data structure.
304 ** (EmbPHit is a macro, not a typedef, so "const EmbPHit x" declares a
305 ** pointer to a const object and "EmbPHit a, b;" makes only a a pointer.)
306 **
307 ** @alias EmbSHit
308 ** @alias EmbOHit
309 **
310 ** @attr  Seq      [AjPStr]  Sequence as string.
311 ** @attr  Start    [ajuint]  Start of sequence or signature alignment relative
312 **                           to full length swissprot sequence, this is an
313 **                           index so starts at 0.
314 ** @attr  End      [ajuint]  End of sequence or signature alignment relative
315 **                           to full length swissprot sequence, this is an
316 **                           index so starts at 0.
317 ** @attr  Acc      [AjPStr]  Accession number of sequence entry.
318 ** @attr  Spr      [AjPStr]  Swissprot code of sequence entry.
319 ** @attr  Dom      [AjPStr]  SCOP or CATH database identifier code of entry.
320 ** @attr  Rank     [ajuint]   Rank order of hit
321 ** @attr  Score    [float]   Score of hit
322 ** @attr  Eval     [float]   E-value of hit
323 ** @attr  Pval     [float]   p-value of hit
324 **
325 ** @attr  Typeobj  [AjPStr]  Primary (objective) classification of hit.
326 ** @attr  Typesbj  [AjPStr]  Secondary (subjective) classification of hit
327 ** @attr  Model    [AjPStr]  String for model type if used, one of
328 **  PSIBLAST, HMMER, SAM, SPARSE, HENIKOFF or GRIBSKOV
329 **
330 ** @attr  Alg      [AjPStr]  Alignment, e.g. of a signature to the sequence
331 ** @attr  Group    [AjPStr]  Grouping of hit, e.g. 'REDUNDANT' or
332 **                           'NON_REDUNDANT'
333 ** @attr  Target   [AjBool]  Used for garbage collection.
334 ** @attr  Target2  [AjBool]  Also used for garbage collection.
335 ** @attr  Sig      [EmbPSignature] Pointer to signature object for which hit
336 **                           was generated. Used as a pointer only - memory is
337 **                           never freed or allocated to it.
338 ** @attr  Priority [AjBool]  Also used for garbage collection.
339 ** @attr  Padding  [char[4]]  Padding to alignment boundary
340 **
341 **
342 **
343 ** @new    embHitNew Default Hit constructor
344 ** @new    embHitReadFasta  Construct Hit object from reading the next entry
345 **         from a file in extended FASTA format (see documentation for the
346 **         DOMAINATRIX "seqsearch" application).
347 ** @delete embHitDel Default Hit destructor
348 ** @assign embHitMerge Create new Hit from merging two Hit objects
349 ** @use    embMatchScore Sort Hit objects by Score element.
350 ** @use    embMatchinvScore Sort (inverted order) Hit objects by Score
351 **         element.
352 ** @use    embMatchLigid Sort Hit objects by Ligid element in Sig element.
353 ** @use    embMatchSN Sort Hit objects (presumably by sn element in Sig
354 **         element, going by the name; confirm).
355 ** @use    embHitsOverlap Checks for overlap between two Hit objects.
356 **
357 ** @@
358 ****************************************************************************/
359 
360 typedef struct EmbSHit
361 {
362   AjPStr  Seq;
363   ajuint  Start;
364   ajuint  End;
365   AjPStr  Acc;
366   AjPStr  Spr;
367   AjPStr  Dom;
368   ajuint  Rank;
369   float   Score;
370   float  Eval;
371   float  Pval;
372 
373   AjPStr  Typeobj;
374   AjPStr  Typesbj;
375   AjPStr  Model;
376   AjPStr  Alg;
377   AjPStr  Group;
378   AjBool  Target;
379   AjBool  Target2;
380 
381   EmbPSignature Sig;
382   AjBool  Priority;
383   char    Padding[4];
384 } EmbOHit;
385 #define EmbPHit EmbOHit*
386 
387 
388 
389 
390 
391 
392 /* @data EmbPHitlist *********************************************************
393 **
394 ** Nucleus hitlist object.
395 **
396 ** Holds an array of hit structures and associated SCOP classification
397 ** records.
398 **
399 ** EmbPHitlist is implemented as a pointer to a C data structure.
400 ** (EmbPHitlist is a macro, not a typedef, so "const EmbPHitlist x" is a
401 ** pointer to a const object and "EmbPHitlist a, b;" makes only a a pointer.)
402 **
403 ** @alias EmbSHitlist
404 ** @alias EmbOHitlist
405 **
406 ** @attr  Class         [AjPStr]    SCOP classification.
407 ** @attr  Architecture  [AjPStr]    CATH classification.
408 ** @attr  Topology      [AjPStr]    CATH classification.
409 ** @attr  Fold          [AjPStr]    SCOP classification.
410 ** @attr  Superfamily   [AjPStr]    SCOP classification.
411 ** @attr  Family        [AjPStr]    SCOP classification.
412 ** @attr  Model         [AjPStr]    Documented as "SCOP classification";
413 **                                  presumably the model type - confirm.
414 ** @attr  Sunid_Family  [ajuint]     SCOP sunid for family.
415 ** @attr  Priority      [AjBool]    True if the Hitlist is high priority.
416 ** @attr  hits          [EmbPHit*]  Array of hits.
417 ** @attr  Type          [EmbESignatureType] NUCLEUS Signature Type enumeration
418 ** @attr  N             [ajuint]    No. of hits.
419 **
420 ** @new    embHitlistNew Default Hitlist constructor
421 ** @delete embHitlistDel Default Hitlist destructor
422 ** @use    embHitlistMatchFold Sort Hitlist objects by Fold element
423 ** @input  embHitlistRead Construct Hitlist object from reading the next entry
424 **         from a file in embl-like format (see documentation for the
425 **         DOMAINATRIX "seqsearch" application).
426 ** @new    embHitlistReadFasta Construct Hitlist object from reading
427 **         the next entry
428 **         from a file in extended FASTA format (see documentation for the
429 **         DOMAINATRIX "seqsearch" application).
430 ** @input  embHitlistReadNode Construct Hitlist object from reading a specific
431 **         entry from a file in embl-like format (see documentation for the
432 **         DOMAINATRIX "seqsearch" application).
433 ** @new    embHitlistReadNodeFasta Construct Hitlist object from reading
434 **         a specific entry from a file in extended FASTA format
435 **         (see documentation for the DOMAINATRIX "seqsearch" application).
436 ** @output embHitlistWrite Write Hitlist to file in embl-like format (see
437 **         documentation for the DOMAINATRIX "seqsearch" application).
438 ** @output embHitlistWriteSubset Write a subset of a Hitlist to file in
439 **         embl-like format (see documentation for the DOMAINATRIX "seqsearch"
440 **         application).
441 ** @output embHitlistWriteFasta Write Hitlist to file in extended FASTA format
442 **         (see documentation for the DOMAINATRIX "seqsearch" application).
443 ** @output embHitlistWriteSubsetFasta Write a subset of a Hitlist to file in
444 **         extended FASTA format (see documentation for the DOMAINATRIX
445 **         "seqsearch" application).
446 ** @output embHitlistWriteHitFasta Write a single Hit from a Hitlist to file
447 **         in extended FASTA format (see documentation for the DOMAINATRIX
448 **         "seqsearch" application).
449 ** @use    embHitlistClassify Classifies a list of signature-sequence hits
450 **         (held in a Hitlist object) according to list of target sequences
451 **         (a list of Hitlist objects).
452 ** @@
453 ****************************************************************************/
454 
455 typedef struct EmbSHitlist
456 {
457     AjPStr   Class;
458     AjPStr   Architecture;
459     AjPStr   Topology;
460     AjPStr   Fold;
461     AjPStr   Superfamily;
462     AjPStr   Family;
463     AjPStr   Model;
464     ajuint   Sunid_Family;
465     AjBool   Priority;
466     EmbPHit *hits;
467     EmbESignatureType Type;
468     ajuint   N;
469 } EmbOHitlist;
470 #define EmbPHitlist EmbOHitlist*
471 
472 
473 
474 
475 
476 /* ========================================================================= */
477 /* =========================== public functions ============================ */
478 /* ========================================================================= */
479 
480 
481 
482 /*
483 ** Prototype definitions
484 */
485 
486 
487 /* ======================================================================= */
488 /* =========================== Sigdat object ============================= */
489 /* ======================================================================= */
/*
** embSigdatNew  Default Sigdat object constructor; takes nres and ngap
**               counts (presumably sizing the residue and gap arrays -
**               confirm against the implementation).
** embSigdatDel  Default Sigdat object destructor; takes the address of the
**               caller's object pointer.
*/
490 EmbPSigdat   embSigdatNew(ajuint nres, ajuint ngap);
491 void         embSigdatDel(EmbPSigdat *pthis);
492 
493 
494 
495 
496 /* ======================================================================= */
497 /* =========================== Sigpos object ============================= */
498 /* ======================================================================= */
/*
** embSigposNew  Default Sigpos object constructor; takes an ngap count
**               (presumably sizing the gap arrays - confirm against the
**               implementation).
** embSigposDel  Default Sigpos object destructor; takes the address of the
**               caller's object pointer.
*/
499 EmbPSigpos   embSigposNew(ajuint ngap);
500 void         embSigposDel(EmbPSigpos *thys);
501 
502 
503 
504 
505 /* ======================================================================= */
506 /* ========================== Signature object =========================== */
507 /* ======================================================================= */
/*
** embSignatureNew         Default Signature constructor.
** embSignatureDel         Default Signature destructor.
** embSignatureReadNew     Construct a Signature object from reading a file
**                         in embl-like format.
** embSignatureWrite       Write a Signature object to a file in embl-like
**                         format.
** embSignatureCompile     Compile a Signature object, which must first have
**                         been allocated with embSignatureNew.
** embSignatureAlignSeq    Align a compiled signature to a protein sequence;
**                         writes a Hit object with the result.
** embSignatureAlignSeqall Align a compiled signature to protein sequences;
**                         writes a list of Hit objects with the result.
** embSignatureHitsWrite   Write a list of Hit objects to a signature hits
**                         file.
** embSignatureHitsRead    Construct a Hitlist object from reading a
**                         signature hits file.
**
** EmbPSignature and EmbPHitlist are macros expanding to "Type*", so the
** "const EmbPSignature" / "const EmbPHitlist" parameters below are pointers
** to const objects.
*/
508 EmbPSignature embSignatureNew(ajuint n);
509 void          embSignatureDel(EmbPSignature *ptr);
510 EmbPSignature embSignatureReadNew(AjPFile inf);
511 AjBool        embSignatureWrite(AjPFile outf, const EmbPSignature obj);
512 AjBool        embSignatureCompile(EmbPSignature *S, float gapo, float gape,
513 				  const AjPMatrixf matrix);
514 AjBool        embSignatureAlignSeq(const EmbPSignature S, const AjPSeq seq,
515 				   EmbPHit *hit,
516 				   ajuint nterm);
517 AjBool        embSignatureAlignSeqall(const EmbPSignature sig, AjPSeqall db,
518 				      ajuint n, EmbPHitlist *hitlist,
519 				      ajuint nterm);
520 AjBool        embSignatureHitsWrite(AjPFile outf, const EmbPSignature sig,
521 				    const EmbPHitlist hitlist, ajuint n);
522 EmbPHitlist   embSignatureHitsRead(AjPFile inf);
523 
524 
525 
526 
527 
528 /* ======================================================================= */
529 /* ============================= Hit object ============================== */
530 /* ======================================================================= */
/*
** embHitNew         Default Hit constructor.
** embHitReadFasta   Construct a Hit object from reading the next entry from
**                   a file in extended FASTA format.
** embHitDel         Default Hit destructor.
** embHitMerge       Create a new Hit from merging two Hit objects.
** embHitsOverlap    Check for overlap between two Hit objects.
** embMatchScore     Sort Hit objects by Score element.
** embMatchinvScore  Sort (inverted order) Hit objects by Score element.
** embMatchLigid     Sort Hit objects by Ligid element in Sig element.
** embMatchSN        Sort Hit objects (presumably by sn element in Sig
**                   element, going by the name; confirm).
**
** The embMatch* functions take two const void pointers and return ajint,
** i.e. they have the shape of sort comparison callbacks.
*/
531 EmbPHit       embHitNew(void);
532 
533 EmbPHit       embHitReadFasta(AjPFile inf);
534 
535 void          embHitDel(EmbPHit *ptr);
536 
537 EmbPHit       embHitMerge(const EmbPHit hit1,
538 			  const EmbPHit hit2);
539 
540 AjBool        embHitsOverlap(const EmbPHit hit1,
541 			     const EmbPHit hit2,
542 			     ajuint n);
543 
544 ajint         embMatchScore(const void *hit1,
545 			    const void *hit2);
546 
547 ajint         embMatchinvScore(const void *hit1,
548 			       const void *hit2);
549 
550 ajint         embMatchLigid(const void *hit1,
551 			    const void *hit2);
552 
553 ajint         embMatchSN(const void *hit1,
554 			 const void *hit2);
555 
556 
557 /* ======================================================================= */
558 /* =========================== Hitlist object ============================ */
559 /* ======================================================================= */
/*
** embHitlistNew              Default Hitlist constructor.
** embHitlistDel              Default Hitlist destructor.
** embHitlistRead             Construct a Hitlist object from reading the
**                            next entry from a file in embl-like format.
** embHitlistReadFasta        As embHitlistRead, for extended FASTA format.
** embHitlistWrite            Write a Hitlist to file in embl-like format.
** embHitlistWriteSubset      Write a subset of a Hitlist to file in
**                            embl-like format.
** embHitlistWriteFasta       Write a Hitlist to file in extended FASTA
**                            format.
** embHitlistWriteSubsetFasta Write a subset of a Hitlist to file in
**                            extended FASTA format.
** embHitlistWriteHitFasta    Write a single Hit from a Hitlist to file in
**                            extended FASTA format.
** embHitlistReadNode         Read a specific entry from a file in embl-like
**                            format; note that it returns an AjPList.
** embHitlistReadNodeFasta    Read a specific entry from a file in extended
**                            FASTA format; note that it returns an AjPList.
** embHitlistClassify         Classify a list of signature-sequence hits
**                            (held in a Hitlist object) according to a list
**                            of target sequences (a list of Hitlist objects).
** embHitlistMatchFold        Sort Hitlist objects by Fold element.
** embSigExit                 Not described in this header; presumably
**                            module clean-up on exit - confirm.
*/
560 
561 EmbPHitlist   embHitlistNew(ajuint n);
562 
563 void          embHitlistDel(EmbPHitlist *ptr);
564 
565 EmbPHitlist   embHitlistRead(AjPFile inf);
566 
567 EmbPHitlist   embHitlistReadFasta(AjPFile inf);
568 
569 AjBool        embHitlistWrite(AjPFile outf,
570 			      const EmbPHitlist obj);
571 
572 AjBool        embHitlistWriteSubset(AjPFile outf,
573 				    const EmbPHitlist obj,
574 				    const AjPUint ok);
575 
576 AjBool        embHitlistWriteFasta(AjPFile outf,
577 				   const EmbPHitlist obj);
578 
579 AjBool        embHitlistWriteSubsetFasta(AjPFile outf,
580 					 const EmbPHitlist obj,
581 					 const AjPUint ok);
582 
583 AjBool        embHitlistWriteHitFasta(AjPFile outf,
584 				      ajuint n,
585 				      const EmbPHitlist obj);
586 
587 AjPList       embHitlistReadNode(AjPFile inf,
588 				 const AjPStr fam,
589 				 const AjPStr sfam,
590 				 const AjPStr fold,
591 				 const AjPStr klass);
592 
593 AjPList       embHitlistReadNodeFasta(AjPFile inf,
594 				      const AjPStr fam,
595 				      const AjPStr sfam,
596 				      const AjPStr fold,
597 				      const AjPStr klass);
598 
599 AjBool        embHitlistClassify(EmbPHitlist hits,
600 				 const AjPList targets,
601 				 ajuint thresh);
602 
603 ajint         embHitlistMatchFold(const void *hit1,
604 				  const void *hit2);
605 
606 
607 void          embSigExit(void);
608 
609 /*
610 ** End of prototype definitions
611 */
612 
613 AJ_END_DECLS
614 
615 #endif  /* !EMBSIG_H */
616 
617 
618 
619 
620 
621 
622 
623 
624 
625 
626 
627 
628 
629