1 /* @source embprop ************************************************************
2 **
3 ** Residue/sequence properties
4 **
5 ** @author Copyright (c) 1999 Alan Bleasby
6 ** @version $Revision: 1.54 $
7 ** @modified 24 Nov 1999 - GWW - Added embPropProtGaps and embPropProt1to3
8 ** @modified 1 Sept 2000 - GWW - Added embPropTransition embPropTranversion
9 ** @modified 4 July 2001 - DMAM - Modified embPropAminoRead embPropCalcMolwt
10 ** @modified 4 July 2001 - DMAM - Added embPropCalcMolwtMod
11 ** @modified 1 July 2008 - JISON - Added embPropGet* functions
12 ** @modified $Date: 2012/05/14 13:10:09 $ by $Author: uludag $
13 ** @@
14 **
15 ** This library is free software; you can redistribute it and/or
16 ** modify it under the terms of the GNU Lesser General Public
17 ** License as published by the Free Software Foundation; either
18 ** version 2.1 of the License, or (at your option) any later version.
19 **
20 ** This library is distributed in the hope that it will be useful,
21 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
22 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 ** Lesser General Public License for more details.
24 **
25 ** You should have received a copy of the GNU Lesser General Public
26 ** License along with this library; if not, write to the Free Software
27 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
28 ** MA 02110-1301, USA.
29 **
30 ******************************************************************************/
31
32
33 #include "ajlib.h"
34
35 #include "embprop.h"
36 #include "ajbase.h"
37 #include "ajfileio.h"
38 #include "ajlist.h"
39 #include "ajseq.h"
40
41 #include <stdio.h>
42 #include <math.h>
43 #include <string.h>
44 #include <ctype.h>
45
46
47 char dayhoffstr[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
48
49 float dayhoff[] =
50 {
51 (float) 8.6, (float) 0.0, (float) 2.9, (float) 5.5, (float) 6.0,
52 (float) 3.6, (float) 8.4, (float) 2.0, (float) 4.5, (float) 0.0,
53 (float) 6.6, (float) 7.4, (float) 1.7, (float) 4.3, (float) 0.0,
54 (float) 5.2, (float) 3.9, (float) 4.9, (float) 7.0, (float) 6.1,
55 (float) 0.0, (float) 6.6, (float) 1.3, (float) 0.9, (float) 3.4,
56 (float) 0.0
57 };
58
59
60
61 #define PROPENZTRYPSIN 0
62 #define PROPENZLYSC 1
63 #define PROPENZARGC 2
64 #define PROPENZASPN 3
65 #define PROPENZV8B 4
66 #define PROPENZV8P 5
67 #define PROPENZCHYMOT 6
68 #define PROPENZCNBR 7
69
70 #define RAG_MINPEPLEN 3
71
72 #define AMINODATFILE "Eamino.dat"
73
74
75 /* static AjBool propInit = 0;*/
76
77 static char propPurines[] = "agrAGR";
78 static char propPyrimidines[] = "ctuyCTUY";
79
80
81
82
83 static ajint propFragCompare(const void *a, const void *b);
84
85
86
87
88 /* @func embPropEaminoRead ****************************************************
89 **
90 ** Read amino acid properties from Eamino.dat
91 **
92 ** @param [u] mfptr [AjPFile] Input file object
93 ** @return [EmbPPropAmino*] array of amino acid properties
94 **
95 ** @release 6.0.0
96 ** @@
97 ******************************************************************************/
98
embPropEaminoRead(AjPFile mfptr)99 EmbPPropAmino* embPropEaminoRead(AjPFile mfptr)
100 {
101 AjPStr line = NULL;
102 AjPStr token = NULL;
103
104 AjBool firstline;
105
106 const char *p;
107 ajuint i;
108 ajint n;
109
110 EmbPPropAmino *ret;
111
112 line = ajStrNew();
113 token = ajStrNew();
114
115 firstline = ajTrue;
116
117 AJCNEW0(ret,EMBPROPSIZE);
118
119 for(i=0; i < EMBPROPSIZE; ++i)
120 AJNEW0(ret[i]);
121
122 while(ajReadline(mfptr, &line))
123 {
124 ajStrRemoveWhiteExcess(&line);
125 p = ajStrGetPtr(line);
126
127 if(*p=='#' || *p=='!' || !*p)
128 continue;
129
130 if(firstline)
131 {
132 if(!ajStrPrefixC(line,"aa"))
133 ajFatal("Incorrect (old?) format amino data file");
134
135 firstline = ajFalse;
136 continue;
137 }
138
139 ajFmtScanS(line,"%S",&token);
140 ajStrFmtUpper(&token);
141
142 if(ajStrGetLen(token) != 1)
143 ajFatal("Amino file line doesn't begin with a single character");
144
145 i = ajBasecodeToInt((ajint) *ajStrGetPtr(token));
146
147 if(i == 27)
148 ajFatal("Amino file line doesn't begin with a single A->Z (%S)",
149 line);
150
151 n = ajFmtScanS(line,"%*s%d%d%d%d%d%d%f%d%d%d",
152 &ret[i]->tiny,
153 &ret[i]->sm_all,
154 &ret[i]->aliphatic,
155 &ret[i]->aromatic,
156 &ret[i]->nonpolar,
157 &ret[i]->polar,
158 &ret[i]->charge,
159 &ret[i]->pve,
160 &ret[i]->nve,
161 &ret[i]->extcoeff);
162 if(n!= 10)
163 ajFatal("Only %d columns in amino file - expected %d",n+1,11);
164 }
165
166 ajStrDel(&line);
167 ajStrDel(&token);
168
169 return ret;
170 }
171
172
173
174
175 /* @func embPropGetProperties *************************************************
176 **
177 ** Returns a string containing a list of defined properties
178 **
179 ** @param [r] prop [const EmbPPropAmino] Input properties object
180 ** @param [w] Pstr [AjPStr*] String of properties separated by commas
181 ** @return [AjBool] True if properties are defined
182 **
183 ** @release 6.0.0
184 ** @@
185 ******************************************************************************/
186
embPropGetProperties(const EmbPPropAmino prop,AjPStr * Pstr)187 AjBool embPropGetProperties(const EmbPPropAmino prop, AjPStr* Pstr)
188 {
189 ajStrAssignC(Pstr, "");
190
191 if(prop->tiny)
192 ajStrAppendC(Pstr, "tiny,");
193
194 if(prop->sm_all)
195 ajStrAppendC(Pstr, "small,");
196
197 if(prop->aliphatic)
198 ajStrAppendC(Pstr, "aliphatic,");
199
200 if(prop->aromatic)
201 ajStrAppendC(Pstr, "aromatic,");
202
203 if(prop->polar)
204 ajStrAppendC(Pstr, "polar,");
205
206 if(prop->nonpolar)
207 ajStrAppendC(Pstr, "nonpolar,");
208
209 ajStrTrimEndC(Pstr, ",");
210
211 if(!ajStrGetLen(*Pstr))
212 return ajFalse;
213
214 return ajTrue;
215 }
216
217
218
219
220 /* @func embPropEmolwtRead ****************************************************
221 **
222 ** Read molecular weights from Emolwt.dat
223 **
224 ** @param [u] mfptr [AjPFile] Input file object
225 ** @return [EmbPPropMolwt*] array of amino acid molecular weights
226 **
227 ** @release 6.0.0
228 ** @@
229 ******************************************************************************/
230
embPropEmolwtRead(AjPFile mfptr)231 EmbPPropMolwt* embPropEmolwtRead(AjPFile mfptr)
232 {
233 AjPStr line = NULL;
234 AjPStr token = NULL;
235
236 AjBool firstline;
237
238 const char *p;
239 ajuint i;
240 ajint n;
241
242 EmbPPropMolwt *ret;
243
244 line = ajStrNew();
245 token = ajStrNew();
246
247 firstline = ajTrue;
248
249 AJCNEW0(ret,EMBPROPSIZE+2);
250
251 for(i=0; i < EMBPROPSIZE+2; ++i)
252 AJNEW0(ret[i]);
253
254 while(ajReadline(mfptr, &line))
255 {
256 ajStrRemoveWhiteExcess(&line);
257 p = ajStrGetPtr(line);
258
259 if(*p=='#' || *p=='!' || !*p)
260 continue;
261
262 if(firstline)
263 {
264 if(!ajStrPrefixC(line,"Mol"))
265 ajFatal("Incorrect format molwt file: '%S'", line);
266
267 firstline = ajFalse;
268 continue;
269 }
270
271 ajFmtScanS(line,"%S",&token);
272 ajStrFmtUpper(&token);
273
274 if(ajStrGetLen(token) != 1)
275 {
276 if(ajStrPrefixC(token,"HYDROGEN"))
277 {
278 if(ajFmtScanS(line,"%*s%lf%lf",
279 &ret[EMBPROPHINDEX]->average,
280 &ret[EMBPROPHINDEX]->mono) != 2)
281 ajFatal("Bad format hydrogen data line");
282 }
283 else if(ajStrPrefixC(token,"OXYGEN"))
284 {
285 if(ajFmtScanS(line,"%*s%lf%lf",
286 &ret[EMBPROPOINDEX]->average,
287 &ret[EMBPROPOINDEX]->mono) != 2)
288 ajFatal("Bad format oxygen data line");
289 }
290 else if(ajStrPrefixC(token,"WATER"))
291 {
292 if(ajFmtScanS(line,"%*s%lf%lf",
293 &ret[EMBPROPWINDEX]->average,
294 &ret[EMBPROPWINDEX]->mono) != 2)
295 ajFatal("Bad format water data line");
296 }
297 else
298 ajFatal("Unknown molwt token %S",token);
299
300 continue;
301 }
302
303
304 i = ajBasecodeToInt((ajint) *ajStrGetPtr(token));
305
306 if(i == 27)
307 ajFatal("Molwt file line doesn't begin with a single A->Z (%S)",
308 line);
309
310 n = ajFmtScanS(line,"%*s%lf%lf",
311 &ret[i]->average,
312 &ret[i]->mono);
313 if(n != 2)
314 ajFatal("Only %d columns in amino file - expected %d",n,3);
315 }
316
317 ajStrDel(&line);
318 ajStrDel(&token);
319
320 return ret;
321 }
322
323
324
325
326 /* @func embPropMolwtGetMolwt *************************************************
327 **
328 ** Return molecular weight
329 **
330 ** @param [r] prop [const EmbPPropMolwt] Input molecular weights object
331 ** @return [float] molecular weight
332 **
333 ** @release 6.0.0
334 ** @@
335 ******************************************************************************/
336
embPropMolwtGetMolwt(const EmbPPropMolwt prop)337 float embPropMolwtGetMolwt(const EmbPPropMolwt prop)
338 {
339 float ret;
340
341 ret = (float) prop->average; /* satisfy VC++ */
342
343 return ret;
344 }
345
346
347
348
349 /* @func embPropGetCharge *****************************************************
350 **
351 ** Return charge value
352 **
353 ** @param [r] prop [const EmbPPropAmino] Input properties object
354 ** @return [float] charge
355 **
356 ** @release 6.0.0
357 ** @@
358 ******************************************************************************/
359
embPropGetCharge(const EmbPPropAmino prop)360 float embPropGetCharge(const EmbPPropAmino prop)
361 {
362 return prop->charge;
363 }
364
365
366
367
368 /* @func embPropGetTiny *******************************************************
369 **
370 ** Return tiny value
371 **
372 ** @param [r] prop [const EmbPPropAmino] Input properties object
373 ** @return [ajint] tiny
374 **
375 ** @release 6.0.0
376 ** @@
377 ******************************************************************************/
378
embPropGetTiny(const EmbPPropAmino prop)379 ajint embPropGetTiny(const EmbPPropAmino prop)
380 {
381 return prop->tiny;
382 }
383
384
385
386
387 /* @func embPropGetSmall ******************************************************
388 **
389 ** Return small value
390 **
391 ** @param [r] prop [const EmbPPropAmino] Input properties object
392 ** @return [ajint] small
393 **
394 ** @release 6.0.0
395 ** @@
396 ******************************************************************************/
397
embPropGetSmall(const EmbPPropAmino prop)398 ajint embPropGetSmall(const EmbPPropAmino prop)
399 {
400 return prop->sm_all;
401 }
402
403
404
405
406 /* @func embPropGetAliphatic **************************************************
407 **
408 ** Return aliphatic value
409 **
410 ** @param [r] prop [const EmbPPropAmino] Input properties object
411 ** @return [ajint] aliphatic
412 **
413 ** @release 6.0.0
414 ** @@
415 ******************************************************************************/
416
embPropGetAliphatic(const EmbPPropAmino prop)417 ajint embPropGetAliphatic(const EmbPPropAmino prop)
418 {
419 return prop->aliphatic;
420 }
421
422
423
424
425 /* @func embPropGetAromatic ***************************************************
426 **
427 ** Return aromatic value
428 **
429 ** @param [r] prop [const EmbPPropAmino] Input properties object
430 ** @return [ajint] aromatic
431 **
432 ** @release 6.0.0
433 ** @@
434 ******************************************************************************/
435
embPropGetAromatic(const EmbPPropAmino prop)436 ajint embPropGetAromatic(const EmbPPropAmino prop)
437 {
438 return prop->aromatic;
439 }
440
441
442
443
444 /* @func embPropGetNonpolar ***************************************************
445 **
446 ** Return nonpolar value
447 **
448 ** @param [r] prop [const EmbPPropAmino] Input properties object
449 ** @return [ajint] nonpolar
450 **
451 ** @release 6.0.0
452 ** @@
453 ******************************************************************************/
454
embPropGetNonpolar(const EmbPPropAmino prop)455 ajint embPropGetNonpolar(const EmbPPropAmino prop)
456 {
457 return prop->nonpolar;
458 }
459
460
461
462
463 /* @func embPropGetPolar ******************************************************
464 **
465 ** Return polar value
466 **
467 ** @param [r] prop [const EmbPPropAmino] Input properties object
468 ** @return [ajint] polar
469 **
470 ** @release 6.0.0
471 ** @@
472 ******************************************************************************/
473
embPropGetPolar(const EmbPPropAmino prop)474 ajint embPropGetPolar(const EmbPPropAmino prop)
475 {
476 return prop->polar;
477 }
478
479
480
481
482 /* @func embPropGetPve ********************************************************
483 **
484 ** Return pve value
485 **
486 ** @param [r] prop [const EmbPPropAmino] Input properties object
487 ** @return [ajint] pve
488 **
489 ** @release 6.0.0
490 ** @@
491 ******************************************************************************/
492
embPropGetPve(const EmbPPropAmino prop)493 ajint embPropGetPve(const EmbPPropAmino prop)
494 {
495 return prop->pve;
496 }
497
498
499
500
501 /* @func embPropGetNve ********************************************************
502 **
503 ** Return nve value
504 **
505 ** @param [r] prop [const EmbPPropAmino] Input properties object
506 ** @return [ajint] nve
507 **
508 ** @release 6.0.0
509 ** @@
510 ******************************************************************************/
511
embPropGetNve(const EmbPPropAmino prop)512 ajint embPropGetNve(const EmbPPropAmino prop)
513 {
514 return prop->nve;
515 }
516
517
518
519
520 /* @func embPropGetExtcoeff ***************************************************
521 **
522 ** Return extcoeff value
523 **
524 ** @param [r] prop [const EmbPPropAmino] Input properties object
525 ** @return [ajint] extcoeff
526 **
527 ** @release 6.0.0
528 ** @@
529 ******************************************************************************/
530
embPropGetExtcoeff(const EmbPPropAmino prop)531 ajint embPropGetExtcoeff(const EmbPPropAmino prop)
532 {
533 return prop->extcoeff;
534 }
535
536
537
538
539 /* @func embPropCalcMolwt ****************************************************
540 **
541 ** Calculate the molecular weight of a protein sequence
542 ** This is a shell around embPropCalcMolwtMod using water as the modifier.
543 **
544 ** @param [r] s [const char *] sequence
545 ** @param [r] start [ajint] start position
546 ** @param [r] end [ajint] end position
547 ** @param [r] mwdata [EmbPPropMolwt const *] molecular weight data
548 ** @param [r] mono [AjBool] true for monoisotopic values
549 **
550 ** @return [double] molecular weight
551 **
552 ** @release 1.0.0
553 ** @@
554 ******************************************************************************/
555
embPropCalcMolwt(const char * s,ajint start,ajint end,EmbPPropMolwt const * mwdata,AjBool mono)556 double embPropCalcMolwt(const char *s, ajint start, ajint end,
557 EmbPPropMolwt const *mwdata, AjBool mono)
558 {
559 double nmass = 0.;
560 double cmass = 0.;
561
562 nmass = (mono) ? mwdata[EMBPROPHINDEX]->mono :
563 mwdata[EMBPROPHINDEX]->average;
564
565 cmass = (mono) ? mwdata[EMBPROPOINDEX]->mono + nmass :
566 mwdata[EMBPROPOINDEX]->average + nmass;
567
568
569 return embPropCalcMolwtMod(s,start,end,mwdata,mono,nmass,cmass);
570 }
571
572
573
574
575 /* @func embPropCalcMolwtMod *************************************************
576 **
577 ** Calculate the molecular weight of a protein sequence
578 ** with chemically modified termini
579 **
580 ** @param [r] s [const char *] sequence
581 ** @param [r] start [ajint] start position
582 ** @param [r] end [ajint] end position
583 ** @param [r] mwdata [EmbPPropMolwt const *] molecular weight data
584 ** @param [r] mono [AjBool] true for monoisotopic values
585 ** @param [r] nmass [double] mass of the N-terminal group
586 ** @param [r] cmass [double] mass of the C-terminal group
587 **
588 ** @return [double] molecular weight
589 **
590 ** @release 2.1.0
591 ** @@
592 ******************************************************************************/
593
embPropCalcMolwtMod(const char * s,ajint start,ajint end,EmbPPropMolwt const * mwdata,AjBool mono,double nmass,double cmass)594 double embPropCalcMolwtMod(const char *s, ajint start, ajint end,
595 EmbPPropMolwt const *mwdata, AjBool mono,
596 double nmass, double cmass)
597 {
598 const char *p;
599 double sum;
600 ajint i;
601 ajint len;
602 ajint idx;
603 double mw = 0.;
604
605 len = end - start + 1;
606
607 p = s + start;
608 sum = 0.0;
609
610 for(i=0;i<len;++i)
611 {
612 idx = ajBasecodeToInt(toupper((ajint)p[i]));
613 mw = (mono) ? mwdata[idx]->mono : mwdata[idx]->average;
614
615 sum += mw;
616 }
617
618 return sum + nmass + cmass;
619 }
620
621
622
623
624 /* @func embPropCalcMolextcoeff************************************************
625 **
626 ** Calculate the molecular extinction coefficient of a protein sequence
627 **
628 ** @param [r] s [const char *] sequence
629 ** @param [r] start [ajint] start position
630 ** @param [r] end [ajint] end position
631 ** @param [r] cystine [AjBool] Treat C residues as cystine pairs
632 ** @param [r] aadata [EmbPPropAmino const *] amino acid data
633 **
634 ** @return [double] molar extinction coefficient
635 **
636 ** @release 2.8.0
637 ** @@
638 ******************************************************************************/
639
embPropCalcMolextcoeff(const char * s,ajint start,ajint end,AjBool cystine,EmbPPropAmino const * aadata)640 double embPropCalcMolextcoeff(const char *s, ajint start, ajint end,
641 AjBool cystine, EmbPPropAmino const *aadata)
642 {
643
644 const char *p;
645 char aa;
646 double sum;
647 ajint i;
648 ajint len;
649 ajuint havecystine = 0;
650
651 len = end-start+1;
652
653 p = s+start;
654 sum = 0.0;
655
656 for(i=0;i<len;++i)
657 {
658 aa = toupper((ajint)p[i]);
659 if(aa == 'C')
660 {
661 if(!cystine) continue;
662 havecystine++;
663 if(havecystine % 2)
664 continue;
665 }
666 sum += (double) aadata[ajBasecodeToInt(aa)]->extcoeff;
667 }
668
669 return sum;
670 }
671
672
673
674
675 /* @func embPropCharToThree **************************************************
676 **
677 ** Return 3 letter amino acid code A=Ala B=Asx C=Cys etc
678 **
679 ** @param [r] c [char] integer code
680 **
681 ** @return [const char*] three letter amino acid code
682 **
683 ** @release 1.0.0
684 ** @@
685 ******************************************************************************/
686
embPropCharToThree(char c)687 const char* embPropCharToThree(char c)
688 {
689 return embPropIntToThree(ajBasecodeToInt(c));
690 }
691
692
693
694
695 /* @func embPropIntToThree ***************************************************
696 **
697 ** Return 3 letter amino acid code 0=Ala 1=Asx 2=Cys etc
698 **
699 ** @param [r] c [ajint] integer code
700 **
701 ** @return [const char*] three letter amino acid code
702 **
703 ** @release 1.0.0
704 ** @@
705 ******************************************************************************/
706
embPropIntToThree(ajint c)707 const char* embPropIntToThree(ajint c)
708 {
709 static const char *tab[]=
710 {
711 "Ala","Asx","Cys","Asp","Glu","Phe","Gly","His","Ile","---","Lys",
712 "Leu","Met","Asn","---","Pro","Gln","Arg","Ser","Thr","---",
713 "Val","Trp","Xaa","Tyr","Glx"
714 };
715
716 return tab[c];
717 }
718
719
720
721
722 /* @func embPropCalcFragments ************************************************
723 **
724 ** Read amino acd properties
725 **
726 ** @param [r] s [const char *] sequence
727 ** @param [r] n [ajint] "enzyme" number
728 ** @param [w] l [AjPList *] list to push hits to
729 ** @param [w] pa [AjPList *] list to push partial hits to
730 ** @param [r] unfavoured [AjBool] allow unfavoured cuts
731 ** @param [r] overlap [AjBool] show overlapping partials
732 ** @param [r] allpartials [AjBool] show all possible partials
733 ** @param [w] ncomp [ajint *] number of complete digest fragments
734 ** @param [w] npart [ajint *] number of partial digest fragments
735 ** @param [w] rname [AjPStr *] name of reagent
736 ** @param [r] nterm [AjBool] nterm ragging
737 ** @param [r] cterm [AjBool] cterm ragging
738 ** @param [r] dorag [AjBool] true if ragging
739 ** @param [r] mwdata [EmbPPropMolwt const *] molecular weight data
740 ** @param [r] mono [AjBool] true for monoisotopic weights
741 **
742 ** @return [void]
743 **
744 ** @release 1.0.0
745 ** @@
746 ******************************************************************************/
747
embPropCalcFragments(const char * s,ajint n,AjPList * l,AjPList * pa,AjBool unfavoured,AjBool overlap,AjBool allpartials,ajint * ncomp,ajint * npart,AjPStr * rname,AjBool nterm,AjBool cterm,AjBool dorag,EmbPPropMolwt const * mwdata,AjBool mono)748 void embPropCalcFragments(const char *s, ajint n,
749 AjPList *l, AjPList *pa,
750 AjBool unfavoured, AjBool overlap,
751 AjBool allpartials, ajint *ncomp, ajint *npart,
752 AjPStr *rname, AjBool nterm, AjBool cterm,
753 AjBool dorag, EmbPPropMolwt const *mwdata,
754 AjBool mono)
755 {
756 static const char *PROPENZReagent[]=
757 {
758 "Trypsin","Lys-C","Arg-C","Asp-N","V8-bicarb","V8-phosph",
759 "Chymotrypsin","CNBr"
760 };
761
762 static const char *PROPENZSite[]=
763 {
764 "KR","K","R","D","E","DE","FYWLM","M"
765 };
766
767 static const char *PROPENZAminoCarboxyl[]=
768 {
769 "CC","C","C","N","C","CC","CCCCC","C"
770 };
771
772 static const char *PROPENZUnfavoured[]=
773 {
774 "KRIFLP","P","P","","KREP","P","P",""
775 };
776
777
778 ajint i;
779 ajint j;
780 ajint lim;
781 ajint len;
782 AjPList t;
783 EmbPPropFrag fr;
784 ajint *begsa = NULL;
785 ajint *endsa = NULL;
786 double molwt;
787 double *molwtsa = NULL;
788 AjBool *afrag = NULL;
789 ajint mark;
790 ajint bwp;
791 ajint ewp;
792 ajint *ival;
793 ajint defcnt;
794
795 ajint it;
796 ajint st = 0;
797 ajint mt = 0;
798 ajint et = 0;
799
800 ajStrAssignC(rname,PROPENZReagent[n]);
801 defcnt = 0;
802 len = (ajint) strlen(s);
803
804 t = ajListNew(); /* Temporary list */
805
806
807 /* First get all potential cut points */
808 for(i=0;i<len;++i)
809 {
810 if(!strchr(PROPENZSite[n],s[i]))
811 continue;
812
813 if(len==i+1)
814 continue;
815
816 if(strchr(PROPENZUnfavoured[n],s[i+1])
817 && !unfavoured)
818 continue;
819
820 AJNEW0(ival);
821 *ival = i;
822 ajListPushAppend(t,(void *)ival);
823 ++defcnt;
824 }
825
826 if(defcnt)
827 {
828 AJCNEW(begsa,(defcnt+1));
829 AJCNEW(endsa,(defcnt+1));
830 AJCNEW(molwtsa,(defcnt+1));
831 AJCNEW(afrag,(defcnt+1));
832 }
833
834 for(i=0;i<defcnt;++i) /* Pop them into a temporary array */
835 {
836 ajListPop(t,(void **)&ival);
837 endsa[i] = *ival;
838 AJFREE(ival);
839 }
840
841
842 mark = 0;
843
844 for(i=0;i<defcnt;++i) /* Work out true starts, ends and molwts */
845 {
846 bwp = mark;
847 ewp = endsa[i];
848
849 if(strchr(PROPENZAminoCarboxyl[n],'N'))
850 --ewp;
851
852 molwt=embPropCalcMolwt(s,bwp,ewp,mwdata,mono);
853
854 if(n==PROPENZCNBR)
855 molwt -= (17.045 + 31.095);
856
857 begsa[i] = mark;
858 endsa[i] = ewp;
859 molwtsa[i] = molwt;
860 afrag[i] = ajFalse;
861 mark = ewp+1;
862 }
863
864 if(defcnt) /* Special treatment for last fragment */
865 {
866 molwt = embPropCalcMolwt(s,mark,len-1,mwdata,mono);
867 if(n==PROPENZCNBR)
868 molwt -= (17.045 + 31.095);
869 begsa[i] = mark;
870 endsa[i] = len-1;
871 molwtsa[i] = molwt;
872 afrag[i] = ajFalse;
873 ++defcnt;
874 }
875
876 /* Push the hits */
877 for(i=0;i<defcnt;++i)
878 {
879 if(dorag)
880 {
881 st = begsa[i];
882 et = endsa[i];
883
884 for(it=st+RAG_MINPEPLEN-1; it < et; ++it)
885 {
886 AJNEW0(fr);
887 fr->start = st;
888 fr->end = it;
889 fr->molwt = embPropCalcMolwt(s,st,it,mwdata,mono);
890
891 if(n == PROPENZCNBR)
892 fr->molwt -= (17.045 + 31.095);
893
894 fr->isfrag = ajTrue;
895 ajListPush(*l,(void *)fr);
896 }
897 }
898
899 AJNEW0(fr);
900 fr->start = begsa[i];
901 fr->end = endsa[i];
902 fr->molwt = molwtsa[i];
903 fr->isfrag = afrag[i];
904 ajListPush(*l,(void *) fr);
905
906 if(dorag && nterm)
907 for(it=st+1; it < et-RAG_MINPEPLEN+2; ++it)
908 {
909 AJNEW0(fr);
910 fr->start = it;
911 fr->end = et;
912 fr->molwt = embPropCalcMolwt(s,it,et,mwdata,mono);
913
914 if(n == PROPENZCNBR)
915 fr->molwt -= (17.045 + 31.095);
916
917 fr->isfrag = ajTrue;
918 ajListPush(*l,(void *)fr);
919 }
920 }
921
922 if(!dorag)
923 ajListSort(*l, &propFragCompare);
924 *ncomp = defcnt;
925
926
927 /* Now deal with overlaps */
928 *npart = 0;
929
930 lim = defcnt -1;
931
932 if(overlap && !allpartials)
933 {
934 for(i=0;i<lim;++i)
935 {
936 if(dorag)
937 {
938 st = begsa[i];
939 mt = endsa[i];
940 et = endsa[i+1];
941
942 if(cterm)
943 for(it=mt+1; it < et; ++it)
944 {
945 AJNEW0(fr);
946 fr->start = st;
947 fr->end = it;
948 fr->molwt = embPropCalcMolwt(s,st,it,mwdata,mono);
949
950 if(n == PROPENZCNBR)
951 fr->molwt -= (17.045 + 31.095);
952
953 fr->isfrag = ajTrue;
954 ajListPush(*l,(void *)fr);
955 }
956 }
957
958 AJNEW0(fr);
959 fr->isfrag = ajTrue;
960 fr->molwt = embPropCalcMolwt(s,begsa[i],endsa[i+1],mwdata,mono);
961 if(n==PROPENZCNBR)
962 fr->molwt -= (17.045 + 31.095);
963 fr->start = begsa[i];
964 fr->end = endsa[i+1];
965 ajListPush(*pa,(void *)fr);
966 ++(*npart);
967
968 if(dorag && nterm)
969 for(it=st+1; it<mt; ++it)
970 {
971 AJNEW0(fr);
972 fr->start = it;
973 fr->end = et;
974 fr->molwt = embPropCalcMolwt(s,it,et,mwdata,mono);
975
976 if(n == PROPENZCNBR)
977 fr->molwt -= (17.045 + 31.095);
978
979 fr->isfrag = ajTrue;
980 ajListPush(*l,(void *)fr);
981 }
982 }
983
984 if(*npart) /* Remove complete sequence */
985 {
986 --(*npart);
987 ajListPop(*pa,(void **)&fr);
988 }
989
990 if(!dorag)
991 ajListSort(*pa, &propFragCompare);
992 }
993
994 if(allpartials)
995 {
996 lim = defcnt;
997
998 for(i=0;i<lim;++i)
999 for(j=i+1;j<lim;++j)
1000 {
1001 AJNEW0(fr);
1002 fr->isfrag = ajTrue;
1003 fr->molwt = embPropCalcMolwt(s,begsa[i],endsa[j],mwdata,mono);
1004
1005 if(n==PROPENZCNBR)
1006 fr->molwt -= (17.045 + 31.095);
1007
1008 fr->start = begsa[i];
1009 fr->end = endsa[j];
1010 ajListPush(*pa,(void *)fr);
1011 ++(*npart);
1012 }
1013
1014 if(*npart) /* Remove complete sequence */
1015 {
1016 --(*npart);
1017 ajListPop(*pa,(void **)&fr);
1018 }
1019
1020 if(!dorag)
1021 ajListSort(*pa, &propFragCompare);
1022 }
1023
1024 if(defcnt)
1025 {
1026 AJFREE(molwtsa);
1027 AJFREE(endsa);
1028 AJFREE(begsa);
1029 AJFREE(afrag);
1030 }
1031
1032 ajListFree(&t);
1033
1034 return;
1035 }
1036
1037
1038
1039
1040 /* @funcstatic propFragCompare ***********************************************
1041 **
1042 ** compare two molecular weight AjPFrag list elements for sorting
1043 **
1044 ** @param [r] a [const void*] First element
1045 ** @param [r] b [const void*] Second element
1046 **
1047 ** @return [ajint] 0=equal +ve=(a greater than b) -ve=(a less than b)
1048 **
1049 ** @release 1.0.0
1050 ** @@
1051 ******************************************************************************/
1052
propFragCompare(const void * a,const void * b)1053 static ajint propFragCompare(const void *a, const void *b)
1054 {
1055 return (ajint)((*(EmbPPropFrag const *)b)->molwt -
1056 (*(EmbPPropFrag const *)a)->molwt);
1057 }
1058
1059
1060
1061
1062 /* @func embPropProtGaps ******************************************************
1063 **
1064 ** Creates a string of a protein sequence which has been padded out with
1065 ** two spaces after every residue to aid aligning a translation under a
1066 ** nucleic sequence
1067 **
1068 ** @param [u] seq [AjPSeq] protein sequence to add spaces into
1069 ** @param [r] pad [ajint] number of spaces to insert at the start of the result
1070 ** @return [AjPStr] New string with the padded sequence
1071 **
1072 ** @release 1.0.0
1073 ** @@
1074 ******************************************************************************/
1075
embPropProtGaps(AjPSeq seq,ajint pad)1076 AjPStr embPropProtGaps(AjPSeq seq, ajint pad)
1077 {
1078 const char *p;
1079 AjPStr temp;
1080 ajint i;
1081
1082 temp = ajStrNewRes(ajSeqGetLen(seq)*3 + pad+1);
1083
1084 /* put any required padding spaces at the start */
1085 for(i=0; i<pad; i++)
1086 ajStrAppendC(&temp, " ");
1087
1088
1089 for(p=ajSeqGetSeqC(seq); *p; p++)
1090 {
1091 ajStrAppendK(&temp, *p);
1092 ajStrAppendC(&temp, " ");
1093 }
1094
1095 return temp;
1096 }
1097
1098
1099
1100
1101 /* @func embPropProt1to3 ******************************************************
1102 **
1103 ** Creates a a 3-letter sequence protein string from single-letter sequence
1104 ** EMBOSS is bad at reading 3-letter sequences, but this may be useful
1105 ** when displaying translations.
1106 **
1107 ** @param [u] seq [AjPSeq] protein sequence to convert to 3-letter codes
1108 ** @param [r] pad [ajint] number of spaces to insert at the start of the result
1109 ** @return [AjPStr] string containing 3-letter protein sequence
1110 **
1111 ** @release 1.0.0
1112 ** @@
1113 ******************************************************************************/
1114
embPropProt1to3(AjPSeq seq,ajint pad)1115 AjPStr embPropProt1to3(AjPSeq seq, ajint pad)
1116 {
1117 const char *p;
1118 const char *p3;
1119 AjPStr temp;
1120 ajint i;
1121
1122 temp = ajStrNewRes(ajSeqGetLen(seq)*3 + pad+1);
1123
1124 /* put any required padding spaces at the start */
1125 for(i=0; i<pad; i++)
1126 ajStrAppendC(&temp, " ");
1127
1128
1129 for(p=ajSeqGetSeqC(seq); *p; p++)
1130 {
1131 if(*p == '*')
1132 ajStrAppendC(&temp, "***");
1133 else if(*p == '.')
1134 ajStrAppendC(&temp, "...");
1135 else if(*p == '-')
1136 ajStrAppendC(&temp, "---");
1137 else if(!isalpha((ajint)*p))
1138 ajStrAppendC(&temp, "???");
1139 else
1140 {
1141 p3 = embPropCharToThree(*p);
1142 ajStrAppendK(&temp, *p3);
1143 ajStrAppendK(&temp, *(p3+1));
1144 ajStrAppendK(&temp, *(p3+2));
1145 }
1146 }
1147
1148 return temp;
1149 }
1150
1151
1152
1153
1154 /* @func embPropProt1to3Rev ***************************************************
1155 **
1156 ** Creates a a 3-letter sequence protein string from single-letter sequence
1157 ** in the reverse direction.
1158 ** EMBOSS is bad at reading 3-letter sequences, but this may be useful
1159 ** when displaying translations.
1160 **
1161 ** @param [u] seq [AjPSeq] protein sequence to convert to 3-letter codes
1162 ** @param [r] pad [ajint] number of characters to skip at the start
1163 ** of the result
1164 ** @return [AjPStr] string containing 3-letter protein sequence
1165 **
1166 ** @release 6.1.0
1167 ** @@
1168 ******************************************************************************/
1169
embPropProt1to3Rev(AjPSeq seq,ajint pad)1170 AjPStr embPropProt1to3Rev(AjPSeq seq, ajint pad)
1171 {
1172 const char *p;
1173 const char *p3;
1174 AjPStr temp;
1175 ajint i=0;
1176
1177 temp = ajStrNewRes(ajSeqGetLen(seq)*3 + pad+1);
1178
1179 for(p=ajSeqGetSeqC(seq); *p; p++)
1180 {
1181 if(*p == '*')
1182 ajStrAppendC(&temp, "***");
1183 else if(*p == '.')
1184 ajStrAppendC(&temp, "...");
1185 else if(*p == '-')
1186 ajStrAppendC(&temp, "---");
1187 else if(!isalpha((ajint)*p))
1188 ajStrAppendC(&temp, "???");
1189 else
1190 {
1191 p3 = embPropCharToThree(*p);
1192
1193 if(i++)
1194 {
1195 ajStrAppendK(&temp, *(p3+2));
1196 ajStrAppendK(&temp, *(p3+1));
1197 ajStrAppendK(&temp, *p3);
1198 }
1199 else
1200 {
1201 if(pad >= 2)
1202 ajStrAppendK(&temp, *(p3+2));
1203
1204 if(pad >= 1)
1205 ajStrAppendK(&temp, *(p3+1));
1206
1207 ajStrAppendK(&temp, *p3);
1208 }
1209 }
1210 }
1211
1212 return temp;
1213 }
1214
1215
1216
1217
1218 /* @func embPropPurine ********************************************************
1219 **
1220 ** Returns ajTrue if the input base is a Purine.
1221 ** Returns ajFalse if it is a Pyrimidine or it is ambiguous.
1222 **
1223 ** @param [r] base [char] base
1224 ** @return [AjBool] return ajTrue if this is a Purine
1225 **
1226 ** @release 1.4.3
1227 ** @@
1228 ******************************************************************************/
1229
embPropPurine(char base)1230 AjBool embPropPurine(char base)
1231 {
1232 if(strchr(propPurines, (ajint)base))
1233 return ajTrue;
1234
1235 return ajFalse;
1236 }
1237
1238
1239
1240
1241 /* @func embPropPyrimidine ****************************************************
1242 **
1243 ** Returns ajTrue if the input base is a Pyrimidine.
1244 ** Returns ajFalse if it is a Purine or it is ambiguous.
1245 **
1246 ** @param [r] base [char] base
1247 ** @return [AjBool] return ajTrue if this is a Pyrimidine
1248 **
1249 ** @release 1.4.3
1250 ** @@
1251 ******************************************************************************/
1252
embPropPyrimidine(char base)1253 AjBool embPropPyrimidine(char base)
1254 {
1255 if(strchr(propPyrimidines, (ajint)base))
1256 return ajTrue;
1257
1258 return ajFalse;
1259 }
1260
1261
1262
1263
1264 /* @func embPropTransversion **************************************************
1265 **
1266 ** Returns ajTrue if the input two bases have undergone a tranversion.
1267 ** (Pyrimidine to Purine, or vice versa)
1268 ** Returns ajFalse if this is not a transversion or it can not be determined
1269 ** (e.g. no change A->A, transition C->T, unknown A->N)
1270 **
1271 ** @param [r] base1 [char] first base
1272 ** @param [r] base2 [char] second base
1273 ** @return [AjBool] return ajTrue if this is a transversion
1274 **
1275 ** @release 1.4.3
1276 ** @@
1277 ******************************************************************************/
1278
embPropTransversion(char base1,char base2)1279 AjBool embPropTransversion(char base1, char base2)
1280 {
1281 AjBool bu1;
1282 AjBool bu2;
1283 AjBool by1;
1284 AjBool by2;
1285
1286 bu1 = embPropPurine(base1);
1287 bu2 = embPropPurine(base2);
1288
1289 by1 = embPropPyrimidine(base1);
1290 by2 = embPropPyrimidine(base2);
1291
1292 ajDebug("base1 py = %b, pu = %b", bu1, by1);
1293 ajDebug("base2 py = %b, pu = %b", bu2, by2);
1294
1295
1296 /* not a purine or a pyrimidine - ambiguous - return ajFalse */
1297 if(!bu1 && !by1)
1298 return ajFalse;
1299
1300 if(!bu2 && !by2)
1301 return ajFalse;
1302
1303 ajDebug("embPropTransversion result = %d", (bu1 != bu2));
1304
1305 return (bu1 != bu2);
1306 }
1307
1308
1309
1310
1311 /* @func embPropTransition ****************************************************
1312 **
1313 ** Returns ajTrue if the input two bases have undergone a transition.
1314 ** (Pyrimidine to Pyrimidine, or Purine to Purine)
1315 ** Returns ajFalse if this is not a transition or it can not be determined
1316 ** (e.g. no change A->A, transversion A->T, unknown A->N)
1317 **
1318 ** @param [r] base1 [char] first base
1319 ** @param [r] base2 [char] second base
1320 ** @return [AjBool] return ajTrue if this is a transition
1321 **
1322 ** @release 1.4.3
1323 ** @@
1324 ******************************************************************************/
1325
embPropTransition(char base1,char base2)1326 AjBool embPropTransition(char base1, char base2)
1327 {
1328 AjBool bu1;
1329 AjBool bu2;
1330 AjBool by1;
1331 AjBool by2;
1332
1333 bu1 = embPropPurine(base1);
1334 bu2 = embPropPurine(base2);
1335
1336 by1 = embPropPyrimidine(base1);
1337 by2 = embPropPyrimidine(base2);
1338
1339 ajDebug("base1 py = %b, pu = %b", bu1, by1);
1340 ajDebug("base2 py = %b, pu = %b", bu2, by2);
1341
1342 /* not a purine or a pyrimidine - ambiguous - return ajFalse */
1343 if(!bu1 && !by1)
1344 return ajFalse;
1345
1346 if(!bu2 && !by2)
1347 return ajFalse;
1348
1349 /* no change - return ajFalse */
1350 if(tolower((int)base1) == tolower((int)base2))
1351 return ajFalse;
1352
1353 /* U to T is not a transition */
1354 if(tolower((int)base1) == 't' && tolower((int)base2) == 'u')
1355 return ajFalse;
1356
1357 if(tolower((int)base1) == 'u' && tolower((int)base2) == 't')
1358 return ajFalse;
1359
1360 /* C to Y, T to Y, A to R, G to R - ambiguous - not a transition */
1361 if(bu1 && tolower((int)base2) == 'r')
1362 return ajFalse;
1363
1364 if(bu2 && tolower((int)base1) == 'r')
1365 return ajFalse;
1366
1367 if(by1 && tolower((int)base2) == 'y')
1368 return ajFalse;
1369
1370 if(by2 && tolower((int)base1) == 'y')
1371 return ajFalse;
1372
1373 ajDebug("embPropTransition result = %b", (bu1 == bu2));
1374
1375 return (bu1 == bu2);
1376 }
1377
1378
1379
1380
1381 /* @func embPropFixF **********************************************************
1382 **
1383 ** Fix for missing properties data in a float array
1384 **
1385 ** @param [u] matrix [float[]] Matrix
1386 ** @param [r] missing [float] Missing data value
1387 ** @return [void]
1388 **
1389 ** @release 4.1.0
1390 ******************************************************************************/
1391
embPropFixF(float matrix[],float missing)1392 void embPropFixF(float matrix[], float missing)
1393 {
1394 ajint i;
1395
1396 float mtot = 0.0;
1397 float dtot = 0.0;
1398
1399 for(i=0;i<26;i++)
1400 {
1401 if(matrix[i] == missing)
1402 {
1403 switch (i)
1404 {
1405 case 1: /* B: D + N */
1406 matrix[i] = ((matrix[3] * dayhoff[3]) +
1407 (matrix[13] * dayhoff[13])) /
1408 (dayhoff[3] + dayhoff[13]);
1409 ajDebug("Missing %d '%c' %f %f => %f\n",
1410 i, dayhoffstr[i], matrix[3], matrix[13], matrix[i]);
1411 break;
1412 case 9: /* J: I + L */
1413 matrix[i] = ((matrix[8] * dayhoff[8]) +
1414 (matrix[11] * dayhoff[11])) /
1415 (dayhoff[8] + dayhoff[11]);
1416 ajDebug("Missing %d '%c' %f %f => %f\n",
1417 i, dayhoffstr[i], matrix[8], matrix[11], matrix[i]);
1418 break;
1419 case 25: /* Z: E + Q */
1420 matrix[i] = ((matrix[4] * dayhoff[4]) +
1421 (matrix[16] * dayhoff[16])) /
1422 (dayhoff[4] + dayhoff[16]);
1423 ajDebug("Missing %d '%c' %f %f => %f\n",
1424 i, dayhoffstr[i], matrix[4], matrix[16], matrix[i]);
1425 break;
1426 default:
1427 ajDebug("Missing %d '%c' unknown\n", i, dayhoffstr[i]);
1428 break;
1429 }
1430 }
1431 else
1432 {
1433 if(dayhoff[i] > 0.0)
1434 {
1435 dtot += dayhoff[i];
1436 mtot += matrix[i] * dayhoff[i];
1437 }
1438 }
1439 }
1440
1441 mtot /= dtot;
1442
1443 for(i=0;i<26;i++)
1444 if(matrix[i] == missing) /* X:average O,U:X */
1445 {
1446 matrix[i] = mtot;
1447 ajDebug("Missing %d '%c' unknown %f\n",
1448 i, dayhoffstr[i], matrix[i]);
1449 }
1450
1451 return;
1452 }
1453
1454
1455
1456
1457 /* @func embPropNormalF *******************************************************
1458 **
1459 ** Normalize data values in a float array to have mean 0.0 and
1460 ** standard deviation 1.0
1461 **
1462 ** Assume the data values represent all values for a population
1463 ** (e.g. values for all standard amino acids) and use
1464 ** new value = (old value - mean) / standard deviation
1465 **
1466 ** @param [u] matrix [float[]] Matrix
1467 ** @param [r] missing [float] Missing data value
1468 ** @return [void]
1469 **
1470 ** @release 6.1.0
1471 ******************************************************************************/
1472
embPropNormalF(float matrix[],float missing)1473 void embPropNormalF(float matrix[], float missing)
1474 {
1475 ajuint i;
1476
1477 double count = 0.0;
1478 double total = 0.0;
1479 double sumsq = 0.0;
1480 double sigma = 0.0;
1481 double mean = 0.0;
1482 const char* alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
1483
1484 for(i=0;i<26;i++)
1485 if(matrix[i] != missing)
1486 {
1487 count += 1.0;
1488 total += matrix[i];
1489 sumsq += matrix[i] * matrix[i];
1490 }
1491
1492 if(!count)
1493 return;
1494
1495 sigma = sqrt(count*sumsq - total*total)/count;
1496 mean = total/count;
1497
1498 ajDebug("matrix normalize mean: %.3f sigma: %.3f\n", mean, sigma);
1499
1500 for(i=0;i<26;i++)
1501 if(matrix[i] != missing)
1502 {
1503 ajDebug("matrix[%u] %c %.3f", i, alphabet[i], matrix[i]);
1504 matrix[i] = (float) ((matrix[i] - mean) / sigma);
1505 ajDebug(" => %.3f\n", matrix[i]);
1506 }
1507
1508 return;
1509 }
1510
1511
1512
1513
1514 /* @func embPropAminoDel ******************************************************
1515 **
1516 ** Delete array of amino acid properties
1517 **
1518 ** @param [w] thys [EmbPPropAmino**] amino acid properties
1519 ** @return [void]
1520 **
1521 ** @release 6.0.0
1522 ******************************************************************************/
1523
embPropAminoDel(EmbPPropAmino ** thys)1524 void embPropAminoDel(EmbPPropAmino **thys)
1525 {
1526 EmbPPropAmino *pthis = NULL;
1527 ajuint i;
1528
1529 pthis = *thys;
1530
1531 for(i=0; i < EMBPROPSIZE; ++i)
1532 AJFREE(pthis[i]);
1533
1534 AJFREE(pthis);
1535
1536 *thys = NULL;
1537
1538 return;
1539 }
1540
1541
1542
1543
1544 /* @func embPropMolwtDel ******************************************************
1545 **
1546 ** Delete array of molwts
1547 **
1548 ** @param [w] thys [EmbPPropMolwt**] molwts
1549 ** @return [void]
1550 **
1551 ** @release 6.0.0
1552 ******************************************************************************/
1553
embPropMolwtDel(EmbPPropMolwt ** thys)1554 void embPropMolwtDel(EmbPPropMolwt **thys)
1555 {
1556 EmbPPropMolwt *pthis = NULL;
1557 ajuint i;
1558
1559 pthis = *thys;
1560
1561 for(i=0; i < EMBPROPSIZE + 2; ++i)
1562 AJFREE(pthis[i]);
1563
1564 AJFREE(pthis);
1565
1566 *thys = NULL;
1567
1568 return;
1569 }
1570