1 /* @source ajseqtype **********************************************************
2 **
3 ** AJAX seqtype functions
4 **
5 ** @author Copyright (C) 2002 Peter Rice
6 ** @version $Revision: 1.84 $
7 ** @modified 2002-2011 Peter Rice
8 ** @modified $Date: 2013/06/29 22:30:31 $ by $Author: rice $
9 ** @@
10 **
11 ** This library is free software; you can redistribute it and/or
12 ** modify it under the terms of the GNU Lesser General Public
13 ** License as published by the Free Software Foundation; either
14 ** version 2.1 of the License, or (at your option) any later version.
15 **
16 ** This library is distributed in the hope that it will be useful,
17 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ** Lesser General Public License for more details.
20 **
21 ** You should have received a copy of the GNU Lesser General Public
22 ** License along with this library; if not, write to the Free Software
23 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
24 ** MA 02110-1301, USA.
25 **
26 ******************************************************************************/
27
28 #include "ajlib.h"
29
30 #include "ajseqtype.h"
31 #include "ajseq.h"
32 #include "ajfeat.h"
33 #include "ajfile.h"
34 #include "ajreg.h"
35
36
37
38
39 /* @datastatic SeqPType *******************************************************
40 **
41 ** Sequence types data structure, used to test input sequence against
42 ** a defined sequence type
43 **
44 ** @alias SeqSType
45 ** @alias SeqOType
46 **
47 ** @attr Name [const char*] sequence type name
48 ** @attr Gaps [AjBool] allow gap characters
49 ** @attr Ambig [AjBool] True if ambiguity codes are allowed
50 ** @attr Type [ajuint] enumerated ISANY=0 ISNUC=1 ISPROT=2
51 ** @attr Padding [ajint] Padding to alignment boundary
52 ** @attr ConvertFrom [const char*] Convert each of these characters to the
53 ** ConvertTo equivalent
54 ** @attr ConvertTo [const char*] Equivalent for each sequence character in
55 ** ConvertFrom
56 ** @attr Badchars [AjPRegexp function] Test function
57 ** @attr Goodchars [AjPStr function] Test function
58 ** @attr Filter [char*] Filter for character testing
59 ** @attr Desc [const char*] Description for documentation purposes
60 ** @@
61 ******************************************************************************/
62
/* One row of the seqType table: the rules used to test and repair a sequence
** against a named sequence type. */
typedef struct SeqSType
{
    const char *Name;             /* sequence type name */
    AjBool Gaps;                  /* true: gaps are converted to the standard
                                  ** gap character; false: gaps are removed */
    AjBool Ambig;                 /* true: seqTypeFix may replace unacceptable
                                  ** characters with an ambiguity code */
    ajuint Type;                  /* ISANY, ISNUC or ISPROT */
    ajint Padding;                /* padding to alignment boundary */
    const char *ConvertFrom;      /* characters to convert, NULL for none */
    const char *ConvertTo;        /* replacement for the character at the
                                  ** same position in ConvertFrom */
    AjPRegexp (*Badchars) (void); /* returns a regular expression; not called
                                  ** in the visible part of this file */
    AjPStr (*Goodchars) (void);   /* returns the acceptable character set */
    char *Filter;                 /* character filter, built from Goodchars
                                  ** on first use in ajSeqTypeCheckIn */
    const char *Desc;             /* description used in messages */
} SeqOType;

/* Pointer-to-type alias kept as a macro */
#define SeqPType SeqOType*
79
80
81
82
/* Broad sequence class stored in the Type member of SeqOType */
enum ProtNuc {ISANY=0, ISNUC=1, ISPROT=2};

/* Replacement string for gap conversion: allocated on first use by seqGapSL
** and filled with the requested gap character */
static char* seqNewGapChars = NULL;

/* NOTE(review): not referenced in the visible part of this file */
static AjPStr seqtypeTmpstr = NULL;
88
89 /*
90 ** gaps only allowed if it says so
91 ** gap conversion is a separate attribute, along with case conversion
92 */
93
/* Type lookup, gap handling, and sequence test/fix helpers */
static AjBool seqFindType(const AjPStr type_name, ajint* typenum);
static void seqGapSL(AjPStr* seq, char gapc, char padc, ajuint ilen);
static AjBool seqTypeFix(AjPSeq thys, ajint itype);
static AjBool seqTypeFixReg(AjPSeq thys, ajint itype, char fixchar);
static void seqTypeSet(AjPSeq thys, const AjPStr Type);
static AjBool seqTypeStopTrimS(AjPStr* pthys);
static char seqTypeTest(const AjPStr thys, AjPRegexp badchars);
static AjBool seqTypeTestI(AjPSeq thys, ajint itype);
static char seqTypeTestS(const AjPStr thys, const AjPStr goodchars);

/* Accessors used as the Badchars member of the seqType table */
static AjPRegexp seqTypeCharAny(void);
static AjPRegexp seqTypeCharAnyGap(void);
static AjPRegexp seqTypeCharNuc(void);
static AjPRegexp seqTypeCharNucGap(void);
static AjPRegexp seqTypeCharNucGapPhylo(void);
static AjPRegexp seqTypeCharNucPure(void);
static AjPRegexp seqTypeCharProt(void);
static AjPRegexp seqTypeCharProtGap(void);
static AjPRegexp seqTypeCharProtGapPhylo(void);
static AjPRegexp seqTypeCharProtPure(void);
static AjPRegexp seqTypeCharProtStop(void);
static AjPRegexp seqTypeCharProtStopGap(void);

/* Accessors returning acceptable-character strings: used as the Goodchars
** member of the seqType table and by the ajSeqType...S test functions */
static AjPStr seqTypeStrAny(void);
static AjPStr seqTypeStrAnyGap(void);
static AjPStr seqTypeStrDnaGap(void);
static AjPStr seqTypeStrNuc(void);
static AjPStr seqTypeStrNucGap(void);
static AjPStr seqTypeStrNucGapPhylo(void);
static AjPStr seqTypeStrNucPure(void);
static AjPStr seqTypeStrProt(void);
static AjPStr seqTypeStrProtAny(void);
static AjPStr seqTypeStrProtGap(void);
static AjPStr seqTypeStrProtGapPhylo(void);
static AjPStr seqTypeStrProtPure(void);
static AjPStr seqTypeStrProtStop(void);
static AjPStr seqTypeStrProtStopGap(void);
static AjPStr seqTypeStrRnaGap(void);
132
/* Regular expressions, one per character class.
** NOTE(review): presumably filled in on demand by the seqTypeChar...
** accessors, whose definitions are outside the visible part of the file */
static AjPRegexp seqtypeRegAny = NULL;
static AjPRegexp seqtypeRegAnyGap = NULL;
static AjPRegexp seqtypeRegDnaGap = NULL;
static AjPRegexp seqtypeRegNuc = NULL;
static AjPRegexp seqtypeRegNucGap = NULL;
static AjPRegexp seqtypeRegNucGapPhylo = NULL;
static AjPRegexp seqtypeRegNucPure = NULL;
static AjPRegexp seqtypeRegProt = NULL;
static AjPRegexp seqtypeRegProtAny = NULL;
static AjPRegexp seqtypeRegProtGap = NULL;
static AjPRegexp seqtypeRegProtGapPhylo = NULL;
static AjPRegexp seqtypeRegProtPure = NULL;
static AjPRegexp seqtypeRegProtStop = NULL;
static AjPRegexp seqtypeRegProtStopGap = NULL;
static AjPRegexp seqtypeRegRnaGap = NULL;

/* Character-set strings, one per character class.
** NOTE(review): presumably filled in on demand by the seqTypeStr...
** accessors, whose definitions are outside the visible part of the file */
static AjPStr seqtypeCharsetAny = NULL;
static AjPStr seqtypeCharsetAnyGap = NULL;
static AjPStr seqtypeCharsetDnaGap = NULL;
static AjPStr seqtypeCharsetNuc = NULL;
static AjPStr seqtypeCharsetNucGap = NULL;
static AjPStr seqtypeCharsetNucGapPhylo = NULL;
static AjPStr seqtypeCharsetNucPure = NULL;
static AjPStr seqtypeCharsetProt = NULL;
static AjPStr seqtypeCharsetProtAny = NULL;
static AjPStr seqtypeCharsetProtGap = NULL;
static AjPStr seqtypeCharsetProtGapPhylo = NULL;
static AjPStr seqtypeCharsetProtPure = NULL;
static AjPStr seqtypeCharsetProtStop = NULL;
static AjPStr seqtypeCharsetProtStopGap = NULL;
static AjPStr seqtypeCharsetRnaGap = NULL;
164
165
166
167
168 /*
169 ** gap characters known are:
170 **
171 ** . GCG and most others
172 ** - Phylip and some alignment output
173 ** ~ GCG for gaps at ends
174 ** * Staden for DNA but stop for protein (fix on input?)
175 ** O Phylip (fix on input?) - no longer possible: O is pyrrolysine in proteins
176 */
177
178
179
180 /*
181 char seqCharProt[] = "ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyBUXZbuxz*?";
182 */
/* Character classes, upper case only. These arrays have external linkage. */
char seqCharProtPure[] = "ACDEFGHIKLMNPQRSTVWY"; /* unambiguous residues */
char seqCharProtAmbig[] = "BJOUXZ?"; /* convert unwanted ones to Xx */
char seqCharProtStop[] = "*";        /* protein stop */
char seqCharNuc[] = "ACGTUBDHKMNRSVWXY?";     /* any base, T and U both */
char seqCharNucPure[] = "ACGTU";              /* unambiguous bases */
char seqCharNucAmbig[] = "BDHKMNRSVWXY?";     /* ambiguity codes only */
char seqCharGap[] = ".~-"; /* phylip used O in old versions */
char seqCharNucDna[] = "ACGTBDHKMNRSVWXY?";   /* seqCharNuc without U */
char seqCharNucRna[] = "ACGUBDHKMNRSVWXY?";   /* seqCharNuc without T */
char seqCharGapany[] = ".~-"; /* phylip used O in old versions*/
char seqCharGapdash[] = "-";
char seqCharGapdot[] = ".";
char seqGap = '-'; /* the (only) EMBOSS gap character */
/* characters seqGapSL replaces with the requested gap character */
char seqCharGapTest[] = " .~-"; /* phylip used O - don't forget space */
char seqCharPhylo[] = "?"; /* phylip uses ? for unknown or gap */
198
199
200
201
202 /* @funclist seqType **********************************************************
203 **
204 ** Functions to test each sequence type
205 **
206 ******************************************************************************/
207
/* Table of known sequence types, indexed by the itype argument of the
** seqType... functions in this file.
**
** ConvertFrom and ConvertTo are paired position by position and must have
** equal lengths. The Filter member starts as NULL and is filled in by
** ajSeqTypeCheckIn the first time a type is used, which is why the table
** cannot be const. */
static SeqOType seqType[] =
{
/*   "name"            Gaps     Ambig    Type    Padding CvtFrom CvtTo
         BadcharsFunction
         GoodCharsFunction
         Filter
         Description */

    /* any sequence class */
    {"any",                AJFALSE, AJTRUE, ISANY, 0, "?",     "X",
         seqTypeCharAny,
         seqTypeStrAny,
         NULL,
         "any valid sequence"}, /* reset type */
    {"gapany",             AJTRUE,  AJTRUE, ISANY, 0, "?",     "X",
         seqTypeCharAnyGap,
         seqTypeStrAnyGap,
         NULL,
         "any valid sequence with gaps"}, /* reset type */

    /* DNA: U is converted to T */
    {"dna",                AJFALSE, AJTRUE, ISNUC, 0, "?XxUu", "NNnTt",
         seqTypeCharNuc,
         seqTypeStrNuc,
         NULL,
         "DNA sequence"},
    {"puredna",            AJFALSE, AJFALSE, ISNUC, 0, "Uu", "Tt",
         seqTypeCharNucPure,
         seqTypeStrNucPure,
         NULL,
         "DNA sequence, bases ACGT only"},
    {"gapdna",             AJTRUE,  AJTRUE, ISNUC, 0, "?XxUu", "NNnTt",
         seqTypeCharNucGap,
         seqTypeStrNucGap,
         NULL,
         "DNA sequence with gaps"},
    {"gapdnaphylo",        AJTRUE,  AJTRUE, ISNUC, 0, "Uu",  "Tt",
         seqTypeCharNucGapPhylo,
         seqTypeStrNucGapPhylo,
         NULL,
         "DNA sequence with gaps and queries"},

    /* RNA: T is converted to U */
    {"rna",                AJFALSE, AJTRUE, ISNUC, 0, "?XxTt", "NNnUu",
         seqTypeCharNuc,
         seqTypeStrNuc,
         NULL,
         "RNA sequence"},
    {"purerna",            AJFALSE, AJFALSE, ISNUC, 0, "Tt", "Uu",
         seqTypeCharNucPure,
         seqTypeStrNucPure,
         NULL,
         "RNA sequence, bases ACGU only"},
    {"gaprna",             AJTRUE,  AJTRUE, ISNUC, 0, "?XxTt", "NNnUu",
         seqTypeCharNucGap,
         seqTypeStrNucGap,
         NULL,
         "RNA sequence with gaps"},
    {"gaprnaphylo",        AJTRUE,  AJTRUE, ISNUC, 0, "Tt",  "Uu",
         seqTypeCharNucGapPhylo,
         seqTypeStrNucGapPhylo,
         NULL,
         "RNA sequence with gaps and queries"},

    /* nucleotide: T and U are both left unchanged */
    {"nucleotide",         AJFALSE, AJTRUE, ISNUC, 0, "?Xx", "NNn",
         seqTypeCharNuc,
         seqTypeStrNuc,
         NULL,
         "nucleotide sequence"},
    {"purenucleotide",     AJFALSE, AJFALSE, ISNUC, 0, NULL, NULL,
         seqTypeCharNucPure,
         seqTypeStrNucPure,
         NULL,
         "nucleotide sequence, bases ACGTU only"},
    {"gapnucleotide",      AJTRUE,  AJTRUE, ISNUC, 0, "?Xx", "NNn",
         seqTypeCharNucGap,
         seqTypeStrNucGap,
         NULL,
         "nucleotide sequence with gaps"},
    {"gapnucleotidephylo", AJTRUE,  AJTRUE, ISNUC, 0, NULL,  NULL,
         seqTypeCharNucGapPhylo,
         seqTypeStrNucGapPhylo,
         NULL,
         "nucleotide sequence with gaps and queries"},
    {"gapnucleotidesimple",AJTRUE,  AJTRUE , ISNUC, 0,
         "BbDdHhKkMmRrSsVvWwXxYy?", "NnNnNnNnNnNnNnNnNnNnNnN",
         seqTypeCharNucGap,
         seqTypeStrNucGap,
         NULL,
         "nucleotide sequence with gaps but only N for ambiguity"},

    /* protein */
    {"protein",            AJFALSE, AJTRUE, ISPROT, 0, "?*",  "XX",
         seqTypeCharProt,
         seqTypeStrProt,
         NULL,
         "protein sequence"},
    {"pureprotein",        AJFALSE, AJFALSE, ISPROT, 0, NULL,  NULL,
         seqTypeCharProtPure,
         seqTypeStrProtPure,
         NULL,
         "protein sequence without BZ U X or *"},
    {"stopprotein",        AJFALSE, AJTRUE, ISPROT, 0, "?",   "X",
         seqTypeCharProtStop,
         seqTypeStrProtStop,
         NULL,
         "protein sequence with possible stops"},
    {"gapprotein",         AJTRUE,  AJTRUE, ISPROT, 0, "?*",  "XX",
         seqTypeCharProtGap,
         seqTypeStrProtGap,
         NULL,
         "protein sequence with gaps"},
    {"gapstopprotein",     AJTRUE,  AJTRUE, ISPROT, 0, "?",   "X",
         seqTypeCharProtStopGap,
         seqTypeStrProtStopGap,
         NULL,
         "protein sequence with gaps and possible stops"},
    {"gapproteinphylo",    AJTRUE,  AJTRUE, ISPROT, 0, NULL,  NULL,
         seqTypeCharProtGapPhylo,
         seqTypeStrProtGapPhylo,
         NULL,
         "protein sequence with gaps, stops and queries"},

    /* protein with U, J and O converted to X */
    {"proteinstandard",AJFALSE, AJTRUE, ISPROT, 0, "?*UuJjOo", "XXXxXxXx",
         seqTypeCharProt,
         seqTypeStrProt,
         NULL,
         "protein sequence with no selenocysteine"},
    {"stopproteinstandard",AJFALSE, AJTRUE, ISPROT, 0, "?UuJjOo", "XXxXxXx",
         seqTypeCharProtStop,
         seqTypeStrProtStop,
         NULL,
         "protein sequence with a possible stop but no selenocysteine"},
    {"gapproteinstandard", AJTRUE,  AJTRUE, ISPROT, 0, "?*UuJjOo", "XXXxXxXx",
         seqTypeCharProtGap,
         seqTypeStrProtGap,
         NULL,
         "protein sequence with gaps but no selenocysteine"},
    /* as gapproteinstandard, with B and Z also converted to X */
    {"gapproteinsimple", AJTRUE,  AJTRUE, ISPROT, 0,
         "?*BbZzUuJjOo", "XXXxXxXxXxXx",
         seqTypeCharProtGap,
         seqTypeStrProtGap,
         NULL,
         "protein sequence with gaps but no selenocysteine"},

    /* NOTE(review): the all-NULL entry presumably marks the end of the table
    ** for seqFindType, which is outside the visible part of the file */
    {NULL,                 AJFALSE, AJTRUE, ISANY, 0, NULL,  NULL,
         NULL,
         NULL,
         NULL,
         NULL}
};
345
346
347
348
349 /* @funcstatic seqTypeTestI ***************************************************
350 **
351 ** Tests the type of a sequence is compatible with a defined type.
352 ** If the type can have gaps, also tests for gap characters.
353 ** Used only for testing, so never writes any error message
354 **
355 ** @param [u] thys [AjPSeq] Sequence object
356 ** @param [r] itype [ajint] Sequence type index
357 ** @return [AjBool] ajTrue if compatible.
358 **
359 ** @release 2.8.0
360 ** @@
361 ******************************************************************************/
362
seqTypeTestI(AjPSeq thys,ajint itype)363 static AjBool seqTypeTestI(AjPSeq thys, ajint itype)
364 {
365
366 /*
367 ** We have a known type, now we need to either show the sequence
368 ** matches it, or fix it so it does (or, of course, give up)
369 */
370
371 /*
372 ** First we test the type - predefined by a database,
373 ** or by checking the sequence characters
374 */
375
376 if(seqType[itype].Gaps)
377 {
378 ajDebug("Convert gaps to '-'\n");
379 ajSeqGap(thys, seqGap, 0);
380 }
381 else
382 {
383 ajDebug("Remove all gaps\n");
384
385 if(thys->Qualsize)
386 {
387 ajStrRemoveGapF(&thys->Seq, thys->Accuracy);
388 if(thys->Qualsize > ajStrGetLen(thys->Seq))
389 {
390 thys->Qualsize = ajStrGetLen(thys->Seq);
391 AJCRESIZE(thys->Accuracy, thys->Qualsize);
392 }
393 }
394 else
395 ajStrRemoveGap(&thys->Seq);
396 }
397
398 if(seqType[itype].Type == ISPROT && !ajSeqIsProt(thys))
399 {
400 ajDebug("Sequence is not a protein\n");
401
402 return ajFalse;
403 }
404
405 if(seqType[itype].Type == ISNUC && !ajSeqIsNuc(thys))
406 {
407 ajDebug("Sequence is not nucleic\n");
408
409 return ajFalse;
410 }
411
412 if(ajStrIsCharsetCaseS(thys->Seq, (*seqType[itype].Goodchars)()))
413 {
414 if(seqType[itype].ConvertFrom)
415 {
416 ajDebug("Convert '%s' to '%s'\n",
417 seqType[itype].ConvertFrom,
418 seqType[itype].ConvertTo);
419 ajStrExchangeSetCC(&thys->Seq,
420 seqType[itype].ConvertFrom,
421 seqType[itype].ConvertTo);
422 }
423
424 return ajTrue;
425 }
426
427 ajDebug("seqTypeTestI: Sequence must be %s: found bad character\n",
428 seqType[itype].Desc);
429
430 return ajFalse;
431 }
432
433
434
435
436 /* @funcstatic seqTypeFix *****************************************************
437 **
438 ** Fixes (if possible) unacceptable sequence characters by removing gaps
439 ** (if no gaps are allowed) and by setting ambiguity codes (if they
440 ** are allowed).
441 **
442 ** @param [u] thys [AjPSeq] Sequence object
443 ** @param [r] itype [ajint] Sequence type index
444 ** @return [AjBool] ajTrue if the type can be fixed
445 **
446 ** @release 2.7.0
447 ** @@
448 ******************************************************************************/
449
seqTypeFix(AjPSeq thys,ajint itype)450 static AjBool seqTypeFix(AjPSeq thys, ajint itype)
451 {
452 ajDebug("seqTypeFix '%s' '%S'\n", seqType[itype].Name, thys->Seq);
453
454 /*
455 ** if ungapped, remove any gap characters
456 */
457
458 if(!seqType[itype].Gaps)
459 {
460 if(thys->Qualsize)
461 {
462 ajStrRemoveGapF(&thys->Seq, thys->Accuracy);
463 if(thys->Qualsize > ajStrGetLen(thys->Seq))
464 {
465 thys->Qualsize = ajStrGetLen(thys->Seq);
466 AJCRESIZE(thys->Accuracy, thys->Qualsize);
467 }
468 }
469 else
470 ajStrRemoveGap(&thys->Seq);
471 }
472
473
474
475 if (ajCharMatchC(seqType[itype].Name, "pureprotein"))
476 seqTypeStopTrimS(&thys->Seq);
477
478 if(seqType[itype].Ambig)
479 {
480 /*
481 ** list the bad characters, change to 'X' or 'N'
482 */
483 switch(seqType[itype].Type)
484 {
485 case ISPROT:
486 if (ajCharMatchC(seqType[itype].Name, "protein"))
487 seqTypeStopTrimS(&thys->Seq);
488 seqTypeFixReg(thys, itype, 'X');
489 break;
490 case ISNUC:
491 seqTypeFixReg(thys, itype, 'N');
492 break;
493 case ISANY:
494 if(ajSeqIsNuc(thys))
495 seqTypeFixReg(thys, itype, 'N');
496 else
497 seqTypeFixReg(thys, itype, 'X');
498 break;
499 default:
500 ajDie("Unknown sequence type code for '%s'",
501 seqType[itype].Name);
502 return ajFalse;
503 }
504 }
505
506 if (ajCharMatchC(seqType[itype].Name, "pureprotein"))
507 seqTypeStopTrimS(&thys->Seq);
508
509 ajDebug("seqTypeFix done '%S'\n", thys->Seq);
510
511 return seqTypeTestI(thys, itype);
512 }
513
514
515
516
517 /* @funcstatic seqTypeFixReg **************************************************
518 **
519 ** Fixes (if possible) unacceptable sequence characters by removing gaps
520 ** (if no gaps are allowed) and by setting ambiguity codes (if they
521 ** are allowed).
522 **
523 ** @param [u] thys [AjPSeq] Sequence object
524 ** @param [r] itype [ajint] Sequence type index
525 ** @param [r] fixchar [char] Character to replace with
526 ** @return [AjBool] ajTrue if the type can be fixed
527 **
528 ** @release 2.7.0
529 ** @@
530 ******************************************************************************/
531
seqTypeFixReg(AjPSeq thys,ajint itype,char fixchar)532 static AjBool seqTypeFixReg(AjPSeq thys, ajint itype, char fixchar)
533 {
534 ajDebug("seqTypeFixReg '%s' '%S'\n", seqType[itype].Name, thys->Seq);
535 /*ajDebug("Seq old '%S'\n", thys->Seq);*/
536
537 return ajStrExchangeSetRestSK(&thys->Seq,
538 (*seqType[itype].Goodchars)(), fixchar);
539 }
540
541
542
543
544 /* @funcstatic seqTypeSet *****************************************************
545 **
546 ** Sets the sequence type. Uses the first character of the type
547 ** which can be N or P
548 **
549 ** @param [u] thys [AjPSeq] Sequence object
550 ** @param [r] Type [const AjPStr] Sequence type
551 ** @return [void]
552 **
553 ** @release 1.0.0
554 ** @@
555 ******************************************************************************/
556
seqTypeSet(AjPSeq thys,const AjPStr Type)557 static void seqTypeSet(AjPSeq thys, const AjPStr Type)
558 {
559 const char* cp;
560
561 ajDebug("seqTypeSet '%S'\n", Type);
562
563 cp = ajStrGetPtr(Type);
564
565 switch(*cp)
566 {
567 case 'P':
568 case 'p':
569 ajSeqSetProt(thys);
570 break;
571 case 'N':
572 case 'n':
573 ajSeqSetNuc(thys);
574 break;
575 case '\0':
576 case 'S':
577 case 's':
578 break;
579 default:
580 ajDie("Unknown sequence type '%c'", *cp);
581 }
582
583 return;
584 }
585
586
587
588
589 /* @func ajSeqTypeCheckS ******************************************************
590 **
591 ** Tests the type of a sequence is compatible with a defined type.
592 ** If the type can have gaps, also tests for gap characters.
593 ** Used for input validation - writes error message if the type check fails
594 **
595 ** @param [u] pthys [AjPStr*] Sequence string
596 ** @param [r] type_name [const AjPStr] Sequence type
597 ** @return [AjBool] ajTrue if compatible.
598 **
599 ** @release 2.7.0
600 ** @@
601 ******************************************************************************/
602
ajSeqTypeCheckS(AjPStr * pthys,const AjPStr type_name)603 AjBool ajSeqTypeCheckS(AjPStr* pthys, const AjPStr type_name)
604 {
605 /* AjPStr tmpstr = NULL; */
606 ajint itype = -1;
607
608 /* ajDebug("ajSeqTypeCheckS type '%S' seq '%S'\n", type_name, *pthys); */
609
610 if(!ajStrGetLen(type_name)) /* nothing given - anything goes */
611 {
612 ajSeqGapS(pthys, seqGap);
613
614 return ajTrue;
615 }
616
617 if(!seqFindType(type_name, &itype))
618 {
619 ajDie("Sequence type '%S' unknown", type_name);
620
621 return ajFalse;
622 }
623
624 ajDebug("ajSeqTypeCheckS type '%s' found (%s)\n",
625 seqType[itype].Name, seqType[itype].Desc);
626
627 if(seqType[itype].Gaps)
628 {
629 ajDebug("Convert gaps to '-'\n");
630 ajSeqGapS(pthys, seqGap);
631 }
632 else
633 {
634 ajDebug("Remove all gaps\n");
635 ajStrRemoveGap(pthys);
636 }
637
638 /* no need to test sequence type, we will test every character below */
639
640 if(ajStrIsCharsetCaseS(*pthys, (*seqType[itype].Goodchars)()))
641 {
642 if(seqType[itype].ConvertFrom)
643 {
644 ajDebug("Convert '%s' to '%s'\n",
645 seqType[itype].ConvertFrom,
646 seqType[itype].ConvertTo);
647 ajStrExchangeSetCC(pthys,
648 seqType[itype].ConvertFrom,
649 seqType[itype].ConvertTo);
650 }
651 return ajTrue;
652 }
653
654 return ajTrue;
655 }
656
657
658
659
660 /* @func ajSeqTypeCheckIn *****************************************************
661 **
662 ** Tests the type of a sequence is compatible with a defined type.
663 ** If the type can have gaps, also tests for gap characters.
664 ** Used for input validation - writes error message if the type check fails
665 **
666 ** @param [u] thys [AjPSeq] Sequence object
667 ** @param [r] seqin [const AjPSeqin] Sequence input object
668 ** @return [AjBool] ajTrue if compatible.
669 **
670 ** @release 2.7.0
671 ** @@
672 ******************************************************************************/
673
ajSeqTypeCheckIn(AjPSeq thys,const AjPSeqin seqin)674 AjBool ajSeqTypeCheckIn(AjPSeq thys, const AjPSeqin seqin)
675 {
676 ajint itype = -1;
677 AjPStr Type;
678 ajlong i;
679
680 ajDebug("testing sequence '%s' '%50.50S' type '%S' IsNuc %B IsProt %B\n",
681 ajSeqGetNameC(thys), thys->Seq,
682 seqin->Inputtype, seqin->IsNuc, seqin->IsProt);
683
684 Type = seqin->Inputtype; /* ACD file had a predefined seq type */
685
686 if(seqin->IsNuc)
687 ajSeqSetNuc(thys);
688
689 if(seqin->IsProt)
690 ajSeqSetProt(thys);
691
692 if(seqin->Input->Query && ajStrGetLen(seqin->Input->Query->DbType))
693 seqTypeSet(thys, seqin->Input->Query->DbType);
694
695
696 if(!ajStrGetLen(Type)) /* nothing given - anything goes */
697 {
698 ajSeqGap(thys, seqGap, 0);
699 ajDebug("ajSeqTypeCheckIn: OK - no type, gaps converted to '-'\n");
700
701 return ajTrue;
702 }
703
704 if(!seqFindType(Type, &itype))
705 {
706 ajDebug("ajSeqTypeCheckIn: rejected - unknown type\n");
707 ajDie("Sequence type '%S' unknown", Type);
708
709 return ajFalse;
710 }
711
712 ajDebug("ajSeqTypeCheckIn type '%s' found (%s)\n",
713 seqType[itype].Name, seqType[itype].Desc);
714
715 if(seqType[itype].Gaps)
716 {
717 ajDebug("Convert gaps to '-'\n");
718 ajSeqGap(thys, seqGap, 0);
719 }
720 else
721 {
722 ajDebug("Remove all gaps\n");
723 ajStrRemoveGap(&thys->Seq);
724 }
725
726 if(seqType[itype].Type == ISPROT)
727 {
728 if (ajSeqIsProt(thys))
729 ajSeqSetProt(thys);
730 else
731 {
732 ajErr("Sequence is not a protein\n");
733 ajDebug("ajSeqTypeCheckIn: rejected - not a protein\n");
734
735 return ajFalse;
736 }
737 }
738
739 if(seqType[itype].Type == ISNUC)
740 {
741 if (ajSeqIsNuc(thys))
742 ajSeqSetNuc(thys);
743 else
744 {
745 ajErr("Sequence is not nucleic\n");
746 ajDebug("ajSeqTypeCheckIn: rejected - not nucleic\n");
747
748 return ajFalse;
749 }
750 }
751
752 if(!seqType[itype].Filter)
753 seqType[itype].Filter =
754 ajStrGetfilterCase((*seqType[itype].Goodchars)());
755 if(ajStrIsFilter(thys->Seq, seqType[itype].Filter))
756 {
757 ajDebug("ajSeqTypeCheckIn: bad characters test passed, convert\n");
758
759 if(seqType[itype].ConvertFrom)
760 {
761 ajDebug("Convert '%s' to '%s'\n",
762 seqType[itype].ConvertFrom,
763 seqType[itype].ConvertTo);
764 ajStrExchangeSetCC(&thys->Seq,
765 seqType[itype].ConvertFrom,
766 seqType[itype].ConvertTo);
767 }
768 ajDebug("ajSeqTypeCheckIn: OK - no badchars\n");
769
770 return ajTrue;
771 }
772
773 if(seqTypeFix(thys, itype)) /* this will reuse badchars */
774 {
775 ajDebug("ajSeqTypeCheckIn: OK - type fixed\n");
776
777 return ajTrue;
778 }
779
780 i = ajStrFindRestCaseS(thys->Seq, (*seqType[itype].Goodchars)());
781
782 if(i >= 0)
783 {
784 ajErr("ajSeqTypeCheckIn: Sequence must be %s: "
785 "found bad character '%c'",
786 seqType[itype].Desc, ajStrGetCharPos(thys->Seq, i));
787 ajDebug("ajSeqTypeCheckIn: rejected - still had badchars\n");
788
789 return ajFalse;
790 }
791
792 ajDebug("ajSeqTypeCheckIn: OK - fixed finally\n");
793 ajDebug("Final sequence '%S' type '%S' IsNuc %B IsProt %B\n",
794 thys->Seq, seqin->Inputtype, seqin->IsNuc, seqin->IsProt);
795
796 return ajTrue;
797 }
798
799
800
801
802
803 /* @func ajSeqTypeNucS ********************************************************
804 **
805 ** Checks sequence type for nucleotide without gaps.
806 **
807 ** RNA and DNA codes are accepted as is.
808 **
809 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
810 ** @return [char] invalid character if any.
811 **
812 ** @release 2.7.0
813 ** @@
814 ******************************************************************************/
815
ajSeqTypeNucS(const AjPStr thys)816 char ajSeqTypeNucS(const AjPStr thys)
817 {
818 char ret;
819 ajDebug("ajSeqTypeNucS test\n");
820
821 ret = seqTypeTestS(thys, seqTypeStrNuc());
822
823 if (ret)
824 return ret;
825
826 return seqTypeTestS(thys, seqTypeStrNucGap());
827 }
828
829
830
831
832 /* @func ajSeqTypeDnaS ********************************************************
833 **
834 ** Checks sequence type for DNA without gaps.
835 **
836 ** RNA and DNA codes are accepted as is.
837 **
838 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
839 ** @return [char] invalid character if any.
840 **
841 ** @release 2.7.0
842 ** @@
843 ******************************************************************************/
844
ajSeqTypeDnaS(const AjPStr thys)845 char ajSeqTypeDnaS(const AjPStr thys)
846 {
847 char ret;
848 ajDebug("ajSeqTypeDnaS test\n");
849
850 ret = seqTypeTestS(thys, seqTypeStrNuc());
851
852 if (ret)
853 return ret;
854
855 return seqTypeTestS(thys, seqTypeStrDnaGap());
856 }
857
858
859
860
861 /* @func ajSeqTypeRnaS ********************************************************
862 **
863 ** Checks sequence type for RNA without gaps
864 **
865 ** RNA codes are accepted as is.
866 **
867 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
868 ** @return [char] invalid character if any.
869 **
870 ** @release 2.7.0
871 ** @@
872 ******************************************************************************/
873
ajSeqTypeRnaS(const AjPStr thys)874 char ajSeqTypeRnaS(const AjPStr thys)
875 {
876 char ret;
877 ajDebug("ajSeqTypeRnaS test\n");
878
879 ret = seqTypeTestS(thys, seqTypeStrNuc());
880
881 if (ret)
882 return ret;
883
884 return seqTypeTestS(thys, seqTypeStrRnaGap());
885 }
886
887
888
889
890 /* @func ajSeqTypeGapdnaS *****************************************************
891 **
892 ** Checks sequence type for Dna with gaps
893 **
894 ** DNA codes are accepted as is.
895 **
896 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
897 ** @return [char] invalid character if any.
898 **
899 ** @release 2.7.0
900 ** @@
901 ******************************************************************************/
902
ajSeqTypeGapdnaS(const AjPStr thys)903 char ajSeqTypeGapdnaS(const AjPStr thys)
904 {
905 char ret;
906 ajDebug("ajSeqTypeGapdnaS test\n");
907
908 ret = seqTypeTestS(thys, seqTypeStrNucGap());
909
910 if (ret)
911 return ret;
912
913 return seqTypeTestS(thys, seqTypeStrDnaGap());
914 }
915
916
917
918
919 /* @func ajSeqTypeGaprnaS *****************************************************
920 **
921 ** Checks sequence type for RNA with gaps
922 **
923 ** RNA codes are accepted as is.
924 **
925 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
926 ** @return [char] invalid character if any.
927 **
928 ** @release 2.7.0
929 ** @@
930 ******************************************************************************/
931
ajSeqTypeGaprnaS(const AjPStr thys)932 char ajSeqTypeGaprnaS(const AjPStr thys)
933 {
934 char ret;
935 ajDebug("ajSeqTypeGaprnaS test\n");
936
937 ret = seqTypeTestS(thys, seqTypeStrNucGap());
938
939 if (ret)
940 return ret;
941
942 return seqTypeTestS(thys, seqTypeStrRnaGap());
943 }
944
945
946
947
948 /* @func ajSeqTypeGapnucS *****************************************************
949 **
950 ** Checks sequence type for nucleotide with gaps.
951 **
952 ** RNA and DNA codes are accepted as is.
953 **
954 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
955 ** @return [char] invalid character if any.
956 **
957 ** @release 2.7.0
958 ** @@
959 ******************************************************************************/
960
ajSeqTypeGapnucS(const AjPStr thys)961 char ajSeqTypeGapnucS(const AjPStr thys)
962 {
963 ajDebug("ajSeqTypeGapnucS test\n");
964
965 return seqTypeTestS(thys, seqTypeStrNucGap());
966 }
967
968
969
970
971 /* @func ajSeqTypeAnyprotS ****************************************************
972 **
973 ** Checks sequence type for anything that can be in a protein sequence
974 **
975 ** Stop codes are replaced with gaps.
976 **
977 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
978 ** @return [char] invalid character if any.
979 **
980 ** @release 2.7.0
981 ** @@
982 ******************************************************************************/
983
ajSeqTypeAnyprotS(const AjPStr thys)984 char ajSeqTypeAnyprotS(const AjPStr thys)
985 {
986 ajDebug("ajSeqTypeAnyprotS test\n");
987
988 return seqTypeTestS(thys, seqTypeStrProtAny());
989 }
990
991
992
993
994 /* @func ajSeqTypeProtS *******************************************************
995 **
996 ** Checks sequence type for anything that can be in a protein sequence
997 **
998 ** Stop codes are replaced with gaps.
999 **
1000 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
1001 ** @return [char] invalid character if any.
1002 **
1003 ** @release 2.7.0
1004 ** @@
1005 ******************************************************************************/
1006
ajSeqTypeProtS(const AjPStr thys)1007 char ajSeqTypeProtS(const AjPStr thys)
1008 {
1009 ajDebug("ajSeqTypeProtS test\n");
1010
1011 return seqTypeTestS(thys, seqTypeStrProt());
1012 }
1013
1014
1015
1016
1017 /* @func ajSeqTypeGapanyS *****************************************************
1018 **
1019 ** Checks sequence type for any sequence with gaps.
1020 **
1021 ** Stops ('*') are allowed so this could be a 3 frame translation of DNA.
1022 **
1023 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
1024 ** @return [char] invalid character if any.
1025 **
1026 ** @release 2.7.0
1027 ** @@
1028 ******************************************************************************/
1029
ajSeqTypeGapanyS(const AjPStr thys)1030 char ajSeqTypeGapanyS(const AjPStr thys)
1031 {
1032 ajDebug("ajSeqTypeGapanyS test\n");
1033
1034 return seqTypeTestS(thys, seqTypeStrAnyGap());
1035 }
1036
1037
1038
1039
1040 /* @func ajSeqGap *************************************************************
1041 **
1042 ** Sets non-sequence characters to valid gap characters,
1043 ** and pads with extra gaps if necessary to a specified length
1044 **
1045 ** @param [u] thys [AjPSeq] Sequence
1046 ** @param [r] gapc [char] Standard gap character
1047 ** @param [r] padc [char] Gap character for ends of sequence
1048 ** @return [void]
1049 **
1050 ** @release 1.0.0
1051 ** @@
1052 ******************************************************************************/
1053
ajSeqGap(AjPSeq thys,char gapc,char padc)1054 void ajSeqGap(AjPSeq thys, char gapc, char padc)
1055 {
1056 seqGapSL(&thys->Seq, gapc, padc, 0);
1057
1058 return;
1059 }
1060
1061
1062
1063
1064 /* @func ajSeqGapLen **********************************************************
1065 **
1066 ** Sets non-sequence characters to valid gap characters,
1067 ** and pads with extra gaps if necessary to a specified length
1068 **
1069 ** @param [u] thys [AjPSeq] Sequence
1070 ** @param [r] gapc [char] Standard gap character
1071 ** @param [r] padc [char] Gap character for ends of sequence
1072 ** @param [r] ilen [ajint] Sequence length. Expanded if longer than
1073 ** current length
1074 ** @return [void]
1075 **
1076 ** @release 1.0.0
1077 ** @@
1078 ******************************************************************************/
1079
ajSeqGapLen(AjPSeq thys,char gapc,char padc,ajint ilen)1080 void ajSeqGapLen(AjPSeq thys, char gapc, char padc, ajint ilen)
1081 {
1082 seqGapSL(&thys->Seq, gapc, padc, ilen);
1083
1084 return;
1085 }
1086
1087
1088
1089
1090 /* @func ajSeqGapS ************************************************************
1091 **
1092 ** Sets non-sequence characters to valid gap characters,
1093 ** and pads with extra gaps if necessary to a specified length
1094 **
1095 ** @param [u] seq [AjPStr*] Sequence
1096 ** @param [r] gapc [char] Standard gap character
1097 ** @return [void]
1098 **
1099 ** @release 1.0.0
1100 ** @@
1101 ******************************************************************************/
1102
ajSeqGapS(AjPStr * seq,char gapc)1103 void ajSeqGapS(AjPStr* seq, char gapc)
1104 {
1105 seqGapSL(seq, gapc, 0, 0);
1106
1107 return;
1108 }
1109
1110
1111
1112
1113 /* @funcstatic seqGapSL *******************************************************
1114 **
1115 ** Sets non-sequence characters in a string to valid gap characters,
1116 ** and pads with extra gaps if necessary to a specified length
1117 **
1118 ** @param [u] seq [AjPStr*] String of sequence characters
1119 ** @param [r] gapc [char] Standard gap character
1120 ** @param [r] padc [char] Gap character for ends of sequence
1121 ** @param [r] ilen [ajuint] Sequence length. Expanded if longer than
1122 ** current length
1123 ** @return [void]
1124 **
1125 ** @release 1.0.0
1126 ** @@
1127 ******************************************************************************/
1128
static void seqGapSL(AjPStr* seq, char gapc, char padc, ajuint ilen)
{
    ajuint i;
    ajuint igap;
    ajuint seqlen;
    char* cp;
    char endc = gapc;

    igap = (ajuint) strlen(seqCharGapTest);

    if(!seqNewGapChars)
    {
        /*
        ** One replacement character per test character plus the
        ** terminator written at index igap below.
        */
        seqNewGapChars = ajCharNewRes(igap+1);
        seqNewGapChars[0] = '\0';
    }

    /* Set the seqNewGapChars string to match gapc */

    if(*seqNewGapChars != gapc)
    {
        for(i=0; i < igap; i++)
            seqNewGapChars[i] = gapc;

        seqNewGapChars[igap] = '\0';
    }

    /* make room for ilen characters and a terminator before writing */
    if(ilen >= MAJSTRGETRES(*seq))
        ajStrSetRes(seq, ilen+1);

    /* every character listed in seqCharGapTest becomes gapc */
    ajStrExchangeSetCC(seq, seqCharGapTest, seqNewGapChars);

    seqlen = (ajuint) ajStrGetLen(*seq);

    if(padc)
    {				/* start and end characters updated */
        endc = padc;

        /*
        ** strchr() also matches the terminating '\0' of its search
        ** string, so both scans are bounded by the sequence length and
        ** an empty sequence is skipped altogether.
        */
        if(seqlen && strchr(seqCharGapTest, ajStrGetCharFirst(*seq)))
        {
            /* pad start */
            cp = ajStrGetuniquePtr(seq);

            for(i=0; i < seqlen && strchr(seqCharGapTest, cp[i]); i++)
                cp[i] = padc;
        }

        if(seqlen && strchr(seqCharGapTest, ajStrGetCharLast(*seq)))
        {
            /* pad end; index 0 is left to the start scan above */
            cp = ajStrGetuniquePtr(seq);

            for(i=seqlen - 1; i && strchr(seqCharGapTest, cp[i]); i--)
                cp[i] = padc;
        }
    }

    if(seqlen < ilen)	   /* ilen can be zero to skip this */
    {
        cp = ajStrGetuniquePtr(seq);

        for(i=seqlen; i < ilen; i++)
            cp[i] = endc;

        cp[ilen] = '\0';
        ajStrSetValid(seq);
    }

    /*  ajDebug("seqGapSL after  '%S'\n", *seq); */

    return;
}
1195
1196
1197
1198
1199 /* @funcstatic seqTypeStopTrimS ***********************************************
1200 **
1201 ** Removes a trailing stop (asterisk) from a protein sequence
1202 **
1203 ** @param [u] pthys [AjPStr*] Sequence string
1204 ** @return [AjBool] ajTrue if a stop was removed.
1205 **
1206 ** @release 2.7.0
1207 ** @@
1208 ******************************************************************************/
1209
seqTypeStopTrimS(AjPStr * pthys)1210 static AjBool seqTypeStopTrimS(AjPStr* pthys)
1211 {
1212 if(strchr(seqCharProtStop,ajStrGetCharLast(*pthys)))
1213 {
1214 ajDebug("Trailing stop removed %c\n", ajStrGetCharLast(*pthys));
1215 ajStrCutEnd(pthys, 1);
1216
1217 return ajTrue;
1218 }
1219
1220 return ajFalse;
1221 }
1222
1223
1224
1225
1226 /* @func ajSeqSetNuc **********************************************************
1227 **
1228 ** Sets a sequence type to "nucleotide"
1229 **
1230 ** @param [u] thys [AjPSeq] Sequence object
1231 ** @return [void]
1232 ** @category modify [AjPSeq] Sets sequence to be nucleotide
1233 **
1234 ** @release 1.0.0
1235 ** @@
1236 ******************************************************************************/
1237
void ajSeqSetNuc(AjPSeq thys)
{
    /* nothing is touched when the type is already "N" */
    if(!ajStrMatchC(thys->Type, "N"))
    {
        ajStrAssignC(&thys->Type, "N");

        /* an attached feature table is switched as well */
        if(thys->Fttable)
            ajFeattableSetNuc(thys->Fttable);

        /* set N as the ambiguity code: x/X become n/N */
        ajStrExchangeSetCC(&thys->Seq, "xX", "nN");
    }

    return;
}
1255
1256
1257
1258
1259 /* @func ajSeqSetProt *********************************************************
1260 **
1261 ** Sets a sequence type to "protein"
1262 **
1263 ** @param [u] thys [AjPSeq] Sequence object
1264 ** @return [void]
1265 ** @category modify [AjPSeq] Sets sequence to be protein
1266 **
1267 ** @release 1.0.0
1268 ** @@
1269 ******************************************************************************/
1270
void ajSeqSetProt(AjPSeq thys)
{
    /* nothing is touched when the type is already "P" */
    if(!ajStrMatchC(thys->Type, "P"))
    {
        ajStrAssignC(&thys->Type, "P");

        /* an attached feature table is switched as well */
        if(thys->Fttable)
            ajFeattableSetProt(thys->Fttable);
    }

    return;
}
1283
1284
1285
1286
1287 /* @func ajSeqsetSetNuc *******************************************************
1288 **
1289 ** Sets a sequence set type to "nucleotide"
1290 **
1291 ** @param [u] thys [AjPSeqset] Sequence set object
1292 ** @return [void]
1293 **
1294 ** @release 2.8.0
1295 ** @@
1296 ******************************************************************************/
1297
void ajSeqsetSetNuc(AjPSeqset thys)
{
    const char* typecode = "N";   /* type code stored for nucleotide sets */

    ajStrAssignC(&thys->Type, typecode);
}
1304
1305
1306
1307
1308 /* @func ajSeqsetSetProt ******************************************************
1309 **
1310 ** Sets a sequence set type to "protein"
1311 **
1312 ** @param [u] thys [AjPSeqset] Sequence set object
1313 ** @return [void]
1314 **
1315 ** @release 2.8.0
1316 ** @@
1317 ******************************************************************************/
1318
void ajSeqsetSetProt(AjPSeqset thys)
{
    const char* typecode = "P";   /* type code stored for protein sets */

    ajStrAssignC(&thys->Type, typecode);
}
1325
1326
1327
1328
1329 /* @func ajSeqType ************************************************************
1330 **
1331 ** Sets the type of a sequence if it has not yet been defined.
1332 **
1333 ** @param [u] thys [AjPSeq] Sequence object
1334 ** @return [void]
1335 ** @category modify [AjPSeq] Sets the sequence type
1336 **
1337 ** @release 1.0.0
1338 ** @@
1339 ******************************************************************************/
1340
void ajSeqType(AjPSeq thys)
{
    ajDebug("ajSeqType current: %S\n", thys->Type);

    /* a type that is already set is left alone */
    if(ajStrGetLen(thys->Type))
        return;

    /* nucleotide is tested first; protein only if that test fails */
    if(ajSeqIsNuc(thys))
    {
        ajSeqSetNuc(thys);
        ajDebug("ajSeqType nucleotide: %S\n", thys->Type);
    }
    else if(ajSeqIsProt(thys))
    {
        ajSeqSetProt(thys);
        ajDebug("ajSeqType protein: %S\n", thys->Type);
    }
    else
    {
        /* neither test succeeded: the type stays unset */
        ajDebug("ajSeqType unknown: %S\n", thys->Type);
    }

    return;
}
1368
1369
1370
1371
1372 /* @func ajSeqPrintType *******************************************************
1373 **
1374 ** Prints the seqType definitions.
1375 ** For EMBOSS entrails output
1376 **
1377 ** @param [u] outf [AjPFile] Output file
1378 ** @param [r] full [AjBool] Full output
1379 ** @return [void]
1380 **
1381 ** @release 2.5.0
1382 ******************************************************************************/
1383
void ajSeqPrintType(AjPFile outf, AjBool full)
{
    ajuint i;
    ajuint tmplen;
    AjPStr tmpstr = NULL;
    ajuint maxtmp = 0;     /* longest quoted From/To string printed */

    const char* typeName[] = {"ANY", "NUC", "PRO"};


    (void) full;           /* make used - no extra detail reported */

    ajFmtPrintF(outf, "\n# Sequence types\n");
    ajFmtPrintF(outf, "# Name Gap Ambig N/P "
                "From To Description\n");
    ajFmtPrintF(outf, "seqType {\n");

    /* one output line per entry of the seqType table */
    for(i=0; seqType[i].Name; i++)
    {
        if(seqType[i].ConvertFrom)
        {
            ajFmtPrintF(outf, " %-20s %3B %3B %3s",
                        seqType[i].Name, seqType[i].Gaps,
                        seqType[i].Ambig, typeName[seqType[i].Type]);

            /* quoted ConvertFrom, tracking the widest value seen */
            ajFmtPrintS(&tmpstr, "\"%s\"", seqType[i].ConvertFrom);
            tmplen = (ajuint) ajStrGetLen(tmpstr);

            if(tmplen > maxtmp)
                maxtmp = tmplen;

            ajFmtPrintF(outf, " %-8S", tmpstr);

            /* quoted ConvertTo, tracked the same way */
            ajFmtPrintS(&tmpstr, "\"%s\"", seqType[i].ConvertTo);
            tmplen = (ajuint) ajStrGetLen(tmpstr);

            if(tmplen > maxtmp)
                maxtmp = tmplen;

            ajFmtPrintF(outf, " %-8S", tmpstr);
            ajFmtPrintF(outf, " \"%s\"\n", seqType[i].Desc);
        }
        else
        {
            /* no conversion defined: print empty From and To strings */
            ajFmtPrintF(outf, " %-20s %3B %3B %s \"\" \"\" "
                        "\"%s\"\n",
                        seqType[i].Name, seqType[i].Gaps,
                        seqType[i].Ambig, typeName[seqType[i].Type],
                        seqType[i].Desc);
        }
    }

    ajFmtPrintF(outf, "}\n");

    /* the From/To columns are printed 8 wide; warn if a value overflows */
    if(maxtmp > 8)
        ajWarn("ajSeqPrintType max tmpstr len %u", maxtmp);

    ajStrDel(&tmpstr);

    return;
}
1435
1436
1437
1438
1439 /* @funcstatic seqTypeTest ****************************************************
1440 **
1441 ** Checks sequence contains only expected characters.
1442 **
1443 ** Returns an invalid character for failure, or a null character for success.
1444 **
1445 ** @param [r] thys [const AjPStr] Sequence string
1446 ** @param [u] badchars [AjPRegexp] Regular expression for
1447 ** sequence characters disallowed
1448 ** @return [char] invalid character if any.
1449 **
1450 ** @release 2.7.0
1451 ******************************************************************************/
1452
static char seqTypeTest(const AjPStr thys, AjPRegexp badchars)
{
    AjPStr matched = NULL;
    char badc = '\0';

    /*
    ** An empty sequence passes without running the expression;
    ** a sequence the expression does not match passes as well.
    */
    if(!ajStrGetLen(thys) || !ajRegExec(badchars, thys))
        return badc;

    /* first character of subexpression 1 is the reported character */
    ajRegSubI(badchars, 1, &matched);
    badc = ajStrGetCharFirst(matched);

    ajDebug("seqTypeTest, Sequence had bad character '%c' (%x) "
            "at %d of %d/%d\n '%S'\n",
            badc, badc,
            ajRegOffset(badchars),
            ajStrGetLen(thys), strlen(ajStrGetPtr(thys)), matched);

    ajStrDel(&matched);

    return badc;
}
1477
1478
1479
1480
1481 /* @funcstatic seqTypeTestS ***************************************************
1482 **
1483 ** Checks sequence contains only expected characters.
1484 **
1485 ** Returns an invalid character for failure, or a null character for success.
1486 **
1487 ** @param [r] thys [const AjPStr] Sequence string
1488 ** @param [r] goodchars [const AjPStr] String of
1489 ** sequence characters allowed
1490 ** @return [char] invalid character if any.
1491 **
1492 ** @release 4.1.0
1493 ******************************************************************************/
1494
seqTypeTestS(const AjPStr thys,const AjPStr goodchars)1495 static char seqTypeTestS(const AjPStr thys, const AjPStr goodchars)
1496 {
1497 char ret = '\0';
1498 ajlong i;
1499
1500 if(!ajStrGetLen(thys))
1501 return ret;
1502
1503 ajDebug("seqTypeTestS, len %d goodchars '%S'\n",
1504 ajStrGetLen(thys), goodchars);
1505
1506 if(ajStrIsCharsetCaseS(thys, goodchars))
1507 return ret;
1508
1509 i = ajStrFindRestCaseS(thys, goodchars);
1510
1511 if (i < 0)
1512 return ret;
1513
1514 ret = ajStrGetCharPos(thys, (size_t) i);
1515
1516 ajDebug("seqTypeTest, Sequence had bad character '%c' (%x) "
1517 "at %d of %d/%d\n",
1518 ret, ret,
1519 i,
1520 ajStrGetLen(thys), strlen(ajStrGetPtr(thys)));
1521
1522
1523 return ret;
1524 }
1525
1526
1527
1528
1529 /* @funcstatic seqTypeCharAny *************************************************
1530 **
1531 ** Returns regular expression to test for type Any
1532 **
1533 ** @return [AjPRegexp] valid characters
1534 **
1535 ** @release 2.7.0
1536 ******************************************************************************/
1537
seqTypeCharAny(void)1538 static AjPRegexp seqTypeCharAny(void)
1539 {
1540 AjPStr regstr = NULL;
1541 AjPStr tmpstr = NULL;
1542
1543 if(!seqtypeRegAny)
1544 {
1545 regstr = ajStrNewRes(256);
1546 tmpstr = ajStrNewS(seqTypeStrAny());
1547
1548 ajStrAppendC(®str, "([^");
1549 ajStrKeepSetAlpha(&tmpstr);
1550 ajStrFmtLower(&tmpstr);
1551 ajStrAppendS(®str, tmpstr);
1552 ajStrAppendS(®str, seqTypeStrAny());
1553 ajStrAppendC(®str, "+])");
1554
1555 seqtypeRegAny = ajRegComp(regstr);
1556
1557 ajStrDel(®str);
1558 ajStrDel(&tmpstr);
1559 }
1560
1561 return seqtypeRegAny;
1562 }
1563
1564
1565
1566
1567 /* @funcstatic seqTypeCharAnyGap **********************************************
1568 **
1569 ** Returns regular expression to test for type Any with gaps
1570 **
1571 ** @return [AjPRegexp] valid characters
1572 **
1573 ** @release 2.7.0
1574 ******************************************************************************/
1575
seqTypeCharAnyGap(void)1576 static AjPRegexp seqTypeCharAnyGap(void)
1577 {
1578 AjPStr regstr = NULL;
1579 AjPStr tmpstr = NULL;
1580
1581 if(!seqtypeRegAnyGap)
1582 {
1583 regstr = ajStrNewRes(256);
1584 tmpstr = ajStrNewS(seqTypeStrAnyGap());
1585
1586 ajStrAppendC(®str, "([^");
1587 ajStrAppendS(®str, tmpstr);
1588 ajStrKeepSetAlpha(&tmpstr);
1589 ajStrFmtLower(&tmpstr);
1590 ajStrAppendS(®str, tmpstr);
1591 ajStrAppendC(®str, "+])");
1592
1593 seqtypeRegAnyGap = ajRegComp(regstr);
1594
1595 ajStrDel(®str);
1596 ajStrDel(&tmpstr);
1597 }
1598
1599 return seqtypeRegAnyGap;
1600 }
1601
1602
1603
1604
1605 /* @funcstatic seqTypeCharNuc *************************************************
1606 **
1607 ** Returns regular expression to test for nucleotide bases
1608 **
1609 ** @return [AjPRegexp] valid characters
1610 **
1611 ** @release 2.7.0
1612 ******************************************************************************/
1613
seqTypeCharNuc(void)1614 static AjPRegexp seqTypeCharNuc(void)
1615 {
1616 AjPStr regstr = NULL;
1617 AjPStr tmpstr = NULL;
1618
1619 if(!seqtypeRegNuc)
1620 {
1621 regstr = ajStrNewRes(256);
1622 tmpstr = ajStrNewS(seqTypeStrNuc());
1623
1624 ajStrAppendC(®str, "([^");
1625 ajStrKeepSetAlpha(&tmpstr);
1626 ajStrFmtLower(&tmpstr);
1627 ajStrAppendS(®str, tmpstr);
1628 ajStrAppendS(®str, seqTypeStrNuc());
1629 ajStrAppendC(®str, "+])");
1630
1631 seqtypeRegNuc = ajRegComp(regstr);
1632
1633 ajStrDel(®str);
1634 ajStrDel(&tmpstr);
1635 }
1636
1637 return seqtypeRegNuc;
1638 }
1639
1640
1641
1642
1643 /* @funcstatic seqTypeCharNucGap **********************************************
1644 **
1645 ** Returns regular expression to test for nucleotide bases with gaps
1646 **
1647 ** @return [AjPRegexp] valid characters
1648 **
1649 ** @release 2.7.0
1650 ******************************************************************************/
1651
seqTypeCharNucGap(void)1652 static AjPRegexp seqTypeCharNucGap(void)
1653 {
1654 AjPStr regstr = NULL;
1655 AjPStr tmpstr = NULL;
1656
1657 if(!seqtypeRegNucGap)
1658 {
1659 regstr = ajStrNewRes(256);
1660 tmpstr = ajStrNewS(seqTypeStrNucGap());
1661
1662 ajStrAppendC(®str, "([^");
1663 ajStrAppendS(®str, tmpstr);
1664 ajStrKeepSetAlpha(&tmpstr);
1665 ajStrFmtLower(&tmpstr);
1666 ajStrAppendS(®str, tmpstr);
1667 ajStrAppendC(®str, "+])");
1668
1669 seqtypeRegNucGap = ajRegComp(regstr);
1670
1671 ajStrDel(®str);
1672 ajStrDel(&tmpstr);
1673 }
1674
1675 return seqtypeRegNucGap;
1676 }
1677
1678
1679
1680
1681 /* @funcstatic seqTypeCharNucGapPhylo *****************************************
1682 **
1683 ** Returns regular expression to test for nucleotide bases with gaps
1684 ** and queries
1685 **
1686 ** @return [AjPRegexp] valid characters
1687 **
1688 ** @release 2.9.0
1689 ******************************************************************************/
1690
seqTypeCharNucGapPhylo(void)1691 static AjPRegexp seqTypeCharNucGapPhylo(void)
1692 {
1693 AjPStr regstr = NULL;
1694 AjPStr tmpstr = NULL;
1695
1696 if(!seqtypeRegNucGapPhylo)
1697 {
1698 regstr = ajStrNewRes(256);
1699 tmpstr = ajStrNewS(seqTypeStrNucGapPhylo());
1700
1701 ajStrAppendC(®str, "([^");
1702 ajStrKeepSetAlpha(&tmpstr);
1703 ajStrFmtLower(&tmpstr);
1704 ajStrAppendS(®str, tmpstr);
1705 ajStrAppendS(®str, seqTypeStrNucGapPhylo());
1706 ajStrAppendC(®str, "+])");
1707
1708 seqtypeRegNucGapPhylo = ajRegComp(regstr);
1709
1710 ajStrDel(®str);
1711 ajStrDel(&tmpstr);
1712 }
1713
1714 return seqtypeRegNucGapPhylo;
1715 }
1716
1717
1718
1719
1720 /* @funcstatic seqTypeCharNucPure *********************************************
1721 **
1722 ** Returns regular expression to test for nucleotide bases
1723 ** with no ambiguity
1724 **
1725 ** @return [AjPRegexp] valid characters
1726 **
1727 ** @release 2.7.0
1728 ******************************************************************************/
1729
seqTypeCharNucPure(void)1730 static AjPRegexp seqTypeCharNucPure(void)
1731 {
1732 AjPStr regstr = NULL;
1733 AjPStr tmpstr = NULL;
1734
1735 if(!seqtypeRegNucPure)
1736 {
1737 regstr = ajStrNewRes(256);
1738 tmpstr = ajStrNewS(seqTypeStrNucPure());
1739
1740 ajStrAppendC(®str, "([^");
1741 ajStrKeepSetAlpha(&tmpstr);
1742 ajStrFmtLower(&tmpstr);
1743 ajStrAppendS(®str, tmpstr);
1744 ajStrAppendS(®str, seqTypeStrNucPure());
1745 ajStrAppendC(®str, "+])");
1746
1747 seqtypeRegNucPure = ajRegComp(regstr);
1748
1749 ajStrDel(®str);
1750 ajStrDel(&tmpstr);
1751 }
1752
1753 return seqtypeRegNucPure;
1754 }
1755
1756
1757
1758
1759 /* @funcstatic seqTypeCharProt ************************************************
1760 **
1761 ** Returns regular expression to test for protein residues
1762 **
1763 ** @return [AjPRegexp] valid characters
1764 **
1765 ** @release 2.7.0
1766 ******************************************************************************/
1767
seqTypeCharProt(void)1768 static AjPRegexp seqTypeCharProt(void)
1769 {
1770 AjPStr regstr = NULL;
1771 AjPStr tmpstr = NULL;
1772
1773 if(!seqtypeRegProt)
1774 {
1775 regstr = ajStrNewRes(256);
1776 tmpstr = ajStrNewS(seqTypeStrProt());
1777
1778 ajStrAppendC(®str, "([^");
1779 ajStrKeepSetAlpha(&tmpstr);
1780 ajStrFmtLower(&tmpstr);
1781 ajStrAppendS(®str, tmpstr);
1782 ajStrAppendS(®str, seqTypeStrProt());
1783 ajStrAppendC(®str, "+])");
1784
1785 seqtypeRegProt = ajRegComp(regstr);
1786
1787 ajStrDel(®str);
1788 ajStrDel(&tmpstr);
1789 }
1790
1791 return seqtypeRegProt;
1792 }
1793
1794
1795
1796
1797 /* @funcstatic seqTypeCharProtGap *********************************************
1798 **
1799 ** Returns regular expression to test for protein residues or gaps
1800 **
1801 ** @return [AjPRegexp] valid characters
1802 **
1803 ** @release 2.7.0
1804 ******************************************************************************/
1805
seqTypeCharProtGap(void)1806 static AjPRegexp seqTypeCharProtGap(void)
1807 {
1808 AjPStr regstr = NULL;
1809 AjPStr tmpstr = NULL;
1810
1811 if(!seqtypeRegProtGap)
1812 {
1813 regstr = ajStrNewRes(256);
1814 tmpstr = ajStrNewS(seqTypeStrProtGap());
1815
1816 ajStrAppendC(®str, "([^");
1817 ajStrKeepSetAlpha(&tmpstr);
1818 ajStrFmtLower(&tmpstr);
1819 ajStrAppendS(®str, tmpstr);
1820 ajStrAppendS(®str, seqTypeStrProtGap());
1821 ajStrAppendC(®str, "+])");
1822
1823 seqtypeRegProtGap = ajRegComp(regstr);
1824
1825 ajStrDel(®str);
1826 ajStrDel(&tmpstr);
1827 }
1828
1829 return seqtypeRegProtGap;
1830 }
1831
1832
1833
1834
1835 /* @funcstatic seqTypeCharProtGapPhylo ****************************************
1836 **
1837 ** Returns regular expression to test for protein residues or gaps
1838 ** stops and queries
1839 **
1840 ** @return [AjPRegexp] valid characters
1841 **
1842 ** @release 2.9.0
1843 ******************************************************************************/
1844
seqTypeCharProtGapPhylo(void)1845 static AjPRegexp seqTypeCharProtGapPhylo(void)
1846 {
1847 AjPStr regstr = NULL;
1848 AjPStr tmpstr = NULL;
1849
1850 if(!seqtypeRegProtGapPhylo)
1851 {
1852 regstr = ajStrNewRes(256);
1853 tmpstr = ajStrNewS(seqTypeStrProtGapPhylo());
1854
1855 ajStrAppendC(®str, "([^");
1856 ajStrKeepSetAlpha(&tmpstr);
1857 ajStrFmtLower(&tmpstr);
1858 ajStrAppendS(®str, tmpstr);
1859 ajStrAppendS(®str, seqTypeStrProtGapPhylo());
1860 ajStrAppendC(®str, "+])");
1861
1862 seqtypeRegProtGapPhylo = ajRegComp(regstr);
1863
1864 ajStrDel(®str);
1865 ajStrDel(&tmpstr);
1866 }
1867
1868 return seqtypeRegProtGapPhylo;
1869 }
1870
1871
1872
1873
1874 /* @funcstatic seqTypeCharProtPure ********************************************
1875 **
1876 ** Returns regular expression to test for protein residues
1877 ** with no ambiguity
1878 **
1879 ** @return [AjPRegexp] valid characters
1880 **
1881 ** @release 2.7.0
1882 ******************************************************************************/
1883
seqTypeCharProtPure(void)1884 static AjPRegexp seqTypeCharProtPure(void)
1885 {
1886 AjPStr regstr = NULL;
1887 AjPStr tmpstr = NULL;
1888
1889 if(!seqtypeRegProtPure)
1890 {
1891 regstr = ajStrNewRes(256);
1892 tmpstr = ajStrNewS(seqTypeStrProtPure());
1893
1894 ajStrAppendC(®str, "([^");
1895 ajStrKeepSetAlpha(&tmpstr);
1896 ajStrFmtLower(&tmpstr);
1897 ajStrAppendS(®str, tmpstr);
1898 ajStrAppendS(®str, seqTypeStrProtPure());
1899 ajStrAppendC(®str, "+])");
1900
1901 seqtypeRegProtPure = ajRegComp(regstr);
1902
1903 ajStrDel(®str);
1904 ajStrDel(&tmpstr);
1905 }
1906
1907 return seqtypeRegProtPure;
1908 }
1909
1910
1911
1912
1913 /* @funcstatic seqTypeCharProtStop ********************************************
1914 **
1915 ** Returns regular expression to test for protein residues or stop codons
1916 **
1917 ** @return [AjPRegexp] valid characters
1918 **
1919 ** @release 2.7.0
1920 ******************************************************************************/
1921
seqTypeCharProtStop(void)1922 static AjPRegexp seqTypeCharProtStop(void)
1923 {
1924 AjPStr regstr = NULL;
1925 AjPStr tmpstr = NULL;
1926
1927 if(!seqtypeRegProtStop)
1928 {
1929 regstr = ajStrNewRes(256);
1930 tmpstr = ajStrNewS(seqTypeStrProtStop());
1931
1932 ajStrAppendC(®str, "([^");
1933 ajStrKeepSetAlpha(&tmpstr);
1934 ajStrFmtLower(&tmpstr);
1935 ajStrAppendS(®str, tmpstr);
1936 ajStrAppendS(®str, seqTypeStrProtStop());
1937 ajStrAppendC(®str, "+])");
1938
1939 seqtypeRegProtStop = ajRegComp(regstr);
1940
1941 ajStrDel(®str);
1942 ajStrDel(&tmpstr);
1943 }
1944
1945 return seqtypeRegProtStop;
1946 }
1947
1948
1949
1950
1951 /* @funcstatic seqTypeCharProtStopGap *****************************************
1952 **
1953 ** Returns regular expression to test for protein residues or stop codons
1954 ** or gap characters
1955 **
1956 ** @return [AjPRegexp] valid characters
1957 **
1958 ** @release 4.0.0
1959 ******************************************************************************/
1960
seqTypeCharProtStopGap(void)1961 static AjPRegexp seqTypeCharProtStopGap(void)
1962 {
1963 AjPStr regstr = NULL;
1964 AjPStr tmpstr = NULL;
1965
1966 if(!seqtypeRegProtStopGap)
1967 {
1968 regstr = ajStrNewRes(256);
1969 tmpstr = ajStrNewS(seqTypeStrProtStopGap());
1970
1971 ajStrAppendC(®str, "([^");
1972 ajStrKeepSetAlpha(&tmpstr);
1973 ajStrFmtLower(&tmpstr);
1974 ajStrAppendS(®str, tmpstr);
1975 ajStrAppendS(®str, seqTypeStrProtStopGap());
1976 ajStrAppendC(®str, "+])");
1977
1978 seqtypeRegProtStopGap = ajRegComp(regstr);
1979
1980 ajStrDel(®str);
1981 ajStrDel(&tmpstr);
1982 }
1983
1984 return seqtypeRegProtStopGap;
1985 }
1986
1987
1988
1989
1990 /* @funcstatic seqTypeStrAny **************************************************
1991 **
1992 ** Returns string of valid characters to test for type Any
1993 **
1994 ** @return [AjPStr] valid characters
1995 **
1996 ** @release 4.1.0
1997 ******************************************************************************/
1998
seqTypeStrAny(void)1999 static AjPStr seqTypeStrAny(void)
2000 {
2001 if(!seqtypeCharsetAny)
2002 {
2003 ajFmtPrintS(&seqtypeCharsetAny, "%s%s%s%s%s",
2004 seqCharProtPure,
2005 seqCharProtAmbig,
2006 seqCharProtStop,
2007 seqCharNucPure,
2008 seqCharNucAmbig);
2009 ajStrRemoveDupchar(&seqtypeCharsetAny);
2010 }
2011
2012 return seqtypeCharsetAny;
2013 }
2014
2015
2016
2017
2018 /* @funcstatic seqTypeStrAnyGap ***********************************************
2019 **
2020 ** Returns string of valid characters to test for type Anygap
2021 **
2022 ** @return [AjPStr] valid characters
2023 **
2024 ** @release 4.1.0
2025 ******************************************************************************/
2026
seqTypeStrAnyGap(void)2027 static AjPStr seqTypeStrAnyGap(void)
2028 {
2029 if(!seqtypeCharsetAnyGap)
2030 {
2031 ajFmtPrintS(&seqtypeCharsetAnyGap, "%s%s%s%s%s%s",
2032 seqCharProtPure,
2033 seqCharProtAmbig,
2034 seqCharProtStop,
2035 seqCharNucPure,
2036 seqCharNucAmbig,
2037 seqCharGap);
2038 ajStrRemoveDupchar(&seqtypeCharsetAnyGap);
2039 }
2040
2041 return seqtypeCharsetAnyGap;
2042 }
2043
2044
2045
2046
2047 /* @funcstatic seqTypeStrDnaGap ***********************************************
2048 **
2049 ** Returns string of valid characters to test for type Dnagap
2050 **
2051 ** @return [AjPStr] valid characters
2052 **
2053 ** @release 4.1.0
2054 ******************************************************************************/
2055
seqTypeStrDnaGap(void)2056 static AjPStr seqTypeStrDnaGap(void)
2057 {
2058 if(!seqtypeCharsetDnaGap)
2059 {
2060 ajFmtPrintS(&seqtypeCharsetDnaGap, "%s%s%s",
2061 seqCharNucPure,
2062 seqCharNucAmbig,
2063 seqCharGap);
2064 ajStrRemoveDupchar(&seqtypeCharsetDnaGap);
2065 }
2066
2067 return seqtypeCharsetDnaGap;
2068 }
2069
2070
2071
2072
2073 /* @funcstatic seqTypeStrNuc **************************************************
2074 **
2075 ** Returns string of valid characters to test for type Nuc
2076 **
2077 ** @return [AjPStr] valid characters
2078 **
2079 ** @release 4.1.0
2080 ******************************************************************************/
2081
seqTypeStrNuc(void)2082 static AjPStr seqTypeStrNuc(void)
2083 {
2084 if(!seqtypeCharsetNuc)
2085 {
2086 ajFmtPrintS(&seqtypeCharsetNuc, "%s%s",
2087 seqCharNucPure,
2088 seqCharNucAmbig);
2089 ajStrRemoveDupchar(&seqtypeCharsetNuc);
2090 }
2091
2092 return seqtypeCharsetNuc;
2093 }
2094
2095
2096
2097
2098 /* @funcstatic seqTypeStrNucGap ***********************************************
2099 **
2100 ** Returns string of valid characters to test for type Nucgap
2101 **
2102 ** @return [AjPStr] valid characters
2103 **
2104 ** @release 4.1.0
2105 ******************************************************************************/
2106
seqTypeStrNucGap(void)2107 static AjPStr seqTypeStrNucGap(void)
2108 {
2109 if(!seqtypeCharsetNucGap)
2110 {
2111 ajFmtPrintS(&seqtypeCharsetNucGap, "%s%s%s",
2112 seqCharNucPure,
2113 seqCharNucAmbig,
2114 seqCharGap);
2115 ajStrRemoveDupchar(&seqtypeCharsetNucGap);
2116 }
2117
2118 return seqtypeCharsetNucGap;
2119 }
2120
2121
2122
2123
2124 /* @funcstatic seqTypeStrNucGapPhylo ******************************************
2125 **
2126 ** Returns string of valid characters to test for type Nucgapphylo
2127 **
2128 ** @return [AjPStr] valid characters
2129 **
2130 ** @release 4.1.0
2131 ******************************************************************************/
2132
seqTypeStrNucGapPhylo(void)2133 static AjPStr seqTypeStrNucGapPhylo(void)
2134 {
2135 if(!seqtypeCharsetNucGapPhylo)
2136 {
2137 ajFmtPrintS(&seqtypeCharsetNucGapPhylo, "%s%s%s%s",
2138 seqCharNucPure,
2139 seqCharNucAmbig,
2140 seqCharPhylo,
2141 seqCharGap);
2142 ajStrRemoveDupchar(&seqtypeCharsetNucGapPhylo);
2143 }
2144
2145 return seqtypeCharsetNucGapPhylo;
2146 }
2147
2148
2149
2150
2151 /* @funcstatic seqTypeStrNucPure **********************************************
2152 **
2153 ** Returns string of valid characters to test for type Nucpure
2154 **
2155 ** @return [AjPStr] valid characters
2156 **
2157 ** @release 4.1.0
2158 ******************************************************************************/
2159
seqTypeStrNucPure(void)2160 static AjPStr seqTypeStrNucPure(void)
2161 {
2162 if(!seqtypeCharsetNucPure)
2163 {
2164 ajFmtPrintS(&seqtypeCharsetNucPure, "%s",
2165 seqCharNucPure);
2166 ajStrRemoveDupchar(&seqtypeCharsetNucPure);
2167 }
2168
2169 return seqtypeCharsetNucPure;
2170 }
2171
2172
2173
2174
2175 /* @funcstatic seqTypeStrProt *************************************************
2176 **
2177 ** Returns string of valid characters to test for type Prot
2178 **
2179 ** @return [AjPStr] valid characters
2180 **
2181 ** @release 4.1.0
2182 ******************************************************************************/
2183
seqTypeStrProt(void)2184 static AjPStr seqTypeStrProt(void)
2185 {
2186 if(!seqtypeCharsetProt)
2187 {
2188 ajFmtPrintS(&seqtypeCharsetProt, "%s%s",
2189 seqCharProtPure,
2190 seqCharProtAmbig);
2191 ajStrRemoveDupchar(&seqtypeCharsetProt);
2192 }
2193
2194 return seqtypeCharsetProt;
2195 }
2196
2197
2198
2199
2200 /* @funcstatic seqTypeStrProtAny **********************************************
2201 **
2202 ** Returns string of valid characters to test for type Protany
2203 **
2204 ** @return [AjPStr] valid characters
2205 **
2206 ** @release 4.1.0
2207 ******************************************************************************/
2208
seqTypeStrProtAny(void)2209 static AjPStr seqTypeStrProtAny(void)
2210 {
2211 if(!seqtypeCharsetProtAny)
2212 {
2213 ajFmtPrintS(&seqtypeCharsetProtAny, "%s%s%s%s%s",
2214 seqCharProtPure,
2215 seqCharProtAmbig,
2216 seqCharProtStop,
2217 seqCharPhylo,
2218 seqCharGap);
2219 ajStrRemoveDupchar(&seqtypeCharsetProtAny);
2220 }
2221
2222 return seqtypeCharsetProtAny;
2223 }
2224
2225
2226
2227
2228 /* @funcstatic seqTypeStrProtGap **********************************************
2229 **
2230 ** Returns string of valid characters to test for type Protgap
2231 **
2232 ** @return [AjPStr] valid characters
2233 **
2234 ** @release 4.1.0
2235 ******************************************************************************/
2236
seqTypeStrProtGap(void)2237 static AjPStr seqTypeStrProtGap(void)
2238 {
2239 if(!seqtypeCharsetProtGap)
2240 {
2241 ajFmtPrintS(&seqtypeCharsetProtGap, "%s%s%s",
2242 seqCharProtPure,
2243 seqCharProtAmbig,
2244 seqCharGap);
2245 ajStrRemoveDupchar(&seqtypeCharsetProtGap);
2246 }
2247
2248 return seqtypeCharsetProtGap;
2249 }
2250
2251
2252
2253
2254 /* @funcstatic seqTypeStrProtGapPhylo *****************************************
2255 **
2256 ** Returns string of valid characters to test for type Protgapphylo
2257 **
2258 ** @return [AjPStr] valid characters
2259 **
2260 ** @release 4.1.0
2261 ******************************************************************************/
2262
seqTypeStrProtGapPhylo(void)2263 static AjPStr seqTypeStrProtGapPhylo(void)
2264 {
2265 if(!seqtypeCharsetProtGapPhylo)
2266 {
2267 ajFmtPrintS(&seqtypeCharsetProtGapPhylo, "%s%s%s%s%s",
2268 seqCharProtPure,
2269 seqCharProtAmbig,
2270 seqCharProtStop,
2271 seqCharPhylo,
2272 seqCharGap);
2273 ajStrRemoveDupchar(&seqtypeCharsetProtGapPhylo);
2274 }
2275
2276 return seqtypeCharsetProtGapPhylo;
2277 }
2278
2279
2280
2281
2282 /* @funcstatic seqTypeStrProtPure *********************************************
2283 **
2284 ** Returns string of valid characters to test for type Protpure
2285 **
2286 ** @return [AjPStr] valid characters
2287 **
2288 ** @release 4.1.0
2289 ******************************************************************************/
2290
seqTypeStrProtPure(void)2291 static AjPStr seqTypeStrProtPure(void)
2292 {
2293 if(!seqtypeCharsetProtPure)
2294 {
2295 ajFmtPrintS(&seqtypeCharsetProtPure, "%s",
2296 seqCharProtPure);
2297 ajStrRemoveDupchar(&seqtypeCharsetProtPure);
2298 }
2299
2300 return seqtypeCharsetProtPure;
2301 }
2302
2303
2304
2305
2306 /* @funcstatic seqTypeStrProtStop *********************************************
2307 **
2308 ** Returns string of valid characters to test for type Protstop
2309 **
2310 ** @return [AjPStr] valid characters
2311 **
2312 ** @release 4.1.0
2313 ******************************************************************************/
2314
seqTypeStrProtStop(void)2315 static AjPStr seqTypeStrProtStop(void)
2316 {
2317 if(!seqtypeCharsetProtStop)
2318 {
2319 ajFmtPrintS(&seqtypeCharsetProtStop, "%s%s%s",
2320 seqCharProtPure,
2321 seqCharProtAmbig,
2322 seqCharProtStop);
2323 ajStrRemoveDupchar(&seqtypeCharsetProtStop);
2324 }
2325
2326 return seqtypeCharsetProtStop;
2327 }
2328
2329
2330
2331
2332 /* @funcstatic seqTypeStrProtStopGap ******************************************
2333 **
2334 ** Returns string of valid characters to test for type Protstopgap
2335 **
2336 ** @return [AjPStr] valid characters
2337 **
2338 ** @release 4.1.0
2339 ******************************************************************************/
2340
seqTypeStrProtStopGap(void)2341 static AjPStr seqTypeStrProtStopGap(void)
2342 {
2343 if(!seqtypeCharsetProtStopGap)
2344 {
2345 ajFmtPrintS(&seqtypeCharsetProtStopGap, "%s%s%s%s",
2346 seqCharProtPure,
2347 seqCharProtAmbig,
2348 seqCharProtStop,
2349 seqCharGap);
2350 ajStrRemoveDupchar(&seqtypeCharsetProtStopGap);
2351 }
2352
2353 return seqtypeCharsetProtStopGap;
2354 }
2355
2356
2357
2358
2359 /* @funcstatic seqTypeStrRnaGap ***********************************************
2360 **
2361 ** Returns string of valid characters to test for type Rnagap
2362 **
2363 ** @return [AjPStr] valid characters
2364 **
2365 ** @release 4.1.0
2366 ******************************************************************************/
2367
seqTypeStrRnaGap(void)2368 static AjPStr seqTypeStrRnaGap(void)
2369 {
2370 if(!seqtypeCharsetRnaGap)
2371 {
2372 ajFmtPrintS(&seqtypeCharsetRnaGap, "%s%s",
2373 seqCharNucRna,
2374 seqCharGap);
2375 ajStrRemoveDupchar(&seqtypeCharsetRnaGap);
2376 }
2377
2378 return seqtypeCharsetRnaGap;
2379 }
2380
2381
2382
2383
2384 /* @funcstatic seqFindType ****************************************************
2385 **
2386 ** Returns sequence type index and ajTrue if type was found
2387 **
2388 ** @param [r] type_name [const AjPStr] Sequence type
2389 ** @param [w] typenum [ajint*] Sequence type index
2390 ** @return [AjBool] ajTrue if sequence type was found
2391 **
2392 **
2393 ** @release 2.7.0
2394 ******************************************************************************/
2395
seqFindType(const AjPStr type_name,ajint * typenum)2396 static AjBool seqFindType(const AjPStr type_name, ajint* typenum)
2397 {
2398 ajint i;
2399 ajint itype = -1;
2400
2401 ajStrAssignS(&seqtypeTmpstr, type_name);
2402 ajStrFmtLower(&seqtypeTmpstr);
2403
2404 for(i = 0; seqType[i].Name; i++)
2405 if(ajStrMatchC(seqtypeTmpstr, seqType[i].Name))
2406 {
2407 itype = i;
2408 break;
2409 }
2410
2411 if(itype < 0)
2412 {
2413 *typenum = i;
2414
2415 return ajFalse;
2416 }
2417
2418 *typenum = itype;
2419
2420 return ajTrue;
2421 }
2422
2423
2424
2425
2426 /* @func ajSeqTypeIsProt ******************************************************
2427 **
2428 ** Returns ajTrue is sequence type can be a protein (or 'any')
2429 **
2430 ** @param [r] type_name [const AjPStr] Sequence type
2431 ** @return [AjBool] ajTrue if sequence can be protein
2432 **
2433 **
2434 ** @release 2.7.0
2435 ******************************************************************************/
2436
ajSeqTypeIsProt(const AjPStr type_name)2437 AjBool ajSeqTypeIsProt(const AjPStr type_name)
2438 {
2439 ajint itype;
2440
2441 if(seqFindType(type_name, &itype))
2442 switch(seqType[itype].Type)
2443 {
2444 case ISNUC:
2445 return ajFalse;
2446 default:
2447 return ajTrue;
2448 }
2449
2450 return ajFalse;
2451 }
2452
2453
2454
2455
2456 /* @func ajSeqTypeIsNuc *******************************************************
2457 **
2458 ** Returns ajTrue is sequence type can be a nucleotide (or 'any')
2459 **
2460 ** @param [r] type_name [const AjPStr] Sequence type
2461 ** @return [AjBool] ajTrue if sequence can be nucleotide
2462 **
2463 **
2464 ** @release 2.7.0
2465 ******************************************************************************/
2466
ajSeqTypeIsNuc(const AjPStr type_name)2467 AjBool ajSeqTypeIsNuc(const AjPStr type_name)
2468 {
2469 ajint itype;
2470
2471 if(seqFindType(type_name, &itype))
2472 switch(seqType[itype].Type)
2473 {
2474 case ISPROT:
2475 return ajFalse;
2476
2477 default:
2478 return ajTrue;
2479 }
2480
2481 return ajFalse;
2482 }
2483
2484
2485
2486
2487 /* @func ajSeqTypeIsAny *******************************************************
2488 **
2489 ** Returns ajTrue is sequence type can be a protein or nucleotide
2490 **
2491 ** @param [r] type_name [const AjPStr] Sequence type
2492 ** @return [AjBool] ajTrue if sequence can be protein or nucleotide
2493 **
2494 **
2495 ** @release 2.7.0
2496 ******************************************************************************/
2497
ajSeqTypeIsAny(const AjPStr type_name)2498 AjBool ajSeqTypeIsAny(const AjPStr type_name)
2499 {
2500 ajint itype;
2501
2502 if(seqFindType(type_name, &itype))
2503 switch(seqType[itype].Type)
2504 {
2505 case ISNUC:
2506 return ajFalse;
2507
2508 case ISPROT:
2509 return ajFalse;
2510
2511 default:
2512 return ajTrue;
2513 }
2514
2515 return ajFalse;
2516 }
2517
2518
2519
2520
2521 /* @func ajSeqTypeSummary *****************************************************
2522 **
2523 ** Returns ajTrue is sequence type can be a protein or nucleotide
2524 **
2525 ** @param [r] type_name [const AjPStr] Sequence type
2526 ** @param [w] Ptype [AjPStr*] Sequence type 'protein' 'nucleotide' or 'any'
2527 ** @param [w] gaps [AjBool*] True if gap characters are preserved
2528 ** @return [AjBool] ajTrue if sequence can be protein or nucleotide
2529 **
2530 **
2531 ** @release 4.0.0
2532 ******************************************************************************/
2533
ajSeqTypeSummary(const AjPStr type_name,AjPStr * Ptype,AjBool * gaps)2534 AjBool ajSeqTypeSummary(const AjPStr type_name, AjPStr* Ptype, AjBool* gaps)
2535 {
2536 ajint itype;
2537
2538 if(seqFindType(type_name, &itype))
2539 {
2540 *gaps = seqType[itype].Gaps;
2541
2542 switch(seqType[itype].Type)
2543 {
2544 case ISNUC:
2545 ajStrAssignC(Ptype, "nucleotide");
2546 break;
2547 case ISPROT:
2548 ajStrAssignC(Ptype, "protein");
2549 break;
2550 default:
2551 ajStrAssignClear(Ptype);
2552 break;
2553 }
2554 return ajTrue;
2555 }
2556
2557 return ajFalse;
2558 }
2559
2560
2561
2562
2563 /* @func ajSeqTypeExit ********************************************************
2564 **
2565 ** Cleans up sequence type processing internal memory
2566 **
2567 ** @return [void]
2568 **
2569 ** @release 4.0.0
2570 ** @@
2571 ******************************************************************************/
2572
ajSeqTypeExit(void)2573 void ajSeqTypeExit(void)
2574 {
2575 ajuint i;
2576
2577 ajStrDel(&seqtypeTmpstr);
2578
2579 ajRegFree(&seqtypeRegAny);
2580 ajRegFree(&seqtypeRegAnyGap);
2581 ajRegFree(&seqtypeRegDnaGap);
2582 ajRegFree(&seqtypeRegNuc);
2583 ajRegFree(&seqtypeRegNucGap);
2584 ajRegFree(&seqtypeRegNucPure);
2585 ajRegFree(&seqtypeRegProt);
2586 ajRegFree(&seqtypeRegProtAny);
2587 ajRegFree(&seqtypeRegProtGap);
2588 ajRegFree(&seqtypeRegProtPure);
2589 ajRegFree(&seqtypeRegProtStop);
2590 ajRegFree(&seqtypeRegRnaGap);
2591
2592 ajStrDel(&seqtypeCharsetAny);
2593 ajStrDel(&seqtypeCharsetAnyGap);
2594 ajStrDel(&seqtypeCharsetDnaGap);
2595 ajStrDel(&seqtypeCharsetNuc);
2596 ajStrDel(&seqtypeCharsetNucGap);
2597 ajStrDel(&seqtypeCharsetNucGapPhylo);
2598 ajStrDel(&seqtypeCharsetNucPure);
2599 ajStrDel(&seqtypeCharsetProt);
2600 ajStrDel(&seqtypeCharsetProtAny);
2601 ajStrDel(&seqtypeCharsetProtGap);
2602 ajStrDel(&seqtypeCharsetProtPure);
2603 ajStrDel(&seqtypeCharsetProtStop);
2604 ajStrDel(&seqtypeCharsetProtStopGap);
2605 ajStrDel(&seqtypeCharsetRnaGap);
2606
2607 ajCharDel(&seqNewGapChars);
2608
2609 for(i=0;seqType[i].Name;i++)
2610 {
2611 if(seqType[i].Filter)
2612 AJFREE(seqType[i].Filter);
2613 }
2614
2615 return;
2616 }
2617
2618
2619
2620
2621 /* @func ajSeqTypeUnused ******************************************************
2622 **
2623 ** Dummy function to catch all unused functions defined in the ajseqtype
2624 ** source file.
2625 **
2626 ** @return [void]
2627 **
2628 **
2629 ** @release 4.1.0
2630 ******************************************************************************/
2631
ajSeqTypeUnused(void)2632 void ajSeqTypeUnused(void)
2633 {
2634 AjPStr ajpstr=NULL;
2635 AjPRegexp ajpregexp = NULL;
2636
2637 seqTypeTest(ajpstr, ajpregexp);
2638
2639 return;
2640 }
2641