1 /* @source ajseqtype **********************************************************
2 **
3 ** AJAX seqtype functions
4 **
5 ** @author Copyright (C) 2002 Peter Rice
6 ** @version $Revision: 1.84 $
7 ** @modified 2002-2011 Peter Rice
8 ** @modified $Date: 2013/06/29 22:30:31 $ by $Author: rice $
9 ** @@
10 **
11 ** This library is free software; you can redistribute it and/or
12 ** modify it under the terms of the GNU Lesser General Public
13 ** License as published by the Free Software Foundation; either
14 ** version 2.1 of the License, or (at your option) any later version.
15 **
16 ** This library is distributed in the hope that it will be useful,
17 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 ** Lesser General Public License for more details.
20 **
21 ** You should have received a copy of the GNU Lesser General Public
22 ** License along with this library; if not, write to the Free Software
23 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
24 ** MA  02110-1301,  USA.
25 **
26 ******************************************************************************/
27 
28 #include "ajlib.h"
29 
30 #include "ajseqtype.h"
31 #include "ajseq.h"
32 #include "ajfeat.h"
33 #include "ajfile.h"
34 #include "ajreg.h"
35 
36 
37 
38 
/* @datastatic SeqPType *******************************************************
**
** Sequence types data structure, used to test input sequence against
** a defined sequence type
**
** Each entry of the static seqType[] table in this file is one of these
** objects; the table ends with an entry whose Name is NULL.
**
** @alias SeqSType
** @alias SeqOType
**
** @attr Name [const char*] sequence type name
** @attr Gaps [AjBool] allow gap characters
** @attr Ambig [AjBool] True if ambiguity codes are allowed
** @attr Type [ajuint] enumerated ISANY=0 ISNUC=1 ISPROT=2
** @attr Padding [ajint] Padding to alignment boundary
** @attr ConvertFrom [const char*] Convert each of these characters to the
**                           ConvertTo equivalent
** @attr ConvertTo [const char*] Equivalent for each sequence character in
**                         ConvertFrom
** @attr Badchars [AjPRegexp function] Test function, takes no arguments
**                                     and returns a regular expression
** @attr Goodchars [AjPStr function] Test function, takes no arguments and
**                                   returns the string of characters
**                                   accepted for this type
** @attr Filter [char*] Filter for character testing. NULL in the static
**                      table; ajSeqTypeCheckIn fills it on first use from
**                      Goodchars with ajStrGetfilterCase
** @attr Desc [const char*] Description for documentation purposes
** @@
******************************************************************************/

typedef struct SeqSType
{
    const char *Name;               /* type name as given by the caller */
    AjBool Gaps;                    /* true: keep gaps, false: remove them */
    AjBool Ambig;                   /* true: ambiguity codes are allowed */
    ajuint Type;                    /* ISANY, ISNUC or ISPROT */
    ajint Padding;                  /* alignment padding only */
    const char *ConvertFrom;        /* characters to be replaced ... */
    const char *ConvertTo;          /* ... and their replacements */
    AjPRegexp (*Badchars) (void);   /* regular expression provider */
    AjPStr (*Goodchars) (void);     /* accepted character set provider */
    char *Filter;                   /* filled lazily, see ajSeqTypeCheckIn */
    const char *Desc;               /* text used in messages */
} SeqOType;

/*
** The pointer alias is a macro rather than a typedef, so in a declaration
** such as "SeqPType a, b;" only the first name would be a pointer.
*/
#define SeqPType SeqOType*
79 
80 
81 
82 
/* Molecule class codes stored in the Type attribute of SeqOType */
enum ProtNuc {ISANY=0, ISNUC=1, ISPROT=2};

/* File-scope work variables; their use is outside this part of the file */
static char* seqNewGapChars = NULL;

static AjPStr seqtypeTmpstr = NULL;

/*
** gaps only allowed if it says so
** gap conversion is a separate attribute, along with case conversion
*/

/* Internal helper functions */
static AjBool     seqFindType(const AjPStr type_name, ajint* typenum);
static void       seqGapSL(AjPStr* seq, char gapc, char padc, ajuint ilen);
static AjBool     seqTypeFix(AjPSeq thys, ajint itype);
static AjBool     seqTypeFixReg(AjPSeq thys, ajint itype, char fixchar);
static void       seqTypeSet(AjPSeq thys, const AjPStr Type);
static AjBool     seqTypeStopTrimS(AjPStr* pthys);
static char       seqTypeTest(const AjPStr thys, AjPRegexp badchars);
static AjBool     seqTypeTestI(AjPSeq thys, ajint itype);
static char       seqTypeTestS(const AjPStr thys, const AjPStr goodchars);

/*
** Regular expression providers, one per character class.
** These are the functions named in the Badchars column of seqType[].
*/
static AjPRegexp  seqTypeCharAny(void);
static AjPRegexp  seqTypeCharAnyGap(void);
static AjPRegexp  seqTypeCharNuc(void);
static AjPRegexp  seqTypeCharNucGap(void);
static AjPRegexp  seqTypeCharNucGapPhylo(void);
static AjPRegexp  seqTypeCharNucPure(void);
static AjPRegexp  seqTypeCharProt(void);
static AjPRegexp  seqTypeCharProtGap(void);
static AjPRegexp  seqTypeCharProtGapPhylo(void);
static AjPRegexp  seqTypeCharProtPure(void);
static AjPRegexp  seqTypeCharProtStop(void);
static AjPRegexp  seqTypeCharProtStopGap(void);

/*
** Accepted character set providers, one per character class.
** These are the functions named in the Goodchars column of seqType[]
** and called directly by the ajSeqType...S test functions.
*/
static AjPStr  seqTypeStrAny(void);
static AjPStr  seqTypeStrAnyGap(void);
static AjPStr  seqTypeStrDnaGap(void);
static AjPStr  seqTypeStrNuc(void);
static AjPStr  seqTypeStrNucGap(void);
static AjPStr  seqTypeStrNucGapPhylo(void);
static AjPStr  seqTypeStrNucPure(void);
static AjPStr  seqTypeStrProt(void);
static AjPStr  seqTypeStrProtAny(void);
static AjPStr  seqTypeStrProtGap(void);
static AjPStr  seqTypeStrProtGapPhylo(void);
static AjPStr  seqTypeStrProtPure(void);
static AjPStr  seqTypeStrProtStop(void);
static AjPStr  seqTypeStrProtStopGap(void);
static AjPStr  seqTypeStrRnaGap(void);

/*
** Regular expression objects, all starting as NULL.
** NOTE(review): presumably filled on first use by the seqTypeChar...
** functions above - confirm against their definitions.
*/
static AjPRegexp seqtypeRegAny          = NULL;
static AjPRegexp seqtypeRegAnyGap       = NULL;
static AjPRegexp seqtypeRegDnaGap       = NULL;
static AjPRegexp seqtypeRegNuc          = NULL;
static AjPRegexp seqtypeRegNucGap       = NULL;
static AjPRegexp seqtypeRegNucGapPhylo  = NULL;
static AjPRegexp seqtypeRegNucPure      = NULL;
static AjPRegexp seqtypeRegProt         = NULL;
static AjPRegexp seqtypeRegProtAny      = NULL;
static AjPRegexp seqtypeRegProtGap      = NULL;
static AjPRegexp seqtypeRegProtGapPhylo = NULL;
static AjPRegexp seqtypeRegProtPure     = NULL;
static AjPRegexp seqtypeRegProtStop     = NULL;
static AjPRegexp seqtypeRegProtStopGap  = NULL;
static AjPRegexp seqtypeRegRnaGap       = NULL;

/*
** Character set strings, all starting as NULL.
** NOTE(review): presumably filled on first use by the seqTypeStr...
** functions above - confirm against their definitions.
*/
static AjPStr seqtypeCharsetAny          = NULL;
static AjPStr seqtypeCharsetAnyGap       = NULL;
static AjPStr seqtypeCharsetDnaGap       = NULL;
static AjPStr seqtypeCharsetNuc          = NULL;
static AjPStr seqtypeCharsetNucGap       = NULL;
static AjPStr seqtypeCharsetNucGapPhylo  = NULL;
static AjPStr seqtypeCharsetNucPure      = NULL;
static AjPStr seqtypeCharsetProt         = NULL;
static AjPStr seqtypeCharsetProtAny      = NULL;
static AjPStr seqtypeCharsetProtGap      = NULL;
static AjPStr seqtypeCharsetProtGapPhylo = NULL;
static AjPStr seqtypeCharsetProtPure     = NULL;
static AjPStr seqtypeCharsetProtStop     = NULL;
static AjPStr seqtypeCharsetProtStopGap  = NULL;
static AjPStr seqtypeCharsetRnaGap       = NULL;
164 
165 
166 
167 
168 /*
169 ** gap characters known are:
170 **
171 ** . GCG and most others
172 ** - Phylip and some alignment output
173 ** ~ GCG for gaps at ends
174 ** * Staden for DNA but stop for protein (fix on input?)
175 ** O Phylip (fix on input?) - no longer possible: O is pyrrolysine in proteins
176 */
177 
178 
179 
/*
** Building blocks for the accepted character sets. Only upper case
** letters are listed.
**
** NOTE(review): none of these is declared static, so they all have
** external linkage - check for users in other source files before
** changing their names or qualifiers.
*/

/*
char seqCharProt[]  = "ACDEFGHIKLMNPQRSTVWYacdefghiklmnpqrstvwyBUXZbuxz*?";
*/
char seqCharProtPure[]  = "ACDEFGHIKLMNPQRSTVWY";
char seqCharProtAmbig[] = "BJOUXZ?"; /* convert unwanted ones to Xx */
char seqCharProtStop[]  = "*";
char seqCharNuc[]       = "ACGTUBDHKMNRSVWXY?"; /* NucPure plus NucAmbig */
char seqCharNucPure[]   = "ACGTU";
char seqCharNucAmbig[]  = "BDHKMNRSVWXY?";
char seqCharGap[]       = ".~-";	/* phylip used O in old versions */
char seqCharNucDna[]    = "ACGTBDHKMNRSVWXY?"; /* seqCharNuc without U */
char seqCharNucRna[]    = "ACGUBDHKMNRSVWXY?"; /* seqCharNuc without T */
char seqCharGapany[]    = ".~-";	/* phylip used O in old versions*/
char seqCharGapdash[]   = "-";
char seqCharGapdot[]    = ".";
char seqGap = '-';		/* the (only) EMBOSS gap character */
char seqCharGapTest[]   = " .~-";   /* phylip used O - don't forget space */
char seqCharPhylo[]       = "?";	/* phylip uses ? for unknown or gap */
198 
199 
200 
201 
/* @funclist seqType **********************************************************
**
** Functions to test each sequence type
**
** One SeqOType entry per known sequence type name. The entry with a
** NULL name marks the end of the table.
**
** ConvertFrom and ConvertTo are paired position by position, so the two
** strings of an entry have the same length (or are both NULL when the
** type has nothing to convert).
**
** The Filter column is NULL for every entry here; ajSeqTypeCheckIn
** fills it in when a type is first used.
**
******************************************************************************/

static SeqOType seqType[] =
{
/*   "name"            Gaps     Ambig    Type    Padding CvtFrom CvtTo
         BadcharsFunction GoodCharsFunction Filter Description */
    {"any",            AJFALSE, AJTRUE,  ISANY,  0, "?",    "X",
	 seqTypeCharAny,
	 seqTypeStrAny,
         NULL,
         "any valid sequence"},		/* reset type */
    {"gapany",         AJTRUE,  AJTRUE,  ISANY,  0, "?",    "X",
	 seqTypeCharAnyGap,
	 seqTypeStrAnyGap,
         NULL,
	 "any valid sequence with gaps"}, /* reset type */
    {"dna",            AJFALSE, AJTRUE,  ISNUC,  0, "?XxUu", "NNnTt",
	 seqTypeCharNuc,
	 seqTypeStrNuc,
         NULL,
	 "DNA sequence"},
    {"puredna",        AJFALSE, AJFALSE, ISNUC,  0, "Uu", "Tt",
	 seqTypeCharNucPure,
	 seqTypeStrNucPure,
         NULL,
	 "DNA sequence, bases ACGT only"},
    {"gapdna",         AJTRUE,  AJTRUE,  ISNUC,  0, "?XxUu", "NNnTt",
	 seqTypeCharNucGap,
	 seqTypeStrNucGap,
         NULL,
	 "DNA sequence with gaps"},
    {"gapdnaphylo",    AJTRUE,  AJTRUE,  ISNUC,  0, "Uu",  "Tt",
	 seqTypeCharNucGapPhylo,
	 seqTypeStrNucGapPhylo,
         NULL,
	 "DNA sequence with gaps and queries"},
    {"rna",            AJFALSE, AJTRUE,  ISNUC,  0, "?XxTt", "NNnUu",
	 seqTypeCharNuc,
	 seqTypeStrNuc,
         NULL,
	 "RNA sequence"},
    {"purerna",        AJFALSE, AJFALSE, ISNUC,  0, "Tt", "Uu",
	 seqTypeCharNucPure,
	 seqTypeStrNucPure,
         NULL,
	 "RNA sequence, bases ACGU only"},
    {"gaprna",         AJTRUE,  AJTRUE,  ISNUC,  0, "?XxTt", "NNnUu",
	 seqTypeCharNucGap,
	 seqTypeStrNucGap,
         NULL,
	 "RNA sequence with gaps"},
    {"gaprnaphylo",     AJTRUE,  AJTRUE,  ISNUC, 0, "Tt",  "Uu",
	 seqTypeCharNucGapPhylo,
	 seqTypeStrNucGapPhylo,
         NULL,
	 "RNA sequence with gaps and queries"},
    {"nucleotide",     AJFALSE, AJTRUE,  ISNUC,  0, "?Xx",   "NNn",
	 seqTypeCharNuc,
	 seqTypeStrNuc,
         NULL,
	 "nucleotide sequence"},
    {"purenucleotide", AJFALSE, AJFALSE, ISNUC,  0, NULL,  NULL,
	 seqTypeCharNucPure,
	 seqTypeStrNucPure,
         NULL,
	 "nucleotide sequence, bases ACGTU only"},
    {"gapnucleotide",  AJTRUE,  AJTRUE,  ISNUC,  0, "?Xx",   "NNn",
	 seqTypeCharNucGap,
	 seqTypeStrNucGap,
         NULL,
	 "nucleotide sequence with gaps"},
    {"gapnucleotidephylo",  AJTRUE,  AJTRUE,  ISNUC,  0, NULL,  NULL,
	 seqTypeCharNucGapPhylo,
	 seqTypeStrNucGapPhylo,
         NULL,
	 "nucleotide sequence with gaps and queries"},
    /* every ambiguity code, in either case, is reduced to N or n */
    {"gapnucleotidesimple",AJTRUE, AJTRUE , ISNUC,  0,
                     "BbDdHhKkMmRrSsVvWwXxYy?", "NnNnNnNnNnNnNnNnNnNnNnN",
	 seqTypeCharNucGap,
	 seqTypeStrNucGap,
         NULL,
	 "nucleotide sequence with gaps but only N for ambiguity"},
    {"protein",        AJFALSE, AJTRUE,  ISPROT, 0, "?*",  "XX",
	 seqTypeCharProt,
	 seqTypeStrProt,
         NULL,
	 "protein sequence"},
    {"pureprotein",    AJFALSE, AJFALSE, ISPROT, 0, NULL,  NULL,
	 seqTypeCharProtPure,
	 seqTypeStrProtPure,
         NULL,
	 "protein sequence without BZ U X or *"},
    {"stopprotein",    AJFALSE, AJTRUE,  ISPROT, 0, "?",   "X",
	 seqTypeCharProtStop,
	 seqTypeStrProtStop,
         NULL,
	 "protein sequence with possible stops"},
    {"gapprotein",     AJTRUE,  AJTRUE,  ISPROT, 0, "?*",  "XX",
	 seqTypeCharProtGap,
	 seqTypeStrProtGap,
         NULL,
	 "protein sequence with gaps"},
    {"gapstopprotein", AJTRUE,  AJTRUE,  ISPROT, 0, "?",  "X",
	 seqTypeCharProtStopGap,
	 seqTypeStrProtStopGap,
         NULL,
	 "protein sequence with gaps and possible stops"},
    {"gapproteinphylo", AJTRUE,  AJTRUE,  ISPROT, 0, NULL,  NULL,
	 seqTypeCharProtGapPhylo,
	 seqTypeStrProtGapPhylo,
         NULL,
	 "protein sequence with gaps, stops and queries"},
    {"proteinstandard",AJFALSE, AJTRUE,  ISPROT, 0, "?*UuJjOo", "XXXxXxXx",
	 seqTypeCharProt,
	 seqTypeStrProt,
         NULL,
	 "protein sequence with no selenocysteine"},
    {"stopproteinstandard",AJFALSE, AJTRUE, ISPROT, 0, "?UuJjOo", "XXxXxXx",
	 seqTypeCharProtStop,
	 seqTypeStrProtStop,
         NULL,
	 "protein sequence with a possible stop but no selenocysteine"},
    {"gapproteinstandard", AJTRUE,  AJTRUE, ISPROT, 0, "?*UuJjOo", "XXXxXxXx",
	 seqTypeCharProtGap,
	 seqTypeStrProtGap,
         NULL,
	 "protein sequence with gaps but no selenocysteine"},
    /*
    ** NOTE(review): this entry also converts B and Z, yet its description
    ** text is the same as for "gapproteinstandard" - confirm whether the
    ** description should mention that.
    */
    {"gapproteinsimple", AJTRUE,  AJTRUE, ISPROT, 0,
                                              "?*BbZzUuJjOo", "XXXxXxXxXxXx",
	 seqTypeCharProtGap,
	 seqTypeStrProtGap,
         NULL,
	 "protein sequence with gaps but no selenocysteine"},
    /* end of table marker */
    {NULL,             AJFALSE, AJTRUE,  ISANY,  0, NULL,  NULL,
	 NULL,
	 NULL,
         NULL,
	 NULL}
};
345 
346 
347 
348 
349 /* @funcstatic seqTypeTestI ***************************************************
350 **
351 ** Tests the type of a sequence is compatible with a defined type.
352 ** If the type can have gaps, also tests for gap characters.
353 ** Used only for testing, so never writes any error message
354 **
355 ** @param [u] thys [AjPSeq] Sequence object
356 ** @param [r] itype [ajint] Sequence type index
357 ** @return [AjBool] ajTrue if compatible.
358 **
359 ** @release 2.8.0
360 ** @@
361 ******************************************************************************/
362 
seqTypeTestI(AjPSeq thys,ajint itype)363 static AjBool seqTypeTestI(AjPSeq thys, ajint itype)
364 {
365 
366     /*
367      ** We have a known type, now we need to either show the sequence
368      ** matches it, or fix it so it does (or, of course, give up)
369      */
370 
371     /*
372      ** First we test the type - predefined by a database,
373      ** or by checking the sequence characters
374      */
375 
376     if(seqType[itype].Gaps)
377     {
378 	ajDebug("Convert gaps to '-'\n");
379 	ajSeqGap(thys, seqGap, 0);
380     }
381     else
382     {
383 	ajDebug("Remove all gaps\n");
384 
385         if(thys->Qualsize)
386         {
387             ajStrRemoveGapF(&thys->Seq, thys->Accuracy);
388             if(thys->Qualsize > ajStrGetLen(thys->Seq))
389             {
390                 thys->Qualsize = ajStrGetLen(thys->Seq);
391                 AJCRESIZE(thys->Accuracy, thys->Qualsize);
392             }
393         }
394         else
395             ajStrRemoveGap(&thys->Seq);
396     }
397 
398     if(seqType[itype].Type == ISPROT && !ajSeqIsProt(thys))
399     {
400 	ajDebug("Sequence is not a protein\n");
401 
402 	return ajFalse;
403     }
404 
405     if(seqType[itype].Type == ISNUC && !ajSeqIsNuc(thys))
406     {
407 	ajDebug("Sequence is not nucleic\n");
408 
409 	return ajFalse;
410     }
411 
412     if(ajStrIsCharsetCaseS(thys->Seq, (*seqType[itype].Goodchars)()))
413     {
414 	if(seqType[itype].ConvertFrom)
415 	{
416 	    ajDebug("Convert '%s' to '%s'\n",
417 		    seqType[itype].ConvertFrom,
418 		    seqType[itype].ConvertTo);
419 	    ajStrExchangeSetCC(&thys->Seq,
420 			   seqType[itype].ConvertFrom,
421 			   seqType[itype].ConvertTo);
422 	}
423 
424 	return ajTrue;
425     }
426 
427     ajDebug("seqTypeTestI: Sequence must be %s: found bad character\n",
428 	    seqType[itype].Desc);
429 
430     return ajFalse;
431 }
432 
433 
434 
435 
436 /* @funcstatic seqTypeFix *****************************************************
437 **
438 ** Fixes (if possible) unacceptable sequence characters by removing gaps
439 ** (if no gaps are allowed) and by setting ambiguity codes (if they
440 ** are allowed).
441 **
442 ** @param [u] thys [AjPSeq] Sequence object
443 ** @param [r] itype [ajint] Sequence type index
444 ** @return [AjBool] ajTrue if the type can be fixed
445 **
446 ** @release 2.7.0
447 ** @@
448 ******************************************************************************/
449 
seqTypeFix(AjPSeq thys,ajint itype)450 static AjBool seqTypeFix(AjPSeq thys, ajint itype)
451 {
452     ajDebug("seqTypeFix '%s' '%S'\n", seqType[itype].Name, thys->Seq);
453 
454     /*
455      ** if ungapped, remove any gap characters
456      */
457 
458     if(!seqType[itype].Gaps)
459     {
460         if(thys->Qualsize)
461         {
462             ajStrRemoveGapF(&thys->Seq, thys->Accuracy);
463             if(thys->Qualsize > ajStrGetLen(thys->Seq))
464             {
465                 thys->Qualsize = ajStrGetLen(thys->Seq);
466                 AJCRESIZE(thys->Accuracy, thys->Qualsize);
467             }
468         }
469         else
470             ajStrRemoveGap(&thys->Seq);
471     }
472 
473 
474 
475     if (ajCharMatchC(seqType[itype].Name, "pureprotein"))
476 	seqTypeStopTrimS(&thys->Seq);
477 
478     if(seqType[itype].Ambig)
479     {
480 	/*
481 	 ** list the bad characters, change to 'X' or 'N'
482 	 */
483 	switch(seqType[itype].Type)
484 	{
485             case ISPROT:
486                 if (ajCharMatchC(seqType[itype].Name, "protein"))
487                     seqTypeStopTrimS(&thys->Seq);
488                 seqTypeFixReg(thys, itype, 'X');
489                 break;
490             case ISNUC:
491                 seqTypeFixReg(thys, itype, 'N');
492                 break;
493             case ISANY:
494                 if(ajSeqIsNuc(thys))
495                     seqTypeFixReg(thys, itype, 'N');
496                 else
497                     seqTypeFixReg(thys, itype, 'X');
498                 break;
499             default:
500                 ajDie("Unknown sequence type code for '%s'",
501                       seqType[itype].Name);
502                 return ajFalse;
503 	}
504     }
505 
506     if (ajCharMatchC(seqType[itype].Name, "pureprotein"))
507 	seqTypeStopTrimS(&thys->Seq);
508 
509     ajDebug("seqTypeFix done  '%S'\n", thys->Seq);
510 
511     return seqTypeTestI(thys, itype);
512 }
513 
514 
515 
516 
517 /* @funcstatic seqTypeFixReg **************************************************
518 **
519 ** Fixes (if possible) unacceptable sequence characters by removing gaps
520 ** (if no gaps are allowed) and by setting ambiguity codes (if they
521 ** are allowed).
522 **
523 ** @param [u] thys [AjPSeq] Sequence object
524 ** @param [r] itype [ajint] Sequence type index
525 ** @param [r] fixchar [char] Character to replace with
526 ** @return [AjBool] ajTrue if the type can be fixed
527 **
528 ** @release 2.7.0
529 ** @@
530 ******************************************************************************/
531 
seqTypeFixReg(AjPSeq thys,ajint itype,char fixchar)532 static AjBool seqTypeFixReg(AjPSeq thys, ajint itype, char fixchar)
533 {
534     ajDebug("seqTypeFixReg '%s' '%S'\n", seqType[itype].Name, thys->Seq);
535     /*ajDebug("Seq old '%S'\n", thys->Seq);*/
536 
537     return ajStrExchangeSetRestSK(&thys->Seq,
538 				  (*seqType[itype].Goodchars)(), fixchar);
539 }
540 
541 
542 
543 
544 /* @funcstatic seqTypeSet *****************************************************
545 **
546 ** Sets the sequence type. Uses the first character of the type
547 ** which can be N or P
548 **
549 ** @param [u] thys [AjPSeq] Sequence object
550 ** @param [r] Type [const AjPStr] Sequence type
551 ** @return [void]
552 **
553 ** @release 1.0.0
554 ** @@
555 ******************************************************************************/
556 
seqTypeSet(AjPSeq thys,const AjPStr Type)557 static void seqTypeSet(AjPSeq thys, const AjPStr Type)
558 {
559     const char* cp;
560 
561     ajDebug("seqTypeSet '%S'\n", Type);
562 
563     cp = ajStrGetPtr(Type);
564 
565     switch(*cp)
566     {
567         case 'P':
568         case 'p':
569             ajSeqSetProt(thys);
570             break;
571         case 'N':
572         case 'n':
573             ajSeqSetNuc(thys);
574             break;
575         case '\0':
576         case 'S':
577         case 's':
578             break;
579         default:
580             ajDie("Unknown sequence type '%c'", *cp);
581     }
582 
583     return;
584 }
585 
586 
587 
588 
589 /* @func ajSeqTypeCheckS ******************************************************
590 **
591 ** Tests the type of a sequence is compatible with a defined type.
592 ** If the type can have gaps, also tests for gap characters.
593 ** Used for input validation - writes error message if the type check fails
594 **
595 ** @param [u] pthys [AjPStr*] Sequence string
596 ** @param [r] type_name [const AjPStr] Sequence type
597 ** @return [AjBool] ajTrue if compatible.
598 **
599 ** @release 2.7.0
600 ** @@
601 ******************************************************************************/
602 
ajSeqTypeCheckS(AjPStr * pthys,const AjPStr type_name)603 AjBool ajSeqTypeCheckS(AjPStr* pthys, const AjPStr type_name)
604 {
605     /*    AjPStr tmpstr = NULL; */
606     ajint itype = -1;
607 
608     /* ajDebug("ajSeqTypeCheckS type '%S' seq '%S'\n", type_name, *pthys); */
609 
610     if(!ajStrGetLen(type_name))	   /* nothing given - anything goes */
611     {
612 	ajSeqGapS(pthys, seqGap);
613 
614 	return ajTrue;
615     }
616 
617     if(!seqFindType(type_name, &itype))
618     {
619 	ajDie("Sequence type '%S' unknown", type_name);
620 
621 	return ajFalse;
622     }
623 
624     ajDebug("ajSeqTypeCheckS type '%s' found (%s)\n",
625 	    seqType[itype].Name, seqType[itype].Desc);
626 
627     if(seqType[itype].Gaps)
628     {
629 	ajDebug("Convert gaps to '-'\n");
630 	ajSeqGapS(pthys, seqGap);
631     }
632     else
633     {
634 	ajDebug("Remove all gaps\n");
635 	ajStrRemoveGap(pthys);
636     }
637 
638     /* no need to test sequence type, we will test every character below */
639 
640     if(ajStrIsCharsetCaseS(*pthys, (*seqType[itype].Goodchars)()))
641     {
642 	if(seqType[itype].ConvertFrom)
643 	{
644 	    ajDebug("Convert '%s' to '%s'\n",
645 		    seqType[itype].ConvertFrom,
646 		    seqType[itype].ConvertTo);
647 	    ajStrExchangeSetCC(pthys,
648 			   seqType[itype].ConvertFrom,
649 			   seqType[itype].ConvertTo);
650 	}
651 	return ajTrue;
652     }
653 
654     return ajTrue;
655 }
656 
657 
658 
659 
660 /* @func ajSeqTypeCheckIn *****************************************************
661 **
662 ** Tests the type of a sequence is compatible with a defined type.
663 ** If the type can have gaps, also tests for gap characters.
664 ** Used for input validation - writes error message if the type check fails
665 **
666 ** @param [u] thys [AjPSeq] Sequence object
667 ** @param [r] seqin [const AjPSeqin] Sequence input object
668 ** @return [AjBool] ajTrue if compatible.
669 **
670 ** @release 2.7.0
671 ** @@
672 ******************************************************************************/
673 
ajSeqTypeCheckIn(AjPSeq thys,const AjPSeqin seqin)674 AjBool ajSeqTypeCheckIn(AjPSeq thys, const AjPSeqin seqin)
675 {
676     ajint itype = -1;
677     AjPStr Type;
678     ajlong i;
679 
680     ajDebug("testing sequence '%s' '%50.50S' type '%S' IsNuc %B IsProt %B\n",
681 	    ajSeqGetNameC(thys), thys->Seq,
682 	    seqin->Inputtype, seqin->IsNuc, seqin->IsProt);
683 
684     Type = seqin->Inputtype; /* ACD file had a predefined seq type */
685 
686     if(seqin->IsNuc)
687 	ajSeqSetNuc(thys);
688 
689     if(seqin->IsProt)
690 	ajSeqSetProt(thys);
691 
692     if(seqin->Input->Query && ajStrGetLen(seqin->Input->Query->DbType))
693 	seqTypeSet(thys, seqin->Input->Query->DbType);
694 
695 
696     if(!ajStrGetLen(Type))		   /* nothing given - anything goes */
697     {
698 	ajSeqGap(thys, seqGap, 0);
699 	ajDebug("ajSeqTypeCheckIn: OK - no type, gaps converted to '-'\n");
700 
701 	return ajTrue;
702     }
703 
704     if(!seqFindType(Type, &itype))
705     {
706 	ajDebug("ajSeqTypeCheckIn: rejected - unknown type\n");
707 	ajDie("Sequence type '%S' unknown", Type);
708 
709 	return ajFalse;
710     }
711 
712     ajDebug("ajSeqTypeCheckIn type '%s' found (%s)\n",
713 	    seqType[itype].Name, seqType[itype].Desc);
714 
715     if(seqType[itype].Gaps)
716     {
717 	ajDebug("Convert gaps to '-'\n");
718 	ajSeqGap(thys, seqGap, 0);
719     }
720     else
721     {
722 	ajDebug("Remove all gaps\n");
723 	ajStrRemoveGap(&thys->Seq);
724     }
725 
726     if(seqType[itype].Type == ISPROT)
727     {
728 	if (ajSeqIsProt(thys))
729 	    ajSeqSetProt(thys);
730 	else
731 	{
732 	    ajErr("Sequence is not a protein\n");
733 	    ajDebug("ajSeqTypeCheckIn: rejected - not a protein\n");
734 
735 	    return ajFalse;
736 	}
737     }
738 
739     if(seqType[itype].Type == ISNUC)
740     {
741 	if (ajSeqIsNuc(thys))
742 	    ajSeqSetNuc(thys);
743 	else
744 	{
745 	    ajErr("Sequence is not nucleic\n");
746 	    ajDebug("ajSeqTypeCheckIn: rejected - not nucleic\n");
747 
748 	    return ajFalse;
749 	}
750     }
751 
752     if(!seqType[itype].Filter)
753         seqType[itype].Filter =
754             ajStrGetfilterCase((*seqType[itype].Goodchars)());
755     if(ajStrIsFilter(thys->Seq, seqType[itype].Filter))
756     {
757 	ajDebug("ajSeqTypeCheckIn: bad characters test passed, convert\n");
758 
759 	if(seqType[itype].ConvertFrom)
760 	{
761 	    ajDebug("Convert '%s' to '%s'\n",
762 		    seqType[itype].ConvertFrom,
763 		    seqType[itype].ConvertTo);
764 	    ajStrExchangeSetCC(&thys->Seq,
765 			   seqType[itype].ConvertFrom,
766 			   seqType[itype].ConvertTo);
767 	}
768 	ajDebug("ajSeqTypeCheckIn: OK - no badchars\n");
769 
770 	return ajTrue;
771     }
772 
773     if(seqTypeFix(thys, itype))		/* this will reuse badchars */
774     {
775 	ajDebug("ajSeqTypeCheckIn: OK - type fixed\n");
776 
777 	return ajTrue;
778     }
779 
780     i = ajStrFindRestCaseS(thys->Seq, (*seqType[itype].Goodchars)());
781 
782     if(i >= 0)
783     {
784 	ajErr("ajSeqTypeCheckIn: Sequence must be %s: "
785 	      "found bad character '%c'",
786 	      seqType[itype].Desc, ajStrGetCharPos(thys->Seq, i));
787 	ajDebug("ajSeqTypeCheckIn: rejected - still had badchars\n");
788 
789 	return ajFalse;
790     }
791 
792     ajDebug("ajSeqTypeCheckIn: OK - fixed finally\n");
793     ajDebug("Final sequence '%S' type '%S' IsNuc %B IsProt %B\n",
794 	    thys->Seq, seqin->Inputtype, seqin->IsNuc, seqin->IsProt);
795 
796     return ajTrue;
797 }
798 
799 
800 
801 
802 
803 /* @func ajSeqTypeNucS ********************************************************
804 **
805 ** Checks sequence type for nucleotide without gaps.
806 **
807 ** RNA and DNA codes are accepted as is.
808 **
809 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
810 ** @return [char] invalid character if any.
811 **
812 ** @release 2.7.0
813 ** @@
814 ******************************************************************************/
815 
ajSeqTypeNucS(const AjPStr thys)816 char ajSeqTypeNucS(const AjPStr thys)
817 {
818     char ret;
819     ajDebug("ajSeqTypeNucS test\n");
820 
821     ret = seqTypeTestS(thys, seqTypeStrNuc());
822 
823     if (ret)
824 	return ret;
825 
826     return seqTypeTestS(thys, seqTypeStrNucGap());
827 }
828 
829 
830 
831 
832 /* @func ajSeqTypeDnaS ********************************************************
833 **
834 ** Checks sequence type for DNA without gaps.
835 **
836 ** RNA and DNA codes are accepted as is.
837 **
838 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
839 ** @return [char] invalid character if any.
840 **
841 ** @release 2.7.0
842 ** @@
843 ******************************************************************************/
844 
ajSeqTypeDnaS(const AjPStr thys)845 char ajSeqTypeDnaS(const AjPStr thys)
846 {
847     char ret;
848     ajDebug("ajSeqTypeDnaS test\n");
849 
850     ret = seqTypeTestS(thys, seqTypeStrNuc());
851 
852     if (ret)
853 	return ret;
854 
855     return seqTypeTestS(thys, seqTypeStrDnaGap());
856 }
857 
858 
859 
860 
861 /* @func ajSeqTypeRnaS ********************************************************
862 **
863 ** Checks sequence type for RNA without gaps
864 **
865 ** RNA codes are accepted as is.
866 **
867 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
868 ** @return [char] invalid character if any.
869 **
870 ** @release 2.7.0
871 ** @@
872 ******************************************************************************/
873 
ajSeqTypeRnaS(const AjPStr thys)874 char ajSeqTypeRnaS(const AjPStr thys)
875 {
876     char ret;
877     ajDebug("ajSeqTypeRnaS test\n");
878 
879     ret = seqTypeTestS(thys, seqTypeStrNuc());
880 
881     if (ret)
882 	return ret;
883 
884     return seqTypeTestS(thys, seqTypeStrRnaGap());
885 }
886 
887 
888 
889 
890 /* @func ajSeqTypeGapdnaS *****************************************************
891 **
892 ** Checks sequence type for Dna with gaps
893 **
894 ** DNA codes are accepted as is.
895 **
896 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
897 ** @return [char] invalid character if any.
898 **
899 ** @release 2.7.0
900 ** @@
901 ******************************************************************************/
902 
ajSeqTypeGapdnaS(const AjPStr thys)903 char ajSeqTypeGapdnaS(const AjPStr thys)
904 {
905     char ret;
906     ajDebug("ajSeqTypeGapdnaS test\n");
907 
908     ret = seqTypeTestS(thys, seqTypeStrNucGap());
909 
910     if (ret)
911 	return ret;
912 
913     return seqTypeTestS(thys, seqTypeStrDnaGap());
914 }
915 
916 
917 
918 
919 /* @func ajSeqTypeGaprnaS *****************************************************
920 **
921 ** Checks sequence type for RNA with gaps
922 **
923 ** RNA codes are accepted as is.
924 **
925 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
926 ** @return [char] invalid character if any.
927 **
928 ** @release 2.7.0
929 ** @@
930 ******************************************************************************/
931 
ajSeqTypeGaprnaS(const AjPStr thys)932 char ajSeqTypeGaprnaS(const AjPStr thys)
933 {
934     char ret;
935     ajDebug("ajSeqTypeGaprnaS test\n");
936 
937     ret = seqTypeTestS(thys, seqTypeStrNucGap());
938 
939     if (ret)
940 	return ret;
941 
942     return seqTypeTestS(thys, seqTypeStrRnaGap());
943 }
944 
945 
946 
947 
948 /* @func ajSeqTypeGapnucS *****************************************************
949 **
950 ** Checks sequence type for nucleotide with gaps.
951 **
952 ** RNA and DNA codes are accepted as is.
953 **
954 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
955 ** @return [char] invalid character if any.
956 **
957 ** @release 2.7.0
958 ** @@
959 ******************************************************************************/
960 
ajSeqTypeGapnucS(const AjPStr thys)961 char ajSeqTypeGapnucS(const AjPStr thys)
962 {
963     ajDebug("ajSeqTypeGapnucS test\n");
964 
965     return seqTypeTestS(thys, seqTypeStrNucGap());
966 }
967 
968 
969 
970 
971 /* @func ajSeqTypeAnyprotS ****************************************************
972 **
973 ** Checks sequence type for anything that can be in a protein sequence
974 **
975 ** Stop codes are replaced with gaps.
976 **
977 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
978 ** @return [char] invalid character if any.
979 **
980 ** @release 2.7.0
981 ** @@
982 ******************************************************************************/
983 
ajSeqTypeAnyprotS(const AjPStr thys)984 char ajSeqTypeAnyprotS(const AjPStr thys)
985 {
986     ajDebug("ajSeqTypeAnyprotS test\n");
987 
988     return seqTypeTestS(thys, seqTypeStrProtAny());
989 }
990 
991 
992 
993 
994 /* @func ajSeqTypeProtS *******************************************************
995 **
996 ** Checks sequence type for anything that can be in a protein sequence
997 **
998 ** Stop codes are replaced with gaps.
999 **
1000 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
1001 ** @return [char] invalid character if any.
1002 **
1003 ** @release 2.7.0
1004 ** @@
1005 ******************************************************************************/
1006 
char ajSeqTypeProtS(const AjPStr thys)
{
    char badchar;

    ajDebug("ajSeqTypeProtS test\n");

    /* accepted set: pure plus ambiguous protein residue codes */
    badchar = seqTypeTestS(thys, seqTypeStrProt());

    return badchar;
}
1013 
1014 
1015 
1016 
1017 /* @func ajSeqTypeGapanyS *****************************************************
1018 **
1019 ** Checks sequence type for any sequence with gaps.
1020 **
1021 ** Stops ('*') are allowed so this could be a 3 frame translation of DNA.
1022 **
1023 ** @param [r] thys [const AjPStr] Sequence string (unchanged at present)
1024 ** @return [char] invalid character if any.
1025 **
1026 ** @release 2.7.0
1027 ** @@
1028 ******************************************************************************/
1029 
char ajSeqTypeGapanyS(const AjPStr thys)
{
    char badchar;

    ajDebug("ajSeqTypeGapanyS test\n");

    /* accepted set: protein, stop, nucleotide and gap characters combined */
    badchar = seqTypeTestS(thys, seqTypeStrAnyGap());

    return badchar;
}
1036 
1037 
1038 
1039 
1040 /* @func ajSeqGap *************************************************************
1041 **
1042 ** Sets non-sequence characters to valid gap characters,
1043 ** and pads with extra gaps if necessary to a specified length
1044 **
1045 ** @param [u] thys [AjPSeq] Sequence
1046 ** @param [r] gapc [char] Standard gap character
1047 ** @param [r] padc [char] Gap character for ends of sequence
1048 ** @return [void]
1049 **
1050 ** @release 1.0.0
1051 ** @@
1052 ******************************************************************************/
1053 
void ajSeqGap(AjPSeq thys, char gapc, char padc)
{
    AjPStr* pseq = &thys->Seq;

    /* a length of zero means the sequence is not extended */
    seqGapSL(pseq, gapc, padc, 0);
}
1060 
1061 
1062 
1063 
1064 /* @func ajSeqGapLen **********************************************************
1065 **
1066 ** Sets non-sequence characters to valid gap characters,
1067 ** and pads with extra gaps if necessary to a specified length
1068 **
1069 ** @param [u] thys [AjPSeq] Sequence
1070 ** @param [r] gapc [char] Standard gap character
1071 ** @param [r] padc [char] Gap character for ends of sequence
1072 ** @param [r] ilen [ajint] Sequence length. Expanded if longer than
1073 **                       current length
1074 ** @return [void]
1075 **
1076 ** @release 1.0.0
1077 ** @@
1078 ******************************************************************************/
1079 
void ajSeqGapLen(AjPSeq thys, char gapc, char padc, ajint ilen)
{
    AjPStr* pseq = &thys->Seq;

    /*
    ** ilen is converted to the unsigned length taken by seqGapSL().
    ** NOTE(review): a negative ilen would turn into a very large length
    ** here - confirm that callers never pass one.
    */
    seqGapSL(pseq, gapc, padc, ilen);
}
1086 
1087 
1088 
1089 
1090 /* @func ajSeqGapS ************************************************************
1091 **
** Sets non-sequence characters in a sequence string to the standard gap
** character. No separate end-gap character is applied and the string is
** not padded to a length.
1094 **
1095 ** @param [u] seq [AjPStr*] Sequence
1096 ** @param [r] gapc [char] Standard gap character
1097 ** @return [void]
1098 **
1099 ** @release 1.0.0
1100 ** @@
1101 ******************************************************************************/
1102 
void ajSeqGapS(AjPStr* seq, char gapc)
{
    const char nopadc = 0;     /* no separate character for the end gaps */
    const ajuint nolen = 0;    /* no padding to a target length */

    seqGapSL(seq, gapc, nopadc, nolen);
}
1109 
1110 
1111 
1112 
1113 /* @funcstatic seqGapSL *******************************************************
1114 **
1115 ** Sets non-sequence characters in a string to valid gap characters,
1116 ** and pads with extra gaps if necessary to a specified length
1117 **
1118 ** @param [u] seq [AjPStr*] String of sequence characters
1119 ** @param [r] gapc [char] Standard gap character
1120 ** @param [r] padc [char] Gap character for ends of sequence
1121 ** @param [r] ilen [ajuint] Sequence length. Expanded if longer than
1122 **                       current length
1123 ** @return [void]
1124 **
1125 ** @release 1.0.0
1126 ** @@
1127 ******************************************************************************/
1128 
seqGapSL(AjPStr * seq,char gapc,char padc,ajuint ilen)1129 static void seqGapSL(AjPStr* seq, char gapc, char padc, ajuint ilen)
1130 {
1131     ajuint i;
1132     static ajuint igap;
1133     char* cp;
1134     char endc = gapc;
1135 
1136     igap = strlen(seqCharGapTest);
1137 
1138     if(!seqNewGapChars)
1139     {
1140 	seqNewGapChars = ajCharNewRes(igap);
1141 	seqNewGapChars[0] = '\0';
1142     }
1143 
1144     /* Set the seqNewGapChars string to match gapc */
1145 
1146     if(*seqNewGapChars != gapc)
1147     {
1148 	for(i=0; i < igap; i++)
1149 	    seqNewGapChars[i] = gapc;
1150 
1151 	seqNewGapChars[igap] = '\0';
1152     }
1153 
1154 
1155     if(ilen >= MAJSTRGETRES(*seq))
1156 	ajStrSetRes(seq, ilen+1);
1157 
1158     ajStrExchangeSetCC(seq, seqCharGapTest, seqNewGapChars);
1159 
1160     if(padc)
1161     {				/* start and end characters updated */
1162 	endc = padc;
1163 
1164         if(strchr(seqCharGapTest, ajStrGetCharFirst(*seq)))
1165         {
1166             /* pad start */
1167             for(cp = ajStrGetuniquePtr(seq); strchr(seqCharGapTest, *cp); cp++)
1168                 *cp = padc;
1169         }
1170 
1171         if(strchr(seqCharGapTest, ajStrGetCharLast(*seq)))
1172         {
1173             cp = ajStrGetuniquePtr(seq);
1174 
1175             for(i=ajStrGetLen(*seq) - 1; i && strchr(seqCharGapTest, cp[i]); i--)
1176                 cp[i] = padc;
1177         }
1178     }
1179 
1180     if(ajStrGetLen(*seq) < ilen)	   /* ilen can be zero to skip this */
1181     {
1182 	cp = ajStrGetuniquePtr(seq);
1183 
1184 	for(i=ajStrGetLen(*seq); i < ilen; i++)
1185 	    cp[i] = endc;
1186 
1187 	cp[ilen] = '\0';
1188 	ajStrSetValid(seq);
1189     }
1190 
1191     /*  ajDebug("seqGapSL after  '%S'\n", *seq); */
1192 
1193     return;
1194 }
1195 
1196 
1197 
1198 
1199 /* @funcstatic seqTypeStopTrimS ***********************************************
1200 **
1201 ** Removes a trailing stop (asterisk) from a protein sequence
1202 **
1203 ** @param [u] pthys [AjPStr*] Sequence string
1204 ** @return [AjBool] ajTrue if a stop was removed.
1205 **
1206 ** @release 2.7.0
1207 ** @@
1208 ******************************************************************************/
1209 
seqTypeStopTrimS(AjPStr * pthys)1210 static AjBool seqTypeStopTrimS(AjPStr* pthys)
1211 {
1212     if(strchr(seqCharProtStop,ajStrGetCharLast(*pthys)))
1213     {
1214 	ajDebug("Trailing stop removed %c\n", ajStrGetCharLast(*pthys));
1215 	ajStrCutEnd(pthys, 1);
1216 
1217 	return ajTrue;
1218     }
1219 
1220     return ajFalse;
1221 }
1222 
1223 
1224 
1225 
1226 /* @func ajSeqSetNuc **********************************************************
1227 **
1228 ** Sets a sequence type to "nucleotide"
1229 **
1230 ** @param [u] thys [AjPSeq] Sequence object
1231 ** @return [void]
** @category modify [AjPSeq] Sets sequence to be nucleotide
1233 **
1234 ** @release 1.0.0
1235 ** @@
1236 ******************************************************************************/
1237 
void ajSeqSetNuc(AjPSeq thys)
{
    /* only act when the type is not already "N" */
    if(!ajStrMatchC(thys->Type, "N"))
    {
        ajStrAssignC(&thys->Type, "N");

        /* an attached feature table is switched as well */
        if(thys->Fttable)
            ajFeattableSetNuc(thys->Fttable);

        /* x/X in the sequence become the ambiguity code n/N */
        ajStrExchangeSetCC(&thys->Seq, "xX", "nN");
    }
}
1255 
1256 
1257 
1258 
1259 /* @func ajSeqSetProt *********************************************************
1260 **
1261 ** Sets a sequence type to "protein"
1262 **
1263 ** @param [u] thys [AjPSeq] Sequence object
1264 ** @return [void]
1265 ** @category modify [AjPSeq] Sets sequence to be protein
1266 **
1267 ** @release 1.0.0
1268 ** @@
1269 ******************************************************************************/
1270 
void ajSeqSetProt(AjPSeq thys)
{
    /* only act when the type is not already "P" */
    if(!ajStrMatchC(thys->Type, "P"))
    {
        ajStrAssignC(&thys->Type, "P");

        /* an attached feature table is switched as well */
        if(thys->Fttable)
            ajFeattableSetProt(thys->Fttable);
    }
}
1283 
1284 
1285 
1286 
1287 /* @func ajSeqsetSetNuc *******************************************************
1288 **
1289 ** Sets a sequence set type to "nucleotide"
1290 **
1291 ** @param [u] thys [AjPSeqset] Sequence set object
1292 ** @return [void]
1293 **
1294 ** @release 2.8.0
1295 ** @@
1296 ******************************************************************************/
1297 
void ajSeqsetSetNuc(AjPSeqset thys)
{
    AjPStr* ptype = &thys->Type;

    /* only the set's Type string is written here */
    ajStrAssignC(ptype, "N");
}
1304 
1305 
1306 
1307 
1308 /* @func ajSeqsetSetProt ******************************************************
1309 **
1310 ** Sets a sequence set type to "protein"
1311 **
1312 ** @param [u] thys [AjPSeqset] Sequence set object
1313 ** @return [void]
1314 **
1315 ** @release 2.8.0
1316 ** @@
1317 ******************************************************************************/
1318 
void ajSeqsetSetProt(AjPSeqset thys)
{
    AjPStr* ptype = &thys->Type;

    /* only the set's Type string is written here */
    ajStrAssignC(ptype, "P");
}
1325 
1326 
1327 
1328 
1329 /* @func ajSeqType ************************************************************
1330 **
1331 ** Sets the type of a sequence if it has not yet been defined.
1332 **
1333 ** @param [u] thys [AjPSeq] Sequence object
1334 ** @return [void]
1335 ** @category modify [AjPSeq] Sets the sequence type
1336 **
1337 ** @release 1.0.0
1338 ** @@
1339 ******************************************************************************/
1340 
void ajSeqType(AjPSeq thys)
{
    ajDebug("ajSeqType current: %S\n", thys->Type);

    /* a type that is already set is left alone */
    if(ajStrGetLen(thys->Type))
        return;

    /* nucleotide is tried first, protein second */
    if(ajSeqIsNuc(thys))
    {
        ajSeqSetNuc(thys);
        ajDebug("ajSeqType nucleotide: %S\n", thys->Type);
    }
    else if(ajSeqIsProt(thys))
    {
        ajSeqSetProt(thys);
        ajDebug("ajSeqType protein: %S\n", thys->Type);
    }
    else
    {
        /* neither test matched: the type stays empty */
        ajDebug("ajSeqType unknown: %S\n", thys->Type);
    }
}
1368 
1369 
1370 
1371 
1372 /* @func ajSeqPrintType *******************************************************
1373 **
1374 ** Prints the seqType definitions.
1375 ** For EMBOSS entrails output
1376 **
1377 ** @param [u] outf [AjPFile] Output file
1378 ** @param [r] full [AjBool] Full output
1379 ** @return [void]
1380 **
1381 ** @release 2.5.0
1382 ******************************************************************************/
1383 
ajSeqPrintType(AjPFile outf,AjBool full)1384 void ajSeqPrintType(AjPFile outf, AjBool full)
1385 {
1386     ajuint i;
1387     AjPStr tmpstr = NULL;
1388     ajuint maxtmp = 0;
1389 
1390     const char* typeName[] = {"ANY", "NUC", "PRO"};
1391 
1392 
1393     (void) full;	    /* make used - no extra detail reported */
1394 
1395     ajFmtPrintF(outf, "\n# Sequence types\n");
1396     ajFmtPrintF(outf, "# Name                 Gap Ambig N/P "
1397 		"From     To       Description\n");
1398     ajFmtPrintF(outf, "seqType {\n");
1399 
1400     for(i=0; seqType[i].Name; i++)
1401     {
1402 	if (seqType[i].ConvertFrom)
1403 	{
1404 	    ajFmtPrintF(outf, "  %-20s %3B   %3B %3s",
1405 			seqType[i].Name, seqType[i].Gaps,
1406 			seqType[i].Ambig, typeName[seqType[i].Type]);
1407 	    ajFmtPrintS(&tmpstr, "\"%s\"", seqType[i].ConvertFrom);
1408 	    if(maxtmp > ajStrGetLen(tmpstr))
1409 	       maxtmp = ajStrGetLen(tmpstr);
1410 	    ajFmtPrintF(outf, " %-8S", tmpstr);
1411 	    ajFmtPrintS(&tmpstr, "\"%s\"", seqType[i].ConvertTo);
1412 	    if(maxtmp > ajStrGetLen(tmpstr))
1413 	       maxtmp = ajStrGetLen(tmpstr);
1414 	    ajFmtPrintF(outf, " %-8S", tmpstr);
1415 	    ajFmtPrintF(outf, " \"%s\"\n", seqType[i].Desc);
1416 	}
1417 	else
1418 	{
1419 	    ajFmtPrintF(outf, "  %-20s %3B   %3B %s \"\"       \"\"       "
1420 			"\"%s\"\n",
1421 			seqType[i].Name, seqType[i].Gaps,
1422 			seqType[i].Ambig, typeName[seqType[i].Type],
1423 			seqType[i].Desc);
1424 	}
1425     }
1426 
1427     ajFmtPrintF(outf, "}\n");
1428 
1429     if(maxtmp > 8) ajWarn("ajSeqPrintType max tmpstr len %d",
1430 			maxtmp);
1431     ajStrDel(&tmpstr);
1432 
1433     return;
1434 }
1435 
1436 
1437 
1438 
1439 /* @funcstatic seqTypeTest ****************************************************
1440 **
1441 ** Checks sequence contains only expected characters.
1442 **
1443 ** Returns an invalid character for failure, or a null character for success.
1444 **
1445 ** @param [r] thys [const AjPStr] Sequence string
1446 ** @param [u] badchars [AjPRegexp] Regular expression for
1447 **                                 sequence characters disallowed
1448 ** @return [char] invalid character if any.
1449 **
1450 ** @release 2.7.0
1451 ******************************************************************************/
1452 
static char seqTypeTest(const AjPStr thys, AjPRegexp badchars)
{
    AjPStr tmpstr = NULL;
    char ret = '\0';

    /* an empty sequence has nothing to reject */
    if(!ajStrGetLen(thys))
	return ret;

    /*ajDebug("seqTypeTest, Sequence '%S'\n", thys);*/

    /* no match of the "bad character" expression: sequence is acceptable */
    if(!ajRegExec(badchars, thys))
	return ret;

    /* first subexpression of the match holds the offending text */
    ajRegSubI(badchars, 1, &tmpstr);
    ret = ajStrGetCharFirst(tmpstr);

    /*
    ** Lengths are cast to ajuint and printed with %u so that each
    ** argument has exactly the type its conversion expects (strlen()
    ** returns a size_t, which must not be passed to %d).
    */
    ajDebug("seqTypeTest, Sequence had bad character '%c' (%x) "
	    "at %d of %u/%u\n '%S'\n",
	    ret, ret,
	    ajRegOffset(badchars),
	    (ajuint) ajStrGetLen(thys),
	    (ajuint) strlen(ajStrGetPtr(thys)), tmpstr);

    ajStrDel(&tmpstr);

    return ret;
}
1477 
1478 
1479 
1480 
1481 /* @funcstatic seqTypeTestS ***************************************************
1482 **
1483 ** Checks sequence contains only expected characters.
1484 **
1485 ** Returns an invalid character for failure, or a null character for success.
1486 **
1487 ** @param [r] thys [const AjPStr] Sequence string
1488 ** @param [r] goodchars [const AjPStr] String of
1489 **                                 sequence characters allowed
1490 ** @return [char] invalid character if any.
1491 **
1492 ** @release 4.1.0
1493 ******************************************************************************/
1494 
seqTypeTestS(const AjPStr thys,const AjPStr goodchars)1495 static char seqTypeTestS(const AjPStr thys, const AjPStr goodchars)
1496 {
1497     char ret = '\0';
1498     ajlong i;
1499 
1500     if(!ajStrGetLen(thys))
1501 	return ret;
1502 
1503     ajDebug("seqTypeTestS, len %d goodchars '%S'\n",
1504 	    ajStrGetLen(thys), goodchars);
1505 
1506     if(ajStrIsCharsetCaseS(thys, goodchars))
1507       return ret;
1508 
1509     i = ajStrFindRestCaseS(thys, goodchars);
1510 
1511     if (i < 0)
1512       return ret;
1513 
1514     ret = ajStrGetCharPos(thys, (size_t) i);
1515 
1516     ajDebug("seqTypeTest, Sequence had bad character '%c' (%x) "
1517 	    "at %d of %d/%d\n",
1518 	    ret, ret,
1519 	    i,
1520 	    ajStrGetLen(thys), strlen(ajStrGetPtr(thys)));
1521 
1522 
1523     return ret;
1524 }
1525 
1526 
1527 
1528 
1529 /* @funcstatic seqTypeCharAny *************************************************
1530 **
1531 ** Returns regular expression to test for type Any
1532 **
1533 ** @return [AjPRegexp] valid characters
1534 **
1535 ** @release 2.7.0
1536 ******************************************************************************/
1537 
seqTypeCharAny(void)1538 static AjPRegexp seqTypeCharAny(void)
1539 {
1540     AjPStr regstr = NULL;
1541     AjPStr tmpstr = NULL;
1542 
1543     if(!seqtypeRegAny)
1544     {
1545 	regstr = ajStrNewRes(256);
1546 	tmpstr = ajStrNewS(seqTypeStrAny());
1547 
1548 	ajStrAppendC(&regstr, "([^");
1549 	ajStrKeepSetAlpha(&tmpstr);
1550 	ajStrFmtLower(&tmpstr);
1551 	ajStrAppendS(&regstr, tmpstr);
1552 	ajStrAppendS(&regstr, seqTypeStrAny());
1553 	ajStrAppendC(&regstr, "+])");
1554 
1555 	seqtypeRegAny = ajRegComp(regstr);
1556 
1557 	ajStrDel(&regstr);
1558 	ajStrDel(&tmpstr);
1559     }
1560 
1561     return seqtypeRegAny;
1562 }
1563 
1564 
1565 
1566 
1567 /* @funcstatic seqTypeCharAnyGap **********************************************
1568 **
1569 ** Returns regular expression to test for type Any with gaps
1570 **
1571 ** @return [AjPRegexp] valid characters
1572 **
1573 ** @release 2.7.0
1574 ******************************************************************************/
1575 
seqTypeCharAnyGap(void)1576 static AjPRegexp seqTypeCharAnyGap(void)
1577 {
1578     AjPStr regstr = NULL;
1579     AjPStr tmpstr = NULL;
1580 
1581     if(!seqtypeRegAnyGap)
1582     {
1583 	regstr = ajStrNewRes(256);
1584 	tmpstr = ajStrNewS(seqTypeStrAnyGap());
1585 
1586 	ajStrAppendC(&regstr, "([^");
1587 	ajStrAppendS(&regstr, tmpstr);
1588 	ajStrKeepSetAlpha(&tmpstr);
1589 	ajStrFmtLower(&tmpstr);
1590 	ajStrAppendS(&regstr, tmpstr);
1591 	ajStrAppendC(&regstr, "+])");
1592 
1593 	seqtypeRegAnyGap = ajRegComp(regstr);
1594 
1595 	ajStrDel(&regstr);
1596 	ajStrDel(&tmpstr);
1597     }
1598 
1599     return seqtypeRegAnyGap;
1600 }
1601 
1602 
1603 
1604 
1605 /* @funcstatic seqTypeCharNuc *************************************************
1606 **
1607 ** Returns regular expression to test for nucleotide bases
1608 **
1609 ** @return [AjPRegexp] valid characters
1610 **
1611 ** @release 2.7.0
1612 ******************************************************************************/
1613 
seqTypeCharNuc(void)1614 static AjPRegexp seqTypeCharNuc(void)
1615 {
1616     AjPStr regstr = NULL;
1617     AjPStr tmpstr = NULL;
1618 
1619     if(!seqtypeRegNuc)
1620     {
1621 	regstr = ajStrNewRes(256);
1622 	tmpstr = ajStrNewS(seqTypeStrNuc());
1623 
1624 	ajStrAppendC(&regstr, "([^");
1625 	ajStrKeepSetAlpha(&tmpstr);
1626 	ajStrFmtLower(&tmpstr);
1627 	ajStrAppendS(&regstr, tmpstr);
1628 	ajStrAppendS(&regstr, seqTypeStrNuc());
1629 	ajStrAppendC(&regstr, "+])");
1630 
1631 	seqtypeRegNuc = ajRegComp(regstr);
1632 
1633 	ajStrDel(&regstr);
1634 	ajStrDel(&tmpstr);
1635      }
1636 
1637     return seqtypeRegNuc;
1638 }
1639 
1640 
1641 
1642 
1643 /* @funcstatic seqTypeCharNucGap **********************************************
1644 **
1645 ** Returns regular expression to test for nucleotide bases with gaps
1646 **
1647 ** @return [AjPRegexp] valid characters
1648 **
1649 ** @release 2.7.0
1650 ******************************************************************************/
1651 
seqTypeCharNucGap(void)1652 static AjPRegexp seqTypeCharNucGap(void)
1653 {
1654     AjPStr regstr = NULL;
1655     AjPStr tmpstr = NULL;
1656 
1657     if(!seqtypeRegNucGap)
1658     {
1659 	regstr = ajStrNewRes(256);
1660 	tmpstr = ajStrNewS(seqTypeStrNucGap());
1661 
1662 	ajStrAppendC(&regstr, "([^");
1663 	ajStrAppendS(&regstr, tmpstr);
1664 	ajStrKeepSetAlpha(&tmpstr);
1665 	ajStrFmtLower(&tmpstr);
1666 	ajStrAppendS(&regstr, tmpstr);
1667 	ajStrAppendC(&regstr, "+])");
1668 
1669 	seqtypeRegNucGap = ajRegComp(regstr);
1670 
1671 	ajStrDel(&regstr);
1672 	ajStrDel(&tmpstr);
1673     }
1674 
1675     return seqtypeRegNucGap;
1676 }
1677 
1678 
1679 
1680 
1681 /* @funcstatic seqTypeCharNucGapPhylo *****************************************
1682 **
1683 ** Returns regular expression to test for nucleotide bases with gaps
1684 ** and queries
1685 **
1686 ** @return [AjPRegexp] valid characters
1687 **
1688 ** @release 2.9.0
1689 ******************************************************************************/
1690 
seqTypeCharNucGapPhylo(void)1691 static AjPRegexp seqTypeCharNucGapPhylo(void)
1692 {
1693     AjPStr regstr = NULL;
1694     AjPStr tmpstr = NULL;
1695 
1696     if(!seqtypeRegNucGapPhylo)
1697     {
1698 	regstr = ajStrNewRes(256);
1699 	tmpstr = ajStrNewS(seqTypeStrNucGapPhylo());
1700 
1701 	ajStrAppendC(&regstr, "([^");
1702 	ajStrKeepSetAlpha(&tmpstr);
1703 	ajStrFmtLower(&tmpstr);
1704 	ajStrAppendS(&regstr, tmpstr);
1705 	ajStrAppendS(&regstr, seqTypeStrNucGapPhylo());
1706 	ajStrAppendC(&regstr, "+])");
1707 
1708 	seqtypeRegNucGapPhylo = ajRegComp(regstr);
1709 
1710 	ajStrDel(&regstr);
1711 	ajStrDel(&tmpstr);
1712     }
1713 
1714     return seqtypeRegNucGapPhylo;
1715 }
1716 
1717 
1718 
1719 
1720 /* @funcstatic seqTypeCharNucPure *********************************************
1721 **
1722 ** Returns regular expression to test for nucleotide bases
1723 ** with no ambiguity
1724 **
1725 ** @return [AjPRegexp] valid characters
1726 **
1727 ** @release 2.7.0
1728 ******************************************************************************/
1729 
seqTypeCharNucPure(void)1730 static AjPRegexp seqTypeCharNucPure(void)
1731 {
1732     AjPStr regstr = NULL;
1733     AjPStr tmpstr = NULL;
1734 
1735     if(!seqtypeRegNucPure)
1736     {
1737 	regstr = ajStrNewRes(256);
1738 	tmpstr = ajStrNewS(seqTypeStrNucPure());
1739 
1740 	ajStrAppendC(&regstr, "([^");
1741 	ajStrKeepSetAlpha(&tmpstr);
1742 	ajStrFmtLower(&tmpstr);
1743 	ajStrAppendS(&regstr, tmpstr);
1744 	ajStrAppendS(&regstr, seqTypeStrNucPure());
1745 	ajStrAppendC(&regstr, "+])");
1746 
1747 	seqtypeRegNucPure = ajRegComp(regstr);
1748 
1749 	ajStrDel(&regstr);
1750 	ajStrDel(&tmpstr);
1751     }
1752 
1753     return seqtypeRegNucPure;
1754 }
1755 
1756 
1757 
1758 
1759 /* @funcstatic seqTypeCharProt ************************************************
1760 **
1761 ** Returns regular expression to test for protein residues
1762 **
1763 ** @return [AjPRegexp] valid characters
1764 **
1765 ** @release 2.7.0
1766 ******************************************************************************/
1767 
seqTypeCharProt(void)1768 static AjPRegexp seqTypeCharProt(void)
1769 {
1770     AjPStr regstr = NULL;
1771     AjPStr tmpstr = NULL;
1772 
1773     if(!seqtypeRegProt)
1774     {
1775 	regstr = ajStrNewRes(256);
1776 	tmpstr = ajStrNewS(seqTypeStrProt());
1777 
1778 	ajStrAppendC(&regstr, "([^");
1779 	ajStrKeepSetAlpha(&tmpstr);
1780 	ajStrFmtLower(&tmpstr);
1781 	ajStrAppendS(&regstr, tmpstr);
1782 	ajStrAppendS(&regstr, seqTypeStrProt());
1783 	ajStrAppendC(&regstr, "+])");
1784 
1785 	seqtypeRegProt = ajRegComp(regstr);
1786 
1787 	ajStrDel(&regstr);
1788 	ajStrDel(&tmpstr);
1789     }
1790 
1791     return seqtypeRegProt;
1792 }
1793 
1794 
1795 
1796 
1797 /* @funcstatic seqTypeCharProtGap *********************************************
1798 **
1799 ** Returns regular expression to test for protein residues or gaps
1800 **
1801 ** @return [AjPRegexp] valid characters
1802 **
1803 ** @release 2.7.0
1804 ******************************************************************************/
1805 
seqTypeCharProtGap(void)1806 static AjPRegexp seqTypeCharProtGap(void)
1807 {
1808     AjPStr regstr = NULL;
1809     AjPStr tmpstr = NULL;
1810 
1811     if(!seqtypeRegProtGap)
1812     {
1813 	regstr = ajStrNewRes(256);
1814 	tmpstr = ajStrNewS(seqTypeStrProtGap());
1815 
1816 	ajStrAppendC(&regstr, "([^");
1817 	ajStrKeepSetAlpha(&tmpstr);
1818 	ajStrFmtLower(&tmpstr);
1819 	ajStrAppendS(&regstr, tmpstr);
1820 	ajStrAppendS(&regstr, seqTypeStrProtGap());
1821 	ajStrAppendC(&regstr, "+])");
1822 
1823 	seqtypeRegProtGap = ajRegComp(regstr);
1824 
1825 	ajStrDel(&regstr);
1826 	ajStrDel(&tmpstr);
1827     }
1828 
1829     return seqtypeRegProtGap;
1830 }
1831 
1832 
1833 
1834 
1835 /* @funcstatic seqTypeCharProtGapPhylo ****************************************
1836 **
1837 ** Returns regular expression to test for protein residues or gaps
1838 ** stops and queries
1839 **
1840 ** @return [AjPRegexp] valid characters
1841 **
1842 ** @release 2.9.0
1843 ******************************************************************************/
1844 
seqTypeCharProtGapPhylo(void)1845 static AjPRegexp seqTypeCharProtGapPhylo(void)
1846 {
1847     AjPStr regstr = NULL;
1848     AjPStr tmpstr = NULL;
1849 
1850     if(!seqtypeRegProtGapPhylo)
1851     {
1852 	regstr = ajStrNewRes(256);
1853 	tmpstr = ajStrNewS(seqTypeStrProtGapPhylo());
1854 
1855 	ajStrAppendC(&regstr, "([^");
1856 	ajStrKeepSetAlpha(&tmpstr);
1857 	ajStrFmtLower(&tmpstr);
1858 	ajStrAppendS(&regstr, tmpstr);
1859 	ajStrAppendS(&regstr, seqTypeStrProtGapPhylo());
1860 	ajStrAppendC(&regstr, "+])");
1861 
1862 	seqtypeRegProtGapPhylo = ajRegComp(regstr);
1863 
1864 	ajStrDel(&regstr);
1865 	ajStrDel(&tmpstr);
1866     }
1867 
1868     return seqtypeRegProtGapPhylo;
1869 }
1870 
1871 
1872 
1873 
1874 /* @funcstatic seqTypeCharProtPure ********************************************
1875 **
1876 ** Returns regular expression to test for protein residues
1877 ** with no ambiguity
1878 **
1879 ** @return [AjPRegexp] valid characters
1880 **
1881 ** @release 2.7.0
1882 ******************************************************************************/
1883 
seqTypeCharProtPure(void)1884 static AjPRegexp seqTypeCharProtPure(void)
1885 {
1886     AjPStr regstr = NULL;
1887     AjPStr tmpstr = NULL;
1888 
1889     if(!seqtypeRegProtPure)
1890     {
1891 	regstr = ajStrNewRes(256);
1892 	tmpstr = ajStrNewS(seqTypeStrProtPure());
1893 
1894 	ajStrAppendC(&regstr, "([^");
1895 	ajStrKeepSetAlpha(&tmpstr);
1896 	ajStrFmtLower(&tmpstr);
1897 	ajStrAppendS(&regstr, tmpstr);
1898 	ajStrAppendS(&regstr, seqTypeStrProtPure());
1899 	ajStrAppendC(&regstr, "+])");
1900 
1901 	seqtypeRegProtPure = ajRegComp(regstr);
1902 
1903 	ajStrDel(&regstr);
1904 	ajStrDel(&tmpstr);
1905     }
1906 
1907     return seqtypeRegProtPure;
1908 }
1909 
1910 
1911 
1912 
1913 /* @funcstatic seqTypeCharProtStop ********************************************
1914 **
1915 ** Returns regular expression to test for protein residues or stop codons
1916 **
1917 ** @return [AjPRegexp] valid characters
1918 **
1919 ** @release 2.7.0
1920 ******************************************************************************/
1921 
seqTypeCharProtStop(void)1922 static AjPRegexp seqTypeCharProtStop(void)
1923 {
1924     AjPStr regstr = NULL;
1925     AjPStr tmpstr = NULL;
1926 
1927     if(!seqtypeRegProtStop)
1928     {
1929 	regstr = ajStrNewRes(256);
1930 	tmpstr = ajStrNewS(seqTypeStrProtStop());
1931 
1932 	ajStrAppendC(&regstr, "([^");
1933 	ajStrKeepSetAlpha(&tmpstr);
1934 	ajStrFmtLower(&tmpstr);
1935 	ajStrAppendS(&regstr, tmpstr);
1936 	ajStrAppendS(&regstr, seqTypeStrProtStop());
1937 	ajStrAppendC(&regstr, "+])");
1938 
1939 	seqtypeRegProtStop = ajRegComp(regstr);
1940 
1941 	ajStrDel(&regstr);
1942 	ajStrDel(&tmpstr);
1943     }
1944 
1945     return seqtypeRegProtStop;
1946 }
1947 
1948 
1949 
1950 
1951 /* @funcstatic seqTypeCharProtStopGap *****************************************
1952 **
1953 ** Returns regular expression to test for protein residues or stop codons
1954 ** or gap characters
1955 **
1956 ** @return [AjPRegexp] valid characters
1957 **
1958 ** @release 4.0.0
1959 ******************************************************************************/
1960 
seqTypeCharProtStopGap(void)1961 static AjPRegexp seqTypeCharProtStopGap(void)
1962 {
1963     AjPStr regstr = NULL;
1964     AjPStr tmpstr = NULL;
1965 
1966     if(!seqtypeRegProtStopGap)
1967     {
1968 	regstr = ajStrNewRes(256);
1969 	tmpstr = ajStrNewS(seqTypeStrProtStopGap());
1970 
1971 	ajStrAppendC(&regstr, "([^");
1972 	ajStrKeepSetAlpha(&tmpstr);
1973 	ajStrFmtLower(&tmpstr);
1974 	ajStrAppendS(&regstr, tmpstr);
1975 	ajStrAppendS(&regstr, seqTypeStrProtStopGap());
1976 	ajStrAppendC(&regstr, "+])");
1977 
1978 	seqtypeRegProtStopGap = ajRegComp(regstr);
1979 
1980 	ajStrDel(&regstr);
1981 	ajStrDel(&tmpstr);
1982     }
1983 
1984     return seqtypeRegProtStopGap;
1985 }
1986 
1987 
1988 
1989 
1990 /* @funcstatic seqTypeStrAny **************************************************
1991 **
1992 ** Returns string of valid characters to test for type Any
1993 **
1994 ** @return [AjPStr] valid characters
1995 **
1996 ** @release 4.1.0
1997 ******************************************************************************/
1998 
seqTypeStrAny(void)1999 static AjPStr seqTypeStrAny(void)
2000 {
2001     if(!seqtypeCharsetAny)
2002     {
2003         ajFmtPrintS(&seqtypeCharsetAny, "%s%s%s%s%s",
2004 		    seqCharProtPure,
2005 		    seqCharProtAmbig,
2006 		    seqCharProtStop,
2007 		    seqCharNucPure,
2008 		    seqCharNucAmbig);
2009         ajStrRemoveDupchar(&seqtypeCharsetAny);
2010     }
2011 
2012     return seqtypeCharsetAny;
2013 }
2014 
2015 
2016 
2017 
2018 /* @funcstatic seqTypeStrAnyGap ***********************************************
2019 **
2020 ** Returns string of valid characters to test for type Anygap
2021 **
2022 ** @return [AjPStr] valid characters
2023 **
2024 ** @release 4.1.0
2025 ******************************************************************************/
2026 
seqTypeStrAnyGap(void)2027 static AjPStr seqTypeStrAnyGap(void)
2028 {
2029     if(!seqtypeCharsetAnyGap)
2030     {
2031 	ajFmtPrintS(&seqtypeCharsetAnyGap, "%s%s%s%s%s%s",
2032 		    seqCharProtPure,
2033 		    seqCharProtAmbig,
2034 		    seqCharProtStop,
2035 		    seqCharNucPure,
2036 		    seqCharNucAmbig,
2037 		    seqCharGap);
2038         ajStrRemoveDupchar(&seqtypeCharsetAnyGap);
2039     }
2040 
2041     return seqtypeCharsetAnyGap;
2042 }
2043 
2044 
2045 
2046 
2047 /* @funcstatic seqTypeStrDnaGap ***********************************************
2048 **
2049 ** Returns string of valid characters to test for type Dnagap
2050 **
2051 ** @return [AjPStr] valid characters
2052 **
2053 ** @release 4.1.0
2054 ******************************************************************************/
2055 
seqTypeStrDnaGap(void)2056 static AjPStr seqTypeStrDnaGap(void)
2057 {
2058     if(!seqtypeCharsetDnaGap)
2059     {
2060 	ajFmtPrintS(&seqtypeCharsetDnaGap, "%s%s%s",
2061 		    seqCharNucPure,
2062 		    seqCharNucAmbig,
2063 		    seqCharGap);
2064         ajStrRemoveDupchar(&seqtypeCharsetDnaGap);
2065     }
2066 
2067     return seqtypeCharsetDnaGap;
2068 }
2069 
2070 
2071 
2072 
2073 /* @funcstatic seqTypeStrNuc **************************************************
2074 **
2075 ** Returns string of valid characters to test for type Nuc
2076 **
2077 ** @return [AjPStr] valid characters
2078 **
2079 ** @release 4.1.0
2080 ******************************************************************************/
2081 
seqTypeStrNuc(void)2082 static AjPStr seqTypeStrNuc(void)
2083 {
2084     if(!seqtypeCharsetNuc)
2085     {
2086 	ajFmtPrintS(&seqtypeCharsetNuc, "%s%s",
2087 		    seqCharNucPure,
2088 		    seqCharNucAmbig);
2089         ajStrRemoveDupchar(&seqtypeCharsetNuc);
2090     }
2091 
2092     return seqtypeCharsetNuc;
2093 }
2094 
2095 
2096 
2097 
2098 /* @funcstatic seqTypeStrNucGap ***********************************************
2099 **
2100 ** Returns string of valid characters to test for type Nucgap
2101 **
2102 ** @return [AjPStr] valid characters
2103 **
2104 ** @release 4.1.0
2105 ******************************************************************************/
2106 
seqTypeStrNucGap(void)2107 static AjPStr seqTypeStrNucGap(void)
2108 {
2109     if(!seqtypeCharsetNucGap)
2110     {
2111 	ajFmtPrintS(&seqtypeCharsetNucGap, "%s%s%s",
2112 		    seqCharNucPure,
2113 		    seqCharNucAmbig,
2114 		    seqCharGap);
2115         ajStrRemoveDupchar(&seqtypeCharsetNucGap);
2116     }
2117 
2118     return seqtypeCharsetNucGap;
2119 }
2120 
2121 
2122 
2123 
2124 /* @funcstatic seqTypeStrNucGapPhylo ******************************************
2125 **
2126 ** Returns string of valid characters to test for type Nucgapphylo
2127 **
2128 ** @return [AjPStr] valid characters
2129 **
2130 ** @release 4.1.0
2131 ******************************************************************************/
2132 
seqTypeStrNucGapPhylo(void)2133 static AjPStr seqTypeStrNucGapPhylo(void)
2134 {
2135     if(!seqtypeCharsetNucGapPhylo)
2136     {
2137 	ajFmtPrintS(&seqtypeCharsetNucGapPhylo, "%s%s%s%s",
2138 		    seqCharNucPure,
2139 		    seqCharNucAmbig,
2140 		    seqCharPhylo,
2141 		    seqCharGap);
2142         ajStrRemoveDupchar(&seqtypeCharsetNucGapPhylo);
2143     }
2144 
2145     return seqtypeCharsetNucGapPhylo;
2146 }
2147 
2148 
2149 
2150 
2151 /* @funcstatic seqTypeStrNucPure **********************************************
2152 **
2153 ** Returns string of valid characters to test for type Nucpure
2154 **
2155 ** @return [AjPStr] valid characters
2156 **
2157 ** @release 4.1.0
2158 ******************************************************************************/
2159 
seqTypeStrNucPure(void)2160 static AjPStr seqTypeStrNucPure(void)
2161 {
2162     if(!seqtypeCharsetNucPure)
2163     {
2164 	ajFmtPrintS(&seqtypeCharsetNucPure, "%s",
2165 		    seqCharNucPure);
2166         ajStrRemoveDupchar(&seqtypeCharsetNucPure);
2167     }
2168 
2169     return seqtypeCharsetNucPure;
2170 }
2171 
2172 
2173 
2174 
2175 /* @funcstatic seqTypeStrProt *************************************************
2176 **
2177 ** Returns string of valid characters to test for type Prot
2178 **
2179 ** @return [AjPStr] valid characters
2180 **
2181 ** @release 4.1.0
2182 ******************************************************************************/
2183 
seqTypeStrProt(void)2184 static AjPStr seqTypeStrProt(void)
2185 {
2186     if(!seqtypeCharsetProt)
2187     {
2188 	ajFmtPrintS(&seqtypeCharsetProt, "%s%s",
2189 		    seqCharProtPure,
2190 		    seqCharProtAmbig);
2191         ajStrRemoveDupchar(&seqtypeCharsetProt);
2192     }
2193 
2194     return seqtypeCharsetProt;
2195 }
2196 
2197 
2198 
2199 
2200 /* @funcstatic seqTypeStrProtAny **********************************************
2201 **
2202 ** Returns string of valid characters to test for type Protany
2203 **
2204 ** @return [AjPStr] valid characters
2205 **
2206 ** @release 4.1.0
2207 ******************************************************************************/
2208 
seqTypeStrProtAny(void)2209 static AjPStr seqTypeStrProtAny(void)
2210 {
2211     if(!seqtypeCharsetProtAny)
2212     {
2213 	ajFmtPrintS(&seqtypeCharsetProtAny, "%s%s%s%s%s",
2214 		    seqCharProtPure,
2215 		    seqCharProtAmbig,
2216 		    seqCharProtStop,
2217 		    seqCharPhylo,
2218 		    seqCharGap);
2219         ajStrRemoveDupchar(&seqtypeCharsetProtAny);
2220     }
2221 
2222     return seqtypeCharsetProtAny;
2223 }
2224 
2225 
2226 
2227 
2228 /* @funcstatic seqTypeStrProtGap **********************************************
2229 **
2230 ** Returns string of valid characters to test for type Protgap
2231 **
2232 ** @return [AjPStr] valid characters
2233 **
2234 ** @release 4.1.0
2235 ******************************************************************************/
2236 
seqTypeStrProtGap(void)2237 static AjPStr seqTypeStrProtGap(void)
2238 {
2239     if(!seqtypeCharsetProtGap)
2240     {
2241 	ajFmtPrintS(&seqtypeCharsetProtGap, "%s%s%s",
2242 		    seqCharProtPure,
2243 		    seqCharProtAmbig,
2244 		    seqCharGap);
2245         ajStrRemoveDupchar(&seqtypeCharsetProtGap);
2246     }
2247 
2248     return seqtypeCharsetProtGap;
2249 }
2250 
2251 
2252 
2253 
2254 /* @funcstatic seqTypeStrProtGapPhylo *****************************************
2255 **
2256 ** Returns string of valid characters to test for type Protgapphylo
2257 **
2258 ** @return [AjPStr] valid characters
2259 **
2260 ** @release 4.1.0
2261 ******************************************************************************/
2262 
seqTypeStrProtGapPhylo(void)2263 static AjPStr seqTypeStrProtGapPhylo(void)
2264 {
2265     if(!seqtypeCharsetProtGapPhylo)
2266     {
2267         ajFmtPrintS(&seqtypeCharsetProtGapPhylo, "%s%s%s%s%s",
2268 		    seqCharProtPure,
2269 		    seqCharProtAmbig,
2270 		    seqCharProtStop,
2271 		    seqCharPhylo,
2272 		    seqCharGap);
2273         ajStrRemoveDupchar(&seqtypeCharsetProtGapPhylo);
2274     }
2275 
2276     return seqtypeCharsetProtGapPhylo;
2277 }
2278 
2279 
2280 
2281 
2282 /* @funcstatic seqTypeStrProtPure *********************************************
2283 **
2284 ** Returns string of valid characters to test for type Protpure
2285 **
2286 ** @return [AjPStr] valid characters
2287 **
2288 ** @release 4.1.0
2289 ******************************************************************************/
2290 
seqTypeStrProtPure(void)2291 static AjPStr seqTypeStrProtPure(void)
2292 {
2293     if(!seqtypeCharsetProtPure)
2294     {
2295         ajFmtPrintS(&seqtypeCharsetProtPure, "%s",
2296 		    seqCharProtPure);
2297         ajStrRemoveDupchar(&seqtypeCharsetProtPure);
2298     }
2299 
2300     return seqtypeCharsetProtPure;
2301 }
2302 
2303 
2304 
2305 
2306 /* @funcstatic seqTypeStrProtStop *********************************************
2307 **
2308 ** Returns string of valid characters to test for type Protstop
2309 **
2310 ** @return [AjPStr] valid characters
2311 **
2312 ** @release 4.1.0
2313 ******************************************************************************/
2314 
seqTypeStrProtStop(void)2315 static AjPStr seqTypeStrProtStop(void)
2316 {
2317     if(!seqtypeCharsetProtStop)
2318     {
2319         ajFmtPrintS(&seqtypeCharsetProtStop, "%s%s%s",
2320 		    seqCharProtPure,
2321 		    seqCharProtAmbig,
2322 		    seqCharProtStop);
2323         ajStrRemoveDupchar(&seqtypeCharsetProtStop);
2324     }
2325 
2326     return seqtypeCharsetProtStop;
2327 }
2328 
2329 
2330 
2331 
2332 /* @funcstatic seqTypeStrProtStopGap ******************************************
2333 **
2334 ** Returns string of valid characters to test for type Protstopgap
2335 **
2336 ** @return [AjPStr] valid characters
2337 **
2338 ** @release 4.1.0
2339 ******************************************************************************/
2340 
seqTypeStrProtStopGap(void)2341 static AjPStr seqTypeStrProtStopGap(void)
2342 {
2343     if(!seqtypeCharsetProtStopGap)
2344     {
2345         ajFmtPrintS(&seqtypeCharsetProtStopGap, "%s%s%s%s",
2346 		    seqCharProtPure,
2347 		    seqCharProtAmbig,
2348 		    seqCharProtStop,
2349 		    seqCharGap);
2350         ajStrRemoveDupchar(&seqtypeCharsetProtStopGap);
2351     }
2352 
2353     return seqtypeCharsetProtStopGap;
2354 }
2355 
2356 
2357 
2358 
2359 /* @funcstatic seqTypeStrRnaGap ***********************************************
2360 **
2361 ** Returns string of valid characters to test for type Rnagap
2362 **
2363 ** @return [AjPStr] valid characters
2364 **
2365 ** @release 4.1.0
2366 ******************************************************************************/
2367 
seqTypeStrRnaGap(void)2368 static AjPStr seqTypeStrRnaGap(void)
2369 {
2370     if(!seqtypeCharsetRnaGap)
2371     {
2372         ajFmtPrintS(&seqtypeCharsetRnaGap, "%s%s",
2373 		    seqCharNucRna,
2374 		    seqCharGap);
2375         ajStrRemoveDupchar(&seqtypeCharsetRnaGap);
2376     }
2377 
2378     return seqtypeCharsetRnaGap;
2379 }
2380 
2381 
2382 
2383 
2384 /* @funcstatic seqFindType ****************************************************
2385 **
2386 ** Returns sequence type index and ajTrue if type was found
2387 **
2388 ** @param [r] type_name [const AjPStr] Sequence type
2389 ** @param [w] typenum [ajint*] Sequence type index
2390 ** @return [AjBool] ajTrue if sequence type was found
2391 **
2392 **
2393 ** @release 2.7.0
2394 ******************************************************************************/
2395 
seqFindType(const AjPStr type_name,ajint * typenum)2396 static AjBool seqFindType(const AjPStr type_name, ajint* typenum)
2397 {
2398     ajint i;
2399     ajint itype = -1;
2400 
2401     ajStrAssignS(&seqtypeTmpstr, type_name);
2402     ajStrFmtLower(&seqtypeTmpstr);
2403 
2404     for(i = 0; seqType[i].Name; i++)
2405 	if(ajStrMatchC(seqtypeTmpstr, seqType[i].Name))
2406 	{
2407 	    itype = i;
2408 	    break;
2409 	}
2410 
2411     if(itype < 0)
2412     {
2413 	*typenum = i;
2414 
2415 	return ajFalse;
2416     }
2417 
2418     *typenum = itype;
2419 
2420     return ajTrue;
2421 }
2422 
2423 
2424 
2425 
2426 /* @func ajSeqTypeIsProt ******************************************************
2427 **
2428 ** Returns ajTrue is sequence type can be a protein (or 'any')
2429 **
2430 ** @param [r] type_name [const AjPStr] Sequence type
2431 ** @return [AjBool] ajTrue if sequence can be protein
2432 **
2433 **
2434 ** @release 2.7.0
2435 ******************************************************************************/
2436 
ajSeqTypeIsProt(const AjPStr type_name)2437 AjBool ajSeqTypeIsProt(const AjPStr type_name)
2438 {
2439     ajint itype;
2440 
2441     if(seqFindType(type_name, &itype))
2442 	switch(seqType[itype].Type)
2443 	{
2444             case ISNUC:
2445                 return ajFalse;
2446             default:
2447                 return ajTrue;
2448 	}
2449 
2450     return ajFalse;
2451 }
2452 
2453 
2454 
2455 
2456 /* @func ajSeqTypeIsNuc *******************************************************
2457 **
2458 ** Returns ajTrue is sequence type can be a nucleotide (or 'any')
2459 **
2460 ** @param [r] type_name [const AjPStr] Sequence type
2461 ** @return [AjBool] ajTrue if sequence can be nucleotide
2462 **
2463 **
2464 ** @release 2.7.0
2465 ******************************************************************************/
2466 
ajSeqTypeIsNuc(const AjPStr type_name)2467 AjBool ajSeqTypeIsNuc(const AjPStr type_name)
2468 {
2469     ajint itype;
2470 
2471     if(seqFindType(type_name, &itype))
2472 	switch(seqType[itype].Type)
2473 	{
2474             case ISPROT:
2475                 return ajFalse;
2476 
2477             default:
2478                 return ajTrue;
2479 	}
2480 
2481     return ajFalse;
2482 }
2483 
2484 
2485 
2486 
2487 /* @func ajSeqTypeIsAny *******************************************************
2488 **
2489 ** Returns ajTrue is sequence type can be a protein or nucleotide
2490 **
2491 ** @param [r] type_name [const AjPStr] Sequence type
2492 ** @return [AjBool] ajTrue if sequence can be protein or nucleotide
2493 **
2494 **
2495 ** @release 2.7.0
2496 ******************************************************************************/
2497 
ajSeqTypeIsAny(const AjPStr type_name)2498 AjBool ajSeqTypeIsAny(const AjPStr type_name)
2499 {
2500     ajint itype;
2501 
2502     if(seqFindType(type_name, &itype))
2503 	switch(seqType[itype].Type)
2504 	{
2505             case ISNUC:
2506                 return ajFalse;
2507 
2508             case ISPROT:
2509                 return ajFalse;
2510 
2511             default:
2512                 return ajTrue;
2513 	}
2514 
2515     return ajFalse;
2516 }
2517 
2518 
2519 
2520 
2521 /* @func ajSeqTypeSummary *****************************************************
2522 **
2523 ** Returns ajTrue is sequence type can be a protein or nucleotide
2524 **
2525 ** @param [r] type_name [const AjPStr] Sequence type
2526 ** @param [w] Ptype [AjPStr*] Sequence type 'protein' 'nucleotide' or 'any'
2527 ** @param [w] gaps [AjBool*] True if gap characters are preserved
2528 ** @return [AjBool] ajTrue if sequence can be protein or nucleotide
2529 **
2530 **
2531 ** @release 4.0.0
2532 ******************************************************************************/
2533 
ajSeqTypeSummary(const AjPStr type_name,AjPStr * Ptype,AjBool * gaps)2534 AjBool ajSeqTypeSummary(const AjPStr type_name, AjPStr* Ptype, AjBool* gaps)
2535 {
2536     ajint itype;
2537 
2538     if(seqFindType(type_name, &itype))
2539     {
2540 	*gaps = seqType[itype].Gaps;
2541 
2542 	switch(seqType[itype].Type)
2543 	{
2544             case ISNUC:
2545                 ajStrAssignC(Ptype, "nucleotide");
2546                 break;
2547             case ISPROT:
2548                 ajStrAssignC(Ptype, "protein");
2549                 break;
2550             default:
2551                 ajStrAssignClear(Ptype);
2552                 break;
2553 	}
2554 	return ajTrue;
2555     }
2556 
2557     return ajFalse;
2558 }
2559 
2560 
2561 
2562 
2563 /* @func ajSeqTypeExit ********************************************************
2564 **
2565 ** Cleans up sequence type processing internal memory
2566 **
2567 ** @return [void]
2568 **
2569 ** @release 4.0.0
2570 ** @@
2571 ******************************************************************************/
2572 
ajSeqTypeExit(void)2573 void ajSeqTypeExit(void)
2574 {
2575     ajuint i;
2576 
2577     ajStrDel(&seqtypeTmpstr);
2578 
2579     ajRegFree(&seqtypeRegAny);
2580     ajRegFree(&seqtypeRegAnyGap);
2581     ajRegFree(&seqtypeRegDnaGap);
2582     ajRegFree(&seqtypeRegNuc);
2583     ajRegFree(&seqtypeRegNucGap);
2584     ajRegFree(&seqtypeRegNucPure);
2585     ajRegFree(&seqtypeRegProt);
2586     ajRegFree(&seqtypeRegProtAny);
2587     ajRegFree(&seqtypeRegProtGap);
2588     ajRegFree(&seqtypeRegProtPure);
2589     ajRegFree(&seqtypeRegProtStop);
2590     ajRegFree(&seqtypeRegRnaGap);
2591 
2592     ajStrDel(&seqtypeCharsetAny);
2593     ajStrDel(&seqtypeCharsetAnyGap);
2594     ajStrDel(&seqtypeCharsetDnaGap);
2595     ajStrDel(&seqtypeCharsetNuc);
2596     ajStrDel(&seqtypeCharsetNucGap);
2597     ajStrDel(&seqtypeCharsetNucGapPhylo);
2598     ajStrDel(&seqtypeCharsetNucPure);
2599     ajStrDel(&seqtypeCharsetProt);
2600     ajStrDel(&seqtypeCharsetProtAny);
2601     ajStrDel(&seqtypeCharsetProtGap);
2602     ajStrDel(&seqtypeCharsetProtPure);
2603     ajStrDel(&seqtypeCharsetProtStop);
2604     ajStrDel(&seqtypeCharsetProtStopGap);
2605     ajStrDel(&seqtypeCharsetRnaGap);
2606 
2607     ajCharDel(&seqNewGapChars);
2608 
2609     for(i=0;seqType[i].Name;i++)
2610     {
2611         if(seqType[i].Filter)
2612             AJFREE(seqType[i].Filter);
2613     }
2614 
2615     return;
2616 }
2617 
2618 
2619 
2620 
2621 /* @func ajSeqTypeUnused ******************************************************
2622 **
2623 ** Dummy function to catch all unused functions defined in the ajseqtype
2624 ** source file.
2625 **
2626 ** @return [void]
2627 **
2628 **
2629 ** @release 4.1.0
2630 ******************************************************************************/
2631 
ajSeqTypeUnused(void)2632 void ajSeqTypeUnused(void)
2633 {
2634     AjPStr ajpstr=NULL;
2635     AjPRegexp ajpregexp = NULL;
2636 
2637     seqTypeTest(ajpstr, ajpregexp);
2638 
2639     return;
2640 }
2641