1 /* @source embpat *************************************************************
2 **
3 ** General routines for pattern matching.
4 **
5 ** @author Copyright (C) Alan Bleasby 1999
6 ** @version $Revision: 1.84 $
7 ** @modified $Date: 2013/07/15 20:53:52 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 **
25 ******************************************************************************/
26 
27 
28 #include "ajlib.h"
29 
30 #include "embpat.h"
31 #include "embmat.h"
32 
33 #include "ajmath.h"
34 #include "ajreg.h"
35 #include "ajlist.h"
36 #include "ajtable.h"
37 #include "ajpat.h"
38 #include "ajseq.h"
39 
40 #include <stdlib.h>
41 #include <limits.h>
42 
43 
44 
45 
46 /* @datastatic PatPTypes ***************************************************
47 **
48 ** Prosite pattern types
49 **
50 ** @alias PatSTypes
51 ** @alias PatOTypes
52 **
53 ** @attr Name [const char*] Type name
54 ** @attr Desc [const char*] Type description
55 ** @@
56 ******************************************************************************/
57 
58 typedef struct PatSTypes
59 {
60     const char *Name;
61     const char *Desc;
62 } PatOTypes;
63 
64 #define PatPTypes PatOTypes*
65 
66 static PatOTypes patTypes[] =
67 {
68 /* "Name",    "Description" */
69   {"unknown", "Unknown/undefined pattern"},
70   {"BMH",     "Boyer Moore Horspool pattern"},
71   {"BYP",     "Baeza-Yates Perleberg pattern"},
72   {"SO",      "Shift-OR pattern"},
73   {"BYGC",    "Baeza-Yates Gonnet class pattern"},
74   {"Regex",   "Prosite converted to regex"},
75   {"TUB",     "Tarhio-Ukkonen-Bleasby"},
76   {"OUB",     "Brute force processing"},
77   {NULL, NULL}
78 };
79 
80 
81 
82 
83 /* @datastatic MethPData ***************************************************
84 **
85 ** Methylation data
86 **
87 ** @alias MethSData
88 ** @alias MethOData
89 **
90 ** @attr Name [AjPStr] Name of methylation type
91 ** @attr Site [AjPStr] Methylase recognition site
92 ** @attr Replace [AjPStr] Methylase recognition site replacement sequence
93 ** @@
94 ******************************************************************************/
95 
96 typedef struct MethSData
97 {
98     AjPStr Name;
99     AjPStr Site;
100     AjPStr Replace;
101 } MethOData;
102 
103 #define MethPData MethOData*
104 
105 
106 static void    patRestrictPushHit(const EmbPPatRestrict enz,
107 				 AjPList l, ajuint pos,
108 				 ajuint begin, ajuint len,
109 				 AjBool forward, AjBool plasmid);
110 static void    patRestrictMethylMod(AjPStr *str, AjPStr *rstr,
111                                     AjPList methlist);
112 static AjPList patRestrictReadMethyl(AjPFile methfile);
113 
114 
115 static void   patAminoCarboxyl(const AjPStr s,AjPStr *cs,
116 			       AjBool *amino, AjBool *carboxyl);
117 static AjBool patParenTest(const char *p, AjBool *repeat, AjBool *range);
118 static AjBool patExpandRepeat(AjPStr *s);
119 static void   patIUBTranslate(AjPStr *pat);
120 static void   patProteinTranslate(AjPStr *pat);
121 static AjBool patBruteClass(const char *p, char c);
122 static AjBool patBruteCompl(const char *p, char c);
123 static AjBool patBruteIsRange(const char *t, ajuint *x, ajuint *y);
124 static AjBool patBruteCharMatch(const char *t, char c);
125 static ajuint patBruteNextPatChar(const char *t, ajuint ppos);
126 static AjBool patOUBrute(const char *seq, const char *pat, ajuint spos,
127 			 ajuint ppos, ajuint mm,
128 			 ajuint omm, ajuint level, AjPList l, AjBool carboxyl,
129 			 ajuint begin, ajuint *count, const AjPStr name,
130 			 ajuint st);
131 
132 
133 
134 
135 /* @funcstatic patStringFree **************************************************
136 **
137 ** Free a pattern structure, through an ajListMap call
138 **
139 ** @param [d] x [void**] pattern
140 ** @param [r] cl [void*] Optional function - required by ajListMap but unused
141 ** @return [void]
142 **
143 ** @release 2.0.0
144 ** @@
145 ******************************************************************************/
146 
patStringFree(void ** x,void * cl)147 static void patStringFree(void **x, void *cl)
148 {
149     ajuint **ptr = (ajuint **)x;
150 
151     (void) cl;				/* make it used */
152     AJFREE(*ptr);
153 
154     return;
155 }
156 
157 
158 
159 
160 /* @func embPatSeqCreateRegExp ************************************************
161 **
162 ** Create a regular expression for a string and substitute the chars for
163 ** Nucleotides or proteins as needed.
164 **
165 ** @param [r] thys [const AjPStr] string to create reg expr from.
166 ** @param [r] protein [AjBool] is it a protein.
167 **
168 ** @return [AjPStr] the new regular expression.
169 **
170 ** @release 1.0.0
171 ******************************************************************************/
172 
embPatSeqCreateRegExp(const AjPStr thys,AjBool protein)173 AjPStr embPatSeqCreateRegExp(const AjPStr thys, AjBool protein)
174 {
175     return embPatSeqCreateRegExpC(ajStrGetPtr(thys), protein);
176 }
177 
178 
179 
180 
181 /* @func embPatSeqCreateRegExpC ***********************************************
182 **
183 ** Create a regular expression for a string and substitute the chars for
184 ** Nucleotides or proteins as needed.
185 **
186 ** @param [r] ptr [const char *] text to create reg expr from.
187 ** @param [r] protein [AjBool] is it a protein.
188 **
189 ** @return [AjPStr] the new regular expression.
190 **
191 ** @release 1.0.0
192 ******************************************************************************/
193 
embPatSeqCreateRegExpC(const char * ptr,AjBool protein)194 AjPStr embPatSeqCreateRegExpC(const char *ptr, AjBool protein)
195 {
196 
197 /* codes for A-Z including ambiguity codes */
198 
199     const char *nucpatternmatch[] =
200     {
201 	"[Aa]", "[CcGgTtUu]", "[Cc]", "[AaGgTtUu]",
202 	"", "", "[Gg]", "[AaCcTtUu]",
203 	"", "", "", "",
204 	"[AaCc]", "[A-Za-z]", "", "",
205 	"", "[AaGg]", "[GgCc]", "[TtUu]",
206 	"[TtUu]","[AaCcGg]", "[AaTtUu]", "[A-Za-z]",
207 	"[CTU]", ""
208     };
209 
210     const char *protpatternmatch[] =
211     {
212 	"[Aa]","[DdNn]","[Cc]","[Dd]",
213 	"[Ee]","[Ff]","[Gg]","[AaCcTtUu]",
214 	"[Ii]","","[Kk]","[Ll]",
215 	"[Mm]","[A-Za-z]","","[Pp]",
216 	"[Qq]","[Rr]","[Ss]","[Tt]",
217 	"[Uu]","[Vv]","[Ww]","[A-Za-z]",
218 	"[Yy]","[EeQq]"
219     };
220 
221     AjPStr regexp  = 0;
222     ajuint match;
223     char match2[2] = " ";
224     const char* optr = ptr;
225 
226     regexp = ajStrNewRes((ajuint)strlen(ptr) * 4); /* just a rough guess */
227 
228     while(*ptr != '\0')
229     {
230 	/* alphabetic characters converted to character sets */
231 	if((*ptr > 64 && *ptr < 91) || (*ptr > 96 && *ptr < 123))
232 	{
233 	    if(*ptr > 91)
234 		match = ((ajuint) *ptr) - 97;
235 	    else
236 		match = ((ajuint) *ptr) - 65;
237 
238 	    if(protein)
239 		ajStrAppendC(&regexp,protpatternmatch[match]);
240 	    else
241 		ajStrAppendC(&regexp,nucpatternmatch[match]);
242 	}
243 	else
244 	{
245 	    match2[0] = *ptr;
246 	    ajStrAppendC(&regexp,match2);
247 	}
248 	ptr++;
249     }
250 
251     ajDebug("embPatSeqCreateRegExpC ptr: '%s' returns regexp: %S'\n",
252 	    optr, regexp);
253 
254     return regexp;
255 }
256 
257 
258 
259 
260 /* @func embPatSeqMatchFind ***************************************************
261 **
262 ** Find all the regular expression matches of reg in the string string.
263 **
264 ** @param [r] seq [const AjPSeq] Sequence to be searched.
265 ** @param [r] reg [const AjPStr] regular expression string.
266 **
267 ** @return [EmbPPatMatch] Results of the pattern matching.
268 **
269 **
270 ** @release 1.0.0
271 ******************************************************************************/
272 
embPatSeqMatchFind(const AjPSeq seq,const AjPStr reg)273 EmbPPatMatch embPatSeqMatchFind(const AjPSeq seq, const AjPStr reg)
274 {
275     return embPatSeqMatchFindC(seq, ajStrGetPtr(reg));
276 }
277 
278 
279 
280 
281 /* @func embPatSeqMatchFindC **************************************************
282 **
283 ** Find all the regular expression matches of reg in the string string.
284 **
285 ** @param [r] seq [const AjPSeq] Sequence to be searched.
286 ** @param [r] reg [const char*] regular expression text.
287 **
288 ** @return [EmbPPatMatch] Results of the pattern matching.
289 **
290 **
291 ** @release 1.0.0
292 ******************************************************************************/
293 
embPatSeqMatchFindC(const AjPSeq seq,const char * reg)294 EmbPPatMatch embPatSeqMatchFindC(const AjPSeq seq, const char *reg)
295 {
296     AjPStr regexp = NULL;
297     AjBool protein;
298     EmbPPatMatch results;
299 
300     protein = ajSeqIsProt(seq);
301 
302     regexp  = embPatSeqCreateRegExpC(reg,protein);
303     results = embPatMatchFind(regexp, ajSeqGetSeqS(seq), ajFalse, ajFalse);
304 
305     ajStrDel(&regexp);
306 
307     return results;
308 }
309 
310 
311 
312 
313 /* @func embPatSeqMatchFindAll ************************************************
314 **
315 ** Find all the regular expression matches of reg in the string string.
316 **
317 ** @param [r] seq [const AjPSeq] Sequence to be searched.
318 ** @param [r] reg [const AjPStr] regular expression string.
319 **
320 ** @return [EmbPPatMatch] Results of the pattern matching.
321 **
322 **
323 ** @release 1.0.0
324 ******************************************************************************/
325 
embPatSeqMatchFindAll(const AjPSeq seq,const AjPStr reg)326 EmbPPatMatch embPatSeqMatchFindAll(const AjPSeq seq, const AjPStr reg)
327 {
328     return embPatSeqMatchFindAllC(seq, ajStrGetPtr(reg));
329 }
330 
331 
332 
333 
334 /* @func embPatSeqMatchFindAllC ***********************************************
335 **
336 ** Find all the regular expression matches of reg in the string string.
337 **
338 ** @param [r] seq [const AjPSeq] Sequence to be searched.
339 ** @param [r] reg [const char*] regular expression text.
340 **
341 ** @return [EmbPPatMatch] Results of the pattern matching.
342 **
343 **
344 ** @release 1.0.0
345 ******************************************************************************/
346 
embPatSeqMatchFindAllC(const AjPSeq seq,const char * reg)347 EmbPPatMatch embPatSeqMatchFindAllC(const AjPSeq seq, const char *reg)
348 {
349     AjPStr regexp = NULL;
350     AjBool protein;
351     EmbPPatMatch results;
352 
353     protein = ajSeqIsProt(seq);
354 
355     regexp  = embPatSeqCreateRegExpC(reg,protein);
356     results = embPatMatchFindAll(regexp, ajSeqGetSeqS(seq), ajFalse, ajFalse);
357 
358     ajStrDel(&regexp);
359 
360     return results;
361 }
362 
363 
364 
365 
366 /* @func embPatMatchFind ******************************************************
367 **
368 ** Find all the regular expression matches of reg in the string string.
369 **
370 ** @param [r] regexp [const AjPStr] Regular expression string.
371 ** @param [r] strng [const AjPStr] String to be searched.
372 ** @param [r] left [AjBool] has to match the start
373 ** @param [r] right [AjBool] has to match the end
374 **
375 ** @return [EmbPPatMatch] Results of the pattern matching.
376 **
377 **
378 ** @release 1.0.0
379 ******************************************************************************/
380 
embPatMatchFind(const AjPStr regexp,const AjPStr strng,AjBool left,AjBool right)381 EmbPPatMatch embPatMatchFind(const AjPStr regexp, const AjPStr strng,
382 			     AjBool left, AjBool right)
383 {
384     return embPatMatchFindC(regexp, ajStrGetPtr(strng), left, right);
385 }
386 
387 
388 
389 
390 /* @func embPatMatchFindC *****************************************************
391 **
392 ** Find all the regular expression matches of reg in the string string.
393 **
394 ** @param [r] regexp [const AjPStr] Regular expression string.
395 ** @param [r] sptr   [const char *] String to be searched.
396 ** @param [r] left [AjBool] has to match the start
397 ** @param [r] right [AjBool] has to match the end
398 **
399 ** @return [EmbPPatMatch] Results of the pattern matching.
400 **
401 **
402 ** @release 1.0.0
403 ******************************************************************************/
404 
embPatMatchFindC(const AjPStr regexp,const char * sptr,AjBool left,AjBool right)405 EmbPPatMatch embPatMatchFindC(const AjPStr regexp, const char *sptr,
406 			     AjBool left, AjBool right)
407 {
408     AjPRegexp compexp = NULL;
409     EmbPPatMatch results;
410     AjPList poslist = ajListNew();
411     AjPList lenlist = ajListNew();
412     AjIList iter;
413     ajuint *pos;
414     ajuint *len;
415     ajuint posi;
416     ajuint i;
417     const char *ptr;
418     AjBool nterm = ajFalse;
419 /*    AjPListNode node;*/
420     AjPStr regstr = NULL;
421 
422     if(*regexp->Ptr == '^')
423 	nterm  = ajTrue;
424 
425     regstr = ajStrNewS(regexp);
426 
427     if(left)
428     {
429 	if(!nterm)
430 	    ajStrInsertC(&regstr, 0, "^");
431 	nterm = ajTrue;
432     }
433 
434     if(right)
435 	ajStrAppendC(&regstr, "$");
436 
437     ajDebug("embPatMatchFindC regexp: '%S' regstr: '%S'\n",
438 	    regexp, regstr);
439 
440     ajDebug("embPatMatchFindC sptr '%s'\n",
441 	    sptr);
442 
443     compexp = ajRegComp(regstr);
444 
445     ptr = sptr;
446 
447     AJNEW(results);
448 
449     while(*sptr != '\0' && ajRegExecC(compexp,sptr))
450     {
451 	AJNEW(pos);
452 	*pos = posi = ajRegOffset(compexp);
453 	AJNEW(len);
454 	*len = ajRegLenI(compexp,0);
455 	*pos += (ajuint) (sptr-ptr);
456 	ajListPush(poslist, pos);
457 	ajListPush(lenlist, len);
458 	sptr += posi+1;
459 
460 	if(nterm)
461 	    break;
462     }
463 
464     ajRegFree(&compexp);
465     results->number  = (ajuint) ajListGetLength(poslist);
466 
467     ajDebug("embPatMatchFindC '%S' nterm:%B results: %d\n",
468 	    regstr, nterm, results->number);
469 
470     if(results->number)
471     {
472 	AJCNEW(results->start, results->number);
473 	AJCNEW(results->len, results->number);
474 
475 	i = 0;
476 	iter = ajListIterNewread(poslist);
477 
478 	while(!ajListIterDone(iter))
479 	{
480 	    results->start[i] = *(ajuint *) ajListIterGet(iter);
481 	    i++;
482 	}
483 
484 	ajListIterDel(&iter);
485 
486 	i = 0;
487 	iter = ajListIterNewread(lenlist);
488 
489 	while(!ajListIterDone(iter))
490 	{
491 	    results->len[i] = *(ajuint *) ajListIterGet(iter);
492 	    i++;
493 	}
494 
495 	ajListIterDel(&iter);
496 
497 	ajListMap(poslist, &patStringFree, NULL);
498 	ajListMap(lenlist, &patStringFree, NULL);
499 	ajListFree(&poslist);
500 	ajListFree(&lenlist);
501 
502     }
503     else
504     {
505 	ajListFree(&poslist);
506 	ajListFree(&lenlist);
507     }
508 
509     ajStrDel(&regstr);
510 
511     return results;
512 }
513 
514 
515 
516 
517 /* @func embPatMatchFindAll ***************************************************
518 **
519 ** Find all the regular expression matches of reg in the string string.
520 **
521 ** @param [r] regexp [const AjPStr] Regular expression string.
522 ** @param [r] strng [const AjPStr] String to be searched.
523 ** @param [r] left [AjBool] has to match the start
524 ** @param [r] right [AjBool] has to match the end
525 **
526 ** @return [EmbPPatMatch] Results of the pattern matching.
527 **
528 **
529 ** @release 1.0.0
530 ******************************************************************************/
531 
embPatMatchFindAll(const AjPStr regexp,const AjPStr strng,AjBool left,AjBool right)532 EmbPPatMatch embPatMatchFindAll(const AjPStr regexp, const AjPStr strng,
533                                 AjBool left, AjBool right)
534 {
535     return embPatMatchFindAllC(regexp, ajStrGetPtr(strng), left, right);
536 }
537 
538 
539 
540 
541 /* @func embPatMatchFindAllC **************************************************
542 **
543 ** Find all the regular expression matches of reg in the string string.
544 **
545 ** @param [r] regexp [const AjPStr] Regular expression string.
546 ** @param [r] sptr   [const char *] String to be searched.
547 ** @param [r] left [AjBool] has to match the start
548 ** @param [r] right [AjBool] has to match the end
549 **
550 ** @return [EmbPPatMatch] Results of the pattern matching.
551 **
552 **
553 ** @release 1.0.0
554 ******************************************************************************/
555 
embPatMatchFindAllC(const AjPStr regexp,const char * sptr,AjBool left,AjBool right)556 EmbPPatMatch embPatMatchFindAllC(const AjPStr regexp, const char *sptr,
557                                  AjBool left, AjBool right)
558 {
559     AjPRegexp compexp = NULL;
560     EmbPPatMatch results;
561     AjPList poslist = ajListNew();
562     AjPList lenlist = ajListNew();
563     AjIList iter;
564     ajuint *pos;
565     ajuint *len;
566     ajuint posi;
567     ajuint i;
568     ajuint j;
569     ajuint k;
570     const char *ptr;
571     AjBool nterm = ajFalse;
572     AjPStr regstr = NULL;
573     ajuint soff;
574 
575     if(*regexp->Ptr == '^')
576 	nterm  = ajTrue;
577 
578     regstr = ajStrNewS(regexp);
579 
580     if(left)
581     {
582 	if(!nterm)
583 	    ajStrInsertC(&regstr, 0, "^");
584 	nterm = ajTrue;
585     }
586 
587     if(right)
588 	ajStrAppendC(&regstr, "$");
589 
590     ajDebug("embPatMatchFindAllC regexp: '%S' regstr: '%S'\n",
591 	    regexp, regstr);
592 
593     ajDebug("embPatMatchFindAllC sptr '%s'\n",
594 	    sptr);
595 
596     compexp = ajRegComp(regstr);
597 
598     ptr = sptr;
599 
600     AJNEW(results);
601 
602     while(*sptr != '\0' && ajRegExecallC(compexp,sptr))
603     {
604         soff = sptr - ptr;
605         j = ajRegGetMatches(compexp);
606         for(k=0; k < j; k++)
607         {
608             AJNEW(pos);
609             *pos = posi = ajRegOffsetI(compexp,k);
610             AJNEW(len);
611             *len = ajRegLenI(compexp,k);
612             *pos += soff;
613             ajListPush(poslist, pos);
614             ajListPush(lenlist, len);
615 
616             if(!k)
617                 sptr += posi+1;
618 
619             if(nterm)
620                 break;
621         }
622     }
623 
624     ajRegFree(&compexp);
625     results->number  = (ajuint) ajListGetLength(poslist);
626 
627     ajDebug("embPatMatchFindAllC '%S' nterm:%B results: %d\n",
628 	    regstr, nterm, results->number);
629 
630     if(results->number)
631     {
632 	AJCNEW(results->start, results->number);
633 	AJCNEW(results->len, results->number);
634 
635 	i = 0;
636 	iter = ajListIterNewread(poslist);
637 
638 	while(!ajListIterDone(iter))
639 	{
640 	    results->start[i] = *(ajuint *) ajListIterGet(iter);
641 	    i++;
642 	}
643 
644 	ajListIterDel(&iter);
645 
646 	i = 0;
647 	iter = ajListIterNewread(lenlist);
648 
649 	while(!ajListIterDone(iter))
650 	{
651 	    results->len[i] = *(ajuint *) ajListIterGet(iter);
652 	    i++;
653 	}
654 
655 	ajListIterDel(&iter);
656 
657 	ajListMap(poslist, &patStringFree, NULL);
658 	ajListMap(lenlist, &patStringFree, NULL);
659 	ajListFree(&poslist);
660 	ajListFree(&lenlist);
661 
662     }
663     else
664     {
665 	ajListFree(&poslist);
666 	ajListFree(&lenlist);
667     }
668 
669     ajStrDel(&regstr);
670 
671     return results;
672 }
673 
674 
675 
676 
677 /* @func embPatMatchGetLen ****************************************************
678 **
679 ** Returns the length from the pattern match structure for index'th item.
680 **
681 ** @param [r] data [const EmbPPatMatch] results of match.
682 ** @param [r] indexnum   [ajuint] index to structure.
683 **
684 ** @return [ajuint] returns -1 if not available.
685 **
686 **
687 ** @release 1.0.0
688 ******************************************************************************/
689 
embPatMatchGetLen(const EmbPPatMatch data,ajuint indexnum)690 ajuint embPatMatchGetLen(const EmbPPatMatch data, ajuint indexnum)
691 {
692     if(data->number <= indexnum)
693 	return -1;
694 
695     return data->len[indexnum];
696 }
697 
698 
699 
700 
701 /* @func embPatMatchGetEnd ****************************************************
702 **
703 ** Returns the End point for the pattern match structure for index'th item.
704 **
705 ** @param [r] data [const EmbPPatMatch] results of match.
706 ** @param [r] indexnum   [ajuint] index to structure.
707 **
708 ** @return [ajuint] returns -1 if not available.
709 **
710 **
711 ** @release 1.0.0
712 ******************************************************************************/
713 
embPatMatchGetEnd(const EmbPPatMatch data,ajuint indexnum)714 ajuint embPatMatchGetEnd(const EmbPPatMatch data, ajuint indexnum)
715 {
716     if(data->number <= indexnum)
717 	return -1;
718 
719     return data->len[indexnum]+data->start[indexnum]-1;
720 }
721 
722 
723 
724 
725 /* @func embPatMatchGetNumber *************************************************
726 **
727 ** Returns the number of pattern matches in the structure.
728 **
729 ** @param [r] data [const EmbPPatMatch] results of match.
730 **
731 ** @return [ajuint] returns -1 if not available.
732 **
733 **
734 ** @release 1.0.0
735 ******************************************************************************/
736 
embPatMatchGetNumber(const EmbPPatMatch data)737 ajuint embPatMatchGetNumber(const EmbPPatMatch data)
738 {
739   return data->number;
740 }
741 
742 
743 
744 
745 /* @func embPatMatchGetStart **************************************************
746 **
747 ** Returns the start position from the pattern match structure for
748 ** index'th item.
749 **
750 ** @param [r] data [const EmbPPatMatch] results of match.
751 ** @param [r] indexnum  [ajuint] index to structure.
752 **
753 ** @return [ajuint] returns -1 if not available.
754 **
755 **
756 ** @release 1.0.0
757 ******************************************************************************/
758 
embPatMatchGetStart(const EmbPPatMatch data,ajuint indexnum)759 ajuint embPatMatchGetStart(const EmbPPatMatch data, ajuint indexnum)
760 {
761     if(data->number <= indexnum)
762 	return -1;
763 
764     return data->start[indexnum];
765 }
766 
767 
768 
769 
770 /* @func embPatMatchDel *******************************************************
771 **
772 ** Free all the memory from the pattern match search.
773 **
774 ** @param [d] pthis [EmbPPatMatch*] results to be freed.
775 ** @return [void]
776 ** @category delete [EmbPPatMatch] Standard destructor
777 **
778 ** @release 1.0.0
779 ******************************************************************************/
780 
embPatMatchDel(EmbPPatMatch * pthis)781 void embPatMatchDel(EmbPPatMatch* pthis)
782 {
783     EmbPPatMatch thys;
784 
785     thys = pthis ? *pthis : 0;
786 
787     if(!pthis)
788 	return;
789 
790     if(!*pthis)
791 	return;
792 
793     if(thys->number)
794     {
795 	AJFREE(thys->start);
796 	AJFREE(thys->len);
797     }
798 
799     AJFREE(*pthis);
800 
801     return;
802 }
803 
804 
805 
806 
807 /* @func embPatPrositeToRegExp ************************************************
808 **
809 ** Convert a prosite pattern to a regular expression
810 **
811 ** Start and end are indicated by boolean options, as the pattern may have
812 ** been processed for the other pattern matching methods
813 **
814 ** @param [r] s [const AjPStr] prosite pattern
815 ** @return [AjPStr] regular expression
816 **
817 ** @release 1.0.0
818 ******************************************************************************/
819 
embPatPrositeToRegExp(const AjPStr s)820 AjPStr embPatPrositeToRegExp(const AjPStr s)
821 {
822     return embPatPrositeToRegExpEnds(s, AJFALSE, AJFALSE);
823 }
824 
825 
826 
827 
828 /* @func embPatPrositeToRegExpEnds ********************************************
829 **
830 ** Convert a prosite pattern to a regular expression string.
831 **
832 ** start and end say whether the ends should match in case this is a processed
833 ** pattern, but there is still a check for angle brackets.
834 **
835 ** @param [r] s [const AjPStr] prosite pattern
836 ** @param [r] start [AjBool] must match start
837 ** @param [r] end [AjBool] must match end
838 ** @return [AjPStr] regular expression
839 **
840 ** @release 2.9.0
841 ******************************************************************************/
842 
embPatPrositeToRegExpEnds(const AjPStr s,AjBool start,AjBool end)843 AjPStr embPatPrositeToRegExpEnds (const AjPStr s, AjBool start, AjBool end)
844 {
845     AjPStr t;
846     AjPStr c;
847     AjBool isnt = start;
848     AjBool isct = end;
849 
850     const char   *p;
851     static const char *aa = "ACDEFGHIKLMNPQRSTVWY";
852     static char ch[2];
853     ajuint len;
854     ajuint i;
855 
856     t   = ajStrNewC("");
857     len = ajStrGetLen(s);
858 
859     if(!len)
860 	return t;
861 
862     c = ajStrNew();
863     ajStrAssignS(&c, s);
864     ajStrFmtUpper(&c);
865     ajStrRemoveWhiteExcess(&c);
866     ch[1]='\0';
867 
868     p = ajStrGetPtr(c);
869 
870     for(i=0;i<len;++i)
871     {
872 	if(p[i]=='>')
873 	    isct = 1;
874 
875 	if(p[i]=='<')
876 	    isnt = 1;
877     }
878 
879     if(isnt)
880 	ajStrAppendC(&t,"^");
881 
882     while(*p)
883     {
884 	if(*p=='?')
885 	{
886 	    ajStrAppendC(&t,"[^#]");
887 	    ++p;
888 	    continue;
889 	}
890 
891 	if(*p=='X')
892 	{
893 	    ajStrAppendC(&t,"[^BJOUXZ]");
894 	    ++p;
895 	    continue;
896 	}
897 
898 	if(*p=='(')
899 	{
900 	    ++p;
901 	    ajStrAppendC(&t,"{");
902 
903 	    while(*p != ')')
904 	    {
905 		if(!*p)
906 		    ajFatal("Unmatched '(' in %S\n",s);
907 
908 		if(*p=='<' || *p=='>')
909 		{
910 		    ++p;
911 		    continue;
912 		}
913 
914 		*ch= *p;
915 		ajStrAppendC(&t,ch);
916 		++p;
917 	    }
918 
919 	    ajStrAppendC(&t,"}");
920 	    ++p;
921 	    continue;
922 	}
923 
924 	if(*p=='[')
925 	{
926 	    while(*p != ']')
927 	    {
928 		if(!*p)
929 		    ajFatal("Unmatched '[' in %S\n",s);
930 
931 		if(*p=='<' || *p=='>')
932 		{
933 		    ++p;
934 		    continue;
935 		}
936 
937 		*ch = *p;
938 		ajStrAppendC(&t,ch);
939 		++p;
940 	    }
941 
942 	    ajStrAppendC(&t,"]");
943 	    ++p;
944 	    continue;
945 	}
946 
947 	if(*p=='{')
948 	{
949 	    ++p;
950 	    ajStrAppendC(&t,"[^");
951 
952 	    while(*p != '}')
953 	    {
954 		if(!*p)
955 		    ajFatal("Unmatched '{' in %S\n",s);
956 
957 		if(*p=='<' || *p=='>')
958 		{
959 		    ++p;
960 		    continue;
961 		}
962 
963 		*ch = *p;
964 		ajStrAppendC(&t,ch);
965 		++p;
966 	    }
967 
968 	    ajStrAppendC(&t,"]");
969 	    ++p;
970 	    continue;
971 	}
972 
973 	if(strchr(aa,*p))
974 	{
975 	    *ch = *p;
976 	    ajStrAppendC(&t,ch);
977 	    ++p;
978 	    continue;
979 	}
980 
981 	if(!(*p==' ' || *p=='-' || *p=='>' || *p=='<'))
982 	    ajFatal("Unrecognised character in %S\n",s);
983 	++p;
984     }
985 
986     if(isct)
987 	ajStrAppendC(&t,"$");
988 
989     ajStrAssignS(&c,t);
990     ajStrDel(&t);
991 
992     return c;
993 }
994 
995 
996 
997 
998 /* @func embPatRestrictNew ****************************************************
999 **
1000 ** Create a new restriction object
1001 **
1002 ** @return [EmbPPatRestrict] the allocated object
1003 **
1004 ** @release 1.0.0
1005 ******************************************************************************/
1006 
embPatRestrictNew(void)1007 EmbPPatRestrict embPatRestrictNew(void)
1008 {
1009     EmbPPatRestrict thys;
1010 
1011     AJNEW0(thys);
1012 
1013     thys->cod  = ajStrNew();
1014     thys->pat  = ajStrNew();
1015     thys->bin  = ajStrNew();
1016     thys->org  = ajStrNew();
1017     thys->iso  = ajStrNew();
1018     thys->meth = ajStrNew();
1019     thys->sou  = ajStrNew();
1020     thys->sup  = ajStrNew();
1021 
1022     return thys;
1023 }
1024 
1025 
1026 
1027 
1028 /* @func embPatRestrictDel ****************************************************
1029 **
1030 ** Delete a restriction object
1031 **
1032 ** @param [d] thys [EmbPPatRestrict *] restriction object
1033 ** @return [void]
1034 ** @category delete [EmbPPatRestrict] Standard destructor
1035 **
1036 ** @release 1.0.0
1037 ******************************************************************************/
1038 
embPatRestrictDel(EmbPPatRestrict * thys)1039 void embPatRestrictDel(EmbPPatRestrict *thys)
1040 {
1041     ajStrDel(&(*thys)->cod);
1042     ajStrDel(&(*thys)->pat);
1043     ajStrDel(&(*thys)->bin);
1044     ajStrDel(&(*thys)->org);
1045     ajStrDel(&(*thys)->iso);
1046     ajStrDel(&(*thys)->meth);
1047     ajStrDel(&(*thys)->sou);
1048     ajStrDel(&(*thys)->sup);
1049     AJFREE(*thys);
1050 
1051     return;
1052 }
1053 
1054 
1055 
1056 
1057 /* @func embPatRestrictReadEntry **********************************************
1058 **
1059 ** Read next restriction enzyme from re file
1060 **
1061 ** @param [w] re [EmbPPatRestrict] restriction object to fill
1062 ** @param [u] inf [AjPFile] input file pointer
1063 ** @return [AjBool] True if read successful
1064 ** @category input [EmbPPatRestrict] Read next restriction enzyme from file
1065 **
1066 ** @release 1.0.0
1067 ******************************************************************************/
1068 
embPatRestrictReadEntry(EmbPPatRestrict re,AjPFile inf)1069 AjBool embPatRestrictReadEntry(EmbPPatRestrict re, AjPFile inf)
1070 {
1071     AjPStr line = NULL;
1072     AjBool ret = AJFALSE;
1073     const char *p = NULL;
1074     char *q = NULL;
1075     ajuint i;
1076 
1077     line = ajStrNew();
1078 
1079     while((ret=ajReadlineTrim(inf,&line)))
1080     {
1081 	p = ajStrGetPtr(line);
1082 
1083 	if(!(!*p || *p=='#' || *p=='!'))
1084 	    break;
1085     }
1086 
1087     if(!ret)
1088     {
1089 	ajStrDel(&line);
1090 
1091 	return ajFalse;
1092     }
1093 
1094 
1095     p = ajSysFuncStrtok(p,"\t \n");
1096     if(!p)
1097       return ajFalse;
1098     ajStrAssignC(&re->cod,p);
1099 
1100     p = ajSysFuncStrtok(NULL,"\t \n");
1101     if(!p)
1102       return ajFalse;
1103     ajStrAssignC(&re->pat,p);
1104     ajStrAssignC(&re->bin,p);
1105 
1106     p = ajSysFuncStrtok(NULL,"\t \n");
1107     if(!p)
1108       return ajFalse;
1109     if(!sscanf(p,"%u",&re->len))
1110       return ajFalse;
1111 
1112     p = ajSysFuncStrtok(NULL,"\t \n");
1113     if(!p)
1114       return ajFalse;
1115     if(!sscanf(p,"%u",&re->ncuts))
1116       return ajFalse;
1117 
1118     p = ajSysFuncStrtok(NULL,"\t \n");
1119     if(!p)
1120       return ajFalse;
1121     if(!sscanf(p,"%d",&re->blunt))
1122       return ajFalse;
1123 
1124     p = ajSysFuncStrtok(NULL,"\t \n");
1125     if(!p)
1126       return ajFalse;
1127     if(!sscanf(p,"%d",&re->cut1))
1128       return ajFalse;
1129 
1130     p = ajSysFuncStrtok(NULL,"\t \n");
1131     if(!p)
1132       return ajFalse;
1133     if(!sscanf(p,"%d",&re->cut2))
1134       return ajFalse;
1135 
1136     p = ajSysFuncStrtok(NULL,"\t \n");
1137     if(!p)
1138       return ajFalse;
1139     if(!sscanf(p,"%d",&re->cut3))
1140       return ajFalse;
1141 
1142     p = ajSysFuncStrtok(NULL,"\t \n");
1143     if(!p)
1144       return ajFalse;
1145     if(!sscanf(p,"%d",&re->cut4))
1146       return ajFalse;
1147 
1148 
1149     for(i=0,q=ajStrGetuniquePtr(&re->bin);i<re->len;++i)
1150 	*(q+i)=(char)ajBaseAlphaToBin((int)*(q+i));
1151 
1152     ajStrDel(&line);
1153 
1154     return ajTrue;
1155 }
1156 
1157 
1158 
1159 
1160 /* @funcstatic patRestrictPushHit *********************************************
1161 **
1162 ** Put a matching restriction enzyme on the heap
1163 ** as an EmbPMatMatch structure
1164 **
1165 ** @param [r] enz [const EmbPPatRestrict] Enyme information
1166 ** @param [u] l [AjPList] List to add to
1167 ** @param [r] pos [ajuint] Sequence match position
1168 ** @param [r] begin [ajuint] Sequence offset
1169 ** @param [r] len [ajuint] Sequence length
1170 ** @param [r] forward [AjBool] True if forward strand
1171 ** @param [r] plasmid [AjBool] Allow circular DNA (currently ignored)
1172 **
1173 ** @return [void]
1174 **
1175 ** @release 1.0.0
1176 ******************************************************************************/
1177 
patRestrictPushHit(const EmbPPatRestrict enz,AjPList l,ajuint pos,ajuint begin,ajuint len,AjBool forward,AjBool plasmid)1178 static void patRestrictPushHit(const EmbPPatRestrict enz,
1179 			       AjPList l, ajuint pos,
1180 			       ajuint begin, ajuint len,
1181 			       AjBool forward, AjBool plasmid)
1182 {
1183 
1184     EmbPMatMatch hit;
1185     ajuint v;
1186 
1187     (void) plasmid;			/* make it used */
1188 
1189     AJNEW0(hit);
1190 
1191     hit->seqname = ajStrNew();
1192     hit->cod = ajStrNewS(enz->cod);
1193     hit->pat = ajStrNewS(enz->pat);
1194     hit->acc = ajStrNew();
1195     hit->tit = ajStrNew();
1196     hit->iso = ajStrNew();
1197     hit->len = enz->len;
1198 
1199     if(forward)
1200     {
1201 	hit->forward = 1;
1202 	hit->start = pos+begin;
1203 	hit->cut1 = pos+begin+enz->cut1-1;
1204 	hit->cut2 = pos+begin+enz->cut2-1;
1205 
1206 	if(hit->cut1 > (ajint)(len+begin-1))
1207 	    hit->cut1-=len;
1208 
1209 	if(hit->cut2 > (ajint)(len+begin-1))
1210 	    hit->cut2-=len;
1211 
1212  	if(enz->cut1<1)
1213 	    ++hit->cut1;
1214 
1215 	if(enz->cut2<1)
1216 	    ++hit->cut2;
1217 
1218 	if(hit->cut1<1)
1219 	{
1220 	    hit->cut1+=len;
1221 	    hit->circ12 = ajTrue;
1222 	}
1223 
1224 	if(hit->cut2<1)
1225 	{
1226 	    hit->cut2+=len;
1227 	    hit->circ12 = ajTrue;
1228 	}
1229 
1230 	if(enz->ncuts == 4)
1231 	{
1232 	    hit->cut3 = pos+begin+enz->cut3-1;
1233 	    hit->cut4 = pos+begin+enz->cut4-1;
1234 
1235 	    if(hit->cut3>(ajint)(len+begin-1))
1236 	    {
1237 		hit->cut3-=len;
1238 		hit->circ34 = ajTrue;
1239 	    }
1240 
1241 	    if(hit->cut4>(ajint)(len+begin-1))
1242 	    {
1243 		hit->cut4-=len;
1244 		hit->circ34 = ajTrue;
1245 	    }
1246 	}
1247 	else hit->cut3 = hit->cut4 = 0;
1248     }
1249     else
1250     {
1251 	hit->forward = 0;
1252 	hit->start = len+begin-pos-1;
1253 	hit->cut1  = len+begin-pos-enz->cut1-1;
1254 	hit->cut2  = len+begin-pos-enz->cut2-1;
1255 
1256 	if(enz->cut1<1)
1257 	    --hit->cut1;
1258 
1259 	if(enz->cut2<1)
1260 	    --hit->cut2;
1261 
1262 	/* cuts beyond start of sequence */
1263 
1264 	if(hit->cut1<1)
1265 	{
1266 	    hit->cut1+=len;
1267 	    hit->circ12 = ajTrue;
1268 	}
1269 
1270 	if(hit->cut2<1)
1271 	{
1272 	    hit->cut2+=len;
1273 	    hit->circ12 = ajTrue;
1274 	}
1275 
1276 	/* cuts beyond end of sequence */
1277 
1278 	if(hit->cut1>(ajint)(len+begin-1))
1279 	{
1280 	    hit->cut1-=len;
1281 	    hit->circ12 = ajTrue;
1282 	}
1283 
1284 	if(hit->cut2>(ajint)(len+begin-1))
1285 	{
1286 	    hit->cut2-=len;
1287 	    hit->circ12 = ajTrue;
1288 	}
1289 
1290 	if(enz->ncuts == 4)
1291 	{
1292 	    /* ajDebug("so far, len:%d pos:%d begin:%d\n",
1293 		    len, pos, begin); */
1294 	    /* ajDebug("before, cut3:%d 4:%d circ34:%b\n",
1295 		    hit->cut3, hit->cut4, hit->circ34); */
1296 
1297 	    hit->cut3 = len+begin-pos-enz->cut3-1;
1298 	    hit->cut4 = len+begin-pos-enz->cut4-1;
1299 
1300 	    /* ajDebug("middle, cut3:%d 4:%d\n",
1301 		    hit->cut3, hit->cut4); */
1302 
1303 	    if(hit->cut3<0)
1304 	    {
1305 		hit->cut3+=len;
1306 		hit->circ34 = ajTrue;
1307 	    }
1308 
1309 	    if(hit->cut4<0)
1310 	    {
1311 		hit->cut4+=len;
1312 		hit->circ34 = ajTrue;
1313 	    }
1314 
1315 	    /* ajDebug("after, cut3:%d 4:%d circ34: %b\n",
1316 		    hit->cut3, hit->cut4, hit->circ34); */
1317 	}
1318 	else
1319 	    hit->cut3 = hit->cut4 = 0;
1320 
1321 	/* Reverse them to refer to forward strand */
1322 	v = hit->cut1;
1323 	hit->cut1 = hit->cut2;
1324 	hit->cut2 = v;
1325 	v = hit->cut3;
1326 	hit->cut3 = hit->cut4;
1327 	hit->cut4 = v;
1328     }
1329 
1330     /* ajDebug("embPatRestrictPushHit forward:%b\n", forward); */
1331     /* ajDebug("cut1:%d 2:%d 3:%d 4:%d\n",
1332 	    hit->cut1, hit->cut2, hit->cut3, hit->cut4); */
1333     ajListPush(l,(void *) hit);
1334 
1335     return;
1336 }
1337 
1338 
1339 
1340 
1341 /* @funcstatic patRestrictReadMethyl ******************************************
1342 **
1343 ** Read methylation data as a list of MethPdata structures
1344 **
1345 ** @param [u] methfile [AjPFile] Methylation data file
1346 **
1347 ** @return [AjPList] List of MethPData objects
1348 **
1349 ** @release 6.1.0
1350 ******************************************************************************/
1351 
patRestrictReadMethyl(AjPFile methfile)1352 static AjPList patRestrictReadMethyl(AjPFile methfile)
1353 {
1354     AjPStr line    = NULL;
1355     AjPStr name    = NULL;
1356     AjPStr site    = NULL;
1357     AjPStr replace = NULL;
1358     AjPStr pattern = NULL;
1359 
1360     AjPList ret = NULL;
1361 
1362     MethPData m = NULL;
1363 
1364 
1365     const char *p = NULL;
1366     char  c;
1367 
1368     replace = ajStrNew();
1369     name    = ajStrNew();
1370     site    = ajStrNew();
1371     line    = ajStrNew();
1372     pattern = ajStrNew();
1373 
1374     ret = ajListNew();
1375 
1376     while(ajReadlineTrim(methfile,&line))
1377     {
1378         p = ajStrGetPtr(line);
1379 
1380         if(!*p || *p=='#' || *p=='!')
1381             continue;
1382 
1383         if(ajFmtScanS(line,"%S%S%S",&name,&pattern) != 2)
1384         {
1385             ajWarn("Invalid methylation data line: %S",line);
1386             continue;
1387         }
1388 
1389         ajStrFmtUpper(&pattern);
1390         p = ajStrGetPtr(pattern);
1391 
1392         AJNEW(m);
1393         m->Replace = ajStrNew();
1394         m->Name    = ajStrNewS(name);
1395         m->Site    = ajStrNew();
1396 
1397 
1398         while((c=*p))
1399         {
1400             if(c == 'M')
1401             {
1402                 ajStrAppendK(&m->Replace,'N');
1403                 ++p;
1404                 ajStrAppendK(&m->Site,*p);
1405             }
1406             else
1407             {
1408                 ajStrAppendK(&m->Site,c);
1409                 ajStrAppendK(&m->Replace,c);
1410             }
1411 
1412             ++p;
1413         }
1414 
1415         ajStrFmtUpper(&m->Name);
1416         ajListPush(ret, (void *)m);
1417     }
1418 
1419 
1420     ajStrDel(&line);
1421     ajStrDel(&name);
1422     ajStrDel(&site);
1423     ajStrDel(&replace);
1424     ajStrDel(&pattern);
1425 
1426     return ret;
1427 }
1428 
1429 
1430 
1431 
1432 /* @funcstatic patRestrictMethylMod *******************************************
1433 **
1434 ** Search using methylation data as a list of MethPdata structures
1435 **
1436 ** Note that this routine does a simple substitution and
1437 ** relies on substitution strings having the same length
1438 ** as the methylase recognition sites. It also assumes that
1439 ** there are no overlap conflicts. If a methylase is found that
1440 ** violates the above then this routine should be revisited.
1441 **
1442 ** @param [w] Pstr [AjPStr*] sequence string
1443 ** @param [w] Prstr [AjPStr*] sequence string
1444 ** @param [u] methlist [AjPList] methlist
1445 **
1446 ** @return [void]
1447 **
1448 ** @release 6.1.0
1449 ******************************************************************************/
1450 
patRestrictMethylMod(AjPStr * Pstr,AjPStr * Prstr,AjPList methlist)1451 static void patRestrictMethylMod(AjPStr *Pstr, AjPStr *Prstr, AjPList methlist)
1452 {
1453     ajuint listlen;
1454     MethPData md = NULL;
1455     ajuint i;
1456 
1457     EmbOPatBYPNode off[AJALPHA];
1458     ajuint *sotable = NULL;
1459     ajuint solimit;
1460     AjPStr regexp = NULL;
1461     ajuint **skipm = NULL;
1462     ajint *buf    = NULL;
1463     const void *tidy    = NULL;
1464 
1465     ajuint plen = 0;
1466     ajint type = 0;
1467     AjBool amino    = ajFalse;
1468     AjBool carboxyl = ajFalse;
1469     ajint mismatch = 0;
1470     ajuint hits    = 0;
1471 
1472     ajuint m       = 0;
1473     ajuint j;
1474 
1475     ajint fpos = 0;
1476     ajint slen = 0;
1477 
1478     AjPStr origpat = NULL;
1479     AjPStr pattern = NULL;
1480     AjPStr seqname = NULL;
1481     AjPList l = NULL;
1482 
1483 
1484     EmbPMatMatch match = NULL;
1485 
1486     char *p = NULL;
1487     const char *q = NULL;
1488 
1489     char *rp = NULL;
1490 
1491 
1492     origpat = ajStrNew();
1493     pattern = ajStrNew();
1494     seqname = ajStrNewC("");
1495 
1496 
1497     listlen = (ajuint) ajListGetLength(methlist);
1498 
1499 
1500     l = ajListNew();
1501 
1502     p = ajStrGetuniquePtr(Pstr);
1503 
1504     slen = ajStrGetLen(*Pstr);
1505 
1506     for(i=0; i < listlen; ++i)
1507     {
1508         ajListPop(methlist, (void **)&md);
1509         ajStrAssignS(&origpat, md->Site);
1510         ajStrAssignS(&pattern, md->Site);
1511 
1512         if(!(type=embPatGetType(origpat,&pattern,mismatch,0,&m,&amino,
1513                                 &carboxyl)))
1514             ajFatal("patRestrictMethylMod: Illegal pattern");
1515 
1516         embPatCompile(type,pattern,&plen,&buf,off,&sotable,&solimit,&m,
1517                       &regexp,&skipm,mismatch);
1518 
1519 	embPatFuzzSearch(type,0,pattern,seqname,*Pstr,l,
1520 			 plen,mismatch,amino,carboxyl,buf,off,sotable,
1521 			 solimit,regexp,skipm,&hits,m,(const void **)&tidy);
1522 
1523 
1524         rp = ajStrGetuniquePtr(Prstr);
1525 
1526         while(ajListPop(l,(void **)&match))
1527         {
1528 
1529             q = ajStrGetPtr(md->Replace);
1530 
1531             for(j=0; j < match->len; ++j)
1532             {
1533                 fpos = match->start + j;
1534                 p[fpos] = q[j];
1535 
1536                 if(q[j] == 'N') /* Know out base on opposite strand too */
1537                     rp[slen - fpos -1] = 'N';
1538             }
1539 
1540             embMatMatchDel(&match);
1541         }
1542 
1543 
1544         if(type==6)
1545             for(j=0;j<m;++j)
1546                 AJFREE(skipm[j]);
1547 
1548 /*
1549         if(tidy)
1550         {
1551             tydy = (void *)tidy;
1552             AJFREE(tydy);
1553         }
1554 */
1555 
1556         ajListPushAppend(methlist, (void *)md);
1557     }
1558 
1559 
1560     ajStrDel(&origpat);
1561     ajStrDel(&pattern);
1562     ajStrDel(&seqname);
1563 
1564     ajListFree(&l);
1565 
1566 
1567     return;
1568 }
1569 
1570 
1571 
1572 
1573 /* @func embPatRestrictScan ***************************************************
1574 **
1575 ** Scan a sequence with a restriction object
1576 **
1577 ** @param [r] enz [const EmbPPatRestrict] Enyme information
1578 ** @param [r] substr [const AjPStr] Sequence as ASCII
1579 ** @param [r] binstr [const AjPStr] Sequence as binary IUB
1580 ** @param [r] revstr [const AjPStr] Sequence as ASCII reversed
1581 ** @param [r] binrev [const AjPStr] Sequence as binary IUB reversed
1582 ** @param [r] len [ajuint] Length of sequence
1583 ** @param [r] ambiguity [AjBool] Allow ambiguity (binary search)
1584 ** @param [r] plasmid [AjBool] Allow circular DNA
1585 ** @param [r] min [ajuint] Minimum # of matches allowed
1586 ** @param [r] max [ajuint] Maximum # of matches
1587 ** @param [r] begin [ajuint] Sequence offset
1588 ** @param [u] l [AjPList] List to push hits to
1589 **
1590 ** @return [ajuint] Number of matches
1591 **
1592 ** @release 1.0.0
1593 ******************************************************************************/
1594 
embPatRestrictScan(const EmbPPatRestrict enz,const AjPStr substr,const AjPStr binstr,const AjPStr revstr,const AjPStr binrev,ajuint len,AjBool ambiguity,AjBool plasmid,ajuint min,ajuint max,ajuint begin,AjPList l)1595 ajuint embPatRestrictScan(const EmbPPatRestrict enz,
1596 			 const AjPStr substr, const AjPStr binstr,
1597 			 const AjPStr revstr, const AjPStr binrev, ajuint len,
1598 			 AjBool ambiguity, AjBool plasmid, ajuint min,
1599 			 ajuint max, ajuint begin, AjPList l)
1600 {
1601     ajuint limit;
1602     ajuint i;
1603     ajuint j;
1604     ajuint hits;
1605     ajuint rhits = 0;
1606     const char *p;
1607     const char *q;
1608     const char *t;
1609     ajint  mincut;
1610     ajint  maxcut;
1611     AjBool forward;
1612     ajint  v;
1613     AjPList tx     = NULL;
1614     AjPList ty     = NULL;
1615     EmbPMatMatch m = NULL;
1616     EmbPMatMatch z = NULL;
1617 
1618     if(len < enz->len)
1619         return 0;
1620 
1621     if(plasmid)
1622 	limit=len;
1623     else
1624 	limit=len-enz->len+1;
1625 
1626     mincut=AJMIN(enz->cut1,enz->cut2);
1627 
1628     if(enz->ncuts==4)
1629     {
1630 	mincut=AJMIN(mincut,enz->cut3);
1631 	mincut=AJMIN(mincut,enz->cut4);
1632     }
1633 
1634     maxcut=AJMAX(enz->cut1,enz->cut2);
1635 
1636     if(enz->ncuts==4)
1637     {
1638 	maxcut=AJMAX(maxcut,enz->cut3);
1639 	maxcut=AJMAX(maxcut,enz->cut4);
1640     }
1641 
1642     ajDebug("embPatRestrictScan '%S' '%S' ncuts:%d blunt:%b\n",
1643 	    enz->cod, enz->pat, enz->ncuts, enz->blunt);
1644     /* ajDebug("cut1:%d 2:%d 3:%d 4:%d\n",
1645 	    enz->cut1, enz->cut2, enz->cut3, enz->cut4 ); */
1646 
1647     tx = ajListNew();
1648     ty = ajListNew();
1649 
1650     if(ambiguity)
1651     {
1652 	p = ajStrGetPtr(binstr);
1653 	t = ajStrGetPtr(enz->bin);
1654 
1655 	forward = ajTrue;
1656 
1657 	for(i=0,hits=0;i<limit;++i)
1658 	{
1659 	    for(j=0,q=t;j<enz->len;++j,++q)
1660 	    {
1661 		v=*(p+i+j);
1662 
1663 		if(!(*q & v) || v==15)
1664 		    break;
1665 	    }
1666 
1667 	    if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1668 					      i+enz->cut2>=len))
1669 		continue;
1670 
1671 	    if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1672 	    {
1673 		++hits;
1674 		patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1675 	    }
1676 
1677 	}
1678 
1679 	forward = ajFalse;
1680 	p = ajStrGetPtr(binrev);
1681 
1682 	for(i=0;i<limit;++i)
1683 	{
1684 	    for(j=0,q=t;j<enz->len;++j,++q)
1685 	    {
1686 		v = *(p+i+j);
1687 
1688 		if(!(*q & v) || v==15)
1689 		    break;
1690 	    }
1691 
1692 	    if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1693 					      i+enz->cut2>=len))
1694 		continue;
1695 
1696 	    if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1697 	    {
1698 		++hits;
1699 		patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1700 	    }
1701 
1702 	}
1703 
1704     }
1705     else
1706     {
1707 	p = ajStrGetPtr(substr);
1708 	t = ajStrGetPtr(enz->pat);
1709 	forward = ajTrue;
1710 
1711 	for(i=0,hits=0;i<limit;++i)
1712 	{
1713 	    for(j=0,q=t;j<enz->len;++j,++q)
1714 	    {
1715 		v=*(p+i+j);
1716 
1717 		if(*q != v || v=='N')
1718 		    break;
1719 	    }
1720 
1721 	    if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1722 					      i+enz->cut2>=len))
1723 		continue;
1724 
1725 	    if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1726 	    {
1727 		++hits;
1728 		patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1729 	    }
1730 
1731 	}
1732 
1733 	forward = ajFalse;
1734 	p = ajStrGetPtr(revstr);
1735 
1736 	for(i=0;i<limit;++i)
1737 	{
1738 	    for(j=0,q=t;j<enz->len;++j,++q)
1739 	    {
1740 		v = *(p+i+j);
1741 
1742 		if(*q != v || v=='N')
1743 		    break;
1744 	    }
1745 
1746 	    if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1747 					      i+enz->cut2>=len))
1748 		continue;
1749 
1750 	    if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1751 	    {
1752 		++hits;
1753 		patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1754 	    }
1755 	}
1756     }
1757 
1758 
1759     if(hits)
1760     {
1761 	ajListSort(tx, &embPatRestrictCutCompare);
1762 
1763 	for(i=0,rhits=0,v=0;i<hits;++i)
1764 	{
1765 	    ajListPop(tx,(void **)&m);
1766 
1767 	    if(m->cut1 != v)
1768 	    {
1769 		ajListPush(ty,(void *)m);
1770 		++rhits;
1771 		v = m->cut1;
1772 	    }
1773 	    else
1774 	    {
1775 		if(i)
1776 		    if(m->forward)
1777 		    {
1778 			ajListPop(ty,(void **)&z);
1779 			ajListPush(ty,(void *)m);
1780 			m=z;
1781 		    }
1782 
1783 		embMatMatchDel(&m);
1784 	    }
1785 	}
1786 
1787 	if(rhits<min || rhits>max)
1788 	{
1789 	    while(ajListPop(ty,(void **)&m));
1790 
1791 	    ajListFree(&tx);
1792 	    ajListFree(&ty);
1793 
1794 	    return 0;
1795 	}
1796 	else
1797 	{
1798 	    while(ajListPop(ty,(void **)&m))
1799 		ajListPush(l,(void *)m);
1800 	    hits = rhits;
1801 	}
1802     }
1803 
1804     ajListFree(&tx);
1805     ajListFree(&ty);
1806 
1807     return hits;
1808 }
1809 
1810 
1811 
1812 
1813 /* @func embPatKMPInit ********************************************************
1814 **
1815 ** Initialise a Knuth-Morris-Pratt pattern.
1816 **
1817 ** @param [r] pat [const AjPStr] pattern
1818 ** @param [r] len [ajuint] length of pattern
1819 ** @param [w] next [ajint *] offset table
1820 **
1821 ** @return [void]
1822 **
1823 ** @release 1.0.0
1824 ******************************************************************************/
1825 
embPatKMPInit(const AjPStr pat,ajuint len,ajint * next)1826 void embPatKMPInit(const AjPStr pat, ajuint len, ajint *next)
1827 {
1828     ajuint i;
1829     ajint k;
1830     ajuint t;
1831     const char *p;
1832 
1833     p = ajStrGetPtr(pat);
1834     t = len-1;
1835 
1836     i = 0;
1837     k = -1;
1838     next[0] = -1;
1839 
1840     while(i<t)
1841     {
1842 	while(k>=0 && p[i]!=p[k])
1843 	    k = next[k];
1844 
1845 	++i;
1846 	++k;
1847 
1848 	if(p[i]==p[k])
1849 	    next[i] = next[k];
1850 	else
1851 	    next[i] = k;
1852     }
1853 
1854     return;
1855 }
1856 
1857 
1858 
1859 
1860 /* @func embPatKMPSearch ******************************************************
1861 **
1862 ** Perform a Knuth-Morris-Pratt search
1863 **
1864 ** @param [r] str [const AjPStr] string to search
1865 ** @param [r] pat [const AjPStr] pattern to use
1866 ** @param [r] slen [ajuint] length of string
1867 ** @param [r] plen [ajuint] length of pattern
1868 ** @param [r] next [const ajint *] array from embPatKMPInit (can be -1)
1869 ** @param [r] start [ajuint] position within str to start search
1870 **
1871 ** @return [ajuint] Index of match in str or -1 if not found
1872 **
1873 ** @release 1.0.0
1874 ******************************************************************************/
1875 
embPatKMPSearch(const AjPStr str,const AjPStr pat,ajuint slen,ajuint plen,const ajint * next,ajuint start)1876 ajuint embPatKMPSearch(const AjPStr str, const AjPStr pat,
1877 		      ajuint slen, ajuint plen, const ajint *next,
1878 		      ajuint start)
1879 {
1880     ajuint i;
1881     ajint j;
1882     const char *p;
1883     const char *q;
1884 
1885     p = ajStrGetPtr(str);
1886     q = ajStrGetPtr(pat);
1887 
1888     i = start;
1889     j = 0;
1890 
1891     while(i<slen && j<(ajint)plen)
1892     {
1893 	while(j>=0 && p[i]!=q[j])
1894 	    j = next[j];
1895 
1896 	++i;
1897 	++j;
1898     }
1899 
1900     if(j==(ajint)plen)
1901 	return i-plen;
1902 
1903     return -1;
1904 }
1905 
1906 
1907 
1908 
1909 /* @func embPatBMHInit ********************************************************
1910 **
1911 ** Initialise a Boyer-Moore-Horspool pattern.
1912 **
1913 ** @param [r] pat [const AjPStr] pattern
1914 ** @param [r] len [ajuint] pattern length
1915 ** @param [w] skip [ajint *] offset table (can be -1)
1916 **
1917 ** @return [void]
1918 **
1919 ** @release 1.0.0
1920 ******************************************************************************/
1921 
embPatBMHInit(const AjPStr pat,ajuint len,ajint * skip)1922 void embPatBMHInit(const AjPStr pat, ajuint len, ajint *skip)
1923 {
1924     ajuint i;
1925     ajuint t;
1926     const char *p;
1927 
1928     p = ajStrGetPtr(pat);
1929 
1930     t = len-1;
1931 
1932     for(i=0;i<AJALPHA;++i)
1933 	skip[i] = t;
1934 
1935     for(i=0;i<t;++i)
1936 	skip[(ajuint)p[i]] = t-i;
1937 
1938     return;
1939 }
1940 
1941 
1942 
1943 
1944 /* @func embPatBMHSearch ******************************************************
1945 **
1946 ** Perform a Boyer-Moore-Horspool search
1947 **
1948 ** @param [r] str [const AjPStr] string to search
1949 ** @param [r] pat [const AjPStr] pattern to use
1950 ** @param [r] slen [ajuint] length of string
1951 ** @param [r] plen [ajuint] length of pattern
1952 ** @param [r] skip [const ajint *] array from embPatBMHInit (can be -1)
1953 ** @param [r] start [ajuint] position within str to start search
1954 ** @param [r] left [AjBool] has to match the start
1955 ** @param [r] right [AjBool] has to match the end
1956 ** @param [u] l [AjPList] list to push to
1957 ** @param [r] name [const AjPStr] name of entry
1958 ** @param [r] begin [ajuint] offset in orig sequence
1959 **
1960 ** @return [ajuint] number of hits
1961 **
1962 ** @release 1.0.0
1963 ******************************************************************************/
1964 
embPatBMHSearch(const AjPStr str,const AjPStr pat,ajuint slen,ajuint plen,const ajint * skip,ajuint start,AjBool left,AjBool right,AjPList l,const AjPStr name,ajuint begin)1965 ajuint embPatBMHSearch(const AjPStr str, const AjPStr pat,
1966 		      ajuint slen, ajuint plen, const ajint *skip,
1967 		      ajuint start, AjBool left, AjBool right, AjPList l,
1968 		      const AjPStr name, ajuint begin)
1969 {
1970     ajuint i;
1971     ajuint j;
1972     ajuint jj;
1973     ajuint k = 0;
1974     const char *p;
1975     const char *q;
1976     AjBool flag;
1977     ajuint count;
1978 
1979     if(left && start)
1980 	return 0;
1981 
1982     p = ajStrGetPtr(str);
1983     q = ajStrGetPtr(pat);
1984 
1985     flag = ajTrue;
1986     count = 0;
1987 
1988     i = start+(plen-1);
1989     jj = plen;
1990     j = jj-1;
1991 
1992     while(flag)
1993     {
1994 	while(jj>0 && i<slen)
1995 	{
1996 	    k = i;
1997 
1998 	    while(jj>0 && p[k]==q[j])
1999 	    {
2000 		--k;
2001 		--j;
2002 		--jj;
2003 	    }
2004 
2005 	    if(jj>0)
2006 	    {
2007 		i += skip[(ajuint)p[i]];
2008 		jj = plen;
2009 		j = jj-1;
2010 	    }
2011 	}
2012 
2013 	if(jj == 0)
2014 	{
2015 	    if(left && k+1)
2016 		return 0;
2017 
2018 	    if(!right || (right && k+1+plen==slen))
2019 	    {
2020 		++count;
2021 		embPatPushHit(l,name,k+1,plen,begin,0);
2022 	    }
2023 
2024 	    i = start+(plen-1)+k+2;
2025 	    jj = plen;
2026 	    j = jj-1;
2027 
2028 	}
2029 	else
2030 	    flag=ajFalse;
2031     }
2032 
2033     return count;
2034 }
2035 
2036 
2037 
2038 
2039 /* @func embPatBYPInit ********************************************************
2040 **
2041 ** Initialise a Baeza-Yates,Perleberg pattern.
2042 **
2043 ** @param [r] pat [const AjPStr] pattern
2044 ** @param [r] len [ajuint] pattern length
2045 ** @param [w] offset [EmbPPatBYPNode] character index
2046 ** @param [w] buf [ajint *] mismatch count
2047 **
2048 ** @return [void]
2049 **
2050 ** @release 1.0.0
2051 ******************************************************************************/
2052 
embPatBYPInit(const AjPStr pat,ajuint len,EmbPPatBYPNode offset,ajint * buf)2053 void embPatBYPInit(const AjPStr pat, ajuint len, EmbPPatBYPNode offset,
2054 		   ajint *buf)
2055 {
2056     ajuint i;
2057     ajuint j;
2058 
2059     const char *p;
2060     EmbPPatBYPNode op;
2061 
2062     p = ajStrGetPtr(pat);
2063 
2064     for(i=0;i<AJALPHA;++i)
2065     {
2066 	offset[i].offset = -1;
2067 	offset[i].next   = NULL;
2068 	buf[i] = len;
2069     }
2070 
2071     for(i=0,j=AJALPHA>>1;i<len;++i,++p)
2072     {
2073 	buf[i] = AJALPHA;
2074 
2075 	if(offset[(ajuint)*p].offset == -1)
2076 	    offset[(ajuint)*p].offset = len-i-1;
2077 	else
2078 	{
2079 	    op=offset[(ajuint)*p].next;
2080 	    offset[(ajuint)*p].next=&offset[j++];
2081 	    offset[(ajuint)*p].next->offset = len-i-1;
2082 	    offset[(ajuint)*p].next->next = op;
2083 	}
2084     }
2085 
2086     buf[len-1] = len;
2087 
2088     return;
2089 }
2090 
2091 
2092 
2093 
2094 /* @func embPatPushHit ********************************************************
2095 **
2096 ** Put a matching string search hit on the heap
2097 ** as an EmbPMatMatch structure
2098 **
2099 ** @param [u] l [AjPList] list to push to
2100 ** @param [r] name [const AjPStr] string name
2101 ** @param [r] pos [ajuint] Sequence match position
2102 ** @param [r] plen [ajuint] pattern length
2103 ** @param [r] begin [ajuint] Sequence offset
2104 ** @param [r] mm [ajuint] number of mismatches
2105 **
2106 ** @return [void]
2107 **
2108 ** @release 1.0.0
2109 ******************************************************************************/
2110 
embPatPushHit(AjPList l,const AjPStr name,ajuint pos,ajuint plen,ajuint begin,ajuint mm)2111 void embPatPushHit(AjPList l, const AjPStr name, ajuint pos, ajuint plen,
2112 		   ajuint begin, ajuint mm)
2113 {
2114     EmbPMatMatch hit;
2115 
2116     AJNEW0(hit);
2117 
2118     hit->seqname = ajStrNewS(name);
2119     hit->len = plen;
2120     hit->cod = ajStrNew();
2121     hit->pat = ajStrNew();
2122     hit->acc = ajStrNew();
2123     hit->tit = ajStrNew();
2124     hit->start = pos+begin;
2125     hit->mm = mm;
2126     hit->end = pos+begin+plen-1;
2127     if(hit->start <= hit->end)
2128         hit->forward = ajTrue;
2129     ajListPush(l,(void *) hit);
2130 
2131     return;
2132 }
2133 
2134 
2135 
2136 
2137 /* @func embPatBYPSearch ******************************************************
2138 **
2139 ** Perform a Baeza-Yates,Perleberg search.
2140 **
2141 ** @param [r] str [const AjPStr] search string
2142 ** @param [r] name [const AjPStr] search string
2143 ** @param [r] begin [ajuint] sequence offset
2144 ** @param [r] slen [ajuint] string length
2145 ** @param [r] plen [ajuint] pattern length
2146 ** @param [r] mm [ajuint] allowed mismatches (Hamming distance)
2147 ** @param [u] offset [EmbPPatBYPNode] character index
2148 ** @param [u] buf [ajint *] mismatch count array
2149 ** @param [u] l [AjPList] list to push hits to
2150 ** @param [r] amino [AjBool] if true, match at amino terminal end
2151 ** @param [r] carboxyl [AjBool] if true, match at carboxyl terminal end
2152 ** @param [r] pat [const AjPStr] original pattern
2153 **
2154 ** @return [ajuint] number of matches
2155 **
2156 ** @release 1.0.0
2157 ******************************************************************************/
2158 
embPatBYPSearch(const AjPStr str,const AjPStr name,ajuint begin,ajuint slen,ajuint plen,ajuint mm,EmbPPatBYPNode offset,ajint * buf,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr pat)2159 ajuint embPatBYPSearch(const AjPStr str, const AjPStr name,
2160 		      ajuint begin, ajuint slen,
2161 		      ajuint plen, ajuint mm, EmbPPatBYPNode offset,
2162 		      ajint *buf,
2163 		      AjPList l, AjBool amino, AjBool carboxyl,
2164 		      const AjPStr pat)
2165 {
2166     const char *p;
2167     const char *q;
2168     ajuint  i;
2169     ajint  t;
2170     EmbPPatBYPNode off;
2171     ajint count;
2172     AjPStr pattern;
2173 
2174     p = MAJSTRGETPTR(str);
2175     pattern = ajStrNewS(pat);
2176     ajStrFmtUpper(&pattern);
2177     q = MAJSTRGETPTR(pattern);
2178 
2179     count = mm;
2180 
2181     for(i=0;i<plen;++i)
2182 	if(*q++!=*p++)
2183 	    if(--count<0)
2184 		break;
2185 
2186     if(count>=0 && (!carboxyl || (carboxyl && i==slen)))
2187     {
2188 	embPatPushHit(l,name,0,plen,begin,mm-count);
2189 	count = 1;
2190     }
2191     else
2192 	count = 0;
2193 
2194     p = MAJSTRGETPTR(str);
2195 
2196     for(i=0;i<slen;++i)
2197     {
2198 	if((t=(off=&offset[(ajuint)*p++])->offset)>=0)
2199 	{
2200 	    buf[(i+t)&AJMOD256]--;
2201 
2202 	    for(off=off->next;off!=NULL;off=off->next)
2203 		buf[(i+off->offset)&AJMOD256]--;
2204 	}
2205 
2206 	if(buf[i&AJMOD256]<=(ajint)mm)
2207 	{
2208 	    if(amino && i-plen+1!=0)
2209 		return count;
2210 
2211 	    if(!carboxyl || (carboxyl && i+1==slen))
2212 	    {
2213 		++count;
2214 		embPatPushHit(l,name,i-plen+1,plen,begin,buf[i&AJMOD256]);
2215 	    }
2216 	}
2217 
2218 	buf[i&AJMOD256] = plen;
2219     }
2220 
2221     MAJSTRDEL(&pattern);
2222 
2223     return count;
2224 }
2225 
2226 
2227 
2228 
2229 /* @funcstatic patAminoCarboxyl ***********************************************
2230 **
2231 ** Checks for start and/or end angle bracket markers
2232 ** Removes them from the string and sets bools accordingly
2233 **
2234 ** @param [r] s [const AjPStr] original pattern
2235 ** @param [u] cs [AjPStr *] modified pattern
2236 ** @param [w] amino [AjBool *] set if start marker (left angle bracket)
2237 ** @param [w] carboxyl [AjBool *] set if end marker (right angle bracket)
2238 **
2239 ** @return [void]
2240 **
2241 ** @release 1.0.0
2242 ******************************************************************************/
2243 
patAminoCarboxyl(const AjPStr s,AjPStr * cs,AjBool * amino,AjBool * carboxyl)2244 static void patAminoCarboxyl(const AjPStr s, AjPStr *cs,
2245 			     AjBool *amino, AjBool *carboxyl)
2246 {
2247     AjPStr t;
2248     const char *p;
2249 
2250     t = ajStrNewC("");
2251     p = ajStrGetPtr(s);
2252 
2253     while(*p)
2254     {
2255 	if(*p==' ' || *p=='-' || *p=='.')
2256 	{
2257 	    ++p;
2258 	    continue;
2259 	}
2260 
2261 	if(*p=='<')
2262 	{
2263 	    *amino = ajTrue;
2264 	    ++p;
2265 	    continue;
2266 	}
2267 
2268 	if(*p=='>')
2269 	{
2270 	    *carboxyl = ajTrue;
2271 	    ++p;
2272 	    continue;
2273 	}
2274 
2275 	ajStrAppendK(&t,*p);
2276 	++p;
2277     }
2278 
2279     ajStrAssignS(cs,t);
2280     ajStrDel(&t);
2281 
2282     return;
2283 }
2284 
2285 
2286 
2287 
2288 /* @funcstatic patParenTest ***************************************************
2289 **
2290 ** Checks parenthesis grammar. Sets repeat and range bools
2291 **
2292 ** @param [r] p [const char *] pattern
2293 ** @param [w] repeat [AjBool *] set if any parenthesis e.g. (3)
2294 ** @param [w] range [AjBool *] set if range e.g. (5,8)
2295 **
2296 ** @return [AjBool] True if grammar correct
2297 **
2298 ** @release 1.0.0
2299 ******************************************************************************/
2300 
patParenTest(const char * p,AjBool * repeat,AjBool * range)2301 static AjBool patParenTest(const char *p, AjBool *repeat, AjBool *range)
2302 {
2303     ajuint i;
2304 
2305     *repeat = ajTrue;
2306     p = p+2;
2307 
2308     if(sscanf(p,"%u",&i)!=1)
2309     {
2310 	ajWarn("Illegal pattern. Missing repeat number");
2311 
2312 	return ajFalse;
2313     }
2314 
2315     while(*p)
2316     {
2317 	if(*p==')')
2318 	    break;
2319 
2320 	if(*p=='('||*p=='['||*p=='{'||*p=='}'||*p==']' ||
2321 	   isalpha((ajuint)*p))
2322 	{
2323 	    ajWarn("Illegal pattern. Nesting not allowed");
2324 
2325 	    return ajFalse;
2326 	}
2327 
2328 	if(*p==',')
2329 	{
2330 	    *range = ajTrue;
2331 	    ++p;
2332 
2333 	    if(sscanf(p,"%u",&i)!=1)
2334 	    {
2335 		ajWarn("Illegal pattern. Missing range number");
2336 
2337 		return ajFalse;
2338 	    }
2339 	    continue;
2340 	}
2341 
2342 	++p;
2343     }
2344 
2345     if(!*p)
2346     {
2347 	ajWarn("Illegal pattern. Missing parenthesis");
2348 
2349 	return ajFalse;
2350     }
2351 
2352     return ajTrue;
2353 }
2354 
2355 
2356 
2357 
2358 /* @funcstatic patExpandRepeat ************************************************
2359 **
2360 ** Expand repeats e.g. [ABC](2) to [ABC][ABC]
2361 **
2362 ** @param [u] s [AjPStr *] pattern
2363 **
2364 ** @return [AjBool] ajTrue on success
2365 **
2366 ** @release 1.0.0
2367 ** @@
2368 ******************************************************************************/
2369 
patExpandRepeat(AjPStr * s)2370 static AjBool patExpandRepeat(AjPStr *s)
2371 {
2372     AjPStr t;
2373     const char *p;
2374     const char *q;
2375     ajuint count;
2376     ajuint i;
2377 
2378     t = ajStrNewC("");
2379     p = ajStrGetPtr(*s);
2380 
2381     while(*p)
2382     {
2383 	if(*p=='[' || *p=='{')
2384 	{
2385 	    q = p;
2386 
2387 	    while(!(*p==']' || *p=='}'))
2388 		++p;
2389 
2390 	    if(*(p+1)!='(')
2391 		count = 1;
2392 	    else
2393 		sscanf(p+2,"%u",&count);
2394 
2395 	    if(count<=0)
2396 	    {
2397 		ajWarn("Illegal pattern. Bad repeat count");
2398 
2399 		return ajFalse;
2400 	    }
2401 
2402 	    for(i=0;i<count;++i)
2403 	    {
2404 		p = q;
2405 
2406 		while(!(*p==']'||*p=='}'))
2407 		{
2408 		    ajStrAppendK(&t,*p);
2409 		    ++p;
2410 		}
2411 
2412 		ajStrAppendK(&t,*p);
2413 	    }
2414 
2415 	    if(*(p+1)=='(')
2416 		while(*p!=')')
2417 		    ++p;
2418 
2419 	    ++p;
2420 	    continue;
2421 	}
2422 
2423 	if(*p=='(')
2424 	{
2425 	    sscanf(p+1,"%u",&count);
2426 
2427 	    if(count<=0)
2428 	    {
2429 		ajWarn("Illegal pattern. Bad range number");
2430 
2431 		return ajFalse;
2432 	    }
2433 
2434 	    for(i=1;i<count;++i)
2435 		ajStrAppendK(&t,*(p-1));
2436 
2437 	    while(*p!=')')
2438 		++p;
2439 
2440 	    ++p;
2441 	    continue;
2442 	}
2443 
2444 	ajStrAppendK(&t,*p);
2445 	++p;
2446     }
2447 
2448     ajStrAssignC(s,ajStrGetPtr(t));
2449     ajStrDel(&t);
2450 
2451     return ajTrue;
2452 }
2453 
2454 
2455 
2456 
2457 /* @funcstatic patIUBTranslate ************************************************
2458 **
2459 ** Convert IUB nucleotide symbols to classes e.g. S to [GC]
2460 **
2461 ** @param [u] pat [AjPStr *] pattern
2462 **
2463 ** @return [void]
2464 **
2465 ** @release 1.0.0
2466 ******************************************************************************/
2467 
patIUBTranslate(AjPStr * pat)2468 static void patIUBTranslate(AjPStr *pat)
2469 {
2470     AjPStr t;
2471     const char *p;
2472     AjBool escape = ajFalse;
2473 
2474     t = ajStrNewC(ajStrGetPtr(*pat));
2475     p = ajStrGetPtr(t);
2476     ajStrSetClear(pat);
2477 
2478     while(*p)
2479     {
2480         if(escape)
2481         {
2482             ajStrAppendK(pat,*p);
2483             ++p;
2484             escape=ajFalse;
2485             continue;
2486         }
2487 
2488         if(*p=='\\')
2489         {
2490             escape = ajTrue;
2491             ++p;
2492             continue;
2493         }
2494 
2495 	if(*p=='N')
2496 	{
2497 	    ajStrAppendC(pat,"?");
2498 	    ++p;
2499 	    continue;
2500 	}
2501 
2502 	if(*p=='B')
2503 	{
2504 	    ajStrAppendC(pat,"[TGCBSYK]");
2505 	    ++p;
2506 	    continue;
2507 	}
2508 
2509 	if(*p=='D')
2510 	{
2511 	    ajStrAppendC(pat,"[TGADWRK]");
2512 	    ++p;
2513 	    continue;
2514 	}
2515 
2516 	if(*p=='H')
2517 	{
2518 	    ajStrAppendC(pat,"[TCAHWYM]");
2519 	    ++p;
2520 	    continue;
2521 	}
2522 
2523 	if(*p=='K')
2524 	{
2525 	    ajStrAppendC(pat,"[TGK]");
2526 	    ++p;
2527 	    continue;
2528 	}
2529 
2530 	if(*p=='M')
2531 	{
2532 	    ajStrAppendC(pat,"[CAM]");
2533 	    ++p;
2534 	    continue;
2535 	}
2536 
2537 	if(*p=='R')
2538 	{
2539 	    ajStrAppendC(pat,"[GAR]");
2540 	    ++p;
2541 	    continue;
2542 	}
2543 
2544 	if(*p=='S')
2545 	{
2546 	    ajStrAppendC(pat,"[GCS]");
2547 	    ++p;
2548 	    continue;
2549 	}
2550 
2551 	if(*p=='V')
2552 	{
2553 	    ajStrAppendC(pat,"[GCAVSRM]");
2554 	    ++p;
2555 	    continue;
2556 	}
2557 
2558 	if(*p=='W')
2559 	{
2560 	    ajStrAppendC(pat,"[TAW]");
2561 	    ++p;
2562 	    continue;
2563 	}
2564 
2565 	if(*p=='Y')
2566 	{
2567 	    ajStrAppendC(pat,"[TCY]");
2568 	    ++p;
2569 	    continue;
2570 	}
2571 
2572 	ajStrAppendK(pat,*p);
2573 	++p;
2574     }
2575 
2576     ajStrDel(&t);
2577 
2578     return;
2579 }
2580 
2581 
2582 
2583 
2584 /* @funcstatic patProteinTranslate ********************************************
2585 **
2586 ** Convert protein symbols to classes e.g. B to [DN]
2587 **
2588 ** @param [u] pat [AjPStr *] pattern
2589 **
2590 ** @return [void]
2591 **
2592 ** @release 4.0.0
2593 ******************************************************************************/
2594 
patProteinTranslate(AjPStr * pat)2595 static void patProteinTranslate(AjPStr *pat)
2596 {
2597     AjPStr t;
2598     const char *p;
2599     AjBool escape = ajFalse;
2600 
2601     t = ajStrNewC(ajStrGetPtr(*pat));
2602     p = ajStrGetPtr(t);
2603     ajStrSetClear(pat);
2604 
2605     while(*p)
2606     {
2607         if(escape)
2608         {
2609             ajStrAppendK(pat,*p);
2610             ++p;
2611             escape=ajFalse;
2612             continue;
2613         }
2614 
2615         if(*p=='\\')
2616         {
2617             escape = ajTrue;
2618             ++p;
2619             continue;
2620         }
2621 
2622 	if(*p=='X')
2623 	{
2624 	    ajStrAppendC(pat,"?");
2625 	    ++p;
2626 	    continue;
2627 	}
2628 
2629 	if(*p=='B')
2630 	{
2631 	    ajStrAppendC(pat,"[DN]");
2632 	    ++p;
2633 	    continue;
2634 	}
2635 
2636 	if(*p=='Z')
2637 	{
2638 	    ajStrAppendC(pat,"[EQ]");
2639 	    ++p;
2640 	    continue;
2641 	}
2642 
2643 	if(*p=='J')
2644 	{
2645 	    ajStrAppendC(pat,"[IL]");
2646 	    ++p;
2647 	    continue;
2648 	}
2649 
2650 
2651 	ajStrAppendK(pat,*p);
2652 	++p;
2653     }
2654 
2655     ajStrDel(&t);
2656 
2657     return;
2658 }
2659 
2660 
2661 
2662 
2663 /* @func embPatClassify *******************************************************
2664 **
2665 ** Classify patterns according to type. The pattern is set up upper case,
2666 ** has start and end processing turned into boolean flags. Sets other boolean
2667 ** flags for properties of the pattern so that a suitable processing
2668 ** method can be selected.
2669 **
2670 ** @param [r] pat      [const AjPStr] original pattern
2671 ** @param [w] cleanpat [AjPStr *] cleaned pattern
2672 ** @param [w] amino    [AjBool*] set if must match start of sequence
2673 ** @param [w] carboxyl [AjBool*] set if must match end of sequence
2674 ** @param [w] fclass   [AjBool*] set if class e.g. [ABC]
2675 ** @param [w] ajcompl  [AjBool*] set if complement e.g. {ABC}
2676 ** @param [w] dontcare [AjBool*] set if X (protein) or N (DNA)
2677 ** @param [w] range    [AjBool*] set if range specified e.g. (3,10)
2678 ** @param [r] protein  [AjBool] true if protein false if DNA
2679 **
2680 ** @return [AjBool] ajTrue on success
2681 **
2682 ** @release 1.0.0
2683 ** @@
2684 ******************************************************************************/
2685 
embPatClassify(const AjPStr pat,AjPStr * cleanpat,AjBool * amino,AjBool * carboxyl,AjBool * fclass,AjBool * ajcompl,AjBool * dontcare,AjBool * range,AjBool protein)2686 AjBool embPatClassify(const AjPStr pat, AjPStr *cleanpat,
2687 		      AjBool *amino, AjBool *carboxyl,
2688 		      AjBool *fclass, AjBool *ajcompl, AjBool *dontcare,
2689 		      AjBool *range, AjBool protein)
2690 {
2691     char *p;
2692     AjBool repeat;
2693     AjPStr tmppat = NULL;
2694 
2695     ajDebug("embPatClassify pat '%S' protein %b\n", pat, protein);
2696 
2697     repeat=*amino=*carboxyl=*fclass=*ajcompl=*dontcare=*range=ajFalse;
2698 
2699     ajStrAssignS(&tmppat, pat);
2700     ajStrRemoveWhiteExcess(&tmppat);
2701     patAminoCarboxyl(tmppat,cleanpat, amino,carboxyl);
2702     ajStrDel(&tmppat);
2703 
2704     ajDebug("cleaned pat '%S' amino %b carboxyl %b\n",
2705 	    *cleanpat, *amino, *carboxyl);
2706 
2707     ajStrFmtUpper(cleanpat);
2708 
2709     if(!protein)
2710     {
2711 	patIUBTranslate(cleanpat);
2712 	ajDebug("IUB translated pat '%S'\n", *cleanpat);
2713     }
2714     else
2715     {
2716 	patProteinTranslate(cleanpat);
2717 	ajDebug("Protein codes translated pat '%S'\n", *cleanpat);
2718     }
2719 
2720     p = ajStrGetuniquePtr(cleanpat);
2721 
2722     while(*p)
2723     {
2724 	if(isalpha((ajuint)*p) || (*p=='?'))
2725 	{
2726 	    if(*p=='?')
2727 	    {
2728 		*dontcare = ajTrue;
2729 	    }
2730 
2731 	    if(*(p+1)=='(')
2732 	    {
2733 		if(!patParenTest(p,&repeat,range))
2734 		    return ajFalse;
2735 
2736 		while(*p!=')')
2737 		    ++p;
2738 	    }
2739 
2740 	    ++p;
2741 	    continue;
2742 	}
2743 
2744 
2745 
2746 	if(*p=='[')
2747 	{
2748 	    *fclass = ajTrue;
2749 	    ++p;
2750 
2751 	    while(*p)
2752 	    {
2753 		if(*p==']')
2754 		    break;
2755 
2756 		if(*p=='('||*p=='['||*p=='{'||*p=='}'||*p==')')
2757 		{
2758 		    ajWarn("Illegal pattern. Nesting '%c' in [] not allowed",
2759 			   *p);
2760 
2761 		    return ajFalse;
2762 		}
2763 
2764 		if(!isalpha((ajuint)*p))
2765 		{
2766 		    ajWarn("Illegal pattern. Non alpha character '%c'",
2767 			   *p);
2768 
2769 		    return ajFalse;
2770 		}
2771 
2772 		if((protein&&*p=='X')||(!protein&&*p=='N'))
2773 		{
2774 		    ajWarn("Illegal pattern. Dontcare character '%c' in []",
2775 			   *p);
2776 
2777 		    return ajFalse;
2778 		}
2779 
2780 		++p;
2781 	    }
2782 
2783 	    if(!*p)
2784 	    {
2785 		ajWarn("Illegal pattern. Missing ']'");
2786 
2787 		return ajFalse;
2788 	    }
2789 
2790 	    if(*(p+1)=='(')
2791 	    {
2792 		if(!patParenTest(p,&repeat,range))
2793 		    return ajFalse;
2794 
2795 		while(*p!=')')
2796 		    ++p;
2797 	    }
2798 
2799 	    ++p;
2800 	    continue;
2801 	}
2802 
2803 	if(*p=='{')
2804 	{
2805 	    *ajcompl = ajTrue;
2806 	    ++p;
2807 	    while(*p)
2808 	    {
2809 		if(*p=='}')
2810 		    break;
2811 
2812 		if(*p=='('||*p=='['||*p=='{'||*p==']'||*p==')')
2813 		{
2814 		    ajWarn("Illegal pattern. Nesting '%c' in {} not allowed.",
2815 			   *p);
2816 
2817 		    return ajFalse;
2818 		}
2819 
2820 		if(!isalpha((ajuint)*p))
2821 		{
2822 		    ajWarn("Illegal pattern. Non alpha character '%c'", *p);
2823 
2824 		    return ajFalse;
2825 		}
2826 
2827 		if((protein&&*p=='X')||(!protein&&*p=='N'))
2828 		{
2829 		    ajWarn("Illegal pattern. Ambiguous character '%c' in {}",
2830 			   *p);
2831 
2832 		    return ajFalse;
2833 		}
2834 
2835 		++p;
2836 	    }
2837 
2838 	    if(!*p)
2839 	    {
2840 		ajWarn("Illegal pattern. Missing '}'");
2841 
2842 		return ajFalse;
2843 	    }
2844 
2845 	    if(*(p+1)=='(')
2846 	    {
2847 		if(!patParenTest(p,&repeat,range))
2848 		    return ajFalse;
2849 
2850 		while(*p!=')')
2851 		    ++p;
2852 	    }
2853 
2854 	    ++p;
2855 	    continue;
2856 	}
2857 
2858 	ajWarn("Illegal character '%c'",*p);
2859 
2860 	return ajFalse;
2861     }
2862 
2863     ajDebug("patterns expanded pat '%S'\n", *cleanpat);
2864 
2865     if(repeat && !*range)
2866     {
2867 	ajDebug("testing repeat expansion\n");
2868 
2869 	if(!patExpandRepeat(cleanpat))
2870 	    return ajFalse;
2871 
2872 	ajDebug("repeats expanded pat '%S'\n", *cleanpat);
2873     }
2874 
2875     return ajTrue;
2876 }
2877 
2878 
2879 
2880 
2881 /* @func embPatSOInit *********************************************************
2882 **
2883 ** Initialise a Shift-Or pattern.
2884 **
2885 ** @param [r] pat [const AjPStr] pattern
2886 ** @param [w] table [ajuint *] SO table
2887 ** @param [w] limit [ajuint *] match limit
2888 **
2889 ** @return [void]
2890 **
2891 ** @release 1.0.0
2892 ******************************************************************************/
2893 
embPatSOInit(const AjPStr pat,ajuint * table,ajuint * limit)2894 void embPatSOInit(const AjPStr pat, ajuint *table, ajuint *limit)
2895 {
2896     ajuint  i;
2897     const char *p;
2898 
2899     if(ajStrGetLen(pat)>AJWORD)
2900 	ajFatal("Pattern too long for Shift-OR search");
2901 
2902 
2903     for(i=0;i<AJALPHA2;++i)
2904 	table[i] = ~0;
2905 
2906     *limit = 0;
2907 
2908     for(i=1,p=ajStrGetPtr(pat);*p;i<<=AJBPS,++p)
2909     {
2910 	table[(ajuint)*p] &= ~i;
2911 	*limit |=  i;
2912     }
2913 
2914     *limit = ~(*limit>>AJBPS);
2915 
2916     return;
2917 }
2918 
2919 
2920 
2921 
2922 /* @func embPatSOSearch *******************************************************
2923 **
2924 ** Perform a Shift-OR search.
2925 **
2926 ** @param [r] str [const AjPStr] search string
2927 ** @param [r] name [const AjPStr] search string
2928 ** @param [r] first [ajint] first character of pattern (as an integer)
2929 ** @param [r] begin [ajuint] sequence offset
2930 ** @param [r] plen [ajuint] pattern length
2931 ** @param [r] table [const ajuint *] SO table
2932 ** @param [r] limit [ajuint] SO limit
2933 ** @param [u] l [AjPList] list to push hits to
2934 ** @param [r] amino [AjBool] must match start
2935 ** @param [r] carboxyl [AjBool] must match end
2936 **
2937 ** @return [ajuint] number of matches
2938 **
2939 ** @release 1.0.0
2940 ******************************************************************************/
2941 
embPatSOSearch(const AjPStr str,const AjPStr name,ajint first,ajuint begin,ajuint plen,const ajuint * table,ajuint limit,AjPList l,AjBool amino,AjBool carboxyl)2942 ajuint embPatSOSearch(const AjPStr str, const AjPStr name,
2943 		     ajint first, ajuint begin,
2944 		     ajuint plen, const ajuint *table, ajuint limit, AjPList l,
2945 		     AjBool amino, AjBool carboxyl)
2946 {
2947     register ajuint state;
2948     register ajuint initial;
2949     const char *p;
2950     const char *q;
2951     ajuint pos;
2952 
2953     ajuint matches;
2954     ajuint slen;
2955 
2956     p = q = ajStrGetPtr(str);
2957     slen  = ajStrGetLen(str);
2958     matches = 0;
2959     initial = ~0;
2960 
2961     do
2962     {
2963 	while(*p && *p != first)
2964 	    ++p;
2965 
2966 	state = initial;
2967 
2968 	do
2969 	{
2970 	    state = (state<<AJBPS) | table[(ajuint)*p];
2971 
2972 	    if(state < limit)
2973 	    {
2974 		pos = (ajuint) ((p-q)-plen+1);
2975 
2976 		if(amino && pos)
2977 		    return matches;
2978 
2979 		if(!carboxyl || (carboxyl && pos==slen-plen))
2980 		{
2981 		    ++matches;
2982 		    embPatPushHit(l,name,pos,plen,begin,0);
2983 		}
2984 	    }
2985 	    ++p;
2986 	}
2987 	while(state!=initial);
2988 
2989     }
2990     while(*(p-1));
2991 
2992     return matches;
2993 }
2994 
2995 
2996 
2997 
2998 /* @func embPatBYGCInit *******************************************************
2999 **
3000 ** Initialise a Baeza-Yates Gonnet class pattern.
3001 **
3002 ** @param [r] pat [const AjPStr] pattern
3003 ** @param [w] m [ajuint *] real pattern length
3004 ** @param [w] table [ajuint *] SO table
3005 ** @param [w] limit [ajuint *] match limit
3006 **
3007 ** @return [void]
3008 **
3009 ** @release 1.0.0
3010 ******************************************************************************/
3011 
embPatBYGCInit(const AjPStr pat,ajuint * m,ajuint * table,ajuint * limit)3012 void embPatBYGCInit(const AjPStr pat, ajuint *m, ajuint *table,
3013 		    ajuint *limit)
3014 {
3015     const char *p;
3016     const char *q;
3017     ajuint initval;
3018     ajuint shift;
3019     ajuint i;
3020 
3021     p = q = ajStrGetPtr(pat);
3022     initval = ~0;
3023     shift   = 1;
3024     *m = 0;
3025     *limit  = 0;
3026 
3027     while(*p)
3028     {
3029 	if(*p=='?')
3030 	    initval &= ~shift;
3031 	else if(*p=='{')
3032 	{
3033 	    initval &= ~shift;
3034 
3035 	    while(*p!='}')
3036 		++p;
3037 	}
3038 	else if(*p=='[')
3039 	    while(*p!=']')
3040 		++p;
3041 	++p;
3042 	++*m;
3043 	*limit |= shift;
3044 	shift<<=AJBPS;
3045     }
3046 
3047     for(i=0;i<AJALPHA2;++i)
3048 	table[i] = initval;
3049 
3050     p = q;
3051 
3052     shift = 1;
3053 
3054     while(*p)
3055     {
3056 	if(*p=='{')
3057 	{
3058 	    ++p;
3059 
3060 	    while(*p!='}')
3061 	    {
3062 		table[(ajuint)*p] |= shift;
3063 		++p;
3064 	    }
3065 	}
3066 	else if(*p=='[')
3067 	{
3068 	    ++p;
3069 
3070 	    while(*p!=']')
3071 	    {
3072 		table[(ajuint)*p] &= ~shift;
3073 		++p;
3074 	    }
3075 	}
3076 	else if(*p!='?')
3077 	    table[(ajuint)*p] &= ~shift;
3078 
3079 	shift <<= AJBPS;
3080 	++p;
3081     }
3082 
3083     *limit = ~(*limit>>AJBPS);
3084 
3085     return;
3086 }
3087 
3088 
3089 
3090 
3091 /* @func embPatBYGSearch ******************************************************
3092 **
3093 ** Perform a Baeza-Yates Gonnet search.
3094 **
3095 ** @param [r] str [const AjPStr] search string
3096 ** @param [r] name [const AjPStr] search string
3097 ** @param [r] begin [ajuint] sequence offset
3098 ** @param [r] plen [ajuint] pattern length
3099 ** @param [r] table [const ajuint *] SO table
3100 ** @param [r] limit [ajuint] SO limit
3101 ** @param [u] l [AjPList] list to push hits to
3102 ** @param [r] amino [AjBool] must match start
3103 ** @param [r] carboxyl [AjBool] must match end
3104 **
3105 ** @return [ajuint] number of matches
3106 **
3107 ** @release 1.0.0
3108 ******************************************************************************/
3109 
embPatBYGSearch(const AjPStr str,const AjPStr name,ajuint begin,ajuint plen,const ajuint * table,ajuint limit,AjPList l,AjBool amino,AjBool carboxyl)3110 ajuint embPatBYGSearch(const AjPStr str, const AjPStr name,
3111 		      ajuint begin, ajuint plen,
3112 		      const ajuint *table, ajuint limit, AjPList l,
3113 		      AjBool amino, AjBool carboxyl)
3114 {
3115     register ajuint state;
3116     register ajuint initial;
3117     const char *p;
3118     const char *q;
3119     ajuint pos;
3120     ajuint matches;
3121     ajuint slen;
3122 
3123     p = q = MAJSTRGETPTR(str);
3124     slen  = MAJSTRGETLEN(str);
3125     matches = 0;
3126     initial = ~0;
3127 
3128 /*
3129 //    ajDebug("..pat initial %lx\n", initial);
3130 //    ajDebug("..pat strlen:%d str:'%S'\n", slen, str);
3131 */
3132     do
3133     {
3134 	state = initial;
3135 
3136 	do
3137 	{
3138 	    state = (state<<AJBPS) | table[(ajuint)*p];
3139 /*
3140 //            ajDebug("..pat table: %lx state %lx p:%c\n",
3141 //		    table[(ajuint)*p], state, *p);
3142 */
3143             if(state < limit)
3144 	    {
3145 		pos = (ajuint) ((p-q)-plen+1);
3146 
3147 		if(amino && pos) /* must match start but already beyond */
3148 		    return matches;
3149 
3150 		if(!carboxyl || (carboxyl && pos==slen-plen))
3151 		{
3152 		    ++matches;
3153 		    embPatPushHit(l,name,pos,plen,begin,0);
3154 /*
3155 //                    ajDebug("..pat hit matches:%d list:%Lu name:'%S' pos:%u "
3156 //                            "carboxyl:%B (slen-plen) %u\n",
3157 //			    matches, ajListGetLength(l), name, pos,
3158 //                            carboxyl, (slen-plen));
3159 */
3160 		}
3161 	    }
3162 	    ++p;
3163 	}
3164 	while(state!=initial && *p);
3165 
3166     }
3167     while((ajuint) (p-q)<slen);
3168 
3169     return matches;
3170 }
3171 
3172 
3173 
3174 
3175 /* @func embPatTUInit *********************************************************
3176 **
3177 ** Initialise a Tarhio-Ukkonen search
3178 **
3179 ** @param [r] pat [const AjPStr] pattern
3180 ** @param [w] skipm [ajuint **] mismatch skip array
3181 ** @param [r] m [ajuint] real pattern length
3182 ** @param [r] k [ajuint] allowed mismatches
3183 **
3184 ** @return [void]
3185 **
3186 ** @release 1.0.0
3187 ******************************************************************************/
3188 
embPatTUInit(const AjPStr pat,ajuint ** skipm,ajuint m,ajuint k)3189 void embPatTUInit(const AjPStr pat, ajuint **skipm, ajuint m, ajuint k)
3190 {
3191     const char *p;
3192     ajuint i;
3193     ajint j;
3194     ajint jj;
3195     ajuint x;
3196 
3197     ajuint ready[AJALPHA];
3198 
3199     p = ajStrGetPtr(pat);
3200 
3201     for(i=0;i<AJALPHA;++i)
3202     {
3203 	ready[i] = m;
3204 
3205 	for(j=m-k-1;j<(ajint)m;++j)
3206 	    skipm[j][i] = m-k-1;
3207     }
3208 
3209     for(j=m-2;j>-1;--j)
3210     {
3211 	jj = m-k-1;
3212 	x = AJMAX(j+1,jj);
3213 
3214 	for(i=ready[(ajuint)p[j]]-1;i>=x;--i)
3215 	    skipm[i][(ajuint)p[j]] = i-j;
3216 
3217 	ready[(ajuint)p[j]] = x;
3218     }
3219 
3220     return;
3221 }
3222 
3223 
3224 
3225 
3226 /* @func embPatTUSearch *******************************************************
3227 **
3228 ** Perform a Tarhio-Ukkonen search
3229 **
3230 ** @param [r] pat [const AjPStr] pattern
3231 ** @param [r] text [const AjPStr] text to search (incl ajcompl/class)
3232 ** @param [r] slen [ajuint] length of text
3233 ** @param [r] skipm [ajuint * const *] mismatch skip array
3234 ** @param [r] m [ajuint] real pattern length
3235 ** @param [r] k [ajuint] allowed mismatches
3236 ** @param [r] begin [ajuint] text offset
3237 ** @param [u] l [AjPList] list to push to
3238 ** @param [r] amino [AjBool] true if text start
3239 ** @param [r] carboxyl [AjBool] true if text end
3240 ** @param [r] name [const AjPStr] name of text
3241 **
3242 ** @return [ajuint] number of hits
3243 **
3244 ** @release 1.0.0
3245 ******************************************************************************/
3246 
embPatTUSearch(const AjPStr pat,const AjPStr text,ajuint slen,ajuint * const * skipm,ajuint m,ajuint k,ajuint begin,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr name)3247 ajuint embPatTUSearch(const AjPStr pat, const AjPStr text, ajuint slen,
3248 		     ajuint * const *skipm, ajuint m,
3249 		     ajuint k, ajuint begin, AjPList l, AjBool amino,
3250 		     AjBool carboxyl, const AjPStr name)
3251 {
3252     ajuint i;
3253     ajint j;
3254     ajint jj;
3255     ajuint h;
3256     ajuint mm;
3257     ajuint skip;
3258     const char *p;
3259     const char *q;
3260     ajuint  matches;
3261 
3262     p = ajStrGetPtr(pat);
3263     q = ajStrGetPtr(text);
3264 
3265     matches = 0;
3266 
3267     i = m-1;
3268 
3269     while(i<slen)
3270     {
3271 	h = i;
3272 	j = m-1;
3273 	skip = m-k;
3274 	mm = 0;
3275 
3276 	while(j>-1 && mm<=k)
3277 	{
3278 	    jj = m-k-1;
3279 
3280 	    if(j>=jj)
3281 		skip = AJMIN(skip,(ajuint)skipm[j][(ajuint)q[h]]);
3282 
3283 	    if(q[h]!=p[j])
3284 		++mm;
3285 	    --h;
3286 	    --j;
3287 	}
3288 
3289 	if(mm<=k)
3290 	{
3291 	    if(amino && h+1)
3292 		return matches;
3293 
3294 	    if(!carboxyl || (carboxyl && h+1==slen-m))
3295 	    {
3296 		++matches;
3297 		embPatPushHit(l,name,h+1,m,begin,mm);
3298 	    }
3299 	}
3300 
3301 	i+=skip;
3302     }
3303 
3304     return matches;
3305 }
3306 
3307 
3308 
3309 
3310 /* @func embPatTUBInit ********************************************************
3311 **
3312 ** Initialise a Tarhio-Ukkonen-Bleasby search
3313 **
3314 ** @param [r] pat [const AjPStr] pattern
3315 ** @param [w] skipm [ajuint **] mismatch skip array
3316 ** @param [r] m [ajuint] real pattern length
3317 ** @param [r] k [ajuint] allowed mismatches
3318 ** @param [r] plen [ajuint] full pattern length (incl ajcompl & class)
3319 **
3320 ** @return [void]
3321 **
3322 ** @release 1.0.0
3323 ******************************************************************************/
3324 
embPatTUBInit(const AjPStr pat,ajuint ** skipm,ajuint m,ajuint k,ajuint plen)3325 void embPatTUBInit(const AjPStr pat, ajuint **skipm, ajuint m, ajuint k,
3326 		   ajuint plen)
3327 {
3328     const char *p;
3329     const char *q;
3330     const char *s;
3331     ajint i;
3332     ajint j;
3333     ajint jj;
3334     ajint x;
3335     ajint z;
3336     ajuint flag;
3337     ajuint ready[AJALPHA];
3338 
3339     p = ajStrGetPtr(pat);
3340 
3341     for(i=0;i<AJALPHA;++i)
3342     {
3343 	ready[i] = m;
3344 
3345         /*
3346         ** AJB: Note that if the mismatches are 1 less than the real
3347         ** pattern length then the skip value can become 0 - hence
3348         ** the use of AJMAX to ensure that the skip value is always
3349         ** positive. This prevents a potential infinite loop in the
3350         ** upcoming TUBSearch function. Need to check the original
3351         ** algorithm to check that nothing is amiss here, then
3352         ** delete this comment block.
3353         */
3354 	for(j=m-k-1;j<(ajint)m;++j)
3355 	    skipm[j][i] = AJMAX(m-k-1,1);
3356     }
3357 
3358     p += plen-1;
3359 
3360     if(*p=='}' || *p==']')
3361     {
3362 	while(*p!='{' && *p!='[')
3363 	    --p;
3364 
3365 	--p;
3366     }
3367     else
3368 	--p;
3369 
3370     for(j=m-2;j>-1;--j)
3371     {
3372 	jj = m-k-1;
3373 	x = AJMAX(j+1,jj);
3374 
3375 	if(*p=='?')
3376 	{
3377 	    for(z='A';z<='Z';++z)
3378 	    {
3379 		for(i=ready[z]-1;i>=x;--i)
3380 		    skipm[i][z] = i-j;
3381 
3382 		ready[z] = x;
3383 	    }
3384 
3385 	    --p;
3386 	    continue;
3387 	}
3388 
3389 	if(*p==']')
3390 	{
3391 	    --p;
3392 	    while(*p!='[')
3393 	    {
3394 		for(i=ready[(ajuint)*p]-1;i>=x;--i)
3395 		    skipm[i][(ajuint)*p] = i-j;
3396 		ready[(ajuint)*p] = x;
3397 		--p;
3398 	    }
3399 	    --p;
3400 	    continue;
3401 	}
3402 
3403 	if(*p=='}')
3404 	{
3405 	    s=--p;
3406 
3407 	    for(z='A';z<='Z';++z)
3408 	    {
3409 		q    = s;
3410 		flag = 0;
3411 
3412 		while(*q!='{')
3413 		{
3414 		    if(*q==z)
3415 		    {
3416 			flag = 1;
3417 			break;
3418 		    }
3419 
3420 		    --q;
3421 		}
3422 
3423 		if(!flag)
3424 		{
3425 		    for(i=ready[z]-1;i>=x;--i)
3426 			skipm[i][z] = i-j;
3427 
3428 		    ready[z] = x;
3429 		}
3430 	    }
3431 
3432 	    while(*p!='{')
3433 		--p;
3434 
3435 	    --p;
3436 	    continue;
3437 	}
3438 
3439 	for(i=ready[(ajuint)*p]-1;i>=x;--i)
3440 	    skipm[i][(ajuint)*p] = i-j;
3441 
3442 	ready[(ajuint)*p] = x;
3443 	--p;
3444     }
3445 
3446     return;
3447 }
3448 
3449 
3450 
3451 
3452 /* @func embPatTUBSearch ******************************************************
3453 **
3454 ** Perform a Tarhio-Ukkonen-Bleasby search
3455 **
3456 ** @param [r] pat [const AjPStr] pattern
3457 ** @param [r] text [const AjPStr] text to search (incl ajcompl/class)
3458 ** @param [r] slen [ajuint] length of text
3459 ** @param [r] skipm [ajuint * const *] mismatch skip array
3460 ** @param [r] m [ajuint] real pattern length
3461 ** @param [r] k [ajuint] allowed mismatches
3462 ** @param [r] begin [ajuint] text offset
3463 ** @param [u] l [AjPList] list to push to
3464 ** @param [r] amino [AjBool] true if text start
3465 ** @param [r] carboxyl [AjBool] true if text end
3466 ** @param [r] name [const AjPStr] name of text
3467 ** @param [r] plen [ajuint] total pattern length
3468 **
3469 ** @return [ajuint] number of hits
3470 **
3471 ** @release 1.0.0
3472 ******************************************************************************/
3473 
embPatTUBSearch(const AjPStr pat,const AjPStr text,ajuint slen,ajuint * const * skipm,ajuint m,ajuint k,ajuint begin,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr name,ajuint plen)3474 ajuint embPatTUBSearch(const AjPStr pat,const AjPStr text, ajuint slen,
3475 		      ajuint * const *skipm, ajuint m,
3476 		      ajuint k, ajuint begin, AjPList l, AjBool amino,
3477 		      AjBool carboxyl, const AjPStr name, ajuint plen)
3478 {
3479     ajuint i;
3480     ajint j;
3481     ajint jj;
3482     ajuint h;
3483     ajuint mm;
3484     ajuint skip;
3485     const char *p;
3486     const char *q;
3487     const char *s;
3488 
3489     ajuint  matches;
3490     ajint  a;
3491     ajuint  flag;
3492 
3493     s = ajStrGetPtr(pat);
3494     q = ajStrGetPtr(text);
3495 
3496     matches = 0;
3497 
3498     i = m-1;
3499 
3500     while(i<slen)
3501     {
3502 	h = i;
3503 	j = m-1;
3504 	p = s+plen-1;
3505 	skip = m-k;
3506 	mm = 0;
3507 
3508 	while(j>-1 && mm<=k)
3509 	{
3510 	    jj = m-k-1;
3511 
3512 	    if(j>=jj)
3513 		skip = AJMIN(skip,skipm[j][(ajuint)q[h]]);
3514 
3515 	    a = q[h];
3516 
3517 	    if(*p!='?')
3518 	    {
3519 		if(*p==']')
3520 		{
3521 		    flag = 0;
3522 		    --p;
3523 
3524 		    while(*p!='[')
3525 		    {
3526 			if(a==*p)
3527 			    flag = 1;
3528 
3529 			--p;
3530 		    }
3531 
3532 		    if(!flag)
3533 			++mm;
3534 		}
3535 		else if(*p=='}')
3536 		{
3537 		    flag = 0;
3538 		    --p;
3539 
3540 		    while(*p!='{')
3541 		    {
3542 			if(a==*p)
3543 			    flag=1;
3544 			--p;
3545 		    }
3546 
3547 		    if(flag)
3548 			++mm;
3549 		}
3550 		else
3551 		    if(a!=*p)
3552 			++mm;
3553 	    }
3554 
3555 	    --p;
3556 	    --h;
3557 	    --j;
3558 	}
3559 
3560 	if(mm<=k)
3561 	{
3562 	    if(amino && h+1)
3563 		return matches;
3564 
3565 	    if(!carboxyl || (carboxyl && h+1==slen-m))
3566 	    {
3567 		++matches;
3568 		embPatPushHit(l,name,h+1,m,begin,mm);
3569 	    }
3570 	}
3571 
3572 	i += skip;
3573     }
3574 
3575     return matches;
3576 }
3577 
3578 
3579 
3580 
3581 /* @funcstatic patBruteClass **************************************************
3582 **
3583 ** Test if character matches one of a class
3584 **
3585 ** @param [r] p [const char *] pattern class e.g. [abc]
3586 ** @param [r] c [char] character to match
3587 **
3588 ** @return [AjBool] true if match success
3589 **
3590 ** @release 1.0.0
3591 ******************************************************************************/
3592 
patBruteClass(const char * p,char c)3593 static AjBool patBruteClass(const char *p, char c)
3594 {
3595     const char *s;
3596 
3597     s = p+1;
3598 
3599     while(*s!=']')
3600 	if(*s++==c)
3601 	    return ajTrue;
3602 
3603     return ajFalse;
3604 }
3605 
3606 
3607 
3608 
3609 /* @funcstatic patBruteCompl **************************************************
3610 **
3611 ** Test if character matches one of a complement
3612 **
3613 ** @param [r] p [const char *] pattern complement e.g. {abc}
3614 ** @param [r] c [char] character to match
3615 **
3616 ** @return [AjBool] true if character not in complement
3617 **
3618 ** @release 1.0.0
3619 ******************************************************************************/
3620 
patBruteCompl(const char * p,char c)3621 static AjBool patBruteCompl(const char *p, char c)
3622 {
3623     const char *s;
3624 
3625     s = p+1;
3626 
3627     while(*s!='}')
3628 	if(*s++==c)
3629 	    return ajFalse;
3630 
3631     return ajTrue;
3632 }
3633 
3634 
3635 
3636 
3637 /* @funcstatic patBruteIsRange ************************************************
3638 **
3639 ** Check if character, class or complement has a range e.g. [abc](3,6)
3640 **
3641 ** @param [r] t [const char *] pattern
3642 ** @param [w] x [ajuint *] first range value
3643 ** @param [w] y [ajuint *] second range value
3644 **
3645 ** @return [AjBool] true if there is a range & sets x/y accordingly
3646 **
3647 ** @release 1.0.0
3648 ******************************************************************************/
3649 
patBruteIsRange(const char * t,ajuint * x,ajuint * y)3650 static AjBool patBruteIsRange(const char *t, ajuint *x, ajuint *y)
3651 {
3652     const char *u;
3653 
3654     if((*t >='A' && *t<='Z') || *t=='?')
3655     {
3656 	if(*(t+1)=='(')
3657 	{
3658 	    if(sscanf(t+2,"%u,%u",x,y)!=2)
3659 	    {
3660 		sscanf(t+2,"%u",x);
3661 		*y=*x;
3662 	    }
3663 
3664 	    return ajTrue;
3665 	}
3666 
3667 	return ajFalse;
3668     }
3669 
3670     u = t;
3671 
3672     if(*t=='[')
3673     {
3674 	while(*u!=']')
3675 	    ++u;
3676 
3677 	if(*(u+1)=='(')
3678 	{
3679 	    if(sscanf(u+2,"%u,%u",x,y)!=2)
3680 	    {
3681 		sscanf(u+2,"%u",x);
3682 		*y = *x;
3683 	    }
3684 
3685 	    return ajTrue;
3686 	}
3687 	return ajFalse;
3688     }
3689 
3690     while(*u!='}')
3691 	++u;
3692 
3693     if(*(u+1)=='(')
3694     {
3695 	if(sscanf(u+2,"%u,%u",x,y)!=2)
3696 	{
3697 	    sscanf(u+2,"%u",x);
3698 	    *y = *x;
3699 	}
3700 
3701 	return ajTrue;
3702     }
3703 
3704     return ajFalse;
3705 }
3706 
3707 
3708 
3709 
3710 /* @funcstatic patBruteCharMatch **********************************************
3711 **
3712 ** Check if text character matches a char/class or complement
3713 **
3714 ** @param [r] t [const char *] text
3715 ** @param [r] c [char] character to match
3716 **
3717 ** @return [AjBool] true if match
3718 **
3719 ** @release 1.0.0
3720 ******************************************************************************/
3721 
patBruteCharMatch(const char * t,char c)3722 static AjBool patBruteCharMatch(const char *t, char c)
3723 {
3724     if(!c)
3725 	return ajFalse;
3726 
3727     if(*t=='?')
3728 	return ajTrue;
3729 
3730     if(*t>='A' && *t<='Z')
3731     {
3732 	if(*t==c)
3733 	    return ajTrue;
3734 
3735 	return ajFalse;
3736     }
3737 
3738     if(*t=='[')
3739     {
3740 	if(patBruteClass(t,c))
3741 	    return ajTrue;
3742 
3743 	return ajFalse;
3744     }
3745 
3746     if(patBruteCompl(t,c))
3747 	return ajTrue;
3748 
3749     return ajFalse;
3750 }
3751 
3752 
3753 
3754 
3755 /* @funcstatic patBruteNextPatChar ********************************************
3756 **
3757 ** Get index of next char/class/complement
3758 **
3759 ** @param [r] t [const char *] text
3760 ** @param [r] ppos [ajuint] current index
3761 **
3762 ** @return [ajuint] next index
3763 **
3764 ** @release 1.0.0
3765 ******************************************************************************/
3766 
patBruteNextPatChar(const char * t,ajuint ppos)3767 static ajuint patBruteNextPatChar(const char *t, ajuint ppos)
3768 {
3769     if(t[ppos]=='{')
3770 	while(t[ppos]!='}')
3771 	    ++ppos;
3772 
3773     if(t[ppos]=='[')
3774 	while(t[ppos]!=']')
3775 	    ++ppos;
3776 
3777     ++ppos;
3778 
3779     if(t[ppos]=='(')
3780     {
3781 	while(t[ppos]!=')')
3782 	    ++ppos;
3783 
3784 	++ppos;
3785     }
3786 
3787     return ppos;
3788 }
3789 
3790 
3791 
3792 
3793 /* @funcstatic patOUBrute *****************************************************
3794 **
3795 ** Match pattern to current sequence position
3796 **
3797 ** @param [r] seq [const char *] text
3798 ** @param [r] pat [const char *] pattern
3799 ** @param [r] spos [ajuint] sequence index
3800 ** @param [r] ppos [ajuint] pattern index
3801 ** @param [r] mm [ajuint] mismatches left
3802 ** @param [r] omm [ajuint] allowed mismatches
3803 ** @param [r] level [ajuint] level of recursion
3804 ** @param [u] l [AjPList] list on which to push hits
3805 ** @param [r] carboxyl [AjBool] true if pattern must only match end of text
3806 ** @param [r] begin [ajuint] text offset
3807 ** @param [w] count [ajuint *] hit counter
3808 ** @param [r] name [const AjPStr] text entry name
3809 ** @param [r] st [ajuint] original text index
3810 **
3811 ** @return [AjBool] true if hit found (Care! Function recursive.)
3812 **
3813 ** @release 1.0.0
3814 ******************************************************************************/
3815 
patOUBrute(const char * seq,const char * pat,ajuint spos,ajuint ppos,ajuint mm,ajuint omm,ajuint level,AjPList l,AjBool carboxyl,ajuint begin,ajuint * count,const AjPStr name,ajuint st)3816 static AjBool patOUBrute(const char *seq, const char *pat, ajuint spos,
3817 			 ajuint ppos, ajuint mm,
3818 			 ajuint omm, ajuint level, AjPList l, AjBool carboxyl,
3819 			 ajuint begin, ajuint *count,
3820 			 const AjPStr name, ajuint st)
3821 {
3822     const char *t;
3823     ajuint x;
3824     ajuint y;
3825     ajuint i;
3826 
3827     if(level==1000)
3828 	ajFatal("Pattern too complex: 1000 levels of recursion");
3829 
3830 
3831     while(pat[ppos])
3832     {
3833 	t = pat+ppos;
3834 
3835 	if(!seq[spos])
3836 	    return ajFalse;
3837 
3838 	if(!patBruteIsRange(t,&x,&y))
3839 	{
3840 	    if(!patBruteCharMatch(t,seq[spos]))
3841 	    {
3842 		if(mm==0)
3843 		    return ajFalse;
3844 
3845 		--mm;
3846 	    }
3847 
3848 	    ppos = patBruteNextPatChar(pat,ppos);
3849 	    ++spos;
3850 	    continue;
3851 	}
3852 
3853 	for(i=0;i<x;++i)
3854 	{
3855 	    if(!patBruteCharMatch(t,seq[spos++]))
3856 	    {
3857 		if(mm==0)
3858 		    return ajFalse;
3859 
3860 		--mm;
3861 	    }
3862 
3863 	    if(!seq[spos-1])
3864 		return ajFalse;
3865 	}
3866 
3867 	ppos=patBruteNextPatChar(pat,ppos);
3868 
3869 	for(i=0;i<y-x;++i)
3870 	{
3871 	    patOUBrute(seq,pat,spos,ppos,mm,omm,level,l,carboxyl,
3872 		       begin,count,name,st);
3873 
3874 	    if(!patBruteCharMatch(t,seq[spos]))
3875 		return ajFalse;
3876 
3877 	    ++spos;
3878 	}
3879     }
3880 
3881     if(!carboxyl || (carboxyl && !seq[spos]))
3882     {
3883 	*count += 1;
3884 	embPatPushHit(l,name,st,spos-st,begin,omm-mm);
3885 
3886     }
3887 
3888     return ajTrue;
3889 }
3890 
3891 
3892 
3893 
3894 /* @func embPatBruteForce *****************************************************
3895 **
3896 ** Match pattern to a sequence
3897 **
3898 ** @param [r] seq [const AjPStr] text
3899 ** @param [r] pat [const AjPStr] pattern
3900 ** @param [r] amino [AjBool] true if must match start
3901 ** @param [r] carboxyl [AjBool] true if must match end
3902 ** @param [u] l [AjPList] list on which to push hits
3903 ** @param [r] begin [ajuint] text offset
3904 ** @param [r] mm [ajuint] allowed mismatches
3905 ** @param [r] name [const AjPStr] text entry name
3906 **
3907 ** @return [ajuint] number of hits
3908 **
3909 ** @release 1.0.0
3910 ******************************************************************************/
3911 
embPatBruteForce(const AjPStr seq,const AjPStr pat,AjBool amino,AjBool carboxyl,AjPList l,ajuint begin,ajuint mm,const AjPStr name)3912 ajuint embPatBruteForce(const AjPStr seq, const AjPStr pat,
3913 		       AjBool amino, AjBool carboxyl,
3914 		       AjPList l, ajuint begin, ajuint mm, const AjPStr name)
3915 {
3916     const char *s;
3917     const char *p;
3918     ajuint  sum;
3919     ajuint  len;
3920     ajuint  i;
3921     ajuint  count;
3922 
3923 /*
3924 //  ajDebug("embPatBruteForce amino:%B carboxyl:%B begin:%u mm:%u len:%u\n",
3925 //	    amino, carboxyl, begin, mm, ajStrGetLen(seq));
3926 */
3927 
3928     sum=count = 0;
3929     s = ajStrGetPtr(seq);
3930     p = ajStrGetPtr(pat);
3931 
3932     if(amino)
3933     {
3934 	patOUBrute(s,p,0,0,mm,mm,1,l,carboxyl,begin,&count,name,0);
3935 
3936 	return count;
3937     }
3938 
3939     len = (ajuint) strlen(s);
3940 
3941     for(i=0;i<len;++i)
3942     {
3943 	patOUBrute(s,p,i,0,mm,mm,1,l,carboxyl,begin,&count,name,i);
3944 	sum += count;
3945 	count = 0;
3946     }
3947 
3948     return sum;
3949 }
3950 
3951 
3952 
3953 
3954 /* @func embPatVariablePattern ************************************************
3955 **
3956 ** Match variable pattern against constant text.
3957 ** Used for matching many patterns against one sequence.
3958 **
3959 ** @param [r] pattern [const AjPStr] pattern to match
3960 ** @param [r] text [const AjPStr] text to scan
3961 ** @param [r] patname [const AjPStr] ID or AC of pattern
3962 ** @param [u] l [AjPList] list on which to push hits
3963 ** @param [r] mode [ajuint] 1 for protein, 0 for nucleic acid
3964 ** @param [r] mismatch [ajuint] allowed mismatches
3965 ** @param [r] begin [ajuint] text offset
3966 **
3967 ** @return [ajuint] number of hits
3968 **
3969 ** @release 1.0.0
3970 ** @@
3971 ******************************************************************************/
3972 
embPatVariablePattern(const AjPStr pattern,const AjPStr text,const AjPStr patname,AjPList l,ajuint mode,ajuint mismatch,ajuint begin)3973 ajuint embPatVariablePattern(const AjPStr pattern,
3974 			    const AjPStr text,
3975 			    const AjPStr patname, AjPList l, ajuint mode,
3976 			    ajuint mismatch, ajuint begin)
3977 {
3978     ajuint plen;
3979     ajuint slen = 0;
3980 
3981     AjBool amino;
3982     AjBool carboxyl;
3983     AjBool fclass;
3984     AjBool ajcompl;
3985     AjBool dontcare;
3986     AjBool range;
3987     ajint    *buf;			/* can be -1 */
3988     ajuint    hits;
3989     ajuint    m;
3990     ajuint    n;
3991     ajuint    i;
3992     ajuint    start;
3993     ajuint    end;
3994 
3995     AjOPatBYPNode off[AJALPHA];
3996 
3997     ajuint *sotable = NULL;
3998     ajuint solimit;
3999 
4000     EmbPPatMatch ppm;
4001     AjPStr regexp;
4002 
4003     ajuint **skipm;
4004 
4005     AjPStr cleanpattern = NULL;
4006 
4007     cleanpattern = ajStrNew();
4008 
4009     if(!embPatClassify(pattern,&cleanpattern,
4010 		       &amino,&carboxyl,&fclass,&ajcompl,&dontcare,
4011 		       &range,mode))
4012 	ajFatal("Illegal pattern");
4013 
4014     plen = ajStrGetLen(cleanpattern);
4015 
4016     /*
4017     **  Select type of search depending on pattern
4018     */
4019 
4020     if(!range && !dontcare && !fclass && !ajcompl && !mismatch && plen>4)
4021     {
4022 	/* Boyer Moore Horspool is the choice for long exact patterns */
4023 	plen = ajStrGetLen(cleanpattern);
4024 	AJCNEW(buf, AJALPHA);
4025 	embPatBMHInit(cleanpattern,plen,buf);
4026 	hits = embPatBMHSearch(text,cleanpattern,ajStrGetLen(text),
4027 			       ajStrGetLen(cleanpattern),buf,
4028 			       0,amino,carboxyl,l,
4029 			       patname,begin);
4030         AJFREE(buf);
4031         ajStrDel(&cleanpattern);
4032 
4033 	return hits;
4034     }
4035 
4036     else if(mismatch && !dontcare && !range && !fclass && !ajcompl)
4037     {
4038 	/* Baeza-Yates Perleberg for exact patterns plus don't cares */
4039 	plen = ajStrGetLen(cleanpattern);
4040 	AJCNEW(buf, AJALPHA);
4041 	embPatBYPInit(cleanpattern,plen,off,buf);
4042 	hits = embPatBYPSearch(text,patname,begin,
4043 			       ajStrGetLen(text),plen,mismatch,off,buf,l,
4044 			       amino,carboxyl,cleanpattern);
4045 	AJFREE(buf);
4046         ajStrDel(&cleanpattern);
4047 
4048 	return hits;
4049     }
4050 
4051 
4052     if(!range && !dontcare && !fclass && !ajcompl && !mismatch)
4053     {
4054 	/* Shift-OR is the choice for small exact patterns */
4055 	AJCNEW(sotable, AJALPHA2);
4056 	embPatSOInit(cleanpattern,sotable,&solimit);
4057 	hits = embPatSOSearch(text,patname,*ajStrGetPtr(cleanpattern),
4058 			      begin,plen,sotable,solimit,l,
4059 			      amino,carboxyl);
4060 	AJFREE(sotable);
4061         ajStrDel(&cleanpattern);
4062 
4063 	return hits;
4064     }
4065 
4066 
4067     /* Next get m, the real pattern length */
4068     /* May as well set up a class search as well tho' it needs 'free's later */
4069     AJCNEW(sotable, AJALPHA2);
4070     embPatBYGCInit(cleanpattern,&m,sotable,&solimit);
4071 
4072 
4073     if(!range && (fclass || ajcompl) && !mismatch && m <= AJWORD)
4074     {
4075 	/*
4076 	**  Baeza-Yates Gonnet for classes and don't cares.
4077 	**  No mismatches or ranges. Patterns less than (e.g.) 32
4078 	**  Uses Shift-OR search engine
4079         */
4080 	AJFREE(sotable);
4081 	AJCNEW(sotable, AJALPHA2);
4082 	embPatBYGCInit(cleanpattern,&m,sotable,&solimit);
4083 	plen = m;
4084 	hits = embPatBYGSearch(text,patname,
4085 			       begin,plen,sotable,solimit,l,
4086 			       amino,carboxyl);
4087 
4088 	AJFREE(sotable);
4089         ajStrDel(&cleanpattern);
4090 
4091 	return hits;
4092     }
4093 
4094 
4095 
4096     if(!mismatch && (range || m > AJWORD))
4097     {
4098 	/*
4099 	**  PCRE for ranges and simple classes longer than
4100         **  e.g. 32. No mismatches allowed
4101         **/
4102 	AJFREE(sotable);
4103 	regexp = embPatPrositeToRegExp(pattern); /* original pattern */
4104 	ppm = embPatMatchFind(regexp,text,amino,carboxyl);
4105 	n = embPatMatchGetNumber(ppm);
4106 
4107 	for(i=0;i<n;++i)
4108 	{
4109 	    start = embPatMatchGetStart(ppm,i);
4110 	    end   = embPatMatchGetEnd(ppm,i);
4111 
4112 	    if(amino && start)
4113 	    {
4114 		n = 0;
4115 		break;
4116 	    }
4117 
4118 	    if(!carboxyl || (carboxyl && start==slen-(end-start+1)))
4119 		embPatPushHit(l,patname,start,end-start+1,
4120 			      begin,0);
4121 	}
4122 
4123 	embPatMatchDel(&ppm);
4124 	hits = n;
4125 	ajStrDel(&regexp);
4126         ajStrDel(&cleanpattern);
4127 
4128 	return hits;
4129     }
4130 
4131 
4132     if(mismatch && !range && (fclass || ajcompl))
4133     {
4134 	/* Try a Tarhio-Ukkonen-Bleasby         */
4135 
4136 	AJFREE(sotable);
4137 
4138 	AJCNEW(skipm, m);
4139 
4140 	for(i=0;i<m;++i)
4141 	    AJCNEW(skipm[i], AJALPHA);
4142 
4143 	embPatTUBInit(cleanpattern,skipm,m,mismatch,plen);
4144 	hits = embPatTUBSearch(cleanpattern,text,ajStrGetLen(text),skipm,
4145 			      m,mismatch,begin,
4146 			      l,amino,carboxyl,patname,plen);
4147 	for(i=0;i<m;++i)
4148 	    AJFREE(skipm[i]);
4149 
4150 	AJFREE(skipm);
4151         ajStrDel(&cleanpattern);
4152 
4153 	return hits;
4154     }
4155 
4156     /*
4157     **  No choice left but to do a Bleasby recursive brute force
4158     */
4159     hits = embPatBruteForce(text,cleanpattern,amino,carboxyl,l,
4160 			    begin,mismatch,patname);
4161     AJFREE(sotable);
4162 
4163     ajStrDel(&cleanpattern);
4164 
4165     return hits;
4166 }
4167 
4168 
4169 
4170 
4171 /* @func embPatRestrictPreferred **********************************************
4172 **
4173 ** Replace RE names by the name of the prototype for that RE
4174 **
4175 ** @param [u] l [AjPList] list of EmbPMatMatch hits
4176 ** @param [r] t [const AjPTable] table from embossre.equ file
4177 **
4178 ** @return [void]
4179 **
4180 ** @release 2.7.0
4181 ** @@
4182 ******************************************************************************/
4183 
embPatRestrictPreferred(AjPList l,const AjPTable t)4184 void embPatRestrictPreferred(AjPList l, const AjPTable t)
4185 {
4186     AjIList iter   = NULL;
4187     EmbPMatMatch m = NULL;
4188     const AjPStr value   = NULL;
4189 
4190     iter = ajListIterNewread(l);
4191 
4192     while((m = (EmbPMatMatch)ajListIterGet(iter)))
4193     {
4194 	value = ajTableFetchS(t,m->cod);
4195 
4196 	if(value)
4197 	    ajStrAssignS(&m->cod,value);
4198     }
4199 
4200     ajListIterDel(&iter);
4201 
4202     return;
4203 }
4204 
4205 
4206 
4207 
4208 /* @func embPatRestrictRestrict ***********************************************
4209 **
4210 ** Cut down the number of restriction enzyme hits from embPatRestrictScan
4211 ** Notably double reporting of symmetric palindromes and reporting
4212 ** of isoschizomers. Also provides an optional alphabetic sort.
4213 **
4214 ** If we don't allow isoschizomers, then names of all isoschizomers
4215 ** found will be added to the string 'iso' in the returned list of
4216 ** EmbPMatMatch structures.  If 'isos' is AjTrue then they will be left alone.
4217 **
4218 ** @param [u] l [AjPList] list of hits from embPatRestrictScan
4219 ** @param [r] hits [ajuint] number of hits from embPatRestrictScan
4220 ** @param [r] isos [AjBool] Allow isoschizomers
4221 ** @param [r] alpha [AjBool] Sort alphabetically
4222 **
4223 ** @return [ajuint] adjusted number of hits
4224 **
4225 ** @release 1.0.0
4226 ** @@
4227 ******************************************************************************/
4228 
embPatRestrictRestrict(AjPList l,ajuint hits,AjBool isos,AjBool alpha)4229 ajuint embPatRestrictRestrict(AjPList l, ajuint hits, AjBool isos,
4230 	AjBool alpha)
4231 {
4232     EmbPMatMatch m  = NULL;
4233     EmbPMatMatch archetype = NULL; /* archetype of a set of isoschizomers */
4234     AjPStr  ps      = NULL;
4235     AjPList tlist   = NULL;
4236     AjPList newlist = NULL;
4237 
4238     ajuint i;
4239     ajuint v;
4240     ajuint tc   = 0;
4241     ajuint nc   = 0;
4242     ajint cut1 = 0;
4243     ajint cut2;
4244     ajint cut3;
4245     ajint cut4;
4246     ajuint pos  = 0;
4247 
4248     ps      = ajStrNew();
4249     tlist   = ajListNew();
4250     newlist = ajListNew();
4251 
4252     /* Remove Mirrors for each enzyme separately */
4253     ajListSort(l, &embPatRestrictNameCompare);
4254     tc = nc = 0;
4255 
4256     if(hits)
4257     {
4258 	ajListPop(l,(void **)&m);
4259 	ajStrAssignS(&ps,m->cod);
4260 	ajListPush(l,(void *)m);
4261     }
4262 
4263     while(ajListPop(l,(void **)&m))
4264     {
4265 	if(!ajStrCmpS(m->cod,ps))
4266 	{
4267 	    ajListPush(tlist,(void *)m);
4268 	    ++tc;
4269 	}
4270 	else
4271 	{
4272 	    ajStrAssignS(&ps,m->cod);
4273 	    ajListPush(l,(void *)m);
4274 	    ajListSort(tlist, &embPatRestrictStartCompare);
4275 	    ajListSort(tlist, &embPatRestrictCutCompare);
4276 	    cut1 = cut2 = INT_MAX;
4277 
4278 	    for(i=0;i<tc;++i)
4279 	    {
4280 		ajListPop(tlist,(void **)&m);
4281 
4282 		if(cut1!=m->cut1)
4283 		{
4284 		    cut1=m->cut1;
4285 		    ajListPush(newlist,(void *)m);
4286 		    ++nc;
4287 		}
4288 		else
4289 		    embMatMatchDel(&m);
4290 	    }
4291 
4292 	    tc = 0;
4293 	}
4294     }
4295 
4296     ajListSort(tlist, &embPatRestrictStartCompare);
4297     ajListSort(tlist, &embPatRestrictCutCompare);
4298 
4299     cut1 = cut2 = INT_MAX;
4300 
4301     for(i=0;i<tc;++i)
4302     {
4303 	ajListPop(tlist,(void **)&m);
4304 
4305 	if(cut1!=m->cut1)
4306 	{
4307 	    cut1=m->cut1;
4308 	    ajListPush(newlist,(void *)m);
4309 	    ++nc;
4310 	}
4311 	else
4312 	    embMatMatchDel(&m);
4313     }
4314 
4315     /* List l is currently empty - now reuse it  */
4316 
4317     hits = nc;
4318     ajListFree(&tlist);
4319     tlist = ajListNew();
4320 
4321 
4322     if(!isos)
4323     {
4324 	/* Keep only first alphabetical isoschizomer */
4325 	ajListSort(newlist, &embPatRestrictStartCompare);
4326 
4327 	if(hits)
4328 	{
4329 	    ajListPop(newlist,(void **)&m);
4330 	    pos = m->start;
4331 	    ajListPush(newlist,(void *)m);
4332 	}
4333 
4334 	tc = nc =0;
4335 
4336 	while(ajListPop(newlist,(void **)&m))
4337 	{
4338 	    if(pos==m->start)
4339 	    {
4340 		/*
4341 		** push groups of RE's that share the same start
4342 		** position onto tlist to
4343 		** be checked later to see if they are isoschizomers
4344 		*/
4345 		ajListPush(tlist,(void *)m);
4346 		++tc;
4347 	    }
4348 	    else
4349 	    {
4350 		pos = m->start;
4351 
4352 		ajListPush(newlist,(void *)m);
4353 
4354 		/*
4355 		** Now for list of all enz's which cut at same pos
4356 		**  sorted by Name
4357 		*/
4358 		ajListSort(tlist, &embPatRestrictNameCompare);
4359 
4360 		/*
4361 		** Now loop rejecting, for each left in the list,
4362 		** anything similar
4363 		*/
4364 
4365 		/*
4366 		** check for isoschizomers in the group sharing the
4367 		** previous 'pos' here
4368 		*/
4369 		while(tc)
4370 		{
4371 		    ajListPop(tlist,(void **)&m);
4372 		    cut1 = m->cut1;
4373 		    cut2 = m->cut2;
4374 		    cut3 = m->cut3;
4375 		    cut4 = m->cut4;
4376 		    ajStrAssignC(&ps,ajStrGetPtr(m->pat));
4377 
4378 		    /*
4379 		    ** first one of the group is not an isoschizomer,
4380 		    ** by definition, so return it
4381 		    */
4382 		    ajListPush(l,(void *)m);
4383 		    archetype = m;
4384 		    ++nc;
4385 		    --tc;
4386 
4387 
4388 		    for(i=0,v=0;i<tc;++i)
4389 		    {
4390 			ajListPop(tlist,(void **)&m);
4391 
4392 			if(m->cut1!=cut1 || m->cut2!=cut2 || m->cut3!=cut3 ||
4393 			   m->cut4!=cut4 || ajStrCmpS(ps,m->pat))
4394 			{
4395 			    ajListPushAppend(tlist,(void *)m);
4396 			    ++v;
4397 			}
4398 
4399 			else
4400 			{
4401 			    /*
4402 			    ** same cut sites and pattern at the RE just
4403 			    ** pushed onto 'l', so is an
4404 			    **isoschizomer - add its name to the
4405 			    ** archetype's list of isoschizomers and
4406 			    ** delete
4407 			    */
4408 			    if(ajStrGetLen(archetype->iso) > 0)
4409 			        ajStrAppendC(&archetype->iso, ",");
4410 
4411 			    ajStrAppendS(&archetype->iso, m->cod);
4412 			    embMatMatchDel(&m);
4413 			}
4414 		    }
4415 
4416 		    tc = v;
4417 		}
4418 	    }
4419 	}
4420 
4421 
4422 
4423 	ajListSort(tlist, &embPatRestrictNameCompare);
4424 
4425 	while(tc)
4426 	{
4427 	    ajListPop(tlist,(void **)&m);
4428 	    cut1 = m->cut1;
4429 	    cut2 = m->cut2;
4430 	    cut3 = m->cut3;
4431 	    cut4 = m->cut4;
4432 	    ajStrAssignC(&ps,ajStrGetPtr(m->pat));
4433 	    /*
4434 	    ** first one of the group is not an isoschizomer,
4435 	    ** by definition, so return it
4436 	    */
4437 	    ajListPush(l,(void *)m);
4438 	    archetype = m;
4439 	    ++nc;
4440 	    --tc;
4441 
4442 
4443 	    for(i=0,v=0;i<tc;++i)
4444 	    {
4445 		ajListPop(tlist,(void **)&m);
4446 
4447 		if(m->cut1!=cut1 || m->cut2!=cut2 || m->cut3!=cut3 ||
4448 		   m->cut4!=cut4 || ajStrCmpS(ps,m->pat))
4449 		{
4450 		    ajListPushAppend(tlist,(void *)m);
4451 		    ++v;
4452 		}
4453 		else
4454 		{
4455 		    /*
4456 		    ** same cut sites and pattern as the RE
4457 		    ** just pushed onto 'l', so is an
4458 		    ** isoschizomer - add its name to the archetype's
4459 		    ** list of isoschizomers and
4460 		    ** delete
4461 		    */
4462 		    if(ajStrGetLen(archetype->iso) > 0)
4463 		        ajStrAppendC(&archetype->iso, ",");
4464 
4465 		    ajStrAppendS(&archetype->iso, m->cod);
4466 		    embMatMatchDel(&m);
4467 		}
4468 	    }
4469 
4470 	    tc = v;
4471 	}
4472 
4473 	hits = nc;
4474     }
4475     else
4476     {
4477 	while(ajListPop(newlist,(void **)&m))
4478 	{
4479 	    ajListPush(l, (void*) m);
4480 	}
4481 
4482 	ajListFree(&newlist);
4483     }
4484 
4485     /* Finally sort on position of recognition sequence and print */
4486     ajListSort(l, &embPatRestrictStartCompare);
4487 
4488     if(alpha)
4489 	ajListSortTwo(l,
4490 		      &embPatRestrictNameCompare,
4491 		      &embPatRestrictStartCompare);
4492 
4493     ajStrDel(&ps);
4494     ajListFree(&tlist);
4495     ajListFree(&newlist);
4496 
4497     return hits;
4498 }
4499 
4500 
4501 
4502 
4503 /* @func embPatRestrictStartCompare *******************************************
4504 **
4505 ** Sort restriction site hits on the basis of start position
4506 **
4507 ** @param [r] a [const void *] First EmbPMatMatch hit
4508 ** @param [r] b [const void *] Second EmbPMatMatch hit
4509 **
4510 ** @return [ajint] 0 if a and b are equal
4511 **               -ve if a is less than b,
4512 **               +ve if a is greater than b
4513 **
4514 ** @release 1.13.0
4515 ******************************************************************************/
4516 
embPatRestrictStartCompare(const void * a,const void * b)4517 ajint embPatRestrictStartCompare(const void *a, const void *b)
4518 {
4519     ajuint astart = (*(EmbPMatMatch const *)a)->start;
4520     ajuint bstart = (*(EmbPMatMatch const *)b)->start;
4521 
4522     if(!(*(EmbPMatMatch const *)a)->forward)
4523         astart = (*(EmbPMatMatch const *)a)->end;
4524     if(!(*(EmbPMatMatch const *)b)->forward)
4525         bstart = (*(EmbPMatMatch const *)b)->end;
4526 
4527     return astart - bstart;
4528 }
4529 
4530 
4531 
4532 
4533 /* @func embPatRestrictCutCompare *********************************************
4534 **
4535 ** Sort restriction site hits on the basis of cut position
4536 **
4537 ** @param [r] a [const void *] First EmbPMatMatch hit
4538 ** @param [r] b [const void *] Second EmbPMatMatch hit
4539 **
4540 ** @return [ajint] 0 if a and b are equal
4541 **               -ve if a is less than b,
4542 **               +ve if a is greater than b
4543 **
4544 ** @release 1.13.0
4545 ******************************************************************************/
4546 
embPatRestrictCutCompare(const void * a,const void * b)4547 ajint embPatRestrictCutCompare(const void *a, const void *b)
4548 {
4549     return (*(EmbPMatMatch const *)a)->cut1 - (*(EmbPMatMatch const *)b)->cut1;
4550 }
4551 
4552 
4553 
4554 
4555 /* @func embPatRestrictNameCompare ********************************************
4556 **
4557 ** Sort restriction site hits on the basis of enzyme name
4558 **
4559 ** @param [r] a [const void *] First EmbPMatMatch hit
4560 ** @param [r] b [const void *] Second EmbPMatMatch hit
4561 **
4562 ** @return [ajint] 0 if a and b are equal
4563 **               -ve if a is less than b,
4564 **               +ve if a is greater than b
4565 **
4566 ** @release 1.13.0
4567 ******************************************************************************/
4568 
embPatRestrictNameCompare(const void * a,const void * b)4569 ajint embPatRestrictNameCompare(const void *a, const void *b)
4570 {
4571     return strcmp(ajStrGetPtr((*(EmbPMatMatch const *)a)->cod),
4572 		  ajStrGetPtr((*(EmbPMatMatch const *)b)->cod));
4573 }
4574 
4575 
4576 
4577 
4578 /* @func embPatRestrictMatch **************************************************
4579 **
4580 ** Main Restriction function. Scans sequence and rejects unwanted
4581 ** cutters
4582 **
4583 ** @param [r] seq [const AjPSeq] sequence
4584 ** @param [r] begin [ajuint] start position in sequence
4585 ** @param [r] end [ajuint] end position in sequence
4586 ** @param [u] enzfile [AjPFile] file pointer to .enz file
4587 ** @param [u] methfile [AjPFile] file pointer to methylation data file
4588 ** @param [r] enzymes [const AjPStr] comma separated list of REs
4589 **                                  or NULL for all
4590 ** @param [r] sitelen [ajuint] minimum length of recognition site
4591 ** @param [r] plasmid [AjBool] Circular DNA
4592 ** @param [r] ambiguity [AjBool] Allow ambiguities
4593 ** @param [r] min [ajuint] minimum number of true cuts
4594 ** @param [r] max [ajuint] maximum number of true cuts
4595 ** @param [r] blunt [AjBool] Allow blunt cutters
4596 ** @param [r] sticky [AjBool] Allow sticky cutters
4597 ** @param [r] commercial [AjBool] Allow Only report REs with a supplier
4598 ** @param [r] methyl [AjBool] Mark methylated bases as 'N'
4599 ** @param [u] l [AjPList] list for (EmbPMatMatch) hits
4600 **
4601 ** @return [ajuint] number of hits
4602 **
4603 ** @release 1.0.0
4604 ** @@
4605 ******************************************************************************/
4606 
embPatRestrictMatch(const AjPSeq seq,ajuint begin,ajuint end,AjPFile enzfile,AjPFile methfile,const AjPStr enzymes,ajuint sitelen,AjBool plasmid,AjBool ambiguity,ajuint min,ajuint max,AjBool blunt,AjBool sticky,AjBool commercial,AjBool methyl,AjPList l)4607 ajuint embPatRestrictMatch(const AjPSeq seq, ajuint begin, ajuint end,
4608 			   AjPFile enzfile, AjPFile methfile,
4609 			   const AjPStr enzymes, ajuint sitelen,
4610 			   AjBool plasmid, AjBool ambiguity,
4611 			   ajuint min, ajuint max,
4612 			   AjBool blunt, AjBool sticky, AjBool commercial,
4613 			   AjBool methyl, AjPList l)
4614 {
4615     AjBool hassup;
4616     AjBool isall = ajTrue;
4617     AjPStr  name;
4618     const AjPStr  strand;
4619     AjPStr  substr;
4620     AjPStr  revstr;
4621     AjPStr  binstr;
4622     AjPStr  binrev;
4623     AjPStr  *ea;
4624     AjPStr  tmpstr = NULL;
4625     AjPList methlist = NULL;
4626 
4627     EmbPPatRestrict enz;
4628     MethPData md = NULL;
4629 
4630 
4631     ajuint len;
4632     ajuint plen;
4633     ajuint i;
4634     ajuint hits;
4635     ajuint ne = 0;
4636 
4637     const char *cp;
4638     char *p;
4639     char *q;
4640 
4641     if(enzymes)
4642     {
4643         ne = ajArrCommaList(enzymes,&ea);
4644 
4645         if(!ne)
4646             return 0;
4647     }
4648 
4649     name   = ajStrNew();
4650     substr = ajStrNew();
4651     revstr = ajStrNew();
4652     binstr = ajStrNew();
4653     binrev = ajStrNew();
4654 
4655     enz = embPatRestrictNew();
4656 
4657     if(methyl)
4658         methlist = patRestrictReadMethyl(methfile);
4659 
4660     if(!enzymes)
4661 	isall = ajTrue;
4662     else
4663     {
4664 	for(i=0;i<ne;++i)
4665 	{
4666 	    ajStrRemoveWhite(&ea[i]);
4667 	    ajStrFmtUpper(&ea[i]);
4668 	}
4669 
4670 	if(ajStrMatchCaseC(ea[0],"all"))
4671 	    isall = ajTrue;
4672 	else
4673 	    isall = ajFalse;
4674     }
4675 
4676 
4677 
4678     ajFileSeek(enzfile,0L,0);
4679     ajStrAssignS(&name,ajSeqGetNameS(seq));
4680     strand = ajSeqGetSeqS(seq);
4681     ajStrAssignSubS(&substr,strand,begin-1,end-1);
4682     ajStrFmtUpper(&substr);
4683     len = plen = ajStrGetLen(substr);
4684     ajStrAssignSubS(&revstr,strand,begin-1,end-1);
4685     ajStrFmtUpper(&revstr);
4686     ajSeqstrReverse(&revstr);
4687 
4688     if(methyl)
4689     {
4690         patRestrictMethylMod(&substr,&revstr,methlist);
4691         patRestrictMethylMod(&revstr,&substr,methlist);
4692     }
4693 
4694 
4695     ajStrAssignS(&binstr,substr);
4696     ajStrAssignS(&binrev,revstr);
4697 
4698     if(plasmid)
4699     {
4700 	plen <<= 1;
4701 	tmpstr = ajStrNew();
4702 	ajStrAssignS(&tmpstr,substr);
4703 	ajStrAppendS(&tmpstr,substr);
4704 	ajStrAssignS(&substr,tmpstr);
4705 
4706 	ajStrAssignS(&tmpstr,binstr);
4707 	ajStrAppendS(&tmpstr,binstr);
4708 	ajStrAssignS(&binstr,tmpstr);
4709 
4710 	ajStrAssignS(&tmpstr,revstr);
4711 	ajStrAppendS(&tmpstr,revstr);
4712 	ajStrAssignS(&revstr,tmpstr);
4713 
4714 	ajStrAssignS(&tmpstr,binrev);
4715 	ajStrAppendS(&tmpstr,binrev);
4716 	ajStrAssignS(&binrev,tmpstr);
4717 
4718 	ajStrDel(&tmpstr);
4719     }
4720 
4721     q = ajStrGetuniquePtr(&binrev);
4722     p = ajStrGetuniquePtr(&binstr);
4723 
4724     for(i=0;i<plen;++i,++p,++q)
4725     {
4726 	*p = (char)ajBaseAlphaToBin(*p);
4727 	*q = (char)ajBaseAlphaToBin(*q);
4728     }
4729 
4730 
4731     hits = 0;
4732 
4733     while(!ajFileIsEof(enzfile))
4734     {
4735         if(!embPatRestrictReadEntry(enz,enzfile))
4736 	    continue;
4737 
4738 	if(!enz->ncuts)
4739 	    continue;
4740 
4741 	if(enz->len < sitelen)
4742 	    continue;
4743 
4744 	if(!blunt && enz->blunt)
4745 	    continue;
4746 
4747 	if(!sticky && !enz->blunt)
4748 	    continue;
4749 
4750 	cp = ajStrGetPtr(enz->pat);
4751 
4752 	if(*cp>='A' && *cp<='Z')
4753 	    hassup = ajTrue;
4754 	else
4755 	    hassup = ajFalse;
4756 
4757 	if(!hassup && isall && commercial)
4758 	    continue;
4759 
4760 	ajStrFmtUpper(&enz->pat);
4761 
4762 	if(!isall)
4763 	{
4764 	    for(i=0;i<ne;++i)
4765 		if(ajStrMatchCaseS(ea[i],enz->cod))
4766 		    break;
4767 
4768 	    if(i==ne)
4769 		continue;
4770 	}
4771 
4772 	hits += embPatRestrictScan(enz,substr,binstr,revstr,binrev,len,
4773 				   ambiguity,plasmid,min,max,begin,l);
4774     }
4775 
4776 
4777 
4778     for(i=0;i<ne;++i)
4779 	ajStrDel(&ea[i]);
4780 
4781     if(ne)
4782 	AJFREE(ea);
4783 
4784     if(methyl)
4785     {
4786         while(ajListPop(methlist,(void **)&md))
4787         {
4788             ajStrDel(&md->Name);
4789             ajStrDel(&md->Site);
4790             ajStrDel(&md->Replace);
4791             AJFREE(md);
4792         }
4793 
4794         ajListFree(&methlist);
4795     }
4796 
4797     ajStrDel(&name);
4798     ajStrDel(&substr);
4799     ajStrDel(&revstr);
4800     ajStrDel(&binstr);
4801     ajStrDel(&binrev);
4802     embPatRestrictDel(&enz);
4803 
4804     return hits;
4805 }
4806 
4807 
4808 
4809 
4810 /* @func embPatGetType ********************************************************
4811 **
4812 ** Return the type of a pattern
4813 **
4814 ** @param [r] pattern [const AjPStr] original pattern
4815 ** @param [w] cleanpat [AjPStr *] cleaned pattern
4816 ** @param [r] mismatch [ajuint] number of allowed mismatches
4817 ** @param [r] protein [AjBool] true if protein
4818 ** @param [w] m [ajuint*] real length of pattern
4819 ** @param [w] left [AjBool*] must match left begin
4820 ** @param [w] right [AjBool*] must match right
4821 **
4822 ** @return [ajuint] type of pattern
4823 **
4824 ** @release 1.0.0
4825 ** @@
4826 ******************************************************************************/
4827 
embPatGetType(const AjPStr pattern,AjPStr * cleanpat,ajuint mismatch,AjBool protein,ajuint * m,AjBool * left,AjBool * right)4828 ajuint embPatGetType(const AjPStr pattern, AjPStr *cleanpat,
4829 		    ajuint mismatch, AjBool protein,
4830 		    ajuint *m,
4831 		    AjBool *left, AjBool *right)
4832 {
4833     AjBool fclass;
4834     AjBool compl;
4835     AjBool dontcare;
4836     AjBool range;
4837     ajuint plen;
4838     ajuint type;
4839     const char *p;
4840 
4841     ajStrAssignS(cleanpat,pattern);
4842     if(!embPatClassify(pattern,cleanpat,
4843 		       left,right,&fclass,&compl,&dontcare,
4844 		       &range,protein))
4845 	return 0;
4846 
4847     /* Get real pattern length */
4848     p = ajStrGetPtr(*cleanpat);
4849     *m = 0;
4850 
4851     while(*p)
4852     {
4853 	++p;
4854 	++*m;
4855 
4856 	if(*p=='{')
4857 	    while(*p && *p!='}')
4858 		++p;
4859 	else if(*p=='[')
4860 	    while(*p && *p!=']')
4861 		++p;
4862 	else if(*p && *p=='(')
4863         {
4864 	    while(*p!=')')
4865 		++p;
4866             --*m;
4867         }
4868     }
4869 
4870 
4871     plen = ajStrGetLen(*cleanpat);
4872     type = 0;
4873 
4874     /*
4875     **  Select type of search depending on pattern
4876     */
4877 
4878     if(!range && !dontcare && !fclass && !compl && !mismatch && plen>AJWORD)
4879     {
4880 	/* Boyer Moore Horspool is the choice for long exact patterns */
4881 	type = 1;
4882     }
4883     else if(mismatch && !dontcare && !range && !fclass && !compl &&
4884 	    plen<AJALPHA/2)
4885     {
4886 	/* Baeza-Yates Perleberg for exact patterns plus mismatches */
4887 	type = 2;
4888     }
4889     else if(!range && !dontcare && !fclass && !compl && !mismatch &&
4890 	    plen<=AJWORD)
4891     {
4892 	/* Shift-OR is the choice for small exact patterns */
4893 	type = 3;
4894     }
4895     else if(!range && (fclass || compl || dontcare) && !mismatch && *m<=AJWORD)
4896     {
4897 	/*
4898 	 *  Baeza-Yates Gonnet for classes and dontcares.
4899 	 *  No mismatches or ranges. Patterns less than (e.g.) 32
4900          */
4901 	type = 4;
4902     }
4903     else if(!mismatch && (range || *m>AJWORD))
4904     {
4905 	    type = 5;
4906             /* note:
4907             ** there was a test here for '?' in the original pattern
4908             ** but that is an illegal pattern character
4909             */
4910     }
4911     else if(mismatch && !range && (fclass || compl))
4912     {
4913 	/* Try a Tarhio-Ukkonen-Bleasby         */
4914 	type = 6;
4915     }
4916     else if((mismatch && range) || !type)
4917     {
4918 	/*
4919         **  No choice left but to do a Bleasby recursive brute force
4920         */
4921 	type = 7;
4922     }
4923 
4924     ajDebug("embPatType %d '%S'\n", type, pattern);
4925 
4926     if (!ajStrMatchCaseS(pattern, *cleanpat))
4927 	ajDebug("embPatType cleaned to '%S'\n", *cleanpat);
4928 
4929     return type;
4930 }
4931 
4932 
4933 
4934 
4935 /* @func embPatCompile ********************************************************
4936 **
4937 ** Compile a pattern classified by embPatGetType
4938 **
4939 ** @param [r] type [ajuint] pattern type
4940 ** @param [r] pattern [const AjPStr] original pattern
4941 ** @param [w] plen [ajuint*] pattern length
4942 ** @param [w] buf [ajint**] buffer for BMH and BYP search (can be -1)
4943 ** @param [w] off [EmbPPatBYPNode] offset buffer for B-Y/P search
4944 ** @param [w] sotable [ajuint**] buffer for SHIFT-OR
4945 ** @param [w] solimit [ajuint*] limit for SHIFT-OR
4946 ** @param [w] m [ajuint*] real length of pattern (from embPatGetType)
4947 ** @param [w] regexp [AjPStr *] PCRE regexp string
4948 ** @param [w] skipm [ajuint***] skip buffer for Tarhio-Ukkonen
4949 ** @param [r] mismatch [ajuint] number of allowed mismatches
4950 **
4951 ** @return [void]
4952 **
4953 ** @release 1.0.0
4954 ** @@
4955 ******************************************************************************/
4956 
embPatCompile(ajuint type,const AjPStr pattern,ajuint * plen,ajint ** buf,EmbPPatBYPNode off,ajuint ** sotable,ajuint * solimit,ajuint * m,AjPStr * regexp,ajuint *** skipm,ajuint mismatch)4957 void embPatCompile(ajuint type, const AjPStr pattern, ajuint* plen,
4958 		   ajint** buf, EmbPPatBYPNode off, ajuint** sotable,
4959 		   ajuint* solimit, ajuint* m, AjPStr* regexp, ajuint*** skipm,
4960 		   ajuint mismatch)
4961 {
4962     ajuint i = 0;
4963 
4964     *plen = ajStrGetLen(pattern);
4965 
4966     switch(type)
4967     {
4968         case 1:
4969             AJCNEW(*buf,AJALPHA);
4970             embPatBMHInit(pattern,*plen,*buf);
4971             break;
4972         case 2:
4973             AJCNEW(*buf,AJALPHA);
4974             embPatBYPInit(pattern,*plen,off,*buf);
4975             break;
4976         case 3:
4977             AJCNEW(*sotable,AJALPHA2);
4978             embPatSOInit(pattern,*sotable,solimit);
4979             *m = *plen;
4980             break;
4981         case 4:
4982             AJCNEW(*sotable,AJALPHA2);
4983             embPatBYGCInit(pattern,m,*sotable,solimit);
4984             break;
4985         case 5:
4986             *regexp = embPatPrositeToRegExp(pattern);
4987             break;
4988         case 6:
4989             AJCNEW(*skipm,*m);
4990             for(i=0;i<*m;++i)
4991                 AJCNEW((*skipm)[i],AJALPHA);
4992             embPatTUBInit(pattern,*skipm,*m,mismatch,*plen);
4993             break;
4994         case 7:
4995             break;
4996         default:
4997             ajFatal("embPatCompile: Cannot compile pattern");
4998             break;
4999     }
5000 
5001     return;
5002 }
5003 
5004 
5005 
5006 
5007 /* @func embPatFuzzSearch *****************************************************
5008 **
5009 ** Fuzzy search after embPatGetType and embPatCompile
5010 **
5011 ** @param [r] type [ajuint] pattern type
5012 ** @param [r] begin [ajuint] text displacement (1=start)
5013 ** @param [r] pattern [const AjPStr] processed pattern
5014 ** @param [r] name [const AjPStr] name associated with text
5015 ** @param [r] text [const AjPStr] text
5016 ** @param [u] l [AjPList] list to push hits onto
5017 ** @param [r] plen [ajuint] pattern length
5018 ** @param [r] mismatch [ajuint] number of allowed mismatches
5019 ** @param [r] left [AjBool] must match left
5020 ** @param [r] right [AjBool] must match right
5021 ** @param [u] buf [ajint*] buffer for BMH search,
5022 **                         or mismatch count for BYP search
5023 ** @param [u] off [EmbPPatBYPNode] offset buffer for B-Y/P search
5024 ** @param [r] sotable [const ajuint*] buffer for SHIFT-OR
5025 ** @param [r] solimit [ajuint] limit for SHIFT-OR
5026 ** @param [r] regexp [const AjPStr] PCRE regexp string
5027 ** @param [r] skipm [ajuint* const *] skip buffer for Tarhio-Ukkonen-Bleasby
5028 ** @param [w] hits [ajuint*] number of hits
5029 ** @param [r] m [ajuint] real pat length (from embPatGetType/embPatCompile)
5030 ** @param [w] tidy [const void**] data to free
5031 **
5032 ** @return [void]
5033 **
5034 ** @release 1.0.0
5035 ** @@
5036 ******************************************************************************/
5037 
embPatFuzzSearch(ajuint type,ajuint begin,const AjPStr pattern,const AjPStr name,const AjPStr text,AjPList l,ajuint plen,ajuint mismatch,AjBool left,AjBool right,ajint * buf,EmbPPatBYPNode off,const ajuint * sotable,ajuint solimit,const AjPStr regexp,ajuint * const * skipm,ajuint * hits,ajuint m,const void ** tidy)5038 void embPatFuzzSearch(ajuint type, ajuint begin, const AjPStr pattern,
5039 		      const AjPStr name, const AjPStr text, AjPList l,
5040 		      ajuint plen, ajuint mismatch, AjBool left, AjBool right,
5041 		      ajint *buf, EmbPPatBYPNode off, const ajuint *sotable,
5042 		      ajuint solimit, const AjPStr regexp,
5043 		      ajuint * const *skipm,
5044 		      ajuint *hits, ajuint m, const void **tidy)
5045 {
5046     EmbPPatMatch ppm;
5047     ajuint n;
5048     ajuint i;
5049     ajuint start;
5050     ajuint end;
5051     ajuint count = 0;
5052 
5053     ajDebug("embPatFuzzSearch type %d pattern: '%S'\n", type, pattern);
5054 
5055     switch(type)
5056     {
5057     case 1:
5058 	*hits = embPatBMHSearch(text,pattern,ajStrGetLen(text),
5059 			      ajStrGetLen(pattern),buf,0,left,right,l,
5060 			      name,begin);
5061 	*tidy = (const void *) buf;
5062 	break;
5063 
5064     case 2:
5065 	for(i=0;i<AJALPHA;++i)
5066 	    buf[i] = plen;
5067 
5068 	for(i=0;i<plen;++i)
5069 	    buf[i] = AJALPHA;
5070 	*hits=embPatBYPSearch(text,name,begin,
5071 			      ajStrGetLen(text),plen,mismatch,off,buf,l,
5072 			      left,right,pattern);
5073 	*tidy = (const void *) buf;
5074 	break;
5075 
5076     case 3:
5077 	*hits = embPatSOSearch(text,name,*ajStrGetPtr(pattern),
5078 			       begin,plen,sotable,solimit,l,
5079 			     left,right);
5080 	*tidy = (const void *) sotable;
5081 	break;
5082 
5083     case 4:
5084 	plen  = m;
5085 	*hits = embPatBYGSearch(text,name,
5086 				begin,plen,sotable,solimit,l,
5087 				left,right);
5088 	*tidy = (const void *) sotable;
5089 	break;
5090 
5091     case 5:
5092 	ppm = embPatMatchFind(regexp,text, left, right);
5093 	n   = embPatMatchGetNumber(ppm);
5094 	count = n;
5095 
5096 	for(i=0;i<n;++i)
5097 	{
5098 	    start = embPatMatchGetStart(ppm,i);
5099 	    end   = embPatMatchGetEnd(ppm,i);
5100 /*
5101 //	    ajDebug("embPatFuzzSearch embPatMatchFind left:%B start:%d "
5102 //                    "count:%d\n",
5103 //		    left, start, count);
5104 */
5105 
5106 	    if(left && start)
5107 	    {
5108 		--count;
5109 		continue;
5110 	    }
5111 
5112 	    if(right && start!=ajStrGetLen(text)-(end-start+1))
5113 	    {
5114 		--count;
5115 		continue;
5116 	    }
5117 
5118 	    if(!right || (right && start==ajStrGetLen(text)-
5119 			     (end-start+1)))
5120 	    {
5121 /*
5122 //                ajDebug("embPatFuzzSearch type 5 push hit %B..%B %d..%d\n",
5123 //			left, right, start, end);
5124 */
5125 
5126 		embPatPushHit(l,name,start,end-start+1,
5127 			      begin,0);
5128 	    }
5129 /*
5130 //	    else
5131 //	    {
5132 //		ajDebug("embPatFuzzSearch type 5 skip hit %B..%B %d..%d\n",
5133 //			left, right, start, end);
5134 //	    }
5135 */
5136 	}
5137 
5138 	embPatMatchDel(&ppm);
5139 	*hits = count;
5140 	break;
5141 
5142     case 6:
5143 	*hits = embPatTUBSearch(pattern,text,ajStrGetLen(text),skipm,
5144 				m,mismatch,begin,
5145 				l,left,right,name,plen);
5146 	*tidy = (const void *) skipm;
5147 	break;
5148 
5149     case 7:
5150 	*hits = embPatBruteForce(text,pattern,left,right,l,
5151 				 begin,mismatch,name);
5152 	break;
5153 
5154     default:
5155 	ajFatal("Can't handle pattern type %S\n",pattern);
5156 	break;
5157     }
5158 
5159     ajDebug("embPatFuzzSearch hits: %d\n", *hits);
5160 
5161     return;
5162 }
5163 
5164 
5165 
5166 
5167 /* @func embPatFuzzSearchAll **************************************************
5168 **
5169 ** Fuzzy search after embPatGetType and embPatCompile
5170 **
5171 ** @param [r] type [ajuint] pattern type
5172 ** @param [r] begin [ajuint] text displacement (1=start)
5173 ** @param [r] pattern [const AjPStr] processed pattern
5174 ** @param [r] name [const AjPStr] name associated with text
5175 ** @param [r] text [const AjPStr] text
5176 ** @param [u] l [AjPList] list to push hits onto
5177 ** @param [r] plen [ajuint] pattern length
5178 ** @param [r] mismatch [ajuint] number of allowed mismatches
5179 ** @param [r] left [AjBool] must match left
5180 ** @param [r] right [AjBool] must match right
5181 ** @param [u] buf [ajint*] buffer for BMH search
5182 ** @param [u] off [EmbPPatBYPNode] offset buffer for B-Y/P search
5183 ** @param [r] sotable [const ajuint*] buffer for SHIFT-OR
5184 ** @param [r] solimit [ajuint] limit for SHIFT-OR
5185 ** @param [r] regexp [const AjPStr] PCRE regexp string
5186 ** @param [r] skipm [ajuint* const *] skip buffer for Tarhio-Ukkonen-Bleasby
5187 ** @param [w] hits [ajuint*] number of hits
5188 ** @param [r] m [ajuint] real pat length (from embPatGetType/embPatCompile)
5189 ** @param [w] tidy [const void**] data to free
5190 **
5191 ** @return [void]
5192 **
5193 ** @release 1.0.0
5194 ** @@
5195 ******************************************************************************/
5196 
embPatFuzzSearchAll(ajuint type,ajuint begin,const AjPStr pattern,const AjPStr name,const AjPStr text,AjPList l,ajuint plen,ajuint mismatch,AjBool left,AjBool right,ajint * buf,EmbPPatBYPNode off,const ajuint * sotable,ajuint solimit,const AjPStr regexp,ajuint * const * skipm,ajuint * hits,ajuint m,const void ** tidy)5197 void embPatFuzzSearchAll(ajuint type, ajuint begin, const AjPStr pattern,
5198                          const AjPStr name, const AjPStr text, AjPList l,
5199                          ajuint plen, ajuint mismatch,
5200                          AjBool left, AjBool right,
5201                          ajint *buf, EmbPPatBYPNode off, const ajuint *sotable,
5202                          ajuint solimit, const AjPStr regexp,
5203                          ajuint * const *skipm,
5204                          ajuint *hits, ajuint m, const void **tidy)
5205 {
5206     EmbPPatMatch ppm;
5207     ajuint n;
5208     ajuint i;
5209     ajuint start;
5210     ajuint end;
5211     ajuint count = 0;
5212 
5213     ajDebug("embPatFuzzSearchAll type %d pattern: '%S'\n", type, pattern);
5214 
5215     switch(type)
5216     {
5217     case 1:
5218 	*hits = embPatBMHSearch(text,pattern,ajStrGetLen(text),
5219 			      ajStrGetLen(pattern),buf,0,left,right,l,
5220 			      name,begin);
5221 	*tidy = (const void *) buf;
5222 	break;
5223 
5224     case 2:
5225 	for(i=0;i<AJALPHA;++i)
5226 	    buf[i] = plen;
5227 
5228 	for(i=0;i<plen;++i)
5229 	    buf[i] = AJALPHA;
5230 	*hits=embPatBYPSearch(text,name,begin,
5231 			      ajStrGetLen(text),plen,mismatch,off,buf,l,
5232 			      left,right,pattern);
5233 	*tidy = (const void *) buf;
5234 	break;
5235 
5236     case 3:
5237 	*hits = embPatSOSearch(text,name,*ajStrGetPtr(pattern),
5238 			       begin,plen,sotable,solimit,l,
5239 			     left,right);
5240 	*tidy = (const void *) sotable;
5241 	break;
5242 
5243     case 4:
5244 	plen  = m;
5245 	*hits = embPatBYGSearch(text,name,
5246 				begin,plen,sotable,solimit,l,
5247 				left,right);
5248 	*tidy = (const void *) sotable;
5249 	break;
5250 
5251     case 5:
5252 	ppm = embPatMatchFindAll(regexp,text, left, right);
5253 	n   = embPatMatchGetNumber(ppm);
5254 	count = n;
5255 
5256 	for(i=0;i<n;++i)
5257 	{
5258 	    start = embPatMatchGetStart(ppm,i);
5259 	    end   = embPatMatchGetEnd(ppm,i);
5260 /*
5261 //	    ajDebug("embPatFuzzSearchAll embPatMatchFind left:%B start:%d "
5262 //                    "count:%d\n",
5263 //		    left, start, count);
5264 */
5265 
5266 	    if(left && start)
5267 	    {
5268 		--count;
5269 		continue;
5270 	    }
5271 
5272 	    if(right && start!=ajStrGetLen(text)-(end-start+1))
5273 	    {
5274 		--count;
5275 		continue;
5276 	    }
5277 
5278 	    if(!right || (right && start==ajStrGetLen(text)-
5279 			     (end-start+1)))
5280 	    {
5281 /*
5282 //		ajDebug("embPatFuzzSearchAll type 5 push hit %B..%B %d..%d\n",
5283 //			left, right, start, end);
5284 */
5285                 embPatPushHit(l,name,start,end-start+1,
5286 			      begin,0);
5287 	    }
5288 /*
5289 //          else
5290 //	    {
5291 //		ajDebug("embPatFuzzSearchAll type 5 skip hit %B..%B %d..%d\n",
5292 //			left, right, start, end);
5293 //	    }
5294 */
5295 	}
5296 
5297 	embPatMatchDel(&ppm);
5298 	*hits = count;
5299 	break;
5300 
5301     case 6:
5302 	*hits = embPatTUBSearch(pattern,text,ajStrGetLen(text),skipm,
5303 				m,mismatch,begin,
5304 				l,left,right,name,plen);
5305 	*tidy = (const void *) skipm;
5306 	break;
5307 
5308     case 7:
5309 	*hits = embPatBruteForce(text,pattern,left,right,l,
5310 				 begin,mismatch,name);
5311 	break;
5312 
5313     default:
5314 	ajFatal("Can't handle pattern type %S\n",pattern);
5315 	break;
5316     }
5317 
5318     ajDebug("embPatFuzzSearchAll hits: %d\n", *hits);
5319 
5320     return;
5321 }
5322 
5323 
5324 
5325 
5326 /* @func embPatCompileII ******************************************************
5327 **
5328 ** Compile a pattern classified by embPatGetType
5329 **
5330 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5331 ** @param [r] mismatch [ajuint] number of allowed mismatches
5332 **
5333 ** @return [void]
5334 **
5335 ** @release 4.0.0
5336 ** @@
5337 ******************************************************************************/
embPatCompileII(AjPPatComp thys,ajuint mismatch)5338 void embPatCompileII (AjPPatComp thys, ajuint mismatch)
5339 {
5340     ajuint i = 0;
5341 
5342     thys->plen = ajStrGetLen(thys->pattern);
5343 
5344     switch(thys->type)
5345     {
5346     case 1:
5347 	if (!thys->buf)
5348 	    AJCNEW(thys->buf,AJALPHA);
5349 
5350 	embPatBMHInit(thys->pattern,thys->plen,thys->buf);
5351 	break;
5352     case 2:
5353 	if (!thys->buf)
5354 	    AJCNEW(thys->buf,AJALPHA);
5355 
5356 	embPatBYPInit(thys->pattern,thys->plen,thys->off,thys->buf);
5357 	break;
5358     case 3:
5359 	if (!thys->sotable)
5360 	    AJCNEW(thys->sotable,AJALPHA2);
5361 
5362 	embPatSOInit(thys->pattern,thys->sotable,&thys->solimit);
5363 	thys->m = thys->plen;
5364 	break;
5365     case 4:
5366 	if (!thys->sotable)
5367 	    AJCNEW(thys->sotable,AJALPHA2);
5368 
5369 	embPatBYGCInit(thys->pattern,&thys->m,thys->sotable,&thys->solimit);
5370 	break;
5371     case 5:
5372 	if (!ajStrGetLen(thys->regex))
5373 	    thys->regex = embPatPrositeToRegExp(thys->pattern);
5374 
5375 	break;
5376     case 6:
5377         if(thys->m && mismatch >= thys->m)
5378             ajFatal("embPatCompileII: Mismatches (%d) must be less than the "
5379                     "real pattern length (%d)",mismatch,thys->m);
5380 
5381 	if (!thys->skipm)
5382 	{
5383 	    AJCNEW(thys->skipm,thys->m);
5384 
5385 	    for(i=0;i<thys->m;++i)
5386 		AJCNEW((thys->skipm)[i],AJALPHA);
5387 	}
5388 
5389 	embPatTUBInit(thys->pattern,thys->skipm,thys->m,mismatch,thys->plen);
5390 	break;
5391     case 7:
5392 	break;
5393     default:
5394 	ajFatal("embPatCompileII: Cannot compile pattern");
5395 	break;
5396     }
5397 
5398     return;
5399 }
5400 
5401 
5402 
5403 
5404 /* @func embPatFuzzSearchII ***************************************************
5405 **
5406 ** Fuzzy search after embPatGetType and embPatCompile
5407 **
5408 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5409 ** @param [r] begin [ajuint] Sequence displacement (1=start)
5410 ** @param [r] name [const AjPStr] Name associated with sequence
5411 ** @param [r] text [const AjPStr] Sequence
5412 ** @param [u] l [AjPList] List to push hits onto
5413 ** @param [r] mismatch [ajuint] number of allowed mismatches
5414 ** @param [w] hits [ajuint*] number of hits
5415 ** @param [w] tidy [const void**] data to free
5416 **
5417 ** @return [void]
5418 **
5419 ** @release 4.0.0
5420 ** @@
5421 ******************************************************************************/
embPatFuzzSearchII(AjPPatComp thys,ajuint begin,const AjPStr name,const AjPStr text,AjPList l,ajuint mismatch,ajuint * hits,const void ** tidy)5422 void embPatFuzzSearchII (AjPPatComp thys, ajuint begin, const AjPStr name,
5423 			 const AjPStr text, AjPList l, ajuint mismatch,
5424 			 ajuint *hits, const void** tidy)
5425 {
5426     EmbPPatMatch ppm;
5427     ajuint n;
5428     ajuint i;
5429     ajuint start;
5430     ajuint end;
5431     ajuint count = 0;
5432 
5433     ajDebug("embPatFuzzSearchII '%S' type %d '%s'\n",
5434 	    thys->pattern, thys->type, patTypes[thys->type]);
5435 
5436     switch(thys->type)
5437     {
5438     case 1:
5439 	*hits = embPatBMHSearch(text,thys->pattern,ajStrGetLen(text),
5440 			      ajStrGetLen(thys->pattern),
5441 				thys->buf,0,thys->amino,
5442 			      thys->carboxyl,l,name,begin);
5443 	*tidy = (const void *) thys->buf;
5444 	break;
5445 
5446     case 2:
5447 	for(i=0;i<AJALPHA;++i)
5448 	    thys->buf[i] = thys->plen;
5449 
5450 	for(i=0;i<thys->plen;++i)
5451 	    thys->buf[i] = AJALPHA;
5452 
5453 	*hits=embPatBYPSearch(text,name,begin,ajStrGetLen(text),
5454 			      thys->plen,mismatch,thys->off,thys->buf,l,
5455 			      thys->amino,thys->carboxyl,thys->pattern);
5456 	*tidy = (const void *) thys->buf;
5457 	break;
5458 
5459     case 3:
5460 	*hits = embPatSOSearch(text,name,*ajStrGetPtr(thys->pattern),begin,
5461 			       thys->plen,thys->sotable,thys->solimit,l,
5462 			       thys->amino,thys->carboxyl);
5463 	*tidy = (const void *) thys->sotable;
5464 	break;
5465 
5466     case 4:
5467 	thys->plen  = thys->m;
5468 	*hits = embPatBYGSearch(text,name,begin,
5469 				thys->plen,thys->sotable,thys->solimit,l,
5470 				thys->amino,thys->carboxyl);
5471 	*tidy = (const void *) thys->sotable;
5472 	break;
5473 
5474     case 5:
5475 	ppm = embPatMatchFind(thys->regex, text,
5476 			      thys->amino, thys->carboxyl);
5477 	n   = embPatMatchGetNumber(ppm);
5478 	count = n;
5479 
5480 	for(i=0;i<n;++i)
5481 	{
5482 	    start = embPatMatchGetStart(ppm,i);
5483 	    end   = embPatMatchGetEnd(ppm,i);
5484 
5485 /*
5486 //	    ajDebug("embPatFuzzSearchII embPatMatchFind left:%B start:%d "
5487 //                    "count:%d\n",
5488 //		    thys->amino, start, count);
5489 */
5490 	    if(thys->amino && start)
5491 	    {
5492 		--count;
5493 		continue;
5494 	    }
5495 
5496 	    if(thys->carboxyl && start!=ajStrGetLen(text)-(end-start+1))
5497 	    {
5498 		--count;
5499 		continue;
5500 	    }
5501 
5502 	    if(!thys->carboxyl || (thys->carboxyl && start==ajStrGetLen(text)-
5503 			     (end-start+1)))
5504 	    {
5505 /*
5506 //		ajDebug("embPatFuzzSearchII type 5 push hit %B..%B %d..%d\n",
5507 //			thys->amino, thys->carboxyl, start, end);
5508 */
5509 		embPatPushHit(l,name,start,end-start+1,
5510 			      begin,0);
5511 	    }
5512 /*
5513 //	    else
5514 //          {
5515 //		ajDebug("embPatFuzzSearchII type 5 skip hit %B..%B %d..%d\n",
5516 //			thys->amino, thys->carboxyl, start, end);
5517 //          }
5518 */
5519 	}
5520 
5521 	embPatMatchDel(&ppm);
5522 	*hits = count;
5523 	break;
5524 
5525     case 6:
5526 	*hits = embPatTUBSearch(thys->pattern,text,ajStrGetLen(text),
5527 				thys->skipm,
5528 				thys->m,mismatch,begin,l,
5529 				thys->amino,thys->carboxyl,name,thys->plen);
5530 	*tidy = (const void *) thys->skipm;
5531 	break;
5532 
5533     case 7:
5534 	*hits = embPatBruteForce(text,thys->pattern,thys->amino,thys->carboxyl,
5535 				 l,begin,mismatch,name);
5536 	break;
5537 
5538     default:
5539 	ajFatal("Can't handle pattern type %S\n",thys->pattern);
5540 	break;
5541     }
5542 
5543     return;
5544 }
5545 
5546 
5547 
5548 
5549 /* @func embPatFuzzSearchAllII ************************************************
5550 **
5551 ** Fuzzy search after embPatGetType and embPatCompile
5552 **
5553 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5554 ** @param [r] begin [ajuint] Sequence displacement (1=start)
5555 ** @param [r] name [const AjPStr] Name associated with sequence
5556 ** @param [r] text [const AjPStr] Sequence
5557 ** @param [u] l [AjPList] List to push hits onto
5558 ** @param [r] mismatch [ajuint] number of allowed mismatches
5559 ** @param [w] hits [ajuint*] number of hits
5560 ** @param [w] tidy [const void**] data to free
5561 **
5562 ** @return [void]
5563 **
5564 ** @release 4.0.0
5565 ** @@
5566 ******************************************************************************/
embPatFuzzSearchAllII(AjPPatComp thys,ajuint begin,const AjPStr name,const AjPStr text,AjPList l,ajuint mismatch,ajuint * hits,const void ** tidy)5567 void embPatFuzzSearchAllII (AjPPatComp thys, ajuint begin, const AjPStr name,
5568                             const AjPStr text, AjPList l, ajuint mismatch,
5569                             ajuint *hits, const void** tidy)
5570 {
5571     EmbPPatMatch ppm;
5572     ajuint n;
5573     ajuint i;
5574     ajuint start;
5575     ajuint end;
5576     ajuint count = 0;
5577 
5578     ajDebug("embPatFuzzSearchAllII '%S' type %d '%s'\n",
5579 	    thys->pattern, thys->type, patTypes[thys->type]);
5580 
5581     switch(thys->type)
5582     {
5583     case 1:
5584 	*hits = embPatBMHSearch(text,thys->pattern,ajStrGetLen(text),
5585 			      ajStrGetLen(thys->pattern),
5586 				thys->buf,0,thys->amino,
5587 			      thys->carboxyl,l,name,begin);
5588 	*tidy = (const void *) thys->buf;
5589 	break;
5590 
5591     case 2:
5592 	for(i=0;i<AJALPHA;++i)
5593 	    thys->buf[i] = thys->plen;
5594 
5595 	for(i=0;i<thys->plen;++i)
5596 	    thys->buf[i] = AJALPHA;
5597 
5598 	*hits=embPatBYPSearch(text,name,begin,ajStrGetLen(text),
5599 			      thys->plen,mismatch,thys->off,thys->buf,l,
5600 			      thys->amino,thys->carboxyl,thys->pattern);
5601 	*tidy = (const void *) thys->buf;
5602 	break;
5603 
5604     case 3:
5605 	*hits = embPatSOSearch(text,name,*ajStrGetPtr(thys->pattern),begin,
5606 			       thys->plen,thys->sotable,thys->solimit,l,
5607 			       thys->amino,thys->carboxyl);
5608 	*tidy = (const void *) thys->sotable;
5609 	break;
5610 
5611     case 4:
5612 	thys->plen  = thys->m;
5613 	*hits = embPatBYGSearch(text,name,begin,
5614 				thys->plen,thys->sotable,thys->solimit,l,
5615 				thys->amino,thys->carboxyl);
5616 	*tidy = (const void *) thys->sotable;
5617 	break;
5618 
5619     case 5:
5620 	ppm = embPatMatchFindAll(thys->regex, text,
5621                                  thys->amino, thys->carboxyl);
5622 	n   = embPatMatchGetNumber(ppm);
5623 	count = n;
5624 
5625 	for(i=0;i<n;++i)
5626 	{
5627 	    start = embPatMatchGetStart(ppm,i);
5628 	    end   = embPatMatchGetEnd(ppm,i);
5629 
5630 /*
5631 //	    ajDebug("embPatFuzzSearchAllII embPatMatchFind left:%B start:%d "
5632 //                    "count:%d\n",
5633 //		    thys->amino, start, count);
5634 */
5635 	    if(thys->amino && start)
5636 	    {
5637 		--count;
5638 		continue;
5639 	    }
5640 
5641 	    if(thys->carboxyl && start!=ajStrGetLen(text)-(end-start+1))
5642 	    {
5643 		--count;
5644 		continue;
5645 	    }
5646 
5647 	    if(!thys->carboxyl || (thys->carboxyl && start==ajStrGetLen(text)-
5648 			     (end-start+1)))
5649 	    {
5650 /*
5651 //		ajDebug("embPatFuzzSearchAllII type 5 push hit %B..%B %d..%d\n",
5652 //			thys->amino, thys->carboxyl, start, end);
5653 */
5654                 embPatPushHit(l,name,start,end-start+1,
5655 			      begin,0);
5656 	    }
5657 /*
5658 //	    else
5659 //            {
5660 //		ajDebug("embPatFuzzSearchAllII type 5 skip hit %B..%B %d..%d\n",
5661 //			thys->amino, thys->carboxyl, start, end);
5662 //            }
5663 */
5664 	}
5665 
5666 	embPatMatchDel(&ppm);
5667 	*hits = count;
5668 	break;
5669 
5670     case 6:
5671 	*hits = embPatTUBSearch(thys->pattern,text,ajStrGetLen(text),
5672 				thys->skipm,
5673 				thys->m,mismatch,begin,l,
5674 				thys->amino,thys->carboxyl,name,thys->plen);
5675 	*tidy = (const void *) thys->skipm;
5676 	break;
5677 
5678     case 7:
5679 	*hits = embPatBruteForce(text,thys->pattern,thys->amino,thys->carboxyl,
5680 				 l,begin,mismatch,name);
5681 	break;
5682 
5683     default:
5684 	ajFatal("Can't handle pattern type %S\n",thys->pattern);
5685 	break;
5686     }
5687 
5688     return;
5689 }
5690 
5691 
5692 
5693 
5694 /* @func embPatGetTypeII ******************************************************
5695 **
5696 ** Return the type of a pattern
5697 **
5698 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5699 ** @param [r] pattern [const AjPStr] Original pattern
5700 ** @param [r] mismatch [ajuint] Number of allowed mismatches
5701 ** @param [r] protein [AjBool] True if protein
5702 **
5703 ** @return [ajuint] type of pattern
5704 **
5705 ** @release 4.0.0
5706 ** @@
5707 ******************************************************************************/
5708 
embPatGetTypeII(AjPPatComp thys,const AjPStr pattern,ajuint mismatch,AjBool protein)5709 ajuint embPatGetTypeII (AjPPatComp thys, const AjPStr pattern, ajuint mismatch,
5710 			AjBool protein)
5711 {
5712     AjBool fclass;
5713     AjBool compl;
5714     AjBool dontcare;
5715     AjBool range;
5716     AjBool isany = ajFalse;
5717     ajuint plen;
5718     ajuint type;
5719     const char *p;
5720     char *q;
5721 
5722     ajStrAssignS(&thys->pattern,pattern);
5723 
5724     if(!embPatClassify(pattern,&thys->pattern,&thys->amino,&thys->carboxyl,
5725 		       &fclass,&compl,&dontcare,&range,protein))
5726 	return 0;
5727 
5728     /* Get real pattern length */
5729     p = ajStrGetPtr(thys->pattern);
5730     thys->m = 0;
5731 
5732     while(*p)
5733     {
5734 	if(*p=='{')
5735 	    while(*p!='}')
5736 		++p;
5737 	else if(*p=='[')
5738 	    while(*p!=']')
5739 		++p;
5740 
5741 	++p;
5742 	++thys->m;
5743     }
5744 
5745 
5746     plen = ajStrGetLen(thys->pattern);
5747     type = 0;
5748 
5749     /*
5750     **  Select type of search depending on pattern
5751     */
5752 
5753     if(!range && !dontcare && !fclass && !compl && !mismatch && plen>AJWORD)
5754     {
5755 	/* Boyer Moore Horspool is the choice for long exact patterns */
5756 	type = 1;
5757     }
5758     else if(mismatch && !dontcare && !range && !fclass && !compl &&
5759 	    plen<AJALPHA/2)
5760     {
5761 	/* Baeza-Yates Perleberg for exact patterns plus mismatches */
5762 	type = 2;
5763     }
5764     else if(!range && !dontcare && !fclass && !compl && !mismatch &&
5765 	    plen<=AJWORD)
5766     {
5767 	/* Shift-OR is the choice for small exact patterns */
5768 	type = 3;
5769     }
5770     else if(!range &&
5771 	    (fclass || compl || dontcare) &&
5772 	    !mismatch && thys->m<=AJWORD)
5773     {
5774 	/*
5775 	 *  Baeza-Yates Gonnet for classes and dontcares.
5776 	 *  No mismatches or ranges. Patterns less than (e.g.) 32
5777          */
5778 	type = 4;
5779     }
5780     else if(!mismatch && (range || thys->m>AJWORD))
5781     {
5782         q = ajStrGetuniquePtr(&thys->pattern);
5783         isany = ajFalse;
5784 
5785 	while(*q)
5786         {
5787             if((protein && *q == 'X') || (!protein && *q=='N'))
5788             {
5789                 *q = '?';
5790                 isany = ajTrue;
5791             }
5792 
5793             ++q;
5794         }
5795 
5796 	if(isany)
5797 	    type=7;
5798 	else
5799 	    type = 5;
5800     }
5801     else if(mismatch && !range && (fclass || compl))
5802     {
5803 	/* Try a Tarhio-Ukkonen-Bleasby         */
5804 	type = 6;
5805     }
5806     else if((mismatch && range) || !type)
5807     {
5808 	/*
5809         **  No choice left but to do a Bleasby recursive brute force
5810         */
5811 	type = 7;
5812     }
5813 
5814     ajDebug("embPatTypeII %d '%S'\n", thys->type, thys->pattern);
5815 
5816     if (!ajStrMatchCaseS(pattern, thys->pattern))
5817 	ajDebug("embPatTypeII cleaned to '%S'\n", thys->pattern);
5818 
5819     thys->type=type;
5820 
5821     return type;
5822 }
5823