1 /* @source embpat *************************************************************
2 **
3 ** General routines for pattern matching.
4 **
5 ** @author Copyright (C) Alan Bleasby 1999
6 ** @version $Revision: 1.84 $
7 ** @modified $Date: 2013/07/15 20:53:52 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA 02110-1301, USA.
24 **
25 ******************************************************************************/
26
27
28 #include "ajlib.h"
29
30 #include "embpat.h"
31 #include "embmat.h"
32
33 #include "ajmath.h"
34 #include "ajreg.h"
35 #include "ajlist.h"
36 #include "ajtable.h"
37 #include "ajpat.h"
38 #include "ajseq.h"
39
40 #include <stdlib.h>
41 #include <limits.h>
42
43
44
45
46 /* @datastatic PatPTypes ***************************************************
47 **
48 ** Prosite pattern types
49 **
50 ** @alias PatSTypes
51 ** @alias PatOTypes
52 **
53 ** @attr Name [const char*] Type name
54 ** @attr Desc [const char*] Type description
55 ** @@
56 ******************************************************************************/
57
58 typedef struct PatSTypes
59 {
60 const char *Name;
61 const char *Desc;
62 } PatOTypes;
63
64 #define PatPTypes PatOTypes*
65
66 static PatOTypes patTypes[] =
67 {
68 /* "Name", "Description" */
69 {"unknown", "Unknown/undefined pattern"},
70 {"BMH", "Boyer Moore Horspool pattern"},
71 {"BYP", "Baeza-Yates Perleberg pattern"},
72 {"SO", "Shift-OR pattern"},
73 {"BYGC", "Baeza-Yates Gonnet class pattern"},
74 {"Regex", "Prosite converted to regex"},
75 {"TUB", "Tarhio-Ukkonen-Bleasby"},
76 {"OUB", "Brute force processing"},
77 {NULL, NULL}
78 };
79
80
81
82
83 /* @datastatic MethPData ***************************************************
84 **
85 ** Methylation data
86 **
87 ** @alias MethSData
88 ** @alias MethOData
89 **
90 ** @attr Name [AjPStr] Name of methylation type
91 ** @attr Site [AjPStr] Methylase recognition site
92 ** @attr Replace [AjPStr] Methylase recognition site replacement sequence
93 ** @@
94 ******************************************************************************/
95
96 typedef struct MethSData
97 {
98 AjPStr Name;
99 AjPStr Site;
100 AjPStr Replace;
101 } MethOData;
102
103 #define MethPData MethOData*
104
105
106 static void patRestrictPushHit(const EmbPPatRestrict enz,
107 AjPList l, ajuint pos,
108 ajuint begin, ajuint len,
109 AjBool forward, AjBool plasmid);
110 static void patRestrictMethylMod(AjPStr *str, AjPStr *rstr,
111 AjPList methlist);
112 static AjPList patRestrictReadMethyl(AjPFile methfile);
113
114
115 static void patAminoCarboxyl(const AjPStr s,AjPStr *cs,
116 AjBool *amino, AjBool *carboxyl);
117 static AjBool patParenTest(const char *p, AjBool *repeat, AjBool *range);
118 static AjBool patExpandRepeat(AjPStr *s);
119 static void patIUBTranslate(AjPStr *pat);
120 static void patProteinTranslate(AjPStr *pat);
121 static AjBool patBruteClass(const char *p, char c);
122 static AjBool patBruteCompl(const char *p, char c);
123 static AjBool patBruteIsRange(const char *t, ajuint *x, ajuint *y);
124 static AjBool patBruteCharMatch(const char *t, char c);
125 static ajuint patBruteNextPatChar(const char *t, ajuint ppos);
126 static AjBool patOUBrute(const char *seq, const char *pat, ajuint spos,
127 ajuint ppos, ajuint mm,
128 ajuint omm, ajuint level, AjPList l, AjBool carboxyl,
129 ajuint begin, ajuint *count, const AjPStr name,
130 ajuint st);
131
132
133
134
135 /* @funcstatic patStringFree **************************************************
136 **
137 ** Free a pattern structure, through an ajListMap call
138 **
139 ** @param [d] x [void**] pattern
140 ** @param [r] cl [void*] Optional function - required by ajListMap but unused
141 ** @return [void]
142 **
143 ** @release 2.0.0
144 ** @@
145 ******************************************************************************/
146
patStringFree(void ** x,void * cl)147 static void patStringFree(void **x, void *cl)
148 {
149 ajuint **ptr = (ajuint **)x;
150
151 (void) cl; /* make it used */
152 AJFREE(*ptr);
153
154 return;
155 }
156
157
158
159
160 /* @func embPatSeqCreateRegExp ************************************************
161 **
162 ** Create a regular expression for a string and substitute the chars for
163 ** Nucleotides or proteins as needed.
164 **
165 ** @param [r] thys [const AjPStr] string to create reg expr from.
166 ** @param [r] protein [AjBool] is it a protein.
167 **
168 ** @return [AjPStr] the new regular expression.
169 **
170 ** @release 1.0.0
171 ******************************************************************************/
172
embPatSeqCreateRegExp(const AjPStr thys,AjBool protein)173 AjPStr embPatSeqCreateRegExp(const AjPStr thys, AjBool protein)
174 {
175 return embPatSeqCreateRegExpC(ajStrGetPtr(thys), protein);
176 }
177
178
179
180
181 /* @func embPatSeqCreateRegExpC ***********************************************
182 **
183 ** Create a regular expression for a string and substitute the chars for
184 ** Nucleotides or proteins as needed.
185 **
186 ** @param [r] ptr [const char *] text to create reg expr from.
187 ** @param [r] protein [AjBool] is it a protein.
188 **
189 ** @return [AjPStr] the new regular expression.
190 **
191 ** @release 1.0.0
192 ******************************************************************************/
193
embPatSeqCreateRegExpC(const char * ptr,AjBool protein)194 AjPStr embPatSeqCreateRegExpC(const char *ptr, AjBool protein)
195 {
196
197 /* codes for A-Z including ambiguity codes */
198
199 const char *nucpatternmatch[] =
200 {
201 "[Aa]", "[CcGgTtUu]", "[Cc]", "[AaGgTtUu]",
202 "", "", "[Gg]", "[AaCcTtUu]",
203 "", "", "", "",
204 "[AaCc]", "[A-Za-z]", "", "",
205 "", "[AaGg]", "[GgCc]", "[TtUu]",
206 "[TtUu]","[AaCcGg]", "[AaTtUu]", "[A-Za-z]",
207 "[CTU]", ""
208 };
209
210 const char *protpatternmatch[] =
211 {
212 "[Aa]","[DdNn]","[Cc]","[Dd]",
213 "[Ee]","[Ff]","[Gg]","[AaCcTtUu]",
214 "[Ii]","","[Kk]","[Ll]",
215 "[Mm]","[A-Za-z]","","[Pp]",
216 "[Qq]","[Rr]","[Ss]","[Tt]",
217 "[Uu]","[Vv]","[Ww]","[A-Za-z]",
218 "[Yy]","[EeQq]"
219 };
220
221 AjPStr regexp = 0;
222 ajuint match;
223 char match2[2] = " ";
224 const char* optr = ptr;
225
226 regexp = ajStrNewRes((ajuint)strlen(ptr) * 4); /* just a rough guess */
227
228 while(*ptr != '\0')
229 {
230 /* alphabetic characters converted to character sets */
231 if((*ptr > 64 && *ptr < 91) || (*ptr > 96 && *ptr < 123))
232 {
233 if(*ptr > 91)
234 match = ((ajuint) *ptr) - 97;
235 else
236 match = ((ajuint) *ptr) - 65;
237
238 if(protein)
239 ajStrAppendC(®exp,protpatternmatch[match]);
240 else
241 ajStrAppendC(®exp,nucpatternmatch[match]);
242 }
243 else
244 {
245 match2[0] = *ptr;
246 ajStrAppendC(®exp,match2);
247 }
248 ptr++;
249 }
250
251 ajDebug("embPatSeqCreateRegExpC ptr: '%s' returns regexp: %S'\n",
252 optr, regexp);
253
254 return regexp;
255 }
256
257
258
259
260 /* @func embPatSeqMatchFind ***************************************************
261 **
262 ** Find all the regular expression matches of reg in the string string.
263 **
264 ** @param [r] seq [const AjPSeq] Sequence to be searched.
265 ** @param [r] reg [const AjPStr] regular expression string.
266 **
267 ** @return [EmbPPatMatch] Results of the pattern matching.
268 **
269 **
270 ** @release 1.0.0
271 ******************************************************************************/
272
embPatSeqMatchFind(const AjPSeq seq,const AjPStr reg)273 EmbPPatMatch embPatSeqMatchFind(const AjPSeq seq, const AjPStr reg)
274 {
275 return embPatSeqMatchFindC(seq, ajStrGetPtr(reg));
276 }
277
278
279
280
281 /* @func embPatSeqMatchFindC **************************************************
282 **
283 ** Find all the regular expression matches of reg in the string string.
284 **
285 ** @param [r] seq [const AjPSeq] Sequence to be searched.
286 ** @param [r] reg [const char*] regular expression text.
287 **
288 ** @return [EmbPPatMatch] Results of the pattern matching.
289 **
290 **
291 ** @release 1.0.0
292 ******************************************************************************/
293
embPatSeqMatchFindC(const AjPSeq seq,const char * reg)294 EmbPPatMatch embPatSeqMatchFindC(const AjPSeq seq, const char *reg)
295 {
296 AjPStr regexp = NULL;
297 AjBool protein;
298 EmbPPatMatch results;
299
300 protein = ajSeqIsProt(seq);
301
302 regexp = embPatSeqCreateRegExpC(reg,protein);
303 results = embPatMatchFind(regexp, ajSeqGetSeqS(seq), ajFalse, ajFalse);
304
305 ajStrDel(®exp);
306
307 return results;
308 }
309
310
311
312
313 /* @func embPatSeqMatchFindAll ************************************************
314 **
315 ** Find all the regular expression matches of reg in the string string.
316 **
317 ** @param [r] seq [const AjPSeq] Sequence to be searched.
318 ** @param [r] reg [const AjPStr] regular expression string.
319 **
320 ** @return [EmbPPatMatch] Results of the pattern matching.
321 **
322 **
323 ** @release 1.0.0
324 ******************************************************************************/
325
embPatSeqMatchFindAll(const AjPSeq seq,const AjPStr reg)326 EmbPPatMatch embPatSeqMatchFindAll(const AjPSeq seq, const AjPStr reg)
327 {
328 return embPatSeqMatchFindAllC(seq, ajStrGetPtr(reg));
329 }
330
331
332
333
334 /* @func embPatSeqMatchFindAllC ***********************************************
335 **
336 ** Find all the regular expression matches of reg in the string string.
337 **
338 ** @param [r] seq [const AjPSeq] Sequence to be searched.
339 ** @param [r] reg [const char*] regular expression text.
340 **
341 ** @return [EmbPPatMatch] Results of the pattern matching.
342 **
343 **
344 ** @release 1.0.0
345 ******************************************************************************/
346
embPatSeqMatchFindAllC(const AjPSeq seq,const char * reg)347 EmbPPatMatch embPatSeqMatchFindAllC(const AjPSeq seq, const char *reg)
348 {
349 AjPStr regexp = NULL;
350 AjBool protein;
351 EmbPPatMatch results;
352
353 protein = ajSeqIsProt(seq);
354
355 regexp = embPatSeqCreateRegExpC(reg,protein);
356 results = embPatMatchFindAll(regexp, ajSeqGetSeqS(seq), ajFalse, ajFalse);
357
358 ajStrDel(®exp);
359
360 return results;
361 }
362
363
364
365
366 /* @func embPatMatchFind ******************************************************
367 **
368 ** Find all the regular expression matches of reg in the string string.
369 **
370 ** @param [r] regexp [const AjPStr] Regular expression string.
371 ** @param [r] strng [const AjPStr] String to be searched.
372 ** @param [r] left [AjBool] has to match the start
373 ** @param [r] right [AjBool] has to match the end
374 **
375 ** @return [EmbPPatMatch] Results of the pattern matching.
376 **
377 **
378 ** @release 1.0.0
379 ******************************************************************************/
380
embPatMatchFind(const AjPStr regexp,const AjPStr strng,AjBool left,AjBool right)381 EmbPPatMatch embPatMatchFind(const AjPStr regexp, const AjPStr strng,
382 AjBool left, AjBool right)
383 {
384 return embPatMatchFindC(regexp, ajStrGetPtr(strng), left, right);
385 }
386
387
388
389
390 /* @func embPatMatchFindC *****************************************************
391 **
392 ** Find all the regular expression matches of reg in the string string.
393 **
394 ** @param [r] regexp [const AjPStr] Regular expression string.
395 ** @param [r] sptr [const char *] String to be searched.
396 ** @param [r] left [AjBool] has to match the start
397 ** @param [r] right [AjBool] has to match the end
398 **
399 ** @return [EmbPPatMatch] Results of the pattern matching.
400 **
401 **
402 ** @release 1.0.0
403 ******************************************************************************/
404
embPatMatchFindC(const AjPStr regexp,const char * sptr,AjBool left,AjBool right)405 EmbPPatMatch embPatMatchFindC(const AjPStr regexp, const char *sptr,
406 AjBool left, AjBool right)
407 {
408 AjPRegexp compexp = NULL;
409 EmbPPatMatch results;
410 AjPList poslist = ajListNew();
411 AjPList lenlist = ajListNew();
412 AjIList iter;
413 ajuint *pos;
414 ajuint *len;
415 ajuint posi;
416 ajuint i;
417 const char *ptr;
418 AjBool nterm = ajFalse;
419 /* AjPListNode node;*/
420 AjPStr regstr = NULL;
421
422 if(*regexp->Ptr == '^')
423 nterm = ajTrue;
424
425 regstr = ajStrNewS(regexp);
426
427 if(left)
428 {
429 if(!nterm)
430 ajStrInsertC(®str, 0, "^");
431 nterm = ajTrue;
432 }
433
434 if(right)
435 ajStrAppendC(®str, "$");
436
437 ajDebug("embPatMatchFindC regexp: '%S' regstr: '%S'\n",
438 regexp, regstr);
439
440 ajDebug("embPatMatchFindC sptr '%s'\n",
441 sptr);
442
443 compexp = ajRegComp(regstr);
444
445 ptr = sptr;
446
447 AJNEW(results);
448
449 while(*sptr != '\0' && ajRegExecC(compexp,sptr))
450 {
451 AJNEW(pos);
452 *pos = posi = ajRegOffset(compexp);
453 AJNEW(len);
454 *len = ajRegLenI(compexp,0);
455 *pos += (ajuint) (sptr-ptr);
456 ajListPush(poslist, pos);
457 ajListPush(lenlist, len);
458 sptr += posi+1;
459
460 if(nterm)
461 break;
462 }
463
464 ajRegFree(&compexp);
465 results->number = (ajuint) ajListGetLength(poslist);
466
467 ajDebug("embPatMatchFindC '%S' nterm:%B results: %d\n",
468 regstr, nterm, results->number);
469
470 if(results->number)
471 {
472 AJCNEW(results->start, results->number);
473 AJCNEW(results->len, results->number);
474
475 i = 0;
476 iter = ajListIterNewread(poslist);
477
478 while(!ajListIterDone(iter))
479 {
480 results->start[i] = *(ajuint *) ajListIterGet(iter);
481 i++;
482 }
483
484 ajListIterDel(&iter);
485
486 i = 0;
487 iter = ajListIterNewread(lenlist);
488
489 while(!ajListIterDone(iter))
490 {
491 results->len[i] = *(ajuint *) ajListIterGet(iter);
492 i++;
493 }
494
495 ajListIterDel(&iter);
496
497 ajListMap(poslist, &patStringFree, NULL);
498 ajListMap(lenlist, &patStringFree, NULL);
499 ajListFree(&poslist);
500 ajListFree(&lenlist);
501
502 }
503 else
504 {
505 ajListFree(&poslist);
506 ajListFree(&lenlist);
507 }
508
509 ajStrDel(®str);
510
511 return results;
512 }
513
514
515
516
517 /* @func embPatMatchFindAll ***************************************************
518 **
519 ** Find all the regular expression matches of reg in the string string.
520 **
521 ** @param [r] regexp [const AjPStr] Regular expression string.
522 ** @param [r] strng [const AjPStr] String to be searched.
523 ** @param [r] left [AjBool] has to match the start
524 ** @param [r] right [AjBool] has to match the end
525 **
526 ** @return [EmbPPatMatch] Results of the pattern matching.
527 **
528 **
529 ** @release 1.0.0
530 ******************************************************************************/
531
embPatMatchFindAll(const AjPStr regexp,const AjPStr strng,AjBool left,AjBool right)532 EmbPPatMatch embPatMatchFindAll(const AjPStr regexp, const AjPStr strng,
533 AjBool left, AjBool right)
534 {
535 return embPatMatchFindAllC(regexp, ajStrGetPtr(strng), left, right);
536 }
537
538
539
540
541 /* @func embPatMatchFindAllC **************************************************
542 **
543 ** Find all the regular expression matches of reg in the string string.
544 **
545 ** @param [r] regexp [const AjPStr] Regular expression string.
546 ** @param [r] sptr [const char *] String to be searched.
547 ** @param [r] left [AjBool] has to match the start
548 ** @param [r] right [AjBool] has to match the end
549 **
550 ** @return [EmbPPatMatch] Results of the pattern matching.
551 **
552 **
553 ** @release 1.0.0
554 ******************************************************************************/
555
embPatMatchFindAllC(const AjPStr regexp,const char * sptr,AjBool left,AjBool right)556 EmbPPatMatch embPatMatchFindAllC(const AjPStr regexp, const char *sptr,
557 AjBool left, AjBool right)
558 {
559 AjPRegexp compexp = NULL;
560 EmbPPatMatch results;
561 AjPList poslist = ajListNew();
562 AjPList lenlist = ajListNew();
563 AjIList iter;
564 ajuint *pos;
565 ajuint *len;
566 ajuint posi;
567 ajuint i;
568 ajuint j;
569 ajuint k;
570 const char *ptr;
571 AjBool nterm = ajFalse;
572 AjPStr regstr = NULL;
573 ajuint soff;
574
575 if(*regexp->Ptr == '^')
576 nterm = ajTrue;
577
578 regstr = ajStrNewS(regexp);
579
580 if(left)
581 {
582 if(!nterm)
583 ajStrInsertC(®str, 0, "^");
584 nterm = ajTrue;
585 }
586
587 if(right)
588 ajStrAppendC(®str, "$");
589
590 ajDebug("embPatMatchFindAllC regexp: '%S' regstr: '%S'\n",
591 regexp, regstr);
592
593 ajDebug("embPatMatchFindAllC sptr '%s'\n",
594 sptr);
595
596 compexp = ajRegComp(regstr);
597
598 ptr = sptr;
599
600 AJNEW(results);
601
602 while(*sptr != '\0' && ajRegExecallC(compexp,sptr))
603 {
604 soff = sptr - ptr;
605 j = ajRegGetMatches(compexp);
606 for(k=0; k < j; k++)
607 {
608 AJNEW(pos);
609 *pos = posi = ajRegOffsetI(compexp,k);
610 AJNEW(len);
611 *len = ajRegLenI(compexp,k);
612 *pos += soff;
613 ajListPush(poslist, pos);
614 ajListPush(lenlist, len);
615
616 if(!k)
617 sptr += posi+1;
618
619 if(nterm)
620 break;
621 }
622 }
623
624 ajRegFree(&compexp);
625 results->number = (ajuint) ajListGetLength(poslist);
626
627 ajDebug("embPatMatchFindAllC '%S' nterm:%B results: %d\n",
628 regstr, nterm, results->number);
629
630 if(results->number)
631 {
632 AJCNEW(results->start, results->number);
633 AJCNEW(results->len, results->number);
634
635 i = 0;
636 iter = ajListIterNewread(poslist);
637
638 while(!ajListIterDone(iter))
639 {
640 results->start[i] = *(ajuint *) ajListIterGet(iter);
641 i++;
642 }
643
644 ajListIterDel(&iter);
645
646 i = 0;
647 iter = ajListIterNewread(lenlist);
648
649 while(!ajListIterDone(iter))
650 {
651 results->len[i] = *(ajuint *) ajListIterGet(iter);
652 i++;
653 }
654
655 ajListIterDel(&iter);
656
657 ajListMap(poslist, &patStringFree, NULL);
658 ajListMap(lenlist, &patStringFree, NULL);
659 ajListFree(&poslist);
660 ajListFree(&lenlist);
661
662 }
663 else
664 {
665 ajListFree(&poslist);
666 ajListFree(&lenlist);
667 }
668
669 ajStrDel(®str);
670
671 return results;
672 }
673
674
675
676
677 /* @func embPatMatchGetLen ****************************************************
678 **
679 ** Returns the length from the pattern match structure for index'th item.
680 **
681 ** @param [r] data [const EmbPPatMatch] results of match.
682 ** @param [r] indexnum [ajuint] index to structure.
683 **
684 ** @return [ajuint] returns -1 if not available.
685 **
686 **
687 ** @release 1.0.0
688 ******************************************************************************/
689
embPatMatchGetLen(const EmbPPatMatch data,ajuint indexnum)690 ajuint embPatMatchGetLen(const EmbPPatMatch data, ajuint indexnum)
691 {
692 if(data->number <= indexnum)
693 return -1;
694
695 return data->len[indexnum];
696 }
697
698
699
700
701 /* @func embPatMatchGetEnd ****************************************************
702 **
703 ** Returns the End point for the pattern match structure for index'th item.
704 **
705 ** @param [r] data [const EmbPPatMatch] results of match.
706 ** @param [r] indexnum [ajuint] index to structure.
707 **
708 ** @return [ajuint] returns -1 if not available.
709 **
710 **
711 ** @release 1.0.0
712 ******************************************************************************/
713
embPatMatchGetEnd(const EmbPPatMatch data,ajuint indexnum)714 ajuint embPatMatchGetEnd(const EmbPPatMatch data, ajuint indexnum)
715 {
716 if(data->number <= indexnum)
717 return -1;
718
719 return data->len[indexnum]+data->start[indexnum]-1;
720 }
721
722
723
724
725 /* @func embPatMatchGetNumber *************************************************
726 **
727 ** Returns the number of pattern matches in the structure.
728 **
729 ** @param [r] data [const EmbPPatMatch] results of match.
730 **
731 ** @return [ajuint] returns -1 if not available.
732 **
733 **
734 ** @release 1.0.0
735 ******************************************************************************/
736
embPatMatchGetNumber(const EmbPPatMatch data)737 ajuint embPatMatchGetNumber(const EmbPPatMatch data)
738 {
739 return data->number;
740 }
741
742
743
744
745 /* @func embPatMatchGetStart **************************************************
746 **
747 ** Returns the start position from the pattern match structure for
748 ** index'th item.
749 **
750 ** @param [r] data [const EmbPPatMatch] results of match.
751 ** @param [r] indexnum [ajuint] index to structure.
752 **
753 ** @return [ajuint] returns -1 if not available.
754 **
755 **
756 ** @release 1.0.0
757 ******************************************************************************/
758
embPatMatchGetStart(const EmbPPatMatch data,ajuint indexnum)759 ajuint embPatMatchGetStart(const EmbPPatMatch data, ajuint indexnum)
760 {
761 if(data->number <= indexnum)
762 return -1;
763
764 return data->start[indexnum];
765 }
766
767
768
769
770 /* @func embPatMatchDel *******************************************************
771 **
772 ** Free all the memory from the pattern match search.
773 **
774 ** @param [d] pthis [EmbPPatMatch*] results to be freed.
775 ** @return [void]
776 ** @category delete [EmbPPatMatch] Standard destructor
777 **
778 ** @release 1.0.0
779 ******************************************************************************/
780
embPatMatchDel(EmbPPatMatch * pthis)781 void embPatMatchDel(EmbPPatMatch* pthis)
782 {
783 EmbPPatMatch thys;
784
785 thys = pthis ? *pthis : 0;
786
787 if(!pthis)
788 return;
789
790 if(!*pthis)
791 return;
792
793 if(thys->number)
794 {
795 AJFREE(thys->start);
796 AJFREE(thys->len);
797 }
798
799 AJFREE(*pthis);
800
801 return;
802 }
803
804
805
806
807 /* @func embPatPrositeToRegExp ************************************************
808 **
809 ** Convert a prosite pattern to a regular expression
810 **
811 ** Start and end are indicated by boolean options, as the pattern may have
812 ** been processed for the other pattern matching methods
813 **
814 ** @param [r] s [const AjPStr] prosite pattern
815 ** @return [AjPStr] regular expression
816 **
817 ** @release 1.0.0
818 ******************************************************************************/
819
embPatPrositeToRegExp(const AjPStr s)820 AjPStr embPatPrositeToRegExp(const AjPStr s)
821 {
822 return embPatPrositeToRegExpEnds(s, AJFALSE, AJFALSE);
823 }
824
825
826
827
828 /* @func embPatPrositeToRegExpEnds ********************************************
829 **
830 ** Convert a prosite pattern to a regular expression string.
831 **
832 ** start and end say whether the ends should match in case this is a processed
833 ** pattern, but there is still a check for angle brackets.
834 **
835 ** @param [r] s [const AjPStr] prosite pattern
836 ** @param [r] start [AjBool] must match start
837 ** @param [r] end [AjBool] must match end
838 ** @return [AjPStr] regular expression
839 **
840 ** @release 2.9.0
841 ******************************************************************************/
842
embPatPrositeToRegExpEnds(const AjPStr s,AjBool start,AjBool end)843 AjPStr embPatPrositeToRegExpEnds (const AjPStr s, AjBool start, AjBool end)
844 {
845 AjPStr t;
846 AjPStr c;
847 AjBool isnt = start;
848 AjBool isct = end;
849
850 const char *p;
851 static const char *aa = "ACDEFGHIKLMNPQRSTVWY";
852 static char ch[2];
853 ajuint len;
854 ajuint i;
855
856 t = ajStrNewC("");
857 len = ajStrGetLen(s);
858
859 if(!len)
860 return t;
861
862 c = ajStrNew();
863 ajStrAssignS(&c, s);
864 ajStrFmtUpper(&c);
865 ajStrRemoveWhiteExcess(&c);
866 ch[1]='\0';
867
868 p = ajStrGetPtr(c);
869
870 for(i=0;i<len;++i)
871 {
872 if(p[i]=='>')
873 isct = 1;
874
875 if(p[i]=='<')
876 isnt = 1;
877 }
878
879 if(isnt)
880 ajStrAppendC(&t,"^");
881
882 while(*p)
883 {
884 if(*p=='?')
885 {
886 ajStrAppendC(&t,"[^#]");
887 ++p;
888 continue;
889 }
890
891 if(*p=='X')
892 {
893 ajStrAppendC(&t,"[^BJOUXZ]");
894 ++p;
895 continue;
896 }
897
898 if(*p=='(')
899 {
900 ++p;
901 ajStrAppendC(&t,"{");
902
903 while(*p != ')')
904 {
905 if(!*p)
906 ajFatal("Unmatched '(' in %S\n",s);
907
908 if(*p=='<' || *p=='>')
909 {
910 ++p;
911 continue;
912 }
913
914 *ch= *p;
915 ajStrAppendC(&t,ch);
916 ++p;
917 }
918
919 ajStrAppendC(&t,"}");
920 ++p;
921 continue;
922 }
923
924 if(*p=='[')
925 {
926 while(*p != ']')
927 {
928 if(!*p)
929 ajFatal("Unmatched '[' in %S\n",s);
930
931 if(*p=='<' || *p=='>')
932 {
933 ++p;
934 continue;
935 }
936
937 *ch = *p;
938 ajStrAppendC(&t,ch);
939 ++p;
940 }
941
942 ajStrAppendC(&t,"]");
943 ++p;
944 continue;
945 }
946
947 if(*p=='{')
948 {
949 ++p;
950 ajStrAppendC(&t,"[^");
951
952 while(*p != '}')
953 {
954 if(!*p)
955 ajFatal("Unmatched '{' in %S\n",s);
956
957 if(*p=='<' || *p=='>')
958 {
959 ++p;
960 continue;
961 }
962
963 *ch = *p;
964 ajStrAppendC(&t,ch);
965 ++p;
966 }
967
968 ajStrAppendC(&t,"]");
969 ++p;
970 continue;
971 }
972
973 if(strchr(aa,*p))
974 {
975 *ch = *p;
976 ajStrAppendC(&t,ch);
977 ++p;
978 continue;
979 }
980
981 if(!(*p==' ' || *p=='-' || *p=='>' || *p=='<'))
982 ajFatal("Unrecognised character in %S\n",s);
983 ++p;
984 }
985
986 if(isct)
987 ajStrAppendC(&t,"$");
988
989 ajStrAssignS(&c,t);
990 ajStrDel(&t);
991
992 return c;
993 }
994
995
996
997
998 /* @func embPatRestrictNew ****************************************************
999 **
1000 ** Create a new restriction object
1001 **
1002 ** @return [EmbPPatRestrict] the allocated object
1003 **
1004 ** @release 1.0.0
1005 ******************************************************************************/
1006
embPatRestrictNew(void)1007 EmbPPatRestrict embPatRestrictNew(void)
1008 {
1009 EmbPPatRestrict thys;
1010
1011 AJNEW0(thys);
1012
1013 thys->cod = ajStrNew();
1014 thys->pat = ajStrNew();
1015 thys->bin = ajStrNew();
1016 thys->org = ajStrNew();
1017 thys->iso = ajStrNew();
1018 thys->meth = ajStrNew();
1019 thys->sou = ajStrNew();
1020 thys->sup = ajStrNew();
1021
1022 return thys;
1023 }
1024
1025
1026
1027
1028 /* @func embPatRestrictDel ****************************************************
1029 **
1030 ** Delete a restriction object
1031 **
1032 ** @param [d] thys [EmbPPatRestrict *] restriction object
1033 ** @return [void]
1034 ** @category delete [EmbPPatRestrict] Standard destructor
1035 **
1036 ** @release 1.0.0
1037 ******************************************************************************/
1038
embPatRestrictDel(EmbPPatRestrict * thys)1039 void embPatRestrictDel(EmbPPatRestrict *thys)
1040 {
1041 ajStrDel(&(*thys)->cod);
1042 ajStrDel(&(*thys)->pat);
1043 ajStrDel(&(*thys)->bin);
1044 ajStrDel(&(*thys)->org);
1045 ajStrDel(&(*thys)->iso);
1046 ajStrDel(&(*thys)->meth);
1047 ajStrDel(&(*thys)->sou);
1048 ajStrDel(&(*thys)->sup);
1049 AJFREE(*thys);
1050
1051 return;
1052 }
1053
1054
1055
1056
1057 /* @func embPatRestrictReadEntry **********************************************
1058 **
1059 ** Read next restriction enzyme from re file
1060 **
1061 ** @param [w] re [EmbPPatRestrict] restriction object to fill
1062 ** @param [u] inf [AjPFile] input file pointer
1063 ** @return [AjBool] True if read successful
1064 ** @category input [EmbPPatRestrict] Read next restriction enzyme from file
1065 **
1066 ** @release 1.0.0
1067 ******************************************************************************/
1068
embPatRestrictReadEntry(EmbPPatRestrict re,AjPFile inf)1069 AjBool embPatRestrictReadEntry(EmbPPatRestrict re, AjPFile inf)
1070 {
1071 AjPStr line = NULL;
1072 AjBool ret = AJFALSE;
1073 const char *p = NULL;
1074 char *q = NULL;
1075 ajuint i;
1076
1077 line = ajStrNew();
1078
1079 while((ret=ajReadlineTrim(inf,&line)))
1080 {
1081 p = ajStrGetPtr(line);
1082
1083 if(!(!*p || *p=='#' || *p=='!'))
1084 break;
1085 }
1086
1087 if(!ret)
1088 {
1089 ajStrDel(&line);
1090
1091 return ajFalse;
1092 }
1093
1094
1095 p = ajSysFuncStrtok(p,"\t \n");
1096 if(!p)
1097 return ajFalse;
1098 ajStrAssignC(&re->cod,p);
1099
1100 p = ajSysFuncStrtok(NULL,"\t \n");
1101 if(!p)
1102 return ajFalse;
1103 ajStrAssignC(&re->pat,p);
1104 ajStrAssignC(&re->bin,p);
1105
1106 p = ajSysFuncStrtok(NULL,"\t \n");
1107 if(!p)
1108 return ajFalse;
1109 if(!sscanf(p,"%u",&re->len))
1110 return ajFalse;
1111
1112 p = ajSysFuncStrtok(NULL,"\t \n");
1113 if(!p)
1114 return ajFalse;
1115 if(!sscanf(p,"%u",&re->ncuts))
1116 return ajFalse;
1117
1118 p = ajSysFuncStrtok(NULL,"\t \n");
1119 if(!p)
1120 return ajFalse;
1121 if(!sscanf(p,"%d",&re->blunt))
1122 return ajFalse;
1123
1124 p = ajSysFuncStrtok(NULL,"\t \n");
1125 if(!p)
1126 return ajFalse;
1127 if(!sscanf(p,"%d",&re->cut1))
1128 return ajFalse;
1129
1130 p = ajSysFuncStrtok(NULL,"\t \n");
1131 if(!p)
1132 return ajFalse;
1133 if(!sscanf(p,"%d",&re->cut2))
1134 return ajFalse;
1135
1136 p = ajSysFuncStrtok(NULL,"\t \n");
1137 if(!p)
1138 return ajFalse;
1139 if(!sscanf(p,"%d",&re->cut3))
1140 return ajFalse;
1141
1142 p = ajSysFuncStrtok(NULL,"\t \n");
1143 if(!p)
1144 return ajFalse;
1145 if(!sscanf(p,"%d",&re->cut4))
1146 return ajFalse;
1147
1148
1149 for(i=0,q=ajStrGetuniquePtr(&re->bin);i<re->len;++i)
1150 *(q+i)=(char)ajBaseAlphaToBin((int)*(q+i));
1151
1152 ajStrDel(&line);
1153
1154 return ajTrue;
1155 }
1156
1157
1158
1159
1160 /* @funcstatic patRestrictPushHit *********************************************
1161 **
1162 ** Put a matching restriction enzyme on the heap
1163 ** as an EmbPMatMatch structure
1164 **
1165 ** @param [r] enz [const EmbPPatRestrict] Enyme information
1166 ** @param [u] l [AjPList] List to add to
1167 ** @param [r] pos [ajuint] Sequence match position
1168 ** @param [r] begin [ajuint] Sequence offset
1169 ** @param [r] len [ajuint] Sequence length
1170 ** @param [r] forward [AjBool] True if forward strand
1171 ** @param [r] plasmid [AjBool] Allow circular DNA (currently ignored)
1172 **
1173 ** @return [void]
1174 **
1175 ** @release 1.0.0
1176 ******************************************************************************/
1177
patRestrictPushHit(const EmbPPatRestrict enz,AjPList l,ajuint pos,ajuint begin,ajuint len,AjBool forward,AjBool plasmid)1178 static void patRestrictPushHit(const EmbPPatRestrict enz,
1179 AjPList l, ajuint pos,
1180 ajuint begin, ajuint len,
1181 AjBool forward, AjBool plasmid)
1182 {
1183
1184 EmbPMatMatch hit;
1185 ajuint v;
1186
1187 (void) plasmid; /* make it used */
1188
1189 AJNEW0(hit);
1190
1191 hit->seqname = ajStrNew();
1192 hit->cod = ajStrNewS(enz->cod);
1193 hit->pat = ajStrNewS(enz->pat);
1194 hit->acc = ajStrNew();
1195 hit->tit = ajStrNew();
1196 hit->iso = ajStrNew();
1197 hit->len = enz->len;
1198
1199 if(forward)
1200 {
1201 hit->forward = 1;
1202 hit->start = pos+begin;
1203 hit->cut1 = pos+begin+enz->cut1-1;
1204 hit->cut2 = pos+begin+enz->cut2-1;
1205
1206 if(hit->cut1 > (ajint)(len+begin-1))
1207 hit->cut1-=len;
1208
1209 if(hit->cut2 > (ajint)(len+begin-1))
1210 hit->cut2-=len;
1211
1212 if(enz->cut1<1)
1213 ++hit->cut1;
1214
1215 if(enz->cut2<1)
1216 ++hit->cut2;
1217
1218 if(hit->cut1<1)
1219 {
1220 hit->cut1+=len;
1221 hit->circ12 = ajTrue;
1222 }
1223
1224 if(hit->cut2<1)
1225 {
1226 hit->cut2+=len;
1227 hit->circ12 = ajTrue;
1228 }
1229
1230 if(enz->ncuts == 4)
1231 {
1232 hit->cut3 = pos+begin+enz->cut3-1;
1233 hit->cut4 = pos+begin+enz->cut4-1;
1234
1235 if(hit->cut3>(ajint)(len+begin-1))
1236 {
1237 hit->cut3-=len;
1238 hit->circ34 = ajTrue;
1239 }
1240
1241 if(hit->cut4>(ajint)(len+begin-1))
1242 {
1243 hit->cut4-=len;
1244 hit->circ34 = ajTrue;
1245 }
1246 }
1247 else hit->cut3 = hit->cut4 = 0;
1248 }
1249 else
1250 {
1251 hit->forward = 0;
1252 hit->start = len+begin-pos-1;
1253 hit->cut1 = len+begin-pos-enz->cut1-1;
1254 hit->cut2 = len+begin-pos-enz->cut2-1;
1255
1256 if(enz->cut1<1)
1257 --hit->cut1;
1258
1259 if(enz->cut2<1)
1260 --hit->cut2;
1261
1262 /* cuts beyond start of sequence */
1263
1264 if(hit->cut1<1)
1265 {
1266 hit->cut1+=len;
1267 hit->circ12 = ajTrue;
1268 }
1269
1270 if(hit->cut2<1)
1271 {
1272 hit->cut2+=len;
1273 hit->circ12 = ajTrue;
1274 }
1275
1276 /* cuts beyond end of sequence */
1277
1278 if(hit->cut1>(ajint)(len+begin-1))
1279 {
1280 hit->cut1-=len;
1281 hit->circ12 = ajTrue;
1282 }
1283
1284 if(hit->cut2>(ajint)(len+begin-1))
1285 {
1286 hit->cut2-=len;
1287 hit->circ12 = ajTrue;
1288 }
1289
1290 if(enz->ncuts == 4)
1291 {
1292 /* ajDebug("so far, len:%d pos:%d begin:%d\n",
1293 len, pos, begin); */
1294 /* ajDebug("before, cut3:%d 4:%d circ34:%b\n",
1295 hit->cut3, hit->cut4, hit->circ34); */
1296
1297 hit->cut3 = len+begin-pos-enz->cut3-1;
1298 hit->cut4 = len+begin-pos-enz->cut4-1;
1299
1300 /* ajDebug("middle, cut3:%d 4:%d\n",
1301 hit->cut3, hit->cut4); */
1302
1303 if(hit->cut3<0)
1304 {
1305 hit->cut3+=len;
1306 hit->circ34 = ajTrue;
1307 }
1308
1309 if(hit->cut4<0)
1310 {
1311 hit->cut4+=len;
1312 hit->circ34 = ajTrue;
1313 }
1314
1315 /* ajDebug("after, cut3:%d 4:%d circ34: %b\n",
1316 hit->cut3, hit->cut4, hit->circ34); */
1317 }
1318 else
1319 hit->cut3 = hit->cut4 = 0;
1320
1321 /* Reverse them to refer to forward strand */
1322 v = hit->cut1;
1323 hit->cut1 = hit->cut2;
1324 hit->cut2 = v;
1325 v = hit->cut3;
1326 hit->cut3 = hit->cut4;
1327 hit->cut4 = v;
1328 }
1329
1330 /* ajDebug("embPatRestrictPushHit forward:%b\n", forward); */
1331 /* ajDebug("cut1:%d 2:%d 3:%d 4:%d\n",
1332 hit->cut1, hit->cut2, hit->cut3, hit->cut4); */
1333 ajListPush(l,(void *) hit);
1334
1335 return;
1336 }
1337
1338
1339
1340
1341 /* @funcstatic patRestrictReadMethyl ******************************************
1342 **
1343 ** Read methylation data as a list of MethPdata structures
1344 **
1345 ** @param [u] methfile [AjPFile] Methylation data file
1346 **
1347 ** @return [AjPList] List of MethPData objects
1348 **
1349 ** @release 6.1.0
1350 ******************************************************************************/
1351
patRestrictReadMethyl(AjPFile methfile)1352 static AjPList patRestrictReadMethyl(AjPFile methfile)
1353 {
1354 AjPStr line = NULL;
1355 AjPStr name = NULL;
1356 AjPStr site = NULL;
1357 AjPStr replace = NULL;
1358 AjPStr pattern = NULL;
1359
1360 AjPList ret = NULL;
1361
1362 MethPData m = NULL;
1363
1364
1365 const char *p = NULL;
1366 char c;
1367
1368 replace = ajStrNew();
1369 name = ajStrNew();
1370 site = ajStrNew();
1371 line = ajStrNew();
1372 pattern = ajStrNew();
1373
1374 ret = ajListNew();
1375
1376 while(ajReadlineTrim(methfile,&line))
1377 {
1378 p = ajStrGetPtr(line);
1379
1380 if(!*p || *p=='#' || *p=='!')
1381 continue;
1382
1383 if(ajFmtScanS(line,"%S%S%S",&name,&pattern) != 2)
1384 {
1385 ajWarn("Invalid methylation data line: %S",line);
1386 continue;
1387 }
1388
1389 ajStrFmtUpper(&pattern);
1390 p = ajStrGetPtr(pattern);
1391
1392 AJNEW(m);
1393 m->Replace = ajStrNew();
1394 m->Name = ajStrNewS(name);
1395 m->Site = ajStrNew();
1396
1397
1398 while((c=*p))
1399 {
1400 if(c == 'M')
1401 {
1402 ajStrAppendK(&m->Replace,'N');
1403 ++p;
1404 ajStrAppendK(&m->Site,*p);
1405 }
1406 else
1407 {
1408 ajStrAppendK(&m->Site,c);
1409 ajStrAppendK(&m->Replace,c);
1410 }
1411
1412 ++p;
1413 }
1414
1415 ajStrFmtUpper(&m->Name);
1416 ajListPush(ret, (void *)m);
1417 }
1418
1419
1420 ajStrDel(&line);
1421 ajStrDel(&name);
1422 ajStrDel(&site);
1423 ajStrDel(&replace);
1424 ajStrDel(&pattern);
1425
1426 return ret;
1427 }
1428
1429
1430
1431
1432 /* @funcstatic patRestrictMethylMod *******************************************
1433 **
1434 ** Search using methylation data as a list of MethPdata structures
1435 **
1436 ** Note that this routine does a simple substitution and
1437 ** relies on substitution strings having the same length
1438 ** as the methylase recognition sites. It also assumes that
1439 ** there are no overlap conflicts. If a methylase is found that
1440 ** violates the above then this routine should be revisited.
1441 **
1442 ** @param [w] Pstr [AjPStr*] sequence string
1443 ** @param [w] Prstr [AjPStr*] sequence string
1444 ** @param [u] methlist [AjPList] methlist
1445 **
1446 ** @return [void]
1447 **
1448 ** @release 6.1.0
1449 ******************************************************************************/
1450
patRestrictMethylMod(AjPStr * Pstr,AjPStr * Prstr,AjPList methlist)1451 static void patRestrictMethylMod(AjPStr *Pstr, AjPStr *Prstr, AjPList methlist)
1452 {
1453 ajuint listlen;
1454 MethPData md = NULL;
1455 ajuint i;
1456
1457 EmbOPatBYPNode off[AJALPHA];
1458 ajuint *sotable = NULL;
1459 ajuint solimit;
1460 AjPStr regexp = NULL;
1461 ajuint **skipm = NULL;
1462 ajint *buf = NULL;
1463 const void *tidy = NULL;
1464
1465 ajuint plen = 0;
1466 ajint type = 0;
1467 AjBool amino = ajFalse;
1468 AjBool carboxyl = ajFalse;
1469 ajint mismatch = 0;
1470 ajuint hits = 0;
1471
1472 ajuint m = 0;
1473 ajuint j;
1474
1475 ajint fpos = 0;
1476 ajint slen = 0;
1477
1478 AjPStr origpat = NULL;
1479 AjPStr pattern = NULL;
1480 AjPStr seqname = NULL;
1481 AjPList l = NULL;
1482
1483
1484 EmbPMatMatch match = NULL;
1485
1486 char *p = NULL;
1487 const char *q = NULL;
1488
1489 char *rp = NULL;
1490
1491
1492 origpat = ajStrNew();
1493 pattern = ajStrNew();
1494 seqname = ajStrNewC("");
1495
1496
1497 listlen = (ajuint) ajListGetLength(methlist);
1498
1499
1500 l = ajListNew();
1501
1502 p = ajStrGetuniquePtr(Pstr);
1503
1504 slen = ajStrGetLen(*Pstr);
1505
1506 for(i=0; i < listlen; ++i)
1507 {
1508 ajListPop(methlist, (void **)&md);
1509 ajStrAssignS(&origpat, md->Site);
1510 ajStrAssignS(&pattern, md->Site);
1511
1512 if(!(type=embPatGetType(origpat,&pattern,mismatch,0,&m,&amino,
1513 &carboxyl)))
1514 ajFatal("patRestrictMethylMod: Illegal pattern");
1515
1516 embPatCompile(type,pattern,&plen,&buf,off,&sotable,&solimit,&m,
1517 ®exp,&skipm,mismatch);
1518
1519 embPatFuzzSearch(type,0,pattern,seqname,*Pstr,l,
1520 plen,mismatch,amino,carboxyl,buf,off,sotable,
1521 solimit,regexp,skipm,&hits,m,(const void **)&tidy);
1522
1523
1524 rp = ajStrGetuniquePtr(Prstr);
1525
1526 while(ajListPop(l,(void **)&match))
1527 {
1528
1529 q = ajStrGetPtr(md->Replace);
1530
1531 for(j=0; j < match->len; ++j)
1532 {
1533 fpos = match->start + j;
1534 p[fpos] = q[j];
1535
1536 if(q[j] == 'N') /* Know out base on opposite strand too */
1537 rp[slen - fpos -1] = 'N';
1538 }
1539
1540 embMatMatchDel(&match);
1541 }
1542
1543
1544 if(type==6)
1545 for(j=0;j<m;++j)
1546 AJFREE(skipm[j]);
1547
1548 /*
1549 if(tidy)
1550 {
1551 tydy = (void *)tidy;
1552 AJFREE(tydy);
1553 }
1554 */
1555
1556 ajListPushAppend(methlist, (void *)md);
1557 }
1558
1559
1560 ajStrDel(&origpat);
1561 ajStrDel(&pattern);
1562 ajStrDel(&seqname);
1563
1564 ajListFree(&l);
1565
1566
1567 return;
1568 }
1569
1570
1571
1572
1573 /* @func embPatRestrictScan ***************************************************
1574 **
1575 ** Scan a sequence with a restriction object
1576 **
1577 ** @param [r] enz [const EmbPPatRestrict] Enyme information
1578 ** @param [r] substr [const AjPStr] Sequence as ASCII
1579 ** @param [r] binstr [const AjPStr] Sequence as binary IUB
1580 ** @param [r] revstr [const AjPStr] Sequence as ASCII reversed
1581 ** @param [r] binrev [const AjPStr] Sequence as binary IUB reversed
1582 ** @param [r] len [ajuint] Length of sequence
1583 ** @param [r] ambiguity [AjBool] Allow ambiguity (binary search)
1584 ** @param [r] plasmid [AjBool] Allow circular DNA
1585 ** @param [r] min [ajuint] Minimum # of matches allowed
1586 ** @param [r] max [ajuint] Maximum # of matches
1587 ** @param [r] begin [ajuint] Sequence offset
1588 ** @param [u] l [AjPList] List to push hits to
1589 **
1590 ** @return [ajuint] Number of matches
1591 **
1592 ** @release 1.0.0
1593 ******************************************************************************/
1594
embPatRestrictScan(const EmbPPatRestrict enz,const AjPStr substr,const AjPStr binstr,const AjPStr revstr,const AjPStr binrev,ajuint len,AjBool ambiguity,AjBool plasmid,ajuint min,ajuint max,ajuint begin,AjPList l)1595 ajuint embPatRestrictScan(const EmbPPatRestrict enz,
1596 const AjPStr substr, const AjPStr binstr,
1597 const AjPStr revstr, const AjPStr binrev, ajuint len,
1598 AjBool ambiguity, AjBool plasmid, ajuint min,
1599 ajuint max, ajuint begin, AjPList l)
1600 {
1601 ajuint limit;
1602 ajuint i;
1603 ajuint j;
1604 ajuint hits;
1605 ajuint rhits = 0;
1606 const char *p;
1607 const char *q;
1608 const char *t;
1609 ajint mincut;
1610 ajint maxcut;
1611 AjBool forward;
1612 ajint v;
1613 AjPList tx = NULL;
1614 AjPList ty = NULL;
1615 EmbPMatMatch m = NULL;
1616 EmbPMatMatch z = NULL;
1617
1618 if(len < enz->len)
1619 return 0;
1620
1621 if(plasmid)
1622 limit=len;
1623 else
1624 limit=len-enz->len+1;
1625
1626 mincut=AJMIN(enz->cut1,enz->cut2);
1627
1628 if(enz->ncuts==4)
1629 {
1630 mincut=AJMIN(mincut,enz->cut3);
1631 mincut=AJMIN(mincut,enz->cut4);
1632 }
1633
1634 maxcut=AJMAX(enz->cut1,enz->cut2);
1635
1636 if(enz->ncuts==4)
1637 {
1638 maxcut=AJMAX(maxcut,enz->cut3);
1639 maxcut=AJMAX(maxcut,enz->cut4);
1640 }
1641
1642 ajDebug("embPatRestrictScan '%S' '%S' ncuts:%d blunt:%b\n",
1643 enz->cod, enz->pat, enz->ncuts, enz->blunt);
1644 /* ajDebug("cut1:%d 2:%d 3:%d 4:%d\n",
1645 enz->cut1, enz->cut2, enz->cut3, enz->cut4 ); */
1646
1647 tx = ajListNew();
1648 ty = ajListNew();
1649
1650 if(ambiguity)
1651 {
1652 p = ajStrGetPtr(binstr);
1653 t = ajStrGetPtr(enz->bin);
1654
1655 forward = ajTrue;
1656
1657 for(i=0,hits=0;i<limit;++i)
1658 {
1659 for(j=0,q=t;j<enz->len;++j,++q)
1660 {
1661 v=*(p+i+j);
1662
1663 if(!(*q & v) || v==15)
1664 break;
1665 }
1666
1667 if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1668 i+enz->cut2>=len))
1669 continue;
1670
1671 if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1672 {
1673 ++hits;
1674 patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1675 }
1676
1677 }
1678
1679 forward = ajFalse;
1680 p = ajStrGetPtr(binrev);
1681
1682 for(i=0;i<limit;++i)
1683 {
1684 for(j=0,q=t;j<enz->len;++j,++q)
1685 {
1686 v = *(p+i+j);
1687
1688 if(!(*q & v) || v==15)
1689 break;
1690 }
1691
1692 if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1693 i+enz->cut2>=len))
1694 continue;
1695
1696 if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1697 {
1698 ++hits;
1699 patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1700 }
1701
1702 }
1703
1704 }
1705 else
1706 {
1707 p = ajStrGetPtr(substr);
1708 t = ajStrGetPtr(enz->pat);
1709 forward = ajTrue;
1710
1711 for(i=0,hits=0;i<limit;++i)
1712 {
1713 for(j=0,q=t;j<enz->len;++j,++q)
1714 {
1715 v=*(p+i+j);
1716
1717 if(*q != v || v=='N')
1718 break;
1719 }
1720
1721 if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1722 i+enz->cut2>=len))
1723 continue;
1724
1725 if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1726 {
1727 ++hits;
1728 patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1729 }
1730
1731 }
1732
1733 forward = ajFalse;
1734 p = ajStrGetPtr(revstr);
1735
1736 for(i=0;i<limit;++i)
1737 {
1738 for(j=0,q=t;j<enz->len;++j,++q)
1739 {
1740 v = *(p+i+j);
1741
1742 if(*q != v || v=='N')
1743 break;
1744 }
1745
1746 if(j==enz->len && !plasmid && (i+enz->cut1>=len ||
1747 i+enz->cut2>=len))
1748 continue;
1749
1750 if(j==enz->len && (plasmid || i+mincut+1>0) && i<limit)
1751 {
1752 ++hits;
1753 patRestrictPushHit(enz,tx,i,begin,len,forward, plasmid);
1754 }
1755 }
1756 }
1757
1758
1759 if(hits)
1760 {
1761 ajListSort(tx, &embPatRestrictCutCompare);
1762
1763 for(i=0,rhits=0,v=0;i<hits;++i)
1764 {
1765 ajListPop(tx,(void **)&m);
1766
1767 if(m->cut1 != v)
1768 {
1769 ajListPush(ty,(void *)m);
1770 ++rhits;
1771 v = m->cut1;
1772 }
1773 else
1774 {
1775 if(i)
1776 if(m->forward)
1777 {
1778 ajListPop(ty,(void **)&z);
1779 ajListPush(ty,(void *)m);
1780 m=z;
1781 }
1782
1783 embMatMatchDel(&m);
1784 }
1785 }
1786
1787 if(rhits<min || rhits>max)
1788 {
1789 while(ajListPop(ty,(void **)&m));
1790
1791 ajListFree(&tx);
1792 ajListFree(&ty);
1793
1794 return 0;
1795 }
1796 else
1797 {
1798 while(ajListPop(ty,(void **)&m))
1799 ajListPush(l,(void *)m);
1800 hits = rhits;
1801 }
1802 }
1803
1804 ajListFree(&tx);
1805 ajListFree(&ty);
1806
1807 return hits;
1808 }
1809
1810
1811
1812
1813 /* @func embPatKMPInit ********************************************************
1814 **
1815 ** Initialise a Knuth-Morris-Pratt pattern.
1816 **
1817 ** @param [r] pat [const AjPStr] pattern
1818 ** @param [r] len [ajuint] length of pattern
1819 ** @param [w] next [ajint *] offset table
1820 **
1821 ** @return [void]
1822 **
1823 ** @release 1.0.0
1824 ******************************************************************************/
1825
embPatKMPInit(const AjPStr pat,ajuint len,ajint * next)1826 void embPatKMPInit(const AjPStr pat, ajuint len, ajint *next)
1827 {
1828 ajuint i;
1829 ajint k;
1830 ajuint t;
1831 const char *p;
1832
1833 p = ajStrGetPtr(pat);
1834 t = len-1;
1835
1836 i = 0;
1837 k = -1;
1838 next[0] = -1;
1839
1840 while(i<t)
1841 {
1842 while(k>=0 && p[i]!=p[k])
1843 k = next[k];
1844
1845 ++i;
1846 ++k;
1847
1848 if(p[i]==p[k])
1849 next[i] = next[k];
1850 else
1851 next[i] = k;
1852 }
1853
1854 return;
1855 }
1856
1857
1858
1859
1860 /* @func embPatKMPSearch ******************************************************
1861 **
1862 ** Perform a Knuth-Morris-Pratt search
1863 **
1864 ** @param [r] str [const AjPStr] string to search
1865 ** @param [r] pat [const AjPStr] pattern to use
1866 ** @param [r] slen [ajuint] length of string
1867 ** @param [r] plen [ajuint] length of pattern
1868 ** @param [r] next [const ajint *] array from embPatKMPInit (can be -1)
1869 ** @param [r] start [ajuint] position within str to start search
1870 **
1871 ** @return [ajuint] Index of match in str or -1 if not found
1872 **
1873 ** @release 1.0.0
1874 ******************************************************************************/
1875
embPatKMPSearch(const AjPStr str,const AjPStr pat,ajuint slen,ajuint plen,const ajint * next,ajuint start)1876 ajuint embPatKMPSearch(const AjPStr str, const AjPStr pat,
1877 ajuint slen, ajuint plen, const ajint *next,
1878 ajuint start)
1879 {
1880 ajuint i;
1881 ajint j;
1882 const char *p;
1883 const char *q;
1884
1885 p = ajStrGetPtr(str);
1886 q = ajStrGetPtr(pat);
1887
1888 i = start;
1889 j = 0;
1890
1891 while(i<slen && j<(ajint)plen)
1892 {
1893 while(j>=0 && p[i]!=q[j])
1894 j = next[j];
1895
1896 ++i;
1897 ++j;
1898 }
1899
1900 if(j==(ajint)plen)
1901 return i-plen;
1902
1903 return -1;
1904 }
1905
1906
1907
1908
1909 /* @func embPatBMHInit ********************************************************
1910 **
1911 ** Initialise a Boyer-Moore-Horspool pattern.
1912 **
1913 ** @param [r] pat [const AjPStr] pattern
1914 ** @param [r] len [ajuint] pattern length
1915 ** @param [w] skip [ajint *] offset table (can be -1)
1916 **
1917 ** @return [void]
1918 **
1919 ** @release 1.0.0
1920 ******************************************************************************/
1921
embPatBMHInit(const AjPStr pat,ajuint len,ajint * skip)1922 void embPatBMHInit(const AjPStr pat, ajuint len, ajint *skip)
1923 {
1924 ajuint i;
1925 ajuint t;
1926 const char *p;
1927
1928 p = ajStrGetPtr(pat);
1929
1930 t = len-1;
1931
1932 for(i=0;i<AJALPHA;++i)
1933 skip[i] = t;
1934
1935 for(i=0;i<t;++i)
1936 skip[(ajuint)p[i]] = t-i;
1937
1938 return;
1939 }
1940
1941
1942
1943
1944 /* @func embPatBMHSearch ******************************************************
1945 **
1946 ** Perform a Boyer-Moore-Horspool search
1947 **
1948 ** @param [r] str [const AjPStr] string to search
1949 ** @param [r] pat [const AjPStr] pattern to use
1950 ** @param [r] slen [ajuint] length of string
1951 ** @param [r] plen [ajuint] length of pattern
1952 ** @param [r] skip [const ajint *] array from embPatBMHInit (can be -1)
1953 ** @param [r] start [ajuint] position within str to start search
1954 ** @param [r] left [AjBool] has to match the start
1955 ** @param [r] right [AjBool] has to match the end
1956 ** @param [u] l [AjPList] list to push to
1957 ** @param [r] name [const AjPStr] name of entry
1958 ** @param [r] begin [ajuint] offset in orig sequence
1959 **
1960 ** @return [ajuint] number of hits
1961 **
1962 ** @release 1.0.0
1963 ******************************************************************************/
1964
embPatBMHSearch(const AjPStr str,const AjPStr pat,ajuint slen,ajuint plen,const ajint * skip,ajuint start,AjBool left,AjBool right,AjPList l,const AjPStr name,ajuint begin)1965 ajuint embPatBMHSearch(const AjPStr str, const AjPStr pat,
1966 ajuint slen, ajuint plen, const ajint *skip,
1967 ajuint start, AjBool left, AjBool right, AjPList l,
1968 const AjPStr name, ajuint begin)
1969 {
1970 ajuint i;
1971 ajuint j;
1972 ajuint jj;
1973 ajuint k = 0;
1974 const char *p;
1975 const char *q;
1976 AjBool flag;
1977 ajuint count;
1978
1979 if(left && start)
1980 return 0;
1981
1982 p = ajStrGetPtr(str);
1983 q = ajStrGetPtr(pat);
1984
1985 flag = ajTrue;
1986 count = 0;
1987
1988 i = start+(plen-1);
1989 jj = plen;
1990 j = jj-1;
1991
1992 while(flag)
1993 {
1994 while(jj>0 && i<slen)
1995 {
1996 k = i;
1997
1998 while(jj>0 && p[k]==q[j])
1999 {
2000 --k;
2001 --j;
2002 --jj;
2003 }
2004
2005 if(jj>0)
2006 {
2007 i += skip[(ajuint)p[i]];
2008 jj = plen;
2009 j = jj-1;
2010 }
2011 }
2012
2013 if(jj == 0)
2014 {
2015 if(left && k+1)
2016 return 0;
2017
2018 if(!right || (right && k+1+plen==slen))
2019 {
2020 ++count;
2021 embPatPushHit(l,name,k+1,plen,begin,0);
2022 }
2023
2024 i = start+(plen-1)+k+2;
2025 jj = plen;
2026 j = jj-1;
2027
2028 }
2029 else
2030 flag=ajFalse;
2031 }
2032
2033 return count;
2034 }
2035
2036
2037
2038
2039 /* @func embPatBYPInit ********************************************************
2040 **
2041 ** Initialise a Baeza-Yates,Perleberg pattern.
2042 **
2043 ** @param [r] pat [const AjPStr] pattern
2044 ** @param [r] len [ajuint] pattern length
2045 ** @param [w] offset [EmbPPatBYPNode] character index
2046 ** @param [w] buf [ajint *] mismatch count
2047 **
2048 ** @return [void]
2049 **
2050 ** @release 1.0.0
2051 ******************************************************************************/
2052
embPatBYPInit(const AjPStr pat,ajuint len,EmbPPatBYPNode offset,ajint * buf)2053 void embPatBYPInit(const AjPStr pat, ajuint len, EmbPPatBYPNode offset,
2054 ajint *buf)
2055 {
2056 ajuint i;
2057 ajuint j;
2058
2059 const char *p;
2060 EmbPPatBYPNode op;
2061
2062 p = ajStrGetPtr(pat);
2063
2064 for(i=0;i<AJALPHA;++i)
2065 {
2066 offset[i].offset = -1;
2067 offset[i].next = NULL;
2068 buf[i] = len;
2069 }
2070
2071 for(i=0,j=AJALPHA>>1;i<len;++i,++p)
2072 {
2073 buf[i] = AJALPHA;
2074
2075 if(offset[(ajuint)*p].offset == -1)
2076 offset[(ajuint)*p].offset = len-i-1;
2077 else
2078 {
2079 op=offset[(ajuint)*p].next;
2080 offset[(ajuint)*p].next=&offset[j++];
2081 offset[(ajuint)*p].next->offset = len-i-1;
2082 offset[(ajuint)*p].next->next = op;
2083 }
2084 }
2085
2086 buf[len-1] = len;
2087
2088 return;
2089 }
2090
2091
2092
2093
2094 /* @func embPatPushHit ********************************************************
2095 **
2096 ** Put a matching string search hit on the heap
2097 ** as an EmbPMatMatch structure
2098 **
2099 ** @param [u] l [AjPList] list to push to
2100 ** @param [r] name [const AjPStr] string name
2101 ** @param [r] pos [ajuint] Sequence match position
2102 ** @param [r] plen [ajuint] pattern length
2103 ** @param [r] begin [ajuint] Sequence offset
2104 ** @param [r] mm [ajuint] number of mismatches
2105 **
2106 ** @return [void]
2107 **
2108 ** @release 1.0.0
2109 ******************************************************************************/
2110
embPatPushHit(AjPList l,const AjPStr name,ajuint pos,ajuint plen,ajuint begin,ajuint mm)2111 void embPatPushHit(AjPList l, const AjPStr name, ajuint pos, ajuint plen,
2112 ajuint begin, ajuint mm)
2113 {
2114 EmbPMatMatch hit;
2115
2116 AJNEW0(hit);
2117
2118 hit->seqname = ajStrNewS(name);
2119 hit->len = plen;
2120 hit->cod = ajStrNew();
2121 hit->pat = ajStrNew();
2122 hit->acc = ajStrNew();
2123 hit->tit = ajStrNew();
2124 hit->start = pos+begin;
2125 hit->mm = mm;
2126 hit->end = pos+begin+plen-1;
2127 if(hit->start <= hit->end)
2128 hit->forward = ajTrue;
2129 ajListPush(l,(void *) hit);
2130
2131 return;
2132 }
2133
2134
2135
2136
2137 /* @func embPatBYPSearch ******************************************************
2138 **
2139 ** Perform a Baeza-Yates,Perleberg search.
2140 **
2141 ** @param [r] str [const AjPStr] search string
2142 ** @param [r] name [const AjPStr] search string
2143 ** @param [r] begin [ajuint] sequence offset
2144 ** @param [r] slen [ajuint] string length
2145 ** @param [r] plen [ajuint] pattern length
2146 ** @param [r] mm [ajuint] allowed mismatches (Hamming distance)
2147 ** @param [u] offset [EmbPPatBYPNode] character index
2148 ** @param [u] buf [ajint *] mismatch count array
2149 ** @param [u] l [AjPList] list to push hits to
2150 ** @param [r] amino [AjBool] if true, match at amino terminal end
2151 ** @param [r] carboxyl [AjBool] if true, match at carboxyl terminal end
2152 ** @param [r] pat [const AjPStr] original pattern
2153 **
2154 ** @return [ajuint] number of matches
2155 **
2156 ** @release 1.0.0
2157 ******************************************************************************/
2158
embPatBYPSearch(const AjPStr str,const AjPStr name,ajuint begin,ajuint slen,ajuint plen,ajuint mm,EmbPPatBYPNode offset,ajint * buf,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr pat)2159 ajuint embPatBYPSearch(const AjPStr str, const AjPStr name,
2160 ajuint begin, ajuint slen,
2161 ajuint plen, ajuint mm, EmbPPatBYPNode offset,
2162 ajint *buf,
2163 AjPList l, AjBool amino, AjBool carboxyl,
2164 const AjPStr pat)
2165 {
2166 const char *p;
2167 const char *q;
2168 ajuint i;
2169 ajint t;
2170 EmbPPatBYPNode off;
2171 ajint count;
2172 AjPStr pattern;
2173
2174 p = MAJSTRGETPTR(str);
2175 pattern = ajStrNewS(pat);
2176 ajStrFmtUpper(&pattern);
2177 q = MAJSTRGETPTR(pattern);
2178
2179 count = mm;
2180
2181 for(i=0;i<plen;++i)
2182 if(*q++!=*p++)
2183 if(--count<0)
2184 break;
2185
2186 if(count>=0 && (!carboxyl || (carboxyl && i==slen)))
2187 {
2188 embPatPushHit(l,name,0,plen,begin,mm-count);
2189 count = 1;
2190 }
2191 else
2192 count = 0;
2193
2194 p = MAJSTRGETPTR(str);
2195
2196 for(i=0;i<slen;++i)
2197 {
2198 if((t=(off=&offset[(ajuint)*p++])->offset)>=0)
2199 {
2200 buf[(i+t)&AJMOD256]--;
2201
2202 for(off=off->next;off!=NULL;off=off->next)
2203 buf[(i+off->offset)&AJMOD256]--;
2204 }
2205
2206 if(buf[i&AJMOD256]<=(ajint)mm)
2207 {
2208 if(amino && i-plen+1!=0)
2209 return count;
2210
2211 if(!carboxyl || (carboxyl && i+1==slen))
2212 {
2213 ++count;
2214 embPatPushHit(l,name,i-plen+1,plen,begin,buf[i&AJMOD256]);
2215 }
2216 }
2217
2218 buf[i&AJMOD256] = plen;
2219 }
2220
2221 MAJSTRDEL(&pattern);
2222
2223 return count;
2224 }
2225
2226
2227
2228
2229 /* @funcstatic patAminoCarboxyl ***********************************************
2230 **
2231 ** Checks for start and/or end angle bracket markers
2232 ** Removes them from the string and sets bools accordingly
2233 **
2234 ** @param [r] s [const AjPStr] original pattern
2235 ** @param [u] cs [AjPStr *] modified pattern
2236 ** @param [w] amino [AjBool *] set if start marker (left angle bracket)
2237 ** @param [w] carboxyl [AjBool *] set if end marker (right angle bracket)
2238 **
2239 ** @return [void]
2240 **
2241 ** @release 1.0.0
2242 ******************************************************************************/
2243
patAminoCarboxyl(const AjPStr s,AjPStr * cs,AjBool * amino,AjBool * carboxyl)2244 static void patAminoCarboxyl(const AjPStr s, AjPStr *cs,
2245 AjBool *amino, AjBool *carboxyl)
2246 {
2247 AjPStr t;
2248 const char *p;
2249
2250 t = ajStrNewC("");
2251 p = ajStrGetPtr(s);
2252
2253 while(*p)
2254 {
2255 if(*p==' ' || *p=='-' || *p=='.')
2256 {
2257 ++p;
2258 continue;
2259 }
2260
2261 if(*p=='<')
2262 {
2263 *amino = ajTrue;
2264 ++p;
2265 continue;
2266 }
2267
2268 if(*p=='>')
2269 {
2270 *carboxyl = ajTrue;
2271 ++p;
2272 continue;
2273 }
2274
2275 ajStrAppendK(&t,*p);
2276 ++p;
2277 }
2278
2279 ajStrAssignS(cs,t);
2280 ajStrDel(&t);
2281
2282 return;
2283 }
2284
2285
2286
2287
2288 /* @funcstatic patParenTest ***************************************************
2289 **
2290 ** Checks parenthesis grammar. Sets repeat and range bools
2291 **
2292 ** @param [r] p [const char *] pattern
2293 ** @param [w] repeat [AjBool *] set if any parenthesis e.g. (3)
2294 ** @param [w] range [AjBool *] set if range e.g. (5,8)
2295 **
2296 ** @return [AjBool] True if grammar correct
2297 **
2298 ** @release 1.0.0
2299 ******************************************************************************/
2300
patParenTest(const char * p,AjBool * repeat,AjBool * range)2301 static AjBool patParenTest(const char *p, AjBool *repeat, AjBool *range)
2302 {
2303 ajuint i;
2304
2305 *repeat = ajTrue;
2306 p = p+2;
2307
2308 if(sscanf(p,"%u",&i)!=1)
2309 {
2310 ajWarn("Illegal pattern. Missing repeat number");
2311
2312 return ajFalse;
2313 }
2314
2315 while(*p)
2316 {
2317 if(*p==')')
2318 break;
2319
2320 if(*p=='('||*p=='['||*p=='{'||*p=='}'||*p==']' ||
2321 isalpha((ajuint)*p))
2322 {
2323 ajWarn("Illegal pattern. Nesting not allowed");
2324
2325 return ajFalse;
2326 }
2327
2328 if(*p==',')
2329 {
2330 *range = ajTrue;
2331 ++p;
2332
2333 if(sscanf(p,"%u",&i)!=1)
2334 {
2335 ajWarn("Illegal pattern. Missing range number");
2336
2337 return ajFalse;
2338 }
2339 continue;
2340 }
2341
2342 ++p;
2343 }
2344
2345 if(!*p)
2346 {
2347 ajWarn("Illegal pattern. Missing parenthesis");
2348
2349 return ajFalse;
2350 }
2351
2352 return ajTrue;
2353 }
2354
2355
2356
2357
2358 /* @funcstatic patExpandRepeat ************************************************
2359 **
2360 ** Expand repeats e.g. [ABC](2) to [ABC][ABC]
2361 **
2362 ** @param [u] s [AjPStr *] pattern
2363 **
2364 ** @return [AjBool] ajTrue on success
2365 **
2366 ** @release 1.0.0
2367 ** @@
2368 ******************************************************************************/
2369
patExpandRepeat(AjPStr * s)2370 static AjBool patExpandRepeat(AjPStr *s)
2371 {
2372 AjPStr t;
2373 const char *p;
2374 const char *q;
2375 ajuint count;
2376 ajuint i;
2377
2378 t = ajStrNewC("");
2379 p = ajStrGetPtr(*s);
2380
2381 while(*p)
2382 {
2383 if(*p=='[' || *p=='{')
2384 {
2385 q = p;
2386
2387 while(!(*p==']' || *p=='}'))
2388 ++p;
2389
2390 if(*(p+1)!='(')
2391 count = 1;
2392 else
2393 sscanf(p+2,"%u",&count);
2394
2395 if(count<=0)
2396 {
2397 ajWarn("Illegal pattern. Bad repeat count");
2398
2399 return ajFalse;
2400 }
2401
2402 for(i=0;i<count;++i)
2403 {
2404 p = q;
2405
2406 while(!(*p==']'||*p=='}'))
2407 {
2408 ajStrAppendK(&t,*p);
2409 ++p;
2410 }
2411
2412 ajStrAppendK(&t,*p);
2413 }
2414
2415 if(*(p+1)=='(')
2416 while(*p!=')')
2417 ++p;
2418
2419 ++p;
2420 continue;
2421 }
2422
2423 if(*p=='(')
2424 {
2425 sscanf(p+1,"%u",&count);
2426
2427 if(count<=0)
2428 {
2429 ajWarn("Illegal pattern. Bad range number");
2430
2431 return ajFalse;
2432 }
2433
2434 for(i=1;i<count;++i)
2435 ajStrAppendK(&t,*(p-1));
2436
2437 while(*p!=')')
2438 ++p;
2439
2440 ++p;
2441 continue;
2442 }
2443
2444 ajStrAppendK(&t,*p);
2445 ++p;
2446 }
2447
2448 ajStrAssignC(s,ajStrGetPtr(t));
2449 ajStrDel(&t);
2450
2451 return ajTrue;
2452 }
2453
2454
2455
2456
2457 /* @funcstatic patIUBTranslate ************************************************
2458 **
2459 ** Convert IUB nucleotide symbols to classes e.g. S to [GC]
2460 **
2461 ** @param [u] pat [AjPStr *] pattern
2462 **
2463 ** @return [void]
2464 **
2465 ** @release 1.0.0
2466 ******************************************************************************/
2467
patIUBTranslate(AjPStr * pat)2468 static void patIUBTranslate(AjPStr *pat)
2469 {
2470 AjPStr t;
2471 const char *p;
2472 AjBool escape = ajFalse;
2473
2474 t = ajStrNewC(ajStrGetPtr(*pat));
2475 p = ajStrGetPtr(t);
2476 ajStrSetClear(pat);
2477
2478 while(*p)
2479 {
2480 if(escape)
2481 {
2482 ajStrAppendK(pat,*p);
2483 ++p;
2484 escape=ajFalse;
2485 continue;
2486 }
2487
2488 if(*p=='\\')
2489 {
2490 escape = ajTrue;
2491 ++p;
2492 continue;
2493 }
2494
2495 if(*p=='N')
2496 {
2497 ajStrAppendC(pat,"?");
2498 ++p;
2499 continue;
2500 }
2501
2502 if(*p=='B')
2503 {
2504 ajStrAppendC(pat,"[TGCBSYK]");
2505 ++p;
2506 continue;
2507 }
2508
2509 if(*p=='D')
2510 {
2511 ajStrAppendC(pat,"[TGADWRK]");
2512 ++p;
2513 continue;
2514 }
2515
2516 if(*p=='H')
2517 {
2518 ajStrAppendC(pat,"[TCAHWYM]");
2519 ++p;
2520 continue;
2521 }
2522
2523 if(*p=='K')
2524 {
2525 ajStrAppendC(pat,"[TGK]");
2526 ++p;
2527 continue;
2528 }
2529
2530 if(*p=='M')
2531 {
2532 ajStrAppendC(pat,"[CAM]");
2533 ++p;
2534 continue;
2535 }
2536
2537 if(*p=='R')
2538 {
2539 ajStrAppendC(pat,"[GAR]");
2540 ++p;
2541 continue;
2542 }
2543
2544 if(*p=='S')
2545 {
2546 ajStrAppendC(pat,"[GCS]");
2547 ++p;
2548 continue;
2549 }
2550
2551 if(*p=='V')
2552 {
2553 ajStrAppendC(pat,"[GCAVSRM]");
2554 ++p;
2555 continue;
2556 }
2557
2558 if(*p=='W')
2559 {
2560 ajStrAppendC(pat,"[TAW]");
2561 ++p;
2562 continue;
2563 }
2564
2565 if(*p=='Y')
2566 {
2567 ajStrAppendC(pat,"[TCY]");
2568 ++p;
2569 continue;
2570 }
2571
2572 ajStrAppendK(pat,*p);
2573 ++p;
2574 }
2575
2576 ajStrDel(&t);
2577
2578 return;
2579 }
2580
2581
2582
2583
2584 /* @funcstatic patProteinTranslate ********************************************
2585 **
2586 ** Convert protein symbols to classes e.g. B to [DN]
2587 **
2588 ** @param [u] pat [AjPStr *] pattern
2589 **
2590 ** @return [void]
2591 **
2592 ** @release 4.0.0
2593 ******************************************************************************/
2594
patProteinTranslate(AjPStr * pat)2595 static void patProteinTranslate(AjPStr *pat)
2596 {
2597 AjPStr t;
2598 const char *p;
2599 AjBool escape = ajFalse;
2600
2601 t = ajStrNewC(ajStrGetPtr(*pat));
2602 p = ajStrGetPtr(t);
2603 ajStrSetClear(pat);
2604
2605 while(*p)
2606 {
2607 if(escape)
2608 {
2609 ajStrAppendK(pat,*p);
2610 ++p;
2611 escape=ajFalse;
2612 continue;
2613 }
2614
2615 if(*p=='\\')
2616 {
2617 escape = ajTrue;
2618 ++p;
2619 continue;
2620 }
2621
2622 if(*p=='X')
2623 {
2624 ajStrAppendC(pat,"?");
2625 ++p;
2626 continue;
2627 }
2628
2629 if(*p=='B')
2630 {
2631 ajStrAppendC(pat,"[DN]");
2632 ++p;
2633 continue;
2634 }
2635
2636 if(*p=='Z')
2637 {
2638 ajStrAppendC(pat,"[EQ]");
2639 ++p;
2640 continue;
2641 }
2642
2643 if(*p=='J')
2644 {
2645 ajStrAppendC(pat,"[IL]");
2646 ++p;
2647 continue;
2648 }
2649
2650
2651 ajStrAppendK(pat,*p);
2652 ++p;
2653 }
2654
2655 ajStrDel(&t);
2656
2657 return;
2658 }
2659
2660
2661
2662
2663 /* @func embPatClassify *******************************************************
2664 **
2665 ** Classify patterns according to type. The pattern is set up upper case,
2666 ** has start and end processing turned into boolean flags. Sets other boolean
2667 ** flags for properties of the pattern so that a suitable processing
2668 ** method can be selected.
2669 **
2670 ** @param [r] pat [const AjPStr] original pattern
2671 ** @param [w] cleanpat [AjPStr *] cleaned pattern
2672 ** @param [w] amino [AjBool*] set if must match start of sequence
2673 ** @param [w] carboxyl [AjBool*] set if must match end of sequence
2674 ** @param [w] fclass [AjBool*] set if class e.g. [ABC]
2675 ** @param [w] ajcompl [AjBool*] set if complement e.g. {ABC}
2676 ** @param [w] dontcare [AjBool*] set if X (protein) or N (DNA)
2677 ** @param [w] range [AjBool*] set if range specified e.g. (3,10)
2678 ** @param [r] protein [AjBool] true if protein false if DNA
2679 **
2680 ** @return [AjBool] ajTrue on success
2681 **
2682 ** @release 1.0.0
2683 ** @@
2684 ******************************************************************************/
2685
embPatClassify(const AjPStr pat,AjPStr * cleanpat,AjBool * amino,AjBool * carboxyl,AjBool * fclass,AjBool * ajcompl,AjBool * dontcare,AjBool * range,AjBool protein)2686 AjBool embPatClassify(const AjPStr pat, AjPStr *cleanpat,
2687 AjBool *amino, AjBool *carboxyl,
2688 AjBool *fclass, AjBool *ajcompl, AjBool *dontcare,
2689 AjBool *range, AjBool protein)
2690 {
2691 char *p;
2692 AjBool repeat;
2693 AjPStr tmppat = NULL;
2694
2695 ajDebug("embPatClassify pat '%S' protein %b\n", pat, protein);
2696
2697 repeat=*amino=*carboxyl=*fclass=*ajcompl=*dontcare=*range=ajFalse;
2698
2699 ajStrAssignS(&tmppat, pat);
2700 ajStrRemoveWhiteExcess(&tmppat);
2701 patAminoCarboxyl(tmppat,cleanpat, amino,carboxyl);
2702 ajStrDel(&tmppat);
2703
2704 ajDebug("cleaned pat '%S' amino %b carboxyl %b\n",
2705 *cleanpat, *amino, *carboxyl);
2706
2707 ajStrFmtUpper(cleanpat);
2708
2709 if(!protein)
2710 {
2711 patIUBTranslate(cleanpat);
2712 ajDebug("IUB translated pat '%S'\n", *cleanpat);
2713 }
2714 else
2715 {
2716 patProteinTranslate(cleanpat);
2717 ajDebug("Protein codes translated pat '%S'\n", *cleanpat);
2718 }
2719
2720 p = ajStrGetuniquePtr(cleanpat);
2721
2722 while(*p)
2723 {
2724 if(isalpha((ajuint)*p) || (*p=='?'))
2725 {
2726 if(*p=='?')
2727 {
2728 *dontcare = ajTrue;
2729 }
2730
2731 if(*(p+1)=='(')
2732 {
2733 if(!patParenTest(p,&repeat,range))
2734 return ajFalse;
2735
2736 while(*p!=')')
2737 ++p;
2738 }
2739
2740 ++p;
2741 continue;
2742 }
2743
2744
2745
2746 if(*p=='[')
2747 {
2748 *fclass = ajTrue;
2749 ++p;
2750
2751 while(*p)
2752 {
2753 if(*p==']')
2754 break;
2755
2756 if(*p=='('||*p=='['||*p=='{'||*p=='}'||*p==')')
2757 {
2758 ajWarn("Illegal pattern. Nesting '%c' in [] not allowed",
2759 *p);
2760
2761 return ajFalse;
2762 }
2763
2764 if(!isalpha((ajuint)*p))
2765 {
2766 ajWarn("Illegal pattern. Non alpha character '%c'",
2767 *p);
2768
2769 return ajFalse;
2770 }
2771
2772 if((protein&&*p=='X')||(!protein&&*p=='N'))
2773 {
2774 ajWarn("Illegal pattern. Dontcare character '%c' in []",
2775 *p);
2776
2777 return ajFalse;
2778 }
2779
2780 ++p;
2781 }
2782
2783 if(!*p)
2784 {
2785 ajWarn("Illegal pattern. Missing ']'");
2786
2787 return ajFalse;
2788 }
2789
2790 if(*(p+1)=='(')
2791 {
2792 if(!patParenTest(p,&repeat,range))
2793 return ajFalse;
2794
2795 while(*p!=')')
2796 ++p;
2797 }
2798
2799 ++p;
2800 continue;
2801 }
2802
2803 if(*p=='{')
2804 {
2805 *ajcompl = ajTrue;
2806 ++p;
2807 while(*p)
2808 {
2809 if(*p=='}')
2810 break;
2811
2812 if(*p=='('||*p=='['||*p=='{'||*p==']'||*p==')')
2813 {
2814 ajWarn("Illegal pattern. Nesting '%c' in {} not allowed.",
2815 *p);
2816
2817 return ajFalse;
2818 }
2819
2820 if(!isalpha((ajuint)*p))
2821 {
2822 ajWarn("Illegal pattern. Non alpha character '%c'", *p);
2823
2824 return ajFalse;
2825 }
2826
2827 if((protein&&*p=='X')||(!protein&&*p=='N'))
2828 {
2829 ajWarn("Illegal pattern. Ambiguous character '%c' in {}",
2830 *p);
2831
2832 return ajFalse;
2833 }
2834
2835 ++p;
2836 }
2837
2838 if(!*p)
2839 {
2840 ajWarn("Illegal pattern. Missing '}'");
2841
2842 return ajFalse;
2843 }
2844
2845 if(*(p+1)=='(')
2846 {
2847 if(!patParenTest(p,&repeat,range))
2848 return ajFalse;
2849
2850 while(*p!=')')
2851 ++p;
2852 }
2853
2854 ++p;
2855 continue;
2856 }
2857
2858 ajWarn("Illegal character '%c'",*p);
2859
2860 return ajFalse;
2861 }
2862
2863 ajDebug("patterns expanded pat '%S'\n", *cleanpat);
2864
2865 if(repeat && !*range)
2866 {
2867 ajDebug("testing repeat expansion\n");
2868
2869 if(!patExpandRepeat(cleanpat))
2870 return ajFalse;
2871
2872 ajDebug("repeats expanded pat '%S'\n", *cleanpat);
2873 }
2874
2875 return ajTrue;
2876 }
2877
2878
2879
2880
2881 /* @func embPatSOInit *********************************************************
2882 **
2883 ** Initialise a Shift-Or pattern.
2884 **
2885 ** @param [r] pat [const AjPStr] pattern
2886 ** @param [w] table [ajuint *] SO table
2887 ** @param [w] limit [ajuint *] match limit
2888 **
2889 ** @return [void]
2890 **
2891 ** @release 1.0.0
2892 ******************************************************************************/
2893
embPatSOInit(const AjPStr pat,ajuint * table,ajuint * limit)2894 void embPatSOInit(const AjPStr pat, ajuint *table, ajuint *limit)
2895 {
2896 ajuint i;
2897 const char *p;
2898
2899 if(ajStrGetLen(pat)>AJWORD)
2900 ajFatal("Pattern too long for Shift-OR search");
2901
2902
2903 for(i=0;i<AJALPHA2;++i)
2904 table[i] = ~0;
2905
2906 *limit = 0;
2907
2908 for(i=1,p=ajStrGetPtr(pat);*p;i<<=AJBPS,++p)
2909 {
2910 table[(ajuint)*p] &= ~i;
2911 *limit |= i;
2912 }
2913
2914 *limit = ~(*limit>>AJBPS);
2915
2916 return;
2917 }
2918
2919
2920
2921
2922 /* @func embPatSOSearch *******************************************************
2923 **
2924 ** Perform a Shift-OR search.
2925 **
2926 ** @param [r] str [const AjPStr] search string
2927 ** @param [r] name [const AjPStr] search string
2928 ** @param [r] first [ajint] first character of pattern (as an integer)
2929 ** @param [r] begin [ajuint] sequence offset
2930 ** @param [r] plen [ajuint] pattern length
2931 ** @param [r] table [const ajuint *] SO table
2932 ** @param [r] limit [ajuint] SO limit
2933 ** @param [u] l [AjPList] list to push hits to
2934 ** @param [r] amino [AjBool] must match start
2935 ** @param [r] carboxyl [AjBool] must match end
2936 **
2937 ** @return [ajuint] number of matches
2938 **
2939 ** @release 1.0.0
2940 ******************************************************************************/
2941
embPatSOSearch(const AjPStr str,const AjPStr name,ajint first,ajuint begin,ajuint plen,const ajuint * table,ajuint limit,AjPList l,AjBool amino,AjBool carboxyl)2942 ajuint embPatSOSearch(const AjPStr str, const AjPStr name,
2943 ajint first, ajuint begin,
2944 ajuint plen, const ajuint *table, ajuint limit, AjPList l,
2945 AjBool amino, AjBool carboxyl)
2946 {
2947 register ajuint state;
2948 register ajuint initial;
2949 const char *p;
2950 const char *q;
2951 ajuint pos;
2952
2953 ajuint matches;
2954 ajuint slen;
2955
2956 p = q = ajStrGetPtr(str);
2957 slen = ajStrGetLen(str);
2958 matches = 0;
2959 initial = ~0;
2960
2961 do
2962 {
2963 while(*p && *p != first)
2964 ++p;
2965
2966 state = initial;
2967
2968 do
2969 {
2970 state = (state<<AJBPS) | table[(ajuint)*p];
2971
2972 if(state < limit)
2973 {
2974 pos = (ajuint) ((p-q)-plen+1);
2975
2976 if(amino && pos)
2977 return matches;
2978
2979 if(!carboxyl || (carboxyl && pos==slen-plen))
2980 {
2981 ++matches;
2982 embPatPushHit(l,name,pos,plen,begin,0);
2983 }
2984 }
2985 ++p;
2986 }
2987 while(state!=initial);
2988
2989 }
2990 while(*(p-1));
2991
2992 return matches;
2993 }
2994
2995
2996
2997
2998 /* @func embPatBYGCInit *******************************************************
2999 **
3000 ** Initialise a Baeza-Yates Gonnet class pattern.
3001 **
3002 ** @param [r] pat [const AjPStr] pattern
3003 ** @param [w] m [ajuint *] real pattern length
3004 ** @param [w] table [ajuint *] SO table
3005 ** @param [w] limit [ajuint *] match limit
3006 **
3007 ** @return [void]
3008 **
3009 ** @release 1.0.0
3010 ******************************************************************************/
3011
embPatBYGCInit(const AjPStr pat,ajuint * m,ajuint * table,ajuint * limit)3012 void embPatBYGCInit(const AjPStr pat, ajuint *m, ajuint *table,
3013 ajuint *limit)
3014 {
3015 const char *p;
3016 const char *q;
3017 ajuint initval;
3018 ajuint shift;
3019 ajuint i;
3020
3021 p = q = ajStrGetPtr(pat);
3022 initval = ~0;
3023 shift = 1;
3024 *m = 0;
3025 *limit = 0;
3026
3027 while(*p)
3028 {
3029 if(*p=='?')
3030 initval &= ~shift;
3031 else if(*p=='{')
3032 {
3033 initval &= ~shift;
3034
3035 while(*p!='}')
3036 ++p;
3037 }
3038 else if(*p=='[')
3039 while(*p!=']')
3040 ++p;
3041 ++p;
3042 ++*m;
3043 *limit |= shift;
3044 shift<<=AJBPS;
3045 }
3046
3047 for(i=0;i<AJALPHA2;++i)
3048 table[i] = initval;
3049
3050 p = q;
3051
3052 shift = 1;
3053
3054 while(*p)
3055 {
3056 if(*p=='{')
3057 {
3058 ++p;
3059
3060 while(*p!='}')
3061 {
3062 table[(ajuint)*p] |= shift;
3063 ++p;
3064 }
3065 }
3066 else if(*p=='[')
3067 {
3068 ++p;
3069
3070 while(*p!=']')
3071 {
3072 table[(ajuint)*p] &= ~shift;
3073 ++p;
3074 }
3075 }
3076 else if(*p!='?')
3077 table[(ajuint)*p] &= ~shift;
3078
3079 shift <<= AJBPS;
3080 ++p;
3081 }
3082
3083 *limit = ~(*limit>>AJBPS);
3084
3085 return;
3086 }
3087
3088
3089
3090
3091 /* @func embPatBYGSearch ******************************************************
3092 **
3093 ** Perform a Baeza-Yates Gonnet search.
3094 **
3095 ** @param [r] str [const AjPStr] search string
3096 ** @param [r] name [const AjPStr] search string
3097 ** @param [r] begin [ajuint] sequence offset
3098 ** @param [r] plen [ajuint] pattern length
3099 ** @param [r] table [const ajuint *] SO table
3100 ** @param [r] limit [ajuint] SO limit
3101 ** @param [u] l [AjPList] list to push hits to
3102 ** @param [r] amino [AjBool] must match start
3103 ** @param [r] carboxyl [AjBool] must match end
3104 **
3105 ** @return [ajuint] number of matches
3106 **
3107 ** @release 1.0.0
3108 ******************************************************************************/
3109
embPatBYGSearch(const AjPStr str,const AjPStr name,ajuint begin,ajuint plen,const ajuint * table,ajuint limit,AjPList l,AjBool amino,AjBool carboxyl)3110 ajuint embPatBYGSearch(const AjPStr str, const AjPStr name,
3111 ajuint begin, ajuint plen,
3112 const ajuint *table, ajuint limit, AjPList l,
3113 AjBool amino, AjBool carboxyl)
3114 {
3115 register ajuint state;
3116 register ajuint initial;
3117 const char *p;
3118 const char *q;
3119 ajuint pos;
3120 ajuint matches;
3121 ajuint slen;
3122
3123 p = q = MAJSTRGETPTR(str);
3124 slen = MAJSTRGETLEN(str);
3125 matches = 0;
3126 initial = ~0;
3127
3128 /*
3129 // ajDebug("..pat initial %lx\n", initial);
3130 // ajDebug("..pat strlen:%d str:'%S'\n", slen, str);
3131 */
3132 do
3133 {
3134 state = initial;
3135
3136 do
3137 {
3138 state = (state<<AJBPS) | table[(ajuint)*p];
3139 /*
3140 // ajDebug("..pat table: %lx state %lx p:%c\n",
3141 // table[(ajuint)*p], state, *p);
3142 */
3143 if(state < limit)
3144 {
3145 pos = (ajuint) ((p-q)-plen+1);
3146
3147 if(amino && pos) /* must match start but already beyond */
3148 return matches;
3149
3150 if(!carboxyl || (carboxyl && pos==slen-plen))
3151 {
3152 ++matches;
3153 embPatPushHit(l,name,pos,plen,begin,0);
3154 /*
3155 // ajDebug("..pat hit matches:%d list:%Lu name:'%S' pos:%u "
3156 // "carboxyl:%B (slen-plen) %u\n",
3157 // matches, ajListGetLength(l), name, pos,
3158 // carboxyl, (slen-plen));
3159 */
3160 }
3161 }
3162 ++p;
3163 }
3164 while(state!=initial && *p);
3165
3166 }
3167 while((ajuint) (p-q)<slen);
3168
3169 return matches;
3170 }
3171
3172
3173
3174
3175 /* @func embPatTUInit *********************************************************
3176 **
3177 ** Initialise a Tarhio-Ukkonen search
3178 **
3179 ** @param [r] pat [const AjPStr] pattern
3180 ** @param [w] skipm [ajuint **] mismatch skip array
3181 ** @param [r] m [ajuint] real pattern length
3182 ** @param [r] k [ajuint] allowed mismatches
3183 **
3184 ** @return [void]
3185 **
3186 ** @release 1.0.0
3187 ******************************************************************************/
3188
embPatTUInit(const AjPStr pat,ajuint ** skipm,ajuint m,ajuint k)3189 void embPatTUInit(const AjPStr pat, ajuint **skipm, ajuint m, ajuint k)
3190 {
3191 const char *p;
3192 ajuint i;
3193 ajint j;
3194 ajint jj;
3195 ajuint x;
3196
3197 ajuint ready[AJALPHA];
3198
3199 p = ajStrGetPtr(pat);
3200
3201 for(i=0;i<AJALPHA;++i)
3202 {
3203 ready[i] = m;
3204
3205 for(j=m-k-1;j<(ajint)m;++j)
3206 skipm[j][i] = m-k-1;
3207 }
3208
3209 for(j=m-2;j>-1;--j)
3210 {
3211 jj = m-k-1;
3212 x = AJMAX(j+1,jj);
3213
3214 for(i=ready[(ajuint)p[j]]-1;i>=x;--i)
3215 skipm[i][(ajuint)p[j]] = i-j;
3216
3217 ready[(ajuint)p[j]] = x;
3218 }
3219
3220 return;
3221 }
3222
3223
3224
3225
3226 /* @func embPatTUSearch *******************************************************
3227 **
3228 ** Perform a Tarhio-Ukkonen search
3229 **
3230 ** @param [r] pat [const AjPStr] pattern
3231 ** @param [r] text [const AjPStr] text to search (incl ajcompl/class)
3232 ** @param [r] slen [ajuint] length of text
3233 ** @param [r] skipm [ajuint * const *] mismatch skip array
3234 ** @param [r] m [ajuint] real pattern length
3235 ** @param [r] k [ajuint] allowed mismatches
3236 ** @param [r] begin [ajuint] text offset
3237 ** @param [u] l [AjPList] list to push to
3238 ** @param [r] amino [AjBool] true if text start
3239 ** @param [r] carboxyl [AjBool] true if text end
3240 ** @param [r] name [const AjPStr] name of text
3241 **
3242 ** @return [ajuint] number of hits
3243 **
3244 ** @release 1.0.0
3245 ******************************************************************************/
3246
embPatTUSearch(const AjPStr pat,const AjPStr text,ajuint slen,ajuint * const * skipm,ajuint m,ajuint k,ajuint begin,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr name)3247 ajuint embPatTUSearch(const AjPStr pat, const AjPStr text, ajuint slen,
3248 ajuint * const *skipm, ajuint m,
3249 ajuint k, ajuint begin, AjPList l, AjBool amino,
3250 AjBool carboxyl, const AjPStr name)
3251 {
3252 ajuint i;
3253 ajint j;
3254 ajint jj;
3255 ajuint h;
3256 ajuint mm;
3257 ajuint skip;
3258 const char *p;
3259 const char *q;
3260 ajuint matches;
3261
3262 p = ajStrGetPtr(pat);
3263 q = ajStrGetPtr(text);
3264
3265 matches = 0;
3266
3267 i = m-1;
3268
3269 while(i<slen)
3270 {
3271 h = i;
3272 j = m-1;
3273 skip = m-k;
3274 mm = 0;
3275
3276 while(j>-1 && mm<=k)
3277 {
3278 jj = m-k-1;
3279
3280 if(j>=jj)
3281 skip = AJMIN(skip,(ajuint)skipm[j][(ajuint)q[h]]);
3282
3283 if(q[h]!=p[j])
3284 ++mm;
3285 --h;
3286 --j;
3287 }
3288
3289 if(mm<=k)
3290 {
3291 if(amino && h+1)
3292 return matches;
3293
3294 if(!carboxyl || (carboxyl && h+1==slen-m))
3295 {
3296 ++matches;
3297 embPatPushHit(l,name,h+1,m,begin,mm);
3298 }
3299 }
3300
3301 i+=skip;
3302 }
3303
3304 return matches;
3305 }
3306
3307
3308
3309
3310 /* @func embPatTUBInit ********************************************************
3311 **
3312 ** Initialise a Tarhio-Ukkonen-Bleasby search
3313 **
3314 ** @param [r] pat [const AjPStr] pattern
3315 ** @param [w] skipm [ajuint **] mismatch skip array
3316 ** @param [r] m [ajuint] real pattern length
3317 ** @param [r] k [ajuint] allowed mismatches
3318 ** @param [r] plen [ajuint] full pattern length (incl ajcompl & class)
3319 **
3320 ** @return [void]
3321 **
3322 ** @release 1.0.0
3323 ******************************************************************************/
3324
embPatTUBInit(const AjPStr pat,ajuint ** skipm,ajuint m,ajuint k,ajuint plen)3325 void embPatTUBInit(const AjPStr pat, ajuint **skipm, ajuint m, ajuint k,
3326 ajuint plen)
3327 {
3328 const char *p;
3329 const char *q;
3330 const char *s;
3331 ajint i;
3332 ajint j;
3333 ajint jj;
3334 ajint x;
3335 ajint z;
3336 ajuint flag;
3337 ajuint ready[AJALPHA];
3338
3339 p = ajStrGetPtr(pat);
3340
3341 for(i=0;i<AJALPHA;++i)
3342 {
3343 ready[i] = m;
3344
3345 /*
3346 ** AJB: Note that if the mismatches are 1 less than the real
3347 ** pattern length then the skip value can become 0 - hence
3348 ** the use of AJMAX to ensure that the skip value is always
3349 ** positive. This prevents a potential infinite loop in the
3350 ** upcoming TUBSearch function. Need to check the original
3351 ** algorithm to check that nothing is amiss here, then
3352 ** delete this comment block.
3353 */
3354 for(j=m-k-1;j<(ajint)m;++j)
3355 skipm[j][i] = AJMAX(m-k-1,1);
3356 }
3357
3358 p += plen-1;
3359
3360 if(*p=='}' || *p==']')
3361 {
3362 while(*p!='{' && *p!='[')
3363 --p;
3364
3365 --p;
3366 }
3367 else
3368 --p;
3369
3370 for(j=m-2;j>-1;--j)
3371 {
3372 jj = m-k-1;
3373 x = AJMAX(j+1,jj);
3374
3375 if(*p=='?')
3376 {
3377 for(z='A';z<='Z';++z)
3378 {
3379 for(i=ready[z]-1;i>=x;--i)
3380 skipm[i][z] = i-j;
3381
3382 ready[z] = x;
3383 }
3384
3385 --p;
3386 continue;
3387 }
3388
3389 if(*p==']')
3390 {
3391 --p;
3392 while(*p!='[')
3393 {
3394 for(i=ready[(ajuint)*p]-1;i>=x;--i)
3395 skipm[i][(ajuint)*p] = i-j;
3396 ready[(ajuint)*p] = x;
3397 --p;
3398 }
3399 --p;
3400 continue;
3401 }
3402
3403 if(*p=='}')
3404 {
3405 s=--p;
3406
3407 for(z='A';z<='Z';++z)
3408 {
3409 q = s;
3410 flag = 0;
3411
3412 while(*q!='{')
3413 {
3414 if(*q==z)
3415 {
3416 flag = 1;
3417 break;
3418 }
3419
3420 --q;
3421 }
3422
3423 if(!flag)
3424 {
3425 for(i=ready[z]-1;i>=x;--i)
3426 skipm[i][z] = i-j;
3427
3428 ready[z] = x;
3429 }
3430 }
3431
3432 while(*p!='{')
3433 --p;
3434
3435 --p;
3436 continue;
3437 }
3438
3439 for(i=ready[(ajuint)*p]-1;i>=x;--i)
3440 skipm[i][(ajuint)*p] = i-j;
3441
3442 ready[(ajuint)*p] = x;
3443 --p;
3444 }
3445
3446 return;
3447 }
3448
3449
3450
3451
3452 /* @func embPatTUBSearch ******************************************************
3453 **
3454 ** Perform a Tarhio-Ukkonen-Bleasby search
3455 **
3456 ** @param [r] pat [const AjPStr] pattern
3457 ** @param [r] text [const AjPStr] text to search (incl ajcompl/class)
3458 ** @param [r] slen [ajuint] length of text
3459 ** @param [r] skipm [ajuint * const *] mismatch skip array
3460 ** @param [r] m [ajuint] real pattern length
3461 ** @param [r] k [ajuint] allowed mismatches
3462 ** @param [r] begin [ajuint] text offset
3463 ** @param [u] l [AjPList] list to push to
3464 ** @param [r] amino [AjBool] true if text start
3465 ** @param [r] carboxyl [AjBool] true if text end
3466 ** @param [r] name [const AjPStr] name of text
3467 ** @param [r] plen [ajuint] total pattern length
3468 **
3469 ** @return [ajuint] number of hits
3470 **
3471 ** @release 1.0.0
3472 ******************************************************************************/
3473
embPatTUBSearch(const AjPStr pat,const AjPStr text,ajuint slen,ajuint * const * skipm,ajuint m,ajuint k,ajuint begin,AjPList l,AjBool amino,AjBool carboxyl,const AjPStr name,ajuint plen)3474 ajuint embPatTUBSearch(const AjPStr pat,const AjPStr text, ajuint slen,
3475 ajuint * const *skipm, ajuint m,
3476 ajuint k, ajuint begin, AjPList l, AjBool amino,
3477 AjBool carboxyl, const AjPStr name, ajuint plen)
3478 {
3479 ajuint i;
3480 ajint j;
3481 ajint jj;
3482 ajuint h;
3483 ajuint mm;
3484 ajuint skip;
3485 const char *p;
3486 const char *q;
3487 const char *s;
3488
3489 ajuint matches;
3490 ajint a;
3491 ajuint flag;
3492
3493 s = ajStrGetPtr(pat);
3494 q = ajStrGetPtr(text);
3495
3496 matches = 0;
3497
3498 i = m-1;
3499
3500 while(i<slen)
3501 {
3502 h = i;
3503 j = m-1;
3504 p = s+plen-1;
3505 skip = m-k;
3506 mm = 0;
3507
3508 while(j>-1 && mm<=k)
3509 {
3510 jj = m-k-1;
3511
3512 if(j>=jj)
3513 skip = AJMIN(skip,skipm[j][(ajuint)q[h]]);
3514
3515 a = q[h];
3516
3517 if(*p!='?')
3518 {
3519 if(*p==']')
3520 {
3521 flag = 0;
3522 --p;
3523
3524 while(*p!='[')
3525 {
3526 if(a==*p)
3527 flag = 1;
3528
3529 --p;
3530 }
3531
3532 if(!flag)
3533 ++mm;
3534 }
3535 else if(*p=='}')
3536 {
3537 flag = 0;
3538 --p;
3539
3540 while(*p!='{')
3541 {
3542 if(a==*p)
3543 flag=1;
3544 --p;
3545 }
3546
3547 if(flag)
3548 ++mm;
3549 }
3550 else
3551 if(a!=*p)
3552 ++mm;
3553 }
3554
3555 --p;
3556 --h;
3557 --j;
3558 }
3559
3560 if(mm<=k)
3561 {
3562 if(amino && h+1)
3563 return matches;
3564
3565 if(!carboxyl || (carboxyl && h+1==slen-m))
3566 {
3567 ++matches;
3568 embPatPushHit(l,name,h+1,m,begin,mm);
3569 }
3570 }
3571
3572 i += skip;
3573 }
3574
3575 return matches;
3576 }
3577
3578
3579
3580
3581 /* @funcstatic patBruteClass **************************************************
3582 **
3583 ** Test if character matches one of a class
3584 **
3585 ** @param [r] p [const char *] pattern class e.g. [abc]
3586 ** @param [r] c [char] character to match
3587 **
3588 ** @return [AjBool] true if match success
3589 **
3590 ** @release 1.0.0
3591 ******************************************************************************/
3592
patBruteClass(const char * p,char c)3593 static AjBool patBruteClass(const char *p, char c)
3594 {
3595 const char *s;
3596
3597 s = p+1;
3598
3599 while(*s!=']')
3600 if(*s++==c)
3601 return ajTrue;
3602
3603 return ajFalse;
3604 }
3605
3606
3607
3608
3609 /* @funcstatic patBruteCompl **************************************************
3610 **
3611 ** Test if character matches one of a complement
3612 **
3613 ** @param [r] p [const char *] pattern complement e.g. {abc}
3614 ** @param [r] c [char] character to match
3615 **
3616 ** @return [AjBool] true if character not in complement
3617 **
3618 ** @release 1.0.0
3619 ******************************************************************************/
3620
patBruteCompl(const char * p,char c)3621 static AjBool patBruteCompl(const char *p, char c)
3622 {
3623 const char *s;
3624
3625 s = p+1;
3626
3627 while(*s!='}')
3628 if(*s++==c)
3629 return ajFalse;
3630
3631 return ajTrue;
3632 }
3633
3634
3635
3636
3637 /* @funcstatic patBruteIsRange ************************************************
3638 **
3639 ** Check if character, class or complement has a range e.g. [abc](3,6)
3640 **
3641 ** @param [r] t [const char *] pattern
3642 ** @param [w] x [ajuint *] first range value
3643 ** @param [w] y [ajuint *] second range value
3644 **
3645 ** @return [AjBool] true if there is a range & sets x/y accordingly
3646 **
3647 ** @release 1.0.0
3648 ******************************************************************************/
3649
patBruteIsRange(const char * t,ajuint * x,ajuint * y)3650 static AjBool patBruteIsRange(const char *t, ajuint *x, ajuint *y)
3651 {
3652 const char *u;
3653
3654 if((*t >='A' && *t<='Z') || *t=='?')
3655 {
3656 if(*(t+1)=='(')
3657 {
3658 if(sscanf(t+2,"%u,%u",x,y)!=2)
3659 {
3660 sscanf(t+2,"%u",x);
3661 *y=*x;
3662 }
3663
3664 return ajTrue;
3665 }
3666
3667 return ajFalse;
3668 }
3669
3670 u = t;
3671
3672 if(*t=='[')
3673 {
3674 while(*u!=']')
3675 ++u;
3676
3677 if(*(u+1)=='(')
3678 {
3679 if(sscanf(u+2,"%u,%u",x,y)!=2)
3680 {
3681 sscanf(u+2,"%u",x);
3682 *y = *x;
3683 }
3684
3685 return ajTrue;
3686 }
3687 return ajFalse;
3688 }
3689
3690 while(*u!='}')
3691 ++u;
3692
3693 if(*(u+1)=='(')
3694 {
3695 if(sscanf(u+2,"%u,%u",x,y)!=2)
3696 {
3697 sscanf(u+2,"%u",x);
3698 *y = *x;
3699 }
3700
3701 return ajTrue;
3702 }
3703
3704 return ajFalse;
3705 }
3706
3707
3708
3709
3710 /* @funcstatic patBruteCharMatch **********************************************
3711 **
3712 ** Check if text character matches a char/class or complement
3713 **
3714 ** @param [r] t [const char *] text
3715 ** @param [r] c [char] character to match
3716 **
3717 ** @return [AjBool] true if match
3718 **
3719 ** @release 1.0.0
3720 ******************************************************************************/
3721
patBruteCharMatch(const char * t,char c)3722 static AjBool patBruteCharMatch(const char *t, char c)
3723 {
3724 if(!c)
3725 return ajFalse;
3726
3727 if(*t=='?')
3728 return ajTrue;
3729
3730 if(*t>='A' && *t<='Z')
3731 {
3732 if(*t==c)
3733 return ajTrue;
3734
3735 return ajFalse;
3736 }
3737
3738 if(*t=='[')
3739 {
3740 if(patBruteClass(t,c))
3741 return ajTrue;
3742
3743 return ajFalse;
3744 }
3745
3746 if(patBruteCompl(t,c))
3747 return ajTrue;
3748
3749 return ajFalse;
3750 }
3751
3752
3753
3754
3755 /* @funcstatic patBruteNextPatChar ********************************************
3756 **
3757 ** Get index of next char/class/complement
3758 **
3759 ** @param [r] t [const char *] text
3760 ** @param [r] ppos [ajuint] current index
3761 **
3762 ** @return [ajuint] next index
3763 **
3764 ** @release 1.0.0
3765 ******************************************************************************/
3766
patBruteNextPatChar(const char * t,ajuint ppos)3767 static ajuint patBruteNextPatChar(const char *t, ajuint ppos)
3768 {
3769 if(t[ppos]=='{')
3770 while(t[ppos]!='}')
3771 ++ppos;
3772
3773 if(t[ppos]=='[')
3774 while(t[ppos]!=']')
3775 ++ppos;
3776
3777 ++ppos;
3778
3779 if(t[ppos]=='(')
3780 {
3781 while(t[ppos]!=')')
3782 ++ppos;
3783
3784 ++ppos;
3785 }
3786
3787 return ppos;
3788 }
3789
3790
3791
3792
3793 /* @funcstatic patOUBrute *****************************************************
3794 **
3795 ** Match pattern to current sequence position
3796 **
3797 ** @param [r] seq [const char *] text
3798 ** @param [r] pat [const char *] pattern
3799 ** @param [r] spos [ajuint] sequence index
3800 ** @param [r] ppos [ajuint] pattern index
3801 ** @param [r] mm [ajuint] mismatches left
3802 ** @param [r] omm [ajuint] allowed mismatches
3803 ** @param [r] level [ajuint] level of recursion
3804 ** @param [u] l [AjPList] list on which to push hits
3805 ** @param [r] carboxyl [AjBool] true if pattern must only match end of text
3806 ** @param [r] begin [ajuint] text offset
3807 ** @param [w] count [ajuint *] hit counter
3808 ** @param [r] name [const AjPStr] text entry name
3809 ** @param [r] st [ajuint] original text index
3810 **
3811 ** @return [AjBool] true if hit found (Care! Function recursive.)
3812 **
3813 ** @release 1.0.0
3814 ******************************************************************************/
3815
patOUBrute(const char * seq,const char * pat,ajuint spos,ajuint ppos,ajuint mm,ajuint omm,ajuint level,AjPList l,AjBool carboxyl,ajuint begin,ajuint * count,const AjPStr name,ajuint st)3816 static AjBool patOUBrute(const char *seq, const char *pat, ajuint spos,
3817 ajuint ppos, ajuint mm,
3818 ajuint omm, ajuint level, AjPList l, AjBool carboxyl,
3819 ajuint begin, ajuint *count,
3820 const AjPStr name, ajuint st)
3821 {
3822 const char *t;
3823 ajuint x;
3824 ajuint y;
3825 ajuint i;
3826
3827 if(level==1000)
3828 ajFatal("Pattern too complex: 1000 levels of recursion");
3829
3830
3831 while(pat[ppos])
3832 {
3833 t = pat+ppos;
3834
3835 if(!seq[spos])
3836 return ajFalse;
3837
3838 if(!patBruteIsRange(t,&x,&y))
3839 {
3840 if(!patBruteCharMatch(t,seq[spos]))
3841 {
3842 if(mm==0)
3843 return ajFalse;
3844
3845 --mm;
3846 }
3847
3848 ppos = patBruteNextPatChar(pat,ppos);
3849 ++spos;
3850 continue;
3851 }
3852
3853 for(i=0;i<x;++i)
3854 {
3855 if(!patBruteCharMatch(t,seq[spos++]))
3856 {
3857 if(mm==0)
3858 return ajFalse;
3859
3860 --mm;
3861 }
3862
3863 if(!seq[spos-1])
3864 return ajFalse;
3865 }
3866
3867 ppos=patBruteNextPatChar(pat,ppos);
3868
3869 for(i=0;i<y-x;++i)
3870 {
3871 patOUBrute(seq,pat,spos,ppos,mm,omm,level,l,carboxyl,
3872 begin,count,name,st);
3873
3874 if(!patBruteCharMatch(t,seq[spos]))
3875 return ajFalse;
3876
3877 ++spos;
3878 }
3879 }
3880
3881 if(!carboxyl || (carboxyl && !seq[spos]))
3882 {
3883 *count += 1;
3884 embPatPushHit(l,name,st,spos-st,begin,omm-mm);
3885
3886 }
3887
3888 return ajTrue;
3889 }
3890
3891
3892
3893
3894 /* @func embPatBruteForce *****************************************************
3895 **
3896 ** Match pattern to a sequence
3897 **
3898 ** @param [r] seq [const AjPStr] text
3899 ** @param [r] pat [const AjPStr] pattern
3900 ** @param [r] amino [AjBool] true if must match start
3901 ** @param [r] carboxyl [AjBool] true if must match end
3902 ** @param [u] l [AjPList] list on which to push hits
3903 ** @param [r] begin [ajuint] text offset
3904 ** @param [r] mm [ajuint] allowed mismatches
3905 ** @param [r] name [const AjPStr] text entry name
3906 **
3907 ** @return [ajuint] number of hits
3908 **
3909 ** @release 1.0.0
3910 ******************************************************************************/
3911
embPatBruteForce(const AjPStr seq,const AjPStr pat,AjBool amino,AjBool carboxyl,AjPList l,ajuint begin,ajuint mm,const AjPStr name)3912 ajuint embPatBruteForce(const AjPStr seq, const AjPStr pat,
3913 AjBool amino, AjBool carboxyl,
3914 AjPList l, ajuint begin, ajuint mm, const AjPStr name)
3915 {
3916 const char *s;
3917 const char *p;
3918 ajuint sum;
3919 ajuint len;
3920 ajuint i;
3921 ajuint count;
3922
3923 /*
3924 // ajDebug("embPatBruteForce amino:%B carboxyl:%B begin:%u mm:%u len:%u\n",
3925 // amino, carboxyl, begin, mm, ajStrGetLen(seq));
3926 */
3927
3928 sum=count = 0;
3929 s = ajStrGetPtr(seq);
3930 p = ajStrGetPtr(pat);
3931
3932 if(amino)
3933 {
3934 patOUBrute(s,p,0,0,mm,mm,1,l,carboxyl,begin,&count,name,0);
3935
3936 return count;
3937 }
3938
3939 len = (ajuint) strlen(s);
3940
3941 for(i=0;i<len;++i)
3942 {
3943 patOUBrute(s,p,i,0,mm,mm,1,l,carboxyl,begin,&count,name,i);
3944 sum += count;
3945 count = 0;
3946 }
3947
3948 return sum;
3949 }
3950
3951
3952
3953
3954 /* @func embPatVariablePattern ************************************************
3955 **
3956 ** Match variable pattern against constant text.
3957 ** Used for matching many patterns against one sequence.
3958 **
3959 ** @param [r] pattern [const AjPStr] pattern to match
3960 ** @param [r] text [const AjPStr] text to scan
3961 ** @param [r] patname [const AjPStr] ID or AC of pattern
3962 ** @param [u] l [AjPList] list on which to push hits
3963 ** @param [r] mode [ajuint] 1 for protein, 0 for nucleic acid
3964 ** @param [r] mismatch [ajuint] allowed mismatches
3965 ** @param [r] begin [ajuint] text offset
3966 **
3967 ** @return [ajuint] number of hits
3968 **
3969 ** @release 1.0.0
3970 ** @@
3971 ******************************************************************************/
3972
embPatVariablePattern(const AjPStr pattern,const AjPStr text,const AjPStr patname,AjPList l,ajuint mode,ajuint mismatch,ajuint begin)3973 ajuint embPatVariablePattern(const AjPStr pattern,
3974 const AjPStr text,
3975 const AjPStr patname, AjPList l, ajuint mode,
3976 ajuint mismatch, ajuint begin)
3977 {
3978 ajuint plen;
3979 ajuint slen = 0;
3980
3981 AjBool amino;
3982 AjBool carboxyl;
3983 AjBool fclass;
3984 AjBool ajcompl;
3985 AjBool dontcare;
3986 AjBool range;
3987 ajint *buf; /* can be -1 */
3988 ajuint hits;
3989 ajuint m;
3990 ajuint n;
3991 ajuint i;
3992 ajuint start;
3993 ajuint end;
3994
3995 AjOPatBYPNode off[AJALPHA];
3996
3997 ajuint *sotable = NULL;
3998 ajuint solimit;
3999
4000 EmbPPatMatch ppm;
4001 AjPStr regexp;
4002
4003 ajuint **skipm;
4004
4005 AjPStr cleanpattern = NULL;
4006
4007 cleanpattern = ajStrNew();
4008
4009 if(!embPatClassify(pattern,&cleanpattern,
4010 &amino,&carboxyl,&fclass,&ajcompl,&dontcare,
4011 &range,mode))
4012 ajFatal("Illegal pattern");
4013
4014 plen = ajStrGetLen(cleanpattern);
4015
4016 /*
4017 ** Select type of search depending on pattern
4018 */
4019
4020 if(!range && !dontcare && !fclass && !ajcompl && !mismatch && plen>4)
4021 {
4022 /* Boyer Moore Horspool is the choice for long exact patterns */
4023 plen = ajStrGetLen(cleanpattern);
4024 AJCNEW(buf, AJALPHA);
4025 embPatBMHInit(cleanpattern,plen,buf);
4026 hits = embPatBMHSearch(text,cleanpattern,ajStrGetLen(text),
4027 ajStrGetLen(cleanpattern),buf,
4028 0,amino,carboxyl,l,
4029 patname,begin);
4030 AJFREE(buf);
4031 ajStrDel(&cleanpattern);
4032
4033 return hits;
4034 }
4035
4036 else if(mismatch && !dontcare && !range && !fclass && !ajcompl)
4037 {
4038 /* Baeza-Yates Perleberg for exact patterns plus don't cares */
4039 plen = ajStrGetLen(cleanpattern);
4040 AJCNEW(buf, AJALPHA);
4041 embPatBYPInit(cleanpattern,plen,off,buf);
4042 hits = embPatBYPSearch(text,patname,begin,
4043 ajStrGetLen(text),plen,mismatch,off,buf,l,
4044 amino,carboxyl,cleanpattern);
4045 AJFREE(buf);
4046 ajStrDel(&cleanpattern);
4047
4048 return hits;
4049 }
4050
4051
4052 if(!range && !dontcare && !fclass && !ajcompl && !mismatch)
4053 {
4054 /* Shift-OR is the choice for small exact patterns */
4055 AJCNEW(sotable, AJALPHA2);
4056 embPatSOInit(cleanpattern,sotable,&solimit);
4057 hits = embPatSOSearch(text,patname,*ajStrGetPtr(cleanpattern),
4058 begin,plen,sotable,solimit,l,
4059 amino,carboxyl);
4060 AJFREE(sotable);
4061 ajStrDel(&cleanpattern);
4062
4063 return hits;
4064 }
4065
4066
4067 /* Next get m, the real pattern length */
4068 /* May as well set up a class search as well tho' it needs 'free's later */
4069 AJCNEW(sotable, AJALPHA2);
4070 embPatBYGCInit(cleanpattern,&m,sotable,&solimit);
4071
4072
4073 if(!range && (fclass || ajcompl) && !mismatch && m <= AJWORD)
4074 {
4075 /*
4076 ** Baeza-Yates Gonnet for classes and don't cares.
4077 ** No mismatches or ranges. Patterns less than (e.g.) 32
4078 ** Uses Shift-OR search engine
4079 */
4080 AJFREE(sotable);
4081 AJCNEW(sotable, AJALPHA2);
4082 embPatBYGCInit(cleanpattern,&m,sotable,&solimit);
4083 plen = m;
4084 hits = embPatBYGSearch(text,patname,
4085 begin,plen,sotable,solimit,l,
4086 amino,carboxyl);
4087
4088 AJFREE(sotable);
4089 ajStrDel(&cleanpattern);
4090
4091 return hits;
4092 }
4093
4094
4095
4096 if(!mismatch && (range || m > AJWORD))
4097 {
4098 /*
4099 ** PCRE for ranges and simple classes longer than
4100 ** e.g. 32. No mismatches allowed
4101 **/
4102 AJFREE(sotable);
4103 regexp = embPatPrositeToRegExp(pattern); /* original pattern */
4104 ppm = embPatMatchFind(regexp,text,amino,carboxyl);
4105 n = embPatMatchGetNumber(ppm);
4106
4107 for(i=0;i<n;++i)
4108 {
4109 start = embPatMatchGetStart(ppm,i);
4110 end = embPatMatchGetEnd(ppm,i);
4111
4112 if(amino && start)
4113 {
4114 n = 0;
4115 break;
4116 }
4117
4118 if(!carboxyl || (carboxyl && start==slen-(end-start+1)))
4119 embPatPushHit(l,patname,start,end-start+1,
4120 begin,0);
4121 }
4122
4123 embPatMatchDel(&ppm);
4124 hits = n;
4125 ajStrDel(®exp);
4126 ajStrDel(&cleanpattern);
4127
4128 return hits;
4129 }
4130
4131
4132 if(mismatch && !range && (fclass || ajcompl))
4133 {
4134 /* Try a Tarhio-Ukkonen-Bleasby */
4135
4136 AJFREE(sotable);
4137
4138 AJCNEW(skipm, m);
4139
4140 for(i=0;i<m;++i)
4141 AJCNEW(skipm[i], AJALPHA);
4142
4143 embPatTUBInit(cleanpattern,skipm,m,mismatch,plen);
4144 hits = embPatTUBSearch(cleanpattern,text,ajStrGetLen(text),skipm,
4145 m,mismatch,begin,
4146 l,amino,carboxyl,patname,plen);
4147 for(i=0;i<m;++i)
4148 AJFREE(skipm[i]);
4149
4150 AJFREE(skipm);
4151 ajStrDel(&cleanpattern);
4152
4153 return hits;
4154 }
4155
4156 /*
4157 ** No choice left but to do a Bleasby recursive brute force
4158 */
4159 hits = embPatBruteForce(text,cleanpattern,amino,carboxyl,l,
4160 begin,mismatch,patname);
4161 AJFREE(sotable);
4162
4163 ajStrDel(&cleanpattern);
4164
4165 return hits;
4166 }
4167
4168
4169
4170
4171 /* @func embPatRestrictPreferred **********************************************
4172 **
4173 ** Replace RE names by the name of the prototype for that RE
4174 **
4175 ** @param [u] l [AjPList] list of EmbPMatMatch hits
4176 ** @param [r] t [const AjPTable] table from embossre.equ file
4177 **
4178 ** @return [void]
4179 **
4180 ** @release 2.7.0
4181 ** @@
4182 ******************************************************************************/
4183
embPatRestrictPreferred(AjPList l,const AjPTable t)4184 void embPatRestrictPreferred(AjPList l, const AjPTable t)
4185 {
4186 AjIList iter = NULL;
4187 EmbPMatMatch m = NULL;
4188 const AjPStr value = NULL;
4189
4190 iter = ajListIterNewread(l);
4191
4192 while((m = (EmbPMatMatch)ajListIterGet(iter)))
4193 {
4194 value = ajTableFetchS(t,m->cod);
4195
4196 if(value)
4197 ajStrAssignS(&m->cod,value);
4198 }
4199
4200 ajListIterDel(&iter);
4201
4202 return;
4203 }
4204
4205
4206
4207
4208 /* @func embPatRestrictRestrict ***********************************************
4209 **
4210 ** Cut down the number of restriction enzyme hits from embPatRestrictScan
4211 ** Notably double reporting of symmetric palindromes and reporting
4212 ** of isoschizomers. Also provides an optional alphabetic sort.
4213 **
4214 ** If we don't allow isoschizomers, then names of all isoschizomers
4215 ** found will be added to the string 'iso' in the returned list of
4216 ** EmbPMatMatch structures. If 'isos' is AjTrue then they will be left alone.
4217 **
4218 ** @param [u] l [AjPList] list of hits from embPatRestrictScan
4219 ** @param [r] hits [ajuint] number of hits from embPatRestrictScan
4220 ** @param [r] isos [AjBool] Allow isoschizomers
4221 ** @param [r] alpha [AjBool] Sort alphabetically
4222 **
4223 ** @return [ajuint] adjusted number of hits
4224 **
4225 ** @release 1.0.0
4226 ** @@
4227 ******************************************************************************/
4228
embPatRestrictRestrict(AjPList l,ajuint hits,AjBool isos,AjBool alpha)4229 ajuint embPatRestrictRestrict(AjPList l, ajuint hits, AjBool isos,
4230 AjBool alpha)
4231 {
4232 EmbPMatMatch m = NULL;
4233 EmbPMatMatch archetype = NULL; /* archetype of a set of isoschizomers */
4234 AjPStr ps = NULL;
4235 AjPList tlist = NULL;
4236 AjPList newlist = NULL;
4237
4238 ajuint i;
4239 ajuint v;
4240 ajuint tc = 0;
4241 ajuint nc = 0;
4242 ajint cut1 = 0;
4243 ajint cut2;
4244 ajint cut3;
4245 ajint cut4;
4246 ajuint pos = 0;
4247
4248 ps = ajStrNew();
4249 tlist = ajListNew();
4250 newlist = ajListNew();
4251
4252 /* Remove Mirrors for each enzyme separately */
4253 ajListSort(l, &embPatRestrictNameCompare);
4254 tc = nc = 0;
4255
4256 if(hits)
4257 {
4258 ajListPop(l,(void **)&m);
4259 ajStrAssignS(&ps,m->cod);
4260 ajListPush(l,(void *)m);
4261 }
4262
4263 while(ajListPop(l,(void **)&m))
4264 {
4265 if(!ajStrCmpS(m->cod,ps))
4266 {
4267 ajListPush(tlist,(void *)m);
4268 ++tc;
4269 }
4270 else
4271 {
4272 ajStrAssignS(&ps,m->cod);
4273 ajListPush(l,(void *)m);
4274 ajListSort(tlist, &embPatRestrictStartCompare);
4275 ajListSort(tlist, &embPatRestrictCutCompare);
4276 cut1 = cut2 = INT_MAX;
4277
4278 for(i=0;i<tc;++i)
4279 {
4280 ajListPop(tlist,(void **)&m);
4281
4282 if(cut1!=m->cut1)
4283 {
4284 cut1=m->cut1;
4285 ajListPush(newlist,(void *)m);
4286 ++nc;
4287 }
4288 else
4289 embMatMatchDel(&m);
4290 }
4291
4292 tc = 0;
4293 }
4294 }
4295
4296 ajListSort(tlist, &embPatRestrictStartCompare);
4297 ajListSort(tlist, &embPatRestrictCutCompare);
4298
4299 cut1 = cut2 = INT_MAX;
4300
4301 for(i=0;i<tc;++i)
4302 {
4303 ajListPop(tlist,(void **)&m);
4304
4305 if(cut1!=m->cut1)
4306 {
4307 cut1=m->cut1;
4308 ajListPush(newlist,(void *)m);
4309 ++nc;
4310 }
4311 else
4312 embMatMatchDel(&m);
4313 }
4314
4315 /* List l is currently empty - now reuse it */
4316
4317 hits = nc;
4318 ajListFree(&tlist);
4319 tlist = ajListNew();
4320
4321
4322 if(!isos)
4323 {
4324 /* Keep only first alphabetical isoschizomer */
4325 ajListSort(newlist, &embPatRestrictStartCompare);
4326
4327 if(hits)
4328 {
4329 ajListPop(newlist,(void **)&m);
4330 pos = m->start;
4331 ajListPush(newlist,(void *)m);
4332 }
4333
4334 tc = nc =0;
4335
4336 while(ajListPop(newlist,(void **)&m))
4337 {
4338 if(pos==m->start)
4339 {
4340 /*
4341 ** push groups of RE's that share the same start
4342 ** position onto tlist to
4343 ** be checked later to see if they are isoschizomers
4344 */
4345 ajListPush(tlist,(void *)m);
4346 ++tc;
4347 }
4348 else
4349 {
4350 pos = m->start;
4351
4352 ajListPush(newlist,(void *)m);
4353
4354 /*
4355 ** Now for list of all enz's which cut at same pos
4356 ** sorted by Name
4357 */
4358 ajListSort(tlist, &embPatRestrictNameCompare);
4359
4360 /*
4361 ** Now loop rejecting, for each left in the list,
4362 ** anything similar
4363 */
4364
4365 /*
4366 ** check for isoschizomers in the group sharing the
4367 ** previous 'pos' here
4368 */
4369 while(tc)
4370 {
4371 ajListPop(tlist,(void **)&m);
4372 cut1 = m->cut1;
4373 cut2 = m->cut2;
4374 cut3 = m->cut3;
4375 cut4 = m->cut4;
4376 ajStrAssignC(&ps,ajStrGetPtr(m->pat));
4377
4378 /*
4379 ** first one of the group is not an isoschizomer,
4380 ** by definition, so return it
4381 */
4382 ajListPush(l,(void *)m);
4383 archetype = m;
4384 ++nc;
4385 --tc;
4386
4387
4388 for(i=0,v=0;i<tc;++i)
4389 {
4390 ajListPop(tlist,(void **)&m);
4391
4392 if(m->cut1!=cut1 || m->cut2!=cut2 || m->cut3!=cut3 ||
4393 m->cut4!=cut4 || ajStrCmpS(ps,m->pat))
4394 {
4395 ajListPushAppend(tlist,(void *)m);
4396 ++v;
4397 }
4398
4399 else
4400 {
4401 /*
4402 ** same cut sites and pattern at the RE just
4403 ** pushed onto 'l', so is an
4404 **isoschizomer - add its name to the
4405 ** archetype's list of isoschizomers and
4406 ** delete
4407 */
4408 if(ajStrGetLen(archetype->iso) > 0)
4409 ajStrAppendC(&archetype->iso, ",");
4410
4411 ajStrAppendS(&archetype->iso, m->cod);
4412 embMatMatchDel(&m);
4413 }
4414 }
4415
4416 tc = v;
4417 }
4418 }
4419 }
4420
4421
4422
4423 ajListSort(tlist, &embPatRestrictNameCompare);
4424
4425 while(tc)
4426 {
4427 ajListPop(tlist,(void **)&m);
4428 cut1 = m->cut1;
4429 cut2 = m->cut2;
4430 cut3 = m->cut3;
4431 cut4 = m->cut4;
4432 ajStrAssignC(&ps,ajStrGetPtr(m->pat));
4433 /*
4434 ** first one of the group is not an isoschizomer,
4435 ** by definition, so return it
4436 */
4437 ajListPush(l,(void *)m);
4438 archetype = m;
4439 ++nc;
4440 --tc;
4441
4442
4443 for(i=0,v=0;i<tc;++i)
4444 {
4445 ajListPop(tlist,(void **)&m);
4446
4447 if(m->cut1!=cut1 || m->cut2!=cut2 || m->cut3!=cut3 ||
4448 m->cut4!=cut4 || ajStrCmpS(ps,m->pat))
4449 {
4450 ajListPushAppend(tlist,(void *)m);
4451 ++v;
4452 }
4453 else
4454 {
4455 /*
4456 ** same cut sites and pattern as the RE
4457 ** just pushed onto 'l', so is an
4458 ** isoschizomer - add its name to the archetype's
4459 ** list of isoschizomers and
4460 ** delete
4461 */
4462 if(ajStrGetLen(archetype->iso) > 0)
4463 ajStrAppendC(&archetype->iso, ",");
4464
4465 ajStrAppendS(&archetype->iso, m->cod);
4466 embMatMatchDel(&m);
4467 }
4468 }
4469
4470 tc = v;
4471 }
4472
4473 hits = nc;
4474 }
4475 else
4476 {
4477 while(ajListPop(newlist,(void **)&m))
4478 {
4479 ajListPush(l, (void*) m);
4480 }
4481
4482 ajListFree(&newlist);
4483 }
4484
4485 /* Finally sort on position of recognition sequence and print */
4486 ajListSort(l, &embPatRestrictStartCompare);
4487
4488 if(alpha)
4489 ajListSortTwo(l,
4490 &embPatRestrictNameCompare,
4491 &embPatRestrictStartCompare);
4492
4493 ajStrDel(&ps);
4494 ajListFree(&tlist);
4495 ajListFree(&newlist);
4496
4497 return hits;
4498 }
4499
4500
4501
4502
4503 /* @func embPatRestrictStartCompare *******************************************
4504 **
4505 ** Sort restriction site hits on the basis of start position
4506 **
4507 ** @param [r] a [const void *] First EmbPMatMatch hit
4508 ** @param [r] b [const void *] Second EmbPMatMatch hit
4509 **
4510 ** @return [ajint] 0 if a and b are equal
4511 ** -ve if a is less than b,
4512 ** +ve if a is greater than b
4513 **
4514 ** @release 1.13.0
4515 ******************************************************************************/
4516
embPatRestrictStartCompare(const void * a,const void * b)4517 ajint embPatRestrictStartCompare(const void *a, const void *b)
4518 {
4519 ajuint astart = (*(EmbPMatMatch const *)a)->start;
4520 ajuint bstart = (*(EmbPMatMatch const *)b)->start;
4521
4522 if(!(*(EmbPMatMatch const *)a)->forward)
4523 astart = (*(EmbPMatMatch const *)a)->end;
4524 if(!(*(EmbPMatMatch const *)b)->forward)
4525 bstart = (*(EmbPMatMatch const *)b)->end;
4526
4527 return astart - bstart;
4528 }
4529
4530
4531
4532
4533 /* @func embPatRestrictCutCompare *********************************************
4534 **
4535 ** Sort restriction site hits on the basis of cut position
4536 **
4537 ** @param [r] a [const void *] First EmbPMatMatch hit
4538 ** @param [r] b [const void *] Second EmbPMatMatch hit
4539 **
4540 ** @return [ajint] 0 if a and b are equal
4541 ** -ve if a is less than b,
4542 ** +ve if a is greater than b
4543 **
4544 ** @release 1.13.0
4545 ******************************************************************************/
4546
embPatRestrictCutCompare(const void * a,const void * b)4547 ajint embPatRestrictCutCompare(const void *a, const void *b)
4548 {
4549 return (*(EmbPMatMatch const *)a)->cut1 - (*(EmbPMatMatch const *)b)->cut1;
4550 }
4551
4552
4553
4554
4555 /* @func embPatRestrictNameCompare ********************************************
4556 **
4557 ** Sort restriction site hits on the basis of enzyme name
4558 **
4559 ** @param [r] a [const void *] First EmbPMatMatch hit
4560 ** @param [r] b [const void *] Second EmbPMatMatch hit
4561 **
4562 ** @return [ajint] 0 if a and b are equal
4563 ** -ve if a is less than b,
4564 ** +ve if a is greater than b
4565 **
4566 ** @release 1.13.0
4567 ******************************************************************************/
4568
embPatRestrictNameCompare(const void * a,const void * b)4569 ajint embPatRestrictNameCompare(const void *a, const void *b)
4570 {
4571 return strcmp(ajStrGetPtr((*(EmbPMatMatch const *)a)->cod),
4572 ajStrGetPtr((*(EmbPMatMatch const *)b)->cod));
4573 }
4574
4575
4576
4577
4578 /* @func embPatRestrictMatch **************************************************
4579 **
4580 ** Main Restriction function. Scans sequence and rejects unwanted
4581 ** cutters
4582 **
4583 ** @param [r] seq [const AjPSeq] sequence
4584 ** @param [r] begin [ajuint] start position in sequence
4585 ** @param [r] end [ajuint] end position in sequence
4586 ** @param [u] enzfile [AjPFile] file pointer to .enz file
4587 ** @param [u] methfile [AjPFile] file pointer to methylation data file
4588 ** @param [r] enzymes [const AjPStr] comma separated list of REs
4589 ** or NULL for all
4590 ** @param [r] sitelen [ajuint] minimum length of recognition site
4591 ** @param [r] plasmid [AjBool] Circular DNA
4592 ** @param [r] ambiguity [AjBool] Allow ambiguities
4593 ** @param [r] min [ajuint] minimum number of true cuts
4594 ** @param [r] max [ajuint] maximum number of true cuts
4595 ** @param [r] blunt [AjBool] Allow blunt cutters
4596 ** @param [r] sticky [AjBool] Allow sticky cutters
4597 ** @param [r] commercial [AjBool] Allow Only report REs with a supplier
4598 ** @param [r] methyl [AjBool] Mark methylated bases as 'N'
4599 ** @param [u] l [AjPList] list for (EmbPMatMatch) hits
4600 **
4601 ** @return [ajuint] number of hits
4602 **
4603 ** @release 1.0.0
4604 ** @@
4605 ******************************************************************************/
4606
embPatRestrictMatch(const AjPSeq seq,ajuint begin,ajuint end,AjPFile enzfile,AjPFile methfile,const AjPStr enzymes,ajuint sitelen,AjBool plasmid,AjBool ambiguity,ajuint min,ajuint max,AjBool blunt,AjBool sticky,AjBool commercial,AjBool methyl,AjPList l)4607 ajuint embPatRestrictMatch(const AjPSeq seq, ajuint begin, ajuint end,
4608 AjPFile enzfile, AjPFile methfile,
4609 const AjPStr enzymes, ajuint sitelen,
4610 AjBool plasmid, AjBool ambiguity,
4611 ajuint min, ajuint max,
4612 AjBool blunt, AjBool sticky, AjBool commercial,
4613 AjBool methyl, AjPList l)
4614 {
4615 AjBool hassup;
4616 AjBool isall = ajTrue;
4617 AjPStr name;
4618 const AjPStr strand;
4619 AjPStr substr;
4620 AjPStr revstr;
4621 AjPStr binstr;
4622 AjPStr binrev;
4623 AjPStr *ea;
4624 AjPStr tmpstr = NULL;
4625 AjPList methlist = NULL;
4626
4627 EmbPPatRestrict enz;
4628 MethPData md = NULL;
4629
4630
4631 ajuint len;
4632 ajuint plen;
4633 ajuint i;
4634 ajuint hits;
4635 ajuint ne = 0;
4636
4637 const char *cp;
4638 char *p;
4639 char *q;
4640
4641 if(enzymes)
4642 {
4643 ne = ajArrCommaList(enzymes,&ea);
4644
4645 if(!ne)
4646 return 0;
4647 }
4648
4649 name = ajStrNew();
4650 substr = ajStrNew();
4651 revstr = ajStrNew();
4652 binstr = ajStrNew();
4653 binrev = ajStrNew();
4654
4655 enz = embPatRestrictNew();
4656
4657 if(methyl)
4658 methlist = patRestrictReadMethyl(methfile);
4659
4660 if(!enzymes)
4661 isall = ajTrue;
4662 else
4663 {
4664 for(i=0;i<ne;++i)
4665 {
4666 ajStrRemoveWhite(&ea[i]);
4667 ajStrFmtUpper(&ea[i]);
4668 }
4669
4670 if(ajStrMatchCaseC(ea[0],"all"))
4671 isall = ajTrue;
4672 else
4673 isall = ajFalse;
4674 }
4675
4676
4677
4678 ajFileSeek(enzfile,0L,0);
4679 ajStrAssignS(&name,ajSeqGetNameS(seq));
4680 strand = ajSeqGetSeqS(seq);
4681 ajStrAssignSubS(&substr,strand,begin-1,end-1);
4682 ajStrFmtUpper(&substr);
4683 len = plen = ajStrGetLen(substr);
4684 ajStrAssignSubS(&revstr,strand,begin-1,end-1);
4685 ajStrFmtUpper(&revstr);
4686 ajSeqstrReverse(&revstr);
4687
4688 if(methyl)
4689 {
4690 patRestrictMethylMod(&substr,&revstr,methlist);
4691 patRestrictMethylMod(&revstr,&substr,methlist);
4692 }
4693
4694
4695 ajStrAssignS(&binstr,substr);
4696 ajStrAssignS(&binrev,revstr);
4697
4698 if(plasmid)
4699 {
4700 plen <<= 1;
4701 tmpstr = ajStrNew();
4702 ajStrAssignS(&tmpstr,substr);
4703 ajStrAppendS(&tmpstr,substr);
4704 ajStrAssignS(&substr,tmpstr);
4705
4706 ajStrAssignS(&tmpstr,binstr);
4707 ajStrAppendS(&tmpstr,binstr);
4708 ajStrAssignS(&binstr,tmpstr);
4709
4710 ajStrAssignS(&tmpstr,revstr);
4711 ajStrAppendS(&tmpstr,revstr);
4712 ajStrAssignS(&revstr,tmpstr);
4713
4714 ajStrAssignS(&tmpstr,binrev);
4715 ajStrAppendS(&tmpstr,binrev);
4716 ajStrAssignS(&binrev,tmpstr);
4717
4718 ajStrDel(&tmpstr);
4719 }
4720
4721 q = ajStrGetuniquePtr(&binrev);
4722 p = ajStrGetuniquePtr(&binstr);
4723
4724 for(i=0;i<plen;++i,++p,++q)
4725 {
4726 *p = (char)ajBaseAlphaToBin(*p);
4727 *q = (char)ajBaseAlphaToBin(*q);
4728 }
4729
4730
4731 hits = 0;
4732
4733 while(!ajFileIsEof(enzfile))
4734 {
4735 if(!embPatRestrictReadEntry(enz,enzfile))
4736 continue;
4737
4738 if(!enz->ncuts)
4739 continue;
4740
4741 if(enz->len < sitelen)
4742 continue;
4743
4744 if(!blunt && enz->blunt)
4745 continue;
4746
4747 if(!sticky && !enz->blunt)
4748 continue;
4749
4750 cp = ajStrGetPtr(enz->pat);
4751
4752 if(*cp>='A' && *cp<='Z')
4753 hassup = ajTrue;
4754 else
4755 hassup = ajFalse;
4756
4757 if(!hassup && isall && commercial)
4758 continue;
4759
4760 ajStrFmtUpper(&enz->pat);
4761
4762 if(!isall)
4763 {
4764 for(i=0;i<ne;++i)
4765 if(ajStrMatchCaseS(ea[i],enz->cod))
4766 break;
4767
4768 if(i==ne)
4769 continue;
4770 }
4771
4772 hits += embPatRestrictScan(enz,substr,binstr,revstr,binrev,len,
4773 ambiguity,plasmid,min,max,begin,l);
4774 }
4775
4776
4777
4778 for(i=0;i<ne;++i)
4779 ajStrDel(&ea[i]);
4780
4781 if(ne)
4782 AJFREE(ea);
4783
4784 if(methyl)
4785 {
4786 while(ajListPop(methlist,(void **)&md))
4787 {
4788 ajStrDel(&md->Name);
4789 ajStrDel(&md->Site);
4790 ajStrDel(&md->Replace);
4791 AJFREE(md);
4792 }
4793
4794 ajListFree(&methlist);
4795 }
4796
4797 ajStrDel(&name);
4798 ajStrDel(&substr);
4799 ajStrDel(&revstr);
4800 ajStrDel(&binstr);
4801 ajStrDel(&binrev);
4802 embPatRestrictDel(&enz);
4803
4804 return hits;
4805 }
4806
4807
4808
4809
4810 /* @func embPatGetType ********************************************************
4811 **
4812 ** Return the type of a pattern
4813 **
4814 ** @param [r] pattern [const AjPStr] original pattern
4815 ** @param [w] cleanpat [AjPStr *] cleaned pattern
4816 ** @param [r] mismatch [ajuint] number of allowed mismatches
4817 ** @param [r] protein [AjBool] true if protein
4818 ** @param [w] m [ajuint*] real length of pattern
4819 ** @param [w] left [AjBool*] must match left begin
4820 ** @param [w] right [AjBool*] must match right
4821 **
4822 ** @return [ajuint] type of pattern
4823 **
4824 ** @release 1.0.0
4825 ** @@
4826 ******************************************************************************/
4827
embPatGetType(const AjPStr pattern,AjPStr * cleanpat,ajuint mismatch,AjBool protein,ajuint * m,AjBool * left,AjBool * right)4828 ajuint embPatGetType(const AjPStr pattern, AjPStr *cleanpat,
4829 ajuint mismatch, AjBool protein,
4830 ajuint *m,
4831 AjBool *left, AjBool *right)
4832 {
4833 AjBool fclass;
4834 AjBool compl;
4835 AjBool dontcare;
4836 AjBool range;
4837 ajuint plen;
4838 ajuint type;
4839 const char *p;
4840
4841 ajStrAssignS(cleanpat,pattern);
4842 if(!embPatClassify(pattern,cleanpat,
4843 left,right,&fclass,&compl,&dontcare,
4844 &range,protein))
4845 return 0;
4846
4847 /* Get real pattern length */
4848 p = ajStrGetPtr(*cleanpat);
4849 *m = 0;
4850
4851 while(*p)
4852 {
4853 ++p;
4854 ++*m;
4855
4856 if(*p=='{')
4857 while(*p && *p!='}')
4858 ++p;
4859 else if(*p=='[')
4860 while(*p && *p!=']')
4861 ++p;
4862 else if(*p && *p=='(')
4863 {
4864 while(*p!=')')
4865 ++p;
4866 --*m;
4867 }
4868 }
4869
4870
4871 plen = ajStrGetLen(*cleanpat);
4872 type = 0;
4873
4874 /*
4875 ** Select type of search depending on pattern
4876 */
4877
4878 if(!range && !dontcare && !fclass && !compl && !mismatch && plen>AJWORD)
4879 {
4880 /* Boyer Moore Horspool is the choice for long exact patterns */
4881 type = 1;
4882 }
4883 else if(mismatch && !dontcare && !range && !fclass && !compl &&
4884 plen<AJALPHA/2)
4885 {
4886 /* Baeza-Yates Perleberg for exact patterns plus mismatches */
4887 type = 2;
4888 }
4889 else if(!range && !dontcare && !fclass && !compl && !mismatch &&
4890 plen<=AJWORD)
4891 {
4892 /* Shift-OR is the choice for small exact patterns */
4893 type = 3;
4894 }
4895 else if(!range && (fclass || compl || dontcare) && !mismatch && *m<=AJWORD)
4896 {
4897 /*
4898 * Baeza-Yates Gonnet for classes and dontcares.
4899 * No mismatches or ranges. Patterns less than (e.g.) 32
4900 */
4901 type = 4;
4902 }
4903 else if(!mismatch && (range || *m>AJWORD))
4904 {
4905 type = 5;
4906 /* note:
4907 ** there was a test here for '?' in the original pattern
4908 ** but that is an illegal pattern character
4909 */
4910 }
4911 else if(mismatch && !range && (fclass || compl))
4912 {
4913 /* Try a Tarhio-Ukkonen-Bleasby */
4914 type = 6;
4915 }
4916 else if((mismatch && range) || !type)
4917 {
4918 /*
4919 ** No choice left but to do a Bleasby recursive brute force
4920 */
4921 type = 7;
4922 }
4923
4924 ajDebug("embPatType %d '%S'\n", type, pattern);
4925
4926 if (!ajStrMatchCaseS(pattern, *cleanpat))
4927 ajDebug("embPatType cleaned to '%S'\n", *cleanpat);
4928
4929 return type;
4930 }
4931
4932
4933
4934
4935 /* @func embPatCompile ********************************************************
4936 **
4937 ** Compile a pattern classified by embPatGetType
4938 **
4939 ** @param [r] type [ajuint] pattern type
4940 ** @param [r] pattern [const AjPStr] original pattern
4941 ** @param [w] plen [ajuint*] pattern length
4942 ** @param [w] buf [ajint**] buffer for BMH and BYP search (can be -1)
4943 ** @param [w] off [EmbPPatBYPNode] offset buffer for B-Y/P search
4944 ** @param [w] sotable [ajuint**] buffer for SHIFT-OR
4945 ** @param [w] solimit [ajuint*] limit for SHIFT-OR
4946 ** @param [w] m [ajuint*] real length of pattern (from embPatGetType)
4947 ** @param [w] regexp [AjPStr *] PCRE regexp string
4948 ** @param [w] skipm [ajuint***] skip buffer for Tarhio-Ukkonen
4949 ** @param [r] mismatch [ajuint] number of allowed mismatches
4950 **
4951 ** @return [void]
4952 **
4953 ** @release 1.0.0
4954 ** @@
4955 ******************************************************************************/
4956
embPatCompile(ajuint type,const AjPStr pattern,ajuint * plen,ajint ** buf,EmbPPatBYPNode off,ajuint ** sotable,ajuint * solimit,ajuint * m,AjPStr * regexp,ajuint *** skipm,ajuint mismatch)4957 void embPatCompile(ajuint type, const AjPStr pattern, ajuint* plen,
4958 ajint** buf, EmbPPatBYPNode off, ajuint** sotable,
4959 ajuint* solimit, ajuint* m, AjPStr* regexp, ajuint*** skipm,
4960 ajuint mismatch)
4961 {
4962 ajuint i = 0;
4963
4964 *plen = ajStrGetLen(pattern);
4965
4966 switch(type)
4967 {
4968 case 1:
4969 AJCNEW(*buf,AJALPHA);
4970 embPatBMHInit(pattern,*plen,*buf);
4971 break;
4972 case 2:
4973 AJCNEW(*buf,AJALPHA);
4974 embPatBYPInit(pattern,*plen,off,*buf);
4975 break;
4976 case 3:
4977 AJCNEW(*sotable,AJALPHA2);
4978 embPatSOInit(pattern,*sotable,solimit);
4979 *m = *plen;
4980 break;
4981 case 4:
4982 AJCNEW(*sotable,AJALPHA2);
4983 embPatBYGCInit(pattern,m,*sotable,solimit);
4984 break;
4985 case 5:
4986 *regexp = embPatPrositeToRegExp(pattern);
4987 break;
4988 case 6:
4989 AJCNEW(*skipm,*m);
4990 for(i=0;i<*m;++i)
4991 AJCNEW((*skipm)[i],AJALPHA);
4992 embPatTUBInit(pattern,*skipm,*m,mismatch,*plen);
4993 break;
4994 case 7:
4995 break;
4996 default:
4997 ajFatal("embPatCompile: Cannot compile pattern");
4998 break;
4999 }
5000
5001 return;
5002 }
5003
5004
5005
5006
5007 /* @func embPatFuzzSearch *****************************************************
5008 **
5009 ** Fuzzy search after embPatGetType and embPatCompile
5010 **
5011 ** @param [r] type [ajuint] pattern type
5012 ** @param [r] begin [ajuint] text displacement (1=start)
5013 ** @param [r] pattern [const AjPStr] processed pattern
5014 ** @param [r] name [const AjPStr] name associated with text
5015 ** @param [r] text [const AjPStr] text
5016 ** @param [u] l [AjPList] list to push hits onto
5017 ** @param [r] plen [ajuint] pattern length
5018 ** @param [r] mismatch [ajuint] number of allowed mismatches
5019 ** @param [r] left [AjBool] must match left
5020 ** @param [r] right [AjBool] must match right
5021 ** @param [u] buf [ajint*] buffer for BMH search,
5022 ** or mismatch count for BYP search
5023 ** @param [u] off [EmbPPatBYPNode] offset buffer for B-Y/P search
5024 ** @param [r] sotable [const ajuint*] buffer for SHIFT-OR
5025 ** @param [r] solimit [ajuint] limit for SHIFT-OR
5026 ** @param [r] regexp [const AjPStr] PCRE regexp string
5027 ** @param [r] skipm [ajuint* const *] skip buffer for Tarhio-Ukkonen-Bleasby
5028 ** @param [w] hits [ajuint*] number of hits
5029 ** @param [r] m [ajuint] real pat length (from embPatGetType/embPatCompile)
5030 ** @param [w] tidy [const void**] data to free
5031 **
5032 ** @return [void]
5033 **
5034 ** @release 1.0.0
5035 ** @@
5036 ******************************************************************************/
5037
embPatFuzzSearch(ajuint type,ajuint begin,const AjPStr pattern,const AjPStr name,const AjPStr text,AjPList l,ajuint plen,ajuint mismatch,AjBool left,AjBool right,ajint * buf,EmbPPatBYPNode off,const ajuint * sotable,ajuint solimit,const AjPStr regexp,ajuint * const * skipm,ajuint * hits,ajuint m,const void ** tidy)5038 void embPatFuzzSearch(ajuint type, ajuint begin, const AjPStr pattern,
5039 const AjPStr name, const AjPStr text, AjPList l,
5040 ajuint plen, ajuint mismatch, AjBool left, AjBool right,
5041 ajint *buf, EmbPPatBYPNode off, const ajuint *sotable,
5042 ajuint solimit, const AjPStr regexp,
5043 ajuint * const *skipm,
5044 ajuint *hits, ajuint m, const void **tidy)
5045 {
5046 EmbPPatMatch ppm;
5047 ajuint n;
5048 ajuint i;
5049 ajuint start;
5050 ajuint end;
5051 ajuint count = 0;
5052
5053 ajDebug("embPatFuzzSearch type %d pattern: '%S'\n", type, pattern);
5054
5055 switch(type)
5056 {
5057 case 1:
5058 *hits = embPatBMHSearch(text,pattern,ajStrGetLen(text),
5059 ajStrGetLen(pattern),buf,0,left,right,l,
5060 name,begin);
5061 *tidy = (const void *) buf;
5062 break;
5063
5064 case 2:
5065 for(i=0;i<AJALPHA;++i)
5066 buf[i] = plen;
5067
5068 for(i=0;i<plen;++i)
5069 buf[i] = AJALPHA;
5070 *hits=embPatBYPSearch(text,name,begin,
5071 ajStrGetLen(text),plen,mismatch,off,buf,l,
5072 left,right,pattern);
5073 *tidy = (const void *) buf;
5074 break;
5075
5076 case 3:
5077 *hits = embPatSOSearch(text,name,*ajStrGetPtr(pattern),
5078 begin,plen,sotable,solimit,l,
5079 left,right);
5080 *tidy = (const void *) sotable;
5081 break;
5082
5083 case 4:
5084 plen = m;
5085 *hits = embPatBYGSearch(text,name,
5086 begin,plen,sotable,solimit,l,
5087 left,right);
5088 *tidy = (const void *) sotable;
5089 break;
5090
5091 case 5:
5092 ppm = embPatMatchFind(regexp,text, left, right);
5093 n = embPatMatchGetNumber(ppm);
5094 count = n;
5095
5096 for(i=0;i<n;++i)
5097 {
5098 start = embPatMatchGetStart(ppm,i);
5099 end = embPatMatchGetEnd(ppm,i);
5100 /*
5101 // ajDebug("embPatFuzzSearch embPatMatchFind left:%B start:%d "
5102 // "count:%d\n",
5103 // left, start, count);
5104 */
5105
5106 if(left && start)
5107 {
5108 --count;
5109 continue;
5110 }
5111
5112 if(right && start!=ajStrGetLen(text)-(end-start+1))
5113 {
5114 --count;
5115 continue;
5116 }
5117
5118 if(!right || (right && start==ajStrGetLen(text)-
5119 (end-start+1)))
5120 {
5121 /*
5122 // ajDebug("embPatFuzzSearch type 5 push hit %B..%B %d..%d\n",
5123 // left, right, start, end);
5124 */
5125
5126 embPatPushHit(l,name,start,end-start+1,
5127 begin,0);
5128 }
5129 /*
5130 // else
5131 // {
5132 // ajDebug("embPatFuzzSearch type 5 skip hit %B..%B %d..%d\n",
5133 // left, right, start, end);
5134 // }
5135 */
5136 }
5137
5138 embPatMatchDel(&ppm);
5139 *hits = count;
5140 break;
5141
5142 case 6:
5143 *hits = embPatTUBSearch(pattern,text,ajStrGetLen(text),skipm,
5144 m,mismatch,begin,
5145 l,left,right,name,plen);
5146 *tidy = (const void *) skipm;
5147 break;
5148
5149 case 7:
5150 *hits = embPatBruteForce(text,pattern,left,right,l,
5151 begin,mismatch,name);
5152 break;
5153
5154 default:
5155 ajFatal("Can't handle pattern type %S\n",pattern);
5156 break;
5157 }
5158
5159 ajDebug("embPatFuzzSearch hits: %d\n", *hits);
5160
5161 return;
5162 }
5163
5164
5165
5166
5167 /* @func embPatFuzzSearchAll **************************************************
5168 **
5169 ** Fuzzy search after embPatGetType and embPatCompile
5170 **
5171 ** @param [r] type [ajuint] pattern type
5172 ** @param [r] begin [ajuint] text displacement (1=start)
5173 ** @param [r] pattern [const AjPStr] processed pattern
5174 ** @param [r] name [const AjPStr] name associated with text
5175 ** @param [r] text [const AjPStr] text
5176 ** @param [u] l [AjPList] list to push hits onto
5177 ** @param [r] plen [ajuint] pattern length
5178 ** @param [r] mismatch [ajuint] number of allowed mismatches
5179 ** @param [r] left [AjBool] must match left
5180 ** @param [r] right [AjBool] must match right
5181 ** @param [u] buf [ajint*] buffer for BMH search
5182 ** @param [u] off [EmbPPatBYPNode] offset buffer for B-Y/P search
5183 ** @param [r] sotable [const ajuint*] buffer for SHIFT-OR
5184 ** @param [r] solimit [ajuint] limit for SHIFT-OR
5185 ** @param [r] regexp [const AjPStr] PCRE regexp string
5186 ** @param [r] skipm [ajuint* const *] skip buffer for Tarhio-Ukkonen-Bleasby
5187 ** @param [w] hits [ajuint*] number of hits
5188 ** @param [r] m [ajuint] real pat length (from embPatGetType/embPatCompile)
5189 ** @param [w] tidy [const void**] data to free
5190 **
5191 ** @return [void]
5192 **
5193 ** @release 1.0.0
5194 ** @@
5195 ******************************************************************************/
5196
embPatFuzzSearchAll(ajuint type,ajuint begin,const AjPStr pattern,const AjPStr name,const AjPStr text,AjPList l,ajuint plen,ajuint mismatch,AjBool left,AjBool right,ajint * buf,EmbPPatBYPNode off,const ajuint * sotable,ajuint solimit,const AjPStr regexp,ajuint * const * skipm,ajuint * hits,ajuint m,const void ** tidy)5197 void embPatFuzzSearchAll(ajuint type, ajuint begin, const AjPStr pattern,
5198 const AjPStr name, const AjPStr text, AjPList l,
5199 ajuint plen, ajuint mismatch,
5200 AjBool left, AjBool right,
5201 ajint *buf, EmbPPatBYPNode off, const ajuint *sotable,
5202 ajuint solimit, const AjPStr regexp,
5203 ajuint * const *skipm,
5204 ajuint *hits, ajuint m, const void **tidy)
5205 {
5206 EmbPPatMatch ppm;
5207 ajuint n;
5208 ajuint i;
5209 ajuint start;
5210 ajuint end;
5211 ajuint count = 0;
5212
5213 ajDebug("embPatFuzzSearchAll type %d pattern: '%S'\n", type, pattern);
5214
5215 switch(type)
5216 {
5217 case 1:
5218 *hits = embPatBMHSearch(text,pattern,ajStrGetLen(text),
5219 ajStrGetLen(pattern),buf,0,left,right,l,
5220 name,begin);
5221 *tidy = (const void *) buf;
5222 break;
5223
5224 case 2:
5225 for(i=0;i<AJALPHA;++i)
5226 buf[i] = plen;
5227
5228 for(i=0;i<plen;++i)
5229 buf[i] = AJALPHA;
5230 *hits=embPatBYPSearch(text,name,begin,
5231 ajStrGetLen(text),plen,mismatch,off,buf,l,
5232 left,right,pattern);
5233 *tidy = (const void *) buf;
5234 break;
5235
5236 case 3:
5237 *hits = embPatSOSearch(text,name,*ajStrGetPtr(pattern),
5238 begin,plen,sotable,solimit,l,
5239 left,right);
5240 *tidy = (const void *) sotable;
5241 break;
5242
5243 case 4:
5244 plen = m;
5245 *hits = embPatBYGSearch(text,name,
5246 begin,plen,sotable,solimit,l,
5247 left,right);
5248 *tidy = (const void *) sotable;
5249 break;
5250
5251 case 5:
5252 ppm = embPatMatchFindAll(regexp,text, left, right);
5253 n = embPatMatchGetNumber(ppm);
5254 count = n;
5255
5256 for(i=0;i<n;++i)
5257 {
5258 start = embPatMatchGetStart(ppm,i);
5259 end = embPatMatchGetEnd(ppm,i);
5260 /*
5261 // ajDebug("embPatFuzzSearchAll embPatMatchFind left:%B start:%d "
5262 // "count:%d\n",
5263 // left, start, count);
5264 */
5265
5266 if(left && start)
5267 {
5268 --count;
5269 continue;
5270 }
5271
5272 if(right && start!=ajStrGetLen(text)-(end-start+1))
5273 {
5274 --count;
5275 continue;
5276 }
5277
5278 if(!right || (right && start==ajStrGetLen(text)-
5279 (end-start+1)))
5280 {
5281 /*
5282 // ajDebug("embPatFuzzSearchAll type 5 push hit %B..%B %d..%d\n",
5283 // left, right, start, end);
5284 */
5285 embPatPushHit(l,name,start,end-start+1,
5286 begin,0);
5287 }
5288 /*
5289 // else
5290 // {
5291 // ajDebug("embPatFuzzSearchAll type 5 skip hit %B..%B %d..%d\n",
5292 // left, right, start, end);
5293 // }
5294 */
5295 }
5296
5297 embPatMatchDel(&ppm);
5298 *hits = count;
5299 break;
5300
5301 case 6:
5302 *hits = embPatTUBSearch(pattern,text,ajStrGetLen(text),skipm,
5303 m,mismatch,begin,
5304 l,left,right,name,plen);
5305 *tidy = (const void *) skipm;
5306 break;
5307
5308 case 7:
5309 *hits = embPatBruteForce(text,pattern,left,right,l,
5310 begin,mismatch,name);
5311 break;
5312
5313 default:
5314 ajFatal("Can't handle pattern type %S\n",pattern);
5315 break;
5316 }
5317
5318 ajDebug("embPatFuzzSearchAll hits: %d\n", *hits);
5319
5320 return;
5321 }
5322
5323
5324
5325
5326 /* @func embPatCompileII ******************************************************
5327 **
5328 ** Compile a pattern classified by embPatGetType
5329 **
5330 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5331 ** @param [r] mismatch [ajuint] number of allowed mismatches
5332 **
5333 ** @return [void]
5334 **
5335 ** @release 4.0.0
5336 ** @@
5337 ******************************************************************************/
embPatCompileII(AjPPatComp thys,ajuint mismatch)5338 void embPatCompileII (AjPPatComp thys, ajuint mismatch)
5339 {
5340 ajuint i = 0;
5341
5342 thys->plen = ajStrGetLen(thys->pattern);
5343
5344 switch(thys->type)
5345 {
5346 case 1:
5347 if (!thys->buf)
5348 AJCNEW(thys->buf,AJALPHA);
5349
5350 embPatBMHInit(thys->pattern,thys->plen,thys->buf);
5351 break;
5352 case 2:
5353 if (!thys->buf)
5354 AJCNEW(thys->buf,AJALPHA);
5355
5356 embPatBYPInit(thys->pattern,thys->plen,thys->off,thys->buf);
5357 break;
5358 case 3:
5359 if (!thys->sotable)
5360 AJCNEW(thys->sotable,AJALPHA2);
5361
5362 embPatSOInit(thys->pattern,thys->sotable,&thys->solimit);
5363 thys->m = thys->plen;
5364 break;
5365 case 4:
5366 if (!thys->sotable)
5367 AJCNEW(thys->sotable,AJALPHA2);
5368
5369 embPatBYGCInit(thys->pattern,&thys->m,thys->sotable,&thys->solimit);
5370 break;
5371 case 5:
5372 if (!ajStrGetLen(thys->regex))
5373 thys->regex = embPatPrositeToRegExp(thys->pattern);
5374
5375 break;
5376 case 6:
5377 if(thys->m && mismatch >= thys->m)
5378 ajFatal("embPatCompileII: Mismatches (%d) must be less than the "
5379 "real pattern length (%d)",mismatch,thys->m);
5380
5381 if (!thys->skipm)
5382 {
5383 AJCNEW(thys->skipm,thys->m);
5384
5385 for(i=0;i<thys->m;++i)
5386 AJCNEW((thys->skipm)[i],AJALPHA);
5387 }
5388
5389 embPatTUBInit(thys->pattern,thys->skipm,thys->m,mismatch,thys->plen);
5390 break;
5391 case 7:
5392 break;
5393 default:
5394 ajFatal("embPatCompileII: Cannot compile pattern");
5395 break;
5396 }
5397
5398 return;
5399 }
5400
5401
5402
5403
5404 /* @func embPatFuzzSearchII ***************************************************
5405 **
5406 ** Fuzzy search after embPatGetType and embPatCompile
5407 **
5408 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5409 ** @param [r] begin [ajuint] Sequence displacement (1=start)
5410 ** @param [r] name [const AjPStr] Name associated with sequence
5411 ** @param [r] text [const AjPStr] Sequence
5412 ** @param [u] l [AjPList] List to push hits onto
5413 ** @param [r] mismatch [ajuint] number of allowed mismatches
5414 ** @param [w] hits [ajuint*] number of hits
5415 ** @param [w] tidy [const void**] data to free
5416 **
5417 ** @return [void]
5418 **
5419 ** @release 4.0.0
5420 ** @@
5421 ******************************************************************************/
embPatFuzzSearchII(AjPPatComp thys,ajuint begin,const AjPStr name,const AjPStr text,AjPList l,ajuint mismatch,ajuint * hits,const void ** tidy)5422 void embPatFuzzSearchII (AjPPatComp thys, ajuint begin, const AjPStr name,
5423 const AjPStr text, AjPList l, ajuint mismatch,
5424 ajuint *hits, const void** tidy)
5425 {
5426 EmbPPatMatch ppm;
5427 ajuint n;
5428 ajuint i;
5429 ajuint start;
5430 ajuint end;
5431 ajuint count = 0;
5432
5433 ajDebug("embPatFuzzSearchII '%S' type %d '%s'\n",
5434 thys->pattern, thys->type, patTypes[thys->type]);
5435
5436 switch(thys->type)
5437 {
5438 case 1:
5439 *hits = embPatBMHSearch(text,thys->pattern,ajStrGetLen(text),
5440 ajStrGetLen(thys->pattern),
5441 thys->buf,0,thys->amino,
5442 thys->carboxyl,l,name,begin);
5443 *tidy = (const void *) thys->buf;
5444 break;
5445
5446 case 2:
5447 for(i=0;i<AJALPHA;++i)
5448 thys->buf[i] = thys->plen;
5449
5450 for(i=0;i<thys->plen;++i)
5451 thys->buf[i] = AJALPHA;
5452
5453 *hits=embPatBYPSearch(text,name,begin,ajStrGetLen(text),
5454 thys->plen,mismatch,thys->off,thys->buf,l,
5455 thys->amino,thys->carboxyl,thys->pattern);
5456 *tidy = (const void *) thys->buf;
5457 break;
5458
5459 case 3:
5460 *hits = embPatSOSearch(text,name,*ajStrGetPtr(thys->pattern),begin,
5461 thys->plen,thys->sotable,thys->solimit,l,
5462 thys->amino,thys->carboxyl);
5463 *tidy = (const void *) thys->sotable;
5464 break;
5465
5466 case 4:
5467 thys->plen = thys->m;
5468 *hits = embPatBYGSearch(text,name,begin,
5469 thys->plen,thys->sotable,thys->solimit,l,
5470 thys->amino,thys->carboxyl);
5471 *tidy = (const void *) thys->sotable;
5472 break;
5473
5474 case 5:
5475 ppm = embPatMatchFind(thys->regex, text,
5476 thys->amino, thys->carboxyl);
5477 n = embPatMatchGetNumber(ppm);
5478 count = n;
5479
5480 for(i=0;i<n;++i)
5481 {
5482 start = embPatMatchGetStart(ppm,i);
5483 end = embPatMatchGetEnd(ppm,i);
5484
5485 /*
5486 // ajDebug("embPatFuzzSearchII embPatMatchFind left:%B start:%d "
5487 // "count:%d\n",
5488 // thys->amino, start, count);
5489 */
5490 if(thys->amino && start)
5491 {
5492 --count;
5493 continue;
5494 }
5495
5496 if(thys->carboxyl && start!=ajStrGetLen(text)-(end-start+1))
5497 {
5498 --count;
5499 continue;
5500 }
5501
5502 if(!thys->carboxyl || (thys->carboxyl && start==ajStrGetLen(text)-
5503 (end-start+1)))
5504 {
5505 /*
5506 // ajDebug("embPatFuzzSearchII type 5 push hit %B..%B %d..%d\n",
5507 // thys->amino, thys->carboxyl, start, end);
5508 */
5509 embPatPushHit(l,name,start,end-start+1,
5510 begin,0);
5511 }
5512 /*
5513 // else
5514 // {
5515 // ajDebug("embPatFuzzSearchII type 5 skip hit %B..%B %d..%d\n",
5516 // thys->amino, thys->carboxyl, start, end);
5517 // }
5518 */
5519 }
5520
5521 embPatMatchDel(&ppm);
5522 *hits = count;
5523 break;
5524
5525 case 6:
5526 *hits = embPatTUBSearch(thys->pattern,text,ajStrGetLen(text),
5527 thys->skipm,
5528 thys->m,mismatch,begin,l,
5529 thys->amino,thys->carboxyl,name,thys->plen);
5530 *tidy = (const void *) thys->skipm;
5531 break;
5532
5533 case 7:
5534 *hits = embPatBruteForce(text,thys->pattern,thys->amino,thys->carboxyl,
5535 l,begin,mismatch,name);
5536 break;
5537
5538 default:
5539 ajFatal("Can't handle pattern type %S\n",thys->pattern);
5540 break;
5541 }
5542
5543 return;
5544 }
5545
5546
5547
5548
5549 /* @func embPatFuzzSearchAllII ************************************************
5550 **
5551 ** Fuzzy search after embPatGetType and embPatCompile
5552 **
5553 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5554 ** @param [r] begin [ajuint] Sequence displacement (1=start)
5555 ** @param [r] name [const AjPStr] Name associated with sequence
5556 ** @param [r] text [const AjPStr] Sequence
5557 ** @param [u] l [AjPList] List to push hits onto
5558 ** @param [r] mismatch [ajuint] number of allowed mismatches
5559 ** @param [w] hits [ajuint*] number of hits
5560 ** @param [w] tidy [const void**] data to free
5561 **
5562 ** @return [void]
5563 **
5564 ** @release 4.0.0
5565 ** @@
5566 ******************************************************************************/
embPatFuzzSearchAllII(AjPPatComp thys,ajuint begin,const AjPStr name,const AjPStr text,AjPList l,ajuint mismatch,ajuint * hits,const void ** tidy)5567 void embPatFuzzSearchAllII (AjPPatComp thys, ajuint begin, const AjPStr name,
5568 const AjPStr text, AjPList l, ajuint mismatch,
5569 ajuint *hits, const void** tidy)
5570 {
5571 EmbPPatMatch ppm;
5572 ajuint n;
5573 ajuint i;
5574 ajuint start;
5575 ajuint end;
5576 ajuint count = 0;
5577
5578 ajDebug("embPatFuzzSearchAllII '%S' type %d '%s'\n",
5579 thys->pattern, thys->type, patTypes[thys->type]);
5580
5581 switch(thys->type)
5582 {
5583 case 1:
5584 *hits = embPatBMHSearch(text,thys->pattern,ajStrGetLen(text),
5585 ajStrGetLen(thys->pattern),
5586 thys->buf,0,thys->amino,
5587 thys->carboxyl,l,name,begin);
5588 *tidy = (const void *) thys->buf;
5589 break;
5590
5591 case 2:
5592 for(i=0;i<AJALPHA;++i)
5593 thys->buf[i] = thys->plen;
5594
5595 for(i=0;i<thys->plen;++i)
5596 thys->buf[i] = AJALPHA;
5597
5598 *hits=embPatBYPSearch(text,name,begin,ajStrGetLen(text),
5599 thys->plen,mismatch,thys->off,thys->buf,l,
5600 thys->amino,thys->carboxyl,thys->pattern);
5601 *tidy = (const void *) thys->buf;
5602 break;
5603
5604 case 3:
5605 *hits = embPatSOSearch(text,name,*ajStrGetPtr(thys->pattern),begin,
5606 thys->plen,thys->sotable,thys->solimit,l,
5607 thys->amino,thys->carboxyl);
5608 *tidy = (const void *) thys->sotable;
5609 break;
5610
5611 case 4:
5612 thys->plen = thys->m;
5613 *hits = embPatBYGSearch(text,name,begin,
5614 thys->plen,thys->sotable,thys->solimit,l,
5615 thys->amino,thys->carboxyl);
5616 *tidy = (const void *) thys->sotable;
5617 break;
5618
5619 case 5:
5620 ppm = embPatMatchFindAll(thys->regex, text,
5621 thys->amino, thys->carboxyl);
5622 n = embPatMatchGetNumber(ppm);
5623 count = n;
5624
5625 for(i=0;i<n;++i)
5626 {
5627 start = embPatMatchGetStart(ppm,i);
5628 end = embPatMatchGetEnd(ppm,i);
5629
5630 /*
5631 // ajDebug("embPatFuzzSearchAllII embPatMatchFind left:%B start:%d "
5632 // "count:%d\n",
5633 // thys->amino, start, count);
5634 */
5635 if(thys->amino && start)
5636 {
5637 --count;
5638 continue;
5639 }
5640
5641 if(thys->carboxyl && start!=ajStrGetLen(text)-(end-start+1))
5642 {
5643 --count;
5644 continue;
5645 }
5646
5647 if(!thys->carboxyl || (thys->carboxyl && start==ajStrGetLen(text)-
5648 (end-start+1)))
5649 {
5650 /*
5651 // ajDebug("embPatFuzzSearchAllII type 5 push hit %B..%B %d..%d\n",
5652 // thys->amino, thys->carboxyl, start, end);
5653 */
5654 embPatPushHit(l,name,start,end-start+1,
5655 begin,0);
5656 }
5657 /*
5658 // else
5659 // {
5660 // ajDebug("embPatFuzzSearchAllII type 5 skip hit %B..%B %d..%d\n",
5661 // thys->amino, thys->carboxyl, start, end);
5662 // }
5663 */
5664 }
5665
5666 embPatMatchDel(&ppm);
5667 *hits = count;
5668 break;
5669
5670 case 6:
5671 *hits = embPatTUBSearch(thys->pattern,text,ajStrGetLen(text),
5672 thys->skipm,
5673 thys->m,mismatch,begin,l,
5674 thys->amino,thys->carboxyl,name,thys->plen);
5675 *tidy = (const void *) thys->skipm;
5676 break;
5677
5678 case 7:
5679 *hits = embPatBruteForce(text,thys->pattern,thys->amino,thys->carboxyl,
5680 l,begin,mismatch,name);
5681 break;
5682
5683 default:
5684 ajFatal("Can't handle pattern type %S\n",thys->pattern);
5685 break;
5686 }
5687
5688 return;
5689 }
5690
5691
5692
5693
5694 /* @func embPatGetTypeII ******************************************************
5695 **
5696 ** Return the type of a pattern
5697 **
5698 ** @param [u] thys [AjPPatComp] Prosite pattern stucture
5699 ** @param [r] pattern [const AjPStr] Original pattern
5700 ** @param [r] mismatch [ajuint] Number of allowed mismatches
5701 ** @param [r] protein [AjBool] True if protein
5702 **
5703 ** @return [ajuint] type of pattern
5704 **
5705 ** @release 4.0.0
5706 ** @@
5707 ******************************************************************************/
5708
embPatGetTypeII(AjPPatComp thys,const AjPStr pattern,ajuint mismatch,AjBool protein)5709 ajuint embPatGetTypeII (AjPPatComp thys, const AjPStr pattern, ajuint mismatch,
5710 AjBool protein)
5711 {
5712 AjBool fclass;
5713 AjBool compl;
5714 AjBool dontcare;
5715 AjBool range;
5716 AjBool isany = ajFalse;
5717 ajuint plen;
5718 ajuint type;
5719 const char *p;
5720 char *q;
5721
5722 ajStrAssignS(&thys->pattern,pattern);
5723
5724 if(!embPatClassify(pattern,&thys->pattern,&thys->amino,&thys->carboxyl,
5725 &fclass,&compl,&dontcare,&range,protein))
5726 return 0;
5727
5728 /* Get real pattern length */
5729 p = ajStrGetPtr(thys->pattern);
5730 thys->m = 0;
5731
5732 while(*p)
5733 {
5734 if(*p=='{')
5735 while(*p!='}')
5736 ++p;
5737 else if(*p=='[')
5738 while(*p!=']')
5739 ++p;
5740
5741 ++p;
5742 ++thys->m;
5743 }
5744
5745
5746 plen = ajStrGetLen(thys->pattern);
5747 type = 0;
5748
5749 /*
5750 ** Select type of search depending on pattern
5751 */
5752
5753 if(!range && !dontcare && !fclass && !compl && !mismatch && plen>AJWORD)
5754 {
5755 /* Boyer Moore Horspool is the choice for long exact patterns */
5756 type = 1;
5757 }
5758 else if(mismatch && !dontcare && !range && !fclass && !compl &&
5759 plen<AJALPHA/2)
5760 {
5761 /* Baeza-Yates Perleberg for exact patterns plus mismatches */
5762 type = 2;
5763 }
5764 else if(!range && !dontcare && !fclass && !compl && !mismatch &&
5765 plen<=AJWORD)
5766 {
5767 /* Shift-OR is the choice for small exact patterns */
5768 type = 3;
5769 }
5770 else if(!range &&
5771 (fclass || compl || dontcare) &&
5772 !mismatch && thys->m<=AJWORD)
5773 {
5774 /*
5775 * Baeza-Yates Gonnet for classes and dontcares.
5776 * No mismatches or ranges. Patterns less than (e.g.) 32
5777 */
5778 type = 4;
5779 }
5780 else if(!mismatch && (range || thys->m>AJWORD))
5781 {
5782 q = ajStrGetuniquePtr(&thys->pattern);
5783 isany = ajFalse;
5784
5785 while(*q)
5786 {
5787 if((protein && *q == 'X') || (!protein && *q=='N'))
5788 {
5789 *q = '?';
5790 isany = ajTrue;
5791 }
5792
5793 ++q;
5794 }
5795
5796 if(isany)
5797 type=7;
5798 else
5799 type = 5;
5800 }
5801 else if(mismatch && !range && (fclass || compl))
5802 {
5803 /* Try a Tarhio-Ukkonen-Bleasby */
5804 type = 6;
5805 }
5806 else if((mismatch && range) || !type)
5807 {
5808 /*
5809 ** No choice left but to do a Bleasby recursive brute force
5810 */
5811 type = 7;
5812 }
5813
5814 ajDebug("embPatTypeII %d '%S'\n", thys->type, thys->pattern);
5815
5816 if (!ajStrMatchCaseS(pattern, thys->pattern))
5817 ajDebug("embPatTypeII cleaned to '%S'\n", thys->pattern);
5818
5819 thys->type=type;
5820
5821 return type;
5822 }
5823