/* @source ajreg **************************************************************
**
** AJAX REG (ajax regular expression) functions
**
** Uses the Perl-Compatible Regular Expressions Library (PCRE)
** included as a separate library in the EMBOSS distribution.
**
** @author Copyright (C) 1998 Peter Rice
** @version $Revision: 1.34 $
** @modified Jun 25 pmr First version
** @modified 1999-2011 pmr Replace Henry Spencer library with PCRE
** @modified $Date: 2011/10/18 14:23:40 $ by $Author: rice $
** @@
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public
** License as published by the Free Software Foundation; either
** version 2.1 of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free Software
** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
** MA 02110-1301, USA.
**
******************************************************************************/
31
32 #include "ajlib.h"
33
34 #include "ajreg.h"
35
36
37
38 static ajlong regAlloc = 0;
39 static ajlong regFree = 0;
40 static ajlong regFreeCount = 0;
41 static ajlong regCount = 0;
42 static ajlong regTotal = 0;
43
44 static int *regDfaWorkspace = NULL;
45 static int regDfaWsCount = 10000;
46
47
/* constructors */
49
50
51
52
53 /* @func ajRegComp ************************************************************
54 **
55 ** Compiles a regular expression.
56 **
57 ** @param [r] rexp [const AjPStr] Regular expression string.
58 ** @return [AjPRegexp] Compiled regular expression.
59 **
60 ** @release 1.0.0
61 ** @@
62 ******************************************************************************/
63
ajRegComp(const AjPStr rexp)64 AjPRegexp ajRegComp(const AjPStr rexp)
65 {
66 return ajRegCompC(ajStrGetPtr(rexp));
67 }
68
69
70
71
72 /* @func ajRegCompC ***********************************************************
73 **
74 ** Compiles a regular expression.
75 **
76 ** @param [r] rexp [const char*] Regular expression character string.
77 ** @return [AjPRegexp] Compiled regular expression.
78 **
79 ** @release 1.0.0
80 ** @@
81 ******************************************************************************/
82
ajRegCompC(const char * rexp)83 AjPRegexp ajRegCompC(const char* rexp)
84 {
85 AjPRegexp ret;
86 int options = 0;
87 int errpos = 0;
88 const char *errptr = NULL;
89 const unsigned char *tableptr = NULL;
90
91 AJNEW0(ret);
92 AJCNEW0(ret->ovector, AJREG_OVECSIZE);
93 ret->ovecsize = AJREG_OVECSIZE/3;
94 ret->pcre = pcre_compile(rexp, options, &errptr, &errpos, tableptr);
95
96 if(!ret->pcre)
97 {
98 ajErr("Failed to compile regular expression '%s' at position %d: %s",
99 rexp, errpos, errptr);
100 AJFREE(ret);
101 return NULL;
102 }
103
104 regAlloc += sizeof(ret);
105 regCount ++;
106 regTotal ++;
107 /*ajDebug("ajRegCompC %x size %d regexp '%s'\n",
108 ret, (int) sizeof(ret), rexp);*/
109
110 return ret;
111 }
112
113
114
115
116 /* @func ajRegCompCase ********************************************************
117 **
118 ** Compiles a case-insensitive regular expression.
119 **
120 ** @param [r] rexp [const AjPStr] Regular expression string.
121 ** @return [AjPRegexp] Compiled regular expression.
122 **
123 ** @release 2.8.0
124 ** @@
125 ******************************************************************************/
126
ajRegCompCase(const AjPStr rexp)127 AjPRegexp ajRegCompCase(const AjPStr rexp)
128 {
129 return ajRegCompCaseC(ajStrGetPtr(rexp));
130 }
131
132
133
134
135 /* @func ajRegCompCaseC *******************************************************
136 **
137 ** Compiles a case-insensitive regular expression.
138 **
139 ** @param [r] rexp [const char*] Regular expression character string.
140 ** @return [AjPRegexp] Compiled regular expression.
141 **
142 ** @release 2.8.0
143 ** @@
144 ******************************************************************************/
145
ajRegCompCaseC(const char * rexp)146 AjPRegexp ajRegCompCaseC(const char* rexp)
147 {
148 AjPRegexp ret;
149 int options = PCRE_CASELESS;
150 int errpos = 0;
151 const char *errptr = NULL;
152 const unsigned char *tableptr = NULL;
153
154 AJNEW0(ret);
155 AJCNEW0(ret->ovector, AJREG_OVECSIZE);
156 ret->ovecsize = AJREG_OVECSIZE/3;
157 ret->pcre = pcre_compile(rexp, options, &errptr, &errpos, tableptr);
158
159 if(!ret->pcre)
160 {
161 ajErr("Failed to compile regular expression '%s' at position %d: %s",
162 rexp, errpos, errptr);
163 AJFREE(ret);
164
165 return NULL;
166 }
167
168 regAlloc += sizeof(ret);
169 regCount ++;
170 regTotal ++;
171 /*ajDebug("ajRegCompCaseC %x size %d regexp '%s'\n",
172 ret, (int) sizeof(ret), rexp);*/
173
174 return ret;
175 }
176
177
178
179
/* execute expression match */
181
182
183
184
185 /* @func ajRegExec ************************************************************
186 **
187 ** Execute a regular expression search.
188 ** The expression must first have been compiled with ajRegComp or ajRegCompC.
189 **
190 ** Internal data structures in the expression will be set to substrings
191 ** which other functions can retrieve.
192 **
193 ** @param [u] prog [AjPRegexp] Compiled regular expression.
194 ** @param [r] str [const AjPStr] String to be compared.
195 ** @return [AjBool] ajTrue if a match was found.
196 **
197 ** @release 1.0.0
198 ** @@
199 ******************************************************************************/
200
ajRegExec(AjPRegexp prog,const AjPStr str)201 AjBool ajRegExec(AjPRegexp prog, const AjPStr str)
202 {
203 int startoffset = 0;
204 int options = 0;
205 int status = 0;
206
207 status = pcre_exec(prog->pcre, prog->extra, ajStrGetPtr(str),
208 ajStrGetLen(str), startoffset, options, prog->ovector,
209 3*prog->ovecsize);
210
211 if(status >= 0)
212 {
213 prog->orig = ajStrGetPtr(str);
214
215 if(status == 0)
216 ajWarn("ajRegExec too many substrings");
217
218 return ajTrue;
219 }
220
221 if(status < -1) /* -1 is a simple fail to match */
222 { /* others are recursion limits etc. */
223 ajDebug("ajRegExec returned unexpected status '%d'\n", status);
224 prog->orig = ajStrGetPtr(str); /* needed for the trace */
225 ajRegTrace(prog);
226 }
227
228 prog->orig = NULL;
229
230 return ajFalse;
231 }
232
233
234
235
236 /* @func ajRegExecC ***********************************************************
237 **
238 ** Execute a regular expression search.
239 ** The expression must first have been compiled with ajRegComp or ajRegCompC.
240 **
241 ** Internal data structures in the expression will be set to substrings
242 ** which other functions can retrieve.
243 **
244 ** @param [u] prog [AjPRegexp] Compiled regular expression.
245 ** @param [r] str [const char*] String to be compared.
246 ** @return [AjBool] ajTrue if a match was found.
247 **
248 ** @release 1.0.0
249 ** @@
250 ******************************************************************************/
251
ajRegExecC(AjPRegexp prog,const char * str)252 AjBool ajRegExecC(AjPRegexp prog, const char* str)
253 {
254 int startoffset = 0;
255 int options = 0;
256 int status = 0;
257
258 status = pcre_exec(prog->pcre, prog->extra, str, strlen(str),
259 startoffset, options, prog->ovector, 3*prog->ovecsize);
260
261 if(status >= 0)
262 {
263 prog->orig = str;
264
265 if(status == 0)
266 ajWarn("ajRegExecC too many substrings");
267
268 return ajTrue;
269 }
270
271 if(status < -1) /* -1 is a simple fail to match */
272 { /* others are recursion limits etc. */
273 ajDebug("ajRegExecC returned unexpected status '%d'\n", status);
274 prog->orig = str; /* needed for the trace */
275 ajRegTrace(prog);
276 }
277
278 prog->orig = NULL;
279
280 return ajFalse;
281 }
282
283
284
285
286 /* @func ajRegExecall *********************************************************
287 **
288 ** Execute a regular expression search using the alternative 'DFA' algorithm
289 ** which generates all matches instead of the perl-compatible maximum match.
290 **
291 ** The expression must first have been compiled with ajRegComp or ajRegCompC.
292 **
293 ** Internal data structures in the expression will be set to substrings
294 ** which other functions can retrieve.
295 **
296 ** @param [u] prog [AjPRegexp] Compiled regular expression.
297 ** @param [r] str [const AjPStr] String to be compared.
298 ** @return [AjBool] ajTrue if a match was found.
299 **
300 ** @release 1.0.0
301 ** @@
302 ******************************************************************************/
303
ajRegExecall(AjPRegexp prog,const AjPStr str)304 AjBool ajRegExecall(AjPRegexp prog, const AjPStr str)
305 {
306 int startoffset = 0;
307 int options = 0;
308
309 if(!regDfaWorkspace)
310 AJCNEW(regDfaWorkspace, regDfaWsCount);
311
312 prog->matches = pcre_dfa_exec(prog->pcre, prog->extra,
313 ajStrGetPtr(str), ajStrGetLen(str),
314 startoffset, options,
315 prog->ovector, 3*prog->ovecsize,
316 regDfaWorkspace, regDfaWsCount);
317
318 if(prog->matches >= 0)
319 {
320 prog->orig = ajStrGetPtr(str);
321
322 if(prog->matches == 0)
323 ajWarn("ajRegExecall too many substrings");
324
325 return ajTrue;
326 }
327
328 if(prog->matches < -1) /* -1 is a simple fail to match */
329 { /* others are recursion limits etc. */
330 ajDebug("ajRegExecall returned unexpected status '%d'\n",
331 prog->matches);
332 prog->orig = ajStrGetPtr(str); /* needed for the trace */
333 ajRegTrace(prog);
334 }
335
336 prog->orig = NULL;
337
338 return ajFalse;
339 }
340
341
342
343
344 /* @func ajRegExecallC ********************************************************
345 **
346 ** Execute a regular expression search using the alternative 'DFA' algorithm
347 ** which generates all matches instead of the perl-compatible maximum match.
348 **
349 ** The expression must first have been compiled with ajRegComp or ajRegCompC.
350 **
351 ** Internal data structures in the expression will be set to substrings
352 ** which other functions can retrieve.
353 **
354 ** @param [u] prog [AjPRegexp] Compiled regular expression.
355 ** @param [r] str [const char*] String to be compared.
356 ** @return [AjBool] ajTrue if a match was found.
357 **
358 ** @release 1.0.0
359 ** @@
360 ******************************************************************************/
361
ajRegExecallC(AjPRegexp prog,const char * str)362 AjBool ajRegExecallC(AjPRegexp prog, const char* str)
363 {
364 int startoffset = 0;
365 int options = 0;
366
367 if(!regDfaWorkspace)
368 AJCNEW(regDfaWorkspace, regDfaWsCount);
369
370 prog->matches = pcre_dfa_exec(prog->pcre, prog->extra,
371 str, strlen(str),
372 startoffset, options,
373 prog->ovector, 3*prog->ovecsize,
374 regDfaWorkspace, regDfaWsCount);
375
376 if(prog->matches >= 0)
377 {
378 prog->orig = str;
379
380 if(prog->matches == 0)
381 ajWarn("ajRegExecallC too many substrings");
382
383 return ajTrue;
384 }
385
386 if(prog->matches < -1) /* -1 is a simple fail to match */
387 { /* others are recursion limits etc. */
388 ajDebug("ajRegExecallC returned unexpected status '%d'\n",
389 prog->matches);
390 prog->orig = str; /* needed for the trace */
391 ajRegTrace(prog);
392 }
393
394 prog->orig = NULL;
395
396 return ajFalse;
397 }
398
399
400
401
402 /* @func ajRegGetMatches ******************************************************
403 **
404 ** After a successful regular expression match, uses the regular
405 ** expression and the original string to calculate the offset
406 ** of the match from the start of the string.
407 **
408 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
409 ** @return [ajint] Number of matches found
410 **
411 ** @release 1.0.0
412 ** @@
413 ******************************************************************************/
414
ajRegGetMatches(const AjPRegexp rp)415 ajint ajRegGetMatches(const AjPRegexp rp)
416 {
417 return (rp->matches);
418 }
419
420
421
422
423 /* @func ajRegOffset **********************************************************
424 **
425 ** After a successful regular expression match, uses the regular
426 ** expression and the original string to calculate the offset
427 ** of the match from the start of the string.
428 **
429 ** This information is normally lost during processing.
430 **
431 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
432 ** @return [ajint] Offset of match from start of string.
433 ** -1 if the string and the expression do not match.
434 **
435 ** @release 1.0.0
436 ** @@
437 ******************************************************************************/
438
ajRegOffset(const AjPRegexp rp)439 ajint ajRegOffset(const AjPRegexp rp)
440 {
441 return (rp->ovector[0]);
442 }
443
444
445
446
447 /* @func ajRegOffsetI *********************************************************
448 **
449 ** After a successful regular expression match, uses the regular
450 ** expression and the original string to calculate the offset
451 ** of a substring from the start of the string.
452 **
453 ** This information is normally lost during processing.
454 **
455 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
456 ** @param [r] isub [ajint] Substring number.
457 ** @return [ajint] Offset of match from start of string.
458 ** -1 if the string and the expression do not match.
459 **
460 ** @release 1.0.0
461 ** @@
462 ******************************************************************************/
463
ajRegOffsetI(const AjPRegexp rp,ajint isub)464 ajint ajRegOffsetI(const AjPRegexp rp, ajint isub)
465 {
466 if(isub < 0)
467 ajErr("Invalid substring number %d", isub);
468
469 if(isub >= (rp->ovecsize))
470 ajErr("Invalid substring number %d", isub);
471
472 return (rp->ovector[isub*2]);
473 }
474
475
476
477
478 /* @func ajRegLenI ************************************************************
479 **
480 ** After a successful comparison, returns the length of a substring.
481 **
482 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
483 ** @param [r] isub [ajint] Substring number.
484 ** @return [ajint] Substring length, or 0 if not found.
485 **
486 ** @release 1.0.0
487 ** @@
488 ******************************************************************************/
489
ajRegLenI(const AjPRegexp rp,ajint isub)490 ajint ajRegLenI(const AjPRegexp rp, ajint isub)
491 {
492 ajint istart;
493 ajint iend;
494
495 istart = 2*isub;
496 iend = istart+1;
497
498 if(isub < 0)
499 return 0;
500
501 if(isub >= rp->ovecsize)
502 return 0;
503
504 if(rp->ovector[istart] < 0)
505 return 0;
506
507 return (rp->ovector[iend] - rp->ovector[istart]);
508 }
509
510
511
512
513 /* @func ajRegPost ************************************************************
514 **
515 ** After a successful match, returns the remainder of the string.
516 **
517 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
518 ** @param [w] post [AjPStr*] String to hold the result.
519 ** @return [AjBool] ajTrue on success.
520 **
521 ** @release 1.0.0
522 ** @@
523 ******************************************************************************/
524
ajRegPost(const AjPRegexp rp,AjPStr * post)525 AjBool ajRegPost(const AjPRegexp rp, AjPStr* post)
526 {
527 if(rp->ovector[1])
528 {
529 ajStrAssignC(post, &rp->orig[rp->ovector[1]]);
530
531 return ajTrue;
532 }
533
534 ajStrDelStatic(post);
535
536 return ajFalse;
537 }
538
539
540
541
542 /* @func ajRegPostC ***********************************************************
543 **
544 ** After a successful match, returns the remainder of the string.
545 ** Result is a character string, which is set to point to the internal
546 ** string data. This in turn is part of the original string. If this
547 ** changes then the results are undefined.
548 **
549 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
550 ** @param [w] post [const char**] Character string to hold the result.
551 ** @return [AjBool] ajTrue on success.
552 **
553 ** @release 1.0.0
554 ** @@
555 ******************************************************************************/
556
ajRegPostC(const AjPRegexp rp,const char ** post)557 AjBool ajRegPostC(const AjPRegexp rp, const char** post)
558 {
559 if(rp->ovector[1])
560 {
561 *post = &rp->orig[rp->ovector[1]];
562
563 return ajTrue;
564 }
565
566 *post = 0;
567
568 return ajFalse;
569 }
570
571
572
573
574 /* @func ajRegPre *************************************************************
575 **
576 ** After a successful match, returns the string before the match.
577 **
578 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
579 ** @param [w] dest [AjPStr*] String to hold the result.
580 ** @return [AjBool] ajTrue on success.
581 **
582 ** @release 2.8.0
583 ** @@
584 ******************************************************************************/
585
ajRegPre(const AjPRegexp rp,AjPStr * dest)586 AjBool ajRegPre(const AjPRegexp rp, AjPStr* dest)
587 {
588 ajint ilen;
589
590 ilen = rp->ovector[0];
591 ajStrSetRes(dest, ilen+1);
592
593 if(ilen)
594 {
595 memmove((*dest)->Ptr, rp->orig, ilen);
596 (*dest)->Len = ilen;
597 (*dest)->Ptr[ilen] = '\0';
598
599 return ajTrue;
600 }
601
602 ajStrDelStatic(dest);
603
604 return ajFalse;
605 }
606
607
608
609
610 /* @func ajRegSubI ************************************************************
611 **
612 ** After a successful match, returns a substring.
613 **
614 ** @param [r] rp [const AjPRegexp] Compiled regular expression.
615 ** @param [r] isub [ajint] Substring number.
616 ** @param [w] dest [AjPStr*] String to hold the result.
617 ** @return [AjBool] ajTrue if a substring was defined
618 ** ajFalse if the substring is not matched
619 ** ajFalse if isub is out of range
620 **
621 ** @release 1.0.0
622 ** @@
623 ******************************************************************************/
624
ajRegSubI(const AjPRegexp rp,ajint isub,AjPStr * dest)625 AjBool ajRegSubI(const AjPRegexp rp, ajint isub, AjPStr* dest)
626 {
627 ajint ilen;
628 ajint istart;
629 ajint iend;
630
631 istart = 2*isub;
632 iend = istart+1;
633
634 if(isub < 0)
635 {
636 ajStrDelStatic(dest);
637
638 return ajFalse;
639 }
640
641 if(isub >= rp->ovecsize)
642 {
643 ajStrDelStatic(dest);
644
645 return ajFalse;
646 }
647
648 if(rp->ovector[istart] < 0)
649 {
650 ajStrDelStatic(dest);
651
652 return ajFalse;
653 }
654
655 ilen = rp->ovector[iend] - rp->ovector[istart];
656 ajStrSetRes(dest, ilen+1);
657
658 if(ilen)
659 memmove((*dest)->Ptr, &rp->orig[rp->ovector[istart]], ilen);
660 (*dest)->Len = ilen;
661 (*dest)->Ptr[ilen] = '\0';
662
663 return ajTrue;
664 }
665
666
667
668
/* destructor */
670
671
672
673
674 /* @func ajRegFree ************************************************************
675 **
676 ** Clears and frees a compiled regular expression.
677 **
678 ** @param [d] pexp [AjPRegexp*] Compiled regular expression.
679 ** @return [void]
680 **
681 ** @release 1.0.0
682 ** @@
683 ******************************************************************************/
684
ajRegFree(AjPRegexp * pexp)685 void ajRegFree(AjPRegexp* pexp)
686 {
687 AjPRegexp exp;
688
689 if(!pexp)
690 return;
691
692 if(!*pexp)
693 return;
694
695 exp = *pexp;
696
697 /*
698 ajDebug("ajRegFree %x size regexp %d\n", exp,
699 (ajint) sizeof(exp));
700 */
701
702 regFreeCount += 1;
703 regFree += sizeof(*exp);
704
705 if(exp->pcre)
706 regFree += sizeof(exp->pcre);
707
708 if(exp->extra)
709 regFree += sizeof(exp->extra);
710
711 regTotal --;
712
713 AJFREE(exp->pcre);
714 AJFREE(exp->extra);
715 AJFREE(exp->ovector);
716 AJFREE(*pexp);
717
718 return;
719 }
720
721
722
723
724 /* @func ajRegTrace ***********************************************************
725 **
726 ** Traces a compiled regular expression with debug calls.
727 **
728 ** @param [r] rexp [const AjPRegexp] Compiled regular expression.
729 ** @return [void]
730 **
731 ** @release 1.0.0
732 ** @@
733 ******************************************************************************/
734
ajRegTrace(const AjPRegexp rexp)735 void ajRegTrace(const AjPRegexp rexp)
736 {
737 ajint isub;
738 ajint ilen;
739 ajint ipos;
740 ajint istart;
741 ajint iend;
742 static AjPStr str = NULL;
743
744 ajDebug(" REGEXP trace\n");
745
746 if (!rexp->orig)
747 ajDebug("original string not saved - unable to trace string values\n");
748
749 for(isub=0; isub < rexp->ovecsize; isub++)
750 {
751 istart = 2*isub;
752 iend = istart+1;
753
754 if (!rexp->orig)
755 {
756 if(!isub)
757 ajDebug("original string from %d .. %d\n",
758 rexp->ovector[istart], rexp->ovector[iend]);
759 else
760 ajDebug("substring %2d from %d .. %d\n",
761 isub, rexp->ovector[istart], rexp->ovector[iend]);
762
763 continue;
764 }
765
766 if(rexp->ovector[iend] >= rexp->ovector[istart])
767 {
768 ilen = rexp->ovector[iend] - rexp->ovector[istart];
769 ajStrSetRes(&str, ilen+1);
770 memmove(str->Ptr, &rexp->orig[rexp->ovector[istart]], ilen);
771 str->Len = ilen;
772 str->Ptr[ilen] = '\0';
773
774 if(!isub)
775 {
776 ajDebug(" original string '%s'\n", rexp->orig);
777 ajDebug(" string match '%S'\n", str);
778 }
779 else
780 {
781 ipos = rexp->ovector[istart];
782 ajDebug(" substring %2d '%S' at %d\n", isub, str, ipos);
783 }
784 }
785 }
786
787 ajDebug("\n");
788
789 ajStrDel(&str);
790
791 return;
792 }
793
794
795
796
797 /* @func ajRegExit ************************************************************
798 **
799 ** Prints a summary of regular expression (AjPRegexp) usage with debug calls
800 **
801 ** @return [void]
802 **
803 ** @release 2.7.0
804 ** @@
805 ******************************************************************************/
806
ajRegExit(void)807 void ajRegExit(void)
808 {
809 if(regDfaWorkspace)
810 AJFREE(regDfaWorkspace);
811
812 ajDebug("Regexp usage (bytes): %Ld allocated, %Ld freed, %Ld in use "
813 "(sizes change)\n",
814 regAlloc, regFree, (regAlloc - regFree));
815 ajDebug("Regexp usage (number): %Ld allocated, %Ld freed %Ld in use\n",
816 regCount, regFreeCount, regTotal);
817
818 return;
819 }
820