#	$OpenBSD: tests,v 1.2 2001/01/29 02:05:44 niklas Exp $
#	$NetBSD: tests,v 1.5 1995/04/20 22:40:00 cgd Exp $

# regular expression test set
# Lines are at least three fields, separated by one or more tabs.  "" stands
# for an empty field.  First field is an RE.  Second field is flags.  If the
# C flag is given, regcomp() is expected to fail, and the third field is the
# error name (minus the leading REG_).
#
# Otherwise it is expected to succeed, and the third field is the string to
# try matching it against.  If there is no fourth field, the match is
# expected to fail.  If there is a fourth field, it is the substring that
# the RE is expected to match.  If there is a fifth field, it is a comma-
# separated list of what the subexpressions should match, with - indicating
# no match for that one.  In both the fourth and fifth fields, a (sub)field
# starting with @ indicates that the (sub)expression is expected to match
# a null string followed by the stuff after the @; this provides a way to
# test where null strings match.  The character `N' in REs and strings
# is newline, `S' is space, `T' is tab, `Z' is NUL.
#
# The full list of flags:
#	-	placeholder, does nothing
#	b	RE is a BRE, not an ERE
#	&	try it as both an ERE and a BRE
#	C	regcomp() error expected, third field is error name
#	i	REG_ICASE
#	m	("mundane") REG_NOSPEC
#	s	REG_NOSUB (not really testable)
#	n	REG_NEWLINE
#	^	REG_NOTBOL
#	$	REG_NOTEOL
#	#	REG_STARTEND (see below)
#	p	REG_PEND
#
# For REG_STARTEND, the start/end offsets are those of the substring
# enclosed in ().
37 38# basics 39a & a a 40abc & abc abc 41abc|de - abc abc 42a|b|c - abc a 43 44# parentheses and perversions thereof 45a(b)c - abc abc 46a\(b\)c b abc abc 47a( C EPAREN 48a( b a( a( 49a\( - a( a( 50a\( bC EPAREN 51a\(b bC EPAREN 52a(b C EPAREN 53a(b b a(b a(b 54# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) 55a) - a) a) 56) - ) ) 57# end gagging (in a just world, those *should* give EPAREN) 58a) b a) a) 59a\) bC EPAREN 60\) bC EPAREN 61a()b - ab ab 62a\(\)b b ab ab 63 64# anchoring and REG_NEWLINE 65^abc$ & abc abc 66a^b - a^b 67a^b b a^b a^b 68a$b - a$b 69a$b b a$b a$b 70^ & abc @abc 71$ & abc @ 72^$ & "" @ 73$^ - "" @ 74\($\)\(^\) b "" @ 75# stop retching, those are legitimate (although disgusting) 76^^ - "" @ 77$$ - "" @ 78b$ & abNc 79b$ &n abNc b 80^b$ & aNbNc 81^b$ &n aNbNc b 82^$ &n aNNb @Nb 83^$ n abc 84^$ n abcN @ 85$^ n aNNb @Nb 86\($\)\(^\) bn aNNb @Nb 87^^ n^ aNNb @Nb 88$$ n aNNb @NN 89^a ^ a 90a$ $ a 91^a ^n aNb 92^b ^n aNb b 93a$ $n bNa 94b$ $n bNa b 95a*(^b$)c* - b b 96a*\(^b$\)c* b b b 97 98# certain syntax errors and non-errors 99| C EMPTY 100| b | | 101* C BADRPT 102* b * * 103+ C BADRPT 104? C BADRPT 105"" &C EMPTY 106() - abc @abc 107\(\) b abc @abc 108a||b C EMPTY 109|ab C EMPTY 110ab| C EMPTY 111(|a)b C EMPTY 112(a|)b C EMPTY 113(*a) C BADRPT 114(+a) C BADRPT 115(?a) C BADRPT 116({1}a) C BADRPT 117\(\{1\}a\) bC BADRPT 118(a|*b) C BADRPT 119(a|+b) C BADRPT 120(a|?b) C BADRPT 121(a|{1}b) C BADRPT 122^* C BADRPT 123^* b * * 124^+ C BADRPT 125^? 
C BADRPT 126^{1} C BADRPT 127^\{1\} bC BADRPT 128 129# metacharacters, backslashes 130a.c & abc abc 131a[bc]d & abd abd 132a\*c & a*c a*c 133a\\b & a\b a\b 134a\\\*b & a\*b a\*b 135a\bc & abc abc 136a\ &C EESCAPE 137a\\bc & a\bc a\bc 138\{ bC BADRPT 139a\[b & a[b a[b 140a[b &C EBRACK 141# trailing $ is a peculiar special case for the BRE code 142a$ & a a 143a$ & a$ 144a\$ & a 145a\$ & a$ a$ 146a\\$ & a 147a\\$ & a$ 148a\\$ & a\$ 149a\\$ & a\ a\ 150 151# back references, ugh 152a\(b\)\2c bC ESUBREG 153a\(b\1\)c bC ESUBREG 154a\(b*\)c\1d b abbcbbd abbcbbd bb 155a\(b*\)c\1d b abbcbd 156a\(b*\)c\1d b abbcbbbd 157^\(.\)\1 b abc 158a\([bc]\)\1d b abcdabbd abbd b 159a\(\([bc]\)\2\)*d b abbccd abbccd 160a\(\([bc]\)\2\)*d b abbcbd 161# actually, this next one probably ought to fail, but the spec is unclear 162a\(\(b\)*\2\)*d b abbbd abbbd 163# here is a case that no NFA implementation does right 164\(ab*\)[ab]*\1 b ababaaa ababaaa a 165# check out normal matching in the presence of back refs 166\(a\)\1bcd b aabcd aabcd 167\(a\)\1bc*d b aabcd aabcd 168\(a\)\1bc*d b aabd aabd 169\(a\)\1bc*d b aabcccd aabcccd 170\(a\)\1bc*[ce]d b aabcccd aabcccd 171^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd 172 173# ordinary repetitions 174ab*c & abc abc 175ab+c - abc abc 176ab?c - abc abc 177a\(*\)b b a*b a*b 178a\(**\)b b ab ab 179a\(***\)b bC BADRPT 180*a b *a *a 181**a b a a 182***a bC BADRPT 183 184# the dreaded bounded repetitions 185{ & { { 186{abc & {abc {abc 187{1 C BADRPT 188{1} C BADRPT 189a{b & a{b a{b 190a{1}b - ab ab 191a\{1\}b b ab ab 192a{1,}b - ab ab 193a\{1,\}b b ab ab 194a{1,2}b - aab aab 195a\{1,2\}b b aab aab 196a{1 C EBRACE 197a\{1 bC EBRACE 198a{1a C EBRACE 199a\{1a bC EBRACE 200a{1a} C BADBR 201a\{1a\} bC BADBR 202a{,2} - a{,2} a{,2} 203a\{,2\} bC BADBR 204a{,} - a{,} a{,} 205a\{,\} bC BADBR 206a{1,x} C BADBR 207a\{1,x\} bC BADBR 208a{1,x C EBRACE 209a\{1,x bC EBRACE 210a{300} C BADBR 211a\{300\} bC BADBR 212a{1,0} C BADBR 213a\{1,0\} bC BADBR 214ab{0,0}c - abcac ac 
215ab\{0,0\}c b abcac ac 216ab{0,1}c - abcac abc 217ab\{0,1\}c b abcac abc 218ab{0,3}c - abbcac abbc 219ab\{0,3\}c b abbcac abbc 220ab{1,1}c - acabc abc 221ab\{1,1\}c b acabc abc 222ab{1,3}c - acabc abc 223ab\{1,3\}c b acabc abc 224ab{2,2}c - abcabbc abbc 225ab\{2,2\}c b abcabbc abbc 226ab{2,4}c - abcabbc abbc 227ab\{2,4\}c b abcabbc abbc 228((a{1,10}){1,10}){1,10} - a a a,a 229 230# multiple repetitions 231a** &C BADRPT 232a++ C BADRPT 233a?? C BADRPT 234a*+ C BADRPT 235a*? C BADRPT 236a+* C BADRPT 237a+? C BADRPT 238a?* C BADRPT 239a?+ C BADRPT 240a{1}{1} C BADRPT 241a*{1} C BADRPT 242a+{1} C BADRPT 243a?{1} C BADRPT 244a{1}* C BADRPT 245a{1}+ C BADRPT 246a{1}? C BADRPT 247a*{b} - a{b} a{b} 248a\{1\}\{1\} bC BADRPT 249a*\{1\} bC BADRPT 250a\{1\}* bC BADRPT 251 252# brackets, and numerous perversions thereof 253a[b]c & abc abc 254a[ab]c & abc abc 255a[^ab]c & adc adc 256a[]b]c & a]c a]c 257a[[b]c & a[c a[c 258a[-b]c & a-c a-c 259a[^]b]c & adc adc 260a[^-b]c & adc adc 261a[b-]c & a-c a-c 262a[b &C EBRACK 263a[] &C EBRACK 264a[1-3]c & a2c a2c 265a[3-1]c &C ERANGE 266a[1-3-5]c &C ERANGE 267a[[.-.]--]c & a-c a-c 268a[1- &C ERANGE 269a[[. &C EBRACK 270a[[.x &C EBRACK 271a[[.x. &C EBRACK 272a[[.x.] 
&C EBRACK 273a[[.x.]] & ax ax 274a[[.x,.]] &C ECOLLATE 275a[[.one.]]b & a1b a1b 276a[[.notdef.]]b &C ECOLLATE 277a[[.].]]b & a]b a]b 278a[[:alpha:]]c & abc abc 279a[[:notdef:]]c &C ECTYPE 280a[[: &C EBRACK 281a[[:alpha &C EBRACK 282a[[:alpha:] &C EBRACK 283a[[:alpha,:] &C ECTYPE 284a[[:]:]]b &C ECTYPE 285a[[:-:]]b &C ECTYPE 286a[[:alph:]] &C ECTYPE 287a[[:alphabet:]] &C ECTYPE 288[[:alnum:]]+ - -%@a0X- a0X 289[[:alpha:]]+ - -%@aX0- aX 290[[:blank:]]+ - aSSTb SST 291[[:cntrl:]]+ - aNTb NT 292[[:digit:]]+ - a019b 019 293[[:graph:]]+ - Sa%bS a%b 294[[:lower:]]+ - AabC ab 295[[:print:]]+ - NaSbN aSb 296[[:punct:]]+ - S%-&T %-& 297[[:space:]]+ - aSNTb SNT 298[[:upper:]]+ - aBCd BC 299[[:xdigit:]]+ - p0f3Cq 0f3C 300a[[=b=]]c & abc abc 301a[[= &C EBRACK 302a[[=b &C EBRACK 303a[[=b= &C EBRACK 304a[[=b=] &C EBRACK 305a[[=b,=]] &C ECOLLATE 306a[[=one=]]b & a1b a1b 307 308# complexities 309a(((b)))c - abc abc 310a(b|(c))d - abd abd 311a(b*|c)d - abbd abbd 312# just gotta have one DFA-buster, of course 313a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 314# and an inline expansion in case somebody gets tricky 315a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 316# and in case somebody just slips in an NFA... 
317a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights 318# fish for anomalies as the number of states passes 32 31912345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 320123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 3211234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 32212345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 323123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 324# and one really big one, beyond any plausible word width 3251234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 326# fish for problems as brackets go past 8 327[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm 328[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo 329[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq 330[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq 331 332# subtleties of matching 333abc & xabcy abc 334a\(b\)?c\1d b acd 335aBc i Abc Abc 336a[Bc]*d i abBCcd abBCcd 3370[[:upper:]]1 &i 0a1 0a1 3380[[:lower:]]1 &i 0A1 0A1 339a[^b]c &i abc 340a[^b]c &i aBc 341a[^b]c &i adc adc 342[a]b[c] - abc abc 343[a]b[a] - aba aba 344[abc]b[abc] - abc abc 345[abc]b[abd] - abd abd 346a(b?c)+d - accd accd 347(wee|week)(knights|night) - weeknights weeknights 348(we|wee|week|frob)(knights|night|day) - weeknights weeknights 349a[bc]d - xyzaaabcaababdacd abd 350a[ab]c - aaabc abc 351abc s abc abc 352a* & b @b 353 354# Let's have some fun -- try to match a C comment. 355# first the obvious, which looks okay at first glance... 
356/\*.*\*/ - /*x*/ /*x*/ 357# but... 358/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ 359# okay, we must not match */ inside; try to do that... 360/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ 361/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ 362# but... 363/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ 364# and a still fancier version, which does it right (I think)... 365/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ 366/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ 367/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ 368/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ 369/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ 370/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ 371 372# subexpressions 373a(b)(c)d - abcd abcd b,c 374a(((b)))c - abc abc b,b,b 375a(b|(c))d - abd abd b,- 376a(b*|c|e)d - abbd abbd bb 377a(b*|c|e)d - acd acd c 378a(b*|c|e)d - ad ad @d 379a(b?)c - abc abc b 380a(b?)c - ac ac @c 381a(b+)c - abc abc b 382a(b+)c - abbbc abbbc bbb 383a(b*)c - ac ac @c 384(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de 385# the regression tester only asks for 9 subexpressions 386a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j 387a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k 388a([bc]?)c - abc abc b 389a([bc]?)c - ac ac @c 390a([bc]+)c - abc abc b 391a([bc]+)c - abcc abcc bc 392a([bc]+)bc - abcbc abcbc bc 393a(bb+|b)b - abb abb b 394a(bbb+|bb+|b)b - abb abb b 395a(bbb+|bb+|b)b - abbb abbb bb 396a(bbb+|bb+|b)bb - abbb abbb b 397(.*).* - abcdef abcdef abcdef 398(a*)* - bc @b @b 399 400# do we get the right subexpression when it is used more than once? 
401a(b|c)*d - ad ad - 402a(b|c)*d - abcd abcd c 403a(b|c)+d - abd abd b 404a(b|c)+d - abcd abcd c 405a(b|c?)+d - ad ad @d 406a(b|c?)+d - abcd abcd @d 407a(b|c){0,0}d - ad ad - 408a(b|c){0,1}d - ad ad - 409a(b|c){0,1}d - abd abd b 410a(b|c){0,2}d - ad ad - 411a(b|c){0,2}d - abcd abcd c 412a(b|c){0,}d - ad ad - 413a(b|c){0,}d - abcd abcd c 414a(b|c){1,1}d - abd abd b 415a(b|c){1,1}d - acd acd c 416a(b|c){1,2}d - abd abd b 417a(b|c){1,2}d - abcd abcd c 418a(b|c){1,}d - abd abd b 419a(b|c){1,}d - abcd abcd c 420a(b|c){2,2}d - acbd acbd b 421a(b|c){2,2}d - abcd abcd c 422a(b|c){2,4}d - abcd abcd c 423a(b|c){2,4}d - abcbd abcbd b 424a(b|c){2,4}d - abcbcd abcbcd c 425a(b|c){2,}d - abcd abcd c 426a(b|c){2,}d - abcbd abcbd b 427a(b+|((c)*))+d - abd abd @d,@d,- 428a(b+|((c)*))+d - abcd abcd @d,@d,- 429 430# check out the STARTEND option 431[abc] &# a(b)c b 432[abc] &# a(d)c 433[abc] &# a(bc)d b 434[abc] &# a(dc)d c 435. &# a()c 436b.*c &# b(bc)c bc 437b.* &# b(bc)c bc 438.*c &# b(bc)c bc 439 440# plain strings, with the NOSPEC flag 441abc m abc abc 442abc m xabcy abc 443abc m xyz 444a*b m aba*b a*b 445a*b m ab 446"" mC EMPTY 447 448# cases involving NULs 449aZb & a a 450aZb &p a 451aZb &p# (aZb) aZb 452aZ*b &p# (ab) ab 453a.b &# (aZb) aZb 454a.* &# (aZb)c aZb 455 456# word boundaries (ick) 457[[:<:]]a & a a 458[[:<:]]a & ba 459[[:<:]]a & -a a 460a[[:>:]] & a a 461a[[:>:]] & ab 462a[[:>:]] & a- a 463[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc 464[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc 465[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc 466[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc 467[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ 468[[:<:]]a_b[[:>:]] & x_a_b 469 470# past problems, and suspected problems 471(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 472abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop 473abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv 474(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO 
])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 475CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 476Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz 477a?b - ab ab 478-\{0,1\}[0-9]*$ b -5 -5 479