1# $NetBSD: tests,v 1.5 1995/04/20 22:40:00 cgd Exp $ 2 3# regular expression test set 4# Lines are at least three fields, separated by one or more tabs. "" stands 5# for an empty field. First field is an RE. Second field is flags. If 6# C flag given, regcomp() is expected to fail, and the third field is the 7# error name (minus the leading REG_). 8# 9# Otherwise it is expected to succeed, and the third field is the string to 10# try matching it against. If there is no fourth field, the match is 11# expected to fail. If there is a fourth field, it is the substring that 12# the RE is expected to match. If there is a fifth field, it is a comma- 13# separated list of what the subexpressions should match, with - indicating 14# no match for that one. In both the fourth and fifth fields, a (sub)field 15# starting with @ indicates that the (sub)expression is expected to match 16# a null string followed by the stuff after the @; this provides a way to 17# test where null strings match. The character `N' in REs and strings 18# is newline, `S' is space, `T' is tab, `Z' is NUL. 19# 20# The full list of flags: 21# - placeholder, does nothing 22# b RE is a BRE, not an ERE 23# & try it as both an ERE and a BRE 24# C regcomp() error expected, third field is error name 25# i REG_ICASE 26# m ("mundane") REG_NOSPEC 27# s REG_NOSUB (not really testable) 28# n REG_NEWLINE 29# ^ REG_NOTBOL 30# $ REG_NOTEOL 31# # REG_STARTEND (see below) 32# p REG_PEND 33# 34# For REG_STARTEND, the start/end offsets are those of the substring 35# enclosed in (). 36 37# basics 38a & a a 39abc & abc abc 40abc|de - abc abc 41a|b|c - abc a 42 43# parentheses and perversions thereof 44a(b)c - abc abc 45a\(b\)c b abc abc 46a( C EPAREN 47a( b a( a( 48a\( - a( a( 49a\( bC EPAREN 50a\(b bC EPAREN 51a(b C EPAREN 52a(b b a(b a(b 53# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly) 54a) - a) a) 55) - ) ) 56# end gagging (in a just world, those *should* give EPAREN) 57a) b a) a) 58a\) bC EPAREN 59\) bC EPAREN 60a()b - ab ab 61a\(\)b b ab ab 62 63# anchoring and REG_NEWLINE 64^abc$ & abc abc 65a^b - a^b 66a^b b a^b a^b 67a$b - a$b 68a$b b a$b a$b 69^ & abc @abc 70$ & abc @ 71^$ & "" @ 72$^ - "" @ 73\($\)\(^\) b "" @ 74# stop retching, those are legitimate (although disgusting) 75^^ - "" @ 76$$ - "" @ 77b$ & abNc 78b$ &n abNc b 79^b$ & aNbNc 80^b$ &n aNbNc b 81^$ &n aNNb @Nb 82^$ n abc 83^$ n abcN @ 84$^ n aNNb @Nb 85\($\)\(^\) bn aNNb @Nb 86^^ n^ aNNb @Nb 87$$ n aNNb @NN 88^a ^ a 89a$ $ a 90^a ^n aNb 91^b ^n aNb b 92a$ $n bNa 93b$ $n bNa b 94a*(^b$)c* - b b 95a*\(^b$\)c* b b b 96 97# certain syntax errors and non-errors 98| C EMPTY 99| b | | 100* C BADRPT 101* b * * 102+ C BADRPT 103? C BADRPT 104"" &C EMPTY 105() - abc @abc 106\(\) b abc @abc 107a||b C EMPTY 108|ab C EMPTY 109ab| C EMPTY 110(|a)b C EMPTY 111(a|)b C EMPTY 112(*a) C BADRPT 113(+a) C BADRPT 114(?a) C BADRPT 115({1}a) C BADRPT 116\(\{1\}a\) bC BADRPT 117(a|*b) C BADRPT 118(a|+b) C BADRPT 119(a|?b) C BADRPT 120(a|{1}b) C BADRPT 121^* C BADRPT 122^* b * * 123^+ C BADRPT 124^? C BADRPT 125^{1} C BADRPT 126^\{1\} bC BADRPT 127 128# metacharacters, backslashes 129a.c & abc abc 130a[bc]d & abd abd 131a\*c & a*c a*c 132a\\b & a\b a\b 133a\\\*b & a\*b a\*b 134a\bc & abc abc 135a\ &C EESCAPE 136a\\bc & a\bc a\bc 137\{ bC BADRPT 138a\[b & a[b a[b 139a[b &C EBRACK 140# trailing $ is a peculiar special case for the BRE code 141a$ & a a 142a$ & a$ 143a\$ & a 144a\$ & a$ a$ 145a\\$ & a 146a\\$ & a$ 147a\\$ & a\$ 148a\\$ & a\ a\ 149 150# back references, ugh 151a\(b\)\2c bC ESUBREG 152a\(b\1\)c bC ESUBREG 153a\(b*\)c\1d b abbcbbd abbcbbd bb 154a\(b*\)c\1d b abbcbd 155a\(b*\)c\1d b abbcbbbd 156^\(.\)\1 b abc 157a\([bc]\)\1d b abcdabbd abbd b 158a\(\([bc]\)\2\)*d b abbccd abbccd 159a\(\([bc]\)\2\)*d b abbcbd 160# actually, this next one probably ought to fail, but the spec is unclear 161a\(\(b\)*\2\)*d b abbbd abbbd 162# here is a case that no NFA implementation does right 163\(ab*\)[ab]*\1 b ababaaa ababaaa a 164# check out normal matching in the presence of back refs 165\(a\)\1bcd b aabcd aabcd 166\(a\)\1bc*d b aabcd aabcd 167\(a\)\1bc*d b aabd aabd 168\(a\)\1bc*d b aabcccd aabcccd 169\(a\)\1bc*[ce]d b aabcccd aabcccd 170^\(a\)\1b\(c\)*cd$ b aabcccd aabcccd 171 172# ordinary repetitions 173ab*c & abc abc 174ab+c - abc abc 175ab?c - abc abc 176a\(*\)b b a*b a*b 177a\(**\)b b ab ab 178a\(***\)b bC BADRPT 179*a b *a *a 180**a b a a 181***a bC BADRPT 182 183# the dreaded bounded repetitions 184{ & { { 185{abc & {abc {abc 186{1 C BADRPT 187{1} C BADRPT 188a{b & a{b a{b 189a{1}b - ab ab 190a\{1\}b b ab ab 191a{1,}b - ab ab 192a\{1,\}b b ab ab 193a{1,2}b - aab aab 194a\{1,2\}b b aab aab 195a{1 C EBRACE 196a\{1 bC EBRACE 197a{1a C EBRACE 198a\{1a bC EBRACE 199a{1a} C BADBR 200a\{1a\} bC BADBR 201a{,2} - a{,2} a{,2} 202a\{,2\} bC BADBR 203a{,} - a{,} a{,} 204a\{,\} bC BADBR 205a{1,x} C BADBR 206a\{1,x\} bC BADBR 207a{1,x C EBRACE 208a\{1,x bC EBRACE 209a{300} C BADBR 210a\{300\} bC BADBR 211a{1,0} C BADBR 212a\{1,0\} bC BADBR 213ab{0,0}c - abcac ac 214ab\{0,0\}c b abcac ac 215ab{0,1}c - abcac abc 216ab\{0,1\}c b abcac abc 217ab{0,3}c - abbcac abbc 218ab\{0,3\}c b abbcac abbc 219ab{1,1}c - acabc abc 220ab\{1,1\}c b acabc abc 221ab{1,3}c - acabc abc 222ab\{1,3\}c b acabc abc 223ab{2,2}c - abcabbc abbc 224ab\{2,2\}c b abcabbc abbc 225ab{2,4}c - abcabbc abbc 226ab\{2,4\}c b abcabbc abbc 227((a{1,10}){1,10}){1,10} - a a a,a 228 229# multiple repetitions 230a** &C BADRPT 231a++ C BADRPT 232a?? C BADRPT 233a*+ C BADRPT 234a*? C BADRPT 235a+* C BADRPT 236a+? C BADRPT 237a?* C BADRPT 238a?+ C BADRPT 239a{1}{1} C BADRPT 240a*{1} C BADRPT 241a+{1} C BADRPT 242a?{1} C BADRPT 243a{1}* C BADRPT 244a{1}+ C BADRPT 245a{1}? C BADRPT 246a*{b} - a{b} a{b} 247a\{1\}\{1\} bC BADRPT 248a*\{1\} bC BADRPT 249a\{1\}* bC BADRPT 250 251# brackets, and numerous perversions thereof 252a[b]c & abc abc 253a[ab]c & abc abc 254a[^ab]c & adc adc 255a[]b]c & a]c a]c 256a[[b]c & a[c a[c 257a[-b]c & a-c a-c 258a[^]b]c & adc adc 259a[^-b]c & adc adc 260a[b-]c & a-c a-c 261a[b &C EBRACK 262a[] &C EBRACK 263a[1-3]c & a2c a2c 264a[3-1]c &C ERANGE 265a[1-3-5]c &C ERANGE 266a[[.-.]--]c & a-c a-c 267a[1- &C ERANGE 268a[[. &C EBRACK 269a[[.x &C EBRACK 270a[[.x. &C EBRACK 271a[[.x.] &C EBRACK 272a[[.x.]] & ax ax 273a[[.x,.]] &C ECOLLATE 274a[[.one.]]b & a1b a1b 275a[[.notdef.]]b &C ECOLLATE 276a[[.].]]b & a]b a]b 277a[[:alpha:]]c & abc abc 278a[[:notdef:]]c &C ECTYPE 279a[[: &C EBRACK 280a[[:alpha &C EBRACK 281a[[:alpha:] &C EBRACK 282a[[:alpha,:] &C ECTYPE 283a[[:]:]]b &C ECTYPE 284a[[:-:]]b &C ECTYPE 285a[[:alph:]] &C ECTYPE 286a[[:alphabet:]] &C ECTYPE 287[[:alnum:]]+ - -%@a0X- a0X 288[[:alpha:]]+ - -%@aX0- aX 289[[:blank:]]+ - aSSTb SST 290[[:cntrl:]]+ - aNTb NT 291[[:digit:]]+ - a019b 019 292[[:graph:]]+ - Sa%bS a%b 293[[:lower:]]+ - AabC ab 294[[:print:]]+ - NaSbN aSb 295[[:punct:]]+ - S%-&T %-& 296[[:space:]]+ - aSNTb SNT 297[[:upper:]]+ - aBCd BC 298[[:xdigit:]]+ - p0f3Cq 0f3C 299a[[=b=]]c & abc abc 300a[[= &C EBRACK 301a[[=b &C EBRACK 302a[[=b= &C EBRACK 303a[[=b=] &C EBRACK 304a[[=b,=]] &C ECOLLATE 305a[[=one=]]b & a1b a1b 306 307# complexities 308a(((b)))c - abc abc 309a(b|(c))d - abd abd 310a(b*|c)d - abbd abbd 311# just gotta have one DFA-buster, of course 312a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 313# and an inline expansion in case somebody gets tricky 314a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab 315# and in case somebody just slips in an NFA... 316a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee|week)(knights|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights 317# fish for anomalies as the number of states passes 32 31812345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789 319123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890 3201234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901 32112345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012 322123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123 323# and one really big one, beyond any plausible word width 3241234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890 325# fish for problems as brackets go past 8 326[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm 327[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo 328[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq 329[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq 330 331# subtleties of matching 332abc & xabcy abc 333a\(b\)?c\1d b acd 334aBc i Abc Abc 335a[Bc]*d i abBCcd abBCcd 3360[[:upper:]]1 &i 0a1 0a1 3370[[:lower:]]1 &i 0A1 0A1 338a[^b]c &i abc 339a[^b]c &i aBc 340a[^b]c &i adc adc 341[a]b[c] - abc abc 342[a]b[a] - aba aba 343[abc]b[abc] - abc abc 344[abc]b[abd] - abd abd 345a(b?c)+d - accd accd 346(wee|week)(knights|night) - weeknights weeknights 347(we|wee|week|frob)(knights|night|day) - weeknights weeknights 348a[bc]d - xyzaaabcaababdacd abd 349a[ab]c - aaabc abc 350abc s abc abc 351a* & b @b 352 353# Let's have some fun -- try to match a C comment. 354# first the obvious, which looks okay at first glance... 355/\*.*\*/ - /*x*/ /*x*/ 356# but... 357/\*.*\*/ - /*x*/y/*z*/ /*x*/y/*z*/ 358# okay, we must not match */ inside; try to do that... 359/\*([^*]|\*[^/])*\*/ - /*x*/ /*x*/ 360/\*([^*]|\*[^/])*\*/ - /*x*/y/*z*/ /*x*/ 361# but... 362/\*([^*]|\*[^/])*\*/ - /*x**/y/*z*/ /*x**/y/*z*/ 363# and a still fancier version, which does it right (I think)... 364/\*([^*]|\*+[^*/])*\*+/ - /*x*/ /*x*/ 365/\*([^*]|\*+[^*/])*\*+/ - /*x*/y/*z*/ /*x*/ 366/\*([^*]|\*+[^*/])*\*+/ - /*x**/y/*z*/ /*x**/ 367/\*([^*]|\*+[^*/])*\*+/ - /*x****/y/*z*/ /*x****/ 368/\*([^*]|\*+[^*/])*\*+/ - /*x**x*/y/*z*/ /*x**x*/ 369/\*([^*]|\*+[^*/])*\*+/ - /*x***x/y/*z*/ /*x***x/y/*z*/ 370 371# subexpressions 372a(b)(c)d - abcd abcd b,c 373a(((b)))c - abc abc b,b,b 374a(b|(c))d - abd abd b,- 375a(b*|c|e)d - abbd abbd bb 376a(b*|c|e)d - acd acd c 377a(b*|c|e)d - ad ad @d 378a(b?)c - abc abc b 379a(b?)c - ac ac @c 380a(b+)c - abc abc b 381a(b+)c - abbbc abbbc bbb 382a(b*)c - ac ac @c 383(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de 384# the regression tester only asks for 9 subexpressions 385a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j 386a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k 387a([bc]?)c - abc abc b 388a([bc]?)c - ac ac @c 389a([bc]+)c - abc abc b 390a([bc]+)c - abcc abcc bc 391a([bc]+)bc - abcbc abcbc bc 392a(bb+|b)b - abb abb b 393a(bbb+|bb+|b)b - abb abb b 394a(bbb+|bb+|b)b - abbb abbb bb 395a(bbb+|bb+|b)bb - abbb abbb b 396(.*).* - abcdef abcdef abcdef 397(a*)* - bc @b @b 398 399# do we get the right subexpression when it is used more than once? 400a(b|c)*d - ad ad - 401a(b|c)*d - abcd abcd c 402a(b|c)+d - abd abd b 403a(b|c)+d - abcd abcd c 404a(b|c?)+d - ad ad @d 405a(b|c?)+d - abcd abcd @d 406a(b|c){0,0}d - ad ad - 407a(b|c){0,1}d - ad ad - 408a(b|c){0,1}d - abd abd b 409a(b|c){0,2}d - ad ad - 410a(b|c){0,2}d - abcd abcd c 411a(b|c){0,}d - ad ad - 412a(b|c){0,}d - abcd abcd c 413a(b|c){1,1}d - abd abd b 414a(b|c){1,1}d - acd acd c 415a(b|c){1,2}d - abd abd b 416a(b|c){1,2}d - abcd abcd c 417a(b|c){1,}d - abd abd b 418a(b|c){1,}d - abcd abcd c 419a(b|c){2,2}d - acbd acbd b 420a(b|c){2,2}d - abcd abcd c 421a(b|c){2,4}d - abcd abcd c 422a(b|c){2,4}d - abcbd abcbd b 423a(b|c){2,4}d - abcbcd abcbcd c 424a(b|c){2,}d - abcd abcd c 425a(b|c){2,}d - abcbd abcbd b 426a(b+|((c)*))+d - abd abd @d,@d,- 427a(b+|((c)*))+d - abcd abcd @d,@d,- 428 429# check out the STARTEND option 430[abc] &# a(b)c b 431[abc] &# a(d)c 432[abc] &# a(bc)d b 433[abc] &# a(dc)d c 434. &# a()c 435b.*c &# b(bc)c bc 436b.* &# b(bc)c bc 437.*c &# b(bc)c bc 438 439# plain strings, with the NOSPEC flag 440abc m abc abc 441abc m xabcy abc 442abc m xyz 443a*b m aba*b a*b 444a*b m ab 445"" mC EMPTY 446 447# cases involving NULs 448aZb & a a 449aZb &p a 450aZb &p# (aZb) aZb 451aZ*b &p# (ab) ab 452a.b &# (aZb) aZb 453a.* &# (aZb)c aZb 454 455# word boundaries (ick) 456[[:<:]]a & a a 457[[:<:]]a & ba 458[[:<:]]a & -a a 459a[[:>:]] & a a 460a[[:>:]] & ab 461a[[:>:]] & a- a 462[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc 463[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc 464[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc 465[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc 466[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_ 467[[:<:]]a_b[[:>:]] & x_a_b 468 469# past problems, and suspected problems 470(A[1])|(A[2])|(A[3])|(A[4])|(A[5])|(A[6])|(A[7])|(A[8])|(A[9])|(A[A]) - A1 A1 471abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop 472abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv 473(ALAK)|(ALT[AB])|(CC[123]1)|(CM[123]1)|(GAMC)|(LC[23][EO ])|(SEM[1234])|(SL[ES][12])|(SLWW)|(SLF )|(SLDT)|(VWH[12])|(WH[34][EW])|(WP1[ESN]) - CC11 CC11 474CC[13]1|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11 475Char \([a-z0-9_]*\)\[.* b Char xyz[k Char xyz[k xyz 476a?b - ab ab 477-\{0,1\}[0-9]*$ b -5 -5 478