# $NetBSD: varmod-subst-regex.mk,v 1.7 2021/06/21 08:17:39 rillig Exp $
#
# Tests for the :C,from,to, variable modifier.

# Enable lint mode (-dL) so that unmatched subexpressions are reported;
# the target unmatched-subexpression relies on these messages.
.MAKEFLAGS: -dL

# Each target of this file that has commands is a prerequisite of 'all',
# which is the first and therefore the default target.
all: mod-regex-compile-error
all: mod-regex-limits
all: mod-regex-errors
all: unmatched-subexpression

# The variable expression expands to 4 words.  Of these words, none matches
# the regular expression "a b" since these words don't contain any
# whitespace.
.if ${:Ua b b c:C,a b,,} != "a b b c"
.  error
.endif

# Using the '1' modifier does not change anything.  The '1' modifier just
# means to apply at most 1 replacement in the whole variable expression.
.if ${:Ua b b c:C,a b,,1} != "a b b c"
.  error
.endif

# The 'W' modifier treats the whole variable value as a single big word,
# containing whitespace.  This big word matches the regular expression,
# therefore it gets replaced.  Whitespace is preserved after replacing.
.if ${:Ua b b c:C,a b,,W} != " b c"
.  error
.endif

# The 'g' modifier does not have any effect here since each of the words
# contains the character 'b' at most once.
.if ${:Ua b b c:C,b,,g} != "a c"
.  error
.endif

# The first :C modifier has the 'W' modifier, which makes the whole
# expression a single word.  The 'g' modifier then replaces all occurrences
# of "1 2" with "___".  The 'W' modifier only applies to this single :C
# modifier.  This is demonstrated by the :C modifier that follows.  If the
# 'W' modifier were preserved, only a single underscore would have been
# replaced with an 'x'.
.if ${:U1 2 3 1 2 3:C,1 2,___,Wg:C,_,x,} != "x__ 3 x__ 3"
.  error
.endif

# The regular expression does not match in the first word.
# It matches once in the second word, and the \0\0 doubles that word.
# In the third word, the regular expression matches as early as possible,
# and since the matches must not overlap, the next possible match would
# start at the 6, but at that point, there is only one character left,
# and that cannot match the regular expression "..".  Therefore only the
# "45" is doubled in the third word.
.if ${:U1 23 456:C,..,\0\0,} != "1 2323 45456"
.  error
.endif

# The modifier '1' applies the replacement at most once, across the whole
# expression value, no matter whether it is a single big word or many small
# words.
#
# Up to 2020-08-28, the manual page said that the modifiers '1' and 'g'
# were orthogonal, which was wrong.  It doesn't make sense to specify both
# 'g' and '1' at the same time.
.if ${:U12345 12345:C,.,\0\0,1} != "112345 12345"
.  error
.endif

# A regular expression that matches the empty string applies before every
# single character of the word.
# XXX: Most other places where regular expressions are used match at the end
# of the string as well.
.if ${:U1a2b3c:C,a*,*,g} != "*1**2*b*3*c"
.  error
.endif

# A dot in the regular expression matches any character, even a newline.
# In most other contexts where regular expressions are used, a dot matches
# any character except newline.  In make, regcomp is called without
# REG_NEWLINE, thus newline is an ordinary character.
.if ${:U"${.newline}":C,.,.,g} != "..."
.  error
.endif

# Multiple asterisks form an invalid regular expression.  This produces an
# error message and (as of 2020-08-28) stops parsing in the middle of the
# variable expression.  The unparsed part of the expression is then copied
# verbatim to the output, which is unexpected and can lead to strange shell
# commands being run.
mod-regex-compile-error:
	@echo $@: ${:Uword1 word2:C,****,____,g:C,word,____,:Q}.

# These tests generate error messages but as of 2020-08-28 just continue
# parsing and execution as if nothing bad had happened.
#
# The labels "missing" and "ok" tell whether the capturing group that is
# referenced in the replacement exists in the regular expression.
mod-regex-limits:
	@echo $@:11-missing:${:U1 23 456:C,..,\1\1,:Q}
	@echo $@:11-ok:${:U1 23 456:C,(.).,\1\1,:Q}
	@echo $@:22-missing:${:U1 23 456:C,..,\2\2,:Q}
	@echo $@:22-missing:${:U1 23 456:C,(.).,\2\2,:Q}
	@echo $@:22-ok:${:U1 23 456:C,(.)(.),\2\2,:Q}
	# The :C modifier only handles single-digit capturing groups,
	# which is more than enough for daily use.
	@echo $@:capture:${:UabcdefghijABCDEFGHIJrest:C,(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.),\9\8\7\6\5\4\3\2\1\0\10\11\12,}

# Error cases of the :C modifier.  In the first command, the regular
# expression "[" is an incomplete bracket expression, so it is expected to
# be rejected when it is compiled.
mod-regex-errors:
	@echo $@: ${UNDEF:Uvalue:C,[,,}

	# If the replacement pattern produces a parse error because of an
	# unknown modifier, the parse error is ignored in ParseModifierPart
	# and the faulty variable expression expands to "".
	@echo $@: ${word:L:C,.*,x${:U:Z}y,W}

# In regular expressions with alternatives, not all capturing groups are
# always set; some may be missing.  Make calls these "unmatched
# subexpressions".
#
# From var.c 1.16 from 1996-12-24 until before var.c 1.933 from 2021-06-21,
# unmatched subexpressions produced an "error message" but did not have any
# further effect since the "error handling" didn't influence the exit status.
#
# Before 2021-06-21 there was no way to turn off this warning, thus the
# combination of alternative matches and capturing groups was seldom used, if
# at all.
#
# Since var.c 1.933 from 2021-06-21, the error message is only printed in lint
# mode (-dL), but not in default mode.
#
# As an alternative to the change from var.c 1.933 from 2021-06-21, a possible
# mitigation would have been to add a new modifier 'U' to the already existing
# '1Wg' modifiers of the ':C' modifier.  That modifier could have been used in
# the modifier ':C,(a.)|(b.),\1\2,U' to treat unmatched subexpressions as
# empty.  This approach would have created a syntactical ambiguity since the
# modifiers ':S' and ':C' are open-ended (see mod-subst-chain), that is, they
# do not need to be followed by a ':' to separate them from the next modifier.
# Luckily the modifier :U does not make sense after :C, therefore this case
# does not happen in practice.
unmatched-subexpression:
	# In each of the following cases, if the regular expression matches at
	# all, the subexpression \1 matches as well.
	@echo $@.ok: ${:U1 1 2 3 5 8 13 21 34:C,1(.*),one\1,}

	# In the following cases:
	#	* The subexpression \1 is only defined for 1 and 13.
	#	* The subexpression \2 is only defined for 2 and 21.
	#	* If the regular expression does not match at all, the
	#	  replacement string is not analyzed, thus no error messages.
	# In total, there are 5 error messages about unmatched subexpressions.
	@echo $@.1:  ${:U  1:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \2
	@echo $@.1:  ${:U  1:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \2
	@echo $@.2:  ${:U  2:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \1
	@echo $@.3:  ${:U  3:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.5:  ${:U  5:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.8:  ${:U  8:C,1(.*)|2(.*),(\1)(\2),:Q}
	@echo $@.13: ${:U 13:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \2
	@echo $@.21: ${:U 21:C,1(.*)|2(.*),(\1)(\2),:Q}		# missing \1
	@echo $@.34: ${:U 34:C,1(.*)|2(.*),(\1)(\2),:Q}

	# And now all together: 5 error messages for 1, 1, 2, 13, 21.
	@echo $@.all: ${:U1 1 2 3 5 8 13 21 34:C,1(.*)|2(.*),(\1)(\2),:Q}
