1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# This tests basic flowtable functionality.
5# Creates following default topology:
6#
7# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
# Router1 is the one doing flow offloading. Router2 has no special
# purpose other than having a link whose MTU is smaller than that of both
# Originator and Responder, i.e. the TCPMSS values announced by the peers
# are too large and will still result in fragmentation and/or PMTU discovery.
12#
# You can check with different Originator/Link/Responder MTUs, e.g.:
14# nft_flowtable.sh -o8000 -l1500 -r2000
15#
16
17
# Exit status the kselftest framework treats as "test skipped".
ksft_skip=4
# Overall script result, reported through the final 'exit'.
ret=0

# Paths of the transfer scratch files.  They start out empty and are
# only assigned (via mktemp) after the basic connectivity check.
ns1in="" ns2in=""
ns1out="" ns2out=""

# Value of nf_log_all_netns before this script modifies it.
log_netns="$(sysctl -n net.netfilter.nf_log_all_netns)"
28
# checktool CMD WHAT
#
# Runs CMD with all of its output discarded.  CMD is one string that is
# deliberately left unquoted so it is split into command and arguments.
# On success the function simply returns; on failure it prints
# "SKIP: Could not WHAT" and exits the script with $ksft_skip.
checktool (){
	if $1 > /dev/null 2>&1; then
		return 0
	fi

	echo "SKIP: Could not $2"
	exit $ksft_skip
}
35
# Skip (rather than fail) when a required tool or capability is missing.
# The last probe doubles as the creation of the nsr1 namespace.
checktool "nft --version" "run test without nft tool"
checktool "ip -Version" "run test without ip tool"
checktool "which nc" "run test without nc (netcat)"
checktool "ip netns add nsr1" "create net namespace"

# Remaining namespaces: the two end hosts and the second router.
for ns in ns1 ns2 nsr2; do
	ip netns add "$ns"
done
45
# Exit handler: deletes the namespaces ns1, nsr1, ns2 and nsr2, removes
# the transfer scratch files and restores net.netfilter.nf_log_all_netns.
#
# Globals read: ns1in ns1out ns2in ns2out log_netns
cleanup() {
	local i

	for i in 1 2; do
		ip netns del ns$i
		ip netns del nsr$i
	done

	rm -f "$ns1in" "$ns1out"
	rm -f "$ns2in" "$ns2out"

	# The script sets the sysctl to 1, so it only has to be put back when
	# the saved value was 0.  log_netns is empty if the initial read
	# produced nothing; a quoted string comparison keeps that case from
	# raising a test(1) "unary operator expected" error.
	if [ "$log_netns" = "0" ]; then
		sysctl -q net.netfilter.nf_log_all_netns=0
	fi
}
57
# From here on, tear everything down again however the script ends.
trap cleanup EXIT

# Enable nf_log for all namespaces (the ruleset below has a 'log' rule);
# cleanup puts the previous value back.
sysctl -q net.netfilter.nf_log_all_netns=1

# Three veth pairs: ns1 <-> nsr1, nsr1 <-> nsr2, nsr2 <-> ns2.
ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1
ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2
ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2

# Bring up loopback and both veth ends in each router namespace.
for dev in lo veth0 veth1; do
	for rtr in nsr1 nsr2; do
		ip -net "$rtr" link set "$dev" up
	done
done

# Host-facing router addresses (the default gateways of ns1 and ns2).
ip -net nsr1 addr add 10.0.1.1/24 dev veth0
ip -net nsr1 addr add dead:1::1/64 dev veth0

ip -net nsr2 addr add 10.0.2.1/24 dev veth1
ip -net nsr2 addr add dead:2::1/64 dev veth1

# Use different MTUs so that packets coming from ns1 (large MTU) towards
# ns2 (smaller MTU) have to be pushed up to the stack, either to be
# fragmented (ip_no_pmtu_disc=1) or to trigger PMTU discovery (ICMP error
# sent back to the originator).
# ns2 is reached via nsr2 over a link with a smaller MTU, so the TCPMSS
# announced by both peers is NOT the lowest link MTU.

# Defaults, overridable with -o / -l / -r.
omtu=9000
lmtu=1500
rmtu=2000
88
# Print the command-line help to stdout and exit with status 1.
usage(){
	printf '%s\n' \
		"nft_flowtable.sh [OPTIONS]" \
		"" \
		"MTU options" \
		"   -o originator" \
		"   -l link" \
		"   -r responder"
	exit 1
}
98
# Command-line overrides for the originator (-o), link (-l) and
# responder (-r) MTU; any other option prints the help and exits.
while getopts "o:l:r:" o; do
	case "$o" in
	o)
		omtu=$OPTARG
		;;
	l)
		lmtu=$OPTARG
		;;
	r)
		rmtu=$OPTARG
		;;
	*)
		usage
		;;
	esac
done
108
# Originator side: both ends of the ns1 <-> nsr1 link get the originator
# MTU.  Stop if ip rejects the value on the router end.
if ! ip -net nsr1 link set veth0 mtu "$omtu"; then
	exit 1
fi

ip -net ns1 link set eth0 mtu "$omtu"

# Responder side: same for the nsr2 <-> ns2 link.
if ! ip -net nsr2 link set veth1 mtu "$rmtu"; then
	exit 1
fi

ip -net ns2 link set eth0 mtu "$rmtu"

# Link between the routers: apply the -l value, otherwise the option only
# changes the 'meta length gt' rule in the ruleset while the link itself
# keeps its default MTU.
if ! ip -net nsr1 link set veth1 mtu "$lmtu"; then
	exit 1
fi

ip -net nsr2 link set veth0 mtu "$lmtu"

# transfer-net between nsr1 and nsr2.
# these addresses are not used for connections.
ip -net nsr1 addr add 192.168.10.1/24 dev veth1
ip -net nsr1 addr add fee1:2::1/64 dev veth1

ip -net nsr2 addr add 192.168.10.2/24 dev veth0
ip -net nsr2 addr add fee1:2::2/64 dev veth0

for i in 1 2; do
	# routers forward IPv4 on both interfaces
	ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
	ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null

	# end host ns$i: address .99 / ::99, default route via its router
	ip -net ns$i link set lo up
	ip -net ns$i link set eth0 up
	ip -net ns$i addr add 10.0.$i.99/24 dev eth0
	ip -net ns$i route add default via 10.0.$i.1
	ip -net ns$i addr add dead:$i::99/64 dev eth0
	ip -net ns$i route add default via dead:$i::1
	if ! ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
		echo "ERROR: Check Originator/Responder values (problem during address addition)"
		exit 1
	fi

	# don't set ip DF bit for first two tests
	ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
done

# Each router reaches the far side through the other router.
ip -net nsr1 route add default via 192.168.10.2
ip -net nsr2 route add default via 192.168.10.1
150
# Install the flowtable and the forward chain on Router1.  If nft rejects
# the ruleset the test is skipped instead of failed.
if ! ip netns exec nsr1 nft -f - <<EOF
table inet filter {
  flowtable f1 {
     hook ingress priority 0
     devices = { veth0, veth1 }
   }

   chain forward {
      type filter hook forward priority 0; policy drop;

      # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
      meta oif "veth1" tcp dport 12345 flow offload @f1 counter

      # use packet size to trigger 'should be offloaded by now'.
      # otherwise, if 'flow offload' expression never offloads, the
      # test will pass.
      tcp dport 12345 meta length gt 200 ct mark set 1 counter

      # this turns off flow offloading internally, so expect packets again
      tcp flags fin,rst ct mark set 0 accept

      # this allows large packets from responder, we need this as long
      # as PMTUd is off.
      # This rule is deleted for the last test, when we expect PMTUd
      # to kick in and ensure all packets meet mtu requirements.
      meta length gt $lmtu accept comment something-to-grep-for

      # next line blocks connection w.o. working offload.
      # we only do this for reverse dir, because we expect packets to
      # enter slow path due to MTU mismatch of veth0 and veth1.
      tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop

      ct state established,related accept

      # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
      meta length lt 200 oif "veth1" tcp dport 12345 counter accept

      meta nfproto ipv4 meta l4proto icmp accept
      meta nfproto ipv6 meta l4proto icmpv6 accept
   }
}
EOF
then
	echo "SKIP: Could not load nft ruleset"
	exit $ksft_skip
fi
198
# Test basic connectivity in both directions before any transfer test.
# A failure here is a setup error, so the script stops right away.
if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
	echo "ERROR: ns1 cannot reach ns2" 1>&2
	exit 1
fi

if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
	echo "ERROR: ns2 cannot reach ns1" 1>&2
	exit 1
fi

if [ "$ret" -eq 0 ]; then
	echo "PASS: netns routing/connectivity: ns1 can reach ns2"
fi
214
# Scratch files for the transfers: ns1in/ns2in are fed to netcat as
# input, ns1out/ns2out capture what each netcat instance receives.
for tmpvar in ns1in ns1out ns2in ns2out; do
	printf -v "$tmpvar" '%s' "$(mktemp)"
done
219
# make_file FILE
#
# Fills FILE with random data: 0..8191 blocks of 1 KiB followed by a
# tail of 128..1151 single bytes, so the result is never empty.
#
# Globals written: SIZE (length of the tail in bytes) and
#                  TSIZE (total number of bytes in FILE).
make_file()
{
	local name=$1

	SIZE=$((RANDOM % (1024 * 8)))
	TSIZE=$((SIZE * 1024))

	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null

	SIZE=$((RANDOM % 1024))
	SIZE=$((SIZE + 128))
	TSIZE=$((TSIZE + SIZE))
	# Append the tail through the shell's >> redirection so the blocks
	# written above are kept and the new bytes land after them.
	dd if=/dev/urandom bs=1 count=$SIZE 2> /dev/null >> "$name"
}
234
# check_transfer SENT RECEIVED LABEL
#
# Compares the files SENT and RECEIVED byte for byte.  Returns 0 when
# they are identical.  Otherwise prints "FAIL: file mismatch for LABEL"
# to stderr, lists both files with 'ls -l' and returns 1.
check_transfer()
{
	in=$1
	out=$2
	what=$3

	if cmp "$in" "$out" > /dev/null 2>&1; then
		return 0
	fi

	echo "FAIL: file mismatch for $what" 1>&2
	ls -l "$in"
	ls -l "$out"
	return 1
}
250
# test_tcp_forwarding_ip CLIENT_NS SERVER_NS DSTIP DSTPORT
#
# Starts a netcat listener on port 12345 in SERVER_NS (sending $ns2in,
# storing received data in $ns2out) and, one second later, a netcat
# client in CLIENT_NS that connects to DSTIP:DSTPORT (sending $ns1in,
# storing received data in $ns1out).  After three more seconds any
# netcat still running is killed, then both directions are verified
# with check_transfer.
#
# Returns 0 if both files arrived intact, 1 otherwise.
# Globals written: lpid, cpid (PIDs of listener and client).
test_tcp_forwarding_ip()
{
	local client_ns=$1
	local server_ns=$2
	local dstip=$3
	local dstport=$4
	local lret=0
	local pid

	ip netns exec $server_ns nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
	lpid=$!

	sleep 1
	ip netns exec $client_ns nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" &
	cpid=$!

	sleep 3

	# stop whichever side has not finished on its own
	for pid in $lpid $cpid; do
		if ps -p $pid > /dev/null; then
			kill $pid
		fi
	done

	wait

	check_transfer "$ns1in" "$ns2out" "ns1 -> ns2" || lret=1
	check_transfer "$ns2in" "$ns1out" "ns1 <- ns2" || lret=1

	return $lret
}
288
# test_tcp_forwarding CLIENT_NS SERVER_NS
#
# Runs the transfer test against the address 10.0.2.99, port 12345, and
# returns the status of test_tcp_forwarding_ip.
test_tcp_forwarding()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
}
295
# test_tcp_forwarding_nat CLIENT_NS SERVER_NS
#
# Runs the transfer test twice: first against 10.0.2.99:12345 and, only
# if that succeeded, against 10.6.6.6:1666.  Returns the status of the
# last transfer that was run.
test_tcp_forwarding_nat()
{
	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345 || return $?

	test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
}
310
# Random payloads, one per direction.
for payload in "$ns1in" "$ns2in"; do
	make_file "$payload"
done

# First test:
# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
if ! test_tcp_forwarding ns1 ns2; then
	echo "FAIL: flow offload for ns1/ns2:" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
else
	echo "PASS: flow offloaded for ns1/ns2"
fi
323
# Drop ns2's default routes and leave it only a host route to nsr1's
# transfer-net address.  ns2 can then no longer answer ns1 directly and
# depends on ns1 being masqueraded behind nsr1.
ip -net ns2 route del default via 10.0.2.1
ip -net ns2 route del default via dead:2::1
ip -net ns2 route add 192.168.10.1 via 10.0.2.1

# Second test:
# Same transfer, now with NAT on Router1: DNAT for 10.6.6.6:1666 and
# masquerading of everything leaving via veth1.
ip netns exec nsr1 nft -f - <<EOF
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat ns1 ns2; then
	echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
else
	echo "PASS: flow offloaded for ns1/ns2 with NAT"
fi
354
# Third test:
# Same as second test, but with PMTU discovery enabled.

# Find the rule that accepts over-sized packets through its comment.  The
# text after '#' in the 'nft -a' listing is "handle <n>", which is passed
# on unquoted so that it splits into the two words nft expects.
handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)

if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then
	echo "FAIL: Could not delete large-packet accept rule"
	exit 1
fi

# Re-enable PMTU discovery on both end hosts.
ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null

if test_tcp_forwarding_nat ns1 ns2; then
	echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
else
	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
	ip netns exec nsr1 nft list ruleset
	# record the failure so it is reflected in the exit status
	ret=1
fi
373
# Another test:
# Add bridge interface br0 to Router1, with NAT enabled.
# veth0 becomes a port of br0 and its addresses move to the bridge.
ip -net nsr1 link add name br0 type bridge
ip -net nsr1 addr flush dev veth0
ip -net nsr1 link set up dev veth0
ip -net nsr1 link set veth0 master br0
ip -net nsr1 addr add 10.0.1.1/24 dev br0
ip -net nsr1 addr add dead:1::1/64 dev br0
ip -net nsr1 link set up dev br0

ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null

# br0 with NAT enabled: same NAT table as before, but the DNAT rule now
# matches on br0 as input interface.
ip netns exec nsr1 nft -f - <<EOF
flush table ip nat
table ip nat {
   chain prerouting {
      type nat hook prerouting priority 0; policy accept;
      meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
   }

   chain postrouting {
      type nat hook postrouting priority 0; policy accept;
      meta oifname "veth1" counter masquerade
   }
}
EOF

if ! test_tcp_forwarding_nat ns1 ns2; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
else
	echo "PASS: flow offloaded for ns1/ns2 with bridge NAT"
fi
409
# Another test:
# Add bridge interface br0 to Router1, with NAT and VLAN.
# Router side: the bridge port is now VLAN 10 on top of veth0 instead of
# veth0 itself.
ip -net nsr1 link set veth0 nomaster
ip -net nsr1 link set down dev veth0
ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10
ip -net nsr1 link set up dev veth0
ip -net nsr1 link set up dev veth0.10
ip -net nsr1 link set veth0.10 master br0

# Originator side: move the addresses from eth0 to a matching VLAN 10
# interface.
ip -net ns1 addr flush dev eth0
ip -net ns1 link add link eth0 name eth0.10 type vlan id 10
ip -net ns1 link set eth0 up
ip -net ns1 link set eth0.10 up
ip -net ns1 addr add 10.0.1.99/24 dev eth0.10
ip -net ns1 route add default via 10.0.1.1
ip -net ns1 addr add dead:1::99/64 dev eth0.10

if ! test_tcp_forwarding_nat ns1 ns2; then
	echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
	ip netns exec nsr1 nft list ruleset
	ret=1
else
	echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN"
fi
434
# Restore the test topology (remove bridge and VLAN).
# Router1 first ...
ip -net nsr1 link set veth0 nomaster
ip -net nsr1 link set veth0 down
ip -net nsr1 link set veth0.10 down
ip -net nsr1 link delete veth0.10 type vlan
ip -net nsr1 link delete br0 type bridge
# ... then the originator.
ip -net ns1 addr flush dev eth0.10
ip -net ns1 link set eth0.10 down
ip -net ns1 link set eth0 down
ip -net ns1 link delete eth0.10 type vlan

# Put the original addresses and default routes back on the plain
# interfaces of ns1 and nsr1.
ip -net ns1 link set eth0 up
ip -net ns1 addr add 10.0.1.99/24 dev eth0
ip -net ns1 route add default via 10.0.1.1
ip -net ns1 addr add dead:1::99/64 dev eth0
ip -net ns1 route add default via dead:1::1
ip -net nsr1 addr add 10.0.1.1/24 dev veth0
ip -net nsr1 addr add dead:1::1/64 dev veth0
ip -net nsr1 link set up dev veth0
455
# Throw-away key material for the ESP test: hex digests of the current
# process listing, prefixed with 0x.
KEY_SHA="0x$(ps -xaf | sha1sum | cut -d " " -f 1)"
KEY_AES="0x$(ps -xaf | md5sum | cut -d " " -f 1)"

# Two SPIs, one per direction; bump the second one if both random
# values happen to be equal.
SPI1=$RANDOM
SPI2=$RANDOM
if [ "$SPI1" -eq "$SPI2" ]; then
	SPI2=$((SPI2 + 1))
fi
464
# do_esp NS ME REMOTE LNET RNET SPI_OUT SPI_IN
#
# Configures an ESP tunnel-mode endpoint in namespace NS:
#   - one xfrm state for REMOTE -> ME using SPI_IN and one for
#     ME -> REMOTE using SPI_OUT, both keyed with $KEY_AES / $KEY_SHA
#   - an 'out' policy for traffic LNET -> RNET and a 'fwd' policy for
#     traffic RNET -> LNET
do_esp() {
	local netns=$1
	local local_ip=$2
	local peer_ip=$3
	local local_net=$4
	local peer_net=$5
	local spi_tx=$6
	local spi_rx=$7
	# argument runs shared by both states / both policies
	local -a crypto=(enc aes "$KEY_AES" auth sha1 "$KEY_SHA" mode tunnel)
	local -a allow=(proto esp mode tunnel priority 1 action allow)

	# inbound state first, then outbound
	ip -net "$netns" xfrm state add src "$peer_ip" dst "$local_ip" \
		proto esp spi "$spi_rx" "${crypto[@]}" \
		sel src "$peer_net" dst "$local_net"
	ip -net "$netns" xfrm state add src "$local_ip" dst "$peer_ip" \
		proto esp spi "$spi_tx" "${crypto[@]}" \
		sel src "$local_net" dst "$peer_net"

	# to encrypt packets as they go out (includes forwarded packets that need encapsulation)
	ip -net "$netns" xfrm policy add src "$local_net" dst "$peer_net" dir out \
		tmpl src "$local_ip" dst "$peer_ip" "${allow[@]}"
	# to fwd decrypted packets after esp processing:
	ip -net "$netns" xfrm policy add src "$peer_net" dst "$local_net" dir fwd \
		tmpl src "$peer_ip" dst "$local_ip" "${allow[@]}"
}
483
# Mirror-image ESP endpoints on both routers: each side's outbound SPI is
# the other side's inbound SPI.
do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2

do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1

# The ipsec test runs without NAT.
ip netns exec nsr1 nft delete table ip nat

# restore default routes
ip -net ns2 route del 192.168.10.1 via 10.0.2.1
ip -net ns2 route add default via 10.0.2.1
ip -net ns2 route add default via dead:2::1

if test_tcp_forwarding ns1 ns2; then
	echo "PASS: ipsec tunnel mode for ns1/ns2"
else
	echo "FAIL: ipsec tunnel mode for ns1/ns2"
	ip netns exec nsr1 nft list ruleset 1>&2
	ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
	# record the failure so it is reflected in the exit status
	ret=1
fi

exit $ret
504