xref: /freebsd/sys/dev/hyperv/netvsc/if_hn.c (revision 63a7c4be)
1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54 
55 #include <sys/cdefs.h>
56 #include "opt_hn.h"
57 #include "opt_inet6.h"
58 #include "opt_inet.h"
59 #include "opt_rss.h"
60 
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/bus.h>
64 #include <sys/counter.h>
65 #include <sys/kernel.h>
66 #include <sys/limits.h>
67 #include <sys/malloc.h>
68 #include <sys/mbuf.h>
69 #include <sys/module.h>
70 #include <sys/queue.h>
71 #include <sys/lock.h>
72 #include <sys/proc.h>
73 #include <sys/rmlock.h>
74 #include <sys/sbuf.h>
75 #include <sys/sched.h>
76 #include <sys/smp.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
79 #include <sys/sx.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
84 #include <sys/epoch.h>
85 
86 #include <vm/vm.h>
87 #include <vm/vm_extern.h>
88 #include <vm/pmap.h>
89 
90 #include <machine/atomic.h>
91 #include <machine/in_cksum.h>
92 
93 #include <net/bpf.h>
94 #include <net/ethernet.h>
95 #include <net/if.h>
96 #include <net/if_dl.h>
97 #include <net/if_media.h>
98 #include <net/if_types.h>
99 #include <net/if_var.h>
100 #include <net/rndis.h>
101 #ifdef RSS
102 #include <net/rss_config.h>
103 #endif
104 
105 #include <netinet/in_systm.h>
106 #include <netinet/in.h>
107 #include <netinet/ip.h>
108 #include <netinet/ip6.h>
109 #include <netinet/tcp.h>
110 #include <netinet/tcp_lro.h>
111 #include <netinet/udp.h>
112 
113 #include <dev/hyperv/include/hyperv.h>
114 #include <dev/hyperv/include/hyperv_busdma.h>
115 #include <dev/hyperv/include/vmbus.h>
116 #include <dev/hyperv/include/vmbus_xact.h>
117 
118 #include <dev/hyperv/netvsc/ndis.h>
119 #include <dev/hyperv/netvsc/if_hnreg.h>
120 #include <dev/hyperv/netvsc/if_hnvar.h>
121 #include <dev/hyperv/netvsc/hn_nvs.h>
122 #include <dev/hyperv/netvsc/hn_rndis.h>
123 
124 #include "vmbus_if.h"
125 
126 #define HN_IFSTART_SUPPORT
127 
128 #define HN_RING_CNT_DEF_MAX		8
129 
130 #define HN_VFMAP_SIZE_DEF		8
131 
132 #define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */
133 
134 /* YYY should get it from the underlying channel */
135 #define HN_TX_DESC_CNT			512
136 
137 #define HN_RNDIS_PKT_LEN					\
138 	(sizeof(struct rndis_packet_msg) +			\
139 	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
140 	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
141 	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
142 	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
143 #define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
144 #define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
145 
146 #define HN_TX_DATA_BOUNDARY		PAGE_SIZE
147 #define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
148 #define HN_TX_DATA_SEGSIZE		PAGE_SIZE
149 /* -1 for RNDIS packet message */
150 #define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)
151 
152 #define HN_DIRECT_TX_SIZE_DEF		128
153 
154 #define HN_EARLY_TXEOF_THRESH		8
155 
156 #define HN_PKTBUF_LEN_DEF		(16 * 1024)
157 
158 #define HN_LROENT_CNT_DEF		128
159 
160 #define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
161 #define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
162 /* YYY 2*MTU is a bit rough, but should be good enough. */
163 #define HN_LRO_LENLIM_MIN(ifp)		(2 * if_getmtu(ifp))
164 
165 #define HN_LRO_ACKCNT_DEF		1
166 
167 #define HN_LOCK_INIT(sc)		\
168 	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
169 #define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
170 #define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
171 #define HN_LOCK(sc)					\
172 do {							\
173 	while (sx_try_xlock(&(sc)->hn_lock) == 0) {	\
174 		/* Relinquish cpu to avoid deadlock */	\
175 		sched_relinquish(curthread);		\
176 		DELAY(1000);				\
177 	}						\
178 } while (0)
179 #define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
180 
181 #define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
182 #define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
183 #define HN_CSUM_IP_HWASSIST(sc)		\
184 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
185 #define HN_CSUM_IP6_HWASSIST(sc)	\
186 	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
187 
188 #define HN_PKTSIZE_MIN(align)		\
189 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
190 	    HN_RNDIS_PKT_LEN, (align))
191 #define HN_PKTSIZE(m, align)		\
192 	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
193 
194 #ifdef RSS
195 #define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
196 #else
197 #define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
198 #endif
199 
200 struct hn_txdesc {
201 #ifndef HN_USE_TXDESC_BUFRING
202 	SLIST_ENTRY(hn_txdesc)		link;
203 #endif
204 	STAILQ_ENTRY(hn_txdesc)		agg_link;
205 
206 	/* Aggregated txdescs, in sending order. */
207 	STAILQ_HEAD(, hn_txdesc)	agg_list;
208 
209 	/* The oldest packet, if transmission aggregation happens. */
210 	struct mbuf			*m;
211 	struct hn_tx_ring		*txr;
212 	int				refs;
213 	uint32_t			flags;	/* HN_TXD_FLAG_ */
214 	struct hn_nvs_sendctx		send_ctx;
215 	uint32_t			chim_index;
216 	int				chim_size;
217 
218 	bus_dmamap_t			data_dmap;
219 
220 	bus_addr_t			rndis_pkt_paddr;
221 	struct rndis_packet_msg		*rndis_pkt;
222 	bus_dmamap_t			rndis_pkt_dmap;
223 };
224 
225 #define HN_TXD_FLAG_ONLIST		0x0001
226 #define HN_TXD_FLAG_DMAMAP		0x0002
227 #define HN_TXD_FLAG_ONAGG		0x0004
228 
229 #define	HN_NDIS_PKTINFO_SUBALLOC	0x01
230 #define	HN_NDIS_PKTINFO_1ST_FRAG	0x02
231 #define	HN_NDIS_PKTINFO_LAST_FRAG	0x04
232 
233 struct packet_info_id {
234 	uint8_t				ver;
235 	uint8_t				flag;
236 	uint16_t			pkt_id;
237 };
238 
239 #define NDIS_PKTINFOID_SZ		sizeof(struct packet_info_id)
240 
241 
242 struct hn_rxinfo {
243 	const uint32_t			*vlan_info;
244 	const uint32_t			*csum_info;
245 	const uint32_t			*hash_info;
246 	const uint32_t			*hash_value;
247 	const struct packet_info_id	*pktinfo_id;
248 };
249 
250 struct hn_rxvf_setarg {
251 	struct hn_rx_ring	*rxr;
252 	if_t			vf_ifp;
253 };
254 
255 #define HN_RXINFO_VLAN			0x0001
256 #define HN_RXINFO_CSUM			0x0002
257 #define HN_RXINFO_HASHINF		0x0004
258 #define HN_RXINFO_HASHVAL		0x0008
259 #define HN_RXINFO_PKTINFO_ID		0x0010
260 #define HN_RXINFO_ALL			\
261 	(HN_RXINFO_VLAN |		\
262 	 HN_RXINFO_CSUM |		\
263 	 HN_RXINFO_HASHINF |		\
264 	 HN_RXINFO_HASHVAL |		\
265 	 HN_RXINFO_PKTINFO_ID)
266 
267 static int			hn_probe(device_t);
268 static int			hn_attach(device_t);
269 static int			hn_detach(device_t);
270 static int			hn_shutdown(device_t);
271 static void			hn_chan_callback(struct vmbus_channel *,
272 				    void *);
273 
274 static void			hn_init(void *);
275 static int			hn_ioctl(if_t, u_long, caddr_t);
276 #ifdef HN_IFSTART_SUPPORT
277 static void			hn_start(if_t);
278 #endif
279 static int			hn_transmit(if_t, struct mbuf *);
280 static void			hn_xmit_qflush(if_t);
281 static int			hn_ifmedia_upd(if_t);
282 static void			hn_ifmedia_sts(if_t,
283 				    struct ifmediareq *);
284 
285 static void			hn_ifnet_event(void *, if_t, int);
286 static void			hn_ifaddr_event(void *, if_t);
287 static void			hn_ifnet_attevent(void *, if_t);
288 static void			hn_ifnet_detevent(void *, if_t);
289 static void			hn_ifnet_lnkevent(void *, if_t, int);
290 
291 static bool			hn_ismyvf(const struct hn_softc *,
292 				    const if_t);
293 static void			hn_rxvf_change(struct hn_softc *,
294 				    if_t, bool);
295 static void			hn_rxvf_set(struct hn_softc *, if_t);
296 static void			hn_rxvf_set_task(void *, int);
297 static void			hn_xpnt_vf_input(if_t, struct mbuf *);
298 static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
299 static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
300 				    struct ifreq *);
301 static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
302 static bool			hn_xpnt_vf_isready(struct hn_softc *);
303 static void			hn_xpnt_vf_setready(struct hn_softc *);
304 static void			hn_xpnt_vf_init_taskfunc(void *, int);
305 static void			hn_xpnt_vf_init(struct hn_softc *);
306 static void			hn_xpnt_vf_setenable(struct hn_softc *);
307 static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
308 static void			hn_vf_rss_fixup(struct hn_softc *, bool);
309 static void			hn_vf_rss_restore(struct hn_softc *);
310 
311 static int			hn_rndis_rxinfo(const void *, int,
312 				    struct hn_rxinfo *);
313 static void			hn_rndis_rx_data(struct hn_rx_ring *,
314 				    const void *, int);
315 static void			hn_rndis_rx_status(struct hn_softc *,
316 				    const void *, int);
317 static void			hn_rndis_init_fixat(struct hn_softc *, int);
318 
319 static void			hn_nvs_handle_notify(struct hn_softc *,
320 				    const struct vmbus_chanpkt_hdr *);
321 static void			hn_nvs_handle_comp(struct hn_softc *,
322 				    struct vmbus_channel *,
323 				    const struct vmbus_chanpkt_hdr *);
324 static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
325 				    struct vmbus_channel *,
326 				    const struct vmbus_chanpkt_hdr *);
327 static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
328 				    struct vmbus_channel *, uint64_t);
329 
330 static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
331 static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
332 static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
333 static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
334 static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
335 static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
336 static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
337 static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
338 static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
339 static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
340 static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
341 static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
342 #ifndef RSS
343 static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
344 static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
345 #endif
346 static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
347 static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
348 static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
349 static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
350 static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
351 static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
352 static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
353 static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
354 static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
355 static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
356 static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
357 static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
358 static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
359 static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
360 
361 static void			hn_stop(struct hn_softc *, bool);
362 static void			hn_init_locked(struct hn_softc *);
363 static int			hn_chan_attach(struct hn_softc *,
364 				    struct vmbus_channel *);
365 static void			hn_chan_detach(struct hn_softc *,
366 				    struct vmbus_channel *);
367 static int			hn_attach_subchans(struct hn_softc *);
368 static void			hn_detach_allchans(struct hn_softc *);
369 static void			hn_chan_rollup(struct hn_rx_ring *,
370 				    struct hn_tx_ring *);
371 static void			hn_set_ring_inuse(struct hn_softc *, int);
372 static int			hn_synth_attach(struct hn_softc *, int);
373 static void			hn_synth_detach(struct hn_softc *);
374 static int			hn_synth_alloc_subchans(struct hn_softc *,
375 				    int *);
376 static bool			hn_synth_attachable(const struct hn_softc *);
377 static void			hn_suspend(struct hn_softc *);
378 static void			hn_suspend_data(struct hn_softc *);
379 static void			hn_suspend_mgmt(struct hn_softc *);
380 static void			hn_resume(struct hn_softc *);
381 static void			hn_resume_data(struct hn_softc *);
382 static void			hn_resume_mgmt(struct hn_softc *);
383 static void			hn_suspend_mgmt_taskfunc(void *, int);
384 static void			hn_chan_drain(struct hn_softc *,
385 				    struct vmbus_channel *);
386 static void			hn_disable_rx(struct hn_softc *);
387 static void			hn_drain_rxtx(struct hn_softc *, int);
388 static void			hn_polling(struct hn_softc *, u_int);
389 static void			hn_chan_polling(struct vmbus_channel *, u_int);
390 static void			hn_mtu_change_fixup(struct hn_softc *);
391 
392 static void			hn_update_link_status(struct hn_softc *);
393 static void			hn_change_network(struct hn_softc *);
394 static void			hn_link_taskfunc(void *, int);
395 static void			hn_netchg_init_taskfunc(void *, int);
396 static void			hn_netchg_status_taskfunc(void *, int);
397 static void			hn_link_status(struct hn_softc *);
398 
399 static int			hn_create_rx_data(struct hn_softc *, int);
400 static void			hn_destroy_rx_data(struct hn_softc *);
401 static int			hn_check_iplen(const struct mbuf *, int);
402 static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
403 static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
404 static int			hn_rxfilter_config(struct hn_softc *);
405 static int			hn_rss_reconfig(struct hn_softc *);
406 static void			hn_rss_ind_fixup(struct hn_softc *);
407 static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
408 static int			hn_rxpkt(struct hn_rx_ring *);
409 static uint32_t			hn_rss_type_fromndis(uint32_t);
410 static uint32_t			hn_rss_type_tondis(uint32_t);
411 
412 static int			hn_tx_ring_create(struct hn_softc *, int);
413 static void			hn_tx_ring_destroy(struct hn_tx_ring *);
414 static int			hn_create_tx_data(struct hn_softc *, int);
415 static void			hn_fixup_tx_data(struct hn_softc *);
416 static void			hn_fixup_rx_data(struct hn_softc *);
417 static void			hn_destroy_tx_data(struct hn_softc *);
418 static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
419 static void			hn_txdesc_gc(struct hn_tx_ring *,
420 				    struct hn_txdesc *);
421 static int			hn_encap(if_t, struct hn_tx_ring *,
422 				    struct hn_txdesc *, struct mbuf **);
423 static int			hn_txpkt(if_t, struct hn_tx_ring *,
424 				    struct hn_txdesc *);
425 static void			hn_set_chim_size(struct hn_softc *, int);
426 static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
427 static bool			hn_tx_ring_pending(struct hn_tx_ring *);
428 static void			hn_tx_ring_qflush(struct hn_tx_ring *);
429 static void			hn_resume_tx(struct hn_softc *, int);
430 static void			hn_set_txagg(struct hn_softc *);
431 static void			*hn_try_txagg(if_t,
432 				    struct hn_tx_ring *, struct hn_txdesc *,
433 				    int);
434 static int			hn_get_txswq_depth(const struct hn_tx_ring *);
435 static void			hn_txpkt_done(struct hn_nvs_sendctx *,
436 				    struct hn_softc *, struct vmbus_channel *,
437 				    const void *, int);
438 static int			hn_txpkt_sglist(struct hn_tx_ring *,
439 				    struct hn_txdesc *);
440 static int			hn_txpkt_chim(struct hn_tx_ring *,
441 				    struct hn_txdesc *);
442 static int			hn_xmit(struct hn_tx_ring *, int);
443 static void			hn_xmit_taskfunc(void *, int);
444 static void			hn_xmit_txeof(struct hn_tx_ring *);
445 static void			hn_xmit_txeof_taskfunc(void *, int);
446 #ifdef HN_IFSTART_SUPPORT
447 static int			hn_start_locked(struct hn_tx_ring *, int);
448 static void			hn_start_taskfunc(void *, int);
449 static void			hn_start_txeof(struct hn_tx_ring *);
450 static void			hn_start_txeof_taskfunc(void *, int);
451 #endif
452 
453 static int			hn_rsc_sysctl(SYSCTL_HANDLER_ARGS);
454 
455 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
456     "Hyper-V network interface");
457 
458 /* Trust tcp segment verification on host side. */
459 static int			hn_trust_hosttcp = 1;
460 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
461     &hn_trust_hosttcp, 0,
462     "Trust tcp segment verification on host side, "
463     "when csum info is missing (global setting)");
464 
465 /* Trust udp datagrams verification on host side. */
466 static int			hn_trust_hostudp = 1;
467 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
468     &hn_trust_hostudp, 0,
469     "Trust udp datagram verification on host side, "
470     "when csum info is missing (global setting)");
471 
472 /* Trust ip packets verification on host side. */
473 static int			hn_trust_hostip = 1;
474 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
475     &hn_trust_hostip, 0,
476     "Trust ip packet verification on host side, "
477     "when csum info is missing (global setting)");
478 
479 /*
480  * Offload UDP/IPv4 checksum.
481  */
482 static int			hn_enable_udp4cs = 1;
483 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
484     &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
485 
486 /*
487  * Offload UDP/IPv6 checksum.
488  */
489 static int			hn_enable_udp6cs = 1;
490 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
491     &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
492 
493 /* Stats. */
494 static counter_u64_t		hn_udpcs_fixup;
495 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
496     &hn_udpcs_fixup, "# of UDP checksum fixup");
497 
498 /*
499  * See hn_set_hlen().
500  *
501  * This value is for Azure.  For Hyper-V, set this above
502  * 65536 to disable UDP datagram checksum fixup.
503  */
504 static int			hn_udpcs_fixup_mtu = 1420;
505 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
506     &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
507 
508 /* Limit TSO burst size */
509 static int			hn_tso_maxlen = IP_MAXPACKET;
510 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
511     &hn_tso_maxlen, 0, "TSO burst limit");
512 
513 /* Limit chimney send size */
514 static int			hn_tx_chimney_size = 0;
515 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
516     &hn_tx_chimney_size, 0, "Chimney send packet size limit");
517 
518 /* Limit the size of packet for direct transmission */
519 static int			hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
520 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
521     &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
522 
523 /* # of LRO entries per RX ring */
524 #if defined(INET) || defined(INET6)
525 static int			hn_lro_entry_count = HN_LROENT_CNT_DEF;
526 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
527     &hn_lro_entry_count, 0, "LRO entry count");
528 #endif
529 
530 static int			hn_tx_taskq_cnt = 1;
531 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
532     &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
533 
534 #define HN_TX_TASKQ_M_INDEP	0
535 #define HN_TX_TASKQ_M_GLOBAL	1
536 #define HN_TX_TASKQ_M_EVTTQ	2
537 
538 static int			hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
539 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
540     &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
541     "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
542 
543 #ifndef HN_USE_TXDESC_BUFRING
544 static int			hn_use_txdesc_bufring = 0;
545 #else
546 static int			hn_use_txdesc_bufring = 1;
547 #endif
548 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
549     &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
550 
551 #ifdef HN_IFSTART_SUPPORT
552 /* Use ifnet.if_start instead of ifnet.if_transmit */
553 static int			hn_use_if_start = 0;
554 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
555     &hn_use_if_start, 0, "Use if_start TX method");
556 #endif
557 
558 /* # of channels to use */
559 static int			hn_chan_cnt = 0;
560 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
561     &hn_chan_cnt, 0,
562     "# of channels to use; each channel has one RX ring and one TX ring");
563 
564 /* # of transmit rings to use */
565 static int			hn_tx_ring_cnt = 0;
566 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
567     &hn_tx_ring_cnt, 0, "# of TX rings to use");
568 
569 /* Software TX ring depth */
570 static int			hn_tx_swq_depth = 0;
571 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
572     &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
573 
574 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
575 static u_int			hn_lro_mbufq_depth = 0;
576 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
577     &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
578 
579 /* Packet transmission aggregation size limit */
580 static int			hn_tx_agg_size = -1;
581 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
582     &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
583 
584 /* Packet transmission aggregation count limit */
585 static int			hn_tx_agg_pkts = -1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
587     &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
588 
589 /* VF list */
590 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist,
591     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
592     hn_vflist_sysctl, "A",
593     "VF list");
594 
595 /* VF mapping */
596 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap,
597     CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0,
598     hn_vfmap_sysctl, "A",
599     "VF mapping");
600 
601 /* Transparent VF */
602 static int			hn_xpnt_vf = 1;
603 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
604     &hn_xpnt_vf, 0, "Transparent VF mode");
605 
606 /* Accurate BPF support for Transparent VF */
607 static int			hn_xpnt_vf_accbpf = 0;
608 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
609     &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
610 
611 /* Extra wait for transparent VF attach routine; unit: seconds. */
612 static int			hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
613 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
614     &hn_xpnt_vf_attwait, 0,
615     "Extra wait for transparent VF attach routing; unit: seconds");
616 
617 static u_int			hn_cpu_index;	/* next CPU for channel */
618 static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */
619 
620 static struct rmlock		hn_vfmap_lock;
621 static int			hn_vfmap_size;
622 static if_t			*hn_vfmap;
623 
624 #ifndef RSS
625 static const uint8_t
626 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
627 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
628 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
629 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
630 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
631 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
632 };
633 #endif	/* !RSS */
634 
635 static const struct hyperv_guid	hn_guid = {
636 	.hv_guid = {
637 	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
638 	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
639 };
640 
641 static device_method_t hn_methods[] = {
642 	/* Device interface */
643 	DEVMETHOD(device_probe,		hn_probe),
644 	DEVMETHOD(device_attach,	hn_attach),
645 	DEVMETHOD(device_detach,	hn_detach),
646 	DEVMETHOD(device_shutdown,	hn_shutdown),
647 	DEVMETHOD_END
648 };
649 
650 static driver_t hn_driver = {
651 	"hn",
652 	hn_methods,
653 	sizeof(struct hn_softc)
654 };
655 
656 DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0);
657 MODULE_VERSION(hn, 1);
658 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
659 
660 static void
661 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
662 {
663 	int i;
664 
665 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
666 		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
667 }
668 
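/*
 * Send an RNDIS data packet described by the TX ring's GPA
 * (guest physical address) array, i.e. the scatter-gather path;
 * no chimney sending buffer is involved.
 */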
669 static int
670 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
671 {
672 
673 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
674 	    txd->chim_size == 0, ("invalid rndis sglist txd"));
675 	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
676 	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
677 }
678 
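/*
 * Send an RNDIS data packet that was already copied into the
 * chimney sending buffer slot recorded in the TX descriptor.
 */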
679 static int
680 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
681 {
682 	struct hn_nvs_rndis rndis;
683 
684 	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
685 	    txd->chim_size > 0, ("invalid rndis chim txd"));
686 
687 	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
688 	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
689 	rndis.nvs_chim_idx = txd->chim_index;
690 	rndis.nvs_chim_sz = txd->chim_size;
691 
692 	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
693 	    &rndis, sizeof(rndis), &txd->send_ctx));
694 }
695 
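/*
 * Allocate a chimney sending buffer slot by scanning the allocation
 * bitmap; atomic_testandset_long() keeps the allocation lock-free.
 * Returns HN_NVS_CHIM_IDX_INVALID if no slot is available.
 */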
696 static __inline uint32_t
697 hn_chim_alloc(struct hn_softc *sc)
698 {
699 	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
700 	u_long *bmap = sc->hn_chim_bmap;
701 	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
702 
703 	for (i = 0; i < bmap_cnt; ++i) {
704 		int idx;
705 
706 		idx = ffsl(~bmap[i]);
707 		if (idx == 0)
708 			continue;
709 
710 		--idx; /* ffsl is 1-based */
711 		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
712 		    ("invalid i %d and idx %d", i, idx));
713 
714 		if (atomic_testandset_long(&bmap[i], idx))
715 			continue;
716 
717 		ret = i * LONG_BIT + idx;
718 		break;
719 	}
720 	return (ret);
721 }
722 
723 static __inline void
724 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
725 {
726 	u_long mask;
727 	uint32_t idx;
728 
729 	idx = chim_idx / LONG_BIT;
730 	KASSERT(idx < sc->hn_chim_bmap_cnt,
731 	    ("invalid chimney index 0x%x", chim_idx));
732 
733 	mask = 1UL << (chim_idx % LONG_BIT);
734 	KASSERT(sc->hn_chim_bmap[idx] & mask,
735 	    ("index bitmap 0x%lx, chimney index %u, "
736 	     "bitmap idx %d, bitmask 0x%lx",
737 	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));
738 
739 	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
740 }
741 
742 #if defined(INET6) || defined(INET)
743 
744 #define PULLUP_HDR(m, len)				\
745 do {							\
746 	if (__predict_false((m)->m_len < (len))) {	\
747 		(m) = m_pullup((m), (len));		\
748 		if ((m) == NULL)			\
749 			return (NULL);			\
750 	}						\
751 } while (0)
752 
753 /*
754  * NOTE: If this function failed, the m_head would be freed.
755  */
756 static __inline struct mbuf *
757 hn_tso_fixup(struct mbuf *m_head)
758 {
759 	struct ether_vlan_header *evl;
760 	struct tcphdr *th;
761 	int ehlen;
762 
763 	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
764 
765 	PULLUP_HDR(m_head, sizeof(*evl));
766 	evl = mtod(m_head, struct ether_vlan_header *);
767 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
768 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
769 	else
770 		ehlen = ETHER_HDR_LEN;
771 	m_head->m_pkthdr.l2hlen = ehlen;
772 
773 #ifdef INET
774 	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
775 		struct ip *ip;
776 		int iphlen;
777 
778 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
779 		ip = mtodo(m_head, ehlen);
780 		iphlen = ip->ip_hl << 2;
781 		m_head->m_pkthdr.l3hlen = iphlen;
782 
783 		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
784 		th = mtodo(m_head, ehlen + iphlen);
785 
786 		ip->ip_len = 0;
787 		ip->ip_sum = 0;
788 		th->th_sum = in_pseudo(ip->ip_src.s_addr,
789 		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
790 	}
791 #endif
792 #if defined(INET6) && defined(INET)
793 	else
794 #endif
795 #ifdef INET6
796 	{
797 		struct ip6_hdr *ip6;
798 
799 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
800 		ip6 = mtodo(m_head, ehlen);
801 		if (ip6->ip6_nxt != IPPROTO_TCP) {
802 			m_freem(m_head);
803 			return (NULL);
804 		}
805 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
806 
807 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
808 		th = mtodo(m_head, ehlen + sizeof(*ip6));
809 
810 		ip6->ip6_plen = 0;
811 		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
812 	}
813 #endif
814 	return (m_head);
815 }
816 
817 /*
818  * NOTE: If this function failed, the m_head would be freed.
819  */
820 static __inline struct mbuf *
821 hn_set_hlen(struct mbuf *m_head)
822 {
823 	const struct ether_vlan_header *evl;
824 	int ehlen;
825 
826 	PULLUP_HDR(m_head, sizeof(*evl));
827 	evl = mtod(m_head, const struct ether_vlan_header *);
828 	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
829 		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
830 	else
831 		ehlen = ETHER_HDR_LEN;
832 	m_head->m_pkthdr.l2hlen = ehlen;
833 
834 #ifdef INET
835 	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
836 		const struct ip *ip;
837 		int iphlen;
838 
839 		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
840 		ip = mtodo(m_head, ehlen);
841 		iphlen = ip->ip_hl << 2;
842 		m_head->m_pkthdr.l3hlen = iphlen;
843 
844 		/*
845 		 * UDP checksum offload does not work in Azure, if the
846 		 * following conditions are met:
847 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
848 		 * - IP_DF is not set in the IP hdr.
849 		 *
850 		 * Fallback to software checksum for these UDP datagrams.
851 		 */
852 		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
853 		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
854 		    (ntohs(ip->ip_off) & IP_DF) == 0) {
855 			uint16_t off = ehlen + iphlen;
856 
857 			counter_u64_add(hn_udpcs_fixup, 1);
858 			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
859 			*(uint16_t *)(m_head->m_data + off +
860                             m_head->m_pkthdr.csum_data) = in_cksum_skip(
861 			    m_head, m_head->m_pkthdr.len, off);
862 			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
863 		}
864 	}
865 #endif
866 #if defined(INET6) && defined(INET)
867 	else
868 #endif
869 #ifdef INET6
870 	{
871 		const struct ip6_hdr *ip6;
872 
873 		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
874 		ip6 = mtodo(m_head, ehlen);
875 		if (ip6->ip6_nxt != IPPROTO_TCP &&
876 		    ip6->ip6_nxt != IPPROTO_UDP) {
877 			m_freem(m_head);
878 			return (NULL);
879 		}
880 		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
881 	}
882 #endif
883 	return (m_head);
884 }
885 
886 /*
887  * NOTE: If this function failed, the m_head would be freed.
888  */
889 static __inline struct mbuf *
890 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
891 {
892 	const struct tcphdr *th;
893 	int ehlen, iphlen;
894 
895 	*tcpsyn = 0;
896 	ehlen = m_head->m_pkthdr.l2hlen;
897 	iphlen = m_head->m_pkthdr.l3hlen;
898 
899 	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
900 	th = mtodo(m_head, ehlen + iphlen);
901 	if (th->th_flags & TH_SYN)
902 		*tcpsyn = 1;
903 	return (m_head);
904 }
905 
906 #undef PULLUP_HDR
907 
908 #endif	/* INET6 || INET */
909 
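/*
 * Program the RNDIS RX filter, but only if it differs from the
 * value cached in the softc.
 */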
910 static int
911 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
912 {
913 	int error = 0;
914 
915 	HN_LOCK_ASSERT(sc);
916 
917 	if (sc->hn_rx_filter != filter) {
918 		error = hn_rndis_set_rxfilter(sc, filter);
919 		if (!error)
920 			sc->hn_rx_filter = filter;
921 	}
922 	return (error);
923 }
924 
925 static int
926 hn_rxfilter_config(struct hn_softc *sc)
927 {
928 	if_t ifp = sc->hn_ifp;
929 	uint32_t filter;
930 
931 	HN_LOCK_ASSERT(sc);
932 
933 	/*
934 	 * If the non-transparent mode VF is activated, we don't know how
935 	 * its RX filter is configured, so stick the synthetic device in
936 	 * the promiscous mode.
937 	 * promiscuous mode.
938 	if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
939 		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
940 	} else {
941 		filter = NDIS_PACKET_TYPE_DIRECTED;
942 		if (if_getflags(ifp) & IFF_BROADCAST)
943 			filter |= NDIS_PACKET_TYPE_BROADCAST;
944 		/* TODO: support multicast list */
945 		if ((if_getflags(ifp) & IFF_ALLMULTI) ||
946 		    !if_maddr_empty(ifp))
947 			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
948 	}
949 	return (hn_set_rxfilter(sc, filter));
950 }
951 
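/*
 * Compute the TX aggregation limits from the tunables and the limits
 * reported by the host (RNDIS aggregation size/packet count and the
 * chimney buffer size), then propagate the results to all TX rings
 * under their TX locks.
 */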
952 static void
953 hn_set_txagg(struct hn_softc *sc)
954 {
955 	uint32_t size, pkts;
956 	int i;
957 
958 	/*
959 	 * Setup aggregation size.
960 	 */
961 	if (sc->hn_agg_size < 0)
962 		size = UINT32_MAX;
963 	else
964 		size = sc->hn_agg_size;
965 
966 	if (sc->hn_rndis_agg_size < size)
967 		size = sc->hn_rndis_agg_size;
968 
969 	/* NOTE: We only aggregate packets using chimney sending buffers. */
970 	if (size > (uint32_t)sc->hn_chim_szmax)
971 		size = sc->hn_chim_szmax;
972 
973 	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
974 		/* Disable */
975 		size = 0;
976 		pkts = 0;
977 		goto done;
978 	}
979 
980 	/* NOTE: Type of the per TX ring setting is 'int'. */
981 	if (size > INT_MAX)
982 		size = INT_MAX;
983 
984 	/*
985 	 * Setup aggregation packet count.
986 	 */
987 	if (sc->hn_agg_pkts < 0)
988 		pkts = UINT32_MAX;
989 	else
990 		pkts = sc->hn_agg_pkts;
991 
992 	if (sc->hn_rndis_agg_pkts < pkts)
993 		pkts = sc->hn_rndis_agg_pkts;
994 
995 	if (pkts <= 1) {
996 		/* Disable */
997 		size = 0;
998 		pkts = 0;
999 		goto done;
1000 	}
1001 
1002 	/* NOTE: Type of the per TX ring setting is 'short'. */
1003 	if (pkts > SHRT_MAX)
1004 		pkts = SHRT_MAX;
1005 
1006 done:
1007 	/* NOTE: Type of the per TX ring setting is 'short'. */
1008 	if (sc->hn_rndis_agg_align > SHRT_MAX) {
1009 		/* Disable */
1010 		size = 0;
1011 		pkts = 0;
1012 	}
1013 
1014 	if (bootverbose) {
1015 		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1016 		    size, pkts, sc->hn_rndis_agg_align);
1017 	}
1018 
1019 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1020 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1021 
1022 		mtx_lock(&txr->hn_tx_lock);
1023 		txr->hn_agg_szmax = size;
1024 		txr->hn_agg_pktmax = pkts;
1025 		txr->hn_agg_align = sc->hn_rndis_agg_align;
1026 		mtx_unlock(&txr->hn_tx_lock);
1027 	}
1028 }
1029 
1030 static int
1031 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1032 {
1033 
1034 	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1035 	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1036 		return txr->hn_txdesc_cnt;
1037 	return hn_tx_swq_depth;
1038 }
1039 
1040 static int
1041 hn_rss_reconfig(struct hn_softc *sc)
1042 {
1043 	int error;
1044 
1045 	HN_LOCK_ASSERT(sc);
1046 
1047 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1048 		return (ENXIO);
1049 
1050 	/*
1051 	 * Disable RSS first.
1052 	 *
1053 	 * NOTE:
1054 	 * Direct reconfiguration by setting the UNCHG flags does
1055 	 * _not_ work properly.
1056 	 */
1057 	if (bootverbose)
1058 		if_printf(sc->hn_ifp, "disable RSS\n");
1059 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1060 	if (error) {
1061 		if_printf(sc->hn_ifp, "RSS disable failed\n");
1062 		return (error);
1063 	}
1064 
1065 	/*
1066 	 * Reenable the RSS w/ the updated RSS key or indirect
1067 	 * table.
1068 	 */
1069 	if (bootverbose)
1070 		if_printf(sc->hn_ifp, "reconfig RSS\n");
1071 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1072 	if (error) {
1073 		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1074 		return (error);
1075 	}
1076 	return (0);
1077 }
1078 
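/*
 * Clamp RSS indirect table entries that reference channels beyond
 * the number of RX rings currently in use.
 */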
1079 static void
1080 hn_rss_ind_fixup(struct hn_softc *sc)
1081 {
1082 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1083 	int i, nchan;
1084 
1085 	nchan = sc->hn_rx_ring_inuse;
1086 	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
1087 
1088 	/*
1089 	 * Check indirect table to make sure that all channels in it
1090 	 * can be used.
1091 	 */
1092 	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1093 		if (rss->rss_ind[i] >= nchan) {
1094 			if_printf(sc->hn_ifp,
1095 			    "RSS indirect table %d fixup: %u -> %d\n",
1096 			    i, rss->rss_ind[i], nchan - 1);
1097 			rss->rss_ind[i] = nchan - 1;
1098 		}
1099 	}
1100 }
1101 
1102 static int
1103 hn_ifmedia_upd(if_t ifp __unused)
1104 {
1105 
1106 	/* Ignore since autoselect is the only defined and valid media */
1107 	return (0);
1108 }
1109 
1110 static void
1111 hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
1112 {
1113 	struct hn_softc *sc = if_getsoftc(ifp);
1114 
1115 	ifmr->ifm_status = IFM_AVALID;
1116 	ifmr->ifm_active = IFM_ETHER;
1117 
1118 	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1119 		ifmr->ifm_active |= IFM_NONE;
1120 		return;
1121 	}
1122 	ifmr->ifm_status |= IFM_ACTIVE;
1123 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1124 }
1125 
1126 static void
1127 hn_rxvf_set_task(void *xarg, int pending __unused)
1128 {
1129 	struct hn_rxvf_setarg *arg = xarg;
1130 
1131 	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1132 }
1133 
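/*
 * Update the VF ifnet pointer of each RX ring.  For rings currently
 * in use the update is run on the corresponding channel's task via
 * vmbus_chan_run_task().
 */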
1134 static void
1135 hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp)
1136 {
1137 	struct hn_rx_ring *rxr;
1138 	struct hn_rxvf_setarg arg;
1139 	struct task task;
1140 	int i;
1141 
1142 	HN_LOCK_ASSERT(sc);
1143 
1144 	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1145 
1146 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1147 		rxr = &sc->hn_rx_ring[i];
1148 
1149 		if (i < sc->hn_rx_ring_inuse) {
1150 			arg.rxr = rxr;
1151 			arg.vf_ifp = vf_ifp;
1152 			vmbus_chan_run_task(rxr->hn_chan, &task);
1153 		} else {
1154 			rxr->hn_rxvf_ifp = vf_ifp;
1155 		}
1156 	}
1157 }
1158 
1159 static bool
1160 hn_ismyvf(const struct hn_softc *sc, const if_t ifp)
1161 {
1162 	if_t hn_ifp;
1163 
1164 	hn_ifp = sc->hn_ifp;
1165 
1166 	if (ifp == hn_ifp)
1167 		return (false);
1168 
1169 	if (if_getalloctype(ifp) != IFT_ETHER)
1170 		return (false);
1171 
1172 	/* Ignore lagg/vlan interfaces */
1173 	if (strcmp(if_getdname(ifp), "lagg") == 0 ||
1174 	    strcmp(if_getdname(ifp), "vlan") == 0)
1175 		return (false);
1176 
1177 	/*
1178 	 * During detach events if_getifaddr(ifp) might be NULL.
1179 	 * Make sure the bcmp() below doesn't panic on that:
1180 	 */
1181 	if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL)
1182 		return (false);
1183 
1184 	if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0)
1185 		return (false);
1186 
1187 	return (true);
1188 }
1189 
1190 static void
1191 hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf)
1192 {
1193 	if_t hn_ifp;
1194 
1195 	HN_LOCK(sc);
1196 
1197 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1198 		goto out;
1199 
1200 	if (!hn_ismyvf(sc, ifp))
1201 		goto out;
1202 	hn_ifp = sc->hn_ifp;
1203 
1204 	if (rxvf) {
1205 		if (sc->hn_flags & HN_FLAG_RXVF)
1206 			goto out;
1207 
1208 		sc->hn_flags |= HN_FLAG_RXVF;
1209 		hn_rxfilter_config(sc);
1210 	} else {
1211 		if (!(sc->hn_flags & HN_FLAG_RXVF))
1212 			goto out;
1213 
1214 		sc->hn_flags &= ~HN_FLAG_RXVF;
1215 		if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING)
1216 			hn_rxfilter_config(sc);
1217 		else
1218 			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1219 	}
1220 
1221 	hn_nvs_set_datapath(sc,
1222 	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1223 
1224 	hn_rxvf_set(sc, rxvf ? ifp : NULL);
1225 
1226 	if (rxvf) {
1227 		hn_vf_rss_fixup(sc, true);
1228 		hn_suspend_mgmt(sc);
1229 		sc->hn_link_flags &=
1230 		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1231 		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1232 	} else {
1233 		hn_vf_rss_restore(sc);
1234 		hn_resume_mgmt(sc);
1235 	}
1236 
1237 	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
1238 	    rxvf ? "VF_UP" : "VF_DOWN", NULL);
1239 
1240 	if (bootverbose) {
1241 		if_printf(hn_ifp, "datapath is switched %s %s\n",
1242 		    rxvf ? "to" : "from", if_name(ifp));
1243 	}
1244 out:
1245 	HN_UNLOCK(sc);
1246 }
1247 
1248 static void
1249 hn_ifnet_event(void *arg, if_t ifp, int event)
1250 {
1251 
1252 	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1253 		return;
1254 	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1255 }
1256 
1257 static void
1258 hn_ifaddr_event(void *arg, if_t ifp)
1259 {
1260 
1261 	hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP);
1262 }
1263 
1264 static int
1265 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr __unused)
1266 {
1267 	if_t ifp, vf_ifp;
1268 
1269 	HN_LOCK_ASSERT(sc);
1270 	ifp = sc->hn_ifp;
1271 	vf_ifp = sc->hn_vf_ifp;
1272 
1273 	/*
1274 	 * Just sync up with VF's enabled capabilities.
1275 	 */
1276 	if_setcapenable(ifp, if_getcapenable(vf_ifp));
1277 	if_sethwassist(ifp, if_gethwassist(vf_ifp));
1278 
1279 	return (0);
1280 }
1281 
1282 static int
1283 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1284 {
1285 	if_t vf_ifp;
1286 	struct ifreq ifr;
1287 
1288 	HN_LOCK_ASSERT(sc);
1289 	vf_ifp = sc->hn_vf_ifp;
1290 
1291 	memset(&ifr, 0, sizeof(ifr));
1292 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1293 	ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff;
1294 	ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16;
1295 	return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread));
1296 }
1297 
1298 static void
1299 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1300 {
1301 	if_t ifp = sc->hn_ifp;
1302 	int allmulti = 0;
1303 
1304 	HN_LOCK_ASSERT(sc);
1305 
1306 	/* XXX vlan(4) style mcast addr maintenance */
1307 	if (!if_maddr_empty(ifp))
1308 		allmulti = IFF_ALLMULTI;
1309 
1310 	/* Always set the VF's if_flags */
1311 	if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti);
1312 }
1313 
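/*
 * Input path for mbufs received on the transparent VF: tap BPF on
 * the VF, update the VF and hn(4) counters, fix up rcvif and hand
 * the packets to hn(4)'s if_input, so the traffic shows up on the
 * synthetic interface.  If no hn(4) interface is mapped to the VF,
 * the mbuf chain is freed.
 */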
1314 static void
1315 hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m)
1316 {
1317 	struct rm_priotracker pt;
1318 	if_t hn_ifp = NULL;
1319 	struct mbuf *mn;
1320 
1321 	/*
1322 	 * XXX racy, if hn(4) ever detached.
1323 	 */
1324 	rm_rlock(&hn_vfmap_lock, &pt);
1325 	if (if_getindex(vf_ifp) < hn_vfmap_size)
1326 		hn_ifp = hn_vfmap[if_getindex(vf_ifp)];
1327 	rm_runlock(&hn_vfmap_lock, &pt);
1328 
1329 	if (hn_ifp != NULL) {
1330 		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1331 			/*
1332 			 * Allow tapping on the VF.
1333 			 */
1334 			ETHER_BPF_MTAP(vf_ifp, mn);
1335 
1336 			/*
1337 			 * Update VF stats.
1338 			 */
1339 			if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) {
1340 				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1341 				    mn->m_pkthdr.len);
1342 			}
1343 			/*
1344 			 * XXX IFCOUNTER_IMCAST
1345 			 * This stat updating is kinda invasive, since it
1346 			 * requires two checks on the mbuf: the length check
1347 			 * and the ethernet header check.  As of this write,
1348 			 * and the ethernet header check.  As of this writing,
1349 			 * all multicast packets go directly to hn(4), which
1350 			 * makes imcast stat updating in the VF a try in vain.
1351 
1352 			/*
1353 			 * Fix up rcvif and increase hn(4)'s ipackets.
1354 			 */
1355 			mn->m_pkthdr.rcvif = hn_ifp;
1356 			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1357 		}
1358 		/*
1359 		 * Go through hn(4)'s if_input.
1360 		 */
1361 		if_input(hn_ifp, m);
1362 	} else {
1363 		/*
1364 		 * In the middle of the transition; free this
1365 		 * mbuf chain.
1366 		 */
1367 		while (m != NULL) {
1368 			mn = m->m_nextpkt;
1369 			m->m_nextpkt = NULL;
1370 			m_freem(m);
1371 			m = mn;
1372 		}
1373 	}
1374 }
1375 
1376 static void
1377 hn_mtu_change_fixup(struct hn_softc *sc)
1378 {
1379 	if_t ifp;
1380 
1381 	HN_LOCK_ASSERT(sc);
1382 	ifp = sc->hn_ifp;
1383 
1384 	hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp));
1385 	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1386 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1387 }
1388 
1389 static uint32_t
1390 hn_rss_type_fromndis(uint32_t rss_hash)
1391 {
1392 	uint32_t types = 0;
1393 
1394 	if (rss_hash & NDIS_HASH_IPV4)
1395 		types |= RSS_TYPE_IPV4;
1396 	if (rss_hash & NDIS_HASH_TCP_IPV4)
1397 		types |= RSS_TYPE_TCP_IPV4;
1398 	if (rss_hash & NDIS_HASH_IPV6)
1399 		types |= RSS_TYPE_IPV6;
1400 	if (rss_hash & NDIS_HASH_IPV6_EX)
1401 		types |= RSS_TYPE_IPV6_EX;
1402 	if (rss_hash & NDIS_HASH_TCP_IPV6)
1403 		types |= RSS_TYPE_TCP_IPV6;
1404 	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1405 		types |= RSS_TYPE_TCP_IPV6_EX;
1406 	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1407 		types |= RSS_TYPE_UDP_IPV4;
1408 	return (types);
1409 }
1410 
1411 static uint32_t
1412 hn_rss_type_tondis(uint32_t types)
1413 {
1414 	uint32_t rss_hash = 0;
1415 
1416 	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1417 	    ("UDP6 and UDP6EX are not supported"));
1418 
1419 	if (types & RSS_TYPE_IPV4)
1420 		rss_hash |= NDIS_HASH_IPV4;
1421 	if (types & RSS_TYPE_TCP_IPV4)
1422 		rss_hash |= NDIS_HASH_TCP_IPV4;
1423 	if (types & RSS_TYPE_IPV6)
1424 		rss_hash |= NDIS_HASH_IPV6;
1425 	if (types & RSS_TYPE_IPV6_EX)
1426 		rss_hash |= NDIS_HASH_IPV6_EX;
1427 	if (types & RSS_TYPE_TCP_IPV6)
1428 		rss_hash |= NDIS_HASH_TCP_IPV6;
1429 	if (types & RSS_TYPE_TCP_IPV6_EX)
1430 		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1431 	if (types & RSS_TYPE_UDP_IPV4)
1432 		rss_hash |= NDIS_HASH_UDP_IPV4_X;
1433 	return (rss_hash);
1434 }
1435 
1436 static void
1437 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1438 {
1439 	int i;
1440 
1441 	HN_LOCK_ASSERT(sc);
1442 
1443 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1444 		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1445 }
1446 
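/*
 * Merge the VF's Toeplitz RSS key and hash types into the synthetic
 * device's RSS configuration, so that both datapaths hash packets
 * consistently; hash types that would conflict are excluded from
 * mbuf hash delivery instead of being disabled.
 */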
1447 static void
1448 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1449 {
1450 	if_t ifp, vf_ifp;
1451 	struct ifrsshash ifrh;
1452 	struct ifrsskey ifrk;
1453 	int error;
1454 	uint32_t my_types, diff_types, mbuf_types = 0;
1455 
1456 	HN_LOCK_ASSERT(sc);
1457 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1458 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1459 
1460 	if (sc->hn_rx_ring_inuse == 1) {
1461 		/* No RSS on synthetic parts; done. */
1462 		return;
1463 	}
1464 	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1465 		/* Synthetic parts do not support Toeplitz; done. */
1466 		return;
1467 	}
1468 
1469 	ifp = sc->hn_ifp;
1470 	vf_ifp = sc->hn_vf_ifp;
1471 
1472 	/*
1473 	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
1474 	 * supported.
1475 	 */
1476 	memset(&ifrk, 0, sizeof(ifrk));
1477 	strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name));
1478 	error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread);
1479 	if (error) {
1480 		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1481 		    if_name(vf_ifp), error);
1482 		goto done;
1483 	}
1484 	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1485 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1486 		    if_name(vf_ifp), ifrk.ifrk_func);
1487 		goto done;
1488 	}
1489 	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1490 		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1491 		    if_name(vf_ifp), ifrk.ifrk_keylen);
1492 		goto done;
1493 	}
1494 
1495 	/*
1496 	 * Extract VF's RSS hash.  Only Toeplitz is supported.
1497 	 */
1498 	memset(&ifrh, 0, sizeof(ifrh));
1499 	strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name));
1500 	error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread);
1501 	if (error) {
1502 		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1503 		    if_name(vf_ifp), error);
1504 		goto done;
1505 	}
1506 	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1507 		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1508 		    if_name(vf_ifp), ifrh.ifrh_func);
1509 		goto done;
1510 	}
1511 
1512 	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1513 	if ((ifrh.ifrh_types & my_types) == 0) {
1514 		/* This disables RSS; ignore it then */
1515 		if_printf(ifp, "%s intersection of RSS types failed.  "
1516 		    "VF %#x, mine %#x\n", if_name(vf_ifp),
1517 		    ifrh.ifrh_types, my_types);
1518 		goto done;
1519 	}
1520 
1521 	diff_types = my_types ^ ifrh.ifrh_types;
1522 	my_types &= ifrh.ifrh_types;
1523 	mbuf_types = my_types;
1524 
1525 	/*
1526 	 * Detect RSS hash value/type conflicts.
1527 	 *
1528 	 * NOTE:
1529 	 * We don't disable the hash type, but stop delivering the hash
1530 	 * value/type through mbufs on the RX path.
1531 	 *
1532 	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
1533 	 * hash is delivered with type of TCP_IPV4.  This means if
1534 	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
1535 	 * least to hn_mbuf_hash.  However, given that _all_ of the
1536 	 * NICs implement TCP_IPV4, this will _not_ impose any issues
1537 	 * here.
1538 	 */
1539 	if ((my_types & RSS_TYPE_IPV4) &&
1540 	    (diff_types & ifrh.ifrh_types &
1541 	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1542 		/* Conflict; disable IPV4 hash type/value delivery. */
1543 		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1544 		mbuf_types &= ~RSS_TYPE_IPV4;
1545 	}
1546 	if ((my_types & RSS_TYPE_IPV6) &&
1547 	    (diff_types & ifrh.ifrh_types &
1548 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1549 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1550 	      RSS_TYPE_IPV6_EX))) {
1551 		/* Conflict; disable IPV6 hash type/value delivery. */
1552 		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1553 		mbuf_types &= ~RSS_TYPE_IPV6;
1554 	}
1555 	if ((my_types & RSS_TYPE_IPV6_EX) &&
1556 	    (diff_types & ifrh.ifrh_types &
1557 	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1558 	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1559 	      RSS_TYPE_IPV6))) {
1560 		/* Conflict; disable IPV6_EX hash type/value delivery. */
1561 		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1562 		mbuf_types &= ~RSS_TYPE_IPV6_EX;
1563 	}
1564 	if ((my_types & RSS_TYPE_TCP_IPV6) &&
1565 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1566 		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
1567 		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1568 		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1569 	}
1570 	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1571 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1572 		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1573 		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1574 		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1575 	}
1576 	if ((my_types & RSS_TYPE_UDP_IPV6) &&
1577 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1578 		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
1579 		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1580 		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1581 	}
1582 	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1583 	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1584 		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1585 		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1586 		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1587 	}
1588 
1589 	/*
1590 	 * Indirect table does not matter.
1591 	 */
1592 
1593 	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1594 	    hn_rss_type_tondis(my_types);
1595 	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1596 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1597 
1598 	if (reconf) {
1599 		error = hn_rss_reconfig(sc);
1600 		if (error) {
1601 			/* XXX roll-back? */
1602 			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1603 			/* XXX keep going. */
1604 		}
1605 	}
1606 done:
1607 	/* Hash deliverability for mbufs. */
1608 	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1609 }
1610 
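/*
 * Restore the synthetic device's own RSS hash types and mbuf hash
 * delivery; the RSS key is left untouched.
 */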
1611 static void
1612 hn_vf_rss_restore(struct hn_softc *sc)
1613 {
1614 
1615 	HN_LOCK_ASSERT(sc);
1616 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1617 	    ("%s: synthetic parts are not attached", if_name(sc->hn_ifp)));
1618 
1619 	if (sc->hn_rx_ring_inuse == 1)
1620 		goto done;
1621 
1622 	/*
1623 	 * Restore hash types.  Key does _not_ matter.
1624 	 */
1625 	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1626 		int error;
1627 
1628 		sc->hn_rss_hash = sc->hn_rss_hcap;
1629 		error = hn_rss_reconfig(sc);
1630 		if (error) {
1631 			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1632 			    error);
1633 			/* XXX keep going. */
1634 		}
1635 	}
1636 done:
1637 	/* Hash deliverability for mbufs. */
1638 	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1639 }
1640 
1641 static void
1642 hn_xpnt_vf_setready(struct hn_softc *sc)
1643 {
1644 	if_t ifp, vf_ifp;
1645 	struct ifreq ifr;
1646 
1647 	HN_LOCK_ASSERT(sc);
1648 	ifp = sc->hn_ifp;
1649 	vf_ifp = sc->hn_vf_ifp;
1650 
1651 	/*
1652 	 * Mark the VF ready.
1653 	 */
1654 	sc->hn_vf_rdytick = 0;
1655 
1656 	/*
1657 	 * Save information for restoration.
1658 	 */
1659 	sc->hn_saved_caps = if_getcapabilities(ifp);
1660 	sc->hn_saved_tsomax = if_gethwtsomax(ifp);
1661 	sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp);
1662 	sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp);
1663 	sc->hn_saved_capenable = if_getcapenable(ifp);
1664 	sc->hn_saved_hwassist = if_gethwassist(ifp);
1665 
1666 	/*
1667 	 * Intersect supported/enabled capabilities.
1668 	 *
1669 	 * NOTE:
1670 	 * if_hwassist is not changed here.
1671 	 */
1672 	if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp));
1673 	if_setcapenablebit(ifp, 0, if_getcapabilities(ifp));
1674 
1675 	/*
1676 	 * Fix TSO settings.
1677 	 */
1678 	if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp))
1679 		if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp));
1680 	if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp))
1681 		if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp));
1682 	if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp))
1683 		if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp));
1684 
1685 	/*
1686 	 * Change VF's enabled capabilities.
1687 	 */
1688 	memset(&ifr, 0, sizeof(ifr));
1689 	strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1690 	ifr.ifr_reqcap = if_getcapenable(ifp);
1691 	hn_xpnt_vf_iocsetcaps(sc, &ifr);
1692 
1693 	if (if_getmtu(ifp) != ETHERMTU) {
1694 		int error;
1695 
1696 		/*
1697 		 * Change VF's MTU.
1698 		 */
1699 		memset(&ifr, 0, sizeof(ifr));
1700 		strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name));
1701 		ifr.ifr_mtu = if_getmtu(ifp);
1702 		error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread);
1703 		if (error) {
1704 			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
1705 			    if_name(vf_ifp), if_getmtu(ifp));
1706 			if (if_getmtu(ifp) > ETHERMTU) {
1707 				if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1708 
1709 				/*
1710 				 * XXX
1711 				 * No need to adjust the synthetic parts' MTU;
1712 				 * failure of the adjustment will cause us
1713 				 * infinite headache.
1714 				 */
1715 				if_setmtu(ifp, ETHERMTU);
1716 				hn_mtu_change_fixup(sc);
1717 			}
1718 		}
1719 	}
1720 }
1721 
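/*
 * NOTE:
 * hn_vf_rdytick does double duty below: 0 means the VF has already
 * been marked ready, while a non-zero value is the tick deadline set
 * by hn_ifnet_attevent(); once that deadline has passed, the VF is
 * marked ready on the spot.
 */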
1722 static bool
1723 hn_xpnt_vf_isready(struct hn_softc *sc)
1724 {
1725 
1726 	HN_LOCK_ASSERT(sc);
1727 
1728 	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1729 		return (false);
1730 
1731 	if (sc->hn_vf_rdytick == 0)
1732 		return (true);
1733 
1734 	if (sc->hn_vf_rdytick > ticks)
1735 		return (false);
1736 
1737 	/* Mark VF as ready. */
1738 	hn_xpnt_vf_setready(sc);
1739 	return (true);
1740 }
1741 
1742 static void
1743 hn_xpnt_vf_setenable(struct hn_softc *sc)
1744 {
1745 	int i;
1746 
1747 	HN_LOCK_ASSERT(sc);
1748 
1749 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1750 	rm_wlock(&sc->hn_vf_lock);
1751 	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1752 	rm_wunlock(&sc->hn_vf_lock);
1753 
1754 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1755 		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1756 }
1757 
1758 static void
1759 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1760 {
1761 	int i;
1762 
1763 	HN_LOCK_ASSERT(sc);
1764 
1765 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1766 	rm_wlock(&sc->hn_vf_lock);
1767 	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1768 	if (clear_vf)
1769 		sc->hn_vf_ifp = NULL;
1770 	rm_wunlock(&sc->hn_vf_lock);
1771 
1772 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1773 		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1774 }
1775 
1776 static void
1777 hn_xpnt_vf_init(struct hn_softc *sc)
1778 {
1779 	int error;
1780 
1781 	HN_LOCK_ASSERT(sc);
1782 
1783 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1784 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1785 
1786 	if (bootverbose) {
1787 		if_printf(sc->hn_ifp, "try bringing up %s\n",
1788 		    if_name(sc->hn_vf_ifp));
1789 	}
1790 
1791 	/*
1792 	 * Bring the VF up.
1793 	 */
1794 	hn_xpnt_vf_saveifflags(sc);
1795 	if_setflagbits(sc->hn_ifp, IFF_UP, 0);
1796 	error = hn_xpnt_vf_iocsetflags(sc);
1797 	if (error) {
1798 		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1799 		    if_name(sc->hn_vf_ifp), error);
1800 		return;
1801 	}
1802 
1803 	/*
1804 	 * NOTE:
1805 	 * Datapath setting must happen _after_ bringing the VF up.
1806 	 */
1807 	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
1808 
1809 	/*
1810 	 * NOTE:
1811 	 * Fixup RSS related bits _after_ the VF is brought up, since
1812 	 * many VFs generate their RSS key during initialization.
1813 	 */
1814 	hn_vf_rss_fixup(sc, true);
1815 
1816 	/* Mark transparent mode VF as enabled. */
1817 	hn_xpnt_vf_setenable(sc);
1818 }
1819 
1820 static void
1821 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1822 {
1823 	struct hn_softc *sc = xsc;
1824 
1825 	HN_LOCK(sc);
1826 
1827 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1828 		goto done;
1829 	if (sc->hn_vf_ifp == NULL)
1830 		goto done;
1831 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1832 		goto done;
1833 
1834 	if (sc->hn_vf_rdytick != 0) {
1835 		/* Mark VF as ready. */
1836 		hn_xpnt_vf_setready(sc);
1837 	}
1838 
1839 	if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) {
1840 		/*
1841 		 * Delayed VF initialization.
1842 		 */
1843 		if (bootverbose) {
1844 			if_printf(sc->hn_ifp, "delayed initialize %s\n",
1845 			    if_name(sc->hn_vf_ifp));
1846 		}
1847 		hn_xpnt_vf_init(sc);
1848 	}
1849 done:
1850 	HN_UNLOCK(sc);
1851 }
1852 
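/*
 * NOTE:
 * The ether_ifattach event handler below only acts on the ifnet that
 * is this device's VF (hn_ismyvf()).  It records the VF in the global
 * hn_vfmap, growing the map on demand.  In transparent VF mode it
 * additionally redirects the VF's if_input to hn_xpnt_vf_input(),
 * suspends the synthetic link status management, and schedules the
 * delayed VF initialization hn_xpnt_vf_attwait seconds later.
 */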
1853 static void
1854 hn_ifnet_attevent(void *xsc, if_t ifp)
1855 {
1856 	struct hn_softc *sc = xsc;
1857 
1858 	HN_LOCK(sc);
1859 
1860 	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1861 		goto done;
1862 
1863 	if (!hn_ismyvf(sc, ifp))
1864 		goto done;
1865 
1866 	if (sc->hn_vf_ifp != NULL) {
1867 		if_printf(sc->hn_ifp, "%s was attached as VF\n",
1868 		    if_name(sc->hn_vf_ifp));
1869 		goto done;
1870 	}
1871 
1872 	if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) {
1873 		/*
1874 		 * ifnet.if_start is _not_ supported by transparent
1875 		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1876 		 */
1877 		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1878 		    "in transparent VF mode.\n", if_name(sc->hn_vf_ifp));
1879 
1880 		goto done;
1881 	}
1882 
1883 	rm_wlock(&hn_vfmap_lock);
1884 
1885 	if (if_getindex(ifp) >= hn_vfmap_size) {
1886 		if_t *newmap;
1887 		int newsize;
1888 
1889 		newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF;
1890 		newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF,
1891 		    M_WAITOK | M_ZERO);
1892 
1893 		memcpy(newmap, hn_vfmap,
1894 		    sizeof(if_t) * hn_vfmap_size);
1895 		free(hn_vfmap, M_DEVBUF);
1896 		hn_vfmap = newmap;
1897 		hn_vfmap_size = newsize;
1898 	}
1899 	KASSERT(hn_vfmap[if_getindex(ifp)] == NULL,
1900 	    ("%s: ifindex %d was mapped to %s",
1901 	     if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)])));
1902 	hn_vfmap[if_getindex(ifp)] = sc->hn_ifp;
1903 
1904 	rm_wunlock(&hn_vfmap_lock);
1905 
1906 	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1907 	rm_wlock(&sc->hn_vf_lock);
1908 	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1909 	    ("%s: transparent VF was enabled", if_name(sc->hn_ifp)));
1910 	sc->hn_vf_ifp = ifp;
1911 	rm_wunlock(&sc->hn_vf_lock);
1912 
1913 	if (hn_xpnt_vf) {
1914 		int wait_ticks;
1915 
1916 		/*
1917 		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1918 		 * Save vf_ifp's current if_input for later restoration.
1919 		 */
1920 		sc->hn_vf_input = if_getinputfn(ifp);
1921 		if_setinputfn(ifp, hn_xpnt_vf_input);
1922 
1923 		/*
1924 		 * Stop link status management; use the VF's.
1925 		 */
1926 		hn_suspend_mgmt(sc);
1927 
1928 		/*
1929 	 * Give the VF some time to complete its attach routine.
1930 		 */
1931 		wait_ticks = hn_xpnt_vf_attwait * hz;
1932 		sc->hn_vf_rdytick = ticks + wait_ticks;
1933 
1934 		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1935 		    wait_ticks);
1936 	}
1937 done:
1938 	HN_UNLOCK(sc);
1939 }
1940 
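/*
 * NOTE:
 * The departure event handler below undoes hn_ifnet_attevent().  In
 * transparent VF mode it drains the delayed initialization task,
 * restores the VF's saved if_input, switches the datapath back to the
 * synthetic parts, and restores the saved capability/TSO settings
 * (if the VF had been marked ready) and the RSS configuration; in all
 * cases the VF is marked disabled and its hn_vfmap entry is cleared.
 */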
1941 static void
1942 hn_ifnet_detevent(void *xsc, if_t ifp)
1943 {
1944 	struct hn_softc *sc = xsc;
1945 
1946 	HN_LOCK(sc);
1947 
1948 	if (sc->hn_vf_ifp == NULL)
1949 		goto done;
1950 
1951 	if (!hn_ismyvf(sc, ifp))
1952 		goto done;
1953 
1954 	if (hn_xpnt_vf) {
1955 		/*
1956 		 * Make sure that the delayed initialization is not running.
1957 		 *
1958 		 * NOTE:
1959 		 * - This lock _must_ be released, since the hn_vf_init task
1960 		 *   will try holding this lock.
1961 		 * - It is safe to release this lock here, since the
1962 		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1963 		 *
1964 		 * XXX racy, if hn(4) ever detached.
1965 		 */
1966 		HN_UNLOCK(sc);
1967 		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1968 		HN_LOCK(sc);
1969 
1970 		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1971 		    if_name(sc->hn_ifp)));
1972 		if_setinputfn(ifp, sc->hn_vf_input);
1973 		sc->hn_vf_input = NULL;
1974 
1975 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1976 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1977 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
1978 
1979 		if (sc->hn_vf_rdytick == 0) {
1980 			/*
1981 			 * The VF was ready; restore some settings.
1982 			 */
1983 			if_setcapabilities(ifp, sc->hn_saved_caps);
1984 
1985 			if_sethwtsomax(ifp, sc->hn_saved_tsomax);
1986 			if_sethwtsomaxsegcount(sc->hn_ifp,
1987 			    sc->hn_saved_tsosegcnt);
1988 			if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz);
1989 
1990 			if_setcapenable(ifp, sc->hn_saved_capenable);
1991 			if_sethwassist(ifp, sc->hn_saved_hwassist);
1992 		}
1993 
1994 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1995 			/*
1996 			 * Restore RSS settings.
1997 			 */
1998 			hn_vf_rss_restore(sc);
1999 
2000 			/*
2001 			 * Resume link status management, which was suspended
2002 			 * by hn_ifnet_attevent().
2003 			 */
2004 			hn_resume_mgmt(sc);
2005 		}
2006 	}
2007 
2008 	/* Mark transparent mode VF as disabled. */
2009 	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2010 
2011 	rm_wlock(&hn_vfmap_lock);
2012 
2013 	KASSERT(if_getindex(ifp) < hn_vfmap_size,
2014 	    ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size));
2015 	if (hn_vfmap[if_getindex(ifp)] != NULL) {
2016 		KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp,
2017 		    ("%s: ifindex %d was mapped to %s",
2018 		     if_name(ifp), if_getindex(ifp),
2019 		     if_name(hn_vfmap[if_getindex(ifp)])));
2020 		hn_vfmap[if_getindex(ifp)] = NULL;
2021 	}
2022 
2023 	rm_wunlock(&hn_vfmap_lock);
2024 done:
2025 	HN_UNLOCK(sc);
2026 }
2027 
2028 static void
2029 hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state)
2030 {
2031 	struct hn_softc *sc = xsc;
2032 
2033 	if (sc->hn_vf_ifp == ifp)
2034 		if_link_state_change(sc->hn_ifp, link_state);
2035 }
2036 
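/*
 * NOTE:
 * The three sysctl handlers below are read-only reporters: they
 * export the synthetic ifnet's current TSO limits and never write
 * the value back, even though sysctl_handle_int() is used.
 */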
2037 static int
2038 hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS)
2039 {
2040 	struct hn_softc *sc = arg1;
2041 	unsigned int tsomax;
2042 	int error;
2043 
2044 	tsomax = if_gethwtsomax(sc->hn_ifp);
2045 	error = sysctl_handle_int(oidp, &tsomax, 0, req);
2046 	return error;
2047 }
2048 
2049 static int
2050 hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS)
2051 {
2052 	struct hn_softc *sc = arg1;
2053 	unsigned int tsomaxsegcnt;
2054 	int error;
2055 
2056 	tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp);
2057 	error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req);
2058 	return error;
2059 }
2060 
2061 static int
2062 hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS)
2063 {
2064 	struct hn_softc *sc = arg1;
2065 	unsigned int tsomaxsegsz;
2066 	int error;
2067 
2068 	tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp);
2069 	error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req);
2070 	return error;
2071 }
2072 
2073 static int
2074 hn_probe(device_t dev)
2075 {
2076 
2077 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2078 		device_set_desc(dev, "Hyper-V Network Interface");
2079 		return BUS_PROBE_DEFAULT;
2080 	}
2081 	return ENXIO;
2082 }
2083 
2084 static int
2085 hn_attach(device_t dev)
2086 {
2087 	struct hn_softc *sc = device_get_softc(dev);
2088 	struct sysctl_oid_list *child;
2089 	struct sysctl_ctx_list *ctx;
2090 	uint8_t eaddr[ETHER_ADDR_LEN];
2091 	if_t ifp = NULL;
2092 	int error, ring_cnt, tx_ring_cnt;
2093 	uint32_t mtu;
2094 
2095 	sc->hn_dev = dev;
2096 	sc->hn_prichan = vmbus_get_channel(dev);
2097 	HN_LOCK_INIT(sc);
2098 	rm_init(&sc->hn_vf_lock, "hnvf");
2099 	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2100 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2101 
2102 	/*
2103 	 * Initialize these tunables once.
2104 	 */
2105 	sc->hn_agg_size = hn_tx_agg_size;
2106 	sc->hn_agg_pkts = hn_tx_agg_pkts;
2107 
2108 	/*
2109 	 * Setup taskqueue for transmission.
2110 	 */
2111 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2112 		int i;
2113 
2114 		sc->hn_tx_taskqs =
2115 		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2116 		    M_DEVBUF, M_WAITOK);
2117 		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2118 			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2119 			    M_WAITOK, taskqueue_thread_enqueue,
2120 			    &sc->hn_tx_taskqs[i]);
2121 			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2122 			    "%s tx%d", device_get_nameunit(dev), i);
2123 		}
2124 	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2125 		sc->hn_tx_taskqs = hn_tx_taskque;
2126 	}
2127 
2128 	/*
2129 	 * Setup taskqueue for management tasks, e.g. link status.
2130 	 */
2131 	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2132 	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2133 	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2134 	    device_get_nameunit(dev));
2135 	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2136 	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2137 	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2138 	    hn_netchg_status_taskfunc, sc);
2139 
2140 	if (hn_xpnt_vf) {
2141 		/*
2142 		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2143 		 */
2144 		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2145 		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2146 		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2147 		    device_get_nameunit(dev));
2148 		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2149 		    hn_xpnt_vf_init_taskfunc, sc);
2150 	}
2151 
2152 	/*
2153 	 * Allocate the ifnet and set up its name early, so that if_printf
2154 	 * can be used by functions that will be called after
2155 	 * ether_ifattach().
2156 	 */
2157 	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
2158 	if_setsoftc(ifp, sc);
2159 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2160 
2161 	/*
2162 	 * Initialize ifmedia earlier so that it can be unconditionally
2163 	 * destroyed if an error happens later on.
2164 	 */
2165 	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2166 
2167 	/*
2168 	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2169 	 * to use (tx_ring_cnt).
2170 	 *
2171 	 * NOTE:
2172 	 * The # of RX rings to use is same as the # of channels to use.
2173 	 */
2174 	ring_cnt = hn_chan_cnt;
2175 	if (ring_cnt <= 0) {
2176 		/* Default */
2177 		ring_cnt = mp_ncpus;
2178 		if (ring_cnt > HN_RING_CNT_DEF_MAX)
2179 			ring_cnt = HN_RING_CNT_DEF_MAX;
2180 	} else if (ring_cnt > mp_ncpus) {
2181 		ring_cnt = mp_ncpus;
2182 	}
2183 #ifdef RSS
2184 	if (ring_cnt > rss_getnumbuckets())
2185 		ring_cnt = rss_getnumbuckets();
2186 #endif
2187 
2188 	tx_ring_cnt = hn_tx_ring_cnt;
2189 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2190 		tx_ring_cnt = ring_cnt;
2191 #ifdef HN_IFSTART_SUPPORT
2192 	if (hn_use_if_start) {
2193 		/* ifnet.if_start only needs one TX ring. */
2194 		tx_ring_cnt = 1;
2195 	}
2196 #endif
2197 
2198 	/*
2199 	 * Set the leader CPU for channels.
2200 	 */
2201 	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2202 
2203 	/*
2204 	 * Create enough TX/RX rings, even if only limited number of
2205 	 * channels can be allocated.
2206 	 */
2207 	error = hn_create_tx_data(sc, tx_ring_cnt);
2208 	if (error)
2209 		goto failed;
2210 	error = hn_create_rx_data(sc, ring_cnt);
2211 	if (error)
2212 		goto failed;
2213 
2214 	/*
2215 	 * Create transaction context for NVS and RNDIS transactions.
2216 	 */
2217 	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2218 	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2219 	if (sc->hn_xact == NULL) {
2220 		error = ENXIO;
2221 		goto failed;
2222 	}
2223 
2224 	/*
2225 	 * Install orphan handler for the revocation of this device's
2226 	 * primary channel.
2227 	 *
2228 	 * NOTE:
2229 	 * The processing order is critical here:
2230 	 * Install the orphan handler, _before_ testing whether this
2231 	 * device's primary channel has been revoked or not.
2232 	 */
2233 	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2234 	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2235 		error = ENXIO;
2236 		goto failed;
2237 	}
2238 
2239 	/*
2240 	 * Attach the synthetic parts, i.e. NVS and RNDIS.
2241 	 */
2242 	error = hn_synth_attach(sc, ETHERMTU);
2243 	if (error)
2244 		goto failed;
2245 
2246 	error = hn_rndis_get_eaddr(sc, eaddr);
2247 	if (error)
2248 		goto failed;
2249 
2250 	error = hn_rndis_get_mtu(sc, &mtu);
2251 	if (error)
2252 		mtu = ETHERMTU;
2253 	else if (bootverbose)
2254 		device_printf(dev, "RNDIS mtu %u\n", mtu);
2255 
2256 	if (sc->hn_rx_ring_inuse > 1) {
2257 		/*
2258 		 * Reduce TCP segment aggregation limit for multiple
2259 		 * RX rings to increase ACK timeliness.
2260 		 */
2261 		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2262 	}
2263 
2264 	/*
2265 	 * Fixup TX/RX stuffs after synthetic parts are attached.
2266 	 */
2267 	hn_fixup_tx_data(sc);
2268 	hn_fixup_rx_data(sc);
2269 
2270 	ctx = device_get_sysctl_ctx(dev);
2271 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2272 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2273 	    &sc->hn_nvs_ver, 0, "NVS version");
2274 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2275 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2276 	    hn_ndis_version_sysctl, "A", "NDIS version");
2277 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2278 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2279 	    hn_caps_sysctl, "A", "capabilities");
2280 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2281 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2282 	    hn_hwassist_sysctl, "A", "hwassist");
2283 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max",
2284 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl,
2285 	    "IU", "max TSO size");
2286 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt",
2287 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl,
2288 	    "IU", "max # of TSO segments");
2289 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz",
2290 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl,
2291 	    "IU", "max size of TSO segment");
2292 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2293 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2294 	    hn_rxfilter_sysctl, "A", "rxfilter");
2295 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2296 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2297 	    hn_rss_hash_sysctl, "A", "RSS hash");
2298 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2299 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2300 	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2301 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2302 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2303 	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2304 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2305 	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2306 #ifndef RSS
2307 	/*
2308 	 * Don't allow RSS key/indirect table changes, if RSS is defined.
2309 	 */
2310 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2311 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2312 	    hn_rss_key_sysctl, "IU", "RSS key");
2313 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2314 	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2315 	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
2316 #endif
2317 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2318 	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2319 	    "RNDIS offered packet transmission aggregation size limit");
2320 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2321 	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2322 	    "RNDIS offered packet transmission aggregation count limit");
2323 	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2324 	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2325 	    "RNDIS packet transmission aggregation alignment");
2326 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2327 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2328 	    hn_txagg_size_sysctl, "I",
2329 	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2330 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2331 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2332 	    hn_txagg_pkts_sysctl, "I",
2333 	    "Packet transmission aggregation packets, "
2334 	    "0 -- disable, -1 -- auto");
2335 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2336 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2337 	    hn_polling_sysctl, "I",
2338 	    "Polling frequency: [100,1000000], 0 disable polling");
2339 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2340 	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2341 	    hn_vf_sysctl, "A", "Virtual Function's name");
2342 	if (!hn_xpnt_vf) {
2343 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2344 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2345 		    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2346 	} else {
2347 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2348 		    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2349 		    hn_xpnt_vf_enabled_sysctl, "I",
2350 		    "Transparent VF enabled");
2351 		SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2352 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2353 		    hn_xpnt_vf_accbpf_sysctl, "I",
2354 		    "Accurate BPF for transparent VF");
2355 	}
2356 
2357 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch",
2358 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A",
2359 	    "switch to rsc");
2360 
2361 	/*
2362 	 * Setup the ifmedia, which has been initialized earlier.
2363 	 */
2364 	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2365 	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2366 	/* XXX ifmedia_set really should do this for us */
2367 	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2368 
2369 	/*
2370 	 * Setup the ifnet for this interface.
2371 	 */
2372 
2373 	if_setbaudrate(ifp, IF_Gbps(10));
2374 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
2375 	if_setioctlfn(ifp, hn_ioctl);
2376 	if_setinitfn(ifp, hn_init);
2377 #ifdef HN_IFSTART_SUPPORT
2378 	if (hn_use_if_start) {
2379 		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2380 
2381 		if_setstartfn(ifp, hn_start);
2382 		if_setsendqlen(ifp, qdepth);
2383 		if_setsendqready(ifp);
2384 	} else
2385 #endif
2386 	{
2387 		if_settransmitfn(ifp, hn_transmit);
2388 		if_setqflushfn(ifp, hn_xmit_qflush);
2389 	}
2390 
2391 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0);
2392 #ifdef foo
2393 	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
2394 	if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
2395 #endif
2396 	if (sc->hn_caps & HN_CAP_VLAN) {
2397 		/* XXX not sure about VLAN_MTU. */
2398 		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0);
2399 	}
2400 
2401 	if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist);
2402 	if (if_gethwassist(ifp) & HN_CSUM_IP_MASK)
2403 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0);
2404 	if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK)
2405 		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0);
2406 	if (sc->hn_caps & HN_CAP_TSO4) {
2407 		if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
2408 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2409 	}
2410 	if (sc->hn_caps & HN_CAP_TSO6) {
2411 		if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
2412 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2413 	}
2414 
2415 	/* Enable all available capabilities by default. */
2416 	if_setcapenable(ifp, if_getcapabilities(ifp));
2417 
2418 	/*
2419 	 * Disable IPv6 TSO and TXCSUM by default, they still can
2420 	 * be enabled through SIOCSIFCAP.
2421 	 */
2422 	if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6));
2423 	if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO));
2424 
2425 	if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) {
2426 		/*
2427 		 * Lock hn_set_tso_maxsize() to simplify its
2428 		 * internal logic.
2429 		 */
2430 		HN_LOCK(sc);
2431 		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2432 		HN_UNLOCK(sc);
2433 		if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX);
2434 		if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
2435 	}
2436 
2437 	ether_ifattach(ifp, eaddr);
2438 
2439 	if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2440 		if_printf(ifp, "TSO segcnt %u segsz %u\n",
2441 		    if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp));
2442 	}
2443 	if (mtu < ETHERMTU) {
2444 
2445 		if_setmtu(ifp, mtu);
2446 	}
2447 
2448 	/* Inform the upper layer about the long frame support. */
2449 	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
2450 
2451 	/*
2452 	 * Kick off link status check.
2453 	 */
2454 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2455 	hn_update_link_status(sc);
2456 
2457 	if (!hn_xpnt_vf) {
2458 		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2459 		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2460 		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2461 		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2462 	} else {
2463 		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2464 		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2465 	}
2466 
2467 	/*
2468 	 * NOTE:
2469 	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
2470 	 * since interface's LLADDR is needed; interface LLADDR is not
2471 	 * available when ifnet_arrival event is triggered.
2472 	 */
2473 	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2474 	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2475 	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2476 	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2477 
2478 	return (0);
2479 failed:
2480 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2481 		hn_synth_detach(sc);
2482 	hn_detach(dev);
2483 	return (error);
2484 }
2485 
2486 static int
2487 hn_detach(device_t dev)
2488 {
2489 	struct hn_softc *sc = device_get_softc(dev);
2490 	if_t ifp = sc->hn_ifp, vf_ifp;
2491 
2492 	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2493 		/*
2494 		 * In case that the vmbus missed the orphan handler
2495 		 * installation.
2496 		 */
2497 		vmbus_xact_ctx_orphan(sc->hn_xact);
2498 	}
2499 
2500 	if (sc->hn_ifaddr_evthand != NULL)
2501 		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2502 	if (sc->hn_ifnet_evthand != NULL)
2503 		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2504 	if (sc->hn_ifnet_atthand != NULL) {
2505 		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2506 		    sc->hn_ifnet_atthand);
2507 	}
2508 	if (sc->hn_ifnet_dethand != NULL) {
2509 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2510 		    sc->hn_ifnet_dethand);
2511 	}
2512 	if (sc->hn_ifnet_lnkhand != NULL)
2513 		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2514 
2515 	vf_ifp = sc->hn_vf_ifp;
2516 	__compiler_membar();
2517 	if (vf_ifp != NULL)
2518 		hn_ifnet_detevent(sc, vf_ifp);
2519 
2520 	if (device_is_attached(dev)) {
2521 		HN_LOCK(sc);
2522 		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2523 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
2524 				hn_stop(sc, true);
2525 			/*
2526 			 * NOTE:
2527 			 * hn_stop() only suspends data, so management
2528 			 * stuffs have to be suspended manually here.
2529 			 */
2530 			hn_suspend_mgmt(sc);
2531 			hn_synth_detach(sc);
2532 		}
2533 		HN_UNLOCK(sc);
2534 		ether_ifdetach(ifp);
2535 	}
2536 
2537 	ifmedia_removeall(&sc->hn_media);
2538 	hn_destroy_rx_data(sc);
2539 	hn_destroy_tx_data(sc);
2540 
2541 	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2542 		int i;
2543 
2544 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
2545 			taskqueue_free(sc->hn_tx_taskqs[i]);
2546 		free(sc->hn_tx_taskqs, M_DEVBUF);
2547 	}
2548 	taskqueue_free(sc->hn_mgmt_taskq0);
2549 	if (sc->hn_vf_taskq != NULL)
2550 		taskqueue_free(sc->hn_vf_taskq);
2551 
2552 	if (sc->hn_xact != NULL) {
2553 		/*
2554 		 * Uninstall the orphan handler _before_ the xact is
2555 		 * destructed.
2556 		 */
2557 		vmbus_chan_unset_orphan(sc->hn_prichan);
2558 		vmbus_xact_ctx_destroy(sc->hn_xact);
2559 	}
2560 
2561 	if_free(ifp);
2562 
2563 	HN_LOCK_DESTROY(sc);
2564 	rm_destroy(&sc->hn_vf_lock);
2565 	return (0);
2566 }
2567 
2568 static int
2569 hn_shutdown(device_t dev)
2570 {
2571 
2572 	return (0);
2573 }
2574 
2575 static void
2576 hn_link_status(struct hn_softc *sc)
2577 {
2578 	uint32_t link_status;
2579 	int error;
2580 
2581 	error = hn_rndis_get_linkstatus(sc, &link_status);
2582 	if (error) {
2583 		/* XXX what to do? */
2584 		return;
2585 	}
2586 
2587 	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2588 		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2589 	else
2590 		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2591 	if_link_state_change(sc->hn_ifp,
2592 	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2593 	    LINK_STATE_UP : LINK_STATE_DOWN);
2594 }
2595 
2596 static void
2597 hn_link_taskfunc(void *xsc, int pending __unused)
2598 {
2599 	struct hn_softc *sc = xsc;
2600 
2601 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2602 		return;
2603 	hn_link_status(sc);
2604 }
2605 
2606 static void
2607 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2608 {
2609 	struct hn_softc *sc = xsc;
2610 
2611 	/* Prevent any link status checks from running. */
2612 	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2613 
2614 	/*
2615 	 * Fake up a [link down --> link up] state change; 5 seconds
2616 	 * delay is used, which closely simulates miibus reaction
2617 	 * upon link down event.
2618 	 */
2619 	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2620 	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2621 	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2622 	    &sc->hn_netchg_status, 5 * hz);
2623 }
2624 
2625 static void
2626 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2627 {
2628 	struct hn_softc *sc = xsc;
2629 
2630 	/* Re-allow link status checks. */
2631 	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2632 	hn_link_status(sc);
2633 }
2634 
2635 static void
2636 hn_update_link_status(struct hn_softc *sc)
2637 {
2638 
2639 	if (sc->hn_mgmt_taskq != NULL)
2640 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2641 }
2642 
2643 static void
2644 hn_change_network(struct hn_softc *sc)
2645 {
2646 
2647 	if (sc->hn_mgmt_taskq != NULL)
2648 		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2649 }
2650 
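/*
 * NOTE:
 * The busdma load below is retried once: on EFBIG the mbuf chain is
 * collapsed to at most HN_TX_DATA_SEGCNT_MAX segments with
 * m_collapse() before the second attempt.  A successful load marks
 * the txdesc with HN_TXD_FLAG_DMAMAP, so hn_txdesc_put() knows to
 * unload the DMA map later.
 */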
2651 static __inline int
2652 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2653     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2654 {
2655 	struct mbuf *m = *m_head;
2656 	int error;
2657 
2658 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2659 
2660 	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2661 	    m, segs, nsegs, BUS_DMA_NOWAIT);
2662 	if (error == EFBIG) {
2663 		struct mbuf *m_new;
2664 
2665 		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2666 		if (m_new == NULL)
2667 			return ENOBUFS;
2668 		else
2669 			*m_head = m = m_new;
2670 		txr->hn_tx_collapsed++;
2671 
2672 		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2673 		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2674 	}
2675 	if (!error) {
2676 		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2677 		    BUS_DMASYNC_PREWRITE);
2678 		txd->flags |= HN_TXD_FLAG_DMAMAP;
2679 	}
2680 	return error;
2681 }
2682 
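/*
 * NOTE:
 * hn_txdesc_put() drops one reference on the txdesc.  Only when the
 * last reference goes away are the aggregated txdescs, the chimney
 * sending buffer or the DMA map, and the attached mbuf released and
 * the txdesc returned to the per-ring free list.  It returns 1 if
 * the txdesc was actually freed, 0 otherwise.
 */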
2683 static __inline int
2684 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2685 {
2686 
2687 	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2688 	    ("put an onlist txd %#x", txd->flags));
2689 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2690 	    ("put an onagg txd %#x", txd->flags));
2691 
2692 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2693 	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2694 		return 0;
2695 
2696 	if (!STAILQ_EMPTY(&txd->agg_list)) {
2697 		struct hn_txdesc *tmp_txd;
2698 
2699 		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2700 			int freed __diagused;
2701 
2702 			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2703 			    ("recursive aggregation on aggregated txdesc"));
2704 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2705 			    ("not aggregated txdesc"));
2706 			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2707 			    ("aggregated txdesc uses dmamap"));
2708 			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2709 			    ("aggregated txdesc consumes "
2710 			     "chimney sending buffer"));
2711 			KASSERT(tmp_txd->chim_size == 0,
2712 			    ("aggregated txdesc has non-zero "
2713 			     "chimney sending size"));
2714 
2715 			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2716 			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2717 			freed = hn_txdesc_put(txr, tmp_txd);
2718 			KASSERT(freed, ("failed to free aggregated txdesc"));
2719 		}
2720 	}
2721 
2722 	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2723 		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2724 		    ("chim txd uses dmamap"));
2725 		hn_chim_free(txr->hn_sc, txd->chim_index);
2726 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2727 		txd->chim_size = 0;
2728 	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2729 		bus_dmamap_sync(txr->hn_tx_data_dtag,
2730 		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2731 		bus_dmamap_unload(txr->hn_tx_data_dtag,
2732 		    txd->data_dmap);
2733 		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2734 	}
2735 
2736 	if (txd->m != NULL) {
2737 		m_freem(txd->m);
2738 		txd->m = NULL;
2739 	}
2740 
2741 	txd->flags |= HN_TXD_FLAG_ONLIST;
2742 #ifndef HN_USE_TXDESC_BUFRING
2743 	mtx_lock_spin(&txr->hn_txlist_spin);
2744 	KASSERT(txr->hn_txdesc_avail >= 0 &&
2745 	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2746 	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2747 	txr->hn_txdesc_avail++;
2748 	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2749 	mtx_unlock_spin(&txr->hn_txlist_spin);
2750 #else	/* HN_USE_TXDESC_BUFRING */
2751 #ifdef HN_DEBUG
2752 	atomic_add_int(&txr->hn_txdesc_avail, 1);
2753 #endif
2754 	buf_ring_enqueue(txr->hn_txdesc_br, txd);
2755 #endif	/* !HN_USE_TXDESC_BUFRING */
2756 
2757 	return 1;
2758 }
2759 
2760 static __inline struct hn_txdesc *
2761 hn_txdesc_get(struct hn_tx_ring *txr)
2762 {
2763 	struct hn_txdesc *txd;
2764 
2765 #ifndef HN_USE_TXDESC_BUFRING
2766 	mtx_lock_spin(&txr->hn_txlist_spin);
2767 	txd = SLIST_FIRST(&txr->hn_txlist);
2768 	if (txd != NULL) {
2769 		KASSERT(txr->hn_txdesc_avail > 0,
2770 		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2771 		txr->hn_txdesc_avail--;
2772 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2773 	}
2774 	mtx_unlock_spin(&txr->hn_txlist_spin);
2775 #else
2776 	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2777 #endif
2778 
2779 	if (txd != NULL) {
2780 #ifdef HN_USE_TXDESC_BUFRING
2781 #ifdef HN_DEBUG
2782 		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2783 #endif
2784 #endif	/* HN_USE_TXDESC_BUFRING */
2785 		KASSERT(txd->m == NULL && txd->refs == 0 &&
2786 		    STAILQ_EMPTY(&txd->agg_list) &&
2787 		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2788 		    txd->chim_size == 0 &&
2789 		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
2790 		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2791 		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2792 		txd->flags &= ~HN_TXD_FLAG_ONLIST;
2793 		txd->refs = 1;
2794 	}
2795 	return txd;
2796 }
2797 
2798 static __inline void
2799 hn_txdesc_hold(struct hn_txdesc *txd)
2800 {
2801 
2802 	/* 0->1 transition will never work */
2803 	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2804 	atomic_add_int(&txd->refs, 1);
2805 }
2806 
2807 static __inline void
2808 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2809 {
2810 
2811 	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2812 	    ("recursive aggregation on aggregating txdesc"));
2813 
2814 	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2815 	    ("already aggregated"));
2816 	KASSERT(STAILQ_EMPTY(&txd->agg_list),
2817 	    ("recursive aggregation on to-be-aggregated txdesc"));
2818 
2819 	txd->flags |= HN_TXD_FLAG_ONAGG;
2820 	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2821 }
2822 
2823 static bool
2824 hn_tx_ring_pending(struct hn_tx_ring *txr)
2825 {
2826 	bool pending = false;
2827 
2828 #ifndef HN_USE_TXDESC_BUFRING
2829 	mtx_lock_spin(&txr->hn_txlist_spin);
2830 	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2831 		pending = true;
2832 	mtx_unlock_spin(&txr->hn_txlist_spin);
2833 #else
2834 	if (!buf_ring_full(txr->hn_txdesc_br))
2835 		pending = true;
2836 #endif
2837 	return (pending);
2838 }
2839 
2840 static __inline void
2841 hn_txeof(struct hn_tx_ring *txr)
2842 {
2843 	txr->hn_has_txeof = 0;
2844 	txr->hn_txeof(txr);
2845 }
2846 
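/*
 * NOTE:
 * The NVS send-completion callback below releases the txdesc.  Once
 * HN_EARLY_TXEOF_THRESH completions have accumulated while the TX
 * ring is marked oactive, it kicks an early txeof so the ring does
 * not stay clogged until the next channel rollup.
 */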
2847 static void
2848 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2849     struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2850 {
2851 	struct hn_txdesc *txd = sndc->hn_cbarg;
2852 	struct hn_tx_ring *txr;
2853 
2854 	txr = txd->txr;
2855 	KASSERT(txr->hn_chan == chan,
2856 	    ("channel mismatch, on chan%u, should be chan%u",
2857 	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2858 
2859 	txr->hn_has_txeof = 1;
2860 	hn_txdesc_put(txr, txd);
2861 
2862 	++txr->hn_txdone_cnt;
2863 	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2864 		txr->hn_txdone_cnt = 0;
2865 		if (txr->hn_oactive)
2866 			hn_txeof(txr);
2867 	}
2868 }
2869 
2870 static void
2871 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2872 {
2873 #if defined(INET) || defined(INET6)
2874 	struct epoch_tracker et;
2875 
2876 	NET_EPOCH_ENTER(et);
2877 	tcp_lro_flush_all(&rxr->hn_lro);
2878 	NET_EPOCH_EXIT(et);
2879 #endif
2880 
2881 	/*
2882 	 * NOTE:
2883 	 * 'txr' could be NULL, if multiple channels and
2884 	 * ifnet.if_start method are enabled.
2885 	 */
2886 	if (txr == NULL || !txr->hn_has_txeof)
2887 		return;
2888 
2889 	txr->hn_txdone_cnt = 0;
2890 	hn_txeof(txr);
2891 }
2892 
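/*
 * NOTE:
 * Offsets inside an RNDIS packet message are expressed relative to
 * the rm_dataoffset field rather than to the start of the message;
 * the helper below converts an offset counted from the beginning of
 * struct rndis_packet_msg into that representation.
 */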
2893 static __inline uint32_t
2894 hn_rndis_pktmsg_offset(uint32_t ofs)
2895 {
2896 
2897 	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2898 	    ("invalid RNDIS packet msg offset %u", ofs));
2899 	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2900 }
2901 
2902 static __inline void *
2903 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2904     size_t pi_dlen, uint32_t pi_type)
2905 {
2906 	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2907 	struct rndis_pktinfo *pi;
2908 
2909 	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2910 	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2911 
2912 	/*
2913 	 * Per-packet-info does not move; it only grows.
2914 	 *
2915 	 * NOTE:
2916 	 * rm_pktinfooffset in this phase counts from the beginning
2917 	 * of rndis_packet_msg.
2918 	 */
2919 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2920 	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
2921 	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2922 	    pkt->rm_pktinfolen);
2923 	pkt->rm_pktinfolen += pi_size;
2924 
2925 	pi->rm_size = pi_size;
2926 	pi->rm_type = pi_type;
2927 	pi->rm_internal = 0;
2928 	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2929 
2930 	return (pi->rm_data);
2931 }
2932 
2933 static __inline int
2934 hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr)
2935 {
2936 	struct hn_txdesc *txd;
2937 	struct mbuf *m;
2938 	int error, pkts;
2939 
2940 	txd = txr->hn_agg_txd;
2941 	KASSERT(txd != NULL, ("no aggregate txdesc"));
2942 
2943 	/*
2944 	 * Since hn_txpkt() will reset this temporary stat, save
2945 	 * it now, so that oerrors can be updated properly, if
2946 	 * hn_txpkt() ever fails.
2947 	 */
2948 	pkts = txr->hn_stat_pkts;
2949 
2950 	/*
2951 	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2952 	 * failure, save it for later freeing, if hn_txpkt() ever
2953 	 * fails.
2954 	 */
2955 	m = txd->m;
2956 	error = hn_txpkt(ifp, txr, txd);
2957 	if (__predict_false(error)) {
2958 		/* txd is freed, but m is not. */
2959 		m_freem(m);
2960 
2961 		txr->hn_flush_failed++;
2962 		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2963 	}
2964 
2965 	/* Reset all aggregation states. */
2966 	txr->hn_agg_txd = NULL;
2967 	txr->hn_agg_szleft = 0;
2968 	txr->hn_agg_pktleft = 0;
2969 	txr->hn_agg_prevpkt = NULL;
2970 
2971 	return (error);
2972 }
2973 
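/*
 * NOTE:
 * hn_try_txagg() returns a pointer into the chimney sending buffer
 * where the caller should build its RNDIS packet.  If an aggregating
 * txdesc exists and still has packet/size budget, the new txdesc is
 * chained onto it; otherwise the pending aggregation is flushed and a
 * fresh chimney buffer is allocated, possibly starting a new
 * aggregation.  NULL is returned when no chimney buffer is available,
 * in which case the caller falls back to sglist sending.
 */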
2974 static void *
2975 hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2976     int pktsize)
2977 {
2978 	void *chim;
2979 
2980 	if (txr->hn_agg_txd != NULL) {
2981 		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2982 			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2983 			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2984 			int olen;
2985 
2986 			/*
2987 			 * Update the previous RNDIS packet's total length,
2988 			 * it can be increased due to the mandatory alignment
2989 			 * padding for this RNDIS packet.  And update the
2990 			 * aggregating txdesc's chimney sending buffer size
2991 			 * accordingly.
2992 			 *
2993 			 * XXX
2994 			 * Zero-out the padding, as required by the RNDIS spec.
2995 			 */
2996 			olen = pkt->rm_len;
2997 			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2998 			agg_txd->chim_size += pkt->rm_len - olen;
2999 
3000 			/* Link this txdesc to the parent. */
3001 			hn_txdesc_agg(agg_txd, txd);
3002 
3003 			chim = (uint8_t *)pkt + pkt->rm_len;
3004 			/* Save the current packet for later fixup. */
3005 			txr->hn_agg_prevpkt = chim;
3006 
3007 			txr->hn_agg_pktleft--;
3008 			txr->hn_agg_szleft -= pktsize;
3009 			if (txr->hn_agg_szleft <=
3010 			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3011 				/*
3012 				 * Probably can't aggregate more packets,
3013 				 * flush this aggregating txdesc proactively.
3014 				 */
3015 				txr->hn_agg_pktleft = 0;
3016 			}
3017 			/* Done! */
3018 			return (chim);
3019 		}
3020 		hn_flush_txagg(ifp, txr);
3021 	}
3022 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3023 
3024 	txr->hn_tx_chimney_tried++;
3025 	txd->chim_index = hn_chim_alloc(txr->hn_sc);
3026 	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3027 		return (NULL);
3028 	txr->hn_tx_chimney++;
3029 
3030 	chim = txr->hn_sc->hn_chim +
3031 	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3032 
3033 	if (txr->hn_agg_pktmax > 1 &&
3034 	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3035 		txr->hn_agg_txd = txd;
3036 		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3037 		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3038 		txr->hn_agg_prevpkt = chim;
3039 	}
3040 	return (chim);
3041 }
3042 
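/*
 * NOTE:
 * hn_encap() builds the RNDIS packet message for an outgoing mbuf: it
 * appends per-packet-info for the hash value, VLAN tag and
 * TSO/checksum offload as needed, then either copies the whole packet
 * into the chimney sending buffer (fast path) or DMA-loads the mbuf
 * chain and fills the GPA array for sglist transmission.
 */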
3043 /*
3044  * NOTE:
3045  * If this function fails, then both txd and m_head0 will be freed.
3046  */
3047 static int
3048 hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3049     struct mbuf **m_head0)
3050 {
3051 	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3052 	int error, nsegs, i;
3053 	struct mbuf *m_head = *m_head0;
3054 	struct rndis_packet_msg *pkt;
3055 	uint32_t *pi_data;
3056 	void *chim = NULL;
3057 	int pkt_hlen, pkt_size;
3058 
3059 	pkt = txd->rndis_pkt;
3060 	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3061 	if (pkt_size < txr->hn_chim_size) {
3062 		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3063 		if (chim != NULL)
3064 			pkt = chim;
3065 	} else {
3066 		if (txr->hn_agg_txd != NULL)
3067 			hn_flush_txagg(ifp, txr);
3068 	}
3069 
3070 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3071 	pkt->rm_len = m_head->m_pkthdr.len;
3072 	pkt->rm_dataoffset = 0;
3073 	pkt->rm_datalen = m_head->m_pkthdr.len;
3074 	pkt->rm_oobdataoffset = 0;
3075 	pkt->rm_oobdatalen = 0;
3076 	pkt->rm_oobdataelements = 0;
3077 	pkt->rm_pktinfooffset = sizeof(*pkt);
3078 	pkt->rm_pktinfolen = 0;
3079 	pkt->rm_vchandle = 0;
3080 	pkt->rm_reserved = 0;
3081 
3082 	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3083 		/*
3084 		 * Set the hash value for this packet.
3085 		 */
3086 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3087 		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3088 
3089 		if (M_HASHTYPE_ISHASH(m_head))
3090 			/*
3091 			 * The flowid field contains the hash value the host
3092 			 * set in the RX queue if this is an IP forwarding pkt.
3093 			 * Set the same hash value so the host can deliver on
3094 			 * the CPU the packet was received on.
3095 			 */
3096 			*pi_data = m_head->m_pkthdr.flowid;
3097 		else
3098 			/*
3099 			 * Otherwise just put the tx queue index.
3100 			 */
3101 			*pi_data = txr->hn_tx_idx;
3102 	}
3103 
3104 	if (m_head->m_flags & M_VLANTAG) {
3105 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3106 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3107 		*pi_data = NDIS_VLAN_INFO_MAKE(
3108 		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3109 		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3110 		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3111 	}
3112 
3113 	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3114 #if defined(INET6) || defined(INET)
3115 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3116 		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3117 #ifdef INET
3118 		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3119 			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3120 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3121 			    m_head->m_pkthdr.tso_segsz);
3122 		}
3123 #endif
3124 #if defined(INET6) && defined(INET)
3125 		else
3126 #endif
3127 #ifdef INET6
3128 		{
3129 			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3130 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3131 			    m_head->m_pkthdr.tso_segsz);
3132 		}
3133 #endif
3134 #endif	/* INET6 || INET */
3135 	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3136 		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3137 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3138 		if (m_head->m_pkthdr.csum_flags &
3139 		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3140 			*pi_data = NDIS_TXCSUM_INFO_IPV6;
3141 		} else {
3142 			*pi_data = NDIS_TXCSUM_INFO_IPV4;
3143 			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3144 				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
3145 		}
3146 
3147 		if (m_head->m_pkthdr.csum_flags &
3148 		    (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3149 			*pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3150 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3151 		} else if (m_head->m_pkthdr.csum_flags &
3152 		    (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3153 			*pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3154 			    m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3155 		}
3156 	}
3157 
3158 	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3159 	/* Fixup RNDIS packet message total length */
3160 	pkt->rm_len += pkt_hlen;
3161 	/* Convert RNDIS packet message offsets */
3162 	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3163 	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3164 
3165 	/*
3166 	 * Fast path: Chimney sending.
3167 	 */
3168 	if (chim != NULL) {
3169 		struct hn_txdesc *tgt_txd = txd;
3170 
3171 		if (txr->hn_agg_txd != NULL) {
3172 			tgt_txd = txr->hn_agg_txd;
3173 #ifdef INVARIANTS
3174 			*m_head0 = NULL;
3175 #endif
3176 		}
3177 
3178 		KASSERT(pkt == chim,
3179 		    ("RNDIS pkt not in chimney sending buffer"));
3180 		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3181 		    ("chimney sending buffer is not used"));
3182 		tgt_txd->chim_size += pkt->rm_len;
3183 
3184 		m_copydata(m_head, 0, m_head->m_pkthdr.len,
3185 		    ((uint8_t *)chim) + pkt_hlen);
3186 
3187 		txr->hn_gpa_cnt = 0;
3188 		txr->hn_sendpkt = hn_txpkt_chim;
3189 		goto done;
3190 	}
3191 
3192 	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3193 	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3194 	    ("chimney buffer is used"));
3195 	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3196 
3197 	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3198 	if (__predict_false(error)) {
3199 		int freed __diagused;
3200 
3201 		/*
3202 		 * This mbuf is not linked w/ the txd yet, so free it now.
3203 		 */
3204 		m_freem(m_head);
3205 		*m_head0 = NULL;
3206 
3207 		freed = hn_txdesc_put(txr, txd);
3208 		KASSERT(freed != 0,
3209 		    ("fail to free txd upon txdma error"));
3210 
3211 		txr->hn_txdma_failed++;
3212 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3213 		return error;
3214 	}
3215 	*m_head0 = m_head;
3216 
3217 	/* +1 RNDIS packet message */
3218 	txr->hn_gpa_cnt = nsegs + 1;
3219 
3220 	/* send packet with page buffer */
3221 	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3222 	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3223 	txr->hn_gpa[0].gpa_len = pkt_hlen;
3224 
3225 	/*
3226 	 * Fill the page buffers with mbuf info after the page
3227 	 * buffer for RNDIS packet message.
3228 	 */
3229 	for (i = 0; i < nsegs; ++i) {
3230 		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3231 
3232 		gpa->gpa_page = atop(segs[i].ds_addr);
3233 		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3234 		gpa->gpa_len = segs[i].ds_len;
3235 	}
3236 
3237 	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3238 	txd->chim_size = 0;
3239 	txr->hn_sendpkt = hn_txpkt_sglist;
3240 done:
3241 	txd->m = m_head;
3242 
3243 	/* Set the completion routine */
3244 	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3245 
3246 	/* Update temporary stats for later use. */
3247 	txr->hn_stat_pkts++;
3248 	txr->hn_stat_size += m_head->m_pkthdr.len;
3249 	if (m_head->m_flags & M_MCAST)
3250 		txr->hn_stat_mcasts++;
3251 
3252 	return 0;
3253 }
3254 
3255 /*
3256  * NOTE:
3257  * If this function fails, then txd will be freed, but the mbuf
3258  * associated w/ the txd will _not_ be freed.
3259  */
3260 static int
3261 hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3262 {
3263 	int error, send_failed = 0, has_bpf;
3264 
3265 again:
3266 	has_bpf = bpf_peers_present_if(ifp);
3267 	if (has_bpf) {
3268 		/*
3269 		 * Make sure that this txd and any aggregated txds are not
3270 		 * freed before ETHER_BPF_MTAP.
3271 		 */
3272 		hn_txdesc_hold(txd);
3273 	}
3274 	error = txr->hn_sendpkt(txr, txd);
3275 	if (!error) {
3276 		if (has_bpf) {
3277 			const struct hn_txdesc *tmp_txd;
3278 
3279 			ETHER_BPF_MTAP(ifp, txd->m);
3280 			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3281 				ETHER_BPF_MTAP(ifp, tmp_txd->m);
3282 		}
3283 
3284 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3285 #ifdef HN_IFSTART_SUPPORT
3286 		if (!hn_use_if_start)
3287 #endif
3288 		{
3289 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
3290 			    txr->hn_stat_size);
3291 			if (txr->hn_stat_mcasts != 0) {
3292 				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3293 				    txr->hn_stat_mcasts);
3294 			}
3295 		}
3296 		txr->hn_pkts += txr->hn_stat_pkts;
3297 		txr->hn_sends++;
3298 	}
3299 	if (has_bpf)
3300 		hn_txdesc_put(txr, txd);
3301 
3302 	if (__predict_false(error)) {
3303 		int freed __diagused;
3304 
3305 		/*
3306 		 * This should "really rarely" happen.
3307 		 *
3308 		 * XXX Too many RX to be acked or too many sideband
3309 		 * commands to run?  Ask netvsc_channel_rollup()
3310 		 * to kick start later.
3311 		 */
3312 		txr->hn_has_txeof = 1;
3313 		if (!send_failed) {
3314 			txr->hn_send_failed++;
3315 			send_failed = 1;
3316 			/*
3317 			 * Try sending again after set hn_has_txeof;
3318 			 * in case that we missed the last
3319 			 * netvsc_channel_rollup().
3320 			 */
3321 			goto again;
3322 		}
3323 		if_printf(ifp, "send failed\n");
3324 
3325 		/*
3326 		 * Caller will perform further processing on the
3327 		 * associated mbuf, so don't free it in hn_txdesc_put();
3328 		 * only unload it from the DMA map in hn_txdesc_put(),
3329 		 * if it was loaded.
3330 		 */
3331 		txd->m = NULL;
3332 		freed = hn_txdesc_put(txr, txd);
3333 		KASSERT(freed != 0,
3334 		    ("fail to free txd upon send error"));
3335 
3336 		txr->hn_send_failed++;
3337 	}
3338 
3339 	/* Reset temporary stats, after this sending is done. */
3340 	txr->hn_stat_size = 0;
3341 	txr->hn_stat_pkts = 0;
3342 	txr->hn_stat_mcasts = 0;
3343 
3344 	return (error);
3345 }
3346 
3347 /*
3348  * Append the specified data to the indicated mbuf chain.
3349  * Extend the mbuf chain if the new data does not fit in
3350  * existing space.
3351  *
3352  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3353  * There should be an equivalent in the kernel mbuf code,
3354  * but there does not appear to be one yet.
3355  *
3356  * Differs from m_append() in that additional mbufs are
3357  * allocated with cluster size MJUMPAGESIZE, and filled
3358  * accordingly.
3359  *
3360  * Return the last mbuf in the chain or NULL if failed to
3361  * allocate new mbuf.
3362  */
3363 static struct mbuf *
3364 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3365 {
3366 	struct mbuf *m, *n;
3367 	int remainder, space;
3368 
3369 	for (m = m0; m->m_next != NULL; m = m->m_next)
3370 		;
3371 	remainder = len;
3372 	space = M_TRAILINGSPACE(m);
3373 	if (space > 0) {
3374 		/*
3375 		 * Copy into available space.
3376 		 */
3377 		if (space > remainder)
3378 			space = remainder;
3379 		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3380 		m->m_len += space;
3381 		cp += space;
3382 		remainder -= space;
3383 	}
3384 	while (remainder > 0) {
3385 		/*
3386 		 * Allocate a new mbuf; could check space
3387 		 * and allocate a cluster instead.
3388 		 */
3389 		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3390 		if (n == NULL)
3391 			return NULL;
3392 		n->m_len = min(MJUMPAGESIZE, remainder);
3393 		bcopy(cp, mtod(n, caddr_t), n->m_len);
3394 		cp += n->m_len;
3395 		remainder -= n->m_len;
3396 		m->m_next = n;
3397 		m = n;
3398 	}
3399 
3400 	return m;
3401 }
3402 
3403 #if defined(INET) || defined(INET6)
3404 static __inline int
3405 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3406 {
3407 	if (hn_lro_mbufq_depth) {
3408 		tcp_lro_queue_mbuf(lc, m);
3409 		return 0;
3410 	}
3411 	return tcp_lro_rx(lc, m, 0);
3412 }
3413 #endif
3414 
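/*
 * NOTE:
 * hn_rxpkt() reassembles one received packet from the RSC fragment
 * array: a single fragment that fits in MHLEN is copied into a plain
 * mbuf, anything larger goes into a 2K or 4K cluster chain built with
 * hv_m_append().  Host-supplied checksum results are honored when
 * IFCAP_RXCSUM is enabled; when the host supplies no checksum info,
 * the hn_trust_hcsum tunables decide whether the checksum is assumed
 * to be good.
 */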
3415 static int
3416 hn_rxpkt(struct hn_rx_ring *rxr)
3417 {
3418 	if_t ifp, hn_ifp = rxr->hn_ifp;
3419 	struct mbuf *m_new, *n;
3420 	int size, do_lro = 0, do_csum = 1, is_vf = 0;
3421 	int hash_type = M_HASHTYPE_NONE;
3422 	int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3423 	int i;
3424 
3425 	ifp = hn_ifp;
3426 	if (rxr->hn_rxvf_ifp != NULL) {
3427 		/*
3428 		 * Non-transparent mode VF; pretend this packet is from
3429 		 * the VF.
3430 		 */
3431 		ifp = rxr->hn_rxvf_ifp;
3432 		is_vf = 1;
3433 	} else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3434 		/* Transparent mode VF. */
3435 		is_vf = 1;
3436 	}
3437 
3438 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
3439 		/*
3440 		 * NOTE:
3441 		 * See the NOTE of hn_rndis_init_fixat().  This
3442 		 * function can be reached, immediately after the
3443 		 * RNDIS is initialized but before the ifnet is
3444 		 * setup on the hn_attach() path; drop the unexpected
3445 		 * packets.
3446 		 */
3447 		return (0);
3448 	}
3449 
3450 	if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) {
3451 		if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3452 		return (0);
3453 	}
3454 
3455 	if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) {
3456 		m_new = m_gethdr(M_NOWAIT, MT_DATA);
3457 		if (m_new == NULL) {
3458 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3459 			return (0);
3460 		}
3461 		memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0],
3462 		    rxr->rsc.frag_len[0]);
3463 		m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0];
3464 	} else {
3465 		/*
3466 		 * Get an mbuf with a cluster.  For packets 2K or less,
3467 		 * get a standard 2K cluster.  For anything larger, get a
3468 		 * 4K cluster.  Any buffers larger than 4K can cause problems
3469 		 * if looped around to the Hyper-V TX channel, so avoid them.
3470 		 */
3471 		size = MCLBYTES;
3472 		if (rxr->rsc.pktlen > MCLBYTES) {
3473 			/* 4096 */
3474 			size = MJUMPAGESIZE;
3475 		}
3476 
3477 		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3478 		if (m_new == NULL) {
3479 			if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3480 			return (0);
3481 		}
3482 
3483 		n = m_new;
3484 		for (i = 0; i < rxr->rsc.cnt; i++) {
3485 			n = hv_m_append(n, rxr->rsc.frag_len[i],
3486 			    rxr->rsc.frag_data[i]);
3487 			if (n == NULL) {
3488 				if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3489 				return (0);
3490 			} else {
3491 				m_new->m_pkthdr.len += rxr->rsc.frag_len[i];
3492 			}
3493 		}
3494 	}
3495 	if (rxr->rsc.pktlen <= MHLEN)
3496 		rxr->hn_small_pkts++;
3497 
3498 	m_new->m_pkthdr.rcvif = ifp;
3499 
3500 	if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0))
3501 		do_csum = 0;
3502 
3503 	/* receive side checksum offload */
3504 	if (rxr->rsc.csum_info != NULL) {
3505 		/* IP csum offload */
3506 		if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3507 			m_new->m_pkthdr.csum_flags |=
3508 			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3509 			rxr->hn_csum_ip++;
3510 		}
3511 
3512 		/* TCP/UDP csum offload */
3513 		if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK |
3514 		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3515 			m_new->m_pkthdr.csum_flags |=
3516 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3517 			m_new->m_pkthdr.csum_data = 0xffff;
3518 			if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK)
3519 				rxr->hn_csum_tcp++;
3520 			else
3521 				rxr->hn_csum_udp++;
3522 		}
3523 
3524 		/*
3525 		 * XXX
3526 		 * As of this writing (Oct 28th, 2016), host side will turn
3527 		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3528 		 * the do_lro setting here is actually _not_ accurate.  We
3529 		 * depend on the RSS hash type check to reset do_lro.
3530 		 */
3531 		if ((*(rxr->rsc.csum_info) &
3532 		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3533 		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3534 			do_lro = 1;
3535 	} else {
3536 		hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3537 		if (l3proto == ETHERTYPE_IP) {
3538 			if (l4proto == IPPROTO_TCP) {
3539 				if (do_csum &&
3540 				    (rxr->hn_trust_hcsum &
3541 				     HN_TRUST_HCSUM_TCP)) {
3542 					rxr->hn_csum_trusted++;
3543 					m_new->m_pkthdr.csum_flags |=
3544 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3545 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3546 					m_new->m_pkthdr.csum_data = 0xffff;
3547 				}
3548 				do_lro = 1;
3549 			} else if (l4proto == IPPROTO_UDP) {
3550 				if (do_csum &&
3551 				    (rxr->hn_trust_hcsum &
3552 				     HN_TRUST_HCSUM_UDP)) {
3553 					rxr->hn_csum_trusted++;
3554 					m_new->m_pkthdr.csum_flags |=
3555 					   (CSUM_IP_CHECKED | CSUM_IP_VALID |
3556 					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3557 					m_new->m_pkthdr.csum_data = 0xffff;
3558 				}
3559 			} else if (l4proto != IPPROTO_DONE && do_csum &&
3560 			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3561 				rxr->hn_csum_trusted++;
3562 				m_new->m_pkthdr.csum_flags |=
3563 				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
3564 			}
3565 		}
3566 	}
3567 
3568 	if (rxr->rsc.vlan_info != NULL) {
3569 		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3570 		    NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)),
3571 		    NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)),
3572 		    NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info)));
3573 		m_new->m_flags |= M_VLANTAG;
3574 	}
3575 
3576 	/*
3577 	 * If VF is activated (transparent/non-transparent mode does not
3578 	 * matter here).
3579 	 *
3580 	 * - Disable LRO
3581 	 *
3582 	 *   hn(4) will only receive broadcast packets, multicast packets,
3583 	 *   TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3584 	 *   packet types.
3585 	 *
3586 	 *   For non-transparent, we definitely _cannot_ enable LRO at
3587 	 *   all, since the LRO flush will use hn(4) as the receiving
3588 	 *   interface; i.e. hn_ifp->if_input(hn_ifp, m).
3589 	 */
3590 	if (is_vf)
3591 		do_lro = 0;
3592 
3593 	/*
3594 	 * If VF is activated (transparent/non-transparent mode does not
3595 	 * matter here), do _not_ mess with unsupported hash types or
3596 	 * functions.
3597 	 */
3598 	if (rxr->rsc.hash_info != NULL) {
3599 		rxr->hn_rss_pkts++;
3600 		m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value);
3601 		if (!is_vf)
3602 			hash_type = M_HASHTYPE_OPAQUE_HASH;
3603 		if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) ==
3604 		    NDIS_HASH_FUNCTION_TOEPLITZ) {
3605 			uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK &
3606 			    rxr->hn_mbuf_hash);
3607 
3608 			/*
3609 			 * NOTE:
3610 			 * do_lro is reset if the hash types are not TCP
3611 			 * related.  See the comment in the above csum_flags
3612 			 * setup section.
3613 			 */
3614 			switch (type) {
3615 			case NDIS_HASH_IPV4:
3616 				hash_type = M_HASHTYPE_RSS_IPV4;
3617 				do_lro = 0;
3618 				break;
3619 
3620 			case NDIS_HASH_TCP_IPV4:
3621 				hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3622 				if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3623 					int def_htype = M_HASHTYPE_OPAQUE_HASH;
3624 
3625 					if (is_vf)
3626 						def_htype = M_HASHTYPE_NONE;
3627 
3628 					/*
3629 					 * UDP 4-tuple hash is delivered as
3630 					 * TCP 4-tuple hash.
3631 					 */
3632 					if (l3proto == ETHERTYPE_MAX) {
3633 						hn_rxpkt_proto(m_new,
3634 						    &l3proto, &l4proto);
3635 					}
3636 					if (l3proto == ETHERTYPE_IP) {
3637 						if (l4proto == IPPROTO_UDP &&
3638 						    (rxr->hn_mbuf_hash &
3639 						     NDIS_HASH_UDP_IPV4_X)) {
3640 							hash_type =
3641 							M_HASHTYPE_RSS_UDP_IPV4;
3642 							do_lro = 0;
3643 						} else if (l4proto !=
3644 						    IPPROTO_TCP) {
3645 							hash_type = def_htype;
3646 							do_lro = 0;
3647 						}
3648 					} else {
3649 						hash_type = def_htype;
3650 						do_lro = 0;
3651 					}
3652 				}
3653 				break;
3654 
3655 			case NDIS_HASH_IPV6:
3656 				hash_type = M_HASHTYPE_RSS_IPV6;
3657 				do_lro = 0;
3658 				break;
3659 
3660 			case NDIS_HASH_IPV6_EX:
3661 				hash_type = M_HASHTYPE_RSS_IPV6_EX;
3662 				do_lro = 0;
3663 				break;
3664 
3665 			case NDIS_HASH_TCP_IPV6:
3666 				hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3667 				break;
3668 
3669 			case NDIS_HASH_TCP_IPV6_EX:
3670 				hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3671 				break;
3672 			}
3673 		}
3674 	} else if (!is_vf) {
3675 		m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3676 		hash_type = M_HASHTYPE_OPAQUE;
3677 	}
3678 	M_HASHTYPE_SET(m_new, hash_type);
3679 
3680 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3681 	if (hn_ifp != ifp) {
3682 		const struct ether_header *eh;
3683 
3684 		/*
3685 		 * Non-transparent mode VF is activated.
3686 		 */
3687 
3688 		/*
3689 		 * Allow tapping on hn(4).
3690 		 */
3691 		ETHER_BPF_MTAP(hn_ifp, m_new);
3692 
3693 		/*
3694 		 * Update hn(4)'s stats.
3695 		 */
3696 		if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3697 		if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3698 		/* Checked at the beginning of this function. */
3699 		KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3700 		eh = mtod(m_new, struct ether_header *);
3701 		if (ETHER_IS_MULTICAST(eh->ether_dhost))
3702 			if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3703 	}
3704 	rxr->hn_pkts++;
3705 
3706 	if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) {
3707 #if defined(INET) || defined(INET6)
3708 		struct lro_ctrl *lro = &rxr->hn_lro;
3709 
3710 		if (lro->lro_cnt) {
3711 			rxr->hn_lro_tried++;
3712 			if (hn_lro_rx(lro, m_new) == 0) {
3713 				/* DONE! */
3714 				return 0;
3715 			}
3716 		}
3717 #endif
3718 	}
3719 	if_input(ifp, m_new);
3720 
3721 	return (0);
3722 }
3723 
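/*
 * Interface ioctl handler.  An MTU change requires detaching and
 * reattaching the synthetic parts (NVS/RNDIS) with the interface
 * suspended; flags, capabilities, multicast and media requests are
 * applied to the synthetic path and, when a transparent VF is ready,
 * forwarded to the VF as well.  Anything else goes to ether_ioctl().
 */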
3724 static int
3725 hn_ioctl(if_t ifp, u_long cmd, caddr_t data)
3726 {
3727 	struct hn_softc *sc = if_getsoftc(ifp);
3728 	struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3729 	if_t vf_ifp;
3730 	int mask, error = 0;
3731 	struct ifrsskey *ifrk;
3732 	struct ifrsshash *ifrh;
3733 	uint32_t mtu;
3734 
3735 	switch (cmd) {
3736 	case SIOCSIFMTU:
3737 		if (ifr->ifr_mtu > HN_MTU_MAX) {
3738 			error = EINVAL;
3739 			break;
3740 		}
3741 
3742 		HN_LOCK(sc);
3743 
3744 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3745 			HN_UNLOCK(sc);
3746 			break;
3747 		}
3748 
3749 		if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3750 			/* Can't change MTU */
3751 			HN_UNLOCK(sc);
3752 			error = EOPNOTSUPP;
3753 			break;
3754 		}
3755 
3756 		if (if_getmtu(ifp) == ifr->ifr_mtu) {
3757 			HN_UNLOCK(sc);
3758 			break;
3759 		}
3760 
3761 		if (hn_xpnt_vf_isready(sc)) {
3762 			vf_ifp = sc->hn_vf_ifp;
3763 			ifr_vf = *ifr;
3764 			strlcpy(ifr_vf.ifr_name, if_name(vf_ifp),
3765 			    sizeof(ifr_vf.ifr_name));
3766 			error = ifhwioctl(SIOCSIFMTU, vf_ifp,
3767 			    (caddr_t)&ifr_vf, curthread);
3768 			if (error) {
3769 				HN_UNLOCK(sc);
3770 				if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3771 				    if_name(vf_ifp), ifr->ifr_mtu, error);
3772 				break;
3773 			}
3774 		}
3775 
3776 		/*
3777 		 * Suspend this interface before the synthetic parts
3778 		 * are ripped.
3779 		 */
3780 		hn_suspend(sc);
3781 
3782 		/*
3783 		 * Detach the synthetic parts, i.e. NVS and RNDIS.
3784 		 */
3785 		hn_synth_detach(sc);
3786 
3787 		/*
3788 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3789 		 * with the new MTU setting.
3790 		 */
3791 		error = hn_synth_attach(sc, ifr->ifr_mtu);
3792 		if (error) {
3793 			HN_UNLOCK(sc);
3794 			break;
3795 		}
3796 
3797 		error = hn_rndis_get_mtu(sc, &mtu);
3798 		if (error)
3799 			mtu = ifr->ifr_mtu;
3800 		else if (bootverbose)
3801 			if_printf(ifp, "RNDIS mtu %u\n", mtu);
3802 
3803 		/*
3804 		 * Commit the requested MTU, after the synthetic parts
3805 		 * have been successfully attached.
3806 		 */
3807 		if (mtu >= ifr->ifr_mtu) {
3808 			mtu = ifr->ifr_mtu;
3809 		} else {
3810 			if_printf(ifp, "fixup mtu %d -> %u\n",
3811 			    ifr->ifr_mtu, mtu);
3812 		}
3813 		if_setmtu(ifp, mtu);
3814 
3815 		/*
3816 		 * Synthetic parts' reattach may change the chimney
3817 		 * sending size; update it.
3818 		 */
3819 		if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3820 			hn_set_chim_size(sc, sc->hn_chim_szmax);
3821 
3822 		/*
3823 		 * Make sure that various parameters based on MTU are
3824 		 * still valid, after the MTU change.
3825 		 */
3826 		hn_mtu_change_fixup(sc);
3827 
3828 		/*
3829 		 * All done!  Resume the interface now.
3830 		 */
3831 		hn_resume(sc);
3832 
3833 		if ((sc->hn_flags & HN_FLAG_RXVF) ||
3834 		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3835 			/*
3836 			 * Since we have reattached the NVS part,
3837 			 * change the datapath to VF again, in case it
3838 			 * was lost after the NVS was detached.
3839 			 */
3840 			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3841 		}
3842 
3843 		HN_UNLOCK(sc);
3844 		break;
3845 
3846 	case SIOCSIFFLAGS:
3847 		HN_LOCK(sc);
3848 
3849 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3850 			HN_UNLOCK(sc);
3851 			break;
3852 		}
3853 
3854 		if (hn_xpnt_vf_isready(sc))
3855 			hn_xpnt_vf_saveifflags(sc);
3856 
3857 		if (if_getflags(ifp) & IFF_UP) {
3858 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3859 				/*
3860 				 * Caller might hold a mutex, e.g.
3861 				 * bpf; use busy-wait for the RNDIS
3862 				 * reply.
3863 				 */
3864 				HN_NO_SLEEPING(sc);
3865 				hn_rxfilter_config(sc);
3866 				HN_SLEEPING_OK(sc);
3867 
3868 				if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3869 					error = hn_xpnt_vf_iocsetflags(sc);
3870 			} else {
3871 				hn_init_locked(sc);
3872 			}
3873 		} else {
3874 			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3875 				hn_stop(sc, false);
3876 		}
3877 		sc->hn_if_flags = if_getflags(ifp);
3878 
3879 		HN_UNLOCK(sc);
3880 		break;
3881 
3882 	case SIOCSIFCAP:
3883 		HN_LOCK(sc);
3884 
3885 		if (hn_xpnt_vf_isready(sc)) {
3886 			ifr_vf = *ifr;
3887 			strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp),
3888 			    sizeof(ifr_vf.ifr_name));
3889 			error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3890 			HN_UNLOCK(sc);
3891 			break;
3892 		}
3893 
3894 		/*
3895 		 * Fix up requested capabilities w/ supported capabilities,
3896 		 * since the supported capabilities could have been changed.
3897 		 */
3898 		mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^
3899 		    if_getcapenable(ifp);
3900 
3901 		if (mask & IFCAP_TXCSUM) {
3902 			if_togglecapenable(ifp, IFCAP_TXCSUM);
3903 			if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3904 				if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0);
3905 			else
3906 				if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc));
3907 		}
3908 		if (mask & IFCAP_TXCSUM_IPV6) {
3909 			if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
3910 			if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3911 				if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0);
3912 			else
3913 				if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc));
3914 		}
3915 
3916 		/* TODO: flip RNDIS offload parameters for RXCSUM. */
3917 		if (mask & IFCAP_RXCSUM)
3918 			if_togglecapenable(ifp, IFCAP_RXCSUM);
3919 #ifdef foo
3920 		/* We can't diff IPv6 packets from IPv4 packets on RX path. */
3921 		if (mask & IFCAP_RXCSUM_IPV6)
3922 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
3923 #endif
3924 
3925 		if (mask & IFCAP_LRO)
3926 			if_togglecapenable(ifp, IFCAP_LRO);
3927 
3928 		if (mask & IFCAP_TSO4) {
3929 			if_togglecapenable(ifp, IFCAP_TSO4);
3930 			if (if_getcapenable(ifp) & IFCAP_TSO4)
3931 				if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3932 			else
3933 				if_sethwassistbits(ifp, 0, CSUM_IP_TSO);
3934 		}
3935 		if (mask & IFCAP_TSO6) {
3936 			if_togglecapenable(ifp, IFCAP_TSO6);
3937 			if (if_getcapenable(ifp) & IFCAP_TSO6)
3938 				if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3939 			else
3940 				if_sethwassistbits(ifp, 0, CSUM_IP6_TSO);
3941 		}
3942 
3943 		HN_UNLOCK(sc);
3944 		break;
3945 
3946 	case SIOCADDMULTI:
3947 	case SIOCDELMULTI:
3948 		HN_LOCK(sc);
3949 
3950 		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3951 			HN_UNLOCK(sc);
3952 			break;
3953 		}
3954 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3955 			/*
3956 			 * Multicast uses mutex; use busy-wait for
3957 			 * the RNDIS reply.
3958 			 */
3959 			HN_NO_SLEEPING(sc);
3960 			hn_rxfilter_config(sc);
3961 			HN_SLEEPING_OK(sc);
3962 		}
3963 
3964 		/* XXX vlan(4) style mcast addr maintenance */
3965 		if (hn_xpnt_vf_isready(sc)) {
3966 			int old_if_flags;
3967 
3968 			old_if_flags = if_getflags(sc->hn_vf_ifp);
3969 			hn_xpnt_vf_saveifflags(sc);
3970 
3971 			if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3972 			    ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) &
3973 			     IFF_ALLMULTI))
3974 				error = hn_xpnt_vf_iocsetflags(sc);
3975 		}
3976 
3977 		HN_UNLOCK(sc);
3978 		break;
3979 
3980 	case SIOCSIFMEDIA:
3981 	case SIOCGIFMEDIA:
3982 		HN_LOCK(sc);
3983 		if (hn_xpnt_vf_isready(sc)) {
3984 			/*
3985 			 * SIOCGIFMEDIA expects ifmediareq, so don't
3986 			 * create and pass ifr_vf to the VF here; just
3987 			 * replace the ifr_name.
3988 			 */
3989 			vf_ifp = sc->hn_vf_ifp;
3990 			strlcpy(ifr->ifr_name, if_name(vf_ifp),
3991 			    sizeof(ifr->ifr_name));
3992 			error = ifhwioctl(cmd, vf_ifp, data, curthread);
3993 			/* Restore the ifr_name. */
3994 			strlcpy(ifr->ifr_name, if_name(ifp),
3995 			    sizeof(ifr->ifr_name));
3996 			HN_UNLOCK(sc);
3997 			break;
3998 		}
3999 		HN_UNLOCK(sc);
4000 		error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
4001 		break;
4002 
4003 	case SIOCGIFRSSHASH:
4004 		ifrh = (struct ifrsshash *)data;
4005 		HN_LOCK(sc);
4006 		if (sc->hn_rx_ring_inuse == 1) {
4007 			HN_UNLOCK(sc);
4008 			ifrh->ifrh_func = RSS_FUNC_NONE;
4009 			ifrh->ifrh_types = 0;
4010 			break;
4011 		}
4012 
4013 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4014 			ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
4015 		else
4016 			ifrh->ifrh_func = RSS_FUNC_PRIVATE;
4017 		ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
4018 		HN_UNLOCK(sc);
4019 		break;
4020 
4021 	case SIOCGIFRSSKEY:
4022 		ifrk = (struct ifrsskey *)data;
4023 		HN_LOCK(sc);
4024 		if (sc->hn_rx_ring_inuse == 1) {
4025 			HN_UNLOCK(sc);
4026 			ifrk->ifrk_func = RSS_FUNC_NONE;
4027 			ifrk->ifrk_keylen = 0;
4028 			break;
4029 		}
4030 		if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
4031 			ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
4032 		else
4033 			ifrk->ifrk_func = RSS_FUNC_PRIVATE;
4034 		ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
4035 		memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
4036 		    NDIS_HASH_KEYSIZE_TOEPLITZ);
4037 		HN_UNLOCK(sc);
4038 		break;
4039 
4040 	default:
4041 		error = ether_ioctl(ifp, cmd, data);
4042 		break;
4043 	}
4044 	return (error);
4045 }
4046 
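/*
 * Stop the interface: clear IFF_DRV_RUNNING, disable channel polling,
 * switch the datapath back to synthetic and bring the transparent VF
 * down if one is enabled, then suspend data transfers and clear the
 * OACTIVE state.  Unless we are detaching, keep the RX filter
 * configured while a non-transparent VF is still active.
 */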
4047 static void
4048 hn_stop(struct hn_softc *sc, bool detaching)
4049 {
4050 	if_t ifp = sc->hn_ifp;
4051 	int i;
4052 
4053 	HN_LOCK_ASSERT(sc);
4054 
4055 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4056 	    ("synthetic parts were not attached"));
4057 
4058 	/* Clear RUNNING bit ASAP. */
4059 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
4060 
4061 	/* Disable polling. */
4062 	hn_polling(sc, 0);
4063 
4064 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4065 		KASSERT(sc->hn_vf_ifp != NULL,
4066 		    ("%s: VF is not attached", if_name(ifp)));
4067 
4068 		/* Mark transparent mode VF as disabled. */
4069 		hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4070 
4071 		/*
4072 		 * NOTE:
4073 		 * Datapath setting must happen _before_ bringing
4074 		 * the VF down.
4075 		 */
4076 		hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4077 
4078 		/*
4079 		 * Bring the VF down.
4080 		 */
4081 		hn_xpnt_vf_saveifflags(sc);
4082 		if_setflagbits(ifp, 0, IFF_UP);
4083 		hn_xpnt_vf_iocsetflags(sc);
4084 	}
4085 
4086 	/* Suspend data transfers. */
4087 	hn_suspend_data(sc);
4088 
4089 	/* Clear OACTIVE bit. */
4090 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4091 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4092 		sc->hn_tx_ring[i].hn_oactive = 0;
4093 
4094 	/*
4095 	 * If the non-transparent mode VF is active, make sure
4096 	 * that the RX filter still allows packet reception.
4097 	 */
4098 	if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4099 		hn_rxfilter_config(sc);
4100 }
4101 
4102 static void
4103 hn_init_locked(struct hn_softc *sc)
4104 {
4105 	if_t ifp = sc->hn_ifp;
4106 	int i;
4107 
4108 	HN_LOCK_ASSERT(sc);
4109 
4110 	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4111 		return;
4112 
4113 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
4114 		return;
4115 
4116 	/* Configure RX filter */
4117 	hn_rxfilter_config(sc);
4118 
4119 	/* Clear OACTIVE bit. */
4120 	if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
4121 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4122 		sc->hn_tx_ring[i].hn_oactive = 0;
4123 
4124 	/* Clear TX 'suspended' bit. */
4125 	hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4126 
4127 	if (hn_xpnt_vf_isready(sc)) {
4128 		/* Initialize transparent VF. */
4129 		hn_xpnt_vf_init(sc);
4130 	}
4131 
4132 	/* Everything is ready; unleash! */
4133 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
4134 
4135 	/* Re-enable polling if requested. */
4136 	if (sc->hn_pollhz > 0)
4137 		hn_polling(sc, sc->hn_pollhz);
4138 }
4139 
4140 static void
4141 hn_init(void *xsc)
4142 {
4143 	struct hn_softc *sc = xsc;
4144 
4145 	HN_LOCK(sc);
4146 	hn_init_locked(sc);
4147 	HN_UNLOCK(sc);
4148 }
4149 
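/*
 * Sysctl handler for the LRO aggregation length limit.  The handlers
 * below follow the same pattern: export the current value, return
 * early on error or on a read-only request (req->newptr == NULL),
 * then validate and apply the new value under HN_LOCK.
 *
 * Example usage from userland (assuming unit 0 of the device):
 *   sysctl dev.hn.0.lro_length_lim=65535
 */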
4150 static int
4151 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4152 {
4153 	struct hn_softc *sc = arg1;
4154 	unsigned int lenlim;
4155 	int error;
4156 
4157 	lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4158 	error = sysctl_handle_int(oidp, &lenlim, 0, req);
4159 	if (error || req->newptr == NULL)
4160 		return error;
4161 
4162 	HN_LOCK(sc);
4163 	if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4164 	    lenlim > TCP_LRO_LENGTH_MAX) {
4165 		HN_UNLOCK(sc);
4166 		return EINVAL;
4167 	}
4168 	hn_set_lro_lenlim(sc, lenlim);
4169 	HN_UNLOCK(sc);
4170 
4171 	return 0;
4172 }
4173 
4174 static int
4175 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4176 {
4177 	struct hn_softc *sc = arg1;
4178 	int ackcnt, error, i;
4179 
4180 	/*
4181 	 * lro_ackcnt_lim is the append count limit;
4182 	 * +1 turns it into the aggregation limit.
4183 	 */
4184 	ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4185 	error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4186 	if (error || req->newptr == NULL)
4187 		return error;
4188 
4189 	if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4190 		return EINVAL;
4191 
4192 	/*
4193 	 * Convert aggregation limit back to append
4194 	 * count limit.
4195 	 */
4196 	--ackcnt;
4197 	HN_LOCK(sc);
4198 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4199 		sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4200 	HN_UNLOCK(sc);
4201 	return 0;
4202 }
4203 
4204 static int
4205 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4206 {
4207 	struct hn_softc *sc = arg1;
4208 	int hcsum = arg2;
4209 	int on, error, i;
4210 
4211 	on = 0;
4212 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4213 		on = 1;
4214 
4215 	error = sysctl_handle_int(oidp, &on, 0, req);
4216 	if (error || req->newptr == NULL)
4217 		return error;
4218 
4219 	HN_LOCK(sc);
4220 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4221 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4222 
4223 		if (on)
4224 			rxr->hn_trust_hcsum |= hcsum;
4225 		else
4226 			rxr->hn_trust_hcsum &= ~hcsum;
4227 	}
4228 	HN_UNLOCK(sc);
4229 	return 0;
4230 }
4231 
4232 static int
4233 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4234 {
4235 	struct hn_softc *sc = arg1;
4236 	int chim_size, error;
4237 
4238 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
4239 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
4240 	if (error || req->newptr == NULL)
4241 		return error;
4242 
4243 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4244 		return EINVAL;
4245 
4246 	HN_LOCK(sc);
4247 	hn_set_chim_size(sc, chim_size);
4248 	HN_UNLOCK(sc);
4249 	return 0;
4250 }
4251 
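/*
 * Generic RX ring statistics handler: arg2 is the byte offset of a
 * uint64_t counter inside struct hn_rx_ring.  Reading sums the
 * counter across all RX rings; writing any value resets it to zero
 * on every ring.
 */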
4252 static int
4253 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4254 {
4255 	struct hn_softc *sc = arg1;
4256 	int ofs = arg2, i, error;
4257 	struct hn_rx_ring *rxr;
4258 	uint64_t stat;
4259 
4260 	stat = 0;
4261 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4262 		rxr = &sc->hn_rx_ring[i];
4263 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4264 	}
4265 
4266 	error = sysctl_handle_64(oidp, &stat, 0, req);
4267 	if (error || req->newptr == NULL)
4268 		return error;
4269 
4270 	/* Zero out this stat. */
4271 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4272 		rxr = &sc->hn_rx_ring[i];
4273 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4274 	}
4275 	return 0;
4276 }
4277 
4278 static int
4279 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4280 {
4281 	struct hn_softc *sc = arg1;
4282 	int ofs = arg2, i, error;
4283 	struct hn_rx_ring *rxr;
4284 	u_long stat;
4285 
4286 	stat = 0;
4287 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4288 		rxr = &sc->hn_rx_ring[i];
4289 		stat += *((u_long *)((uint8_t *)rxr + ofs));
4290 	}
4291 
4292 	error = sysctl_handle_long(oidp, &stat, 0, req);
4293 	if (error || req->newptr == NULL)
4294 		return error;
4295 
4296 	/* Zero out this stat. */
4297 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4298 		rxr = &sc->hn_rx_ring[i];
4299 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
4300 	}
4301 	return 0;
4302 }
4303 
4304 static int
4305 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4306 {
4307 	struct hn_softc *sc = arg1;
4308 	int ofs = arg2, i, error;
4309 	struct hn_tx_ring *txr;
4310 	u_long stat;
4311 
4312 	stat = 0;
4313 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4314 		txr = &sc->hn_tx_ring[i];
4315 		stat += *((u_long *)((uint8_t *)txr + ofs));
4316 	}
4317 
4318 	error = sysctl_handle_long(oidp, &stat, 0, req);
4319 	if (error || req->newptr == NULL)
4320 		return error;
4321 
4322 	/* Zero out this stat. */
4323 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4324 		txr = &sc->hn_tx_ring[i];
4325 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
4326 	}
4327 	return 0;
4328 }
4329 
4330 static int
4331 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4332 {
4333 	struct hn_softc *sc = arg1;
4334 	int ofs = arg2, i, error, conf;
4335 	struct hn_tx_ring *txr;
4336 
4337 	txr = &sc->hn_tx_ring[0];
4338 	conf = *((int *)((uint8_t *)txr + ofs));
4339 
4340 	error = sysctl_handle_int(oidp, &conf, 0, req);
4341 	if (error || req->newptr == NULL)
4342 		return error;
4343 
4344 	HN_LOCK(sc);
4345 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4346 		txr = &sc->hn_tx_ring[i];
4347 		*((int *)((uint8_t *)txr + ofs)) = conf;
4348 	}
4349 	HN_UNLOCK(sc);
4350 
4351 	return 0;
4352 }
4353 
4354 static int
4355 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4356 {
4357 	struct hn_softc *sc = arg1;
4358 	int error, size;
4359 
4360 	size = sc->hn_agg_size;
4361 	error = sysctl_handle_int(oidp, &size, 0, req);
4362 	if (error || req->newptr == NULL)
4363 		return (error);
4364 
4365 	HN_LOCK(sc);
4366 	sc->hn_agg_size = size;
4367 	hn_set_txagg(sc);
4368 	HN_UNLOCK(sc);
4369 
4370 	return (0);
4371 }
4372 
4373 static int
4374 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4375 {
4376 	struct hn_softc *sc = arg1;
4377 	int error, pkts;
4378 
4379 	pkts = sc->hn_agg_pkts;
4380 	error = sysctl_handle_int(oidp, &pkts, 0, req);
4381 	if (error || req->newptr == NULL)
4382 		return (error);
4383 
4384 	HN_LOCK(sc);
4385 	sc->hn_agg_pkts = pkts;
4386 	hn_set_txagg(sc);
4387 	HN_UNLOCK(sc);
4388 
4389 	return (0);
4390 }
4391 
4392 static int
4393 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4394 {
4395 	struct hn_softc *sc = arg1;
4396 	int pkts;
4397 
4398 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4399 	return (sysctl_handle_int(oidp, &pkts, 0, req));
4400 }
4401 
4402 static int
4403 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4404 {
4405 	struct hn_softc *sc = arg1;
4406 	int align;
4407 
4408 	align = sc->hn_tx_ring[0].hn_agg_align;
4409 	return (sysctl_handle_int(oidp, &align, 0, req));
4410 }
4411 
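/*
 * Channel polling control: a poll rate of 0 returns the channel to
 * interrupt-driven operation, any other value enables polling at the
 * given frequency.  hn_polling() applies the setting to the primary
 * channel and to all sub-channels currently in use.
 */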
4412 static void
4413 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4414 {
4415 	if (pollhz == 0)
4416 		vmbus_chan_poll_disable(chan);
4417 	else
4418 		vmbus_chan_poll_enable(chan, pollhz);
4419 }
4420 
4421 static void
4422 hn_polling(struct hn_softc *sc, u_int pollhz)
4423 {
4424 	int nsubch = sc->hn_rx_ring_inuse - 1;
4425 
4426 	HN_LOCK_ASSERT(sc);
4427 
4428 	if (nsubch > 0) {
4429 		struct vmbus_channel **subch;
4430 		int i;
4431 
4432 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4433 		for (i = 0; i < nsubch; ++i)
4434 			hn_chan_polling(subch[i], pollhz);
4435 		vmbus_subchan_rel(subch, nsubch);
4436 	}
4437 	hn_chan_polling(sc->hn_prichan, pollhz);
4438 }
4439 
4440 static int
4441 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4442 {
4443 	struct hn_softc *sc = arg1;
4444 	int pollhz, error;
4445 
4446 	pollhz = sc->hn_pollhz;
4447 	error = sysctl_handle_int(oidp, &pollhz, 0, req);
4448 	if (error || req->newptr == NULL)
4449 		return (error);
4450 
4451 	if (pollhz != 0 &&
4452 	    (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4453 		return (EINVAL);
4454 
4455 	HN_LOCK(sc);
4456 	if (sc->hn_pollhz != pollhz) {
4457 		sc->hn_pollhz = pollhz;
4458 		if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) &&
4459 		    (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4460 			hn_polling(sc, sc->hn_pollhz);
4461 	}
4462 	HN_UNLOCK(sc);
4463 
4464 	return (0);
4465 }
4466 
4467 static int
4468 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4469 {
4470 	struct hn_softc *sc = arg1;
4471 	char verstr[16];
4472 
4473 	snprintf(verstr, sizeof(verstr), "%u.%u",
4474 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4475 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4476 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4477 }
4478 
4479 static int
4480 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4481 {
4482 	struct hn_softc *sc = arg1;
4483 	char caps_str[128];
4484 	uint32_t caps;
4485 
4486 	HN_LOCK(sc);
4487 	caps = sc->hn_caps;
4488 	HN_UNLOCK(sc);
4489 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4490 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4491 }
4492 
4493 static int
4494 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4495 {
4496 	struct hn_softc *sc = arg1;
4497 	char assist_str[128];
4498 	uint32_t hwassist;
4499 
4500 	HN_LOCK(sc);
4501 	hwassist = if_gethwassist(sc->hn_ifp);
4502 	HN_UNLOCK(sc);
4503 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4504 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4505 }
4506 
4507 static int
4508 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4509 {
4510 	struct hn_softc *sc = arg1;
4511 	char filter_str[128];
4512 	uint32_t filter;
4513 
4514 	HN_LOCK(sc);
4515 	filter = sc->hn_rx_filter;
4516 	HN_UNLOCK(sc);
4517 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
4518 	    NDIS_PACKET_TYPES);
4519 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4520 }
4521 
4522 static int
4523 hn_rsc_sysctl(SYSCTL_HANDLER_ARGS)
4524 {
4525 	struct hn_softc *sc = arg1;
4526 	uint32_t mtu;
4527 	int error;
4528 	HN_LOCK(sc);
4529 	error = hn_rndis_get_mtu(sc, &mtu);
4530 	if (error) {
4531 		if_printf(sc->hn_ifp, "failed to get mtu\n");
4532 		goto back;
4533 	}
4534 	error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4535 	if (error || req->newptr == NULL)
4536 		goto back;
4537 
4538 	error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl));
4539 	if (error)
4540 		goto back;
4541 	error = hn_rndis_reconf_offload(sc, mtu);
4542 back:
4543 	HN_UNLOCK(sc);
4544 	return (error);
4545 }
4546 #ifndef RSS
4547 
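/*
 * RSS key sysctl: the key may always be read, but it can only be
 * changed while it is not being synchronized with a VF.  A new key
 * triggers an RSS reconfiguration only when more than one RX ring is
 * in use; otherwise it is simply saved for later use.
 */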
4548 static int
4549 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4550 {
4551 	struct hn_softc *sc = arg1;
4552 	int error;
4553 
4554 	HN_LOCK(sc);
4555 
4556 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4557 	if (error || req->newptr == NULL)
4558 		goto back;
4559 
4560 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
4561 	    (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4562 		/*
4563 		 * RSS key is synchronized w/ the VF's; don't allow users
4564 		 * to change it.
4565 		 */
4566 		error = EBUSY;
4567 		goto back;
4568 	}
4569 
4570 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4571 	if (error)
4572 		goto back;
4573 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4574 
4575 	if (sc->hn_rx_ring_inuse > 1) {
4576 		error = hn_rss_reconfig(sc);
4577 	} else {
4578 		/* Not RSS capable, at least for now; just save the RSS key. */
4579 		error = 0;
4580 	}
4581 back:
4582 	HN_UNLOCK(sc);
4583 	return (error);
4584 }
4585 
4586 static int
4587 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4588 {
4589 	struct hn_softc *sc = arg1;
4590 	int error;
4591 
4592 	HN_LOCK(sc);
4593 
4594 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4595 	if (error || req->newptr == NULL)
4596 		goto back;
4597 
4598 	/*
4599 	 * Don't allow RSS indirect table changes if this interface is not
4600 	 * currently RSS capable.
4601 	 */
4602 	if (sc->hn_rx_ring_inuse == 1) {
4603 		error = EOPNOTSUPP;
4604 		goto back;
4605 	}
4606 
4607 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4608 	if (error)
4609 		goto back;
4610 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4611 
4612 	hn_rss_ind_fixup(sc);
4613 	error = hn_rss_reconfig(sc);
4614 back:
4615 	HN_UNLOCK(sc);
4616 	return (error);
4617 }
4618 
4619 #endif	/* !RSS */
4620 
4621 static int
4622 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4623 {
4624 	struct hn_softc *sc = arg1;
4625 	char hash_str[128];
4626 	uint32_t hash;
4627 
4628 	HN_LOCK(sc);
4629 	hash = sc->hn_rss_hash;
4630 	HN_UNLOCK(sc);
4631 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4632 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4633 }
4634 
4635 static int
4636 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4637 {
4638 	struct hn_softc *sc = arg1;
4639 	char hash_str[128];
4640 	uint32_t hash;
4641 
4642 	HN_LOCK(sc);
4643 	hash = sc->hn_rss_hcap;
4644 	HN_UNLOCK(sc);
4645 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4646 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4647 }
4648 
4649 static int
4650 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4651 {
4652 	struct hn_softc *sc = arg1;
4653 	char hash_str[128];
4654 	uint32_t hash;
4655 
4656 	HN_LOCK(sc);
4657 	hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4658 	HN_UNLOCK(sc);
4659 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4660 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4661 }
4662 
4663 static int
4664 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4665 {
4666 	struct hn_softc *sc = arg1;
4667 	char vf_name[IFNAMSIZ + 1];
4668 	if_t vf_ifp;
4669 
4670 	HN_LOCK(sc);
4671 	vf_name[0] = '\0';
4672 	vf_ifp = sc->hn_vf_ifp;
4673 	if (vf_ifp != NULL)
4674 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4675 	HN_UNLOCK(sc);
4676 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4677 }
4678 
4679 static int
4680 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4681 {
4682 	struct hn_softc *sc = arg1;
4683 	char vf_name[IFNAMSIZ + 1];
4684 	if_t vf_ifp;
4685 
4686 	HN_LOCK(sc);
4687 	vf_name[0] = '\0';
4688 	vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4689 	if (vf_ifp != NULL)
4690 		snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp));
4691 	HN_UNLOCK(sc);
4692 	return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4693 }
4694 
4695 static int
4696 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4697 {
4698 	struct rm_priotracker pt;
4699 	struct sbuf *sb;
4700 	int error, i;
4701 	bool first;
4702 
4703 	error = sysctl_wire_old_buffer(req, 0);
4704 	if (error != 0)
4705 		return (error);
4706 
4707 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4708 	if (sb == NULL)
4709 		return (ENOMEM);
4710 
4711 	rm_rlock(&hn_vfmap_lock, &pt);
4712 
4713 	first = true;
4714 	for (i = 0; i < hn_vfmap_size; ++i) {
4715 		struct epoch_tracker et;
4716 		if_t ifp;
4717 
4718 		if (hn_vfmap[i] == NULL)
4719 			continue;
4720 
4721 		NET_EPOCH_ENTER(et);
4722 		ifp = ifnet_byindex(i);
4723 		if (ifp != NULL) {
4724 			if (first)
4725 				sbuf_printf(sb, "%s", if_name(ifp));
4726 			else
4727 				sbuf_printf(sb, " %s", if_name(ifp));
4728 			first = false;
4729 		}
4730 		NET_EPOCH_EXIT(et);
4731 	}
4732 
4733 	rm_runlock(&hn_vfmap_lock, &pt);
4734 
4735 	error = sbuf_finish(sb);
4736 	sbuf_delete(sb);
4737 	return (error);
4738 }
4739 
4740 static int
4741 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4742 {
4743 	struct rm_priotracker pt;
4744 	struct sbuf *sb;
4745 	int error, i;
4746 	bool first;
4747 
4748 	error = sysctl_wire_old_buffer(req, 0);
4749 	if (error != 0)
4750 		return (error);
4751 
4752 	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4753 	if (sb == NULL)
4754 		return (ENOMEM);
4755 
4756 	rm_rlock(&hn_vfmap_lock, &pt);
4757 
4758 	first = true;
4759 	for (i = 0; i < hn_vfmap_size; ++i) {
4760 		struct epoch_tracker et;
4761 		if_t ifp, hn_ifp;
4762 
4763 		hn_ifp = hn_vfmap[i];
4764 		if (hn_ifp == NULL)
4765 			continue;
4766 
4767 		NET_EPOCH_ENTER(et);
4768 		ifp = ifnet_byindex(i);
4769 		if (ifp != NULL) {
4770 			if (first) {
4771 				sbuf_printf(sb, "%s:%s", if_name(ifp),
4772 				    if_name(hn_ifp));
4773 			} else {
4774 				sbuf_printf(sb, " %s:%s", if_name(ifp),
4775 				    if_name(hn_ifp));
4776 			}
4777 			first = false;
4778 		}
4779 		NET_EPOCH_EXIT(et);
4780 	}
4781 
4782 	rm_runlock(&hn_vfmap_lock, &pt);
4783 
4784 	error = sbuf_finish(sb);
4785 	sbuf_delete(sb);
4786 	return (error);
4787 }
4788 
4789 static int
4790 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4791 {
4792 	struct hn_softc *sc = arg1;
4793 	int error, onoff = 0;
4794 
4795 	if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4796 		onoff = 1;
4797 	error = sysctl_handle_int(oidp, &onoff, 0, req);
4798 	if (error || req->newptr == NULL)
4799 		return (error);
4800 
4801 	HN_LOCK(sc);
4802 	/* NOTE: hn_vf_lock for hn_transmit() */
4803 	rm_wlock(&sc->hn_vf_lock);
4804 	if (onoff)
4805 		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4806 	else
4807 		sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4808 	rm_wunlock(&sc->hn_vf_lock);
4809 	HN_UNLOCK(sc);
4810 
4811 	return (0);
4812 }
4813 
4814 static int
4815 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4816 {
4817 	struct hn_softc *sc = arg1;
4818 	int enabled = 0;
4819 
4820 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4821 		enabled = 1;
4822 	return (sysctl_handle_int(oidp, &enabled, 0, req));
4823 }
4824 
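/*
 * Sanity-check an IPv4 packet starting at mbuf offset 'hoff': the IP
 * header (and the TCP/UDP header for those protocols) must reside in
 * the first mbuf, the advertised lengths must be consistent, and the
 * packet must not be a fragment.  Returns the IP protocol number if
 * the packet looks sane, IPPROTO_DONE otherwise.
 */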
4825 static int
4826 hn_check_iplen(const struct mbuf *m, int hoff)
4827 {
4828 	const struct ip *ip;
4829 	int len, iphlen, iplen;
4830 	const struct tcphdr *th;
4831 	int thoff;				/* TCP data offset */
4832 
4833 	len = hoff + sizeof(struct ip);
4834 
4835 	/* The packet must be at least the size of an IP header. */
4836 	if (m->m_pkthdr.len < len)
4837 		return IPPROTO_DONE;
4838 
4839 	/* The fixed IP header must reside completely in the first mbuf. */
4840 	if (m->m_len < len)
4841 		return IPPROTO_DONE;
4842 
4843 	ip = mtodo(m, hoff);
4844 
4845 	/* Bound check the packet's stated IP header length. */
4846 	iphlen = ip->ip_hl << 2;
4847 	if (iphlen < sizeof(struct ip))		/* minimum header length */
4848 		return IPPROTO_DONE;
4849 
4850 	/* The full IP header must reside completely in the one mbuf. */
4851 	if (m->m_len < hoff + iphlen)
4852 		return IPPROTO_DONE;
4853 
4854 	iplen = ntohs(ip->ip_len);
4855 
4856 	/*
4857 	 * Check that the amount of data in the buffers is at
4858 	 * least as much as the IP header would have us expect.
4859 	 */
4860 	if (m->m_pkthdr.len < hoff + iplen)
4861 		return IPPROTO_DONE;
4862 
4863 	/*
4864 	 * Ignore IP fragments.
4865 	 */
4866 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4867 		return IPPROTO_DONE;
4868 
4869 	/*
4870 	 * The TCP/IP or UDP/IP header must be entirely contained within
4871 	 * the first fragment of a packet.
4872 	 */
4873 	switch (ip->ip_p) {
4874 	case IPPROTO_TCP:
4875 		if (iplen < iphlen + sizeof(struct tcphdr))
4876 			return IPPROTO_DONE;
4877 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4878 			return IPPROTO_DONE;
4879 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4880 		thoff = th->th_off << 2;
4881 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4882 			return IPPROTO_DONE;
4883 		if (m->m_len < hoff + iphlen + thoff)
4884 			return IPPROTO_DONE;
4885 		break;
4886 	case IPPROTO_UDP:
4887 		if (iplen < iphlen + sizeof(struct udphdr))
4888 			return IPPROTO_DONE;
4889 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4890 			return IPPROTO_DONE;
4891 		break;
4892 	default:
4893 		if (iplen < iphlen)
4894 			return IPPROTO_DONE;
4895 		break;
4896 	}
4897 	return ip->ip_p;
4898 }
4899 
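/*
 * Determine the L3 ethertype (looking through an 802.1Q header when
 * present) and, for IPv4, the validated L4 protocol via
 * hn_check_iplen(); non-IPv4 ethertypes report IPPROTO_DONE.
 */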
4900 static void
4901 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4902 {
4903 	const struct ether_header *eh;
4904 	uint16_t etype;
4905 	int hoff;
4906 
4907 	hoff = sizeof(*eh);
4908 	/* Checked at the beginning of this function. */
4909 	KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4910 
4911 	eh = mtod(m_new, const struct ether_header *);
4912 	etype = ntohs(eh->ether_type);
4913 	if (etype == ETHERTYPE_VLAN) {
4914 		const struct ether_vlan_header *evl;
4915 
4916 		hoff = sizeof(*evl);
4917 		if (m_new->m_len < hoff)
4918 			return;
4919 		evl = mtod(m_new, const struct ether_vlan_header *);
4920 		etype = ntohs(evl->evl_proto);
4921 	}
4922 	*l3proto = etype;
4923 
4924 	if (etype == ETHERTYPE_IP)
4925 		*l4proto = hn_check_iplen(m_new, hoff);
4926 	else
4927 		*l4proto = IPPROTO_DONE;
4928 }
4929 
4930 static int
4931 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4932 {
4933 	struct sysctl_oid_list *child;
4934 	struct sysctl_ctx_list *ctx;
4935 	device_t dev = sc->hn_dev;
4936 #if defined(INET) || defined(INET6)
4937 	int lroent_cnt;
4938 #endif
4939 	int i;
4940 
4941 	/*
4942 	 * Create RXBUF for reception.
4943 	 *
4944 	 * NOTE:
4945 	 * - It is shared by all channels.
4946 	 * - A large enough buffer is allocated; certain versions of the
4947 	 *   NVS may further limit the usable space.
4948 	 */
4949 	sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
4950 	    0ul, ~0ul, PAGE_SIZE, 0);
4951 	if (sc->hn_rxbuf == NULL) {
4952 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4953 		return (ENOMEM);
4954 	}
4955 
4956 	sc->hn_rx_ring_cnt = ring_cnt;
4957 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4958 
4959 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4960 	    M_DEVBUF, M_WAITOK | M_ZERO);
4961 
4962 #if defined(INET) || defined(INET6)
4963 	lroent_cnt = hn_lro_entry_count;
4964 	if (lroent_cnt < TCP_LRO_ENTRIES)
4965 		lroent_cnt = TCP_LRO_ENTRIES;
4966 	if (bootverbose)
4967 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4968 #endif	/* INET || INET6 */
4969 
4970 	ctx = device_get_sysctl_ctx(dev);
4971 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4972 
4973 	/* Create dev.hn.UNIT.rx sysctl tree */
4974 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4975 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4976 
4977 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4978 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4979 
4980 		rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF,
4981 		    M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0);
4982 		if (rxr->hn_br == NULL) {
4983 			device_printf(dev, "allocate bufring failed\n");
4984 			return (ENOMEM);
4985 		}
4986 
4987 		if (hn_trust_hosttcp)
4988 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4989 		if (hn_trust_hostudp)
4990 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4991 		if (hn_trust_hostip)
4992 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4993 		rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4994 		rxr->hn_ifp = sc->hn_ifp;
4995 		if (i < sc->hn_tx_ring_cnt)
4996 			rxr->hn_txr = &sc->hn_tx_ring[i];
4997 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4998 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4999 		rxr->hn_rx_idx = i;
5000 		rxr->hn_rxbuf = sc->hn_rxbuf;
5001 
5002 		/*
5003 		 * Initialize LRO.
5004 		 */
5005 #if defined(INET) || defined(INET6)
5006 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
5007 		    hn_lro_mbufq_depth);
5008 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
5009 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
5010 #endif	/* INET || INET6 */
5011 
5012 		if (sc->hn_rx_sysctl_tree != NULL) {
5013 			char name[16];
5014 
5015 			/*
5016 			 * Create per RX ring sysctl tree:
5017 			 * dev.hn.UNIT.rx.RINGID
5018 			 */
5019 			snprintf(name, sizeof(name), "%d", i);
5020 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
5021 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
5022 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5023 
5024 			if (rxr->hn_rx_sysctl_tree != NULL) {
5025 				SYSCTL_ADD_ULONG(ctx,
5026 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5027 				    OID_AUTO, "packets",
5028 				    CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts,
5029 				    "# of packets received");
5030 				SYSCTL_ADD_ULONG(ctx,
5031 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5032 				    OID_AUTO, "rss_pkts",
5033 				    CTLFLAG_RW | CTLFLAG_STATS,
5034 				    &rxr->hn_rss_pkts,
5035 				    "# of packets w/ RSS info received");
5036 				SYSCTL_ADD_ULONG(ctx,
5037 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5038 				    OID_AUTO, "rsc_pkts",
5039 				    CTLFLAG_RW | CTLFLAG_STATS,
5040 				    &rxr->hn_rsc_pkts,
5041 				    "# of RSC packets received");
5042 				SYSCTL_ADD_ULONG(ctx,
5043 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5044 				    OID_AUTO, "rsc_drop",
5045 				    CTLFLAG_RW | CTLFLAG_STATS,
5046 				    &rxr->hn_rsc_drop,
5047 				    "# of RSC fragments dropped");
5048 				SYSCTL_ADD_INT(ctx,
5049 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5050 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5051 				    &rxr->hn_pktbuf_len, 0,
5052 				    "Temporary channel packet buffer length");
5053 			}
5054 		}
5055 	}
5056 
5057 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5058 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5059 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5060 	    hn_rx_stat_u64_sysctl,
5061 	    "LU", "LRO queued");
5062 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5063 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5064 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5065 	    hn_rx_stat_u64_sysctl,
5066 	    "LU", "LRO flushed");
5067 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5068 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5069 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
5070 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5071 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5072 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5073 	    hn_lro_lenlim_sysctl, "IU",
5074 	    "Max # of data bytes to be aggregated by LRO");
5075 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5076 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5077 	    hn_lro_ackcnt_sysctl, "I",
5078 	    "Max # of ACKs to be aggregated by LRO");
5079 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5080 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5081 	    hn_trust_hcsum_sysctl, "I",
5082 	    "Trust tcp segment verification on host side, "
5083 	    "when csum info is missing");
5084 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5085 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5086 	    hn_trust_hcsum_sysctl, "I",
5087 	    "Trust udp datagram verification on host side, "
5088 	    "when csum info is missing");
5089 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5090 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5091 	    hn_trust_hcsum_sysctl, "I",
5092 	    "Trust ip packet verification on host side, "
5093 	    "when csum info is missing");
5094 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5095 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5096 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
5097 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5098 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5099 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5100 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
5101 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5102 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5103 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5104 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
5105 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5106 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5107 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5108 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
5109 	    hn_rx_stat_ulong_sysctl, "LU",
5110 	    "# of packets that we trust host's csum verification");
5111 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5112 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5113 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
5114 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5115 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5116 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5117 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
5118 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5119 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5120 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5121 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5122 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5123 
5124 	return (0);
5125 }
5126 
5127 static void
5128 hn_destroy_rx_data(struct hn_softc *sc)
5129 {
5130 	int i;
5131 
5132 	if (sc->hn_rxbuf != NULL) {
5133 		if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5134 			contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF);
5135 		else
5136 			device_printf(sc->hn_dev, "RXBUF is referenced\n");
5137 		sc->hn_rxbuf = NULL;
5138 	}
5139 
5140 	if (sc->hn_rx_ring_cnt == 0)
5141 		return;
5142 
5143 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5144 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5145 
5146 		if (rxr->hn_br == NULL)
5147 			continue;
5148 		if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5149 			contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE,
5150 			    M_DEVBUF);
5151 		} else {
5152 			device_printf(sc->hn_dev,
5153 			    "%dth channel bufring is referenced", i);
5154 		}
5155 		rxr->hn_br = NULL;
5156 
5157 #if defined(INET) || defined(INET6)
5158 		tcp_lro_free(&rxr->hn_lro);
5159 #endif
5160 		free(rxr->hn_pktbuf, M_DEVBUF);
5161 	}
5162 	free(sc->hn_rx_ring, M_DEVBUF);
5163 	sc->hn_rx_ring = NULL;
5164 
5165 	sc->hn_rx_ring_cnt = 0;
5166 	sc->hn_rx_ring_inuse = 0;
5167 }
5168 
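/*
 * Set up one TX ring: allocate the txdesc array and its free list
 * (buf_ring or SLIST), pick the transmit taskqueue, create the DMA
 * tags, pre-allocate and load an RNDIS packet message plus a data DMA
 * map for every descriptor, and export the per-ring sysctls.
 */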
5169 static int
5170 hn_tx_ring_create(struct hn_softc *sc, int id)
5171 {
5172 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5173 	device_t dev = sc->hn_dev;
5174 	bus_dma_tag_t parent_dtag;
5175 	int error, i;
5176 
5177 	txr->hn_sc = sc;
5178 	txr->hn_tx_idx = id;
5179 
5180 #ifndef HN_USE_TXDESC_BUFRING
5181 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5182 #endif
5183 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5184 
5185 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5186 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5187 	    M_DEVBUF, M_WAITOK | M_ZERO);
5188 #ifndef HN_USE_TXDESC_BUFRING
5189 	SLIST_INIT(&txr->hn_txlist);
5190 #else
5191 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5192 	    M_WAITOK, &txr->hn_tx_lock);
5193 #endif
5194 
5195 	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5196 		txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5197 		    device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5198 	} else {
5199 		txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5200 	}
5201 
5202 #ifdef HN_IFSTART_SUPPORT
5203 	if (hn_use_if_start) {
5204 		txr->hn_txeof = hn_start_txeof;
5205 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5206 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5207 	} else
5208 #endif
5209 	{
5210 		int br_depth;
5211 
5212 		txr->hn_txeof = hn_xmit_txeof;
5213 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5214 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5215 
5216 		br_depth = hn_get_txswq_depth(txr);
5217 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5218 		    M_WAITOK, &txr->hn_tx_lock);
5219 	}
5220 
5221 	txr->hn_direct_tx_size = hn_direct_tx_size;
5222 
5223 	/*
5224 	 * Always schedule transmission instead of trying to do direct
5225 	 * transmission.  This one gives the best performance so far.
5226 	 */
5227 	txr->hn_sched_tx = 1;
5228 
5229 	parent_dtag = bus_get_dma_tag(dev);
5230 
5231 	/* DMA tag for RNDIS packet messages. */
5232 	error = bus_dma_tag_create(parent_dtag, /* parent */
5233 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
5234 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
5235 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5236 	    BUS_SPACE_MAXADDR,		/* highaddr */
5237 	    NULL, NULL,			/* filter, filterarg */
5238 	    HN_RNDIS_PKT_LEN,		/* maxsize */
5239 	    1,				/* nsegments */
5240 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
5241 	    0,				/* flags */
5242 	    NULL,			/* lockfunc */
5243 	    NULL,			/* lockfuncarg */
5244 	    &txr->hn_tx_rndis_dtag);
5245 	if (error) {
5246 		device_printf(dev, "failed to create rndis dmatag\n");
5247 		return error;
5248 	}
5249 
5250 	/* DMA tag for data. */
5251 	error = bus_dma_tag_create(parent_dtag, /* parent */
5252 	    1,				/* alignment */
5253 	    HN_TX_DATA_BOUNDARY,	/* boundary */
5254 	    BUS_SPACE_MAXADDR,		/* lowaddr */
5255 	    BUS_SPACE_MAXADDR,		/* highaddr */
5256 	    NULL, NULL,			/* filter, filterarg */
5257 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
5258 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
5259 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
5260 	    0,				/* flags */
5261 	    NULL,			/* lockfunc */
5262 	    NULL,			/* lockfuncarg */
5263 	    &txr->hn_tx_data_dtag);
5264 	if (error) {
5265 		device_printf(dev, "failed to create data dmatag\n");
5266 		return error;
5267 	}
5268 
5269 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5270 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
5271 
5272 		txd->txr = txr;
5273 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5274 		STAILQ_INIT(&txd->agg_list);
5275 
5276 		/*
5277 		 * Allocate and load RNDIS packet message.
5278 		 */
5279 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5280 		    (void **)&txd->rndis_pkt,
5281 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5282 		    &txd->rndis_pkt_dmap);
5283 		if (error) {
5284 			device_printf(dev,
5285 			    "failed to allocate rndis_packet_msg, %d\n", i);
5286 			return error;
5287 		}
5288 
5289 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5290 		    txd->rndis_pkt_dmap,
5291 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5292 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5293 		    BUS_DMA_NOWAIT);
5294 		if (error) {
5295 			device_printf(dev,
5296 			    "failed to load rndis_packet_msg, %d\n", i);
5297 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5298 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5299 			return error;
5300 		}
5301 
5302 		/* DMA map for TX data. */
5303 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5304 		    &txd->data_dmap);
5305 		if (error) {
5306 			device_printf(dev,
5307 			    "failed to allocate tx data dmamap\n");
5308 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5309 			    txd->rndis_pkt_dmap);
5310 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
5311 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
5312 			return error;
5313 		}
5314 
5315 		/* All set, put it to list */
5316 		txd->flags |= HN_TXD_FLAG_ONLIST;
5317 #ifndef HN_USE_TXDESC_BUFRING
5318 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5319 #else
5320 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
5321 #endif
5322 	}
5323 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5324 
5325 	if (sc->hn_tx_sysctl_tree != NULL) {
5326 		struct sysctl_oid_list *child;
5327 		struct sysctl_ctx_list *ctx;
5328 		char name[16];
5329 
5330 		/*
5331 		 * Create per TX ring sysctl tree:
5332 		 * dev.hn.UNIT.tx.RINGID
5333 		 */
5334 		ctx = device_get_sysctl_ctx(dev);
5335 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5336 
5337 		snprintf(name, sizeof(name), "%d", id);
5338 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5339 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5340 
5341 		if (txr->hn_tx_sysctl_tree != NULL) {
5342 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5343 
5344 #ifdef HN_DEBUG
5345 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5346 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5347 			    "# of available TX descs");
5348 #endif
5349 #ifdef HN_IFSTART_SUPPORT
5350 			if (!hn_use_if_start)
5351 #endif
5352 			{
5353 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5354 				    CTLFLAG_RD, &txr->hn_oactive, 0,
5355 				    "over active");
5356 			}
5357 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5358 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts,
5359 			    "# of packets transmitted");
5360 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5361 			    CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends,
5362 			    "# of sends");
5363 		}
5364 	}
5365 
5366 	return 0;
5367 }
5368 
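/*
 * Tear down the per-descriptor busdma resources: unload and free the
 * RNDIS packet message buffer, and destroy the mbuf data DMA map.
 */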
5369 static void
5370 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5371 {
5372 	struct hn_tx_ring *txr = txd->txr;
5373 
5374 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
5375 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5376 
5377 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5378 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5379 	    txd->rndis_pkt_dmap);
5380 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5381 }
5382 
5383 static void
5384 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5385 {
5386 
5387 	KASSERT(txd->refs == 0 || txd->refs == 1,
5388 	    ("invalid txd refs %d", txd->refs));
5389 
5390 	/* Aggregated txds will be freed by their aggregating txd. */
5391 	if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5392 		int freed __diagused;
5393 
5394 		freed = hn_txdesc_put(txr, txd);
5395 		KASSERT(freed, ("can't free txdesc"));
5396 	}
5397 }
5398 
5399 static void
5400 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5401 {
5402 	int i;
5403 
5404 	if (txr->hn_txdesc == NULL)
5405 		return;
5406 
5407 	/*
5408 	 * NOTE:
5409 	 * Because the freeing of aggregated txds will be deferred
5410 	 * to the aggregating txd, two passes are used here:
5411 	 * - The first pass GCes any pending txds.  This GC is necessary,
5412 	 *   since if the channels are revoked, the hypervisor will not
5413 	 *   deliver send-done for all pending txds.
5414 	 * - The second pass frees the busdma resources, i.e. after all
5415 	 *   txds have been freed.
5416 	 */
5417 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5418 		hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5419 	for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5420 		hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5421 
5422 	if (txr->hn_tx_data_dtag != NULL)
5423 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5424 	if (txr->hn_tx_rndis_dtag != NULL)
5425 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5426 
5427 #ifdef HN_USE_TXDESC_BUFRING
5428 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5429 #endif
5430 
5431 	free(txr->hn_txdesc, M_DEVBUF);
5432 	txr->hn_txdesc = NULL;
5433 
5434 	if (txr->hn_mbuf_br != NULL)
5435 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5436 
5437 #ifndef HN_USE_TXDESC_BUFRING
5438 	mtx_destroy(&txr->hn_txlist_spin);
5439 #endif
5440 	mtx_destroy(&txr->hn_tx_lock);
5441 }
5442 
5443 static int
5444 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5445 {
5446 	struct sysctl_oid_list *child;
5447 	struct sysctl_ctx_list *ctx;
5448 	int i;
5449 
5450 	/*
5451 	 * Create TXBUF for chimney sending.
5452 	 *
5453 	 * NOTE: It is shared by all channels.
5454 	 */
5455 	sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO,
5456 	    0ul, ~0ul, PAGE_SIZE, 0);
5457 	if (sc->hn_chim == NULL) {
5458 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
5459 		return (ENOMEM);
5460 	}
5461 
5462 	sc->hn_tx_ring_cnt = ring_cnt;
5463 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5464 
5465 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5466 	    M_DEVBUF, M_WAITOK | M_ZERO);
5467 
5468 	ctx = device_get_sysctl_ctx(sc->hn_dev);
5469 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5470 
5471 	/* Create dev.hn.UNIT.tx sysctl tree */
5472 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5473 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5474 
5475 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5476 		int error;
5477 
5478 		error = hn_tx_ring_create(sc, i);
5479 		if (error)
5480 			return error;
5481 	}
5482 
5483 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5484 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5485 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
5486 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5487 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5488 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5489 	    __offsetof(struct hn_tx_ring, hn_send_failed),
5490 	    hn_tx_stat_ulong_sysctl, "LU", "# of Hyper-V send failures");
5491 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5492 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5493 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
5494 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5495 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5496 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5497 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
5498 	    hn_tx_stat_ulong_sysctl, "LU",
5499 	    "# of packet transmission aggregation flush failure");
5500 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5501 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5502 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5503 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5504 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5505 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5506 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
5507 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5508 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5509 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc,
5510 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5511 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5512 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5513 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5514 	    "# of total TX descs");
5515 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5516 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5517 	    "Chimney send packet size upper boundary");
5518 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5519 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5520 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5521 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5522 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5523 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5524 	    hn_tx_conf_int_sysctl, "I",
5525 	    "Size of the packet for direct transmission");
5526 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5527 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5528 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
5529 	    hn_tx_conf_int_sysctl, "I",
5530 	    "Always schedule transmission "
5531 	    "instead of doing direct transmission");
5532 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5533 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5534 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5535 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5536 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5537 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5538 	    "Applied packet transmission aggregation size");
5539 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5540 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5541 	    hn_txagg_pktmax_sysctl, "I",
5542 	    "Applied packet transmission aggregation packets");
5543 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5544 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5545 	    hn_txagg_align_sysctl, "I",
5546 	    "Applied packet transmission aggregation alignment");
5547 
5548 	return 0;
5549 }
5550 
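/*
 * Propagate the chimney send size limit to all TX rings.  Packets that
 * fit within this limit can be copied into the shared chimney TXBUF
 * instead of being transmitted via gather DMA.
 */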
5551 static void
5552 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5553 {
5554 	int i;
5555 
5556 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5557 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
5558 }
5559 
5560 static void
5561 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5562 {
5563 	if_t ifp = sc->hn_ifp;
5564 	u_int hw_tsomax;
5565 	int tso_minlen;
5566 
5567 	HN_LOCK_ASSERT(sc);
5568 
5569 	if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5570 		return;
5571 
5572 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5573 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5574 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5575 
5576 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5577 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5578 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5579 
5580 	if (tso_maxlen < tso_minlen)
5581 		tso_maxlen = tso_minlen;
5582 	else if (tso_maxlen > IP_MAXPACKET)
5583 		tso_maxlen = IP_MAXPACKET;
5584 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
5585 		tso_maxlen = sc->hn_ndis_tso_szmax;
5586 	hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5587 
5588 	if (hn_xpnt_vf_isready(sc)) {
5589 		if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp))
5590 			hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp);
5591 	}
5592 	if_sethwtsomax(ifp, hw_tsomax);
5593 	if (bootverbose)
5594 		if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp));
5595 }
5596 
5597 static void
5598 hn_fixup_tx_data(struct hn_softc *sc)
5599 {
5600 	uint64_t csum_assist;
5601 	int i;
5602 
5603 	hn_set_chim_size(sc, sc->hn_chim_szmax);
5604 	if (hn_tx_chimney_size > 0 &&
5605 	    hn_tx_chimney_size < sc->hn_chim_szmax)
5606 		hn_set_chim_size(sc, hn_tx_chimney_size);
5607 
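	/*
	 * Translate the checksum offload capabilities reported by the host
	 * (HN_CAP_*) into the mbuf CSUM_* assist flags advertised on every
	 * TX ring.
	 */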
5608 	csum_assist = 0;
5609 	if (sc->hn_caps & HN_CAP_IPCS)
5610 		csum_assist |= CSUM_IP;
5611 	if (sc->hn_caps & HN_CAP_TCP4CS)
5612 		csum_assist |= CSUM_IP_TCP;
5613 	if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5614 		csum_assist |= CSUM_IP_UDP;
5615 	if (sc->hn_caps & HN_CAP_TCP6CS)
5616 		csum_assist |= CSUM_IP6_TCP;
5617 	if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5618 		csum_assist |= CSUM_IP6_UDP;
5619 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5620 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5621 
5622 	if (sc->hn_caps & HN_CAP_HASHVAL) {
5623 		/*
5624 		 * Support HASHVAL pktinfo on TX path.
5625 		 */
5626 		if (bootverbose)
5627 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5628 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5629 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5630 	}
5631 }
5632 
5633 static void
5634 hn_fixup_rx_data(struct hn_softc *sc)
5635 {
5636 
5637 	if (sc->hn_caps & HN_CAP_UDPHASH) {
5638 		int i;
5639 
5640 		for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5641 			sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5642 	}
5643 }
5644 
5645 static void
5646 hn_destroy_tx_data(struct hn_softc *sc)
5647 {
5648 	int i;
5649 
5650 	if (sc->hn_chim != NULL) {
5651 		if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5652 			contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF);
5653 		} else {
5654 			device_printf(sc->hn_dev,
5655 			    "chimney sending buffer is referenced");
5656 		}
5657 		sc->hn_chim = NULL;
5658 	}
5659 
5660 	if (sc->hn_tx_ring_cnt == 0)
5661 		return;
5662 
5663 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5664 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5665 
5666 	free(sc->hn_tx_ring, M_DEVBUF);
5667 	sc->hn_tx_ring = NULL;
5668 
5669 	sc->hn_tx_ring_cnt = 0;
5670 	sc->hn_tx_ring_inuse = 0;
5671 }
5672 
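/*
 * Legacy if_start transmit path.  It is compiled in only when
 * HN_IFSTART_SUPPORT is defined and used only when hn_use_if_start is
 * set; otherwise the multiqueue if_transmit path (hn_transmit/hn_xmit)
 * below is used.
 */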
5673 #ifdef HN_IFSTART_SUPPORT
5674 
5675 static void
5676 hn_start_taskfunc(void *xtxr, int pending __unused)
5677 {
5678 	struct hn_tx_ring *txr = xtxr;
5679 
5680 	mtx_lock(&txr->hn_tx_lock);
5681 	hn_start_locked(txr, 0);
5682 	mtx_unlock(&txr->hn_tx_lock);
5683 }
5684 
5685 static int
5686 hn_start_locked(struct hn_tx_ring *txr, int len)
5687 {
5688 	struct hn_softc *sc = txr->hn_sc;
5689 	if_t ifp = sc->hn_ifp;
5690 	int sched = 0;
5691 
5692 	KASSERT(hn_use_if_start,
5693 	    ("hn_start_locked is called, when if_start is disabled"));
5694 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5695 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5696 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5697 
5698 	if (__predict_false(txr->hn_suspended))
5699 		return (0);
5700 
5701 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5702 	    IFF_DRV_RUNNING)
5703 		return (0);
5704 
5705 	while (!if_sendq_empty(ifp)) {
5706 		struct hn_txdesc *txd;
5707 		struct mbuf *m_head;
5708 		int error;
5709 
5710 		m_head = if_dequeue(ifp);
5711 		if (m_head == NULL)
5712 			break;
5713 
5714 		if (len > 0 && m_head->m_pkthdr.len > len) {
5715 			/*
5716 			 * This send could be time-consuming; let callers
5717 			 * dispatch this packet (and any follow-up packets)
5718 			 * to the TX taskqueue.
5719 			 */
5720 			if_sendq_prepend(ifp, m_head);
5721 			sched = 1;
5722 			break;
5723 		}
5724 
5725 #if defined(INET6) || defined(INET)
5726 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5727 			m_head = hn_tso_fixup(m_head);
5728 			if (__predict_false(m_head == NULL)) {
5729 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5730 				continue;
5731 			}
5732 		} else if (m_head->m_pkthdr.csum_flags &
5733 		    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5734 			m_head = hn_set_hlen(m_head);
5735 			if (__predict_false(m_head == NULL)) {
5736 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5737 				continue;
5738 			}
5739 		}
5740 #endif
5741 
5742 		txd = hn_txdesc_get(txr);
5743 		if (txd == NULL) {
5744 			txr->hn_no_txdescs++;
5745 			if_sendq_prepend(ifp, m_head);
5746 			if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0);
5747 			break;
5748 		}
5749 
5750 		error = hn_encap(ifp, txr, txd, &m_head);
5751 		if (error) {
5752 			/* Both txd and m_head are freed */
5753 			KASSERT(txr->hn_agg_txd == NULL,
5754 			    ("encap failed w/ pending aggregating txdesc"));
5755 			continue;
5756 		}
5757 
5758 		if (txr->hn_agg_pktleft == 0) {
5759 			if (txr->hn_agg_txd != NULL) {
5760 				KASSERT(m_head == NULL,
5761 				    ("pending mbuf for aggregating txdesc"));
5762 				error = hn_flush_txagg(ifp, txr);
5763 				if (__predict_false(error)) {
5764 					if_setdrvflagbits(ifp,
5765 					    IFF_DRV_OACTIVE, 0);
5766 					break;
5767 				}
5768 			} else {
5769 				KASSERT(m_head != NULL, ("mbuf was freed"));
5770 				error = hn_txpkt(ifp, txr, txd);
5771 				if (__predict_false(error)) {
5772 					/* txd is freed, but m_head is not */
5773 					if_sendq_prepend(ifp, m_head);
5774 					if_setdrvflagbits(ifp,
5775 					    IFF_DRV_OACTIVE, 0);
5776 					break;
5777 				}
5778 			}
5779 		}
5780 #ifdef INVARIANTS
5781 		else {
5782 			KASSERT(txr->hn_agg_txd != NULL,
5783 			    ("no aggregating txdesc"));
5784 			KASSERT(m_head == NULL,
5785 			    ("pending mbuf for aggregating txdesc"));
5786 		}
5787 #endif
5788 	}
5789 
5790 	/* Flush pending aggregated transmission. */
5791 	if (txr->hn_agg_txd != NULL)
5792 		hn_flush_txagg(ifp, txr);
5793 	return (sched);
5794 }
5795 
5796 static void
5797 hn_start(if_t ifp)
5798 {
5799 	struct hn_softc *sc = if_getsoftc(ifp);
5800 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5801 
5802 	if (txr->hn_sched_tx)
5803 		goto do_sched;
5804 
5805 	if (mtx_trylock(&txr->hn_tx_lock)) {
5806 		int sched;
5807 
5808 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5809 		mtx_unlock(&txr->hn_tx_lock);
5810 		if (!sched)
5811 			return;
5812 	}
5813 do_sched:
5814 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5815 }
5816 
5817 static void
5818 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5819 {
5820 	struct hn_tx_ring *txr = xtxr;
5821 
5822 	mtx_lock(&txr->hn_tx_lock);
5823 	if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE);
5824 	hn_start_locked(txr, 0);
5825 	mtx_unlock(&txr->hn_tx_lock);
5826 }
5827 
5828 static void
5829 hn_start_txeof(struct hn_tx_ring *txr)
5830 {
5831 	struct hn_softc *sc = txr->hn_sc;
5832 	if_t ifp = sc->hn_ifp;
5833 
5834 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5835 
5836 	if (txr->hn_sched_tx)
5837 		goto do_sched;
5838 
5839 	if (mtx_trylock(&txr->hn_tx_lock)) {
5840 		int sched;
5841 
5842 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5843 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5844 		mtx_unlock(&txr->hn_tx_lock);
5845 		if (sched) {
5846 			taskqueue_enqueue(txr->hn_tx_taskq,
5847 			    &txr->hn_tx_task);
5848 		}
5849 	} else {
5850 do_sched:
5851 		/*
5852 		 * Release OACTIVE early, in the hope that
5853 		 * others could catch up.  The task will clear the
5854 		 * flag again with the hn_tx_lock to avoid possible
5855 		 * races.
5856 		 */
5857 		if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE);
5858 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5859 	}
5860 }
5861 
5862 #endif	/* HN_IFSTART_SUPPORT */
5863 
5864 static int
5865 hn_xmit(struct hn_tx_ring *txr, int len)
5866 {
5867 	struct hn_softc *sc = txr->hn_sc;
5868 	if_t ifp = sc->hn_ifp;
5869 	struct mbuf *m_head;
5870 	int sched = 0;
5871 
5872 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5873 #ifdef HN_IFSTART_SUPPORT
5874 	KASSERT(hn_use_if_start == 0,
5875 	    ("hn_xmit is called, when if_start is enabled"));
5876 #endif
5877 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5878 
5879 	if (__predict_false(txr->hn_suspended))
5880 		return (0);
5881 
5882 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5883 		return (0);
5884 
5885 	while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5886 		struct hn_txdesc *txd;
5887 		int error;
5888 
5889 		if (len > 0 && m_head->m_pkthdr.len > len) {
5890 			/*
5891 			 * This send could be time-consuming; let callers
5892 			 * dispatch this packet (and any follow-up packets)
5893 			 * to the TX taskqueue.
5894 			 */
5895 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5896 			sched = 1;
5897 			break;
5898 		}
5899 
5900 		txd = hn_txdesc_get(txr);
5901 		if (txd == NULL) {
5902 			txr->hn_no_txdescs++;
5903 			drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5904 			txr->hn_oactive = 1;
5905 			break;
5906 		}
5907 
5908 		error = hn_encap(ifp, txr, txd, &m_head);
5909 		if (error) {
5910 			/* Both txd and m_head are freed; discard */
5911 			KASSERT(txr->hn_agg_txd == NULL,
5912 			    ("encap failed w/ pending aggregating txdesc"));
5913 			drbr_advance(ifp, txr->hn_mbuf_br);
5914 			continue;
5915 		}
5916 
5917 		if (txr->hn_agg_pktleft == 0) {
5918 			if (txr->hn_agg_txd != NULL) {
5919 				KASSERT(m_head == NULL,
5920 				    ("pending mbuf for aggregating txdesc"));
5921 				error = hn_flush_txagg(ifp, txr);
5922 				if (__predict_false(error)) {
5923 					txr->hn_oactive = 1;
5924 					break;
5925 				}
5926 			} else {
5927 				KASSERT(m_head != NULL, ("mbuf was freed"));
5928 				error = hn_txpkt(ifp, txr, txd);
5929 				if (__predict_false(error)) {
5930 					/* txd is freed, but m_head is not */
5931 					drbr_putback(ifp, txr->hn_mbuf_br,
5932 					    m_head);
5933 					txr->hn_oactive = 1;
5934 					break;
5935 				}
5936 			}
5937 		}
5938 #ifdef INVARIANTS
5939 		else {
5940 			KASSERT(txr->hn_agg_txd != NULL,
5941 			    ("no aggregating txdesc"));
5942 			KASSERT(m_head == NULL,
5943 			    ("pending mbuf for aggregating txdesc"));
5944 		}
5945 #endif
5946 
5947 		/* Sent */
5948 		drbr_advance(ifp, txr->hn_mbuf_br);
5949 	}
5950 
5951 	/* Flush pending aggregated transmission. */
5952 	if (txr->hn_agg_txd != NULL)
5953 		hn_flush_txagg(ifp, txr);
5954 	return (sched);
5955 }
5956 
5957 static int
5958 hn_transmit(if_t ifp, struct mbuf *m)
5959 {
5960 	struct hn_softc *sc = if_getsoftc(ifp);
5961 	struct hn_tx_ring *txr;
5962 	int error, idx = 0;
5963 
5964 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5965 		struct rm_priotracker pt;
5966 
5967 		rm_rlock(&sc->hn_vf_lock, &pt);
5968 		if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5969 			struct mbuf *m_bpf = NULL;
5970 			int obytes, omcast;
5971 
5972 			obytes = m->m_pkthdr.len;
5973 			omcast = (m->m_flags & M_MCAST) != 0;
5974 
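			/*
			 * Tap the packet on this interface's BPF listeners
			 * before handing it to the VF.  With
			 * HN_XVFFLAG_ACCBPF set, a shallow copy is tapped
			 * only after the VF transmit succeeds.
			 */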
5975 			if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5976 				if (bpf_peers_present_if(ifp)) {
5977 					m_bpf = m_copypacket(m, M_NOWAIT);
5978 					if (m_bpf == NULL) {
5979 						/*
5980 						 * Failed to grab a shallow
5981 						 * copy; tap now.
5982 						 */
5983 						ETHER_BPF_MTAP(ifp, m);
5984 					}
5985 				}
5986 			} else {
5987 				ETHER_BPF_MTAP(ifp, m);
5988 			}
5989 
5990 			error = if_transmit(sc->hn_vf_ifp, m);
5991 			rm_runlock(&sc->hn_vf_lock, &pt);
5992 
5993 			if (m_bpf != NULL) {
5994 				if (!error)
5995 					ETHER_BPF_MTAP(ifp, m_bpf);
5996 				m_freem(m_bpf);
5997 			}
5998 
5999 			if (error == ENOBUFS) {
6000 				if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6001 			} else if (error) {
6002 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6003 			} else {
6004 				if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
6005 				if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
6006 				if (omcast) {
6007 					if_inc_counter(ifp, IFCOUNTER_OMCASTS,
6008 					    omcast);
6009 				}
6010 			}
6011 			return (error);
6012 		}
6013 		rm_runlock(&sc->hn_vf_lock, &pt);
6014 	}
6015 
6016 #if defined(INET6) || defined(INET)
6017 	/*
6018 	 * Perform TSO packet header fixup or get l2/l3 header length now,
6019 	 * since packet headers should be cache-hot.
6020 	 */
6021 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
6022 		m = hn_tso_fixup(m);
6023 		if (__predict_false(m == NULL)) {
6024 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6025 			return EIO;
6026 		}
6027 	} else if (m->m_pkthdr.csum_flags &
6028 	    (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6029 		m = hn_set_hlen(m);
6030 		if (__predict_false(m == NULL)) {
6031 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6032 			return EIO;
6033 		}
6034 	}
6035 #endif
6036 
6037 	/*
6038 	 * Select the TX ring based on flowid
6039 	 */
6040 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6041 #ifdef RSS
6042 		uint32_t bid;
6043 
6044 		if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
6045 		    &bid) == 0)
6046 			idx = bid % sc->hn_tx_ring_inuse;
6047 		else
6048 #endif
6049 		{
6050 #if defined(INET6) || defined(INET)
6051 			int tcpsyn = 0;
6052 
6053 			if (m->m_pkthdr.len < 128 &&
6054 			    (m->m_pkthdr.csum_flags &
6055 			     (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6056 			    (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6057 				m = hn_check_tcpsyn(m, &tcpsyn);
6058 				if (__predict_false(m == NULL)) {
6059 					if_inc_counter(ifp,
6060 					    IFCOUNTER_OERRORS, 1);
6061 					return (EIO);
6062 				}
6063 			}
6064 #else
6065 			const int tcpsyn = 0;
6066 #endif
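			/*
			 * Small packets that look like TCP SYNs (see
			 * hn_check_tcpsyn above) are steered to the first
			 * TX ring; all other flows are spread across the
			 * in-use TX rings by their flowid.
			 */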
6067 			if (tcpsyn)
6068 				idx = 0;
6069 			else
6070 				idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6071 		}
6072 	}
6073 	txr = &sc->hn_tx_ring[idx];
6074 
6075 	error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6076 	if (error) {
6077 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6078 		return error;
6079 	}
6080 
6081 	if (txr->hn_oactive)
6082 		return 0;
6083 
6084 	if (txr->hn_sched_tx)
6085 		goto do_sched;
6086 
6087 	if (mtx_trylock(&txr->hn_tx_lock)) {
6088 		int sched;
6089 
6090 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6091 		mtx_unlock(&txr->hn_tx_lock);
6092 		if (!sched)
6093 			return 0;
6094 	}
6095 do_sched:
6096 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6097 	return 0;
6098 }
6099 
6100 static void
6101 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6102 {
6103 	struct mbuf *m;
6104 
6105 	mtx_lock(&txr->hn_tx_lock);
6106 	while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6107 		m_freem(m);
6108 	mtx_unlock(&txr->hn_tx_lock);
6109 }
6110 
6111 static void
6112 hn_xmit_qflush(if_t ifp)
6113 {
6114 	struct hn_softc *sc = if_getsoftc(ifp);
6115 	struct rm_priotracker pt;
6116 	int i;
6117 
6118 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6119 		hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6120 	if_qflush(ifp);
6121 
6122 	rm_rlock(&sc->hn_vf_lock, &pt);
6123 	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6124 		if_qflush(sc->hn_vf_ifp);
6125 	rm_runlock(&sc->hn_vf_lock, &pt);
6126 }
6127 
6128 static void
6129 hn_xmit_txeof(struct hn_tx_ring *txr)
6130 {
6131 
6132 	if (txr->hn_sched_tx)
6133 		goto do_sched;
6134 
6135 	if (mtx_trylock(&txr->hn_tx_lock)) {
6136 		int sched;
6137 
6138 		txr->hn_oactive = 0;
6139 		sched = hn_xmit(txr, txr->hn_direct_tx_size);
6140 		mtx_unlock(&txr->hn_tx_lock);
6141 		if (sched) {
6142 			taskqueue_enqueue(txr->hn_tx_taskq,
6143 			    &txr->hn_tx_task);
6144 		}
6145 	} else {
6146 do_sched:
6147 		/*
6148 		 * Release oactive early, in the hope that
6149 		 * others could catch up.  The task will clear the
6150 		 * oactive again with the hn_tx_lock to avoid possible
6151 		 * races.
6152 		 */
6153 		txr->hn_oactive = 0;
6154 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6155 	}
6156 }
6157 
6158 static void
6159 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6160 {
6161 	struct hn_tx_ring *txr = xtxr;
6162 
6163 	mtx_lock(&txr->hn_tx_lock);
6164 	hn_xmit(txr, 0);
6165 	mtx_unlock(&txr->hn_tx_lock);
6166 }
6167 
6168 static void
6169 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6170 {
6171 	struct hn_tx_ring *txr = xtxr;
6172 
6173 	mtx_lock(&txr->hn_tx_lock);
6174 	txr->hn_oactive = 0;
6175 	hn_xmit(txr, 0);
6176 	mtx_unlock(&txr->hn_tx_lock);
6177 }
6178 
6179 static int
6180 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6181 {
6182 	struct vmbus_chan_br cbr;
6183 	struct hn_rx_ring *rxr;
6184 	struct hn_tx_ring *txr = NULL;
6185 	int idx, error;
6186 
6187 	idx = vmbus_chan_subidx(chan);
6188 
6189 	/*
6190 	 * Link this channel to RX/TX ring.
6191 	 */
6192 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6193 	    ("invalid channel index %d, should > 0 && < %d",
6194 	     idx, sc->hn_rx_ring_inuse));
6195 	rxr = &sc->hn_rx_ring[idx];
6196 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6197 	    ("RX ring %d already attached", idx));
6198 	rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6199 	rxr->hn_chan = chan;
6200 
6201 	if (bootverbose) {
6202 		if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6203 		    idx, vmbus_chan_id(chan));
6204 	}
6205 
6206 	if (idx < sc->hn_tx_ring_inuse) {
6207 		txr = &sc->hn_tx_ring[idx];
6208 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6209 		    ("TX ring %d already attached", idx));
6210 		txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6211 
6212 		txr->hn_chan = chan;
6213 		if (bootverbose) {
6214 			if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6215 			    idx, vmbus_chan_id(chan));
6216 		}
6217 	}
6218 
6219 	/* Bind this channel to a proper CPU. */
6220 	vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6221 
6222 	/*
6223 	 * Open this channel
6224 	 */
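	/*
	 * NOTE: rxr->hn_br is a single contiguous buffer; the host-visible
	 * TX and RX bufrings are carved out of it according to
	 * HN_TXBR_SIZE and HN_RXBR_SIZE below.
	 */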
6225 	cbr.cbr = rxr->hn_br;
6226 	cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br);
6227 	cbr.cbr_txsz = HN_TXBR_SIZE;
6228 	cbr.cbr_rxsz = HN_RXBR_SIZE;
6229 	error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6230 	if (error) {
6231 		if (error == EISCONN) {
6232 			if_printf(sc->hn_ifp, "bufring is connected after "
6233 			    "chan%u open failure\n", vmbus_chan_id(chan));
6234 			rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6235 		} else {
6236 			if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6237 			    vmbus_chan_id(chan), error);
6238 		}
6239 	}
6240 	return (error);
6241 }
6242 
6243 static void
6244 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6245 {
6246 	struct hn_rx_ring *rxr;
6247 	int idx, error;
6248 
6249 	idx = vmbus_chan_subidx(chan);
6250 
6251 	/*
6252 	 * Link this channel to RX/TX ring.
6253 	 */
6254 	KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6255 	    ("invalid channel index %d, should > 0 && < %d",
6256 	     idx, sc->hn_rx_ring_inuse));
6257 	rxr = &sc->hn_rx_ring[idx];
6258 	KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6259 	    ("RX ring %d is not attached", idx));
6260 	rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6261 
6262 	if (idx < sc->hn_tx_ring_inuse) {
6263 		struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6264 
6265 		KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6266 		    ("TX ring %d is not attached attached", idx));
6267 		txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6268 	}
6269 
6270 	/*
6271 	 * Close this channel.
6272 	 *
6273 	 * NOTE:
6274 	 * Channel closing does _not_ destroy the target channel.
6275 	 */
6276 	error = vmbus_chan_close_direct(chan);
6277 	if (error == EISCONN) {
6278 		if_printf(sc->hn_ifp, "chan%u bufring is connected "
6279 		    "after being closed\n", vmbus_chan_id(chan));
6280 		rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6281 	} else if (error) {
6282 		if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6283 		    vmbus_chan_id(chan), error);
6284 	}
6285 }
6286 
6287 static int
6288 hn_attach_subchans(struct hn_softc *sc)
6289 {
6290 	struct vmbus_channel **subchans;
6291 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6292 	int i, error = 0;
6293 
6294 	KASSERT(subchan_cnt > 0, ("no sub-channels"));
6295 
6296 	/* Attach the sub-channels. */
6297 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6298 	for (i = 0; i < subchan_cnt; ++i) {
6299 		int error1;
6300 
6301 		error1 = hn_chan_attach(sc, subchans[i]);
6302 		if (error1) {
6303 			error = error1;
6304 			/* Move on; all channels will be detached later. */
6305 		}
6306 	}
6307 	vmbus_subchan_rel(subchans, subchan_cnt);
6308 
6309 	if (error) {
6310 		if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6311 	} else {
6312 		if (bootverbose) {
6313 			if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6314 			    subchan_cnt);
6315 		}
6316 	}
6317 	return (error);
6318 }
6319 
6320 static void
6321 hn_detach_allchans(struct hn_softc *sc)
6322 {
6323 	struct vmbus_channel **subchans;
6324 	int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6325 	int i;
6326 
6327 	if (subchan_cnt == 0)
6328 		goto back;
6329 
6330 	/* Detach the sub-channels. */
6331 	subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6332 	for (i = 0; i < subchan_cnt; ++i)
6333 		hn_chan_detach(sc, subchans[i]);
6334 	vmbus_subchan_rel(subchans, subchan_cnt);
6335 
6336 back:
6337 	/*
6338 	 * Detach the primary channel, _after_ all sub-channels
6339 	 * are detached.
6340 	 */
6341 	hn_chan_detach(sc, sc->hn_prichan);
6342 
6343 	/* Wait for sub-channels to be destroyed, if any. */
6344 	vmbus_subchan_drain(sc->hn_prichan);
6345 
6346 #ifdef INVARIANTS
6347 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6348 		KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6349 		    HN_RX_FLAG_ATTACHED) == 0,
6350 		    ("%dth RX ring is still attached", i));
6351 	}
6352 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6353 		KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6354 		    HN_TX_FLAG_ATTACHED) == 0,
6355 		    ("%dth TX ring is still attached", i));
6356 	}
6357 #endif
6358 }
6359 
6360 static int
6361 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6362 {
6363 	struct vmbus_channel **subchans;
6364 	int nchan, rxr_cnt, error;
6365 
6366 	nchan = *nsubch + 1;
6367 	if (nchan == 1) {
6368 		/*
6369 		 * Multiple RX/TX rings are not requested.
6370 		 */
6371 		*nsubch = 0;
6372 		return (0);
6373 	}
6374 
6375 	/*
6376 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6377 	 * table entries.
6378 	 */
6379 	error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6380 	if (error) {
6381 		/* No RSS; this is benign. */
6382 		*nsubch = 0;
6383 		return (0);
6384 	}
6385 	if (bootverbose) {
6386 		if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6387 		    rxr_cnt, nchan);
6388 	}
6389 
6390 	if (nchan > rxr_cnt)
6391 		nchan = rxr_cnt;
6392 	if (nchan == 1) {
6393 		if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6394 		*nsubch = 0;
6395 		return (0);
6396 	}
6397 
6398 	/*
6399 	 * Allocate sub-channels from NVS.
6400 	 */
6401 	*nsubch = nchan - 1;
6402 	error = hn_nvs_alloc_subchans(sc, nsubch);
6403 	if (error || *nsubch == 0) {
6404 		/* Failed to allocate sub-channels. */
6405 		*nsubch = 0;
6406 		return (0);
6407 	}
6408 
6409 	/*
6410 	 * Wait for all sub-channels to become ready before moving on.
6411 	 */
6412 	subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6413 	vmbus_subchan_rel(subchans, *nsubch);
6414 	return (0);
6415 }
6416 
6417 static bool
6418 hn_synth_attachable(const struct hn_softc *sc)
6419 {
6420 	int i;
6421 
6422 	if (sc->hn_flags & HN_FLAG_ERRORS)
6423 		return (false);
6424 
6425 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6426 		const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6427 
6428 		if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6429 			return (false);
6430 	}
6431 	return (true);
6432 }
6433 
6434 /*
6435  * Make sure that the RX filter is zero after the successful
6436  * RNDIS initialization.
6437  *
6438  * NOTE:
6439  * Under certain conditions on certain versions of Hyper-V,
6440  * the RNDIS rxfilter is _not_ zero on the hypervisor side
6441  * after the successful RNDIS initialization, which breaks
6442  * the assumption of any following code (well, it breaks the
6443  * RNDIS API contract actually).  Clear the RNDIS rxfilter
6444  * explicitly, drain packets sneaking through, and drain the
6445  * interrupt taskqueues scheduled due to the stealth packets.
6446  */
6447 static void
6448 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6449 {
6450 
6451 	hn_disable_rx(sc);
6452 	hn_drain_rxtx(sc, nchan);
6453 }
6454 
6455 static int
6456 hn_synth_attach(struct hn_softc *sc, int mtu)
6457 {
6458 #define ATTACHED_NVS		0x0002
6459 #define ATTACHED_RNDIS		0x0004
6460 
6461 	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6462 	int error, nsubch, nchan = 1, i, rndis_inited;
6463 	uint32_t old_caps, attached = 0;
6464 
6465 	KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6466 	    ("synthetic parts were attached"));
6467 
6468 	if (!hn_synth_attachable(sc))
6469 		return (ENXIO);
6470 
6471 	/* Save capabilities for later verification. */
6472 	old_caps = sc->hn_caps;
6473 	sc->hn_caps = 0;
6474 
6475 	/* Clear RSS stuffs. */
6476 	sc->hn_rss_ind_size = 0;
6477 	sc->hn_rss_hash = 0;
6478 	sc->hn_rss_hcap = 0;
6479 
6480 	/*
6481 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
6482 	 */
6483 	error = hn_chan_attach(sc, sc->hn_prichan);
6484 	if (error)
6485 		goto failed;
6486 
6487 	/*
6488 	 * Attach NVS.
6489 	 */
6490 	error = hn_nvs_attach(sc, mtu);
6491 	if (error)
6492 		goto failed;
6493 	attached |= ATTACHED_NVS;
6494 
6495 	/*
6496 	 * Attach RNDIS _after_ NVS is attached.
6497 	 */
6498 	error = hn_rndis_attach(sc, mtu, &rndis_inited);
6499 	if (rndis_inited)
6500 		attached |= ATTACHED_RNDIS;
6501 	if (error)
6502 		goto failed;
6503 
6504 	/*
6505 	 * Make sure capabilities are not changed.
6506 	 */
6507 	if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6508 		if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6509 		    old_caps, sc->hn_caps);
6510 		error = ENXIO;
6511 		goto failed;
6512 	}
6513 
6514 	/*
6515 	 * Allocate sub-channels for multi-TX/RX rings.
6516 	 *
6517 	 * NOTE:
6518 	 * The # of RX rings that can be used is equivalent to the # of
6519 	 * channels to be requested.
6520 	 */
6521 	nsubch = sc->hn_rx_ring_cnt - 1;
6522 	error = hn_synth_alloc_subchans(sc, &nsubch);
6523 	if (error)
6524 		goto failed;
6525 	/* NOTE: _Full_ synthetic parts detach is required now. */
6526 	sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6527 
6528 	/*
6529 	 * Set the # of TX/RX rings that could be used according to
6530 	 * the # of channels that NVS offered.
6531 	 */
6532 	nchan = nsubch + 1;
6533 	hn_set_ring_inuse(sc, nchan);
6534 	if (nchan == 1) {
6535 		/* Only the primary channel can be used; done */
6536 		goto back;
6537 	}
6538 
6539 	/*
6540 	 * Attach the sub-channels.
6541 	 *
6542 	 * NOTE: hn_set_ring_inuse() _must_ have been called.
6543 	 */
6544 	error = hn_attach_subchans(sc);
6545 	if (error)
6546 		goto failed;
6547 
6548 	/*
6549 	 * Configure RSS key and indirect table _after_ all sub-channels
6550 	 * are attached.
6551 	 */
6552 	if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6553 		/*
6554 		 * RSS key is not set yet; set it to the default RSS key.
6555 		 */
6556 		if (bootverbose)
6557 			if_printf(sc->hn_ifp, "setup default RSS key\n");
6558 #ifdef RSS
6559 		rss_getkey(rss->rss_key);
6560 #else
6561 		memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6562 #endif
6563 		sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6564 	}
6565 
6566 	if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6567 		/*
6568 		 * RSS indirect table is not set yet; set it up in round-
6569 		 * robin fashion.
6570 		 */
6571 		if (bootverbose) {
6572 			if_printf(sc->hn_ifp, "setup default RSS indirect "
6573 			    "table\n");
6574 		}
6575 		for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6576 			uint32_t subidx;
6577 
6578 #ifdef RSS
6579 			subidx = rss_get_indirection_to_bucket(i);
6580 #else
6581 			subidx = i;
6582 #endif
6583 			rss->rss_ind[i] = subidx % nchan;
6584 		}
6585 		sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6586 	} else {
6587 		/*
6588 		 * # of usable channels may be changed, so we have to
6589 		 * make sure that all entries in RSS indirect table
6590 		 * are valid.
6591 		 *
6592 		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6593 		 */
6594 		hn_rss_ind_fixup(sc);
6595 	}
6596 
6597 	sc->hn_rss_hash = sc->hn_rss_hcap;
6598 	if ((sc->hn_flags & HN_FLAG_RXVF) ||
6599 	    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6600 		/* NOTE: Don't reconfigure RSS; will do immediately. */
6601 		hn_vf_rss_fixup(sc, false);
6602 	}
6603 	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6604 	if (error)
6605 		goto failed;
6606 back:
6607 	/*
6608 	 * Fixup transmission aggregation setup.
6609 	 */
6610 	hn_set_txagg(sc);
6611 	hn_rndis_init_fixat(sc, nchan);
6612 	return (0);
6613 
6614 failed:
6615 	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6616 		hn_rndis_init_fixat(sc, nchan);
6617 		hn_synth_detach(sc);
6618 	} else {
6619 		if (attached & ATTACHED_RNDIS) {
6620 			hn_rndis_init_fixat(sc, nchan);
6621 			hn_rndis_detach(sc);
6622 		}
6623 		if (attached & ATTACHED_NVS)
6624 			hn_nvs_detach(sc);
6625 		hn_chan_detach(sc, sc->hn_prichan);
6626 		/* Restore old capabilities. */
6627 		sc->hn_caps = old_caps;
6628 	}
6629 	return (error);
6630 
6631 #undef ATTACHED_RNDIS
6632 #undef ATTACHED_NVS
6633 }
6634 
6635 /*
6636  * NOTE:
6637  * The interface must have been suspended through hn_suspend() before
6638  * this function gets called.
6639  */
6640 static void
6641 hn_synth_detach(struct hn_softc *sc)
6642 {
6643 
6644 	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6645 	    ("synthetic parts were not attached"));
6646 
6647 	/* Detach the RNDIS first. */
6648 	hn_rndis_detach(sc);
6649 
6650 	/* Detach NVS. */
6651 	hn_nvs_detach(sc);
6652 
6653 	/* Detach all of the channels. */
6654 	hn_detach_allchans(sc);
6655 
6656 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) {
6657 		/*
6658 		 * Host is post-Win2016, disconnect RXBUF from primary channel here.
6659 		 */
6660 		int error;
6661 
6662 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6663 		    sc->hn_rxbuf_gpadl);
6664 		if (error) {
6665 			if_printf(sc->hn_ifp,
6666 			    "rxbuf gpadl disconn failed: %d\n", error);
6667 			sc->hn_flags |= HN_FLAG_RXBUF_REF;
6668 		}
6669 		sc->hn_rxbuf_gpadl = 0;
6670 	}
6671 
6672 	if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) {
6673 		/*
6674 		 * Host is post-Win2016, disconnect chimney sending buffer from
6675 		 * primary channel here.
6676 		 */
6677 		int error;
6678 
6679 		error = vmbus_chan_gpadl_disconnect(sc->hn_prichan,
6680 		    sc->hn_chim_gpadl);
6681 		if (error) {
6682 			if_printf(sc->hn_ifp,
6683 			    "chim gpadl disconn failed: %d\n", error);
6684 			sc->hn_flags |= HN_FLAG_CHIM_REF;
6685 		}
6686 		sc->hn_chim_gpadl = 0;
6687 	}
6688 	sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6689 }
6690 
6691 static void
6692 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6693 {
6694 	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6695 	    ("invalid ring count %d", ring_cnt));
6696 
6697 	if (sc->hn_tx_ring_cnt > ring_cnt)
6698 		sc->hn_tx_ring_inuse = ring_cnt;
6699 	else
6700 		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6701 	sc->hn_rx_ring_inuse = ring_cnt;
6702 
6703 #ifdef RSS
6704 	if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6705 		if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6706 		    "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6707 		    rss_getnumbuckets());
6708 	}
6709 #endif
6710 
6711 	if (bootverbose) {
6712 		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6713 		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6714 	}
6715 }
6716 
6717 static void
6718 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6719 {
6720 
6721 	/*
6722 	 * NOTE:
6723 	 * The TX bufring will not be drained by the hypervisor,
6724 	 * if the primary channel is revoked.
6725 	 */
6726 	while (!vmbus_chan_rx_empty(chan) ||
6727 	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6728 	     !vmbus_chan_tx_empty(chan)))
6729 		pause("waitch", 1);
6730 	vmbus_chan_intr_drain(chan);
6731 }
6732 
6733 static void
6734 hn_disable_rx(struct hn_softc *sc)
6735 {
6736 
6737 	/*
6738 	 * Disable RX by clearing RX filter forcefully.
6739 	 */
6740 	sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6741 	hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6742 
6743 	/*
6744 	 * Give RNDIS enough time to flush all pending data packets.
6745 	 */
6746 	pause("waitrx", (200 * hz) / 1000);
6747 }
6748 
6749 /*
6750  * NOTE:
6751  * RX/TX _must_ have been suspended/disabled, before this function
6752  * is called.
6753  */
6754 static void
6755 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6756 {
6757 	struct vmbus_channel **subch = NULL;
6758 	int nsubch;
6759 
6760 	/*
6761 	 * Drain RX/TX bufrings and interrupts.
6762 	 */
6763 	nsubch = nchan - 1;
6764 	if (nsubch > 0)
6765 		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6766 
6767 	if (subch != NULL) {
6768 		int i;
6769 
6770 		for (i = 0; i < nsubch; ++i)
6771 			hn_chan_drain(sc, subch[i]);
6772 	}
6773 	hn_chan_drain(sc, sc->hn_prichan);
6774 
6775 	if (subch != NULL)
6776 		vmbus_subchan_rel(subch, nsubch);
6777 }
6778 
6779 static void
6780 hn_suspend_data(struct hn_softc *sc)
6781 {
6782 	struct hn_tx_ring *txr;
6783 	int i;
6784 
6785 	HN_LOCK_ASSERT(sc);
6786 
6787 	/*
6788 	 * Suspend TX.
6789 	 */
6790 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6791 		txr = &sc->hn_tx_ring[i];
6792 
6793 		mtx_lock(&txr->hn_tx_lock);
6794 		txr->hn_suspended = 1;
6795 		mtx_unlock(&txr->hn_tx_lock);
6796 		/* No one is able send more packets now. */
6797 
6798 		/*
6799 		 * Wait for all pending sends to finish.
6800 		 *
6801 		 * NOTE:
6802 		 * We will _not_ receive all pending send-done, if the
6803 		 * primary channel is revoked.
6804 		 */
6805 		while (hn_tx_ring_pending(txr) &&
6806 		    !vmbus_chan_is_revoked(sc->hn_prichan))
6807 			pause("hnwtx", 1 /* 1 tick */);
6808 	}
6809 
6810 	/*
6811 	 * Disable RX.
6812 	 */
6813 	hn_disable_rx(sc);
6814 
6815 	/*
6816 	 * Drain RX/TX.
6817 	 */
6818 	hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6819 
6820 	/*
6821 	 * Drain any pending TX tasks.
6822 	 *
6823 	 * NOTE:
6824 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6825 	 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6826 	 */
6827 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6828 		txr = &sc->hn_tx_ring[i];
6829 
6830 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6831 		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6832 	}
6833 }
6834 
6835 static void
6836 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6837 {
6838 
6839 	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6840 }
6841 
6842 static void
6843 hn_suspend_mgmt(struct hn_softc *sc)
6844 {
6845 	struct task task;
6846 
6847 	HN_LOCK_ASSERT(sc);
6848 
6849 	/*
6850 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6851 	 * through hn_mgmt_taskq.
6852 	 */
6853 	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6854 	vmbus_chan_run_task(sc->hn_prichan, &task);
6855 
6856 	/*
6857 	 * Make sure that all pending management tasks are completed.
6858 	 */
6859 	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6860 	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6861 	taskqueue_drain_all(sc->hn_mgmt_taskq0);
6862 }
6863 
6864 static void
6865 hn_suspend(struct hn_softc *sc)
6866 {
6867 
6868 	/* Disable polling. */
6869 	hn_polling(sc, 0);
6870 
6871 	/*
6872 	 * If the non-transparent mode VF is activated, the synthetic
6873 	 * device is receiving packets, so the data path of the
6874 	 * synthetic device must be suspended.
6875 	 */
6876 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6877 	    (sc->hn_flags & HN_FLAG_RXVF))
6878 		hn_suspend_data(sc);
6879 	hn_suspend_mgmt(sc);
6880 }
6881 
6882 static void
6883 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6884 {
6885 	int i;
6886 
6887 	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6888 	    ("invalid TX ring count %d", tx_ring_cnt));
6889 
6890 	for (i = 0; i < tx_ring_cnt; ++i) {
6891 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6892 
6893 		mtx_lock(&txr->hn_tx_lock);
6894 		txr->hn_suspended = 0;
6895 		mtx_unlock(&txr->hn_tx_lock);
6896 	}
6897 }
6898 
6899 static void
6900 hn_resume_data(struct hn_softc *sc)
6901 {
6902 	int i;
6903 
6904 	HN_LOCK_ASSERT(sc);
6905 
6906 	/*
6907 	 * Re-enable RX.
6908 	 */
6909 	hn_rxfilter_config(sc);
6910 
6911 	/*
6912 	 * Make sure to clear suspend status on "all" TX rings,
6913 	 * since hn_tx_ring_inuse can be changed after
6914 	 * hn_suspend_data().
6915 	 */
6916 	hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6917 
6918 #ifdef HN_IFSTART_SUPPORT
6919 	if (!hn_use_if_start)
6920 #endif
6921 	{
6922 		/*
6923 		 * Flush unused drbrs, since hn_tx_ring_inuse may be
6924 		 * reduced.
6925 		 */
6926 		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6927 			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6928 	}
6929 
6930 	/*
6931 	 * Kick start TX.
6932 	 */
6933 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6934 		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6935 
6936 		/*
6937 		 * Use txeof task, so that any pending oactive can be
6938 		 * cleared properly.
6939 		 */
6940 		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6941 	}
6942 }
6943 
6944 static void
6945 hn_resume_mgmt(struct hn_softc *sc)
6946 {
6947 
6948 	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6949 
6950 	/*
6951 	 * Kick off network change detection, if it was pending.
6952 	 * If no network change was pending, start link status
6953 	 * checks, which are more lightweight than network change
6954 	 * detection.
6955 	 */
6956 	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6957 		hn_change_network(sc);
6958 	else
6959 		hn_update_link_status(sc);
6960 }
6961 
6962 static void
6963 hn_resume(struct hn_softc *sc)
6964 {
6965 
6966 	/*
6967 	 * If the non-transparent mode VF is activated, the synthetic
6968 	 * device has to receive packets, so the data path of the
6969 	 * synthetic device must be resumed.
6970 	 */
6971 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) ||
6972 	    (sc->hn_flags & HN_FLAG_RXVF))
6973 		hn_resume_data(sc);
6974 
6975 	/*
6976 	 * Don't resume link status change if VF is attached/activated.
6977 	 * - In the non-transparent VF mode, the synthetic device marks
6978 	 *   link down until the VF is deactivated; i.e. VF is down.
6979 	 * - In transparent VF mode, VF's media status is used until
6980 	 *   the VF is detached.
6981 	 */
6982 	if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6983 	    !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6984 		hn_resume_mgmt(sc);
6985 
6986 	/*
6987 	 * Re-enable polling if this interface is running and
6988 	 * the polling is requested.
6989 	 */
6990 	if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6991 		hn_polling(sc, sc->hn_pollhz);
6992 }
6993 
6994 static void
6995 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6996 {
6997 	const struct rndis_status_msg *msg;
6998 	int ofs;
6999 
7000 	if (dlen < sizeof(*msg)) {
7001 		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
7002 		return;
7003 	}
7004 	msg = data;
7005 
7006 	switch (msg->rm_status) {
7007 	case RNDIS_STATUS_MEDIA_CONNECT:
7008 	case RNDIS_STATUS_MEDIA_DISCONNECT:
7009 		hn_update_link_status(sc);
7010 		break;
7011 
7012 	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
7013 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
7014 		/* Not really useful; ignore. */
7015 		break;
7016 
7017 	case RNDIS_STATUS_NETWORK_CHANGE:
7018 		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
7019 		if (dlen < ofs + msg->rm_stbuflen ||
7020 		    msg->rm_stbuflen < sizeof(uint32_t)) {
7021 			if_printf(sc->hn_ifp, "network changed\n");
7022 		} else {
7023 			uint32_t change;
7024 
7025 			memcpy(&change, ((const uint8_t *)msg) + ofs,
7026 			    sizeof(change));
7027 			if_printf(sc->hn_ifp, "network changed, change %u\n",
7028 			    change);
7029 		}
7030 		hn_change_network(sc);
7031 		break;
7032 
7033 	default:
7034 		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
7035 		    msg->rm_status);
7036 		break;
7037 	}
7038 }
7039 
7040 static int
7041 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
7042 {
7043 	const struct rndis_pktinfo *pi = info_data;
7044 	uint32_t mask = 0;
7045 
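	/*
	 * Walk the chain of variable-sized per-packet-info elements.  Each
	 * element carries its own total size (rm_size) and payload offset
	 * (rm_pktinfooffset); the payload is only referenced, not copied,
	 * so the pointers saved in *info are valid only as long as the
	 * underlying RNDIS message is.
	 */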
7046 	while (info_dlen != 0) {
7047 		const void *data;
7048 		uint32_t dlen;
7049 
7050 		if (__predict_false(info_dlen < sizeof(*pi)))
7051 			return (EINVAL);
7052 		if (__predict_false(info_dlen < pi->rm_size))
7053 			return (EINVAL);
7054 		info_dlen -= pi->rm_size;
7055 
7056 		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
7057 			return (EINVAL);
7058 		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
7059 			return (EINVAL);
7060 		dlen = pi->rm_size - pi->rm_pktinfooffset;
7061 		data = pi->rm_data;
7062 
7063 		if (pi->rm_internal == 1) {
7064 			switch (pi->rm_type) {
7065 			case NDIS_PKTINFO_IT_PKTINFO_ID:
7066 				if (__predict_false(dlen < NDIS_PKTINFOID_SZ))
7067 					return (EINVAL);
7068 				info->pktinfo_id =
7069 				    (const struct packet_info_id *)data;
7070 				mask |= HN_RXINFO_PKTINFO_ID;
7071 				break;
7072 
7073 			default:
7074 				goto next;
7075 			}
7076 		} else {
7077 			switch (pi->rm_type) {
7078 			case NDIS_PKTINFO_TYPE_VLAN:
7079 				if (__predict_false(dlen
7080 				    < NDIS_VLAN_INFO_SIZE))
7081 					return (EINVAL);
7082 				info->vlan_info = (const uint32_t *)data;
7083 				mask |= HN_RXINFO_VLAN;
7084 				break;
7085 
7086 			case NDIS_PKTINFO_TYPE_CSUM:
7087 				if (__predict_false(dlen
7088 				    < NDIS_RXCSUM_INFO_SIZE))
7089 					return (EINVAL);
7090 				info->csum_info = (const uint32_t *)data;
7091 				mask |= HN_RXINFO_CSUM;
7092 				break;
7093 
7094 			case HN_NDIS_PKTINFO_TYPE_HASHVAL:
7095 				if (__predict_false(dlen
7096 				    < HN_NDIS_HASH_VALUE_SIZE))
7097 					return (EINVAL);
7098 				info->hash_value = (const uint32_t *)data;
7099 				mask |= HN_RXINFO_HASHVAL;
7100 				break;
7101 
7102 			case HN_NDIS_PKTINFO_TYPE_HASHINF:
7103 				if (__predict_false(dlen
7104 				    < HN_NDIS_HASH_INFO_SIZE))
7105 					return (EINVAL);
7106 				info->hash_info = (const uint32_t *)data;
7107 				mask |= HN_RXINFO_HASHINF;
7108 				break;
7109 
7110 			default:
7111 				goto next;
7112 			}
7113 		}
7114 
7115 		if (mask == HN_RXINFO_ALL) {
7116 			/* All found; done */
7117 			break;
7118 		}
7119 next:
7120 		pi = (const struct rndis_pktinfo *)
7121 		    ((const uint8_t *)pi + pi->rm_size);
7122 	}
7123 
7124 	/*
7125 	 * Final fixup.
7126 	 * - If there is no hash value, invalidate the hash info.
7127 	 */
7128 	if ((mask & HN_RXINFO_HASHVAL) == 0)
7129 		info->hash_info = NULL;
7130 	return (0);
7131 }
7132 
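/*
 * Return true if the ranges [off, off + len) and
 * [check_off, check_off + check_len) overlap.
 */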
7133 static __inline bool
7134 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7135 {
7136 
7137 	if (off < check_off) {
7138 		if (__predict_true(off + len <= check_off))
7139 			return (false);
7140 	} else if (off > check_off) {
7141 		if (__predict_true(check_off + check_len <= off))
7142 			return (false);
7143 	}
7144 	return (true);
7145 }
7146 
7147 static __inline void
7148 hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data,
7149 		uint32_t len, struct hn_rxinfo *info)
7150 {
7151 	uint32_t cnt = rxr->rsc.cnt;
7152 
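	/*
	 * For receive segment coalescing (RSC), the per-packet info of the
	 * first fragment describes the whole coalesced packet; subsequent
	 * fragments only add to the total packet length.
	 */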
7153 	if (cnt) {
7154 		rxr->rsc.pktlen += len;
7155 	} else {
7156 		rxr->rsc.vlan_info = info->vlan_info;
7157 		rxr->rsc.csum_info = info->csum_info;
7158 		rxr->rsc.hash_info = info->hash_info;
7159 		rxr->rsc.hash_value = info->hash_value;
7160 		rxr->rsc.pktlen = len;
7161 	}
7162 
7163 	rxr->rsc.frag_data[cnt] = data;
7164 	rxr->rsc.frag_len[cnt] = len;
7165 	rxr->rsc.cnt++;
7166 }
7167 
7168 static void
7169 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7170 {
7171 	const struct rndis_packet_msg *pkt;
7172 	struct hn_rxinfo info;
7173 	int data_off, pktinfo_off, data_len, pktinfo_len;
7174 	bool rsc_more = false;
7175 
7176 	/*
7177 	 * Check length.
7178 	 */
7179 	if (__predict_false(dlen < sizeof(*pkt))) {
7180 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7181 		return;
7182 	}
7183 	pkt = data;
7184 
7185 	if (__predict_false(dlen < pkt->rm_len)) {
7186 		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7187 		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7188 		return;
7189 	}
7190 	if (__predict_false(pkt->rm_len <
7191 	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7192 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7193 		    "msglen %u, data %u, oob %u, pktinfo %u\n",
7194 		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7195 		    pkt->rm_pktinfolen);
7196 		return;
7197 	}
7198 	if (__predict_false(pkt->rm_datalen == 0)) {
7199 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7200 		return;
7201 	}
7202 
7203 	/*
7204 	 * Check offsets.
7205 	 */
7206 #define IS_OFFSET_INVALID(ofs)			\
7207 	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
7208 	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7209 
7210 	/* XXX Hyper-V does not meet data offset alignment requirement */
7211 	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7212 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7213 		    "data offset %u\n", pkt->rm_dataoffset);
7214 		return;
7215 	}
7216 	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7217 	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7218 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7219 		    "oob offset %u\n", pkt->rm_oobdataoffset);
7220 		return;
7221 	}
7222 	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7223 	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7224 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7225 		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7226 		return;
7227 	}
7228 
7229 #undef IS_OFFSET_INVALID
7230 
7231 	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7232 	data_len = pkt->rm_datalen;
7233 	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7234 	pktinfo_len = pkt->rm_pktinfolen;
7235 
7236 	/*
7237 	 * Check OOB coverage.
7238 	 */
7239 	if (__predict_false(pkt->rm_oobdatalen != 0)) {
7240 		int oob_off, oob_len;
7241 
7242 		if_printf(rxr->hn_ifp, "got oobdata\n");
7243 		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7244 		oob_len = pkt->rm_oobdatalen;
7245 
7246 		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7247 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7248 			    "oob overflow, msglen %u, oob abs %d len %d\n",
7249 			    pkt->rm_len, oob_off, oob_len);
7250 			return;
7251 		}
7252 
7253 		/*
7254 		 * Check against data.
7255 		 */
7256 		if (hn_rndis_check_overlap(oob_off, oob_len,
7257 		    data_off, data_len)) {
7258 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7259 			    "oob overlaps data, oob abs %d len %d, "
7260 			    "data abs %d len %d\n",
7261 			    oob_off, oob_len, data_off, data_len);
7262 			return;
7263 		}
7264 
7265 		/*
7266 		 * Check against pktinfo.
7267 		 */
7268 		if (pktinfo_len != 0 &&
7269 		    hn_rndis_check_overlap(oob_off, oob_len,
7270 		    pktinfo_off, pktinfo_len)) {
7271 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7272 			    "oob overlaps pktinfo, oob abs %d len %d, "
7273 			    "pktinfo abs %d len %d\n",
7274 			    oob_off, oob_len, pktinfo_off, pktinfo_len);
7275 			return;
7276 		}
7277 	}
7278 
7279 	/*
7280 	 * Check per-packet-info coverage and find useful per-packet-info.
7281 	 */
7282 	info.vlan_info = NULL;
7283 	info.csum_info = NULL;
7284 	info.hash_info = NULL;
7285 	info.pktinfo_id = NULL;
7286 
7287 	if (__predict_true(pktinfo_len != 0)) {
7288 		bool overlap;
7289 		int error;
7290 
7291 		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7292 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7293 			    "pktinfo overflow, msglen %u, "
7294 			    "pktinfo abs %d len %d\n",
7295 			    pkt->rm_len, pktinfo_off, pktinfo_len);
7296 			return;
7297 		}
7298 
7299 		/*
7300 		 * Check packet info coverage.
7301 		 */
7302 		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7303 		    data_off, data_len);
7304 		if (__predict_false(overlap)) {
7305 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7306 			    "pktinfo overlap data, pktinfo abs %d len %d, "
7307 			    "data abs %d len %d\n",
7308 			    pktinfo_off, pktinfo_len, data_off, data_len);
7309 			return;
7310 		}
7311 
7312 		/*
7313 		 * Find useful per-packet-info.
7314 		 */
7315 		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7316 		    pktinfo_len, &info);
7317 		if (__predict_false(error)) {
7318 			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7319 			    "pktinfo\n");
7320 			return;
7321 		}
7322 	}
7323 
7324 	if (__predict_false(data_off + data_len > pkt->rm_len)) {
7325 		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7326 		    "data overflow, msglen %u, data abs %d len %d\n",
7327 		    pkt->rm_len, data_off, data_len);
7328 		return;
7329 	}
7330 
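	/*
	 * Coalesced (RSC) receives arrive as a train of fragments marked
	 * by the per-packet-info id flags: the first fragment resets the
	 * accumulator, the last one completes the packet.  A continuation
	 * without a preceding first fragment, or a train that is still
	 * expecting more fragments when the last rxbuf range is reached,
	 * is dropped.
	 */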
7331 	/* Identify RSC fragments, drop invalid packets */
7332 	if ((info.pktinfo_id != NULL) &&
7333 	    (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) {
7334 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) {
7335 			rxr->rsc.cnt = 0;
7336 			rxr->hn_rsc_pkts++;
7337 		} else if (rxr->rsc.cnt == 0)
7338 			goto drop;
7339 
7340 		rsc_more = true;
7341 
7342 		if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG)
7343 			rsc_more = false;
7344 
7345 		if (rsc_more && rxr->rsc.is_last)
7346 			goto drop;
7347 	} else {
7348 		rxr->rsc.cnt = 0;
7349 	}
7350 
7351 	if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX))
7352 		goto drop;
7353 
7354 	/* Store data in per rx ring structure */
7355 	hn_rsc_add_data(rxr, ((const uint8_t *)pkt) + data_off,
7356 	    data_len, &info);
7357 
7358 	if (rsc_more)
7359 		return;
7360 
7361 	hn_rxpkt(rxr);
7362 	rxr->rsc.cnt = 0;
7363 	return;
7364 drop:
7365 	rxr->hn_rsc_drop++;
7366 	return;
7367 }
7368 
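/*
 * Dispatch one received RNDIS message by type: data packets take the
 * hot path into hn_rndis_rx_data(), status indications go to
 * hn_rndis_rx_status(), and everything else is treated as a control
 * message completion.
 */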
7369 static __inline void
7370 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7371 {
7372 	const struct rndis_msghdr *hdr;
7373 
7374 	if (__predict_false(dlen < sizeof(*hdr))) {
7375 		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7376 		return;
7377 	}
7378 	hdr = data;
7379 
7380 	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7381 		/* Hot data path. */
7382 		hn_rndis_rx_data(rxr, data, dlen);
7383 		/* Done! */
7384 		return;
7385 	}
7386 
7387 	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7388 		hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen);
7389 	else
7390 		hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen);
7391 }
7392 
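/*
 * Handle an in-band NVS notification.  Only HN_NVS_TYPE_TXTBL_NOTE is
 * expected; it carries nothing this driver needs and is silently
 * ignored.  Any other type is merely logged.
 */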
7393 static void
7394 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7395 {
7396 	const struct hn_nvs_hdr *hdr;
7397 
7398 	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7399 		if_printf(sc->hn_ifp, "invalid nvs notify\n");
7400 		return;
7401 	}
7402 	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7403 
7404 	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7405 		/* Useless; ignore */
7406 		return;
7407 	}
7408 	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7409 }
7410 
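/*
 * The transaction id of an NVS completion packet is the hn_nvs_sendctx
 * set up by the send path for the originating request; hand the
 * completion data to its callback.
 */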
7411 static void
7412 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7413     const struct vmbus_chanpkt_hdr *pkt)
7414 {
7415 	struct hn_nvs_sendctx *sndc;
7416 
7417 	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7418 	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7419 	    VMBUS_CHANPKT_DATALEN(pkt));
7420 	/*
7421 	 * NOTE:
7422 	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7423 	 * its callback.
7424 	 */
7425 }
7426 
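/*
 * An RXBUF channel packet describes one or more ranges in the shared
 * receive buffer, each holding a single RNDIS message.  Validate the
 * packet, process every range under the network epoch, then ack the
 * RXBUF so the host can reuse it.
 */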
7427 static void
7428 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7429     const struct vmbus_chanpkt_hdr *pkthdr)
7430 {
7431 	struct epoch_tracker et;
7432 	const struct vmbus_chanpkt_rxbuf *pkt;
7433 	const struct hn_nvs_hdr *nvs_hdr;
7434 	int count, i, hlen;
7435 
7436 	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7437 		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7438 		return;
7439 	}
7440 	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7441 
7442 	/* Make sure that this is a RNDIS message. */
7443 	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7444 		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7445 		    nvs_hdr->nvs_type);
7446 		return;
7447 	}
7448 
7449 	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7450 	if (__predict_false(hlen < sizeof(*pkt))) {
7451 		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7452 		return;
7453 	}
7454 	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7455 
7456 	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7457 		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7458 		    pkt->cp_rxbuf_id);
7459 		return;
7460 	}
7461 
7462 	count = pkt->cp_rxbuf_cnt;
7463 	if (__predict_false(hlen <
7464 	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7465 		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7466 		return;
7467 	}
7468 
7469 	NET_EPOCH_ENTER(et);
7470 	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7471 	for (i = 0; i < count; ++i) {
7472 		int ofs, len;
7473 
7474 		ofs = pkt->cp_rxbuf[i].rb_ofs;
7475 		len = pkt->cp_rxbuf[i].rb_len;
7476 		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7477 			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7478 			    "ofs %d, len %d\n", i, ofs, len);
7479 			continue;
7480 		}
7481 
7482 		rxr->rsc.is_last = (i == (count - 1));
7483 		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7484 	}
7485 	NET_EPOCH_EXIT(et);
7486 
7487 	/*
7488 	 * Ack the consumed RXBUF associated w/ this channel packet,
7489 	 * so that this RXBUF can be recycled by the hypervisor.
7490 	 */
7491 	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7492 }
7493 
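/*
 * Send an RNDIS ack completion for the given transaction id.  EAGAIN
 * from vmbus_chan_send() is retried a few times; if the ack still
 * cannot be sent, the RXBUF region is leaked.
 */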
7494 static void
7495 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7496     uint64_t tid)
7497 {
7498 	struct hn_nvs_rndis_ack ack;
7499 	int retries, error;
7500 
7501 	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7502 	ack.nvs_status = HN_NVS_STATUS_OK;
7503 
7504 	retries = 0;
7505 again:
7506 	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7507 	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7508 	if (__predict_false(error == EAGAIN)) {
7509 		/*
7510 		 * NOTE:
7511 		 * This should _not_ happen in real world, since the
7512 		 * consumption of the TX bufring from the TX path is
7513 		 * controlled.
7514 		 */
7515 		if (rxr->hn_ack_failed == 0)
7516 			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7517 		rxr->hn_ack_failed++;
7518 		retries++;
7519 		if (retries < 10) {
7520 			DELAY(100);
7521 			goto again;
7522 		}
7523 		/* RXBUF leaks! */
7524 		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7525 	}
7526 }
7527 
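/*
 * Per-channel receive callback.  Drain all pending VMBus channel
 * packets, growing the packet buffer on ENOBUFS, and dispatch each
 * packet by type before rolling up the RX and TX rings.
 */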
7528 static void
7529 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7530 {
7531 	struct hn_rx_ring *rxr = xrxr;
7532 	struct hn_softc *sc = if_getsoftc(rxr->hn_ifp);
7533 
7534 	for (;;) {
7535 		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7536 		int error, pktlen;
7537 
7538 		pktlen = rxr->hn_pktbuf_len;
7539 		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7540 		if (__predict_false(error == ENOBUFS)) {
7541 			void *nbuf;
7542 			int nlen;
7543 
7544 			/*
7545 			 * Expand channel packet buffer.
7546 			 *
7547 			 * XXX
7548 			 * Use M_WAITOK here, since allocation failure
7549 			 * is fatal.
7550 			 */
7551 			nlen = rxr->hn_pktbuf_len * 2;
7552 			while (nlen < pktlen)
7553 				nlen *= 2;
7554 			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7555 
7556 			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7557 			    rxr->hn_pktbuf_len, nlen);
7558 
7559 			free(rxr->hn_pktbuf, M_DEVBUF);
7560 			rxr->hn_pktbuf = nbuf;
7561 			rxr->hn_pktbuf_len = nlen;
7562 			/* Retry! */
7563 			continue;
7564 		} else if (__predict_false(error == EAGAIN)) {
7565 			/* No more channel packets; done! */
7566 			break;
7567 		}
7568 		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7569 
7570 		switch (pkt->cph_type) {
7571 		case VMBUS_CHANPKT_TYPE_COMP:
7572 			hn_nvs_handle_comp(sc, chan, pkt);
7573 			break;
7574 
7575 		case VMBUS_CHANPKT_TYPE_RXBUF:
7576 			hn_nvs_handle_rxbuf(rxr, chan, pkt);
7577 			break;
7578 
7579 		case VMBUS_CHANPKT_TYPE_INBAND:
7580 			hn_nvs_handle_notify(sc, pkt);
7581 			break;
7582 
7583 		default:
7584 			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7585 			    pkt->cph_type);
7586 			break;
7587 		}
7588 	}
7589 	hn_chan_rollup(rxr, rxr->hn_txr);
7590 }
7591 
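/*
 * Module-wide initialization: allocate the hn_udpcs_fixup counter and
 * the VF map, sanitize the TX taskqueue tunables, and create the shared
 * TX taskqueues when running on Hyper-V in global TX taskqueue mode.
 */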
7592 static void
7593 hn_sysinit(void *arg __unused)
7594 {
7595 	int i;
7596 
7597 	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
7598 
7599 #ifdef HN_IFSTART_SUPPORT
7600 	/*
7601 	 * Don't use ifnet.if_start if transparent VF mode is requested;
7602 	 * mainly due to the IFF_DRV_OACTIVE flag.
7603 	 */
7604 	if (hn_xpnt_vf && hn_use_if_start) {
7605 		hn_use_if_start = 0;
7606 		printf("hn: transparent VF mode, if_transmit will be used, "
7607 		    "instead of if_start\n");
7608 	}
7609 #endif
7610 	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7611 		printf("hn: invalid transparent VF attach routing "
7612 		    "wait timeout %d, reset to %d\n",
7613 		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7614 		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7615 	}
7616 
7617 	/*
7618 	 * Initialize VF map.
7619 	 */
7620 	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7621 	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7622 	hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF,
7623 	    M_WAITOK | M_ZERO);
7624 
7625 	/*
7626 	 * Fix the # of TX taskqueues.
7627 	 */
7628 	if (hn_tx_taskq_cnt <= 0)
7629 		hn_tx_taskq_cnt = 1;
7630 	else if (hn_tx_taskq_cnt > mp_ncpus)
7631 		hn_tx_taskq_cnt = mp_ncpus;
7632 
7633 	/*
7634 	 * Fix the TX taskqueue mode.
7635 	 */
7636 	switch (hn_tx_taskq_mode) {
7637 	case HN_TX_TASKQ_M_INDEP:
7638 	case HN_TX_TASKQ_M_GLOBAL:
7639 	case HN_TX_TASKQ_M_EVTTQ:
7640 		break;
7641 	default:
7642 		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7643 		break;
7644 	}
7645 
7646 	if (vm_guest != VM_GUEST_HV)
7647 		return;
7648 
7649 	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7650 		return;
7651 
7652 	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7653 	    M_DEVBUF, M_WAITOK);
7654 	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7655 		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7656 		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7657 		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7658 		    "hn tx%d", i);
7659 	}
7660 }
7661 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7662 
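/*
 * Module-wide teardown: free the global TX taskqueues, the VF map and
 * its lock, and the hn_udpcs_fixup counter allocated by hn_sysinit().
 */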
7663 static void
7664 hn_sysuninit(void *arg __unused)
7665 {
7666 
7667 	if (hn_tx_taskque != NULL) {
7668 		int i;
7669 
7670 		for (i = 0; i < hn_tx_taskq_cnt; ++i)
7671 			taskqueue_free(hn_tx_taskque[i]);
7672 		free(hn_tx_taskque, M_DEVBUF);
7673 	}
7674 
7675 	if (hn_vfmap != NULL)
7676 		free(hn_vfmap, M_DEVBUF);
7677 	rm_destroy(&hn_vfmap_lock);
7678 
7679 	counter_u64_free(hn_udpcs_fixup);
7680 }
7681 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);
7682